1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Hash table with separate chaining.
19 This header (hmap.h) supplies an "embedded" implementation of
20 a hash table that uses linked lists to resolve collisions
21 ("separate chaining"). Its companion header (hmapx.h)
22 supplies a "external" implementation that is otherwise
23 similar. The two variants are described briefly here. The
24 embedded variant, for which this is the header, is described
25 in slightly more detail below. Each function also has a
26 detailed usage comment at its point of definition. (Many of
27 those definitions are inline in this file, because they are so
28 simple. Others are in hmap.c.)
30 The "hmap" embedded hash table implementation puts the hash
31 table node (which includes the linked list used for resolving
32 collisions) within the data structure that the hash table
33 contains. This makes allocation efficient, in space and time,
34 because no additional call into an allocator is needed to
35 obtain memory for the hash table node. It also makes it easy
36 to find the hash table node associated with a given object.
37 However, it's difficult to include a given object in an
38 arbitrary number of hash tables.
40 The "hmapx" external hash table implementation allocates hash
41 table nodes separately from the objects in the hash table.
42 Inserting and removing hash table elements requires dynamic
43 allocation, so it is normally slower and takes more memory
44 than the embedded implementation. It also requires searching
45 the table to find the node associated with a given object.
46 However, it's easy to include a given object in an arbitrary
47 number of hash tables. It's also possible to create an
48 external hash table without adding a member to the data
49 structure that the hash table contains. */
51 #ifndef LIBPSPP_HMAP_H
52 #define LIBPSPP_HMAP_H 1
54 /* Embedded hash table with separate chaining.
56 To create an embedded hash table, declare an instance of
57 struct hmap, then initialize it with hmap_init():
61 struct hmap map = HMAP_INITIALIZER (map);
63 Each node in the hash table, presumably a structure type, must
64 include a struct hmap_node member. Here's an example:
67 struct hmap_node node; // hmap_node member.
68 const char *string; // Another member.
70 The hash table functions work with pointers to struct
71 hmap_node. To obtain a pointer to your structure type given a
72 pointer to struct hmap_node, use the HMAP_DATA macro.
74 Inserting and deleting elements is straightforward. Use
75 hmap_insert() to insert an element and hmap_delete() to delete
78 my_foo.string = "My string";
79 hmap_insert (&map, &my_foo.node, hsh_hash_string (my_foo.string));
81 hmap_delete (&map, &my_foo.node);
82 You must pass the element's hash value as one of
83 hmap_insert()'s arguments. The hash table saves this hash
84 value for use later to speed searches and to rehash as the
87 hmap_insert() does not check whether the newly inserted
88 element duplicates an element already in the hash table. The
89 client is responsible for doing so, if this is desirable.
91 The hash table does not provide a direct way to search for an
92 existing element. Instead, it provides the means to iterate
93 over all the elements in the hash table with a given hash
94 value. It is easy to compose a search function from such a
95 building block. For example:
97 find_foo (const struct hmap *map, const char *name)
99 const struct foo *foo;
102 hash = hsh_hash_string (name);
103 HMAP_FOR_EACH_WITH_HASH (foo, struct foo, node, hash, map)
104 if (!strcmp (foo->name, name))
109 Here is how to iterate through the elements currently in the
112 HMAP_FOR_EACH (foo, struct foo, node, &map)
114 ...do something with foo...
121 #include "libpspp/cast.h"
123 /* Returns the data structure corresponding to the given NODE,
124 assuming that NODE is embedded as the given MEMBER name in
125 data type STRUCT. NODE must not be a null pointer. */
126 #define HMAP_DATA(NODE, STRUCT, MEMBER) \
127 (CHECK_POINTER_HAS_TYPE (NODE, struct hmap_node *), \
128 UP_CAST (NODE, STRUCT, MEMBER))
130 /* Like HMAP_DATA, except that a null NODE yields a null pointer
132 #define HMAP_NULLABLE_DATA(NODE, STRUCT, MEMBER) \
133 hmap_nullable_data__ (NODE, offsetof (STRUCT, MEMBER))
135 /* Hash table node. */
138 struct hmap_node *next; /* Next in chain. */
139 size_t hash; /* Hash value. */
142 static inline size_t hmap_node_hash (const struct hmap_node *);
147 struct hmap_node **buckets; /* Array of buckets. */
148 struct hmap_node *two[2]; /* Two buckets, to eliminate corner cases. */
149 size_t count; /* Number of inserted nodes. */
150 int shift; /* Bits to shift hash to get bucket index. */
153 /* Maximum number of bits that a "size_t" value can be shifted in a single
154 operation without yielding undefined behavior. */
155 #if SIZE_MAX == UINT32_MAX
156 #define HMAP_MAX_SHIFT 31
157 #elif SIZE_MAX == UINT64_MAX
158 #define HMAP_MAX_SHIFT 63
160 #error "size_t is not 32 or 64 bits"
163 /* Suitable for use as the initializer for a struct hmap named
165 struct hmap map = HMAP_INITIALIZER (map);
166 HMAP_INITIALIZER() is an alternative to hmap_init(). */
167 #define HMAP_INITIALIZER(MAP) { (MAP).two, { NULL, NULL }, 0, HMAP_MAX_SHIFT }
169 /* Creation and destruction. */
170 void hmap_init (struct hmap *);
171 void hmap_swap (struct hmap *, struct hmap *);
172 void hmap_clear (struct hmap *);
173 void hmap_destroy (struct hmap *);
175 /* Storage management. */
176 void hmap_reserve (struct hmap *, size_t capacity);
177 void hmap_shrink (struct hmap *);
179 /* Search. Refer to the large comment near the top of this file
181 static inline struct hmap_node *hmap_first_with_hash (const struct hmap *,
183 static inline struct hmap_node *hmap_next_with_hash (const struct hmap_node *);
185 /* Insertion and deletion. */
186 void hmap_insert (struct hmap *, struct hmap_node *, size_t hash);
187 void hmap_insert_fast (struct hmap *, struct hmap_node *, size_t hash);
188 static inline void hmap_delete (struct hmap *, struct hmap_node *);
191 static inline struct hmap_node *hmap_first (const struct hmap *);
192 static inline struct hmap_node *hmap_next (const struct hmap *,
193 const struct hmap_node *);
196 static bool hmap_is_empty (const struct hmap *);
197 static inline size_t hmap_count (const struct hmap *);
198 static inline size_t hmap_capacity (const struct hmap *);
200 /* Updating data elements. */
201 void hmap_changed (struct hmap *, struct hmap_node *, size_t new_hash);
202 void hmap_moved (struct hmap *,
203 struct hmap_node *, const struct hmap_node *old);
205 /* Convenience macros for search.
207 These macros automatically use HMAP_DATA to obtain the data
208 elements that encapsulate hmap nodes, which often saves typing
209 and can make code easier to read. Refer to the large comment
210 near the top of this file for an example.
212 These macros evaluate HASH only once. They evaluate their
213 other arguments many times. */
214 #define HMAP_FIRST_WITH_HASH(STRUCT, MEMBER, HMAP, HASH) \
215 HMAP_NULLABLE_DATA (hmap_first_with_hash (HMAP, HASH), STRUCT, MEMBER)
216 #define HMAP_NEXT_WITH_HASH(DATA, STRUCT, MEMBER) \
217 HMAP_NULLABLE_DATA (hmap_next_with_hash (&(DATA)->MEMBER), STRUCT, MEMBER)
218 #define HMAP_FOR_EACH_WITH_HASH(DATA, STRUCT, MEMBER, HASH, HMAP) \
219 for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH); \
221 (DATA) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER))
222 #define HMAP_FOR_EACH_WITH_HASH_SAFE(DATA, NEXT, STRUCT, MEMBER, HASH, HMAP) \
223 for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH); \
225 ? ((NEXT) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER), 1) \
229 /* These macros are like the *_WITH_HASH macros above, except that they don't
230 skip data elements that are in the same hash bucket but have different hash
231 values. This is a small optimization in code where comparing keys is just
232 as fast as comparing hashes (e.g. the key is an "int") or comparing keys
233 would duplicate comparing the hashes (e.g. the hash is the first word of a
234 multi-word random key).
236 These macros evaluate HASH only once. They evaluate their
237 other arguments many times. */
238 #define HMAP_FIRST_IN_BUCKET(STRUCT, MEMBER, HMAP, HASH) \
239 HMAP_NULLABLE_DATA (hmap_first_in_bucket (HMAP, HASH), STRUCT, MEMBER)
240 #define HMAP_NEXT_IN_BUCKET(DATA, STRUCT, MEMBER) \
241 HMAP_NULLABLE_DATA (hmap_next_in_bucket (&(DATA)->MEMBER), STRUCT, MEMBER)
242 #define HMAP_FOR_EACH_IN_BUCKET(DATA, STRUCT, MEMBER, HASH, HMAP) \
243 for ((DATA) = HMAP_FIRST_IN_BUCKET (STRUCT, MEMBER, HMAP, HASH); \
245 (DATA) = HMAP_NEXT_IN_BUCKET (DATA, STRUCT, MEMBER))
246 #define HMAP_FOR_EACH_IN_BUCKET_SAFE(DATA, NEXT, STRUCT, MEMBER, HASH, HMAP) \
247 for ((DATA) = HMAP_FIRST_IN_BUCKET (STRUCT, MEMBER, HMAP, HASH); \
249 ? ((NEXT) = HMAP_NEXT_IN_BUCKET (DATA, STRUCT, MEMBER), 1) \
253 /* Convenience macros for iteration.
255 These macros automatically use HMAP_DATA to obtain the data
256 elements that encapsulate hmap nodes, which often saves typing
257 and can make code easier to read. Refer to the large comment
258 near the top of this file for an example.
260 These macros evaluate their arguments many times. */
261 #define HMAP_FIRST(STRUCT, MEMBER, HMAP) \
262 HMAP_NULLABLE_DATA (hmap_first (HMAP), STRUCT, MEMBER)
263 #define HMAP_NEXT(DATA, STRUCT, MEMBER, HMAP) \
264 HMAP_NULLABLE_DATA (hmap_next (HMAP, &(DATA)->MEMBER), STRUCT, MEMBER)
265 #define HMAP_FOR_EACH(DATA, STRUCT, MEMBER, HMAP) \
266 for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP); \
268 (DATA) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP))
269 #define HMAP_FOR_EACH_SAFE(DATA, NEXT, STRUCT, MEMBER, HMAP) \
270 for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP); \
272 ? ((NEXT) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP), 1) \
276 /* Inline definitions. */
278 static inline struct hmap_node *hmap_find_hash__ (struct hmap_node *, size_t);
279 static inline struct hmap_node *hmap_first_nonempty_bucket__ (
280 const struct hmap *, size_t start);
281 static inline size_t hmap_shift_to_capacity__ (int shift);
283 /* Returns the hash value associated with NODE. */
285 hmap_node_hash (const struct hmap_node *node)
290 /* Returns the first node in MAP that has hash value HASH, or a
291 null pointer if MAP does not contain any node with that hash
294 Assuming uniform hashing and no duplicate data items in MAP,
295 this function runs in constant time. (Amortized over an
296 iteration over all data items with a given HASH, its runtime
297 is proportional to the length of the hash chain for HASH, so
298 given a pathological hash function, e.g. one that returns a
299 constant value, its runtime degenerates to linear in the
300 length of NODE's hash chain.)
302 Nodes are returned in arbitrary but consistent order.
304 The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE
305 macros provide convenient ways to iterate over all the nodes
306 with a given hash. The HMAP_FIRST_WITH_HASH macro is an
307 interface to this particular function that is often more
309 static inline struct hmap_node *
310 hmap_first_with_hash (const struct hmap *map, size_t hash)
312 return hmap_find_hash__ (map->buckets[hash >> map->shift], hash);
315 /* Returns the next node in MAP after NODE that has the same hash
316 value as NODE, or a null pointer if MAP does not contain any
317 more nodes with that hash value.
319 Assuming uniform hashing and no duplicate data items in MAP,
320 this function runs in constant time. (Amortized over an
321 iteration over all data items with a given HASH, its runtime
322 is proportional to the length of the hash chain for HASH, so
323 given a pathological hash function, e.g. one that returns a
324 constant value, its runtime degenerates to linear in the
325 length of NODE's hash chain.)
327 Nodes are returned in arbitrary but consistent order.
329 The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE
330 macros provide convenient ways to iterate over all the nodes
331 with a given hash. The HMAP_NEXT_WITH_HASH macro is an
332 interface to this particular function that is often more
334 static inline struct hmap_node *
335 hmap_next_with_hash (const struct hmap_node *node)
337 return hmap_find_hash__ (node->next, node->hash);
340 /* Returns the first node in MAP in the bucket for HASH, or a null pointer if
341 that bucket in HASH is empty.
343 This function runs in constant time.
345 Nodes in a bucket are sorted in increasing order of hash value. Nodes with
346 equal hashes are sorted in an arbitrary but consistent order.
348 The HMAP_FOR_EACH_IN_BUCKET and HMAP_FOR_EACH_IN_BUCKET_SAFE macros provide
349 convenient ways to iterate over all the nodes with a given hash. The
350 HMAP_FIRST_IN_BUCKET macro is an interface to this particular function that
351 is often more convenient. */
352 static inline struct hmap_node *
353 hmap_first_in_bucket (const struct hmap *map, size_t hash)
355 return map->buckets[hash >> map->shift];
358 /* Returns the next node following NODE within the same bucket, or a null
359 pointer if NODE is the last node in its bucket.
361 This function runs in constant time.
363 Nodes in a bucket are sorted in increasing order of hash value. Nodes with
364 equal hashes are sorted in an arbitrary but consistent order.
366 The HMAP_FOR_EACH_IN_BUCKET and HMAP_FOR_EACH_IN_BUCKET_SAFE macros provide
367 convenient ways to iterate over all the nodes with a given hash. The
368 HMAP_NEXT_IN_BUCKET macro is an interface to this particular function that
369 is often more convenient. */
370 static inline struct hmap_node *
371 hmap_next_in_bucket (const struct hmap_node *node)
376 /* Removes NODE from MAP. The client is responsible for freeing
377 any data associated with NODE, if necessary.
379 Assuming uniform hashing, this function runs in constant time.
380 (Its runtime is proportional to the position of NODE in its
381 hash chain, so given a pathological hash function, e.g. one
382 that returns a constant value, its runtime degenerates to
383 linear in the length of NODE's hash chain.)
385 This function never reduces the number of buckets in MAP.
386 When one deletes a large number of nodes from a hash table,
387 calling hmap_shrink() afterward may therefore save a small
388 amount of memory. It is also more expensive to iterate
389 through a very sparse hash table than a denser one, so
390 shrinking the hash table could also save some time. However,
391 rehashing has an immediate cost that must be weighed against
394 hmap_delete() does not change NODE's hash value reported by
397 hmap_delete (struct hmap *map, struct hmap_node *node)
399 struct hmap_node **bucket = &map->buckets[node->hash >> map->shift];
400 while (*bucket != node)
401 bucket = &(*bucket)->next;
402 *bucket = (*bucket)->next;
406 /* Returns the first node in MAP, or a null pointer if MAP is
409 Amortized over iterating through every data element in MAP,
410 this function runs in constant time. However, this assumes
411 that MAP is not excessively sparse, that is, that
412 hmap_capacity(MAP) is at most a constant factor greater than
413 hmap_count(MAP). This will always be true unless many nodes
414 have been inserted into MAP and then most or all of them
415 deleted; in such a case, calling hmap_shrink() is advised.
417 Nodes are returned in arbitrary order that may change whenever
418 the hash table's current capacity changes, as reported by
419 hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and
420 hmap_shrink() can change the capacity of a hash map.
421 Inserting a node with hmap_insert_fast() or deleting one with
422 hmap_delete() will not change the relative ordering of nodes.
424 The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide
425 convenient ways to iterate over all the nodes in a hash map.
426 The HMAP_FIRST macro is an interface to this particular
427 function that is often more convenient. */
428 static inline struct hmap_node *
429 hmap_first (const struct hmap *map)
431 return hmap_first_nonempty_bucket__ (map, 0);
434 /* Returns the next node in MAP following NODE, or a null pointer
435 if NODE is the last node in MAP.
437 Amortized over iterating through every data element in MAP,
438 this function runs in constant time. However, this assumes
439 that MAP is not excessively sparse, that is, that
440 hmap_capacity(MAP) is at most a constant factor greater than
441 hmap_count(MAP). This will always be true unless many nodes
442 have been inserted into MAP and then most or all of them
443 deleted; in such a case, calling hmap_shrink() is advised.
445 Nodes are returned in arbitrary order that may change whenever
446 the hash table's current capacity changes, as reported by
447 hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and
448 hmap_shrink() can change the capacity of a hash map.
449 Inserting a node with hmap_insert_fast() or deleting one with
450 hmap_delete() will not change the relative ordering of nodes.
452 The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide
453 convenient ways to iterate over all the nodes in a hash map.
454 The HMAP_NEXT macro is an interface to this particular
455 function that is often more convenient. */
456 static inline struct hmap_node *
457 hmap_next (const struct hmap *map, const struct hmap_node *node)
459 if (node->next != NULL)
463 size_t bucket_idx = node->hash >> map->shift;
464 return hmap_first_nonempty_bucket__ (map, bucket_idx + 1);
468 /* Returns true if MAP currently contains no data items, false
471 hmap_is_empty (const struct hmap *map)
473 return map->count == 0;
476 /* Returns the number of data items currently in MAP. */
478 hmap_count (const struct hmap *map)
483 /* Returns the current capacity of MAP, that is, the maximum
484 number of data elements that MAP may hold before it becomes
487 The capacity is advisory only: it is possible to insert any
488 number of data elements into a hash map regardless of its
489 capacity. However, inserting many more elements than the
490 map's capacity will degrade search performance. */
492 hmap_capacity (const struct hmap *map)
494 return hmap_shift_to_capacity__ (map->shift);
497 /* Returns the number of buckets in MAP. */
499 hmap_n_buckets (const struct hmap *map)
501 return (SIZE_MAX >> map->shift) + 1;
504 /* Implementation details. */
506 /* Returns the first node at or after NODE in NODE's chain that
507 has hash value HASH. */
508 static inline struct hmap_node *
509 hmap_find_hash__ (struct hmap_node *node, size_t hash)
511 for (; node != NULL; node = node->next)
512 if (node->hash == hash)
517 /* Returns the first node in the lowest-numbered nonempty bucket
518 in MAP whose index is START or higher, or a null pointer if
519 all such buckets are empty. */
520 static inline struct hmap_node *
521 hmap_first_nonempty_bucket__ (const struct hmap *map, size_t start)
525 for (i = start; i <= SIZE_MAX >> map->shift; i++)
526 if (map->buckets[i] != NULL)
527 return map->buckets[i];
531 /* Returns the hash table capacity associated with a given SHIFT, which should
532 be a value for the "shift" member of struct hmap. SHIFT must be between 2
533 and HMAP_MAX_SHIFT, inclusive. */
535 hmap_shift_to_capacity__ (int shift)
537 return ((size_t) 1 << HMAP_MAX_SHIFT) >> (shift - 2);
540 /* Helper for HMAP_NULLABLE_DATA (to avoid evaluating its NODE
541 argument more than once). */
543 hmap_nullable_data__ (struct hmap_node *node, size_t member_offset)
545 return node != NULL ? (char *) node - member_offset : NULL;
548 #endif /* libpspp/hmap.h */