1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Hash table with separate chaining.
19 This header (hmap.h) supplies an "embedded" implementation of
20 a hash table that uses linked lists to resolve collisions
21 ("separate chaining"). Its companion header (hmapx.h)
22 supplies a "external" implementation that is otherwise
23 similar. The two variants are described briefly here. The
24 embedded variant, for which this is the header, is described
25 in slightly more detail below. Each function also has a
26 detailed usage comment at its point of definition. (Many of
27 those definitions are inline in this file, because they are so
28 simple. Others are in hmap.c.)
30 The "hmap" embedded hash table implementation puts the hash
31 table node (which includes the linked list used for resolving
32 collisions) within the data structure that the hash table
33 contains. This makes allocation efficient, in space and time,
34 because no additional call into an allocator is needed to
35 obtain memory for the hash table node. It also makes it easy
36 to find the hash table node associated with a given object.
37 However, it's difficult to include a given object in an
38 arbitrary number of hash tables.
40 The "hmapx" external hash table implementation allocates hash
41 table nodes separately from the objects in the hash table.
42 Inserting and removing hash table elements requires dynamic
43 allocation, so it is normally slower and takes more memory
44 than the embedded implementation. It also requires searching
45 the table to find the node associated with a given object.
46 However, it's easy to include a given object in an arbitrary
47 number of hash tables. It's also possible to create an
48 external hash table without adding a member to the data
49 structure that the hash table contains. */
51 #ifndef LIBPSPP_HMAP_H
52 #define LIBPSPP_HMAP_H 1
54 /* Embedded hash table with separate chaining.
56 To create an embedded hash table, declare an instance of
57 struct hmap, then initialize it with hmap_init():
61 struct hmap map = HMAP_INITIALIZER (map);
63 Each node in the hash table, presumably a structure type, must
64 include a struct hmap_node member. Here's an example:
67 struct hmap_node node; // hmap_node member.
68 const char *string; // Another member.
70 The hash table functions work with pointers to struct
71 hmap_node. To obtain a pointer to your structure type given a
72 pointer to struct hmap_node, use the HMAP_DATA macro.
74 Inserting and deleting elements is straightforward. Use
75 hmap_insert() to insert an element and hmap_delete() to delete
78 my_foo.string = "My string";
79 hmap_insert (&map, &my_foo.node, hsh_hash_string (my_foo.string));
81 hmap_delete (&map, &my_foo.node);
82 You must pass the element's hash value as one of
83 hmap_insert()'s arguments. The hash table saves this hash
84 value for use later to speed searches and to rehash as the
87 hmap_insert() does not check whether the newly inserted
88 element duplicates an element already in the hash table. The
89 client is responsible for doing so, if this is desirable.
91 The hash table does not provide a direct way to search for an
92 existing element. Instead, it provides the means to iterate
93 over all the elements in the hash table with a given hash
94 value. It is easy to compose a search function from such a
95 building block. For example:
97 find_foo (const struct hmap *map, const char *name)
99 const struct foo *foo;
102 hash = hsh_hash_string (name);
103 HMAP_FOR_EACH_WITH_HASH (foo, struct foo, node, hash, map)
104 if (!strcmp (foo->name, name))
109 Here is how to iterate through the elements currently in the
112 HMAP_FOR_EACH (foo, struct foo, node, &map)
114 ...do something with foo...
120 #include "libpspp/cast.h"
122 /* Returns the data structure corresponding to the given NODE,
123 assuming that NODE is embedded as the given MEMBER name in
124 data type STRUCT. NODE must not be a null pointer. */
125 #define HMAP_DATA(NODE, STRUCT, MEMBER) \
126 (CHECK_POINTER_HAS_TYPE (NODE, struct hmap_node *), \
127 UP_CAST (NODE, STRUCT, MEMBER))
129 /* Like HMAP_DATA, except that a null NODE yields a null pointer
131 #define HMAP_NULLABLE_DATA(NODE, STRUCT, MEMBER) \
132 hmap_nullable_data__ (NODE, offsetof (STRUCT, MEMBER))
134 /* Hash table node. */
137 struct hmap_node *next; /* Next in chain. */
138 size_t hash; /* Hash value. */
141 static inline size_t hmap_node_hash (const struct hmap_node *);
146 size_t count; /* Number of inserted nodes. */
147 size_t mask; /* Number of buckets (power of 2), minus 1. */
148 struct hmap_node **buckets; /* Array of buckets. */
149 struct hmap_node *one; /* One bucket, to eliminate corner cases. */
152 /* Suitable for use as the initializer for a struct hmap named
154 struct hmap map = HMAP_INITIALIZER (map);
155 HMAP_INITIALIZER() is an alternative to hmap_init(). */
156 #define HMAP_INITIALIZER(MAP) { 0, 0, &(MAP).one, NULL }
158 /* Creation and destruction. */
159 void hmap_init (struct hmap *);
160 void hmap_swap (struct hmap *, struct hmap *);
161 void hmap_clear (struct hmap *);
162 void hmap_destroy (struct hmap *);
164 /* Storage management. */
165 void hmap_reserve (struct hmap *, size_t capacity);
166 void hmap_shrink (struct hmap *);
168 /* Search. Refer to the large comment near the top of this file
170 static inline struct hmap_node *hmap_first_with_hash (const struct hmap *,
172 static inline struct hmap_node *hmap_next_with_hash (const struct hmap_node *);
174 /* Insertion and deletion. */
175 static inline void hmap_insert (struct hmap *, struct hmap_node *,
177 static inline void hmap_insert_fast (struct hmap *, struct hmap_node *,
179 static inline void hmap_delete (struct hmap *, struct hmap_node *);
182 static inline struct hmap_node *hmap_first (const struct hmap *);
183 static inline struct hmap_node *hmap_next (const struct hmap *,
184 const struct hmap_node *);
187 static bool hmap_is_empty (const struct hmap *);
188 static inline size_t hmap_count (const struct hmap *);
189 static inline size_t hmap_capacity (const struct hmap *);
191 /* Updating data elements. */
192 void hmap_changed (struct hmap *, struct hmap_node *, size_t new_hash);
193 void hmap_moved (struct hmap *,
194 struct hmap_node *, const struct hmap_node *old);
196 /* Convenience macros for search.
198 These macros automatically use HMAP_DATA to obtain the data
199 elements that encapsulate hmap nodes, which often saves typing
200 and can make code easier to read. Refer to the large comment
201 near the top of this file for an example.
203 These macros evaluate HASH only once. They evaluate their
204 other arguments many times. */
205 #define HMAP_FIRST_WITH_HASH(STRUCT, MEMBER, HMAP, HASH) \
206 HMAP_NULLABLE_DATA (hmap_first_with_hash (HMAP, HASH), STRUCT, MEMBER)
207 #define HMAP_NEXT_WITH_HASH(DATA, STRUCT, MEMBER) \
208 HMAP_NULLABLE_DATA (hmap_next_with_hash (&(DATA)->MEMBER), STRUCT, MEMBER)
209 #define HMAP_FOR_EACH_WITH_HASH(DATA, STRUCT, MEMBER, HASH, HMAP) \
210 for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH); \
212 (DATA) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER))
213 #define HMAP_FOR_EACH_WITH_HASH_SAFE(DATA, NEXT, STRUCT, MEMBER, HASH, HMAP) \
214 for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH); \
216 ? ((NEXT) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER), 1) \
220 /* These macros are like the *_WITH_HASH macros above, except that they don't
221 skip data elements that are in the same hash bucket but have different hash
222 values. This is a small optimization in code where comparing keys is just
223 as fast as comparing hashes (e.g. the key is an "int") or comparing keys
224 would duplicate comparing the hashes (e.g. the hash is the first word of a
225 multi-word random key).
227 These macros evaluate HASH only once. They evaluate their
228 other arguments many times. */
229 #define HMAP_FIRST_IN_BUCKET(STRUCT, MEMBER, HMAP, HASH) \
230 HMAP_NULLABLE_DATA (hmap_first_in_bucket (HMAP, HASH), STRUCT, MEMBER)
231 #define HMAP_NEXT_IN_BUCKET(DATA, STRUCT, MEMBER) \
232 HMAP_NULLABLE_DATA (hmap_next_in_bucket (&(DATA)->MEMBER), STRUCT, MEMBER)
233 #define HMAP_FOR_EACH_IN_BUCKET(DATA, STRUCT, MEMBER, HASH, HMAP) \
234 for ((DATA) = HMAP_FIRST_IN_BUCKET (STRUCT, MEMBER, HMAP, HASH); \
236 (DATA) = HMAP_NEXT_IN_BUCKET (DATA, STRUCT, MEMBER))
237 #define HMAP_FOR_EACH_IN_BUCKET_SAFE(DATA, NEXT, STRUCT, MEMBER, HASH, HMAP) \
238 for ((DATA) = HMAP_FIRST_IN_BUCKET (STRUCT, MEMBER, HMAP, HASH); \
240 ? ((NEXT) = HMAP_NEXT_IN_BUCKET (DATA, STRUCT, MEMBER), 1) \
244 /* Convenience macros for iteration.
246 These macros automatically use HMAP_DATA to obtain the data
247 elements that encapsulate hmap nodes, which often saves typing
248 and can make code easier to read. Refer to the large comment
249 near the top of this file for an example.
251 These macros evaluate their arguments many times. */
252 #define HMAP_FIRST(STRUCT, MEMBER, HMAP) \
253 HMAP_NULLABLE_DATA (hmap_first (HMAP), STRUCT, MEMBER)
254 #define HMAP_NEXT(DATA, STRUCT, MEMBER, HMAP) \
255 HMAP_NULLABLE_DATA (hmap_next (HMAP, &(DATA)->MEMBER), STRUCT, MEMBER)
256 #define HMAP_FOR_EACH(DATA, STRUCT, MEMBER, HMAP) \
257 for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP); \
259 (DATA) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP))
260 #define HMAP_FOR_EACH_SAFE(DATA, NEXT, STRUCT, MEMBER, HMAP) \
261 for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP); \
263 ? ((NEXT) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP), 1) \
267 /* Inline definitions. */
269 static inline struct hmap_node *hmap_find_hash__ (struct hmap_node *, size_t);
270 static inline struct hmap_node *hmap_first_nonempty_bucket__ (
271 const struct hmap *, size_t start);
272 static inline size_t hmap_mask_to_capacity__ (size_t mask);
274 /* Returns the hash value associated with NODE. */
276 hmap_node_hash (const struct hmap_node *node)
281 /* Returns the first node in MAP that has hash value HASH, or a
282 null pointer if MAP does not contain any node with that hash
285 Assuming uniform hashing and no duplicate data items in MAP,
286 this function runs in constant time. (Amortized over an
287 iteration over all data items with a given HASH, its runtime
288 is proportional to the length of the hash chain for HASH, so
289 given a pathological hash function, e.g. one that returns a
290 constant value, its runtime degenerates to linear in the
291 length of NODE's hash chain.)
293 Nodes are returned in arbitrary order that may change whenever
294 the hash table's current capacity changes, as reported by
295 hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and
296 hmap_shrink() can change the capacity of a hash map.
297 Inserting a node with hmap_insert_fast() or deleting one with
298 hmap_delete() will not change the relative ordering of nodes.
300 The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE
301 macros provide convenient ways to iterate over all the nodes
302 with a given hash. The HMAP_FIRST_WITH_HASH macro is an
303 interface to this particular function that is often more
305 static inline struct hmap_node *
306 hmap_first_with_hash (const struct hmap *map, size_t hash)
308 return hmap_find_hash__ (map->buckets[hash & map->mask], hash);
311 /* Returns the next node in MAP after NODE that has the same hash
312 value as NODE, or a null pointer if MAP does not contain any
313 more nodes with that hash value.
315 Assuming uniform hashing and no duplicate data items in MAP,
316 this function runs in constant time. (Amortized over an
317 iteration over all data items with a given HASH, its runtime
318 is proportional to the length of the hash chain for HASH, so
319 given a pathological hash function, e.g. one that returns a
320 constant value, its runtime degenerates to linear in the
321 length of NODE's hash chain.)
323 Nodes are returned in arbitrary order that may change whenever
324 the hash table's current capacity changes, as reported by
325 hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and
326 hmap_shrink() can change the capacity of a hash map.
327 Inserting a node with hmap_insert_fast() or deleting one with
328 hmap_delete() will not change the relative ordering of nodes.
330 The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE
331 macros provide convenient ways to iterate over all the nodes
332 with a given hash. The HMAP_NEXT_WITH_HASH macro is an
333 interface to this particular function that is often more
335 static inline struct hmap_node *
336 hmap_next_with_hash (const struct hmap_node *node)
338 return hmap_find_hash__ (node->next, node->hash);
341 /* Inserts NODE into MAP with hash value HASH. If the insertion
342 causes MAP's current capacity, as reported by hmap_capacity(),
343 to be exceeded, rehashes MAP with an increased number of hash
346 This function runs in constant time amortized over all the
349 This function does not verify that MAP does not already
350 contain a data item with the same value as NODE. If
351 duplicates should be disallowed (which is the usual case),
352 then the client must check for duplicates itself before
353 inserting the new node. */
355 hmap_insert (struct hmap *map, struct hmap_node *node, size_t hash)
357 hmap_insert_fast (map, node, hash);
358 if (map->count > hmap_capacity (map))
359 hmap_reserve (map, map->count);
362 /* Inserts NODE into MAP with hash value HASH. Does not check
363 whether this causes MAP's current capacity to be exceeded.
364 The caller must take responsibility for that (or use
365 hmap_insert() instead).
367 This function runs in constant time.
369 This function does not verify that MAP does not already
370 contain a data item with the same value as NODE. If
371 duplicates should be disallowed (which is the usual case),
372 then the client must check for duplicates itself before
373 inserting the new node. */
375 hmap_insert_fast (struct hmap *map, struct hmap_node *node, size_t hash)
377 struct hmap_node **bucket = &map->buckets[hash & map->mask];
379 node->next = *bucket;
384 /* Returns the first node in MAP in the bucket for HASH, or a null pointer if
385 that bucket in HASH is empty.
387 This function runs in constant time.
389 Nodes are returned in arbitrary order that may change whenever the hash
390 table's current capacity changes, as reported by hmap_capacity(). Calls to
391 hmap_insert(), hmap_reserve(), and hmap_shrink() can change the capacity of
392 a hash map. Inserting a node with hmap_insert_fast() or deleting one with
393 hmap_delete() will not change the relative ordering of nodes.
395 The HMAP_FOR_EACH_IN_BUCKET and HMAP_FOR_EACH_IN_BUCKET_SAFE macros provide
396 convenient ways to iterate over all the nodes with a given hash. The
397 HMAP_FIRST_IN_BUCKET macro is an interface to this particular function that
398 is often more convenient. */
399 static inline struct hmap_node *
400 hmap_first_in_bucket (const struct hmap *map, size_t hash)
402 return map->buckets[hash & map->mask];
405 /* Returns the next node following NODE within the same bucket, or a null
406 pointer if NODE is the last node in its bucket.
408 This function runs in constant time.
410 Nodes are returned in arbitrary order that may change whenever the hash
411 table's current capacity changes, as reported by hmap_capacity(). Calls to
412 hmap_insert(), hmap_reserve(), and hmap_shrink() can change the capacity of
413 a hash map. Inserting a node with hmap_insert_fast() or deleting one with
414 hmap_delete() will not change the relative ordering of nodes.
416 The HMAP_FOR_EACH_IN_BUCKET and HMAP_FOR_EACH_IN_BUCKET_SAFE macros provide
417 convenient ways to iterate over all the nodes with a given hash. The
418 HMAP_NEXT_IN_BUCKET macro is an interface to this particular function that
419 is often more convenient. */
420 static inline struct hmap_node *
421 hmap_next_in_bucket (const struct hmap_node *node)
426 /* Removes NODE from MAP. The client is responsible for freeing
427 any data associated with NODE, if necessary.
429 Assuming uniform hashing, this function runs in constant time.
430 (Its runtime is proportional to the position of NODE in its
431 hash chain, so given a pathological hash function, e.g. one
432 that returns a constant value, its runtime degenerates to
433 linear in the length of NODE's hash chain.)
435 This function never reduces the number of buckets in MAP.
436 When one deletes a large number of nodes from a hash table,
437 calling hmap_shrink() afterward may therefore save a small
438 amount of memory. It is also more expensive to iterate
439 through a very sparse hash table than a denser one, so
440 shrinking the hash table could also save some time. However,
441 rehashing has an immediate cost that must be weighed against
444 hmap_delete() does not change NODE's hash value reported by
447 hmap_delete (struct hmap *map, struct hmap_node *node)
449 struct hmap_node **bucket = &map->buckets[node->hash & map->mask];
450 while (*bucket != node)
451 bucket = &(*bucket)->next;
452 *bucket = (*bucket)->next;
456 /* Returns the first node in MAP, or a null pointer if MAP is
459 Amortized over iterating through every data element in MAP,
460 this function runs in constant time. However, this assumes
461 that MAP is not excessively sparse, that is, that
462 hmap_capacity(MAP) is at most a constant factor greater than
463 hmap_count(MAP). This will always be true unless many nodes
464 have been inserted into MAP and then most or all of them
465 deleted; in such a case, calling hmap_shrink() is advised.
467 Nodes are returned in arbitrary order that may change whenever
468 the hash table's current capacity changes, as reported by
469 hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and
470 hmap_shrink() can change the capacity of a hash map.
471 Inserting a node with hmap_insert_fast() or deleting one with
472 hmap_delete() will not change the relative ordering of nodes.
474 The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide
475 convenient ways to iterate over all the nodes in a hash map.
476 The HMAP_FIRST macro is an interface to this particular
477 function that is often more convenient. */
478 static inline struct hmap_node *
479 hmap_first (const struct hmap *map)
481 return hmap_first_nonempty_bucket__ (map, 0);
484 /* Returns the next node in MAP following NODE, or a null pointer
485 if NODE is the last node in MAP.
487 Amortized over iterating through every data element in MAP,
488 this function runs in constant time. However, this assumes
489 that MAP is not excessively sparse, that is, that
490 hmap_capacity(MAP) is at most a constant factor greater than
491 hmap_count(MAP). This will always be true unless many nodes
492 have been inserted into MAP and then most or all of them
493 deleted; in such a case, calling hmap_shrink() is advised.
495 Nodes are returned in arbitrary order that may change whenever
496 the hash table's current capacity changes, as reported by
497 hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and
498 hmap_shrink() can change the capacity of a hash map.
499 Inserting a node with hmap_insert_fast() or deleting one with
500 hmap_delete() will not change the relative ordering of nodes.
502 The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide
503 convenient ways to iterate over all the nodes in a hash map.
504 The HMAP_NEXT macro is an interface to this particular
505 function that is often more convenient. */
506 static inline struct hmap_node *
507 hmap_next (const struct hmap *map, const struct hmap_node *node)
509 return (node->next != NULL
511 : hmap_first_nonempty_bucket__ (map, (node->hash & map->mask) + 1));
514 /* Returns true if MAP currently contains no data items, false
517 hmap_is_empty (const struct hmap *map)
519 return map->count == 0;
522 /* Returns the number of data items currently in MAP. */
524 hmap_count (const struct hmap *map)
529 /* Returns the current capacity of MAP, that is, the maximum
530 number of data elements that MAP may hold before it becomes
533 The capacity is advisory only: it is possible to insert any
534 number of data elements into a hash map regardless of its
535 capacity. However, inserting many more elements than the
536 map's capacity will degrade search performance. */
538 hmap_capacity (const struct hmap *map)
540 return hmap_mask_to_capacity__ (map->mask);
543 /* Implementation details. */
545 /* Returns the first node at or after NODE in NODE's chain that
546 has hash value HASH. */
547 static inline struct hmap_node *
548 hmap_find_hash__ (struct hmap_node *node, size_t hash)
550 for (; node != NULL; node = node->next)
551 if (node->hash == hash)
556 /* Returns the first node in the lowest-numbered nonempty bucket
557 in MAP whose index is START or higher, or a null pointer if
558 all such buckets are empty. */
559 static inline struct hmap_node *
560 hmap_first_nonempty_bucket__ (const struct hmap *map, size_t start)
564 for (i = start; i <= map->mask; i++)
565 if (map->buckets[i] != NULL)
566 return map->buckets[i];
570 /* Returns the hash table capacity associated with a given MASK,
571 which should be a value for the "mask" member of struct hmap.
572 MASK must be a power of 2 minus 1 (including 0), that is, its
573 value in binary must be all 1-bits. */
575 hmap_mask_to_capacity__ (size_t mask)
577 return (mask + 1) * 2;
580 /* Helper for HMAP_NULLABLE_DATA (to avoid evaluating its NODE
581 argument more than once). */
583 hmap_nullable_data__ (struct hmap_node *node, size_t member_offset)
585 return node != NULL ? (char *) node - member_offset : NULL;
588 #endif /* libpspp/hmap.h */