1 /* hash - hashing table processing.
2 Copyright (C) 1998 Free Software Foundation, Inc.
3 Written by Jim Meyering, 1992.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 /* A generic hash table package. */
21 /* Define USE_OBSTACK to 1 if you want the allocator to use obstacks instead
22 of malloc. If you change USE_OBSTACK, you have to recompile! */
33 typedef enum {false = 0, true = 1} bool;
47 # ifndef obstack_chunk_alloc
48 # define obstack_chunk_alloc malloc
50 # ifndef obstack_chunk_free
51 # define obstack_chunk_free free
57 /* An hash table contains many internal entries, each holding a pointer to
58 some user provided data (also called a user entry). An entry indistinctly
59 refers to both the internal entry and its associated user entry. A user
60 entry contents may be hashed by a randomisation function (the hashing
61 function, or just `hasher' for short) into a number (or `slot') between 0
62 and the current table size. At each slot position in the hash table,
63 starts a linked chain of entries for which the user data all hash to this
64 slot. A bucket is the collection of all entries hashing to the same slot.
66 A good `hasher' function will distribute entries rather evenly in buckets.
67 In the ideal case, the length of each bucket is roughly the number of
68 entries divided by the table size. Finding the slot for a data is usually
69 done at constant speed by the `hasher', and the later finding of a precise
70 entry is linear in time with the size of the bucket. Consequently, a
71 bigger hash table size (that is, a bigger number of buckets) is prone to
72 yielding shorter buckets, *given* the `hasher' function behaves properly.
74 Long buckets slow down the lookup algorithm. One might use big hash table
75 sizes in hope to reduce the average length of buckets, but this might
76 become inordinate, as unused slots in the hash table take some space. The
77 best bet is to make sure you are using a good `hasher' function (beware
78 that those are not that easy to write! :-), and to use a table size at
79 least bigger than the actual number of entries.
81 Currently, whenever the addition of an entry gets 80% of buckets to be
82 non-empty, this package automatically doubles the number of buckets. */
84 /* Information and lookup. */
86 /* The following few functions provide information about the overall hash
87 table organisation: the number of entries, number of buckets and maximum
90 /* Return the number of buckets in the hash table. The table size, the total
91 number of buckets (used plus unused), or the maximum number of slots, are
95 hash_get_n_buckets (const Hash_table *table)
97 return table->n_buckets;
100 /* Return the number of slots in use (non-empty buckets). */
103 hash_get_n_buckets_used (const Hash_table *table)
105 return table->n_buckets_used;
108 /* Return the number of active entries. */
111 hash_get_n_entries (const Hash_table *table)
113 return table->n_entries;
116 /* Return the length of the most lenghty chain (bucket). */
119 hash_get_max_bucket_length (const Hash_table *table)
121 struct hash_entry *bucket;
122 unsigned int max_bucket_length = 0;
124 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
128 struct hash_entry *cursor = bucket;
129 unsigned int bucket_length = 1;
131 while (cursor = cursor->next, cursor)
134 if (bucket_length > max_bucket_length)
135 max_bucket_length = bucket_length;
139 return max_bucket_length;
142 /* Do a mild validation of an hash table, by traversing it and checking two
146 hash_table_ok (const Hash_table *table)
148 struct hash_entry *bucket;
149 unsigned int n_buckets_used = 0;
150 unsigned int n_entries = 0;
152 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
156 struct hash_entry *cursor = bucket;
158 /* Count bucket head. */
162 /* Count bucket overflow. */
163 while (cursor = cursor->next, cursor)
168 if (n_buckets_used == table->n_buckets_used && n_entries == table->n_entries)
175 hash_print_statistics (const Hash_table *table, FILE *stream)
177 unsigned int n_entries = hash_get_n_entries (table);
178 unsigned int n_buckets = hash_get_n_buckets (table);
179 unsigned int n_buckets_used = hash_get_n_buckets_used (table);
180 unsigned int max_bucket_length = hash_get_max_bucket_length (table);
182 fprintf (stream, "# entries: %u\n", n_entries);
183 fprintf (stream, "# buckets: %u\n", n_buckets);
184 fprintf (stream, "# buckets used: %u (%.2f%%)\n", n_buckets_used,
185 (100.0 * n_buckets_used) / n_buckets);
186 fprintf (stream, "max bucket length: %u\n", max_bucket_length);
189 /* Return the user entry from the hash table, if some entry in the hash table
190 compares equally with ENTRY, or NULL otherwise. */
193 hash_lookup (const Hash_table *table, const void *entry)
195 struct hash_entry *bucket
196 = table->bucket + table->hasher (entry, table->n_buckets);
197 struct hash_entry *cursor;
199 assert (bucket < table->bucket_limit);
201 if (bucket->data == NULL)
204 for (cursor = bucket; cursor; cursor = cursor->next)
205 if (table->comparator (entry, cursor->data))
213 /* The functions in this page traverse the hash table and process the
214 contained entries. For the traversal to work properly, the hash table
215 should not be resized nor modified while any particular entry is being
216 processed. In particular, entries should not be added or removed. */
218 /* Return the first data in the table, or NULL if the table is empty. */
221 hash_get_first (const Hash_table *table)
223 struct hash_entry *bucket;
225 if (table->n_entries == 0)
228 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
235 /* Return the user data for the entry following ENTRY, where ENTRY has been
236 returned by a previous call to either `hash_get_first' or `hash_get_next'.
237 Return NULL if there is no more entries. */
240 hash_get_next (const Hash_table *table, const void *entry)
242 struct hash_entry *bucket
243 = table->bucket + table->hasher (entry, table->n_buckets);
244 struct hash_entry *cursor;
246 assert (bucket < table->bucket_limit);
248 /* Find next entry in the same bucket. */
249 for (cursor = bucket; cursor; cursor = cursor->next)
250 if (cursor->data == entry && cursor->next)
251 return cursor->next->data;
253 /* Find first entry in any subsequent bucket. */
254 for (; bucket < table->bucket_limit; bucket++)
262 /* Fill BUFFER with pointers to active user entries in the hash table, then
263 return the number of pointers copied. Do not copy more than BUFFER_SIZE
267 hash_get_entries (const Hash_table *table, void **buffer, unsigned int buffer_size)
269 unsigned int counter = 0;
270 struct hash_entry *bucket;
271 struct hash_entry *cursor;
273 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
277 for (cursor = bucket; cursor; cursor = cursor->next)
279 if (counter >= buffer_size)
281 buffer[counter++] = cursor->data;
289 /* Call a PROCESSOR function for each entry of an hash table, and return the
290 number of entries for which the processor function returned success. A
291 pointer to some PROCESSOR_DATA which will be made available to each call to
292 the processor function. The PROCESSOR accepts two arguments: the first is
293 the user entry being walked into, the second is the value of PROCESSOR_DATA
294 as received. The walking continue for as long as the PROCESSOR function
295 returns nonzero. When it returns zero, the walking is interrupted. */
298 hash_do_for_each (const Hash_table *table, Hash_processor processor,
299 void *processor_data)
301 unsigned int counter = 0;
302 struct hash_entry *bucket;
303 struct hash_entry *cursor;
305 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
309 for (cursor = bucket; cursor; cursor = cursor->next)
311 if (!(*processor) (cursor->data, processor_data))
321 /* Allocation and clean-up. */
323 /* Return a hash index for a NUL-terminated STRING between 0 and N_BUCKETS-1.
324 This is a convenience routine for constructing other hashing functions. */
328 /* About hashings, Paul Eggert writes to me (FP), on 1994-01-01: "Please see
329 B. J. McKenzie, R. Harries & T. Bell, Selecting a hashing algorithm,
330 Software--practice & experience 20, 2 (Feb 1990), 209-224. Good hash
331 algorithms tend to be domain-specific, so what's good for [diffutils'] io.c
332 may not be good for your application." */
335 hash_string (const char *string, unsigned int n_buckets)
340 # define ROTATE_LEFT(Value, Shift) \
341 ((Value) << (Shift) | (Value) >> ((sizeof (unsigned) * CHAR_BIT) - (Shift)))
342 # define HASH_ONE_CHAR(Value, Byte) \
343 ((Byte) + ROTATE_LEFT (Value, 7))
347 for (; *string; string++)
348 value = HASH_ONE_CHAR (value, *(const unsigned char *) string);
349 return value % n_buckets;
352 # undef HASH_ONE_CHAR
355 #else /* not USE_DIFF_HASH */
357 /* This one comes from `recode', and performs a bit better than the above as
358 per a few experiments. It is inspired from a hashing routine found in the
359 very old Cyber `snoop', itself written in typical Greg Mansfield style.
360 (By the way, what happened to this excellent man? Is he still alive?) */
363 hash_string (const char *string, unsigned int n_buckets)
368 value = ((value * 31 + (int) *(const unsigned char *) string++)
373 #endif /* not USE_DIFF_HASH */
375 /* Return true if CANDIDATE is a prime number. CANDIDATE should be an odd
376 number at least equal to 11. */
380 unsigned long candidate;
382 unsigned long divisor = 3;
383 unsigned long square = divisor * divisor;
385 while (square < candidate && (candidate % divisor))
388 square += 4 * divisor;
392 return candidate % divisor != 0;
395 /* Round a given CANDIDATE number up to the nearest prime, and return that
396 prime. CANDIDATE should be at least equal to 10. */
399 next_prime (candidate)
400 unsigned long candidate;
402 /* Make it definitely odd. */
405 while (!is_prime (candidate))
411 /* Allocate and return a new hash table, or NULL if an error is met. The
412 initial number of buckets would be at least CANDIDATE (which need not be prime).
414 If DATA_FREER is not NULL, this function may be later called with the data as
415 an argument, just before they entry containing the data gets freed. The
416 HASHER function should be supplied, and FIXME. The COMPARATOR function
417 should also be supplied, and FIXME. */
419 /* User-supplied function for freeing datas. It is specified in
420 hash_initialize. If non-null, it is used by hash_free and hash_clear.
421 You should specify `free' here only if you want these functions to free
422 all of your `data' data. This is typically the case when your data is
423 simply an auxilliary struct that you have malloc'd to aggregate several
426 /* User-supplied hash function that hashes entry ENTRY to an integer in
427 the range 0..TABLE_SIZE-1. */
429 /* User-supplied function that determines whether a new entry is unique by
430 comparing the new entry to entries that hashed to the same bucket
431 index. It should return zero for a pair of entries that compare equal,
432 non-zero otherwise. */
435 hash_initialize (unsigned int candidate, Hash_hasher hasher,
436 Hash_comparator comparator, Hash_data_freer data_freer)
439 struct hash_entry *bucket;
441 if (hasher == NULL || comparator == NULL)
444 table = (Hash_table *) malloc (sizeof (Hash_table));
448 table->n_buckets = next_prime (candidate < 10 ? 10 : candidate);
449 table->bucket = (struct hash_entry *)
450 malloc (table->n_buckets * sizeof (struct hash_entry));
451 if (table->bucket == NULL)
456 table->bucket_limit = table->bucket + table->n_buckets;
458 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
463 table->n_buckets_used = 0;
464 table->n_entries = 0;
466 table->hasher = hasher;
467 table->comparator = comparator;
468 table->data_freer = data_freer;
470 table->free_entry_list = NULL;
472 obstack_init (&table->entry_stack);
477 /* Make all buckets empty, placing any chained entries on the free list.
478 Apply the user-specified function data_freer (if any) to the datas of any
482 hash_clear (Hash_table *table)
484 struct hash_entry *bucket;
485 struct hash_entry *cursor;
487 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
491 /* Free the bucket overflow. */
492 for (cursor = bucket->next; cursor; cursor = cursor->next)
494 if (table->data_freer)
495 (*table->data_freer) (cursor->data);
498 /* Relinking is done one entry at a time, as it is to be expected
499 that overflows are either rare or short. */
500 cursor->next = table->free_entry_list;
501 table->free_entry_list = cursor;
504 /* Free the bucket head. */
505 if (table->data_freer)
506 (*table->data_freer) (bucket->data);
512 table->n_buckets_used = 0;
513 table->n_entries = 0;
516 /* Reclaim all storage associated with an hash table. If a data_freer
517 function has been supplied by the user when the hash table was created,
518 this function applies it to the data of each entry before freeing that
522 hash_free (Hash_table *table)
524 struct hash_entry *bucket;
525 struct hash_entry *cursor;
526 struct hash_entry *next;
528 /* Call the user data_freer function. */
529 if (table->data_freer && table->n_entries)
531 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
535 for (cursor = bucket; cursor; cursor = cursor->next)
537 (*table->data_freer) (cursor->data);
545 obstack_free (&table->entry_stack, NULL);
549 /* Free all bucket overflowed entries. */
550 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
552 for (cursor = bucket->next; cursor; cursor = next)
559 /* Also reclaim the internal list of previously freed entries. */
560 for (cursor = table->free_entry_list; cursor; cursor = next)
568 /* Free the remainder of the hash table structure. */
569 free (table->bucket);
573 /* Insertion and deletion. */
575 /* Get a new hash entry for a bucket overflow, possibly by reclying a
576 previously freed one. If this is not possible, allocate a new one. */
578 static struct hash_entry *
579 allocate_entry (Hash_table *table)
581 struct hash_entry *new;
583 if (table->free_entry_list)
585 new = table->free_entry_list;
586 table->free_entry_list = new->next;
591 new = (struct hash_entry *)
592 obstack_alloc (&table->entry_stack, sizeof (struct hash_entry));
594 new = (struct hash_entry *) malloc (sizeof (struct hash_entry));
601 /* Free a hash entry which was part of some bucket overflow,
602 saving it for later recycling. */
605 free_entry (Hash_table *table, struct hash_entry *entry)
608 entry->next = table->free_entry_list;
609 table->free_entry_list = entry;
612 /* This private function is used to help with insertion and deletion. When
613 ENTRY matches an entry in the table, return a pointer to the corresponding
614 user data and set *BUCKET_HEAD to the head of the selected bucket.
615 Otherwise, return NULL. When DELETE is true and ENTRY matches an entry in
616 the table, unlink the matching entry. */
619 hash_find_entry (Hash_table *table, const void *entry,
620 struct hash_entry **bucket_head, bool delete)
622 struct hash_entry *bucket
623 = table->bucket + table->hasher (entry, table->n_buckets);
624 struct hash_entry *cursor;
626 assert (bucket < table->bucket_limit);
627 *bucket_head = bucket;
629 /* Test for empty bucket. */
630 if (bucket->data == NULL)
633 /* Check if then entry is found as the bucket head. */
634 if ((*table->comparator) (entry, bucket->data))
636 void *data = bucket->data;
642 struct hash_entry *next = bucket->next;
644 /* Bump the first overflow entry into the bucket head, then save
645 the previous first overflow entry for later recycling. */
647 free_entry (table, next);
658 /* Scan the bucket overflow. */
659 for (cursor = bucket; cursor->next; cursor = cursor->next)
661 if ((*table->comparator) (entry, cursor->next->data))
663 void *data = cursor->next->data;
667 struct hash_entry *next = cursor->next;
669 /* Unlink the entry to delete, then save the freed entry for later
671 cursor->next = next->next;
672 free_entry (table, next);
679 /* No entry found. */
683 /* For an already existing hash table, change the number of buckets and make
684 it NEW_TABLE_SIZE. The contents of the hash table are preserved. */
687 hash_rehash (Hash_table *table, unsigned int new_n_buckets)
689 Hash_table *new_table;
690 struct hash_entry *bucket;
691 struct hash_entry *cursor;
692 struct hash_entry *next;
694 if (table->n_buckets <= 0 || new_n_buckets == 0)
697 new_table = hash_initialize (new_n_buckets, table->hasher,
698 table->comparator, table->data_freer);
699 if (new_table == NULL)
702 /* Merely reuse the extra old space into the new table. */
704 obstack_free (&new_table->entry_stack, NULL);
705 new_table->entry_stack = table->entry_stack;
707 new_table->free_entry_list = table->free_entry_list;
709 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
713 for (cursor = bucket; cursor; cursor = next)
715 void *data = cursor->data;
716 struct hash_entry *new_bucket
717 = new_table->bucket + new_table->hasher (data, new_n_buckets);
719 assert (new_bucket < new_table->bucket_limit);
721 /* Free overflow entries as soon as possible, moving them from the
722 old hash table into the new one, as they may be needed now. */
724 if (cursor != bucket)
725 free_entry (new_table, cursor);
727 /* Insert the entry into the new hash table. */
728 if (new_bucket->data)
730 struct hash_entry *new_entry = allocate_entry (new_table);
732 if (new_entry == NULL)
735 new_entry->data = data;
736 new_entry->next = new_bucket->next;
737 new_bucket->next = new_entry;
741 new_bucket->data = data;
742 new_table->n_buckets_used++;
748 free (table->bucket);
749 table->bucket = new_table->bucket;
750 table->bucket_limit = new_table->bucket_limit;
751 table->n_buckets = new_table->n_buckets;
752 table->n_buckets_used = new_table->n_buckets_used;
753 /* table->n_entries already holds its value. */
755 table->entry_stack = new_table->entry_stack;
762 /* If ENTRY matches an entry already in the hash table, don't modify the table
763 and return the matched entry. Otherwise, insert ENTRY and return NULL.
764 *DONE is set to true in all cases, unless the storage required for
765 insertion cannot be allocated. */
768 hash_insert (Hash_table *table, const void *entry, bool *done)
771 struct hash_entry *bucket;
773 assert (entry); /* cannot insert a NULL data */
775 if (data = hash_find_entry (table, entry, &bucket, false), data)
781 /* ENTRY is not matched, it should be inserted. */
787 struct hash_entry *new_entry = allocate_entry (table);
789 if (new_entry == NULL)
795 /* Add ENTRY in the overflow of the bucket. */
797 new_entry->data = (void *) entry;
798 new_entry->next = bucket->next;
799 bucket->next = new_entry;
804 /* Add ENTRY right in the bucket head. */
806 bucket->data = (void *) entry;
807 table->n_buckets_used++;
809 /* If more than 80% of the buckets are in use, rehash the table two
810 times bigger. It's no real use checking the number of entries, as if
811 the hashing function is ill-conditioned, rehashing is not likely to
814 if (5 * table->n_buckets_used > 4 * table->n_buckets)
816 unsigned int new_n_buckets = next_prime (2 * table->n_buckets + 1);
818 *done = hash_rehash (table, new_n_buckets);
826 /* If ENTRY is already in the table, remove it and return the just-deleted
827 data (the user may want to deallocate its storage). If ENTRY is not in the
828 table, don't modify the table and return NULL. */
831 hash_delete (Hash_table *table, const void *entry)
834 struct hash_entry *bucket;
836 if (data = hash_find_entry (table, entry, &bucket, true), !data)
840 table->n_buckets_used--;
851 hash_print (const Hash_table *table)
853 struct hash_entry *bucket;
855 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
857 struct hash_entry *cursor;
860 printf ("%d:\n", slot);
862 for (cursor = bucket; cursor; cursor = cursor->next)
864 char *s = (char *) cursor->data;