1 /* hash - hashing table processing.
2 Copyright (C) 1998 Free Software Foundation, Inc.
3 Written by Jim Meyering, 1992.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 /* A generic hash table package. */
21 /* Define USE_OBSTACK to 1 if you want the allocator to use obstacks instead
22 of malloc. If you change USE_OBSTACK, you have to recompile! */
33 typedef enum {false = 0, true = 1} bool;
47 # ifndef obstack_chunk_alloc
48 # define obstack_chunk_alloc malloc
50 # ifndef obstack_chunk_free
51 # define obstack_chunk_free free
57 /* An hash table contains many internal entries, each holding a pointer to
58 some user provided data (also called a user entry). An entry indistinctly
59 refers to both the internal entry and its associated user entry. A user
60 entry contents may be hashed by a randomisation function (the hashing
61 function, or just `hasher' for short) into a number (or `slot') between 0
62 and the current table size. At each slot position in the hash table,
63 starts a linked chain of entries for which the user data all hash to this
64 slot. A bucket is the collection of all entries hashing to the same slot.
66 A good `hasher' function will distribute entries rather evenly in buckets.
67 In the ideal case, the length of each bucket is roughly the number of
68 entries divided by the table size. Finding the slot for a data is usually
69 done at constant speed by the `hasher', and the later finding of a precise
70 entry is linear in time with the size of the bucket. Consequently, a
71 bigger hash table size (that is, a bigger number of buckets) is prone to
72 yielding shorter buckets, *given* the `hasher' function behaves properly.
74 Long buckets slow down the lookup algorithm. One might use big hash table
75 sizes in hope to reduce the average length of buckets, but this might
76 become inordinate, as unused slots in the hash table take some space. The
77 best bet is to make sure you are using a good `hasher' function (beware
78 that those are not that easy to write! :-), and to use a table size at
79 least bigger than the actual number of entries.
81 Currently, whenever the addition of an entry gets 80% of buckets to be
82 non-empty, this package automatically doubles the number of buckets. */
84 /* Information and lookup. */
86 /* The following few functions provide information about the overall hash
87 table organisation: the number of entries, number of buckets and maximum
90 /* Return the number of buckets in the hash table. The table size, the total
91 number of buckets (used plus unused), or the maximum number of slots, are
95 hash_get_n_buckets (const Hash_table *table)
97 return table->n_buckets;
100 /* Return the number of slots in use (non-empty buckets). */
103 hash_get_n_buckets_used (const Hash_table *table)
105 return table->n_buckets_used;
108 /* Return the number of active entries. */
111 hash_get_n_entries (const Hash_table *table)
113 return table->n_entries;
116 /* Return the length of the most lenghty chain (bucket). */
119 hash_get_max_bucket_length (const Hash_table *table)
121 struct hash_entry *bucket;
122 unsigned int max_bucket_length = 0;
124 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
127 struct hash_entry *cursor = bucket;
128 unsigned int bucket_length = 1;
130 while (cursor = cursor->next, cursor)
133 if (bucket_length > max_bucket_length)
134 max_bucket_length = bucket_length;
137 return max_bucket_length;
140 /* Do a mild validation of an hash table, by traversing it and checking two
144 hash_table_ok (const Hash_table *table)
146 struct hash_entry *bucket;
147 unsigned int n_buckets_used = 0;
148 unsigned int n_entries = 0;
150 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
153 struct hash_entry *cursor = bucket;
155 /* Count bucket head. */
159 /* Count bucket overflow. */
160 while (cursor = cursor->next, cursor)
164 if (n_buckets_used == table->n_buckets_used && n_entries == table->n_entries)
171 hash_print_statistics (const Hash_table *table, FILE *stream)
173 unsigned int n_entries = hash_get_n_entries (table);
174 unsigned int n_buckets = hash_get_n_buckets (table);
175 unsigned int n_buckets_used = hash_get_n_buckets_used (table);
176 unsigned int max_bucket_length = hash_get_max_bucket_length (table);
178 fprintf (stream, "# entries: %u\n", n_entries);
179 fprintf (stream, "# buckets: %u\n", n_buckets);
180 fprintf (stream, "# buckets used: %u (%.2f%%)\n", n_buckets_used,
181 (100.0 * n_buckets_used) / n_buckets);
182 fprintf (stream, "max bucket length: %u\n", max_bucket_length);
185 /* Return the user entry from the hash table, if some entry in the hash table
186 compares equally with ENTRY, or NULL otherwise. */
189 hash_lookup (const Hash_table *table, const void *entry)
191 struct hash_entry *bucket
192 = table->bucket + table->hasher (entry, table->n_buckets);
193 struct hash_entry *cursor;
195 assert (bucket < table->bucket_limit);
197 if (bucket->data == NULL)
200 for (cursor = bucket; cursor; cursor = cursor->next)
201 if (table->comparator (entry, cursor->data))
209 /* The functions in this page traverse the hash table and process the
210 contained entries. For the traversal to work properly, the hash table
211 should not be resized nor modified while any particular entry is being
212 processed. In particular, entries should not be added or removed. */
214 /* Return the first data in the table, or NULL if the table is empty. */
217 hash_get_first (const Hash_table *table)
219 struct hash_entry *bucket;
221 if (table->n_entries == 0)
224 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
231 /* Return the user data for the entry following ENTRY, where ENTRY has been
232 returned by a previous call to either `hash_get_first' or `hash_get_next'.
233 Return NULL if there is no more entries. */
236 hash_get_next (const Hash_table *table, const void *entry)
238 struct hash_entry *bucket
239 = table->bucket + table->hasher (entry, table->n_buckets);
240 struct hash_entry *cursor;
242 assert (bucket < table->bucket_limit);
244 /* Find next entry in the same bucket. */
245 for (cursor = bucket; cursor; cursor = cursor->next)
246 if (cursor->data == entry && cursor->next)
247 return cursor->next->data;
249 /* Find first entry in any subsequent bucket. */
250 for (; bucket < table->bucket_limit; bucket++)
258 /* Fill BUFFER with pointers to active user entries in the hash table, then
259 return the number of pointers copied. Do not copy more than BUFFER_SIZE
263 hash_get_entries (const Hash_table *table, void **buffer, unsigned int buffer_size)
265 unsigned int counter = 0;
266 struct hash_entry *bucket;
267 struct hash_entry *cursor;
269 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
271 for (cursor = bucket; cursor; cursor = cursor->next)
273 if (counter >= buffer_size)
275 buffer[counter++] = cursor->data;
281 /* Call a PROCESSOR function for each entry of an hash table, and return the
282 number of entries for which the processor function returned success. A
283 pointer to some PROCESSOR_DATA which will be made available to each call to
284 the processor function. The PROCESSOR accepts two arguments: the first is
285 the user entry being walked into, the second is the value of PROCESSOR_DATA
286 as received. The walking continue for as long as the PROCESSOR function
287 returns nonzero. When it returns zero, the walking is interrupted. */
290 hash_do_for_each (const Hash_table *table, Hash_processor processor,
291 void *processor_data)
293 unsigned int counter = 0;
294 struct hash_entry *bucket;
295 struct hash_entry *cursor;
297 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
299 for (cursor = bucket; cursor; cursor = cursor->next)
301 if (!(*processor) (cursor->data, processor_data))
309 /* Allocation and clean-up. */
311 /* Return a hash index for a NUL-terminated STRING between 0 and N_BUCKETS-1.
312 This is a convenience routine for constructing other hashing functions. */
316 /* About hashings, Paul Eggert writes to me (FP), on 1994-01-01: "Please see
317 B. J. McKenzie, R. Harries & T. Bell, Selecting a hashing algorithm,
318 Software--practice & experience 20, 2 (Feb 1990), 209-224. Good hash
319 algorithms tend to be domain-specific, so what's good for [diffutils'] io.c
320 may not be good for your application." */
323 hash_string (const char *string, unsigned int n_buckets)
328 # define ROTATE_LEFT(Value, Shift) \
329 ((Value) << (Shift) | (Value) >> ((sizeof (unsigned) * CHAR_BIT) - (Shift)))
330 # define HASH_ONE_CHAR(Value, Byte) \
331 ((Byte) + ROTATE_LEFT (Value, 7))
335 for (; *string; string++)
336 value = HASH_ONE_CHAR (value, *(const unsigned char *) string);
337 return value % n_buckets;
340 # undef HASH_ONE_CHAR
343 #else /* not USE_DIFF_HASH */
345 /* This one comes from `recode', and performs a bit better than the above as
346 per a few experiments. It is inspired from a hashing routine found in the
347 very old Cyber `snoop', itself written in typical Greg Mansfield style.
348 (By the way, what happened to this excellent man? Is he still alive?) */
351 hash_string (const char *string, unsigned int n_buckets)
356 value = ((value * 31 + (int) *(const unsigned char *) string++)
361 #endif /* not USE_DIFF_HASH */
363 /* Return true if CANDIDATE is a prime number. CANDIDATE should be an odd
364 number at least equal to 11. */
368 unsigned long candidate;
370 unsigned long divisor = 3;
371 unsigned long square = divisor * divisor;
373 while (square < candidate && (candidate % divisor))
376 square += 4 * divisor;
380 return candidate % divisor != 0;
383 /* Round a given CANDIDATE number up to the nearest prime, and return that
384 prime. CANDIDATE should be at least equal to 10. */
387 next_prime (candidate)
388 unsigned long candidate;
390 /* Make it definitely odd. */
393 while (!is_prime (candidate))
399 /* Allocate and return a new hash table, or NULL if an error is met. The
400 initial number of buckets would be at least CANDIDATE (which need not be prime).
402 If DATA_FREER is not NULL, this function may be later called with the data as
403 an argument, just before they entry containing the data gets freed. The
404 HASHER function should be supplied, and FIXME. The COMPARATOR function
405 should also be supplied, and FIXME. */
407 /* User-supplied function for freeing datas. It is specified in
408 hash_initialize. If non-null, it is used by hash_free and hash_clear.
409 You should specify `free' here only if you want these functions to free
410 all of your `data' data. This is typically the case when your data is
411 simply an auxilliary struct that you have malloc'd to aggregate several
414 /* User-supplied hash function that hashes entry ENTRY to an integer in
415 the range 0..TABLE_SIZE-1. */
417 /* User-supplied function that determines whether a new entry is unique by
418 comparing the new entry to entries that hashed to the same bucket
419 index. It should return zero for a pair of entries that compare equal,
420 non-zero otherwise. */
423 hash_initialize (unsigned int candidate, Hash_hasher hasher,
424 Hash_comparator comparator, Hash_data_freer data_freer)
427 struct hash_entry *bucket;
429 if (hasher == NULL || comparator == NULL)
432 table = (Hash_table *) malloc (sizeof (Hash_table));
436 table->n_buckets = next_prime (candidate < 10 ? 10 : candidate);
437 table->bucket = (struct hash_entry *)
438 malloc (table->n_buckets * sizeof (struct hash_entry));
439 if (table->bucket == NULL)
444 table->bucket_limit = table->bucket + table->n_buckets;
446 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
451 table->n_buckets_used = 0;
452 table->n_entries = 0;
454 table->hasher = hasher;
455 table->comparator = comparator;
456 table->data_freer = data_freer;
458 table->free_entry_list = NULL;
460 obstack_init (&table->entry_stack);
465 /* Make all buckets empty, placing any chained entries on the free list.
466 Apply the user-specified function data_freer (if any) to the datas of any
470 hash_clear (Hash_table *table)
472 struct hash_entry *bucket;
473 struct hash_entry *cursor;
475 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
478 /* Free the bucket overflow. */
479 for (cursor = bucket->next; cursor; cursor = cursor->next)
481 if (table->data_freer)
482 (*table->data_freer) (cursor->data);
485 /* Relinking is done one entry at a time, as it is to be expected
486 that overflows are either rare or short. */
487 cursor->next = table->free_entry_list;
488 table->free_entry_list = cursor;
491 /* Free the bucket head. */
492 if (table->data_freer)
493 (*table->data_freer) (bucket->data);
498 table->n_buckets_used = 0;
499 table->n_entries = 0;
502 /* Reclaim all storage associated with an hash table. If a data_freer
503 function has been supplied by the user when the hash table was created,
504 this function applies it to the data of each entry before freeing that
508 hash_free (Hash_table *table)
510 struct hash_entry *bucket;
511 struct hash_entry *cursor;
512 struct hash_entry *next;
514 /* Call the user data_freer function. */
515 if (table->data_freer && table->n_entries)
516 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
518 for (cursor = bucket; cursor; cursor = cursor->next)
519 (*table->data_freer) (cursor->data);
523 obstack_free (&table->entry_stack, NULL);
527 /* Free all bucket overflowed entries. */
528 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
529 for (cursor = bucket->next; cursor; cursor = next)
535 /* Also reclaim the internal list of previously freed entries. */
536 for (cursor = table->free_entry_list; cursor; cursor = next)
544 /* Free the remainder of the hash table structure. */
545 free (table->bucket);
549 /* Insertion and deletion. */
551 /* Get a new hash entry for a bucket overflow, possibly by reclying a
552 previously freed one. If this is not possible, allocate a new one. */
554 static struct hash_entry *
555 allocate_entry (Hash_table *table)
557 struct hash_entry *new;
559 if (table->free_entry_list)
561 new = table->free_entry_list;
562 table->free_entry_list = new->next;
567 new = (struct hash_entry *)
568 obstack_alloc (&table->entry_stack, sizeof (struct hash_entry));
570 new = (struct hash_entry *) malloc (sizeof (struct hash_entry));
577 /* Free a hash entry which was part of some bucket overflow,
578 saving it for later recycling. */
581 free_entry (Hash_table *table, struct hash_entry *entry)
584 entry->next = table->free_entry_list;
585 table->free_entry_list = entry;
588 /* This private function is used to help with insertion and deletion. When
589 ENTRY matches an entry in the table, return a pointer to the corresponding
590 user data and set *BUCKET_HEAD to the head of the selected bucket.
591 Otherwise, return NULL. When DELETE is true and ENTRY matches an entry in
592 the table, unlink the matching entry. */
595 hash_find_entry (Hash_table *table, const void *entry,
596 struct hash_entry **bucket_head, bool delete)
598 struct hash_entry *bucket
599 = table->bucket + table->hasher (entry, table->n_buckets);
600 struct hash_entry *cursor;
602 assert (bucket < table->bucket_limit);
603 *bucket_head = bucket;
605 /* Test for empty bucket. */
606 if (bucket->data == NULL)
609 /* Check if then entry is found as the bucket head. */
610 if ((*table->comparator) (entry, bucket->data))
612 void *data = bucket->data;
617 struct hash_entry *next = bucket->next;
619 /* Bump the first overflow entry into the bucket head, then save
620 the previous first overflow entry for later recycling. */
622 free_entry (table, next);
630 /* Scan the bucket overflow. */
631 for (cursor = bucket; cursor->next; cursor = cursor->next)
632 if ((*table->comparator) (entry, cursor->next->data))
634 void *data = cursor->next->data;
638 struct hash_entry *next = cursor->next;
640 /* Unlink the entry to delete, then save the freed entry for later
642 cursor->next = next->next;
643 free_entry (table, next);
649 /* No entry found. */
653 /* For an already existing hash table, change the number of buckets and make
654 it NEW_TABLE_SIZE. The contents of the hash table are preserved. */
657 hash_rehash (Hash_table *table, unsigned int new_n_buckets)
659 Hash_table *new_table;
660 struct hash_entry *bucket;
661 struct hash_entry *cursor;
662 struct hash_entry *next;
664 if (table->n_buckets <= 0 || new_n_buckets == 0)
667 new_table = hash_initialize (new_n_buckets, table->hasher,
668 table->comparator, table->data_freer);
669 if (new_table == NULL)
672 /* Merely reuse the extra old space into the new table. */
674 obstack_free (&new_table->entry_stack, NULL);
675 new_table->entry_stack = table->entry_stack;
677 new_table->free_entry_list = table->free_entry_list;
679 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
681 for (cursor = bucket; cursor; cursor = next)
683 void *data = cursor->data;
684 struct hash_entry *new_bucket
685 = new_table->bucket + new_table->hasher (data, new_n_buckets);
687 assert (new_bucket < new_table->bucket_limit);
689 /* Free overflow entries as soon as possible, moving them from the
690 old hash table into the new one, as they may be needed now. */
692 if (cursor != bucket)
693 free_entry (new_table, cursor);
695 /* Insert the entry into the new hash table. */
696 if (new_bucket->data)
698 struct hash_entry *new_entry = allocate_entry (new_table);
700 if (new_entry == NULL)
703 new_entry->data = data;
704 new_entry->next = new_bucket->next;
705 new_bucket->next = new_entry;
709 new_bucket->data = data;
710 new_table->n_buckets_used++;
714 free (table->bucket);
715 table->bucket = new_table->bucket;
716 table->bucket_limit = new_table->bucket_limit;
717 table->n_buckets = new_table->n_buckets;
718 table->n_buckets_used = new_table->n_buckets_used;
719 /* table->n_entries already holds its value. */
721 table->entry_stack = new_table->entry_stack;
728 /* If ENTRY matches an entry already in the hash table, don't modify the table
729 and return the matched entry. Otherwise, insert ENTRY and return NULL.
730 *DONE is set to true in all cases, unless the storage required for
731 insertion cannot be allocated. */
734 hash_insert (Hash_table *table, const void *entry, bool *done)
737 struct hash_entry *bucket;
739 assert (entry); /* cannot insert a NULL data */
741 if (data = hash_find_entry (table, entry, &bucket, false), data)
747 /* ENTRY is not matched, it should be inserted. */
753 struct hash_entry *new_entry = allocate_entry (table);
755 if (new_entry == NULL)
761 /* Add ENTRY in the overflow of the bucket. */
763 new_entry->data = (void *) entry;
764 new_entry->next = bucket->next;
765 bucket->next = new_entry;
770 /* Add ENTRY right in the bucket head. */
772 bucket->data = (void *) entry;
773 table->n_buckets_used++;
775 /* If more than 80% of the buckets are in use, rehash the table two
776 times bigger. It's no real use checking the number of entries, as if
777 the hashing function is ill-conditioned, rehashing is not likely to
780 if (5 * table->n_buckets_used > 4 * table->n_buckets)
782 unsigned int new_n_buckets = next_prime (2 * table->n_buckets + 1);
784 *done = hash_rehash (table, new_n_buckets);
792 /* If ENTRY is already in the table, remove it and return the just-deleted
793 data (the user may want to deallocate its storage). If ENTRY is not in the
794 table, don't modify the table and return NULL. */
797 hash_delete (Hash_table *table, const void *entry)
800 struct hash_entry *bucket;
802 if (data = hash_find_entry (table, entry, &bucket, true), !data)
806 table->n_buckets_used--;
817 hash_print (const Hash_table *table)
819 struct hash_entry *bucket;
821 for (bucket = table->bucket; bucket < table->bucket_limit; bucket++)
823 struct hash_entry *cursor;
826 printf ("%d:\n", slot);
828 for (cursor = bucket; cursor; cursor = cursor->next)
830 char *s = (char *) cursor->data;