From 38492a3d6a1b4400357fb18aa17ecef9d9066cbb Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 19 Nov 2012 21:38:18 -0800 Subject: [PATCH] hmap: Replace 'mask' by 'shift'. --- src/libpspp/hmap.c | 145 ++++++++++++++++++++++++-------------- src/libpspp/hmap.h | 140 +++++++++++++----------------------- tests/libpspp/hmap-test.c | 15 +++- 3 files changed, 156 insertions(+), 144 deletions(-) diff --git a/src/libpspp/hmap.c b/src/libpspp/hmap.c index 4436f8dfca..b0d8217a49 100644 --- a/src/libpspp/hmap.c +++ b/src/libpspp/hmap.c @@ -25,16 +25,16 @@ #include "gl/xalloc.h" -static size_t capacity_to_mask (size_t capacity); +static int capacity_to_shift (size_t capacity); /* Initializes MAP as a new hash map that is initially empty. */ void hmap_init (struct hmap *map) { + map->buckets = map->two; + map->two[0] = map->two[1] = NULL; map->count = 0; - map->mask = 0; - map->buckets = &map->one; - map->one = NULL; + map->shift = HMAP_MAX_SHIFT; } /* Exchanges the contents of hash maps A and B. */ @@ -44,10 +44,10 @@ hmap_swap (struct hmap *a, struct hmap *b) struct hmap tmp = *a; *a = *b; *b = tmp; - if (!a->mask) - a->buckets = &a->one; - if (!b->mask) - b->buckets = &b->one; + if (a->buckets == b->two) + a->buckets = a->two; + if (b->buckets == a->two) + b->buckets = b->two; } /* Removes all of the elements from MAP, without destroying MAP itself and @@ -57,7 +57,7 @@ hmap_clear (struct hmap *map) { size_t i; - for (i = 0; i <= map->mask; i++) + for (i = 0; i <= SIZE_MAX >> map->shift; i++) map->buckets[i] = NULL; map->count = 0; } @@ -72,46 +72,94 @@ hmap_clear (struct hmap *map) void hmap_destroy (struct hmap *map) { - if (map != NULL && map->buckets != &map->one) + if (map != NULL && map->buckets != map->two) free (map->buckets); } -/* Reallocates MAP's hash buckets so that NEW_MASK becomes the - hash value bit-mask used to choose a hash bucket, then +/* Inserts NODE into MAP with hash value HASH. If the insertion causes MAP's + current capacity, as reported by hmap_capacity(), to be exceeded, rehashes + MAP with an increased number of hash buckets. + + This function runs in constant time amortized over all the insertions into + MAP. + + This function does not verify that MAP does not already contain a data item + with the same value as NODE. If duplicates should be disallowed (which is + the usual case), then the client must check for duplicates itself before + inserting the new node. */ +void +hmap_insert (struct hmap *map, struct hmap_node *node, size_t hash) +{ + hmap_insert_fast (map, node, hash); + if (map->count > hmap_capacity (map)) + hmap_reserve (map, map->count); +} + +/* Inserts NODE into MAP with hash value HASH. Does not check whether this + causes MAP's current capacity to be exceeded. The caller must take + responsibility for that (or use hmap_insert() instead). + + This function runs in time linear in the number of nodes already in the + bucket. Given a good hash function, this yields constant time on average. + + This function does not verify that MAP does not already contain a data item + with the same value as NODE. If duplicates should be disallowed (which is + the usual case), then the client must check for duplicates itself before + inserting the new node. */ +void +hmap_insert_fast (struct hmap *map, struct hmap_node *node, size_t hash) +{ + struct hmap_node **bucket; + + for (bucket = &map->buckets[hash >> map->shift]; *bucket != NULL; + bucket = &(*bucket)->next) + if ((*bucket)->hash > hash) + break; + + node->hash = hash; + node->next = *bucket; + *bucket = node; + map->count++; +} + +/* Reallocates MAP's hash buckets so that NEW_SHIFT becomes MAP->shift and rehashes any data elements in MAP into the new hash buckets. - NEW_MASK must be a power of 2 minus 1 (including 0), that is, - its value in binary must be all 1-bits. */ + NEW_SHIFT must be between 2 and HMAP_MAX_SHIFT, inclusive. */ static void -hmap_rehash (struct hmap *map, size_t new_mask) +hmap_rehash (struct hmap *map, size_t new_shift) { struct hmap_node **new_buckets; struct hmap_node *node, *next; - assert ((new_mask & (new_mask + 1)) == 0); - if (new_mask) - new_buckets = xcalloc (new_mask + 1, sizeof *new_buckets); - else + assert (new_shift != map->shift); + + if (new_shift < HMAP_MAX_SHIFT) + new_buckets = xcalloc ((SIZE_MAX >> new_shift) + 1, sizeof *new_buckets); + else { - new_buckets = &map->one; - new_buckets[0] = NULL; + new_buckets = map->two; + new_buckets[0] = new_buckets[1] = NULL; } - + if (map->count > 0) { for (node = hmap_first (map); node != NULL; node = next) { - size_t new_idx = node->hash & new_mask; - struct hmap_node **new_bucket = &new_buckets[new_idx]; + struct hmap_node **bucket; + next = hmap_next (map, node); - node->next = *new_bucket; - *new_bucket = node; - } + for (bucket = &new_buckets[node->hash >> new_shift]; *bucket != NULL; + bucket = &(*bucket)->next) + continue; + *bucket = node; + node->next = NULL; + } } - if (map->buckets != &map->one) + if (map->buckets != map->two) free (map->buckets); map->buckets = new_buckets; - map->mask = new_mask; + map->shift = new_shift; } /* Ensures that MAP has sufficient space to store at least @@ -121,7 +169,7 @@ void hmap_reserve (struct hmap *map, size_t capacity) { if (capacity > hmap_capacity (map)) - hmap_rehash (map, capacity_to_mask (capacity)); + hmap_rehash (map, capacity_to_shift (capacity)); } /* Shrinks MAP's set of buckets to the minimum number needed to @@ -130,9 +178,9 @@ hmap_reserve (struct hmap *map, size_t capacity) void hmap_shrink (struct hmap *map) { - size_t new_mask = capacity_to_mask (map->count); - if (new_mask < map->mask) - hmap_rehash (map, new_mask); + size_t new_shift = capacity_to_shift (map->count); + if (new_shift > map->shift) + hmap_rehash (map, new_shift); } /* Moves NODE around in MAP to compensate for its hash value @@ -146,13 +194,11 @@ hmap_shrink (struct hmap *map) void hmap_changed (struct hmap *map, struct hmap_node *node, size_t new_hash) { - if ((new_hash ^ node->hash) & map->mask) + if (new_hash != node->hash) { hmap_delete (map, node); hmap_insert_fast (map, node, new_hash); } - else - node->hash = new_hash; } /* Hash map nodes may be moved around in memory as necessary, @@ -174,28 +220,23 @@ void hmap_moved (struct hmap *map, struct hmap_node *node, const struct hmap_node *old) { - struct hmap_node **p = &map->buckets[node->hash & map->mask]; + struct hmap_node **p = &map->buckets[node->hash >> map->shift]; while (*p != old) p = &(*p)->next; *p = node; } -/* Returns the minimum-value mask required to allow for a hash - table capacity of at least CAPACITY. The return value will be - a bit-mask suitable for use as the "mask" member of struct - hmap, that is, a power of 2 minus 1 (including 0). */ -static size_t -capacity_to_mask (size_t capacity) +/* Returns the highest "shift" value that allow for a hash table capacity of at + least CAPACITY. The return value will be between 2 and HMAP_MAX_SHIFT, + inclusive. */ +static int +capacity_to_shift (size_t capacity) { - /* Calculate the minimum mask necesary to support the given - capacity. */ - size_t mask = 0; - while (hmap_mask_to_capacity__ (mask) < capacity) - mask = (mask << 1) | 1; + int shift; - /* If the mask is nonzero, make it at least 3, because there is - little point in allocating an array of just 2 pointers. */ - mask |= (mask & 1) << 1; + for (shift = HMAP_MAX_SHIFT; shift > 2; shift--) + if (hmap_shift_to_capacity__ (shift) >= capacity) + break; - return mask; + return shift; } diff --git a/src/libpspp/hmap.h b/src/libpspp/hmap.h index 3f2858a8a8..89cb7d6827 100644 --- a/src/libpspp/hmap.h +++ b/src/libpspp/hmap.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -117,6 +117,7 @@ #include #include +#include #include "libpspp/cast.h" /* Returns the data structure corresponding to the given NODE, @@ -143,17 +144,27 @@ static inline size_t hmap_node_hash (const struct hmap_node *); /* Hash table. */ struct hmap { - size_t count; /* Number of inserted nodes. */ - size_t mask; /* Number of buckets (power of 2), minus 1. */ struct hmap_node **buckets; /* Array of buckets. */ - struct hmap_node *one; /* One bucket, to eliminate corner cases. */ + struct hmap_node *two[2]; /* Two buckets, to eliminate corner cases. */ + size_t count; /* Number of inserted nodes. */ + int shift; /* Bits to shift hash to get bucket index. */ }; +/* Maximum number of bits that a "size_t" value can be shifted in a single + operation without yielding undefined behavior. */ +#if SIZE_MAX == UINT32_MAX +#define HMAP_MAX_SHIFT 31 +#elif SIZE_MAX == UINT64_MAX +#define HMAP_MAX_SHIFT 63 +#else +#error "size_t is not 32 or 64 bits" +#endif + /* Suitable for use as the initializer for a struct hmap named MAP. Typical usage: struct hmap map = HMAP_INITIALIZER (map); HMAP_INITIALIZER() is an alternative to hmap_init(). */ -#define HMAP_INITIALIZER(MAP) { 0, 0, &(MAP).one, NULL } +#define HMAP_INITIALIZER(MAP) { (MAP).two, { NULL, NULL }, 0, HMAP_MAX_SHIFT } /* Creation and destruction. */ void hmap_init (struct hmap *); @@ -172,10 +183,8 @@ static inline struct hmap_node *hmap_first_with_hash (const struct hmap *, static inline struct hmap_node *hmap_next_with_hash (const struct hmap_node *); /* Insertion and deletion. */ -static inline void hmap_insert (struct hmap *, struct hmap_node *, - size_t hash); -static inline void hmap_insert_fast (struct hmap *, struct hmap_node *, - size_t hash); +void hmap_insert (struct hmap *, struct hmap_node *, size_t hash); +void hmap_insert_fast (struct hmap *, struct hmap_node *, size_t hash); static inline void hmap_delete (struct hmap *, struct hmap_node *); /* Iteration. */ @@ -269,7 +278,7 @@ void hmap_moved (struct hmap *, static inline struct hmap_node *hmap_find_hash__ (struct hmap_node *, size_t); static inline struct hmap_node *hmap_first_nonempty_bucket__ ( const struct hmap *, size_t start); -static inline size_t hmap_mask_to_capacity__ (size_t mask); +static inline size_t hmap_shift_to_capacity__ (int shift); /* Returns the hash value associated with NODE. */ static inline size_t @@ -290,12 +299,7 @@ hmap_node_hash (const struct hmap_node *node) constant value, its runtime degenerates to linear in the length of NODE's hash chain.) - Nodes are returned in arbitrary order that may change whenever - the hash table's current capacity changes, as reported by - hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and - hmap_shrink() can change the capacity of a hash map. - Inserting a node with hmap_insert_fast() or deleting one with - hmap_delete() will not change the relative ordering of nodes. + Nodes are returned in arbitrary but consistent order. The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE macros provide convenient ways to iterate over all the nodes @@ -305,7 +309,7 @@ hmap_node_hash (const struct hmap_node *node) static inline struct hmap_node * hmap_first_with_hash (const struct hmap *map, size_t hash) { - return hmap_find_hash__ (map->buckets[hash & map->mask], hash); + return hmap_find_hash__ (map->buckets[hash >> map->shift], hash); } /* Returns the next node in MAP after NODE that has the same hash @@ -320,12 +324,7 @@ hmap_first_with_hash (const struct hmap *map, size_t hash) constant value, its runtime degenerates to linear in the length of NODE's hash chain.) - Nodes are returned in arbitrary order that may change whenever - the hash table's current capacity changes, as reported by - hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and - hmap_shrink() can change the capacity of a hash map. - Inserting a node with hmap_insert_fast() or deleting one with - hmap_delete() will not change the relative ordering of nodes. + Nodes are returned in arbitrary but consistent order. The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE macros provide convenient ways to iterate over all the nodes @@ -338,59 +337,13 @@ hmap_next_with_hash (const struct hmap_node *node) return hmap_find_hash__ (node->next, node->hash); } -/* Inserts NODE into MAP with hash value HASH. If the insertion - causes MAP's current capacity, as reported by hmap_capacity(), - to be exceeded, rehashes MAP with an increased number of hash - buckets. - - This function runs in constant time amortized over all the - insertions into MAP. - - This function does not verify that MAP does not already - contain a data item with the same value as NODE. If - duplicates should be disallowed (which is the usual case), - then the client must check for duplicates itself before - inserting the new node. */ -static inline void -hmap_insert (struct hmap *map, struct hmap_node *node, size_t hash) -{ - hmap_insert_fast (map, node, hash); - if (map->count > hmap_capacity (map)) - hmap_reserve (map, map->count); -} - -/* Inserts NODE into MAP with hash value HASH. Does not check - whether this causes MAP's current capacity to be exceeded. - The caller must take responsibility for that (or use - hmap_insert() instead). - - This function runs in constant time. - - This function does not verify that MAP does not already - contain a data item with the same value as NODE. If - duplicates should be disallowed (which is the usual case), - then the client must check for duplicates itself before - inserting the new node. */ -static inline void -hmap_insert_fast (struct hmap *map, struct hmap_node *node, size_t hash) -{ - struct hmap_node **bucket = &map->buckets[hash & map->mask]; - node->hash = hash; - node->next = *bucket; - *bucket = node; - map->count++; -} - /* Returns the first node in MAP in the bucket for HASH, or a null pointer if that bucket in HASH is empty. This function runs in constant time. - Nodes are returned in arbitrary order that may change whenever the hash - table's current capacity changes, as reported by hmap_capacity(). Calls to - hmap_insert(), hmap_reserve(), and hmap_shrink() can change the capacity of - a hash map. Inserting a node with hmap_insert_fast() or deleting one with - hmap_delete() will not change the relative ordering of nodes. + Nodes in a bucket are sorted in increasing order of hash value. Nodes with + equal hashes are sorted in an arbitrary but consistent order. The HMAP_FOR_EACH_IN_BUCKET and HMAP_FOR_EACH_IN_BUCKET_SAFE macros provide convenient ways to iterate over all the nodes with a given hash. The @@ -399,7 +352,7 @@ hmap_insert_fast (struct hmap *map, struct hmap_node *node, size_t hash) static inline struct hmap_node * hmap_first_in_bucket (const struct hmap *map, size_t hash) { - return map->buckets[hash & map->mask]; + return map->buckets[hash >> map->shift]; } /* Returns the next node following NODE within the same bucket, or a null @@ -407,11 +360,8 @@ hmap_first_in_bucket (const struct hmap *map, size_t hash) This function runs in constant time. - Nodes are returned in arbitrary order that may change whenever the hash - table's current capacity changes, as reported by hmap_capacity(). Calls to - hmap_insert(), hmap_reserve(), and hmap_shrink() can change the capacity of - a hash map. Inserting a node with hmap_insert_fast() or deleting one with - hmap_delete() will not change the relative ordering of nodes. + Nodes in a bucket are sorted in increasing order of hash value. Nodes with + equal hashes are sorted in an arbitrary but consistent order. The HMAP_FOR_EACH_IN_BUCKET and HMAP_FOR_EACH_IN_BUCKET_SAFE macros provide convenient ways to iterate over all the nodes with a given hash. The @@ -446,7 +396,7 @@ hmap_next_in_bucket (const struct hmap_node *node) static inline void hmap_delete (struct hmap *map, struct hmap_node *node) { - struct hmap_node **bucket = &map->buckets[node->hash & map->mask]; + struct hmap_node **bucket = &map->buckets[node->hash >> map->shift]; while (*bucket != node) bucket = &(*bucket)->next; *bucket = (*bucket)->next; @@ -506,9 +456,13 @@ hmap_first (const struct hmap *map) static inline struct hmap_node * hmap_next (const struct hmap *map, const struct hmap_node *node) { - return (node->next != NULL - ? node->next - : hmap_first_nonempty_bucket__ (map, (node->hash & map->mask) + 1)); + if (node->next != NULL) + return node->next; + else + { + size_t bucket_idx = node->hash >> map->shift; + return hmap_first_nonempty_bucket__ (map, bucket_idx + 1); + } } /* Returns true if MAP currently contains no data items, false @@ -537,7 +491,14 @@ hmap_count (const struct hmap *map) static inline size_t hmap_capacity (const struct hmap *map) { - return hmap_mask_to_capacity__ (map->mask); + return hmap_shift_to_capacity__ (map->shift); +} + +/* Returns the number of buckets in MAP. */ +static inline size_t +hmap_n_buckets (const struct hmap *map) +{ + return (SIZE_MAX >> map->shift) + 1; } /* Implementation details. */ @@ -545,7 +506,7 @@ hmap_capacity (const struct hmap *map) /* Returns the first node at or after NODE in NODE's chain that has hash value HASH. */ static inline struct hmap_node * -hmap_find_hash__ (struct hmap_node *node, size_t hash) +hmap_find_hash__ (struct hmap_node *node, size_t hash) { for (; node != NULL; node = node->next) if (node->hash == hash) @@ -561,20 +522,19 @@ hmap_first_nonempty_bucket__ (const struct hmap *map, size_t start) { size_t i; - for (i = start; i <= map->mask; i++) + for (i = start; i <= SIZE_MAX >> map->shift; i++) if (map->buckets[i] != NULL) return map->buckets[i]; return NULL; } -/* Returns the hash table capacity associated with a given MASK, - which should be a value for the "mask" member of struct hmap. - MASK must be a power of 2 minus 1 (including 0), that is, its - value in binary must be all 1-bits. */ +/* Returns the hash table capacity associated with a given SHIFT, which should + be a value for the "shift" member of struct hmap. SHIFT must be between 2 + and HMAP_MAX_SHIFT, inclusive. */ static inline size_t -hmap_mask_to_capacity__ (size_t mask) +hmap_shift_to_capacity__ (int shift) { - return (mask + 1) * 2; + return ((size_t) 1 << HMAP_MAX_SHIFT) >> (shift - 2); } /* Helper for HMAP_NULLABLE_DATA (to avoid evaluating its NODE diff --git a/tests/libpspp/hmap-test.c b/tests/libpspp/hmap-test.c index c8619a0511..c7f4e5c884 100644 --- a/tests/libpspp/hmap-test.c +++ b/tests/libpspp/hmap-test.c @@ -287,7 +287,18 @@ typedef size_t hash_function (int data); static size_t identity_hash (int data) { - return data; + size_t hash; + int i; + + hash = 0; + for (i = 0; i < 32; i++) + if (data & (1u << i)) + { + size_t high_bit = (size_t) 1 << (sizeof (size_t) * CHAR_BIT - 1); + hash |= high_bit >> i; + } + + return hash; } static size_t @@ -684,7 +695,7 @@ test_insert_ordered (int max_elems, hash_function *hash) int max = INT_MIN; int j; - for (j = 0; j <= hmap.mask; j++) + for (j = 0; j < hmap_n_buckets (&hmap); j++) { int count = 0; struct hmap_node *node; -- 2.30.2