src/libpspp/hmap.h

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2008 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /* Hash table with separate chaining.
  18
  19    This header (hmap.h) supplies an "embedded" implementation of
  20    a hash table that uses linked lists to resolve collisions
  21    ("separate chaining").  Its companion header (hmapx.h)
  22    supplies a "external" implementation that is otherwise
  23    similar.  The two variants are described briefly here.  The
  24    embedded variant, for which this is the header, is described
  25    in slightly more detail below.  Each function also has a
  26    detailed usage comment at its point of definition.  (Many of
  27    those definitions are inline in this file, because they are so
  28    simple.  Others are in hmap.c.)
  29
  30    The "hmap" embedded hash table implementation puts the hash
  31    table node (which includes the linked list used for resolving
  32    collisions) within the data structure that the hash table
  33    contains.  This makes allocation efficient, in space and time,
  34    because no additional call into an allocator is needed to
  35    obtain memory for the hash table node.  It also makes it easy
  36    to find the hash table node associated with a given object.
  37    However, it's difficult to include a given object in an
  38    arbitrary number of hash tables.
  39
  40    The "hmapx" external hash table implementation allocates hash
  41    table nodes separately from the objects in the hash table.
  42    Inserting and removing hash table elements requires dynamic
  43    allocation, so it is normally slower and takes more memory
  44    than the embedded implementation.  It also requires searching
  45    the table to find the node associated with a given object.
  46    However, it's easy to include a given object in an arbitrary
  47    number of hash tables.  It's also possible to create an
  48    external hash table without adding a member to the data
  49    structure that the hash table contains. */
  50
  51 #ifndef LIBPSPP_HMAP_H
  52 #define LIBPSPP_HMAP_H 1
  53
  54 /* Embedded hash table with separate chaining.
  55
  56    To create an embedded hash table, declare an instance of
  57    struct hmap, then initialize it with hmap_init():
  58      struct hmap map;
  59      hmap_init (&map);
  60    or, alternatively:
  61      struct hmap map = HMAP_INITIALIZER (map);
  62
  63    Each node in the hash table, presumably a structure type, must
  64    include a struct hmap_node member.  Here's an example:
  65      struct foo
  66        {
  67          struct hmap_node node;   // hmap_node member.
  68          const char *string;      // Another member.
  69        };
  70    The hash table functions work with pointers to struct
  71    hmap_node.  To obtain a pointer to your structure type given a
  72    pointer to struct hmap_node, use the HMAP_DATA macro.
  73
  74    Inserting and deleting elements is straightforward.  Use
  75    hmap_insert() to insert an element and hmap_delete() to delete
  76    an element, e.g.:
  77      struct foo my_foo;
  78      my_foo.string = "My string";
  79      hmap_insert (&map, &my_foo.node, hsh_hash_string (my_foo.string));
  80      ...
  81      hmap_delete (&map, &my_foo.node);
  82    You must pass the element's hash value as one of
  83    hmap_insert()'s arguments.  The hash table saves this hash
  84    value for use later to speed searches and to rehash as the
  85    hash table grows.
  86
  87    hmap_insert() does not check whether the newly inserted
  88    element duplicates an element already in the hash table.  The
  89    client is responsible for doing so, if this is desirable.
  90
  91    The hash table does not provide a direct way to search for an
  92    existing element.  Instead, it provides the means to iterate
  93    over all the elements in the hash table with a given hash
  94    value.  It is easy to compose a search function from such a
  95    building block.  For example:
  96      const struct foo *
  97      find_foo (const struct hmap *map, const char *name)
  98      {
  99        const struct foo *foo;
 100        size_t hash;
 101
 102        hash = hsh_hash_string (name);
 103        HMAP_FOR_EACH_WITH_HASH (foo, struct foo, node, hash, map)
 104          if (!strcmp (foo->name, name))
 105            break;
 106        return foo;
 107      }
 108
 109    Here is how to iterate through the elements currently in the
 110    hash table:
 111      struct foo *foo;
 112      HMAP_FOR_EACH (foo, struct foo, node, &map)
 113        {
 114          ...do something with foo...
 115        }
 116    */
 117
 118 #include <stddef.h>
 119
 120 /* Returns the data structure corresponding to the given NODE,
 121    assuming that NODE is embedded as the given MEMBER name in
 122    data type STRUCT.  NODE must not be a null pointer. */
 123 #define HMAP_DATA(NODE, STRUCT, MEMBER)                         \
 124   ((STRUCT *) ((char *) (NODE) - offsetof (STRUCT, MEMBER)))
 125
 126 /* Like HMAP_DATA, except that a null NODE yields a null pointer
 127    result. */
 128 #define HMAP_NULLABLE_DATA(NODE, STRUCT, MEMBER)        \
 129   hmap_nullable_data__ (NODE, offsetof (STRUCT, MEMBER))
 130
 131 /* Hash table node. */
 132 struct hmap_node
 133   {
 134     struct hmap_node *next;     /* Next in chain. */
 135     size_t hash;                /* Hash value. */
 136   };
 137
 138 static inline size_t hmap_node_hash (const struct hmap_node *);
 139
 140 /* Hash table. */
 141 struct hmap
 142   {
 143     size_t count;               /* Number of inserted nodes. */
 144     size_t mask;                /* Number of buckets (power of 2), minus 1. */
 145     struct hmap_node **buckets; /* Array of buckets. */
 146     struct hmap_node *one;      /* One bucket, to eliminate corner cases. */
 147   };
 148
 149 /* Suitable for use as the initializer for a struct hmap named
 150    MAP.  Typical usage:
 151        struct hmap map = HMAP_INITIALIZER (map);
 152    HMAP_INITIALIZER() is an alternative to hmap_init(). */
 153 #define HMAP_INITIALIZER(MAP) { 0, 0, &(MAP).one, NULL }
 154
 155 /* Creation and destruction. */
 156 void hmap_init (struct hmap *);
 157 void hmap_swap (struct hmap *, struct hmap *);
 158 void hmap_destroy (struct hmap *);
 159
 160 /* Storage management. */
 161 void hmap_reserve (struct hmap *, size_t capacity);
 162 void hmap_shrink (struct hmap *);
 163
 164 /* Search.  Refer to the large comment near the top of this file
 165    for an example.*/
 166 static inline struct hmap_node *hmap_first_with_hash (const struct hmap *,
 167                                                       size_t hash);
 168 static inline struct hmap_node *hmap_next_with_hash (const struct hmap_node *);
 169
 170 /* Insertion and deletion. */
 171 static inline void hmap_insert (struct hmap *, struct hmap_node *,
 172                                 size_t hash);
 173 static inline void hmap_insert_fast (struct hmap *, struct hmap_node *,
 174                                      size_t hash);
 175 static inline void hmap_delete (struct hmap *, struct hmap_node *);
 176
 177 /* Iteration. */
 178 static inline struct hmap_node *hmap_first (const struct hmap *);
 179 static inline struct hmap_node *hmap_next (const struct hmap *,
 180                                            const struct hmap_node *);
 181
 182 /* Counting. */
 183 static inline size_t hmap_count (const struct hmap *);
 184 static inline size_t hmap_capacity (const struct hmap *);
 185
 186 /* Updating data elements. */
 187 void hmap_changed (struct hmap *, struct hmap_node *, size_t new_hash);
 188 void hmap_moved (struct hmap *,
 189                  struct hmap_node *, const struct hmap_node *old);
 190
 191 /* Convenience macros for search.
 192
 193    These macros automatically use HMAP_DATA to obtain the data
 194    elements that encapsulate hmap nodes, which often saves typing
 195    and can make code easier to read.  Refer to the large comment
 196    near the top of this file for an example.
 197
 198    These macros evaluate HASH only once.  They evaluate their
 199    other arguments many times. */
 200 #define HMAP_FIRST_WITH_HASH(STRUCT, MEMBER, HMAP, HASH)                \
 201   HMAP_NULLABLE_DATA (hmap_first_with_hash (HMAP, HASH), STRUCT, MEMBER)
 202 #define HMAP_NEXT_WITH_HASH(DATA, STRUCT, MEMBER)                       \
 203   HMAP_NULLABLE_DATA (hmap_next_with_hash (&(DATA)->MEMBER), STRUCT, MEMBER)
 204 #define HMAP_FOR_EACH_WITH_HASH(DATA, STRUCT, MEMBER, HASH, HMAP)       \
 205   for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH);      \
 206        (DATA) != NULL;                                                  \
 207        (DATA) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER))
 208 #define HMAP_FOR_EACH_WITH_HASH_SAFE(DATA, NEXT, STRUCT, MEMBER, HASH, HMAP) \
 209   for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH);      \
 210        ((DATA) != NULL                                                  \
 211         ? ((NEXT) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER), 1)      \
 212         : 0);                                                           \
 213        (DATA) = (NEXT))
 214
 215 /* Convenience macros for iteration.
 216
 217    These macros automatically use HMAP_DATA to obtain the data
 218    elements that encapsulate hmap nodes, which often saves typing
 219    and can make code easier to read.  Refer to the large comment
 220    near the top of this file for an example.
 221
 222    These macros evaluate their arguments many times. */
 223 #define HMAP_FIRST(STRUCT, MEMBER, HMAP)                        \
 224   HMAP_NULLABLE_DATA (hmap_first (HMAP), STRUCT, MEMBER)
 225 #define HMAP_NEXT(DATA, STRUCT, MEMBER, HMAP)                           \
 226   HMAP_NULLABLE_DATA (hmap_next (HMAP, &(DATA)->MEMBER), STRUCT, MEMBER)
 227 #define HMAP_FOR_EACH(DATA, STRUCT, MEMBER, HMAP)       \
 228   for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP);      \
 229        (DATA) != NULL;                                  \
 230        (DATA) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP))
 231 #define HMAP_FOR_EACH_SAFE(DATA, NEXT, STRUCT, MEMBER, HMAP)    \
 232   for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP);              \
 233        ((DATA) != NULL                                          \
 234         ? ((NEXT) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP), 1)  \
 235         : 0);                                                   \
 236        (DATA) = (NEXT))
 237 \f
 238 /* Inline definitions. */
 239
 240 static inline struct hmap_node *hmap_find_hash__ (struct hmap_node *, size_t);
 241 static inline struct hmap_node *hmap_first_nonempty_bucket__ (
 242   const struct hmap *, size_t start);
 243 static inline size_t hmap_mask_to_capacity__ (size_t mask);
 244
 245 /* Returns the hash value associated with NODE. */
 246 size_t
 247 hmap_node_hash (const struct hmap_node *node)
 248 {
 249   return node->hash;
 250 }
 251
 252 /* Returns the first node in MAP that has hash value HASH, or a
 253    null pointer if MAP does not contain any node with that hash
 254    value.
 255
 256    Assuming uniform hashing and no duplicate data items in MAP,
 257    this function runs in constant time.  (Amortized over an
 258    iteration over all data items with a given HASH, its runtime
 259    is proportional to the length of the hash chain for HASH, so
 260    given a pathological hash function, e.g. one that returns a
 261    constant value, its runtime degenerates to linear in the
 262    length of NODE's hash chain.)
 263
 264    Nodes are returned in arbitrary order that may change whenever
 265    the hash table's current capacity changes, as reported by
 266    hmap_capacity().  Calls to hmap_insert(), hmap_reserve(), and
 267    hmap_shrink() can change the capacity of a hash map.
 268    Inserting a node with hmap_insert_fast() or deleting one with
 269    hmap_delete() will not change the relative ordering of nodes.
 270
 271    The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE
 272    macros provide convenient ways to iterate over all the nodes
 273    with a given hash.  The HMAP_FIRST_WITH_HASH macro is an
 274    interface to this particular function that is often more
 275    convenient. */
 276 static inline struct hmap_node *
 277 hmap_first_with_hash (const struct hmap *map, size_t hash)
 278 {
 279   return hmap_find_hash__ (map->buckets[hash & map->mask], hash);
 280 }
 281
 282 /* Returns the next node in MAP after NODE that has the same hash
 283    value as NODE, or a null pointer if MAP does not contain any
 284    more nodes with that hash value.
 285
 286    Assuming uniform hashing and no duplicate data items in MAP,
 287    this function runs in constant time.  (Amortized over an
 288    iteration over all data items with a given HASH, its runtime
 289    is proportional to the length of the hash chain for HASH, so
 290    given a pathological hash function, e.g. one that returns a
 291    constant value, its runtime degenerates to linear in the
 292    length of NODE's hash chain.)
 293
 294    Nodes are returned in arbitrary order that may change whenever
 295    the hash table's current capacity changes, as reported by
 296    hmap_capacity().  Calls to hmap_insert(), hmap_reserve(), and
 297    hmap_shrink() can change the capacity of a hash map.
 298    Inserting a node with hmap_insert_fast() or deleting one with
 299    hmap_delete() will not change the relative ordering of nodes.
 300
 301    The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE
 302    macros provide convenient ways to iterate over all the nodes
 303    with a given hash.  The HMAP_NEXT_WITH_HASH macro is an
 304    interface to this particular function that is often more
 305    convenient. */
 306 static inline struct hmap_node *
 307 hmap_next_with_hash (const struct hmap_node *node)
 308 {
 309   return hmap_find_hash__ (node->next, node->hash);
 310 }
 311
 312 /* Inserts NODE into MAP with hash value HASH.  If the insertion
 313    causes MAP's current capacity, as reported by hmap_capacity(),
 314    to be exceeded, rehashes MAP with an increased number of hash
 315    buckets.
 316
 317    This function runs in constant time amortized over all the
 318    insertions into MAP.
 319
 320    This function does not verify that MAP does not already
 321    contain a data item with the same value as NODE.  If
 322    duplicates should be disallowed (which is the usual case),
 323    then the client must check for duplicates itself before
 324    inserting the new node. */
 325 static inline void
 326 hmap_insert (struct hmap *map, struct hmap_node *node, size_t hash)
 327 {
 328   hmap_insert_fast (map, node, hash);
 329   if (map->count > hmap_capacity (map))
 330     hmap_reserve (map, map->count);
 331 }
 332
 333 /* Inserts NODE into MAP with hash value HASH.  Does not check
 334    whether this causes MAP's current capacity to be exceeded.
 335    The caller must take responsibility for that (or use
 336    hmap_insert() instead).
 337
 338    This function runs in constant time.
 339
 340    This function does not verify that MAP does not already
 341    contain a data item with the same value as NODE.  If
 342    duplicates should be disallowed (which is the usual case),
 343    then the client must check for duplicates itself before
 344    inserting the new node. */
 345 static inline void
 346 hmap_insert_fast (struct hmap *map, struct hmap_node *node, size_t hash)
 347 {
 348   struct hmap_node **bucket = &map->buckets[hash & map->mask];
 349   node->hash = hash;
 350   node->next = *bucket;
 351   *bucket = node;
 352   map->count++;
 353 }
 354
 355 /* Removes NODE from MAP.  The client is responsible for freeing
 356    any data associated with NODE, if necessary.
 357
 358    Assuming uniform hashing, this function runs in constant time.
 359    (Its runtime is proportional to the position of NODE in its
 360    hash chain, so given a pathological hash function, e.g. one
 361    that returns a constant value, its runtime degenerates to
 362    linear in the length of NODE's hash chain.)
 363
 364    This function never reduces the number of buckets in MAP.
 365    When one deletes a large number of nodes from a hash table,
 366    calling hmap_shrink() afterward may therefore save a small
 367    amount of memory.  It is also more expensive to iterate
 368    through a very sparse hash table than a denser one, so
 369    shrinking the hash table could also save some time.  However,
 370    rehashing has an immediate cost that must be weighed against
 371    these benefits.
 372
 373    hmap_delete() does not change NODE's hash value reported by
 374    hmap_node_hash(). */
 375 static inline void
 376 hmap_delete (struct hmap *map, struct hmap_node *node)
 377 {
 378   struct hmap_node **bucket = &map->buckets[node->hash & map->mask];
 379   while (*bucket != node)
 380     bucket = &(*bucket)->next;
 381   *bucket = (*bucket)->next;
 382   map->count--;
 383 }
 384
 385 /* Returns the first node in MAP, or a null pointer if MAP is
 386    empty.
 387
 388    Amortized over iterating through every data element in MAP,
 389    this function runs in constant time.  However, this assumes
 390    that MAP is not excessively sparse, that is, that
 391    hmap_capacity(MAP) is at most a constant factor greater than
 392    hmap_count(MAP).  This will always be true unless many nodes
 393    have been inserted into MAP and then most or all of them
 394    deleted; in such a case, calling hmap_shrink() is advised.
 395
 396    Nodes are returned in arbitrary order that may change whenever
 397    the hash table's current capacity changes, as reported by
 398    hmap_capacity().  Calls to hmap_insert(), hmap_reserve(), and
 399    hmap_shrink() can change the capacity of a hash map.
 400    Inserting a node with hmap_insert_fast() or deleting one with
 401    hmap_delete() will not change the relative ordering of nodes.
 402
 403    The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide
 404    convenient ways to iterate over all the nodes in a hash map.
 405    The HMAP_FIRST macro is an interface to this particular
 406    function that is often more convenient. */
 407 static inline struct hmap_node *
 408 hmap_first (const struct hmap *map)
 409 {
 410   return hmap_first_nonempty_bucket__ (map, 0);
 411 }
 412
 413 /* Returns the next node in MAP following NODE, or a null pointer
 414    if NODE is the last node in MAP.
 415
 416    Amortized over iterating through every data element in MAP,
 417    this function runs in constant time.  However, this assumes
 418    that MAP is not excessively sparse, that is, that
 419    hmap_capacity(MAP) is at most a constant factor greater than
 420    hmap_count(MAP).  This will always be true unless many nodes
 421    have been inserted into MAP and then most or all of them
 422    deleted; in such a case, calling hmap_shrink() is advised.
 423
 424    Nodes are returned in arbitrary order that may change whenever
 425    the hash table's current capacity changes, as reported by
 426    hmap_capacity().  Calls to hmap_insert(), hmap_reserve(), and
 427    hmap_shrink() can change the capacity of a hash map.
 428    Inserting a node with hmap_insert_fast() or deleting one with
 429    hmap_delete() will not change the relative ordering of nodes.
 430
 431    The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide
 432    convenient ways to iterate over all the nodes in a hash map.
 433    The HMAP_NEXT macro is an interface to this particular
 434    function that is often more convenient. */
 435 static inline struct hmap_node *
 436 hmap_next (const struct hmap *map, const struct hmap_node *node)
 437 {
 438   return (node->next != NULL
 439           ? node->next
 440           : hmap_first_nonempty_bucket__ (map, (node->hash & map->mask) + 1));
 441 }
 442
 443 /* Returns the number of data items currently in MAP. */
 444 static inline size_t
 445 hmap_count (const struct hmap *map)
 446 {
 447   return map->count;
 448 }
 449
 450 /* Returns the current capacity of MAP, that is, the maximum
 451    number of data elements that MAP may hold before it becomes
 452    advisable to rehash.
 453
 454    The capacity is advisory only: it is possible to insert any
 455    number of data elements into a hash map regardless of its
 456    capacity.  However, inserting many more elements than the
 457    map's capacity will degrade search performance. */
 458 static inline size_t
 459 hmap_capacity (const struct hmap *map)
 460 {
 461   return hmap_mask_to_capacity__ (map->mask);
 462 }
 463 \f
 464 /* Implementation details. */
 465
 466 /* Returns the first node at or after NODE in NODE's chain that
 467    has hash value HASH. */
 468 static inline struct hmap_node *
 469 hmap_find_hash__ (struct hmap_node *node, size_t hash)
 470 {
 471   for (; node != NULL; node = node->next)
 472     if (node->hash == hash)
 473       break;
 474   return node;
 475 }
 476
 477 /* Returns the first node in the lowest-numbered nonempty bucket
 478    in MAP whose index is START or higher, or a null pointer if
 479    all such buckets are empty. */
 480 static inline struct hmap_node *
 481 hmap_first_nonempty_bucket__ (const struct hmap *map, size_t start)
 482 {
 483   size_t i;
 484
 485   for (i = start; i <= map->mask; i++)
 486     if (map->buckets[i] != NULL)
 487       return map->buckets[i];
 488   return NULL;
 489 }
 490
 491 /* Returns the hash table capacity associated with a given MASK,
 492    which should be a value for the "mask" member of struct hmap.
 493    MASK must be a power of 2 minus 1 (including 0), that is, its
 494    value in binary must be all 1-bits.  */
 495 static inline size_t
 496 hmap_mask_to_capacity__ (size_t mask)
 497 {
 498   return (mask + 1) * 2;
 499 }
 500
 501 /* Helper for HMAP_NULLABLE_DATA (to avoid evaluating its NODE
 502    argument more than once).  */
 503 static inline void *
 504 hmap_nullable_data__ (struct hmap_node *node, size_t member_offset)
 505 {
 506   return node != NULL ? (char *) node - member_offset : NULL;
 507 }
 508
 509 #endif /* libpspp/hmap.h */