pintos-os.org Git - pspp/blob - src/libpspp/hmap.h

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2008, 2009, 2010 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /* Hash table with separate chaining.
  18
  19    This header (hmap.h) supplies an "embedded" implementation of
  20    a hash table that uses linked lists to resolve collisions
  21    ("separate chaining").  Its companion header (hmapx.h)
  22    supplies an "external" implementation that is otherwise
  23    similar.  The two variants are described briefly here.  The
  24    embedded variant, for which this is the header, is described
  25    in slightly more detail below.  Each function also has a
  26    detailed usage comment at its point of definition.  (Many of
  27    those definitions are inline in this file, because they are so
  28    simple.  Others are in hmap.c.)
  29
  30    The "hmap" embedded hash table implementation puts the hash
  31    table node (which includes the linked list used for resolving
  32    collisions) within the data structure that the hash table
  33    contains.  This makes allocation efficient, in space and time,
  34    because no additional call into an allocator is needed to
  35    obtain memory for the hash table node.  It also makes it easy
  36    to find the hash table node associated with a given object.
  37    However, it's difficult to include a given object in an
  38    arbitrary number of hash tables.
  39
  40    The "hmapx" external hash table implementation allocates hash
  41    table nodes separately from the objects in the hash table.
  42    Inserting and removing hash table elements requires dynamic
  43    allocation, so it is normally slower and takes more memory
  44    than the embedded implementation.  It also requires searching
  45    the table to find the node associated with a given object.
  46    However, it's easy to include a given object in an arbitrary
  47    number of hash tables.  It's also possible to create an
  48    external hash table without adding a member to the data
  49    structure that the hash table contains. */
  50
  51 #ifndef LIBPSPP_HMAP_H
  52 #define LIBPSPP_HMAP_H 1
  53
  54 /* Embedded hash table with separate chaining.
  55
  56    To create an embedded hash table, declare an instance of
  57    struct hmap, then initialize it with hmap_init():
  58      struct hmap map;
  59      hmap_init (&map);
  60    or, alternatively:
  61      struct hmap map = HMAP_INITIALIZER (map);
  62
  63    Each node in the hash table, presumably a structure type, must
  64    include a struct hmap_node member.  Here's an example:
  65      struct foo
  66        {
  67          struct hmap_node node;   // hmap_node member.
  68          const char *string;      // Another member.
  69        };
  70    The hash table functions work with pointers to struct
  71    hmap_node.  To obtain a pointer to your structure type given a
  72    pointer to struct hmap_node, use the HMAP_DATA macro.
  73
  74    Inserting and deleting elements is straightforward.  Use
  75    hmap_insert() to insert an element and hmap_delete() to delete
  76    an element, e.g.:
  77      struct foo my_foo;
  78      my_foo.string = "My string";
  79      hmap_insert (&map, &my_foo.node, hsh_hash_string (my_foo.string));
  80      ...
  81      hmap_delete (&map, &my_foo.node);
  82    You must pass the element's hash value as one of
  83    hmap_insert()'s arguments.  The hash table saves this hash
  84    value for use later to speed searches and to rehash as the
  85    hash table grows.
  86
  87    hmap_insert() does not check whether the newly inserted
  88    element duplicates an element already in the hash table.  The
  89    client is responsible for doing so, if this is desirable.
  90
  91    The hash table does not provide a direct way to search for an
  92    existing element.  Instead, it provides the means to iterate
  93    over all the elements in the hash table with a given hash
  94    value.  It is easy to compose a search function from such a
  95    building block.  For example:
  96      const struct foo *
  97      find_foo (const struct hmap *map, const char *name)
  98      {
  99        const struct foo *foo;
 100        size_t hash;
 101
 102        hash = hsh_hash_string (name);
 103        HMAP_FOR_EACH_WITH_HASH (foo, struct foo, node, hash, map)
  104          if (!strcmp (foo->string, name))
 105            break;
 106        return foo;
 107      }
 108
 109    Here is how to iterate through the elements currently in the
 110    hash table:
 111      struct foo *foo;
 112      HMAP_FOR_EACH (foo, struct foo, node, &map)
 113        {
 114          ...do something with foo...
 115        }
 116    */
 117
 118 #include <stdbool.h>
 119 #include <stddef.h>
 120 #include <libpspp/cast.h>
 121
 122 /* Returns the data structure corresponding to the given NODE,
 123    assuming that NODE is embedded as the given MEMBER name in
 124    data type STRUCT.  NODE must not be a null pointer. */
 125 #define HMAP_DATA(NODE, STRUCT, MEMBER)                         \
 126         (CHECK_POINTER_HAS_TYPE (NODE, struct hmap_node *),     \
 127          UP_CAST (NODE, STRUCT, MEMBER))
 128
 129 /* Like HMAP_DATA, except that a null NODE yields a null pointer
 130    result. */
 131 #define HMAP_NULLABLE_DATA(NODE, STRUCT, MEMBER)        \
 132   hmap_nullable_data__ (NODE, offsetof (STRUCT, MEMBER))
 133
 134 /* Hash table node. */
 135 struct hmap_node
 136   {
 137     struct hmap_node *next;     /* Next in chain. */
 138     size_t hash;                /* Hash value. */
 139   };
 140
 141 static inline size_t hmap_node_hash (const struct hmap_node *);
 142
 143 /* Hash table. */
 144 struct hmap
 145   {
 146     size_t count;               /* Number of inserted nodes. */
 147     size_t mask;                /* Number of buckets (power of 2), minus 1. */
 148     struct hmap_node **buckets; /* Array of buckets. */
 149     struct hmap_node *one;      /* One bucket, to eliminate corner cases. */
 150   };
 151
 152 /* Suitable for use as the initializer for a struct hmap named
 153    MAP.  Typical usage:
 154        struct hmap map = HMAP_INITIALIZER (map);
 155    HMAP_INITIALIZER() is an alternative to hmap_init(). */
 156 #define HMAP_INITIALIZER(MAP) { 0, 0, &(MAP).one, NULL }
 157
 158 /* Creation and destruction. */
 159 void hmap_init (struct hmap *);
 160 void hmap_swap (struct hmap *, struct hmap *);
 161 void hmap_clear (struct hmap *);
 162 void hmap_destroy (struct hmap *);
 163
 164 /* Storage management. */
 165 void hmap_reserve (struct hmap *, size_t capacity);
 166 void hmap_shrink (struct hmap *);
 167
 168 /* Search.  Refer to the large comment near the top of this file
 169    for an example.*/
 170 static inline struct hmap_node *hmap_first_with_hash (const struct hmap *,
 171                                                       size_t hash);
 172 static inline struct hmap_node *hmap_next_with_hash (const struct hmap_node *);
 173
 174 /* Insertion and deletion. */
 175 static inline void hmap_insert (struct hmap *, struct hmap_node *,
 176                                 size_t hash);
 177 static inline void hmap_insert_fast (struct hmap *, struct hmap_node *,
 178                                      size_t hash);
 179 static inline void hmap_delete (struct hmap *, struct hmap_node *);
 180
 181 /* Iteration. */
 182 static inline struct hmap_node *hmap_first (const struct hmap *);
 183 static inline struct hmap_node *hmap_next (const struct hmap *,
 184                                            const struct hmap_node *);
 185
 186 /* Counting. */
 187 static bool hmap_is_empty (const struct hmap *);
 188 static inline size_t hmap_count (const struct hmap *);
 189 static inline size_t hmap_capacity (const struct hmap *);
 190
 191 /* Updating data elements. */
 192 void hmap_changed (struct hmap *, struct hmap_node *, size_t new_hash);
 193 void hmap_moved (struct hmap *,
 194                  struct hmap_node *, const struct hmap_node *old);
 195
 196 /* Convenience macros for search.
 197
 198    These macros automatically use HMAP_DATA to obtain the data
 199    elements that encapsulate hmap nodes, which often saves typing
 200    and can make code easier to read.  Refer to the large comment
 201    near the top of this file for an example.
 202
 203    These macros evaluate HASH only once.  They evaluate their
 204    other arguments many times. */
 205 #define HMAP_FIRST_WITH_HASH(STRUCT, MEMBER, HMAP, HASH)                \
 206   HMAP_NULLABLE_DATA (hmap_first_with_hash (HMAP, HASH), STRUCT, MEMBER)
 207 #define HMAP_NEXT_WITH_HASH(DATA, STRUCT, MEMBER)                       \
 208   HMAP_NULLABLE_DATA (hmap_next_with_hash (&(DATA)->MEMBER), STRUCT, MEMBER)
 209 #define HMAP_FOR_EACH_WITH_HASH(DATA, STRUCT, MEMBER, HASH, HMAP)       \
 210   for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH);      \
 211        (DATA) != NULL;                                                  \
 212        (DATA) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER))
 213 #define HMAP_FOR_EACH_WITH_HASH_SAFE(DATA, NEXT, STRUCT, MEMBER, HASH, HMAP) \
 214   for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH);      \
 215        ((DATA) != NULL                                                  \
 216         ? ((NEXT) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER), 1)      \
 217         : 0);                                                           \
 218        (DATA) = (NEXT))
 219
 220 /* Convenience macros for iteration.
 221
 222    These macros automatically use HMAP_DATA to obtain the data
 223    elements that encapsulate hmap nodes, which often saves typing
 224    and can make code easier to read.  Refer to the large comment
 225    near the top of this file for an example.
 226
 227    These macros evaluate their arguments many times. */
 228 #define HMAP_FIRST(STRUCT, MEMBER, HMAP)                        \
 229   HMAP_NULLABLE_DATA (hmap_first (HMAP), STRUCT, MEMBER)
 230 #define HMAP_NEXT(DATA, STRUCT, MEMBER, HMAP)                           \
 231   HMAP_NULLABLE_DATA (hmap_next (HMAP, &(DATA)->MEMBER), STRUCT, MEMBER)
 232 #define HMAP_FOR_EACH(DATA, STRUCT, MEMBER, HMAP)       \
 233   for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP);      \
 234        (DATA) != NULL;                                  \
 235        (DATA) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP))
 236 #define HMAP_FOR_EACH_SAFE(DATA, NEXT, STRUCT, MEMBER, HMAP)    \
 237   for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP);              \
 238        ((DATA) != NULL                                          \
 239         ? ((NEXT) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP), 1)  \
 240         : 0);                                                   \
 241        (DATA) = (NEXT))
  242
 243 /* Inline definitions. */
 244
 245 static inline struct hmap_node *hmap_find_hash__ (struct hmap_node *, size_t);
 246 static inline struct hmap_node *hmap_first_nonempty_bucket__ (
 247   const struct hmap *, size_t start);
 248 static inline size_t hmap_mask_to_capacity__ (size_t mask);
 249
 250 /* Returns the hash value associated with NODE. */
 251 size_t
 252 hmap_node_hash (const struct hmap_node *node)
 253 {
 254   return node->hash;
 255 }
 256
 257 /* Returns the first node in MAP that has hash value HASH, or a
 258    null pointer if MAP does not contain any node with that hash
 259    value.
 260
 261    Assuming uniform hashing and no duplicate data items in MAP,
 262    this function runs in constant time.  (Amortized over an
 263    iteration over all data items with a given HASH, its runtime
 264    is proportional to the length of the hash chain for HASH, so
 265    given a pathological hash function, e.g. one that returns a
 266    constant value, its runtime degenerates to linear in the
 267    length of NODE's hash chain.)
 268
 269    Nodes are returned in arbitrary order that may change whenever
 270    the hash table's current capacity changes, as reported by
 271    hmap_capacity().  Calls to hmap_insert(), hmap_reserve(), and
 272    hmap_shrink() can change the capacity of a hash map.
 273    Inserting a node with hmap_insert_fast() or deleting one with
 274    hmap_delete() will not change the relative ordering of nodes.
 275
 276    The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE
 277    macros provide convenient ways to iterate over all the nodes
 278    with a given hash.  The HMAP_FIRST_WITH_HASH macro is an
 279    interface to this particular function that is often more
 280    convenient. */
 281 static inline struct hmap_node *
 282 hmap_first_with_hash (const struct hmap *map, size_t hash)
 283 {
 284   return hmap_find_hash__ (map->buckets[hash & map->mask], hash);
 285 }
 286
 287 /* Returns the next node in MAP after NODE that has the same hash
 288    value as NODE, or a null pointer if MAP does not contain any
 289    more nodes with that hash value.
 290
 291    Assuming uniform hashing and no duplicate data items in MAP,
 292    this function runs in constant time.  (Amortized over an
 293    iteration over all data items with a given HASH, its runtime
 294    is proportional to the length of the hash chain for HASH, so
 295    given a pathological hash function, e.g. one that returns a
 296    constant value, its runtime degenerates to linear in the
 297    length of NODE's hash chain.)
 298
 299    Nodes are returned in arbitrary order that may change whenever
 300    the hash table's current capacity changes, as reported by
 301    hmap_capacity().  Calls to hmap_insert(), hmap_reserve(), and
 302    hmap_shrink() can change the capacity of a hash map.
 303    Inserting a node with hmap_insert_fast() or deleting one with
 304    hmap_delete() will not change the relative ordering of nodes.
 305
 306    The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE
 307    macros provide convenient ways to iterate over all the nodes
 308    with a given hash.  The HMAP_NEXT_WITH_HASH macro is an
 309    interface to this particular function that is often more
 310    convenient. */
 311 static inline struct hmap_node *
 312 hmap_next_with_hash (const struct hmap_node *node)
 313 {
 314   return hmap_find_hash__ (node->next, node->hash);
 315 }
 316
 317 /* Inserts NODE into MAP with hash value HASH.  If the insertion
 318    causes MAP's current capacity, as reported by hmap_capacity(),
 319    to be exceeded, rehashes MAP with an increased number of hash
 320    buckets.
 321
 322    This function runs in constant time amortized over all the
 323    insertions into MAP.
 324
 325    This function does not verify that MAP does not already
 326    contain a data item with the same value as NODE.  If
 327    duplicates should be disallowed (which is the usual case),
 328    then the client must check for duplicates itself before
 329    inserting the new node. */
 330 static inline void
 331 hmap_insert (struct hmap *map, struct hmap_node *node, size_t hash)
 332 {
 333   hmap_insert_fast (map, node, hash);
 334   if (map->count > hmap_capacity (map))
 335     hmap_reserve (map, map->count);
 336 }
 337
 338 /* Inserts NODE into MAP with hash value HASH.  Does not check
 339    whether this causes MAP's current capacity to be exceeded.
 340    The caller must take responsibility for that (or use
 341    hmap_insert() instead).
 342
 343    This function runs in constant time.
 344
 345    This function does not verify that MAP does not already
 346    contain a data item with the same value as NODE.  If
 347    duplicates should be disallowed (which is the usual case),
 348    then the client must check for duplicates itself before
 349    inserting the new node. */
 350 static inline void
 351 hmap_insert_fast (struct hmap *map, struct hmap_node *node, size_t hash)
 352 {
 353   struct hmap_node **bucket = &map->buckets[hash & map->mask];
 354   node->hash = hash;
 355   node->next = *bucket;
 356   *bucket = node;
 357   map->count++;
 358 }
 359
 360 /* Removes NODE from MAP.  The client is responsible for freeing
 361    any data associated with NODE, if necessary.
 362
 363    Assuming uniform hashing, this function runs in constant time.
 364    (Its runtime is proportional to the position of NODE in its
 365    hash chain, so given a pathological hash function, e.g. one
 366    that returns a constant value, its runtime degenerates to
 367    linear in the length of NODE's hash chain.)
 368
 369    This function never reduces the number of buckets in MAP.
 370    When one deletes a large number of nodes from a hash table,
 371    calling hmap_shrink() afterward may therefore save a small
 372    amount of memory.  It is also more expensive to iterate
 373    through a very sparse hash table than a denser one, so
 374    shrinking the hash table could also save some time.  However,
 375    rehashing has an immediate cost that must be weighed against
 376    these benefits.
 377
 378    hmap_delete() does not change NODE's hash value reported by
 379    hmap_node_hash(). */
 380 static inline void
 381 hmap_delete (struct hmap *map, struct hmap_node *node)
 382 {
 383   struct hmap_node **bucket = &map->buckets[node->hash & map->mask];
 384   while (*bucket != node)
 385     bucket = &(*bucket)->next;
 386   *bucket = (*bucket)->next;
 387   map->count--;
 388 }
 389
 390 /* Returns the first node in MAP, or a null pointer if MAP is
 391    empty.
 392
 393    Amortized over iterating through every data element in MAP,
 394    this function runs in constant time.  However, this assumes
 395    that MAP is not excessively sparse, that is, that
 396    hmap_capacity(MAP) is at most a constant factor greater than
 397    hmap_count(MAP).  This will always be true unless many nodes
 398    have been inserted into MAP and then most or all of them
 399    deleted; in such a case, calling hmap_shrink() is advised.
 400
 401    Nodes are returned in arbitrary order that may change whenever
 402    the hash table's current capacity changes, as reported by
 403    hmap_capacity().  Calls to hmap_insert(), hmap_reserve(), and
 404    hmap_shrink() can change the capacity of a hash map.
 405    Inserting a node with hmap_insert_fast() or deleting one with
 406    hmap_delete() will not change the relative ordering of nodes.
 407
 408    The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide
 409    convenient ways to iterate over all the nodes in a hash map.
 410    The HMAP_FIRST macro is an interface to this particular
 411    function that is often more convenient. */
 412 static inline struct hmap_node *
 413 hmap_first (const struct hmap *map)
 414 {
 415   return hmap_first_nonempty_bucket__ (map, 0);
 416 }
 417
 418 /* Returns the next node in MAP following NODE, or a null pointer
 419    if NODE is the last node in MAP.
 420
 421    Amortized over iterating through every data element in MAP,
 422    this function runs in constant time.  However, this assumes
 423    that MAP is not excessively sparse, that is, that
 424    hmap_capacity(MAP) is at most a constant factor greater than
 425    hmap_count(MAP).  This will always be true unless many nodes
 426    have been inserted into MAP and then most or all of them
 427    deleted; in such a case, calling hmap_shrink() is advised.
 428
 429    Nodes are returned in arbitrary order that may change whenever
 430    the hash table's current capacity changes, as reported by
 431    hmap_capacity().  Calls to hmap_insert(), hmap_reserve(), and
 432    hmap_shrink() can change the capacity of a hash map.
 433    Inserting a node with hmap_insert_fast() or deleting one with
 434    hmap_delete() will not change the relative ordering of nodes.
 435
 436    The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide
 437    convenient ways to iterate over all the nodes in a hash map.
 438    The HMAP_NEXT macro is an interface to this particular
 439    function that is often more convenient. */
 440 static inline struct hmap_node *
 441 hmap_next (const struct hmap *map, const struct hmap_node *node)
 442 {
 443   return (node->next != NULL
 444           ? node->next
 445           : hmap_first_nonempty_bucket__ (map, (node->hash & map->mask) + 1));
 446 }
 447
 448 /* Returns true if MAP currently contains no data items, false
 449    otherwise. */
 450 static inline bool
 451 hmap_is_empty (const struct hmap *map)
 452 {
 453   return map->count == 0;
 454 }
 455
 456 /* Returns the number of data items currently in MAP. */
 457 static inline size_t
 458 hmap_count (const struct hmap *map)
 459 {
 460   return map->count;
 461 }
 462
 463 /* Returns the current capacity of MAP, that is, the maximum
 464    number of data elements that MAP may hold before it becomes
 465    advisable to rehash.
 466
 467    The capacity is advisory only: it is possible to insert any
 468    number of data elements into a hash map regardless of its
 469    capacity.  However, inserting many more elements than the
 470    map's capacity will degrade search performance. */
 471 static inline size_t
 472 hmap_capacity (const struct hmap *map)
 473 {
 474   return hmap_mask_to_capacity__ (map->mask);
 475 }
  476
 477 /* Implementation details. */
 478
 479 /* Returns the first node at or after NODE in NODE's chain that
 480    has hash value HASH. */
 481 static inline struct hmap_node *
 482 hmap_find_hash__ (struct hmap_node *node, size_t hash)
 483 {
 484   for (; node != NULL; node = node->next)
 485     if (node->hash == hash)
 486       break;
 487   return node;
 488 }
 489
 490 /* Returns the first node in the lowest-numbered nonempty bucket
 491    in MAP whose index is START or higher, or a null pointer if
 492    all such buckets are empty. */
 493 static inline struct hmap_node *
 494 hmap_first_nonempty_bucket__ (const struct hmap *map, size_t start)
 495 {
 496   size_t i;
 497
 498   for (i = start; i <= map->mask; i++)
 499     if (map->buckets[i] != NULL)
 500       return map->buckets[i];
 501   return NULL;
 502 }
 503
 504 /* Returns the hash table capacity associated with a given MASK,
 505    which should be a value for the "mask" member of struct hmap.
 506    MASK must be a power of 2 minus 1 (including 0), that is, its
 507    value in binary must be all 1-bits.  */
 508 static inline size_t
 509 hmap_mask_to_capacity__ (size_t mask)
 510 {
 511   return (mask + 1) * 2;
 512 }
 513
 514 /* Helper for HMAP_NULLABLE_DATA (to avoid evaluating its NODE
 515    argument more than once).  */
 516 static inline void *
 517 hmap_nullable_data__ (struct hmap_node *node, size_t member_offset)
 518 {
 519   return node != NULL ? (char *) node - member_offset : NULL;
 520 }
 521
 522 #endif /* libpspp/hmap.h */