From 2c411d651e22704f60f117d944b9380a07d247fe Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Wed, 8 Apr 2009 21:55:31 -0700 Subject: [PATCH] Use Bob Jenkins lookup3 hash instead of FNV. The Jenkins lookup3 hash is superior to FNV in collision resistance, avalanching, and performance on systems that do not have fast multiplication. It also provides a good way to combine the result of a previous hashing step with the current hash, using its "basis" argument. This commit replaces the PSPP implementation of FNV with the Jenkins lookup3 hash and updates all the current users. In addition, John Darrington pointed out that commit dd2e61b4a "Make create_iconv() properly distinguish converters by name" unintentionally introduced gratuitous hash collisions, by causing all converters where tocode and fromcode were the same to hash to value 0, and converters where tocode and fromcode were swapped to hash to the same value as each other. Using the "basis" argument to the Jenkins hash properly, instead of just attempting to combine hash values with XOR, fixes this problem. --- src/data/attributes.c | 6 +- src/data/file-handle-def.c | 12 +- src/data/file-name.c | 6 +- src/data/short-names.c | 8 +- src/data/value-labels.c | 8 +- src/data/value.c | 6 +- src/data/variable.c | 4 +- src/language/stats/autorecode.c | 4 +- src/language/stats/crosstabs.q | 2 +- src/libpspp/hash-functions.c | 196 +++++++++++++++++++++++--------- src/libpspp/hash-functions.h | 12 +- src/libpspp/i18n.c | 2 +- src/math/covariance-matrix.c | 11 +- 13 files changed, 180 insertions(+), 97 deletions(-) diff --git a/src/data/attributes.c b/src/data/attributes.c index aa12829084..d99e945b01 100644 --- a/src/data/attributes.c +++ b/src/data/attributes.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2008 Free Software Foundation, Inc. + Copyright (C) 2008, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -220,7 +220,7 @@ attrset_lookup (struct attrset *set, const char *name) { struct attribute *attr; HMAP_FOR_EACH_WITH_HASH (attr, struct attribute, node, - hsh_hash_case_string (name), &set->map) + hash_case_string (name, 0), &set->map) if (!strcasecmp (attribute_get_name (attr), name)) break; return attr; @@ -234,7 +234,7 @@ attrset_add (struct attrset *set, struct attribute *attr) { const char *name = attribute_get_name (attr); assert (attrset_lookup (set, name) == NULL); - hmap_insert (&set->map, &attr->node, hsh_hash_case_string (name)); + hmap_insert (&set->map, &attr->node, hash_case_string (name, 0)); } /* Deletes any attribute from SET that matches NAME diff --git a/src/data/file-handle-def.c b/src/data/file-handle-def.c index 5d807ab9a0..0652501faa 100644 --- a/src/data/file-handle-def.c +++ b/src/data/file-handle-def.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -592,10 +592,12 @@ static unsigned int hash_fh_lock (const void *lock_, const void *aux UNUSED) { const struct fh_lock *lock = lock_; - unsigned int hash = hsh_hash_int ((lock->referent << 3) | lock->access); + unsigned int basis; if (lock->referent == FH_REF_FILE) - hash ^= fn_hash_identity (lock->u.file); + basis = fn_hash_identity (lock->u.file); else if (lock->referent == FH_REF_SCRATCH) - hash ^= hsh_hash_int (lock->u.unique_id); - return hash; + basis = lock->u.unique_id; + else + basis = 0; + return hash_int ((lock->referent << 3) | lock->access, basis); } diff --git a/src/data/file-name.c b/src/data/file-name.c index e9411b65d9..f861028682 100644 --- a/src/data/file-name.c +++ b/src/data/file-name.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -448,9 +448,9 @@ fn_compare_file_identities (const struct file_identity *a, unsigned int fn_hash_identity (const struct file_identity *identity) { - unsigned int hash = identity->device ^ identity->inode; + unsigned int hash = hash_int (identity->device, identity->inode); if (identity->name != NULL) - hash ^= hsh_hash_string (identity->name); + hash = hash_string (identity->name, hash); return hash; } diff --git a/src/data/short-names.c b/src/data/short-names.c index 08bf758787..1b8be927a6 100644 --- a/src/data/short-names.c +++ b/src/data/short-names.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007 Free Software Foundation, Inc. + Copyright (C) 2007, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -39,9 +39,9 @@ compare_strings (const void *a, const void *b, const void *aux UNUSED) /* Hashes a string. */ static unsigned -hash_string (const void *s, const void *aux UNUSED) +do_hash_string (const void *s, const void *aux UNUSED) { - return hsh_hash_string (s); + return hash_string (s, 0); } /* Sets V's short name to BASE, followed by a suffix of the form @@ -128,7 +128,7 @@ short_names_assign (struct dictionary *d) the hash table point to strings owned by dictionary variables, not by us, so we don't need to provide a free function. */ - short_names = hsh_create (var_cnt, compare_strings, hash_string, + short_names = hsh_create (var_cnt, compare_strings, do_hash_string, NULL, NULL); /* Clear short names that conflict with a variable name. */ diff --git a/src/data/value-labels.c b/src/data/value-labels.c index 9cee8d58be..9f6113b388 100644 --- a/src/data/value-labels.c +++ b/src/data/value-labels.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -383,9 +383,9 @@ hash_int_val_lab (const void *vl_, const void *vls_) const struct val_labs *vls = vls_; if (vls->width == 0) - return hsh_hash_double (vl->value.f); + return hash_double (vl->value.f, 0); else - return hsh_hash_bytes (vl->value.s, vls->width); + return hash_bytes (vl->value.s, vls->width, 0); } /* Free a value label. */ @@ -491,7 +491,7 @@ hash_atom (const void *atom_, const void *aux UNUSED) { const struct atom *atom = atom_; - return hsh_hash_string (atom->string); + return hash_string (atom->string, 0); } /* A hsh_free_func that destroys ATOM. */ diff --git a/src/data/value.c b/src/data/value.c index baaaed48d4..180f1b6d1e 100644 --- a/src/data/value.c +++ b/src/data/value.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -65,9 +65,7 @@ hash_value_short (const void *v_, const void *var_) const union value *v = v_; const struct variable *var = var_; int width = var_get_width (var); - return (width == 0 - ? hsh_hash_double (v->f) - : hsh_hash_bytes (v->s, width)); + return width == 0 ? hash_double (v->f, 0) : hash_bytes (v->s, width, 0); } diff --git a/src/data/variable.c b/src/data/variable.c index 00c3f07f05..505ae79da3 100644 --- a/src/data/variable.c +++ b/src/data/variable.c @@ -323,7 +323,7 @@ hash_var_by_name (const void *v_, const void *aux UNUSED) { const struct variable *v = v_; - return hsh_hash_case_string (v->name); + return hash_case_string (v->name, 0); } /* A hsh_compare_func that orders pointers to variables A and B @@ -359,7 +359,7 @@ hash_var_ptr_by_name (const void *v_, const void *aux UNUSED) { struct variable *const *v = v_; - return hsh_hash_case_string (var_get_name (*v)); + return hash_case_string (var_get_name (*v), 0); } /* Returns the type of variable V. */ diff --git a/src/language/stats/autorecode.c b/src/language/stats/autorecode.c index de9416821c..887b12230d 100644 --- a/src/language/stats/autorecode.c +++ b/src/language/stats/autorecode.c @@ -355,7 +355,7 @@ hash_alpha_value (const void *a_, const void *v_) const union arc_value *a = a_; const struct variable *v = v_; - return hsh_hash_bytes (a->c, var_get_width (v)); + return hash_bytes (a->c, var_get_width (v), 0); } static int @@ -372,5 +372,5 @@ hash_numeric_value (const void *a_, const void *aux UNUSED) { const union arc_value *a = a_; - return hsh_hash_double (a->f); + return hash_double (a->f, 0); } diff --git a/src/language/stats/crosstabs.q b/src/language/stats/crosstabs.q index 309b27fc3c..0a5e34099d 100644 --- a/src/language/stats/crosstabs.q +++ b/src/language/stats/crosstabs.q @@ -777,7 +777,7 @@ hash_table_entry (const void *a_, const void *aux UNUSED) hash = a->table; for (i = 0; i < xtab[a->table]->nvar; i++) - hash ^= hsh_hash_bytes (&a->values[i], sizeof a->values[i]); + hash = hash_bytes (&a->values[i], sizeof a->values[i], hash); return hash; } diff --git a/src/libpspp/hash-functions.c b/src/libpspp/hash-functions.c index 2318777392..f9f1f0e165 100644 --- a/src/libpspp/hash-functions.c +++ b/src/libpspp/hash-functions.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2008 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2008, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,72 +19,156 @@ #include #include #include - -/* Fowler-Noll-Vo hash constants, for 32-bit word sizes. */ -#define FNV_32_PRIME 16777619u -#define FNV_32_BASIS 2166136261u - -/* Fowler-Noll-Vo 32-bit hash, for bytes. */ -unsigned -hsh_hash_bytes (const void *buf_, size_t size) +#include +#include + +/* Based on http://burtleburtle.net/bob/c/lookup3.c, by Bob + Jenkins , as retrieved on April + 8, 2009. The license information there says the following: + "You can use this free for any purpose. It's in the public + domain. It has no warranty." and "You may use this code any + way you wish, private, educational, or commercial. It's + free." */ + +#define HASH_ROT(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) + +#define HASH_MIX(a, b, c) \ + do \ + { \ + a -= c; a ^= HASH_ROT (c, 4); c += b; \ + b -= a; b ^= HASH_ROT (a, 6); a += c; \ + c -= b; c ^= HASH_ROT (b, 8); b += a; \ + a -= c; a ^= HASH_ROT (c, 16); c += b; \ + b -= a; b ^= HASH_ROT (a, 19); a += c; \ + c -= b; c ^= HASH_ROT (b, 4); b += a; \ + } \ + while (0) + +#define HASH_FINAL(a, b, c) \ + do \ + { \ + c ^= b; c -= HASH_ROT (b, 14); \ + a ^= c; a -= HASH_ROT (c, 11); \ + b ^= a; b -= HASH_ROT (a, 25); \ + c ^= b; c -= HASH_ROT (b, 16); \ + a ^= c; a -= HASH_ROT (c, 4); \ + b ^= a; b -= HASH_ROT (a, 14); \ + c ^= b; c -= HASH_ROT (b, 24); \ + } \ + while (0) + +/* Returns a hash value for the N bytes starting at P, starting + from BASIS. */ +unsigned int +hash_bytes (const void *p_, size_t n, unsigned int basis) { - const unsigned char *buf = (const unsigned char *) buf_; - unsigned hash; - - assert (buf != NULL); - - hash = FNV_32_BASIS; - while (size-- > 0) - hash = (hash * FNV_32_PRIME) ^ *buf++; - - return hash; + const uint8_t *p = p_; + uint32_t a, b, c; + uint32_t tmp[3]; + + a = b = c = 0xdeadbeef + n + basis; + + while (n >= 12) + { + memcpy (tmp, p, 12); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + HASH_MIX (a, b, c); + n -= 12; + p += 12; + } + + if (n > 0) + { + memset (tmp, 0, 12); + memcpy (tmp, p, n); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + } + + HASH_FINAL (a, b, c); + return c; } -/* Fowler-Noll-Vo 32-bit hash, for strings. */ -unsigned -hsh_hash_string (const char *s_) +/* Returns a hash value for null-terminated string S, starting + from BASIS. */ +unsigned int +hash_string (const char *s, unsigned int basis) { - const unsigned char *s = (const unsigned char *) s_; - unsigned hash; - - assert (s != NULL); - - hash = FNV_32_BASIS; - while (*s != '\0') - hash = (hash * FNV_32_PRIME) ^ *s++; - - return hash; + return hash_bytes (s, strlen (s), basis); } -/* Fowler-Noll-Vo 32-bit hash, for case-insensitive strings. */ -unsigned -hsh_hash_case_string (const char *s_) +/* Returns a hash value for null-terminated string S, with + lowercase and uppercase letters treated as equal, starting + from BASIS. */ +unsigned int +hash_case_string (const char *s, unsigned int basis) { - const unsigned char *s = (const unsigned char *) s_; - unsigned hash; - - assert (s != NULL); - - hash = FNV_32_BASIS; - while (*s != '\0') - hash = (hash * FNV_32_PRIME) ^ toupper (*s++); - - return hash; + size_t n = strlen (s); + uint32_t a, b, c; + uint32_t tmp[3]; + int i; + + a = b = c = 0xdeadbeef + n + basis; + + while (n >= 12) + { + for (i = 0; i < 12; i++) + ((unsigned char *)tmp)[i] = toupper ((unsigned char) s[i]); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + HASH_MIX (a, b, c); + n -= 12; + s += 12; + } + + if (n > 0) + { + memset (tmp, 0, 12); + for (i = 0; i < n; i++) + ((unsigned char *)tmp)[i] = toupper ((unsigned char) s[i]); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + } + + HASH_FINAL (a, b, c); + return c; } -/* Hash for ints. */ -unsigned -hsh_hash_int (int i) +/* Returns a hash value for integer X, starting from BASIS. */ +unsigned int +hash_int (int x, unsigned int basis) { - return hsh_hash_bytes (&i, sizeof i); + x -= x << 6; + x ^= x >> 17; + x -= x << 9; + x ^= x << 4; + x -= x << 3; + x ^= x << 10; + x ^= x >> 15; + return x + basis; } -/* Hash for double. */ -unsigned -hsh_hash_double (double d) +/* Returns a hash value for double D, starting from BASIS. */ +unsigned int +hash_double (double d, unsigned int basis) { - if (!isnan (d)) - return hsh_hash_bytes (&d, sizeof d); - else - return 0; +#if SIZEOF_DOUBLE == 8 + uint32_t tmp[2]; + uint32_t a, b, c; + + a = b = c = 0xdeadbeef + 8 + basis; + + memcpy (tmp, &d, 8); + a += tmp[0]; + b += tmp[1]; + HASH_FINAL (a, b, c); + return c; +#else /* SIZEOF_DOUBLE != 8 */ + return hash_bytes (&d, sizeof d, basis); +#endif /* SIZEOF_DOUBLE != 8 */ } diff --git a/src/libpspp/hash-functions.h b/src/libpspp/hash-functions.h index 328f4deda0..089134b43f 100644 --- a/src/libpspp/hash-functions.h +++ b/src/libpspp/hash-functions.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,10 +19,10 @@ #include -unsigned hsh_hash_bytes (const void *, size_t); -unsigned hsh_hash_string (const char *); -unsigned hsh_hash_case_string (const char *); -unsigned hsh_hash_int (int); -unsigned hsh_hash_double (double); +unsigned int hash_bytes (const void *, size_t, unsigned int basis); +unsigned int hash_string (const char *, unsigned int basis); +unsigned int hash_case_string (const char *, unsigned int basis); +unsigned int hash_int (int, unsigned int basis); +unsigned int hash_double (double, unsigned int basis); #endif /* libpspp/hash-functions.h */ diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index 293276c6aa..f10a52a50e 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -57,7 +57,7 @@ create_iconv (const char* tocode, const char* fromcode) struct hmapx_node *node; struct converter *converter; - hash = hsh_hash_string (tocode) ^ hsh_hash_string (fromcode); + hash = hash_string (tocode, hash_string (fromcode, 0)); HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map) if (!strcmp (tocode, converter->tocode) && !strcmp (fromcode, converter->fromcode)) diff --git a/src/math/covariance-matrix.c b/src/math/covariance-matrix.c index 95895eaeac..2e83749c6e 100644 --- a/src/math/covariance-matrix.c +++ b/src/math/covariance-matrix.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2008 Free Software Foundation, Inc. + Copyright (C) 2008, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -372,10 +372,9 @@ covariance_accumulator_hash (const void *h, const void *aux) } if (var_is_alpha (v_max) && var_is_alpha (v_min)) { - unsigned tmp = hsh_hash_bytes (val_max, var_get_width (v_max)); - tmp ^= hsh_hash_bytes (val_min, var_get_width (v_min)); - tmp += *n_vars * (*n_vars + 1 + idx_max) + idx_min; - return (size_t) tmp; + unsigned hash = hash_bytes (val_max, var_get_width (v_max), 0); + hash = hash_bytes (val_min, var_get_width (v_min), hash); + return hash_int (*n_vars * (*n_vars + 1 + idx_max) + idx_min, hash); } return -1u; } @@ -478,7 +477,7 @@ hash_numeric_alpha (const struct variable *v1, const struct variable *v2, if (var_is_numeric (v1) && var_is_alpha (v2)) { result = n_vars * ((n_vars + 1) + var_get_dict_index (v1)) - + var_get_dict_index (v2) + hsh_hash_string (val->s); + + var_get_dict_index (v2) + hash_string (val->s, 0); } else if (var_is_alpha (v1) && var_is_numeric (v2)) { -- 2.30.2