From 2c411d651e22704f60f117d944b9380a07d247fe Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Wed, 8 Apr 2009 21:55:31 -0700 Subject: [PATCH] Use Bob Jenkins lookup3 hash instead of FNV. The Jenkins lookup3 hash is superior to FNV in collision resistance, avalanching, and performance on systems that do not have fast multiplication. It also provides a good way to combine the result of a previous hashing step with the current hash, using its "basis" argument. This commit replaces the PSPP implementation of FNV with the Jenkins lookup3 hash and updates all the current users. In addition, John Darrington pointed out that commit dd2e61b4a "Make create_iconv() properly distinguish converters by name" unintentionally introduced gratuitous hash collisions, by causing all converters where tocode and fromcode were the same to hash to value 0, and converters where tocode and fromcode were swapped to hash to the same value as each other. Using the "basis" argument to the Jenkins hash properly, instead of just attempting to combine hash values with XOR, fixes this problem. --- src/data/attributes.c | 6 +- src/data/file-handle-def.c | 12 +- src/data/file-name.c | 6 +- src/data/short-names.c | 8 +- src/data/value-labels.c | 8 +- src/data/value.c | 6 +- src/data/variable.c | 4 +- src/language/stats/autorecode.c | 4 +- src/language/stats/crosstabs.q | 2 +- src/libpspp/hash-functions.c | 196 +++++++++++++++++++++++--------- src/libpspp/hash-functions.h | 12 +- src/libpspp/i18n.c | 2 +- src/math/covariance-matrix.c | 11 +- 13 files changed, 180 insertions(+), 97 deletions(-) diff --git a/src/data/attributes.c b/src/data/attributes.c index aa128290..d99e945b 100644 --- a/src/data/attributes.c +++ b/src/data/attributes.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2008 Free Software Foundation, Inc. + Copyright (C) 2008, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -220,7 +220,7 @@ attrset_lookup (struct attrset *set, const char *name) { struct attribute *attr; HMAP_FOR_EACH_WITH_HASH (attr, struct attribute, node, - hsh_hash_case_string (name), &set->map) + hash_case_string (name, 0), &set->map) if (!strcasecmp (attribute_get_name (attr), name)) break; return attr; @@ -234,7 +234,7 @@ attrset_add (struct attrset *set, struct attribute *attr) { const char *name = attribute_get_name (attr); assert (attrset_lookup (set, name) == NULL); - hmap_insert (&set->map, &attr->node, hsh_hash_case_string (name)); + hmap_insert (&set->map, &attr->node, hash_case_string (name, 0)); } /* Deletes any attribute from SET that matches NAME diff --git a/src/data/file-handle-def.c b/src/data/file-handle-def.c index 5d807ab9..0652501f 100644 --- a/src/data/file-handle-def.c +++ b/src/data/file-handle-def.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -592,10 +592,12 @@ static unsigned int hash_fh_lock (const void *lock_, const void *aux UNUSED) { const struct fh_lock *lock = lock_; - unsigned int hash = hsh_hash_int ((lock->referent << 3) | lock->access); + unsigned int basis; if (lock->referent == FH_REF_FILE) - hash ^= fn_hash_identity (lock->u.file); + basis = fn_hash_identity (lock->u.file); else if (lock->referent == FH_REF_SCRATCH) - hash ^= hsh_hash_int (lock->u.unique_id); - return hash; + basis = lock->u.unique_id; + else + basis = 0; + return hash_int ((lock->referent << 3) | lock->access, basis); } diff --git a/src/data/file-name.c b/src/data/file-name.c index e9411b65..f8610286 100644 --- a/src/data/file-name.c +++ b/src/data/file-name.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -448,9 +448,9 @@ fn_compare_file_identities (const struct file_identity *a, unsigned int fn_hash_identity (const struct file_identity *identity) { - unsigned int hash = identity->device ^ identity->inode; + unsigned int hash = hash_int (identity->device, identity->inode); if (identity->name != NULL) - hash ^= hsh_hash_string (identity->name); + hash = hash_string (identity->name, hash); return hash; } diff --git a/src/data/short-names.c b/src/data/short-names.c index 08bf7587..1b8be927 100644 --- a/src/data/short-names.c +++ b/src/data/short-names.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007 Free Software Foundation, Inc. + Copyright (C) 2007, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -39,9 +39,9 @@ compare_strings (const void *a, const void *b, const void *aux UNUSED) /* Hashes a string. */ static unsigned -hash_string (const void *s, const void *aux UNUSED) +do_hash_string (const void *s, const void *aux UNUSED) { - return hsh_hash_string (s); + return hash_string (s, 0); } /* Sets V's short name to BASE, followed by a suffix of the form @@ -128,7 +128,7 @@ short_names_assign (struct dictionary *d) the hash table point to strings owned by dictionary variables, not by us, so we don't need to provide a free function. */ - short_names = hsh_create (var_cnt, compare_strings, hash_string, + short_names = hsh_create (var_cnt, compare_strings, do_hash_string, NULL, NULL); /* Clear short names that conflict with a variable name. */ diff --git a/src/data/value-labels.c b/src/data/value-labels.c index 9cee8d58..9f6113b3 100644 --- a/src/data/value-labels.c +++ b/src/data/value-labels.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -383,9 +383,9 @@ hash_int_val_lab (const void *vl_, const void *vls_) const struct val_labs *vls = vls_; if (vls->width == 0) - return hsh_hash_double (vl->value.f); + return hash_double (vl->value.f, 0); else - return hsh_hash_bytes (vl->value.s, vls->width); + return hash_bytes (vl->value.s, vls->width, 0); } /* Free a value label. */ @@ -491,7 +491,7 @@ hash_atom (const void *atom_, const void *aux UNUSED) { const struct atom *atom = atom_; - return hsh_hash_string (atom->string); + return hash_string (atom->string, 0); } /* A hsh_free_func that destroys ATOM. */ diff --git a/src/data/value.c b/src/data/value.c index baaaed48..180f1b6d 100644 --- a/src/data/value.c +++ b/src/data/value.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -65,9 +65,7 @@ hash_value_short (const void *v_, const void *var_) const union value *v = v_; const struct variable *var = var_; int width = var_get_width (var); - return (width == 0 - ? hsh_hash_double (v->f) - : hsh_hash_bytes (v->s, width)); + return width == 0 ? hash_double (v->f, 0) : hash_bytes (v->s, width, 0); } diff --git a/src/data/variable.c b/src/data/variable.c index 00c3f07f..505ae79d 100644 --- a/src/data/variable.c +++ b/src/data/variable.c @@ -323,7 +323,7 @@ hash_var_by_name (const void *v_, const void *aux UNUSED) { const struct variable *v = v_; - return hsh_hash_case_string (v->name); + return hash_case_string (v->name, 0); } /* A hsh_compare_func that orders pointers to variables A and B @@ -359,7 +359,7 @@ hash_var_ptr_by_name (const void *v_, const void *aux UNUSED) { struct variable *const *v = v_; - return hsh_hash_case_string (var_get_name (*v)); + return hash_case_string (var_get_name (*v), 0); } /* Returns the type of variable V. */ diff --git a/src/language/stats/autorecode.c b/src/language/stats/autorecode.c index de941682..887b1223 100644 --- a/src/language/stats/autorecode.c +++ b/src/language/stats/autorecode.c @@ -355,7 +355,7 @@ hash_alpha_value (const void *a_, const void *v_) const union arc_value *a = a_; const struct variable *v = v_; - return hsh_hash_bytes (a->c, var_get_width (v)); + return hash_bytes (a->c, var_get_width (v), 0); } static int @@ -372,5 +372,5 @@ hash_numeric_value (const void *a_, const void *aux UNUSED) { const union arc_value *a = a_; - return hsh_hash_double (a->f); + return hash_double (a->f, 0); } diff --git a/src/language/stats/crosstabs.q b/src/language/stats/crosstabs.q index 309b27fc..0a5e3409 100644 --- a/src/language/stats/crosstabs.q +++ b/src/language/stats/crosstabs.q @@ -777,7 +777,7 @@ hash_table_entry (const void *a_, const void *aux UNUSED) hash = a->table; for (i = 0; i < xtab[a->table]->nvar; i++) - hash ^= hsh_hash_bytes (&a->values[i], sizeof a->values[i]); + hash = hash_bytes (&a->values[i], sizeof a->values[i], hash); return hash; } diff --git a/src/libpspp/hash-functions.c b/src/libpspp/hash-functions.c index 23187773..f9f1f0e1 100644 --- a/src/libpspp/hash-functions.c +++ b/src/libpspp/hash-functions.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2008 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2008, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,72 +19,156 @@ #include #include #include - -/* Fowler-Noll-Vo hash constants, for 32-bit word sizes. */ -#define FNV_32_PRIME 16777619u -#define FNV_32_BASIS 2166136261u - -/* Fowler-Noll-Vo 32-bit hash, for bytes. */ -unsigned -hsh_hash_bytes (const void *buf_, size_t size) +#include +#include + +/* Based on http://burtleburtle.net/bob/c/lookup3.c, by Bob + Jenkins , as retrieved on April + 8, 2009. The license information there says the following: + "You can use this free for any purpose. It's in the public + domain. It has no warranty." and "You may use this code any + way you wish, private, educational, or commercial. It's + free." */ + +#define HASH_ROT(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) + +#define HASH_MIX(a, b, c) \ + do \ + { \ + a -= c; a ^= HASH_ROT (c, 4); c += b; \ + b -= a; b ^= HASH_ROT (a, 6); a += c; \ + c -= b; c ^= HASH_ROT (b, 8); b += a; \ + a -= c; a ^= HASH_ROT (c, 16); c += b; \ + b -= a; b ^= HASH_ROT (a, 19); a += c; \ + c -= b; c ^= HASH_ROT (b, 4); b += a; \ + } \ + while (0) + +#define HASH_FINAL(a, b, c) \ + do \ + { \ + c ^= b; c -= HASH_ROT (b, 14); \ + a ^= c; a -= HASH_ROT (c, 11); \ + b ^= a; b -= HASH_ROT (a, 25); \ + c ^= b; c -= HASH_ROT (b, 16); \ + a ^= c; a -= HASH_ROT (c, 4); \ + b ^= a; b -= HASH_ROT (a, 14); \ + c ^= b; c -= HASH_ROT (b, 24); \ + } \ + while (0) + +/* Returns a hash value for the N bytes starting at P, starting + from BASIS. */ +unsigned int +hash_bytes (const void *p_, size_t n, unsigned int basis) { - const unsigned char *buf = (const unsigned char *) buf_; - unsigned hash; - - assert (buf != NULL); - - hash = FNV_32_BASIS; - while (size-- > 0) - hash = (hash * FNV_32_PRIME) ^ *buf++; - - return hash; + const uint8_t *p = p_; + uint32_t a, b, c; + uint32_t tmp[3]; + + a = b = c = 0xdeadbeef + n + basis; + + while (n >= 12) + { + memcpy (tmp, p, 12); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + HASH_MIX (a, b, c); + n -= 12; + p += 12; + } + + if (n > 0) + { + memset (tmp, 0, 12); + memcpy (tmp, p, n); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + } + + HASH_FINAL (a, b, c); + return c; } -/* Fowler-Noll-Vo 32-bit hash, for strings. */ -unsigned -hsh_hash_string (const char *s_) +/* Returns a hash value for null-terminated string S, starting + from BASIS. */ +unsigned int +hash_string (const char *s, unsigned int basis) { - const unsigned char *s = (const unsigned char *) s_; - unsigned hash; - - assert (s != NULL); - - hash = FNV_32_BASIS; - while (*s != '\0') - hash = (hash * FNV_32_PRIME) ^ *s++; - - return hash; + return hash_bytes (s, strlen (s), basis); } -/* Fowler-Noll-Vo 32-bit hash, for case-insensitive strings. */ -unsigned -hsh_hash_case_string (const char *s_) +/* Returns a hash value for null-terminated string S, with + lowercase and uppercase letters treated as equal, starting + from BASIS. */ +unsigned int +hash_case_string (const char *s, unsigned int basis) { - const unsigned char *s = (const unsigned char *) s_; - unsigned hash; - - assert (s != NULL); - - hash = FNV_32_BASIS; - while (*s != '\0') - hash = (hash * FNV_32_PRIME) ^ toupper (*s++); - - return hash; + size_t n = strlen (s); + uint32_t a, b, c; + uint32_t tmp[3]; + int i; + + a = b = c = 0xdeadbeef + n + basis; + + while (n >= 12) + { + for (i = 0; i < 12; i++) + ((unsigned char *)tmp)[i] = toupper ((unsigned char) s[i]); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + HASH_MIX (a, b, c); + n -= 12; + s += 12; + } + + if (n > 0) + { + memset (tmp, 0, 12); + for (i = 0; i < n; i++) + ((unsigned char *)tmp)[i] = toupper ((unsigned char) s[i]); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + } + + HASH_FINAL (a, b, c); + return c; } -/* Hash for ints. */ -unsigned -hsh_hash_int (int i) +/* Returns a hash value for integer X, starting from BASIS. */ +unsigned int +hash_int (int x, unsigned int basis) { - return hsh_hash_bytes (&i, sizeof i); + x -= x << 6; + x ^= x >> 17; + x -= x << 9; + x ^= x << 4; + x -= x << 3; + x ^= x << 10; + x ^= x >> 15; + return x + basis; } -/* Hash for double. */ -unsigned -hsh_hash_double (double d) +/* Returns a hash value for double D, starting from BASIS. */ +unsigned int +hash_double (double d, unsigned int basis) { - if (!isnan (d)) - return hsh_hash_bytes (&d, sizeof d); - else - return 0; +#if SIZEOF_DOUBLE == 8 + uint32_t tmp[2]; + uint32_t a, b, c; + + a = b = c = 0xdeadbeef + 8 + basis; + + memcpy (tmp, &d, 8); + a += tmp[0]; + b += tmp[1]; + HASH_FINAL (a, b, c); + return c; +#else /* SIZEOF_DOUBLE != 8 */ + return hash_bytes (&d, sizeof d, basis); +#endif /* SIZEOF_DOUBLE != 8 */ } diff --git a/src/libpspp/hash-functions.h b/src/libpspp/hash-functions.h index 328f4ded..089134b4 100644 --- a/src/libpspp/hash-functions.h +++ b/src/libpspp/hash-functions.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,10 +19,10 @@ #include -unsigned hsh_hash_bytes (const void *, size_t); -unsigned hsh_hash_string (const char *); -unsigned hsh_hash_case_string (const char *); -unsigned hsh_hash_int (int); -unsigned hsh_hash_double (double); +unsigned int hash_bytes (const void *, size_t, unsigned int basis); +unsigned int hash_string (const char *, unsigned int basis); +unsigned int hash_case_string (const char *, unsigned int basis); +unsigned int hash_int (int, unsigned int basis); +unsigned int hash_double (double, unsigned int basis); #endif /* libpspp/hash-functions.h */ diff --git a/src/libpspp/i18n.c b/src/libpspp/i18n.c index 293276c6..f10a52a5 100644 --- a/src/libpspp/i18n.c +++ b/src/libpspp/i18n.c @@ -57,7 +57,7 @@ create_iconv (const char* tocode, const char* fromcode) struct hmapx_node *node; struct converter *converter; - hash = hsh_hash_string (tocode) ^ hsh_hash_string (fromcode); + hash = hash_string (tocode, hash_string (fromcode, 0)); HMAPX_FOR_EACH_WITH_HASH (converter, node, hash, &map) if (!strcmp (tocode, converter->tocode) && !strcmp (fromcode, converter->fromcode)) diff --git a/src/math/covariance-matrix.c b/src/math/covariance-matrix.c index 95895eae..2e83749c 100644 --- a/src/math/covariance-matrix.c +++ b/src/math/covariance-matrix.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2008 Free Software Foundation, Inc. + Copyright (C) 2008, 2009 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -372,10 +372,9 @@ covariance_accumulator_hash (const void *h, const void *aux) } if (var_is_alpha (v_max) && var_is_alpha (v_min)) { - unsigned tmp = hsh_hash_bytes (val_max, var_get_width (v_max)); - tmp ^= hsh_hash_bytes (val_min, var_get_width (v_min)); - tmp += *n_vars * (*n_vars + 1 + idx_max) + idx_min; - return (size_t) tmp; + unsigned hash = hash_bytes (val_max, var_get_width (v_max), 0); + hash = hash_bytes (val_min, var_get_width (v_min), hash); + return hash_int (*n_vars * (*n_vars + 1 + idx_max) + idx_min, hash); } return -1u; } @@ -478,7 +477,7 @@ hash_numeric_alpha (const struct variable *v1, const struct variable *v2, if (var_is_numeric (v1) && var_is_alpha (v2)) { result = n_vars * ((n_vars + 1) + var_get_dict_index (v1)) - + var_get_dict_index (v2) + hsh_hash_string (val->s); + + var_get_dict_index (v2) + hash_string (val->s, 0); } else if (var_is_alpha (v1) && var_is_numeric (v2)) { -- 2.30.2