From 9879b94f4381a342c66489dae3b4d4150d0dce6b Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 21 Aug 2012 14:26:23 -0700 Subject: [PATCH] hash: Introduce an implementation of murmurhash. Murmurhash is generally superior to the Jenkins lookup3 hash according to the available figures. Perhaps we should generally replace our current hashes by murmurhash. For now, I'm introducing a parallel implementation to allow it to be used in cases where an incremental one-word-at-a-time hash is desirable. The first user will be added in an upcoming commit. Signed-off-by: Ben Pfaff Acked-by: Ethan Jackson --- lib/hash.c | 15 ++++++++++++ lib/hash.h | 42 ++++++++++++++++++++++++++++++++ tests/test-hash.c | 62 +++++++++++++++++++++++++++++------------------ 3 files changed, 95 insertions(+), 24 deletions(-) diff --git a/lib/hash.c b/lib/hash.c index f7aa9163..41ad1bf2 100644 --- a/lib/hash.c +++ b/lib/hash.c @@ -102,3 +102,18 @@ hash_bytes(const void *p_, size_t n, uint32_t basis) return c; } + +/* Returns the hash of the 'n' 32-bit words at 'p', starting from 'basis'. + * 'p' must be properly aligned. */ +uint32_t +mhash_words(const uint32_t p[], size_t n_words, uint32_t basis) +{ + uint32_t hash; + size_t i; + + hash = basis; + for (i = 0; i < n_words; i++) { + hash = mhash_add(hash, p[i]); + } + return mhash_finish(hash, n_words); +} diff --git a/lib/hash.h b/lib/hash.h index ac6a63cf..701e6866 100644 --- a/lib/hash.h +++ b/lib/hash.h @@ -118,6 +118,48 @@ static inline uint32_t hash_pointer(const void *p, uint32_t basis) return hash_int((uint32_t) (uintptr_t) p, basis); } +/* Murmurhash by Austin Appleby, + * from http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp. + * + * The upstream license there says: + * + * // MurmurHash3 was written by Austin Appleby, and is placed in the public + * // domain. The author hereby disclaims copyright to this source code. + * + * Murmurhash is faster and higher-quality than the Jenkins lookup3 hash. When + * we have a little more familiarity with it, it's probably a good idea to + * switch all of OVS to it. + * + * For now, we have this implementation here for use by code that needs a hash + * that is convenient for use one word at a time, since the Jenkins lookup3 + * hash works three words at a time. + * + * See mhash_words() for sample usage. */ + +uint32_t mhash_words(const uint32_t data[], size_t n_words, uint32_t basis); + +static inline uint32_t mhash_add(uint32_t hash, uint32_t data) +{ + data *= 0xcc9e2d51; + data = hash_rot(data, 15); + data *= 0x1b873593; + + hash ^= data; + hash = hash_rot(hash, 13); + return hash * 5 + 0xe6546b64; +} + +static inline uint32_t mhash_finish(uint32_t hash, size_t n) +{ + hash ^= n * 4; + hash ^= hash_rot(hash, 16); + hash *= 0x85ebca6b; + hash ^= hash_rot(hash, 13); + hash *= 0xc2b2ae35; + hash ^= hash_rot(hash, 16); + return hash; +} + #ifdef __cplusplus } #endif diff --git a/tests/test-hash.c b/tests/test-hash.c index bdf1435f..d53ba2ea 100644 --- a/tests/test-hash.c +++ b/tests/test-hash.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Nicira, Inc. + * Copyright (c) 2009, 2012 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,12 @@ hash_words_cb(uint32_t input) return hash_words(&input, 1, 0); } +static uint32_t +mhash_words_cb(uint32_t input) +{ + return mhash_words(&input, 1, 0); +} + static uint32_t hash_int_cb(uint32_t input) { @@ -76,11 +82,37 @@ check_word_hash(uint32_t (*hash)(uint32_t), const char *name, } } -int -main(void) +static void +check_3word_hash(uint32_t (*hash)(const uint32_t[], size_t, uint32_t), + const char *name) { int i, j; + for (i = 0; i <= 96; i++) { + for (j = i + 1; j <= 96; j++) { + uint32_t in1[3], in2[3]; + uint32_t out1, out2; + const int min_unique = 12; + const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1; + + set_bit(in1, i); + set_bit(in2, j); + out1 = hash(in1, 3, 0); + out2 = hash(in2, 3, 0); + if ((out1 & unique_mask) == (out2 & unique_mask)) { + printf("%s has a partial collision:\n", name); + printf("hash(1 << %d) == %08"PRIx32"\n", i, out1); + printf("hash(1 << %d) == %08"PRIx32"\n", j, out2); + printf("The low-order %d bits of output are both " + "0x%"PRIx32"\n", min_unique, out1 & unique_mask); + } + } + } +} + +int +main(void) +{ /* Check that all hashes computed with hash_words with one 1-bit (or no * 1-bits) set within a single 32-bit word have different values in all * 11-bit consecutive runs. @@ -98,6 +130,7 @@ main(void) * independence must be a bad assumption :-) */ check_word_hash(hash_words_cb, "hash_words", 11); + check_word_hash(mhash_words_cb, "mhash_words", 11); /* Check that all hash functions of with one 1-bit (or no 1-bits) set * within three 32-bit words have different values in their lowest 12 @@ -112,27 +145,8 @@ main(void) * * so we are doing pretty well to not have any collisions in 12 bits. */ - for (i = 0; i <= 96; i++) { - for (j = i + 1; j <= 96; j++) { - uint32_t in1[3], in2[3]; - uint32_t out1, out2; - const int min_unique = 12; - const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1; - - set_bit(in1, i); - set_bit(in2, j); - out1 = hash_words(in1, 3, 0); - out2 = hash_words(in2, 3, 0); - if ((out1 & unique_mask) == (out2 & unique_mask)) { - printf("Partial collision:\n"); - printf("hash(1 << %d) == %08"PRIx32"\n", i, out1); - printf("hash(1 << %d) == %08"PRIx32"\n", j, out2); - printf("The low-order %d bits of output are both " - "0x%"PRIx32"\n", min_unique, out1 & unique_mask); - exit(1); - } - } - } + check_3word_hash(hash_words, "hash_words"); + check_3word_hash(mhash_words, "mhash_words"); /* Check that all hashes computed with hash_int with one 1-bit (or no * 1-bits) set within a single 32-bit word have different values in all -- 2.30.2