From: Ben Pfaff Date: Fri, 20 Jul 2012 19:38:59 +0000 (-0700) Subject: util: New function popcount(). X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a656cb7736fd9f5926c2b1bdd6f50dafc3aa796a;p=openvswitch util: New function popcount(). This is the fastest portable implementation among the ones below, as measured with GCC 4.4 on a Xeon X3430. The measeured times were, in seconds: popcount1 25.6 popcount2 6.9 (but is not portable) popcount3 31.4 popcount4 25.6 popcount5 61.6 (and is buggy) popcount6 64.6 popcount7 32.3 popcount8 11.2 int popcount1(unsigned int x) { return __builtin_popcount(x); } int popcount2(unsigned int x) { unsigned int y; asm("popcnt %1, %0" : "=r" (y) : "g" (x)); return y; } int popcount3(unsigned int x) { unsigned int n; n = (x >> 1) & 033333333333; x -= n; n = (n >> 1) & 033333333333; x -= n; x = (x + (x >> 3)) & 030707070707; return x % 63; } int popcount4(unsigned int x) { x -= (x >> 1) & 0x55555555; x = (x & 0x33333333) + ((x >> 2) & 0x33333333); x = (x + (x >> 4)) & 0x0f0f0f0f; x += x >> 8; x += x >> 16; return x & 0x3f; } int popcount5(unsigned int x) { int n; n = 0; while (x) { if (x & 0xf) { n += ((0xe9949440 >> (x & 0xf)) & 3) + 1; } x >>= 4; } return n; } int popcount6(unsigned int x) { int n; n = 0; while (x) { n += (0xe994 >> (x & 7)) & 3; x >>= 3; } return n; } int popcount7(unsigned int x) { static const int table[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; return (table[x & 0xf] + table[(x >> 4) & 0xf] + table[(x >> 8) & 0xf] + table[(x >> 12) & 0xf] + table[(x >> 16) & 0xf] + table[(x >> 20) & 0xf] + table[(x >> 24) & 0xf] + table[x >> 28]); } static int popcount8(unsigned int x) { ((((X) & (1 << 0)) != 0) + \ (((X) & (1 << 1)) != 0) + \ (((X) & (1 << 2)) != 0) + \ (((X) & (1 << 3)) != 0) + \ (((X) & (1 << 4)) != 0) + \ (((X) & (1 << 5)) != 0) + \ (((X) & (1 << 6)) != 0) + \ (((X) & (1 << 7)) != 0)) static const uint8_t popcount8[256] = { INIT64(0), INIT64(64), INIT64(128), INIT64(192) }; return (popcount8[x & 0xff] + popcount8[(x >> 8) & 0xff] + popcount8[(x >> 16) & 0xff] + popcount8[x >> 24]); } int main(void) { unsigned long long int x; int n; n = 0; for (x = 0; x <= UINT32_MAX; x++) { n += popcount8(x); } printf("%d\n", n); return 0; } Signed-off-by: Ben Pfaff --- diff --git a/lib/util.c b/lib/util.c index cd99142c..256421a6 100644 --- a/lib/util.c +++ b/lib/util.c @@ -823,6 +823,40 @@ ctz(uint32_t n) } } +/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */ +int +popcount(uint32_t x) +{ + /* In my testing, this implementation is over twice as fast as any other + * portable implementation that I tried, including GCC 4.4 + * __builtin_popcount(), although nonportable asm("popcnt") was over 50% + * faster. */ +#define INIT1(X) \ + ((((X) & (1 << 0)) != 0) + \ + (((X) & (1 << 1)) != 0) + \ + (((X) & (1 << 2)) != 0) + \ + (((X) & (1 << 3)) != 0) + \ + (((X) & (1 << 4)) != 0) + \ + (((X) & (1 << 5)) != 0) + \ + (((X) & (1 << 6)) != 0) + \ + (((X) & (1 << 7)) != 0)) +#define INIT2(X) INIT1(X), INIT1((X) + 1) +#define INIT4(X) INIT2(X), INIT2((X) + 2) +#define INIT8(X) INIT4(X), INIT4((X) + 4) +#define INIT16(X) INIT8(X), INIT8((X) + 8) +#define INIT32(X) INIT16(X), INIT16((X) + 16) +#define INIT64(X) INIT32(X), INIT32((X) + 32) + + static const uint8_t popcount8[256] = { + INIT64(0), INIT64(64), INIT64(128), INIT64(192) + }; + + return (popcount8[x & 0xff] + + popcount8[(x >> 8) & 0xff] + + popcount8[(x >> 16) & 0xff] + + popcount8[x >> 24]); +} + /* Returns true if the 'n' bytes starting at 'p' are zeros. */ bool is_all_zeros(const uint8_t *p, size_t n) diff --git a/lib/util.h b/lib/util.h index 8c9103c6..a8d86657 100644 --- a/lib/util.h +++ b/lib/util.h @@ -17,6 +17,7 @@ #ifndef UTIL_H #define UTIL_H 1 +#include #include #include #include @@ -244,6 +245,7 @@ void ignore(bool x OVS_UNUSED); int log_2_floor(uint32_t); int log_2_ceil(uint32_t); int ctz(uint32_t); +int popcount(uint32_t); bool is_all_zeros(const uint8_t *, size_t); bool is_all_ones(const uint8_t *, size_t); diff --git a/tests/library.at b/tests/library.at index 70660a25..0765c3f3 100644 --- a/tests/library.at +++ b/tests/library.at @@ -103,12 +103,14 @@ AT_CLEANUP m4_foreach( [testname], [[ctz], + [popcount], [log_2_floor], [bitwise_copy], [bitwise_zero], [bitwise_one], [bitwise_is_all_zeros]], [AT_SETUP([testname[()] function]) + AT_KEYWORDS([testname]) AT_CHECK([test-util testname], [0], [], []) AT_CLEANUP]) diff --git a/tests/test-util.c b/tests/test-util.c index 71a7f218..b98fc79a 100644 --- a/tests/test-util.c +++ b/tests/test-util.c @@ -26,6 +26,9 @@ #include "random.h" #include "util.h" +#undef NDEBUG +#include + static void check_log_2_floor(uint32_t x, int n) { @@ -85,6 +88,59 @@ test_ctz(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) check_ctz(0, 32); } +static void +shuffle(unsigned int *p, size_t n) +{ + for (; n > 1; n--, p++) { + unsigned int *q = &p[rand() % n]; + unsigned int tmp = *p; + *p = *q; + *q = tmp; + } +} + +static void +check_popcount(uint32_t x, int n) +{ + if (popcount(x) != n) { + fprintf(stderr, "popcount(%#"PRIx32") is %d but should be %d\n", + x, popcount(x), n); + abort(); + } +} + +static void +test_popcount(int argc OVS_UNUSED, char *argv[] OVS_UNUSED) +{ + unsigned int bits[32]; + int i; + + for (i = 0; i < ARRAY_SIZE(bits); i++) { + bits[i] = 1u << i; + } + + check_popcount(0, 0); + + for (i = 0; i < 1000; i++) { + uint32_t x = 0; + int j; + + shuffle(bits, ARRAY_SIZE(bits)); + for (j = 0; j < 32; j++) { + x |= bits[j]; + check_popcount(x, j + 1); + } + assert(x == UINT32_MAX); + + shuffle(bits, ARRAY_SIZE(bits)); + for (j = 31; j >= 0; j--) { + x &= ~bits[j]; + check_popcount(x, j); + } + assert(x == 0); + } +} + /* Returns the sum of the squares of the first 'n' positive integers. */ static unsigned int sum_of_squares(int n) @@ -282,6 +338,7 @@ test_follow_symlinks(int argc, char *argv[]) static const struct command commands[] = { {"ctz", 0, 0, test_ctz}, + {"popcount", 0, 0, test_popcount}, {"log_2_floor", 0, 0, test_log_2_floor}, {"bitwise_copy", 0, 0, test_bitwise_copy}, {"bitwise_zero", 0, 0, test_bitwise_zero},