This is the fastest portable implementation among the ones below, as
measured with GCC 4.4 on a Xeon X3430. The measeured times were, in
seconds:
popcount1 25.6
popcount2 6.9 (but is not portable)
popcount3 31.4
popcount4 25.6
popcount5 61.6 (and is buggy)
popcount6 64.6
popcount7 32.3
popcount8 11.2
int
popcount1(unsigned int x)
{
return __builtin_popcount(x);
}
int
popcount2(unsigned int x)
{
unsigned int y;
asm("popcnt %1, %0" : "=r" (y) : "g" (x));
return y;
}
int
popcount3(unsigned int x)
{
unsigned int n;
n = (x >> 1) &
033333333333;
x -= n;
n = (n >> 1) &
033333333333;
x -= n;
x = (x + (x >> 3)) &
030707070707;
return x % 63;
}
int
popcount4(unsigned int x)
{
x -= (x >> 1) & 0x55555555;
x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
x = (x + (x >> 4)) & 0x0f0f0f0f;
x += x >> 8;
x += x >> 16;
return x & 0x3f;
}
int
popcount5(unsigned int x)
{
int n;
n = 0;
while (x) {
if (x & 0xf) {
n += ((0xe9949440 >> (x & 0xf)) & 3) + 1;
}
x >>= 4;
}
return n;
}
int
popcount6(unsigned int x)
{
int n;
n = 0;
while (x) {
n += (0xe994 >> (x & 7)) & 3;
x >>= 3;
}
return n;
}
int
popcount7(unsigned int x)
{
static const int table[16] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
};
return (table[x & 0xf]
+ table[(x >> 4) & 0xf]
+ table[(x >> 8) & 0xf]
+ table[(x >> 12) & 0xf]
+ table[(x >> 16) & 0xf]
+ table[(x >> 20) & 0xf]
+ table[(x >> 24) & 0xf]
+ table[x >> 28]);
}
static int
popcount8(unsigned int x)
{
((((X) & (1 << 0)) != 0) + \
(((X) & (1 << 1)) != 0) + \
(((X) & (1 << 2)) != 0) + \
(((X) & (1 << 3)) != 0) + \
(((X) & (1 << 4)) != 0) + \
(((X) & (1 << 5)) != 0) + \
(((X) & (1 << 6)) != 0) + \
(((X) & (1 << 7)) != 0))
static const uint8_t popcount8[256] = {
INIT64(0), INIT64(64), INIT64(128), INIT64(192)
};
return (popcount8[x & 0xff] +
popcount8[(x >> 8) & 0xff] +
popcount8[(x >> 16) & 0xff] +
popcount8[x >> 24]);
}
int
main(void)
{
unsigned long long int x;
int n;
n = 0;
for (x = 0; x <= UINT32_MAX; x++) {
n += popcount8(x);
}
printf("%d\n", n);
return 0;
}
Signed-off-by: Ben Pfaff <blp@nicira.com>
}
}
+/* Returns the number of 1-bits in 'x', between 0 and 32 inclusive. */
+int
+popcount(uint32_t x)
+{
+ /* In my testing, this implementation is over twice as fast as any other
+ * portable implementation that I tried, including GCC 4.4
+ * __builtin_popcount(), although nonportable asm("popcnt") was over 50%
+ * faster. */
+#define INIT1(X) \
+ ((((X) & (1 << 0)) != 0) + \
+ (((X) & (1 << 1)) != 0) + \
+ (((X) & (1 << 2)) != 0) + \
+ (((X) & (1 << 3)) != 0) + \
+ (((X) & (1 << 4)) != 0) + \
+ (((X) & (1 << 5)) != 0) + \
+ (((X) & (1 << 6)) != 0) + \
+ (((X) & (1 << 7)) != 0))
+#define INIT2(X) INIT1(X), INIT1((X) + 1)
+#define INIT4(X) INIT2(X), INIT2((X) + 2)
+#define INIT8(X) INIT4(X), INIT4((X) + 4)
+#define INIT16(X) INIT8(X), INIT8((X) + 8)
+#define INIT32(X) INIT16(X), INIT16((X) + 16)
+#define INIT64(X) INIT32(X), INIT32((X) + 32)
+
+ static const uint8_t popcount8[256] = {
+ INIT64(0), INIT64(64), INIT64(128), INIT64(192)
+ };
+
+ return (popcount8[x & 0xff] +
+ popcount8[(x >> 8) & 0xff] +
+ popcount8[(x >> 16) & 0xff] +
+ popcount8[x >> 24]);
+}
+
/* Returns true if the 'n' bytes starting at 'p' are zeros. */
bool
is_all_zeros(const uint8_t *p, size_t n)
#ifndef UTIL_H
#define UTIL_H 1
+#include <limits.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stddef.h>
int log_2_floor(uint32_t);
int log_2_ceil(uint32_t);
int ctz(uint32_t);
+int popcount(uint32_t);
bool is_all_zeros(const uint8_t *, size_t);
bool is_all_ones(const uint8_t *, size_t);
m4_foreach(
[testname],
[[ctz],
+ [popcount],
[log_2_floor],
[bitwise_copy],
[bitwise_zero],
[bitwise_one],
[bitwise_is_all_zeros]],
[AT_SETUP([testname[()] function])
+ AT_KEYWORDS([testname])
AT_CHECK([test-util testname], [0], [], [])
AT_CLEANUP])
#include "random.h"
#include "util.h"
+#undef NDEBUG
+#include <assert.h>
+
static void
check_log_2_floor(uint32_t x, int n)
{
check_ctz(0, 32);
}
+static void
+shuffle(unsigned int *p, size_t n)
+{
+ for (; n > 1; n--, p++) {
+ unsigned int *q = &p[rand() % n];
+ unsigned int tmp = *p;
+ *p = *q;
+ *q = tmp;
+ }
+}
+
+static void
+check_popcount(uint32_t x, int n)
+{
+ if (popcount(x) != n) {
+ fprintf(stderr, "popcount(%#"PRIx32") is %d but should be %d\n",
+ x, popcount(x), n);
+ abort();
+ }
+}
+
+static void
+test_popcount(int argc OVS_UNUSED, char *argv[] OVS_UNUSED)
+{
+ unsigned int bits[32];
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(bits); i++) {
+ bits[i] = 1u << i;
+ }
+
+ check_popcount(0, 0);
+
+ for (i = 0; i < 1000; i++) {
+ uint32_t x = 0;
+ int j;
+
+ shuffle(bits, ARRAY_SIZE(bits));
+ for (j = 0; j < 32; j++) {
+ x |= bits[j];
+ check_popcount(x, j + 1);
+ }
+ assert(x == UINT32_MAX);
+
+ shuffle(bits, ARRAY_SIZE(bits));
+ for (j = 31; j >= 0; j--) {
+ x &= ~bits[j];
+ check_popcount(x, j);
+ }
+ assert(x == 0);
+ }
+}
+
/* Returns the sum of the squares of the first 'n' positive integers. */
static unsigned int
sum_of_squares(int n)
\f
static const struct command commands[] = {
{"ctz", 0, 0, test_ctz},
+ {"popcount", 0, 0, test_popcount},
{"log_2_floor", 0, 0, test_log_2_floor},
{"bitwise_copy", 0, 0, test_bitwise_copy},
{"bitwise_zero", 0, 0, test_bitwise_zero},