I am looking for an efficient way to determine the position of the least significant bit that is set in an integer, e.g. for 0x0FF0 it would be 4.
A trivial impleme
Weee, loads of solutions and not a benchmark in sight. You people should be ashamed of yourselves ;-)
My machine is an Intel i530 (2.9 GHz), running Windows 7 64-bit. I compiled with a 32-bit version of MinGW.
$ gcc --version
gcc.exe (GCC) 4.7.2
$ gcc bench.c -o bench.exe -std=c99 -Wall -O2
$ bench
Naive loop. Time = 2.91 (Original questioner)
De Bruijn multiply. Time = 1.16 (Tykhyy)
Lookup table. Time = 0.36 (Andrew Grant)
FFS instruction. Time = 0.90 (ephemient)
Branch free mask. Time = 3.48 (Dan / Jim Balter)
Double hack. Time = 3.41 (DocMax)
$ gcc bench.c -o bench.exe -std=c99 -Wall -O2 -march=native
$ bench
Naive loop. Time = 2.92
De Bruijn multiply. Time = 0.47
Lookup table. Time = 0.35
FFS instruction. Time = 0.68
Branch free mask. Time = 3.49
Double hack. Time = 0.92
My code:
#include
#include
#include
#define ARRAY_SIZE 65536
#define NUM_ITERS 5000 // Number of times to process array
int find_first_bits_naive_loop(unsigned nums[ARRAY_SIZE])
{
int total = 0; // Prevent compiler from optimizing out the code
for (int j = 0; j < NUM_ITERS; j++) {
for (int i = 0; i < ARRAY_SIZE; i++) {
unsigned value = nums[i];
if (value == 0)
continue;
unsigned pos = 0;
while (!(value & 1))
{
value >>= 1;
++pos;
}
total += pos + 1;
}
}
return total;
}
int find_first_bits_de_bruijn(unsigned nums[ARRAY_SIZE])
{
static const int MultiplyDeBruijnBitPosition[32] =
{
1, 2, 29, 3, 30, 15, 25, 4, 31, 23, 21, 16, 26, 18, 5, 9,
32, 28, 14, 24, 22, 20, 17, 8, 27, 13, 19, 7, 12, 6, 11, 10
};
int total = 0; // Prevent compiler from optimizing out the code
for (int j = 0; j < NUM_ITERS; j++) {
for (int i = 0; i < ARRAY_SIZE; i++) {
unsigned int c = nums[i];
total += MultiplyDeBruijnBitPosition[((unsigned)((c & -c) * 0x077CB531U)) >> 27];
}
}
return total;
}
unsigned char lowestBitTable[256];
int get_lowest_set_bit(unsigned num) {
unsigned mask = 1;
for (int cnt = 1; cnt <= 32; cnt++, mask <<= 1) {
if (num & mask) {
return cnt;
}
}
return 0;
}
int find_first_bits_lookup_table(unsigned nums[ARRAY_SIZE])
{
int total = 0; // Prevent compiler from optimizing out the code
for (int j = 0; j < NUM_ITERS; j++) {
for (int i = 0; i < ARRAY_SIZE; i++) {
unsigned int value = nums[i];
// note that order to check indices will depend whether you are on a big
// or little endian machine. This is for little-endian
unsigned char *bytes = (unsigned char *)&value;
if (bytes[0])
total += lowestBitTable[bytes[0]];
else if (bytes[1])
total += lowestBitTable[bytes[1]] + 8;
else if (bytes[2])
total += lowestBitTable[bytes[2]] + 16;
else
total += lowestBitTable[bytes[3]] + 24;
}
}
return total;
}
int find_first_bits_ffs_instruction(unsigned nums[ARRAY_SIZE])
{
int total = 0; // Prevent compiler from optimizing out the code
for (int j = 0; j < NUM_ITERS; j++) {
for (int i = 0; i < ARRAY_SIZE; i++) {
total += __builtin_ffs(nums[i]);
}
}
return total;
}
int find_first_bits_branch_free_mask(unsigned nums[ARRAY_SIZE])
{
int total = 0; // Prevent compiler from optimizing out the code
for (int j = 0; j < NUM_ITERS; j++) {
for (int i = 0; i < ARRAY_SIZE; i++) {
unsigned value = nums[i];
int i16 = !(value & 0xffff) << 4;
value >>= i16;
int i8 = !(value & 0xff) << 3;
value >>= i8;
int i4 = !(value & 0xf) << 2;
value >>= i4;
int i2 = !(value & 0x3) << 1;
value >>= i2;
int i1 = !(value & 0x1);
int i0 = (value >> i1) & 1? 0 : -32;
total += i16 + i8 + i4 + i2 + i1 + i0 + 1;
}
}
return total;
}
int find_first_bits_double_hack(unsigned nums[ARRAY_SIZE])
{
int total = 0; // Prevent compiler from optimizing out the code
for (int j = 0; j < NUM_ITERS; j++) {
for (int i = 0; i < ARRAY_SIZE; i++) {
unsigned value = nums[i];
double d = value ^ (value - !!value);
total += (((int*)&d)[1]>>20)-1022;
}
}
return total;
}
int main() {
unsigned nums[ARRAY_SIZE];
for (int i = 0; i < ARRAY_SIZE; i++) {
nums[i] = rand() + (rand() << 15);
}
for (int i = 0; i < 256; i++) {
lowestBitTable[i] = get_lowest_set_bit(i);
}
clock_t start_time, end_time;
int result;
start_time = clock();
result = find_first_bits_naive_loop(nums);
end_time = clock();
printf("Naive loop. Time = %.2f, result = %d\n",
(end_time - start_time) / (double)(CLOCKS_PER_SEC), result);
start_time = clock();
result = find_first_bits_de_bruijn(nums);
end_time = clock();
printf("De Bruijn multiply. Time = %.2f, result = %d\n",
(end_time - start_time) / (double)(CLOCKS_PER_SEC), result);
start_time = clock();
result = find_first_bits_lookup_table(nums);
end_time = clock();
printf("Lookup table. Time = %.2f, result = %d\n",
(end_time - start_time) / (double)(CLOCKS_PER_SEC), result);
start_time = clock();
result = find_first_bits_ffs_instruction(nums);
end_time = clock();
printf("FFS instruction. Time = %.2f, result = %d\n",
(end_time - start_time) / (double)(CLOCKS_PER_SEC), result);
start_time = clock();
result = find_first_bits_branch_free_mask(nums);
end_time = clock();
printf("Branch free mask. Time = %.2f, result = %d\n",
(end_time - start_time) / (double)(CLOCKS_PER_SEC), result);
start_time = clock();
result = find_first_bits_double_hack(nums);
end_time = clock();
printf("Double hack. Time = %.2f, result = %d\n",
(end_time - start_time) / (double)(CLOCKS_PER_SEC), result);
}