Compute fast log base 2 ceiling

前端 未结 14 638
栀梦
栀梦 2020-11-28 11:35

What is a fast way to compute the (long int) ceiling(log_2(i)), where the input and output are 64-bit integers? Solutions for signed or unsigned integers are ac

14条回答
  •  迷失自我
    2020-11-28 12:33

    I have benchmarked several implementations of a 64 bit "highest bit". The most "branch free" code is not in fact the fastest.

    This is my highest-bit.c source file:

    int highest_bit_unrolled(unsigned long long n)
    {
      if (n & 0xFFFFFFFF00000000) {
        if (n & 0xFFFF000000000000) {
          if (n & 0xFF00000000000000) {
            if (n & 0xF000000000000000) {
              if (n & 0xC000000000000000)
                return (n & 0x8000000000000000) ? 64 : 63;
              else
                return (n & 0x2000000000000000) ? 62 : 61;
            } else {
              if (n & 0x0C00000000000000)
                return (n & 0x0800000000000000) ? 60 : 59;
              else
                return (n & 0x0200000000000000) ? 58 : 57;
            }
          } else {
            if (n & 0x00F0000000000000) {
              if (n & 0x00C0000000000000)
                return (n & 0x0080000000000000) ? 56 : 55;
              else
                return (n & 0x0020000000000000) ? 54 : 53;
            } else {
              if (n & 0x000C000000000000)
                return (n & 0x0008000000000000) ? 52 : 51;
              else
                return (n & 0x0002000000000000) ? 50 : 49;
            }
          }
        } else {
          if (n & 0x0000FF0000000000) {
            if (n & 0x0000F00000000000) {
              if (n & 0x0000C00000000000)
                return (n & 0x0000800000000000) ? 48 : 47;
              else
                return (n & 0x0000200000000000) ? 46 : 45;
            } else {
              if (n & 0x00000C0000000000)
                return (n & 0x0000080000000000) ? 44 : 43;
              else
                return (n & 0x0000020000000000) ? 42 : 41;
            }
          } else {
            if (n & 0x000000F000000000) {
              if (n & 0x000000C000000000)
                return (n & 0x0000008000000000) ? 40 : 39;
              else
                return (n & 0x0000002000000000) ? 38 : 37;
            } else {
              if (n & 0x0000000C00000000)
                return (n & 0x0000000800000000) ? 36 : 35;
              else
                return (n & 0x0000000200000000) ? 34 : 33;
            }
          }
        }
      } else {
        if (n & 0x00000000FFFF0000) {
          if (n & 0x00000000FF000000) {
            if (n & 0x00000000F0000000) {
              if (n & 0x00000000C0000000)
                return (n & 0x0000000080000000) ? 32 : 31;
              else
                return (n & 0x0000000020000000) ? 30 : 29;
            } else {
              if (n & 0x000000000C000000)
                return (n & 0x0000000008000000) ? 28 : 27;
              else
                return (n & 0x0000000002000000) ? 26 : 25;
            }
          } else {
            if (n & 0x0000000000F00000) {
              if (n & 0x0000000000C00000)
                return (n & 0x0000000000800000) ? 24 : 23;
              else
                return (n & 0x0000000000200000) ? 22 : 21;
            } else {
              if (n & 0x00000000000C0000)
                return (n & 0x0000000000080000) ? 20 : 19;
              else
                return (n & 0x0000000000020000) ? 18 : 17;
            }
          }
        } else {
          if (n & 0x000000000000FF00) {
            if (n & 0x000000000000F000) {
              if (n & 0x000000000000C000)
                return (n & 0x0000000000008000) ? 16 : 15;
              else
                return (n & 0x0000000000002000) ? 14 : 13;
            } else {
              if (n & 0x0000000000000C00)
                return (n & 0x0000000000000800) ? 12 : 11;
              else
                return (n & 0x0000000000000200) ? 10 : 9;
            }
          } else {
            if (n & 0x00000000000000F0) {
              if (n & 0x00000000000000C0)
                return (n & 0x0000000000000080) ? 8 : 7;
              else
                return (n & 0x0000000000000020) ? 6 : 5;
            } else {
              if (n & 0x000000000000000C)
                return (n & 0x0000000000000008) ? 4 : 3;
              else
                return (n & 0x0000000000000002) ? 2 : (n ? 1 : 0);
            }
          }
        }
      }
    }
    
    int highest_bit_bs(unsigned long long n)
    {
      const unsigned long long mask[] = {
        0x000000007FFFFFFF,
        0x000000000000FFFF,
        0x00000000000000FF,
        0x000000000000000F,
        0x0000000000000003,
        0x0000000000000001
      };
      int hi = 64;
      int lo = 0;
      int i = 0;
    
      if (n == 0)
        return 0;
    
      for (i = 0; i < sizeof mask / sizeof mask[0]; i++) {
        int mi = lo + (hi - lo) / 2;
    
        if ((n >> mi) != 0)
          lo = mi;
        else if ((n & (mask[i] << lo)) != 0)
          hi = mi;
      }
    
      return lo + 1;
    }
    
    int highest_bit_shift(unsigned long long n)
    {
      int i = 0;
      for (; n; n >>= 1, i++)
        ; /* empty */
      return i;
    }
    
    static int count_ones(unsigned long long d)
    {
      d = ((d & 0xAAAAAAAAAAAAAAAA) >>  1) + (d & 0x5555555555555555);
      d = ((d & 0xCCCCCCCCCCCCCCCC) >>  2) + (d & 0x3333333333333333);
      d = ((d & 0xF0F0F0F0F0F0F0F0) >>  4) + (d & 0x0F0F0F0F0F0F0F0F);
      d = ((d & 0xFF00FF00FF00FF00) >>  8) + (d & 0x00FF00FF00FF00FF);
      d = ((d & 0xFFFF0000FFFF0000) >> 16) + (d & 0x0000FFFF0000FFFF);
      d = ((d & 0xFFFFFFFF00000000) >> 32) + (d & 0x00000000FFFFFFFF);
      return d;
    }
    
    int highest_bit_parallel(unsigned long long n)
    {
      n |= n >> 1;
      n |= n >> 2;
      n |= n >> 4;
      n |= n >> 8;
      n |= n >> 16;
      n |= n >> 32;
      return count_ones(n);
    }
    
    int highest_bit_so(unsigned long long x)
    {
      static const unsigned long long t[6] = {
        0xFFFFFFFF00000000ull,
        0x00000000FFFF0000ull,
        0x000000000000FF00ull,
        0x00000000000000F0ull,
        0x000000000000000Cull,
        0x0000000000000002ull
      };
    
      int y = (((x & (x - 1)) == 0) ? 0 : 1);
      int j = 32;
      int i;
    
      for (i = 0; i < 6; i++) {
        int k = (((x & t[i]) == 0) ? 0 : j);
        y += k;
        x >>= k;
        j >>= 1;
      }
    
      return y;
    }
    
    int highest_bit_so2(unsigned long long value)
    {
      int pos = 0;
      if (value & (value - 1ULL))
      {
        pos = 1;
      }
      if (value & 0xFFFFFFFF00000000ULL)
      {
        pos += 32;
        value = value >> 32;
      }
      if (value & 0x00000000FFFF0000ULL)
      {
        pos += 16;
        value = value >> 16;
      }
      if (value & 0x000000000000FF00ULL)
      {
        pos += 8;
        value = value >> 8;
      }
      if (value & 0x00000000000000F0ULL)
      {
        pos += 4;
        value = value >> 4;
      }
      if (value & 0x000000000000000CULL)
      {
        pos += 2;
        value = value >> 2;
      }
      if (value & 0x0000000000000002ULL)
      {
        pos += 1;
        value = value >> 1;
      }
      return pos;
    }
    

    This is highest-bit.h:

    int highest_bit_unrolled(unsigned long long n);
    int highest_bit_bs(unsigned long long n);
    int highest_bit_shift(unsigned long long n);
    int highest_bit_parallel(unsigned long long n);
    int highest_bit_so(unsigned long long n);
    int highest_bit_so2(unsigned long long n);
    

    And the main program (sorry about all the copy and paste):

    #include 
    #include 
    #include 
    #include "highest-bit.h"
    
    double timedelta(clock_t start, clock_t end)
    {
      return (end - start)*1.0/CLOCKS_PER_SEC;
    }
    
    int main(int argc, char **argv)
    {
      int i;
      volatile unsigned long long v;
      clock_t start, end;
    
      start = clock();
    
      for (i = 0; i < 10000000; i++) {
        for (v = 0x8000000000000000; v; v >>= 1)
          highest_bit_unrolled(v);
      }
    
      end = clock();
    
      printf("highest_bit_unrolled = %6.3fs\n", timedelta(start, end));
    
      start = clock();
    
      for (i = 0; i < 10000000; i++) {
        for (v = 0x8000000000000000; v; v >>= 1)
          highest_bit_parallel(v);
      }
    
      end = clock();
    
      printf("highest_bit_parallel = %6.3fs\n", timedelta(start, end));
    
      start = clock();
    
      for (i = 0; i < 10000000; i++) {
        for (v = 0x8000000000000000; v; v >>= 1)
          highest_bit_bs(v);
      }
    
      end = clock();
    
      printf("highest_bit_bs = %6.3fs\n", timedelta(start, end));
    
      start = clock();
    
      for (i = 0; i < 10000000; i++) {
        for (v = 0x8000000000000000; v; v >>= 1)
          highest_bit_shift(v);
      }
    
      end = clock();
    
      printf("highest_bit_shift = %6.3fs\n", timedelta(start, end));
    
      start = clock();
    
      for (i = 0; i < 10000000; i++) {
        for (v = 0x8000000000000000; v; v >>= 1)
          highest_bit_so(v);
      }
    
      end = clock();
    
      printf("highest_bit_so = %6.3fs\n", timedelta(start, end));
    
      start = clock();
    
      for (i = 0; i < 10000000; i++) {
        for (v = 0x8000000000000000; v; v >>= 1)
          highest_bit_so2(v);
      }
    
      end = clock();
    
      printf("highest_bit_so2 = %6.3fs\n", timedelta(start, end));
    
      return 0;
    }
    

    I've tried this various Intel x86 boxes, old and new.

    The highest_bit_unrolled (unrolled binary search) is consistently significantly faster than highest_bit_parallel (branch-free bit ops). This is faster than than highest_bit_bs (binary search loop), and that in turn is faster than highest_bit_shift (naive shift and count loop).

    highest_bit_unrolled is also faster than the one in the accepted SO answer (highest_bit_so) and also one given in another answer (highest_bit_so2).

    The benchmark cycles through a one-bit mask that covers successive bits. This is to try to defeat branch prediction in the unrolled binary search, which is realistic: in a real world program, the input cases are unlikely to exhibit locality of bit position.

    Here are results on an old Intel(R) Core(TM)2 Duo CPU E4500 @ 2.20GHz:

    $ ./highest-bit
    highest_bit_unrolled =  6.090s
    highest_bit_parallel =  9.260s
    highest_bit_bs = 19.910s
    highest_bit_shift = 21.130s
    highest_bit_so =  8.230s
    highest_bit_so2 =  6.960s
    

    On a newer model Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz:

    highest_bit_unrolled =  1.555s
    highest_bit_parallel =  3.420s
    highest_bit_bs =  6.486s
    highest_bit_shift =  9.505s
    highest_bit_so =  4.127s
    highest_bit_so2 =  1.645s
    

    On the newer hardware, highest_bit_so2 comes closer to highest_bit_unrolled on the newer hardware. The order is not quite the same; now highest_bit_so really falls behind, and is slower than highest_bit_parallel.

    The fastest one, highest_bit_unrolled contains the most code with the most branches. Every single return value reached by a different set of conditions with its own dedicated piece of code.

    The intuition of "avoid all branches" (due to worries about branch mis-predictions) is not always right. Modern (and even not-so-modern any more) processors contain considerable cunning in order not to be hampered by branching.


    P.S. the highest_bit_unrolled was introduced in the TXR language in December 2011 (with mistakes, since debugged).

    Recently, I started wondering whether some nicer, more compact code without branches might not be faster.

    I'm somewhat surprised by the results.

    Arguably, the code should really be #ifdef-ing for GNU C and using some compiler primitives, but as far as portability goes, that version stays.

提交回复
热议问题