Efficient Algorithm for Bit Reversal (from MSB->LSB to LSB->MSB) in C

情深已故 2020-11-22 06:08

What is the most efficient algorithm to achieve the following:

0010 0000 => 0000 0100

The conversion is from MSB->LSB to LSB->MSB. All bits must be reversed; that is, this is not endianness swapping.

26 Answers
  •  再見小時候
    2020-11-22 06:54

    This thread caught my attention since it deals with a simple problem that requires a lot of work (CPU cycles) even on a modern CPU. And one day I also stood there with the same ¤#%"#" problem: I had to flip millions of bytes. However, I know all my target systems are modern Intel-based, so let's start optimizing to the extreme!!!

    So I used Matt J's lookup code as the base. The system I'm benchmarking on is an i7-4700EQ (Haswell).

    Matt J's lookup bitflipping 400 000 000 bytes: Around 0.272 seconds.
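
    For reference, the byte-wide lookup approach being benchmarked can be sketched roughly like this (just a sketch to show the idea, not Matt J's exact code; here the table is built at startup):

    // Sketch of a byte-wise lookup-table bit reversal (illustrative only).
    // rev_table maps every byte value to its bit-reversed counterpart,
    // e.g. rev_table[0x20] == 0x04.
    #include <stddef.h>

    static unsigned char rev_table[256];

    static void init_rev_table(void)
    {
        for (int i = 0; i < 256; i++) {
            unsigned char r = 0;
            for (int b = 0; b < 8; b++)
                if (i & (1 << b))
                    r |= (unsigned char)(1 << (7 - b));
            rev_table[i] = r;
        }
    }

    static void flip_bytes(unsigned char *bytes, size_t count)
    {
        for (size_t i = 0; i < count; i++)
            bytes[i] = rev_table[bytes[i]];
    }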

    I then went ahead and tried to see whether Intel's ISPC compiler could vectorise the arithmetic in reverse.c.

    I'm not going to bore you with my findings here, since I tried a lot of things to help the compiler along; anyhow, I ended up at around 0.15 seconds to bitflip 400 000 000 bytes. That's a big reduction, but for my application it's still far too slow.

    So, people, let me present the fastest Intel-based bitflipper in the world. Clocked at:

    Time to bitflip 400000000 bytes: 0.050082 seconds !!!!!

    // Bitflip using AVX2 - The fastest Intel based bitflip in the world!!
    // Made by Anders Cedronius 2014 (anders.cedronius (you know what) gmail.com)
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <omp.h>
    
    using namespace std;
    
    #define DISPLAY_HEIGHT  4
    #define DISPLAY_WIDTH   32
    #define NUM_DATA_BYTES  400000000
    
    // Constants: first the mask, then the high-order nibble lookup table, and last the low-order nibble lookup table
    __attribute__ ((aligned(32))) static unsigned char k1[32*3]={
            0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,
            0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f,
            0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0,0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0
    };
    
    // The data to be bitflipped (+32 bytes so the final partial 32-byte chunk can be read/written without going out of bounds)
    __attribute__ ((aligned(32))) static unsigned char data[NUM_DATA_BYTES+32]={};
    
    extern "C" {
    void bitflipbyte(unsigned char[],unsigned int,unsigned char[]);
    }
    
    int main()
    {
    
        for(unsigned int i = 0; i < NUM_DATA_BYTES; i++)
        {
            data[i] = rand();
        }
    
        printf ("\r\nData in(start):\r\n");
        for (unsigned int j = 0; j < 4; j++)
        {
            for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
            {
                printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);
            }
            printf ("\r\n");
        }
    
        printf ("\r\nNumber of 32-byte chunks to convert: %d\r\n",(unsigned int)ceil(NUM_DATA_BYTES/32.0));
    
        double start_time = omp_get_wtime();
        bitflipbyte(data,(unsigned int)ceil(NUM_DATA_BYTES/32.0),k1);
        double end_time = omp_get_wtime();
    
        printf ("\r\nData out:\r\n");
        for (unsigned int j = 0; j < 4; j++)
        {
            for (unsigned int i = 0; i < DISPLAY_WIDTH; i++)
            {
                printf ("0x%02x,",data[i+(j*DISPLAY_WIDTH)]);
            }
            printf ("\r\n");
        }
        printf("\r\n\r\nTime to bitflip %d bytes: %f seconds\r\n\r\n",NUM_DATA_BYTES, end_time-start_time);
    
        // return with no errors
        return 0;
    }
    

    The printf calls are just for debugging.

    Here is the workhorse:

    bits 64
    global bitflipbyte
    
    bitflipbyte:    
            vmovdqa     ymm2, [rdx]
            add         rdx, 20h
            vmovdqa     ymm3, [rdx]
            add         rdx, 20h
            vmovdqa     ymm4, [rdx]
    bitflipp_loop:
            vmovdqa     ymm0, [rdi] 
            vpand       ymm1, ymm2, ymm0 
            vpandn      ymm0, ymm2, ymm0 
            vpsrld      ymm0, ymm0, 4h 
            vpshufb     ymm1, ymm4, ymm1 
            vpshufb     ymm0, ymm3, ymm0         
            vpor        ymm0, ymm0, ymm1
            vmovdqa     [rdi], ymm0
            add     rdi, 20h
            dec     rsi
            jnz     bitflipp_loop
            ret
    

    The code takes 32 bytes at a time and masks out the nibbles. The high nibbles get shifted right by 4. Then I use vpshufb with ymm4 / ymm3 as lookup tables: the low nibble is looked up into the new high-nibble position and the shifted-down high nibble into the new low-nibble position. I could use a single lookup table, but then I would have to shift left before ORing the nibbles back together.
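
    In plain scalar C, the per-byte mapping that the two vpshufb lookups perform could be sketched like this (illustration only; the two tables hold the same values as the second and third 32-byte blocks of k1 above):

    // Scalar sketch of the same nibble-lookup idea (illustrative only).
    // rev_to_low reverses a nibble and leaves it in the low nibble position,
    // rev_to_high reverses a nibble and places it in the high nibble position;
    // the values match the k1 lookup blocks above.
    static const unsigned char rev_to_low[16] = {
        0x00,0x08,0x04,0x0c,0x02,0x0a,0x06,0x0e,
        0x01,0x09,0x05,0x0d,0x03,0x0b,0x07,0x0f
    };
    static const unsigned char rev_to_high[16] = {
        0x00,0x80,0x40,0xc0,0x20,0xa0,0x60,0xe0,
        0x10,0x90,0x50,0xd0,0x30,0xb0,0x70,0xf0
    };

    static unsigned char flip_byte(unsigned char b)
    {
        // old high nibble -> new low nibble, old low nibble -> new high nibble
        return (unsigned char)(rev_to_low[b >> 4] | rev_to_high[b & 0x0f]);
    }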

    There are even faster ways of flipping the bits, but I'm bound to a single thread and a single CPU, so this was the fastest I could achieve. Can you make a faster version?

    Please make no comments about using the Intel C/C++ Compiler Intrinsic Equivalent commands...
