Matrix transpose and population count

前端 未结 3 1505
旧巷少年郎
旧巷少年郎 2021-01-22 18:03

I have a square boolean matrix M of size N, stored by rows and I want to count the number of bits set to 1 for each column.

For instance for n=4:

1101
01         


        
3条回答
  •  醉话见心
    2021-01-22 18:41

    I ran a benchmark comparing the two approaches:

    1. transpose + popcount
    2. update row by row

    I wrote a naive version and an AVX2 version of each approach. For the AVX2 "transpose + popcount" approach I reused some functions found on Stack Overflow and elsewhere (see the links in the code).

    In my test, I assume that the input is an nbRows × 32 matrix in a bit-packed format (nbRows itself being a multiple of 32); the matrix is therefore stored as an array of nbRows uint32_t values, one per row.

    The code is the following:

    #include <cassert>
    #include <chrono>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <cstring>
    #include <ctime>
    #include <memory>
    #include <new>
    
    #include <immintrin.h>
    
    #include "asmlib.h"
    
    using namespace std;
    using namespace std::chrono;
    
    // Forward declaration: expands the bits of a 32-bit word into the 32 bytes of an
    // AVX2 register. The definition appears later in this file.
    // see https://stackoverflow.com/questions/24225786/fastest-way-to-unpack-32-bits-to-a-32-byte-simd-vector
    static __m256i expand_bits_to_bytes (uint32_t x);
    
    // Forward declaration: bit-matrix transpose of a block of 32 rows of 4 bytes each
    // (inp) into out. The definition appears later in this file.
    // see https://mischasan.wordpress.com/2011/10/03/the-full-sse2-bit-matrix-transpose-routine/
    static void sse_trans(char const *inp, char *out);
    
    // Forward declaration: standard deviation computed from a sample count (n), the sum
    // of squared samples (sum2) and the sum of samples (sum).
    static double deviation (double n, double sum2, double sum);
    
    ////////////////////////////////////////////////////////////////////////////////
    // Naive approach (matrix transposition)
    ////////////////////////////////////////////////////////////////////////////////
    // Counts, for each of the 32 columns, the number of bits set over all rows, using the
    // naive "transpose then popcount" scheme on 32x32 tiles.
    //
    //   nbRows     : number of rows; must be a multiple of 32 (asserted)
    //   bitmap     : nbRows packed rows, bit `col` of bitmap[row] is cell (row, col)
    //   globalSums : 32 counters; the per-column counts are ADDED to them, so the caller
    //                is responsible for zeroing them first
    void test_transpose_popcnt_naive (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
    {
        assert (nbRows%32==0);
    
        // One byte per bit of the current tile, stored transposed: transpo[col][row].
        // Every cell is rewritten for each tile, so a single zero-initialisation is enough.
        uint8_t transpo[32][32] = {};
    
        // NOTE(review): the tile loop header and the unpack loops were truncated in the
        // text this file was recovered from; they are rebuilt here from the surviving
        // "... >> col) & 1" fragment and from the popcount loop that follows.
        for (uint64_t k=0; k<nbRows; k+=32)
        {
            // We unpack and transpose the current 32x32 tile
            for (size_t row=0; row<32; row++)
            {
                const uint32_t bits = bitmap[k+row];
                for (size_t col=0; col<32; col++)  {  transpo[col][row] = (bits >> col) & 1 ;  }
            }
    
            for (size_t row=0; row<32; row++)
            {
                // We popcount the current row of the transposed tile (= one input column);
                // at most 32, so a byte cannot overflow
                uint8_t sum=0;
                for (size_t col=0; col<32; col++)  {  sum += transpo[row][col];  }
    
                // We update the corresponding global sum
                globalSums[row] += sum;
            }
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // Naive approach (row by row)
    ////////////////////////////////////////////////////////////////////////////////
    // Counts, for each of the 32 columns, the number of bits set over all rows by
    // visiting the rows one at a time and testing each bit.
    //
    //   nbRows     : number of rows (any value, no multiple-of-32 requirement here)
    //   bitmap     : nbRows packed rows, bit `col` of bitmap[row] is cell (row, col)
    //   globalSums : 32 counters; the per-column counts are ADDED to them, so the caller
    //                is responsible for zeroing them first
    void test_update_row_by_row_naive (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
    {
        // NOTE(review): both loop headers were truncated in the text this file was
        // recovered from; they are rebuilt from the surviving "... >> col) & 1" fragment.
        for (uint64_t row=0; row<nbRows; row++)
        {
            const uint32_t bits = bitmap[row];
            for (size_t col=0; col<32; col++)
            {
                globalSums[col] += (bits >> col) & 1;
            }
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // AVX2 (matrix transposition + popcount)
    ////////////////////////////////////////////////////////////////////////////////
    void test_transpose_popcnt_avx2 (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
    {
        assert (nbRows%32==0);
    
        uint32_t transpo[32];
    
        const uint32_t* loop = bitmap;
        for (uint64_t k=0; k
    void UpdateLocalSums (__m256i& localSums, const uint32_t* bitmap, uint64_t& k)
    {
        // We update the local sums with the current row
        localSums = _mm256_sub_epi8 (localSums, expand_bits_to_bytes (bitmap[k++]));
    
        // Go recursively
        UpdateLocalSums(localSums, bitmap, k);
    }
    
    template<>
    void UpdateLocalSums<0> (__m256i& localSums, const uint32_t* bitmap, uint64_t& k)
    {
    }
    
    // Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums with AVX2
    #define USE_AVX2_FOR_GRAND_TOTALS 1
    
    void test_update_row_by_row_avx2 (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
    {
        union U256i {  __m256i v;   uint8_t a[32];  uint32_t b[8];  };
    
        // We use 1 register for updating local totals
        __m256i   localSums = _mm256_setzero_si256();
    
    #ifdef USE_AVX2_FOR_GRAND_TOTALS
        // Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums with AVX2
        __m256i   globalSumsReg[4];  for (size_t r=0; r<4; r++)  {   globalSumsReg[r] = _mm256_setzero_si256(); }
    #endif
    
        uint64_t steps = nbRows / 255;
        uint64_t k=0;
    
        const int divisorOf255 = 5;
    
        // We iterate over all rows
        for (uint64_t i=0; i(localSums, bitmap, k);
            }
    
    #ifdef USE_AVX2_FOR_GRAND_TOTALS
            // Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums
    
            // We take the 128 high bits of the local sums
            __m256i   localSums2 = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(localSums,1));
    
            globalSumsReg[0] = _mm256_add_epi32 (globalSumsReg[0],
                _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums, 0)))
            );
            globalSumsReg[1] = _mm256_add_epi32 (globalSumsReg[1],
                _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums, 8)))
            );
            globalSumsReg[2] = _mm256_add_epi32 (globalSumsReg[2],
                _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums2, 0)))
            );
            globalSumsReg[3] = _mm256_add_epi32 (globalSumsReg[3],
                _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums2, 8)))
            );
    #else
            // we update the global totals
            U256i tmp = { localSums };
            for (size_t k=0; k<32; k++)  {  globalSums[k] += tmp.a[k];  }
    #endif
            // we reset the local totals
            localSums = _mm256_setzero_si256();
        }
    
    #ifdef USE_AVX2_FOR_GRAND_TOTALS
        // We update the global totals into the final uint32_t array
        for (size_t r=0; r<4; r++)
        {
            U256i tmp = { globalSumsReg[r] };
            for (size_t k=0; k<8; k++)  {  globalSums[r*8+k] += tmp.b[k];  }
        }
    #endif
    
        // we update the remaining local totals
        for (uint64_t i=steps*255; i(localSums, bitmap, k);
        }
    
        // we update the global totals
        U256i tmp = { localSums };
        for (size_t k=0; k<32; k++)  {  globalSums[k] += tmp.a[k];  }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    void execute (
        const char* name,
        void (*fct)(uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums),
        size_t nbRuns,
        uint64_t nbRows,
        u_int32_t* bitmap
    )
    {
        uint64_t  sums[32];
    
        double timeTotal=0;
        double cycleTotal=0;
        double timeTotal2=0;
        double cycleTotal2=0;
        uint64_t check=0;
    
        for (size_t n=0; n(system_clock::now().time_since_epoch());
            uint64_t c0 = ReadTSC();
    
            // We run the test
            (*fct) (nbRows, bitmap, sums);
    
            uint64_t c1 = ReadTSC();
            milliseconds t1 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());
    
            timeTotal  += (t1-t0).count();
            cycleTotal += (double)(c1-c0) / nbRows;
    
            timeTotal2  += (t1-t0).count() * (t1-t0).count();
            cycleTotal2 += ((double)(c1-c0) / nbRows) * ((double)(c1-c0) / nbRows);
    
            // We compute some dummy checksum
            for (size_t k=0; k<32; k++)  {  check += sums[k];  }
        }
    
        printf ("%-21s |  %5.0lf (%5.1lf)            |  %5.2lf (%4.2lf)          |  %.3lf           |  0x%lx\n",
            name,
            timeTotal / nbRuns,
            deviation (nbRuns, timeTotal2, timeTotal),
            cycleTotal/nbRuns,
            deviation (nbRuns, cycleTotal2, cycleTotal),
            check,
            nbRows * cycleTotal / timeTotal / 1000000.0
        );
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    int main(int argc, char **argv)
    {
        // We set rows number as 2^n where n is the provided argument
        // For simplification, we assume that the rows number is a multiple of 32
        uint64_t nbRows = 1ULL << (argc>1 ? atoi(argv[1]) : 28);
        size_t   nbRuns = argc>2 ? atoi(argv[2]) : 10;
    
        // We build an bitmap of size nbRows*32
        uint32_t* bitmap = new uint32_t[nbRows];
        if (bitmap==nullptr)
        {
            fprintf(stderr, "unable to allocate the bitmap\n");
            exit(1);
        }
    
        // We fill the bitmap with random values
        srand(time(nullptr));
        for (uint64_t i=0; i 8 bytes, pattern repeats.
        __m256i isolated_inverted = _mm256_and_si256(shuf, andmask);
    
        // Avoid an _mm256_add_epi8 thanks to Peter Cordes's comment
        return _mm256_cmpeq_epi8(isolated_inverted, andmask);
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // Transposes a 32x32 bit matrix.
    //
    //   inp : 32 rows of 4 bytes each (128 bytes read)
    //   out : receives 32 words of 4 bytes (128 bytes written); bit r of word c is
    //         bit c of input row r (bit positions counted within little-endian
    //         32-bit words, as the caller in this file uses them)
    void sse_trans(char const *inp, char *out)
    {
        // The matrix is processed as 4 groups of 8 columns; group g is byte g of every row.
        for (int cc = 0; cc < 32; cc += 8)
        {
            // We gather byte cc/8 of each of the 32 rows: byte i of the register <-> row i
            uint8_t gathered[32];
            for (int i = 0; i < 32; ++i)
                gathered[i] = static_cast<uint8_t>(inp[i*4 + cc/8]);
    
            __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(gathered));
    
            // movemask collects the top bit of every byte, i.e. one whole column.
            // We emit column cc+7 first, then shift left to bring the next lower
            // bit of every byte to the top.
            for (int i = 7; i >= 0; --i)
            {
                const uint32_t column = static_cast<uint32_t>(_mm256_movemask_epi8(v));
    
                // memcpy instead of a store through a cast pointer: no alignment or
                // aliasing assumption on `out`
                std::memcpy(out + (cc + i)*4, &column, sizeof column);
    
                v = _mm256_slli_epi64(v, 1);
            }
        }
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // Population standard deviation from running totals.
    //   n    : number of samples
    //   sum2 : sum of the squared samples
    //   sum  : sum of the samples
    // Returns sqrt(sum2/n - (sum/n)^2).
    double deviation (double n, double sum2, double sum)
    {
        const double mean = sum/n;
        double variance = sum2/n - mean*mean;
    
        // With (nearly) identical samples, rounding can leave a tiny negative value here,
        // which would make sqrt return NaN; a true variance is never negative.
        if (variance < 0.0)  {  variance = 0.0;  }
    
        return sqrt (variance);
    }
    

    Some remarks:

    • I used Agner Fog's asmlib for a function (ReadTSC) that returns the CPU cycle count
    • The compilation command is g++ -O3 -march=native ../Test.cpp -o ./Test -laelf64
    • The gcc version is 7.3.1
    • The CPU is Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
    • I compute some dummy checksum to compare the results of the different tests

    Now the results:

    ------------------------------------------------------------------------------------------------------------
    name                  | time in msec : mean (sd)  | cycles/row : mean (sd) | frequency in GHz | checksum
    ------------------------------------------------------------------------------------------------------------
    naive (transpo)       |   4548 ( 36.5)            |  43.91 (0.35)          |  2.592           |  0x9affeb5a6
    naive (row by row)    |   3033 ( 11.0)            |  29.29 (0.11)          |  2.592           |  0x9affeb5a6
    AVX2  (transpo)       |    767 ( 12.8)            |   7.40 (0.12)          |  2.592           |  0x9affeb5a6
    AVX2  (row by row)    |    130 (  4.0)            |   1.25 (0.04)          |  2.591           |  0x9affeb5a6
    

    So it seems that the "row by row" in AVX2 is the best so far.

    Note that when I saw this result (less than 2 cycles per row), I made no further effort to optimize the AVX2 "transpose + popcount" method, which should be feasible by computing several popcounts in parallel (I may test it later).

提交回复
热议问题