Matrix transpose and population count

前端未结

关注

 3  1505

旧巷少年郎 2021-01-22 18:03

I have a square boolean matrix M of size N, stored by rows and I want to count the number of bits set to 1 for each column.

For instance for n=4:

1101
01


      
      
        
          3条回答        

        
                    
            
            
                         
                
              
              
                
                   醉话见心
                                             
                
                
                (楼主)
            
              
              
                2021-01-22 18:41
              

            
            
                        
I made some benchmark between the two approaches:


transpose + popcount
update row by row


I wrote a naive version and an AVX2 one for both approaches. I used some functions (found on stackoverflow or elsewhere) for the AVX2 "transpose+popcount" approach.

In my test, I make the assumption that the input is a nbRowsx32 matrix in a bits packed format (nbRows itself being a multiple of 32); the matrix is therefore stored as an array of uint32_t.

The code is the following:

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

using namespace std;
using namespace std::chrono;

// see https://stackoverflow.com/questions/24225786/fastest-way-to-unpack-32-bits-to-a-32-byte-simd-vector
static __m256i expand_bits_to_bytes (uint32_t x);

// see https://mischasan.wordpress.com/2011/10/03/the-full-sse2-bit-matrix-transpose-routine/
static void sse_trans(char const *inp, char *out);

static double deviation (double n, double sum2, double sum);

////////////////////////////////////////////////////////////////////////////////
// Naive approach (matrix transposition)
////////////////////////////////////////////////////////////////////////////////
void test_transpose_popcnt_naive (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
{
    assert (nbRows%32==0);

    uint8_t transpo[32][32];  memset (transpo, 0, sizeof(transpo));

    for (uint64_t k=0; k> col) & 1 ;  }
        }

        for (size_t row=0; row<32; row++)
        {
            // We popcount the current row
            u_int8_t sum=0;
            for (size_t col=0; col<32; col++)  {  sum += transpo[row][col];  }

            // We update the corresponding global sum
            globalSums[row] += sum;
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// Naive approach (row by row)
////////////////////////////////////////////////////////////////////////////////
void test_update_row_by_row_naive (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
{
    for (uint64_t row=0; row> col) & 1;
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// AVX2 (matrix transposition + popcount)
////////////////////////////////////////////////////////////////////////////////
void test_transpose_popcnt_avx2 (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
{
    assert (nbRows%32==0);

    uint32_t transpo[32];

    const uint32_t* loop = bitmap;
    for (uint64_t k=0; k
void UpdateLocalSums (__m256i& localSums, const uint32_t* bitmap, uint64_t& k)
{
    // We update the local sums with the current row
    localSums = _mm256_sub_epi8 (localSums, expand_bits_to_bytes (bitmap[k++]));

    // Go recursively
    UpdateLocalSums(localSums, bitmap, k);
}

template<>
void UpdateLocalSums<0> (__m256i& localSums, const uint32_t* bitmap, uint64_t& k)
{
}

// Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums with AVX2
#define USE_AVX2_FOR_GRAND_TOTALS 1

void test_update_row_by_row_avx2 (uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums)
{
    union U256i {  __m256i v;   uint8_t a[32];  uint32_t b[8];  };

    // We use 1 register for updating local totals
    __m256i   localSums = _mm256_setzero_si256();

#ifdef USE_AVX2_FOR_GRAND_TOTALS
    // Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums with AVX2
    __m256i   globalSumsReg[4];  for (size_t r=0; r<4; r++)  {   globalSumsReg[r] = _mm256_setzero_si256(); }
#endif

    uint64_t steps = nbRows / 255;
    uint64_t k=0;

    const int divisorOf255 = 5;

    // We iterate over all rows
    for (uint64_t i=0; i(localSums, bitmap, k);
        }

#ifdef USE_AVX2_FOR_GRAND_TOTALS
        // Dillon Davis proposal: use 4 registers holding uint32_t values and update them from local sums

        // We take the 128 high bits of the local sums
        __m256i   localSums2 = _mm256_broadcastsi128_si256(_mm256_extracti128_si256(localSums,1));

        globalSumsReg[0] = _mm256_add_epi32 (globalSumsReg[0],
            _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums, 0)))
        );
        globalSumsReg[1] = _mm256_add_epi32 (globalSumsReg[1],
            _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums, 8)))
        );
        globalSumsReg[2] = _mm256_add_epi32 (globalSumsReg[2],
            _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums2, 0)))
        );
        globalSumsReg[3] = _mm256_add_epi32 (globalSumsReg[3],
            _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (_mm256_srli_si256(localSums2, 8)))
        );
#else
        // we update the global totals
        U256i tmp = { localSums };
        for (size_t k=0; k<32; k++)  {  globalSums[k] += tmp.a[k];  }
#endif
        // we reset the local totals
        localSums = _mm256_setzero_si256();
    }

#ifdef USE_AVX2_FOR_GRAND_TOTALS
    // We update the global totals into the final uint32_t array
    for (size_t r=0; r<4; r++)
    {
        U256i tmp = { globalSumsReg[r] };
        for (size_t k=0; k<8; k++)  {  globalSums[r*8+k] += tmp.b[k];  }
    }
#endif

    // we update the remaining local totals
    for (uint64_t i=steps*255; i(localSums, bitmap, k);
    }

    // we update the global totals
    U256i tmp = { localSums };
    for (size_t k=0; k<32; k++)  {  globalSums[k] += tmp.a[k];  }
}

////////////////////////////////////////////////////////////////////////////////
void execute (
    const char* name,
    void (*fct)(uint64_t nbRows, const uint32_t* bitmap, uint64_t*  globalSums),
    size_t nbRuns,
    uint64_t nbRows,
    u_int32_t* bitmap
)
{
    uint64_t  sums[32];

    double timeTotal=0;
    double cycleTotal=0;
    double timeTotal2=0;
    double cycleTotal2=0;
    uint64_t check=0;

    for (size_t n=0; n(system_clock::now().time_since_epoch());
        uint64_t c0 = ReadTSC();

        // We run the test
        (*fct) (nbRows, bitmap, sums);

        uint64_t c1 = ReadTSC();
        milliseconds t1 = duration_cast< milliseconds >(system_clock::now().time_since_epoch());

        timeTotal  += (t1-t0).count();
        cycleTotal += (double)(c1-c0) / nbRows;

        timeTotal2  += (t1-t0).count() * (t1-t0).count();
        cycleTotal2 += ((double)(c1-c0) / nbRows) * ((double)(c1-c0) / nbRows);

        // We compute some dummy checksum
        for (size_t k=0; k<32; k++)  {  check += sums[k];  }
    }

    printf ("%-21s |  %5.0lf (%5.1lf)            |  %5.2lf (%4.2lf)          |  %.3lf           |  0x%lx\n",
        name,
        timeTotal / nbRuns,
        deviation (nbRuns, timeTotal2, timeTotal),
        cycleTotal/nbRuns,
        deviation (nbRuns, cycleTotal2, cycleTotal),
        check,
        nbRows * cycleTotal / timeTotal / 1000000.0
    );
}

////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // We set rows number as 2^n where n is the provided argument
    // For simplification, we assume that the rows number is a multiple of 32
    uint64_t nbRows = 1ULL << (argc>1 ? atoi(argv[1]) : 28);
    size_t   nbRuns = argc>2 ? atoi(argv[2]) : 10;

    // We build an bitmap of size nbRows*32
    uint32_t* bitmap = new uint32_t[nbRows];
    if (bitmap==nullptr)
    {
        fprintf(stderr, "unable to allocate the bitmap\n");
        exit(1);
    }

    // We fill the bitmap with random values
    srand(time(nullptr));
    for (uint64_t i=0; i 8 bytes, pattern repeats.
    __m256i isolated_inverted = _mm256_and_si256(shuf, andmask);

    // Avoid an _mm256_add_epi8 thanks to Peter Cordes's comment
    return _mm256_cmpeq_epi8(isolated_inverted, andmask);
}

////////////////////////////////////////////////////////////////////////////////
void sse_trans(char const *inp, char *out)
{
#define INP(x,y) inp[(x)*4 + (y)/8]
#define OUT(x,y) out[(y)*4 + (x)/8]

    int rr, cc, i, h;
    union { __m256i x; uint8_t b[32]; } tmp;

    for (cc = 0; cc < 32; cc += 8)
    {
        for (i = 0; i < 32; ++i)
            tmp.b[i] = INP(i, cc);

        for (i = 8; i--; tmp.x = _mm256_slli_epi64(tmp.x, 1))
            *(uint32_t*)&OUT(0, cc + i) = _mm256_movemask_epi8(tmp.x);
    }
}

////////////////////////////////////////////////////////////////////////////////
double deviation (double n, double sum2, double sum)  {  return sqrt (sum2/n - (sum/n)*(sum/n)); }


Some remarks:


I used the Agner Fog's asmlib to have a function that returns CPU cycles
The compilation command is g++ -O3 -march=native ../Test.cpp -o ./Test -laelf64
The gcc version is 7.3.1
The CPU is Intel(R) Core(TM) i7-6700HQ CPU @ 2.60GHz
I compute some dummy checksum to compare the results of the different tests


Now the results:

------------------------------------------------------------------------------------------------------------
name                  | time in msec : mean (sd)  | cycles/row : mean (sd) | frequency in GHz | checksum
------------------------------------------------------------------------------------------------------------
naive (transpo)       |   4548 ( 36.5)            |  43.91 (0.35)          |  2.592           |  0x9affeb5a6
naive (row by row)    |   3033 ( 11.0)            |  29.29 (0.11)          |  2.592           |  0x9affeb5a6
AVX2  (transpo)       |    767 ( 12.8)            |   7.40 (0.12)          |  2.592           |  0x9affeb5a6
AVX2  (row by row)    |    130 (  4.0)            |   1.25 (0.04)          |  2.591           |  0x9affeb5a6


So it seems that the "row by row" in AVX2 is the best so far. 

Note that when I saw this result (less than 2 cycles per row), I made no more effort to optimize the AVX2 "transpose+popcount" method, which should be feasable by computing several popcounts in parallel (I may test it later).
    
             
                                                        
            
            
              
                
                0
              
                   
                
               讨论(0)
              
                                                  
              
              
                          
             
       
          
              
                                       
     查看其它3个回答


            
                         
                    


               
            
    发布评论:
    
         
                        
    
    提交评论 
  
  

                    
                    
                    
                        
                        
                         加载中...
                        
                    
                
          
                              			
        
        
        
          
            
            
              
              
            
    


                                 
              
            
                          
    

        
         
                验证码
                
                  
                
                
                   看不清?
                
              
                                  
                    
   
                 
             
              提交回复