Efficient sse shuffle mask generation for left-packing byte elements

问题

What would be an efficient way to optimize the following code with SSE?

// Scalar reference for the left-packing operation: for every set bit i of
// change1, pSrc[i] is appended to the output and pDest advances by one byte.
// Bytes whose bit is clear are skipped, so the kept bytes end up contiguous.
uint16_t change1= ... ;   // 16-bit selection mask, one bit per source byte (initializer elided)
uint8_t* pSrc   = ... ;   // source bytes; indices 0..15 are read below (initializer elided)
uint8_t* pDest  = ... ;   // output cursor (initializer elided)

// Source bytes 0..3 (mask bits 0..3)
if(change1 & 0x0001) *pDest++ = pSrc[0];
if(change1 & 0x0002) *pDest++ = pSrc[1];
if(change1 & 0x0004) *pDest++ = pSrc[2];
if(change1 & 0x0008) *pDest++ = pSrc[3];

// Source bytes 4..7 (mask bits 4..7)
if(change1 & 0x0010) *pDest++ = pSrc[4];
if(change1 & 0x0020) *pDest++ = pSrc[5];
if(change1 & 0x0040) *pDest++ = pSrc[6];
if(change1 & 0x0080) *pDest++ = pSrc[7];

// Source bytes 8..11 (mask bits 8..11)
if(change1 & 0x0100) *pDest++ = pSrc[8];
if(change1 & 0x0200) *pDest++ = pSrc[9];
if(change1 & 0x0400) *pDest++ = pSrc[10];
if(change1 & 0x0800) *pDest++ = pSrc[11];

// Source bytes 12..15 (mask bits 12..15)
if(change1 & 0x1000) *pDest++ = pSrc[12];
if(change1 & 0x2000) *pDest++ = pSrc[13];
if(change1 & 0x4000) *pDest++ = pSrc[14];
if(change1 & 0x8000) *pDest++ = pSrc[15];

So far I am using a quite big lookup table for it, but I really want to get rid of it:

// Table-driven version: change1 indexes directly into m_Entries, and the entry
// supplies both the shuffle mask (e0.mask) and the output advance (e0.offset).
SSE3Shuffle::Entry& e0 = SSE3Shuffle::g_Shuffle.m_Entries[change1];
// Shuffle the 16 source bytes with the entry's mask and store all 16 result bytes
// unaligned at pDest.
// NOTE(review): *(__m128i*)pSrc is a plain dereference, which presumably requires
// pSrc to be 16-byte aligned -- confirm, or load with _mm_loadu_si128 instead.
_mm_storeu_si128((__m128i*)pDest, _mm_shuffle_epi8(*(__m128i*)pSrc, e0.mask));
// Advance the output cursor by the entry's offset.
pDest += e0.offset;

回答1:

Assuming:

// change1 collects the most significant bit of each of the 16 bytes of bytemask.
change1 = _mm_movemask_epi8(bytemask);
// popcnt is presumably a population count, making offset the number of set bits in change1.
offset = popcnt(change1);

It seems that a 1 KiB table with two shuffles is only ~10% slower than a 1 MiB table with one shuffle.

*edited

// 128 eight-byte shuffle patterns, indexed by a 7-bit mask of bytes to omit
// (the middle entries are elided in this listing).
static const uint64_t table[128] __attribute__((aligned(64))) = {
    0x0706050403020100, 0x0007060504030201, ..., 0x0605040302010700, 0x0605040302010007 
};
const __m128i mask_01 = _mm_set1_epi8( 0x01 );

// Load 16 source bytes; vector1 receives the upper 8 bytes of vector0 in its low half.
__m128i vector0 = _mm_loadu_si128((__m128i*)src);
__m128i vector1 = _mm_shuffle_epi32( vector0, 0x0E );

// `???` is a placeholder for the value being compared against; matching bytes become 0xFF.
__m128i bytemask0 = _mm_cmpeq_epi8( ???, vector0); // detect bytes to omit

// Bits 7 and 15 are masked off, so each byte of bitmask0 is in 0..127 and is a
// valid index into the 128-entry table.
uint32_t bitmask0 = _mm_movemask_epi8(bytemask0) & 0x7F7F;
// bytemask0 + 1 is 0 for omitted bytes (0xFF wraps around) and 1 for kept bytes;
// the sum of absolute differences against zero adds these up per 8-byte half,
// giving the number of kept bytes in each half.
__m128i hsum = _mm_sad_epu8(_mm_add_epi8(bytemask0, mask_01), _mm_setzero_si128());

// Low half: shuffle with the pattern for the low mask byte, store 8 bytes,
// then advance dst by the low half's kept-byte count.
vector0 = _mm_shuffle_epi8(vector0, _mm_loadl_epi64((__m128i*) &table[(uint8_t)bitmask0]));
_mm_storel_epi64((__m128i*)dst, vector0);
dst += (uint32_t)_mm_cvtsi128_si32(hsum);

// High half: same procedure using the high mask byte and the upper 64 bits of hsum.
vector1 = _mm_shuffle_epi8(vector1, _mm_loadl_epi64((__m128i*) &table[bitmask0 >> 8]));
_mm_storel_epi64((__m128i*)dst, vector1);
dst += (uint32_t)_mm_cvtsi128_si32(_mm_unpackhi_epi64(hsum, hsum));

Generating the shuffle mask via prefix sums and bit twiddling seems to be about half the speed of the table-based methods.

Using AVX2, generating the shuffle mask is nearly the same speed as the LUT approach.

// AVX2 variant: the shuffle mask is computed from an in-register lookup (`lut`)
// rather than loaded from a table in memory. The 32 loaded bytes are handled as
// four 8-byte groups, each stored and advanced separately at the end.

// Constants replicated into every 64-bit lane.
const __m256i mask_index = _mm256_set1_epi64x( 0x80A0908884828180 );
const __m256i mask_shift = _mm256_set1_epi64x( 0x0000000008080808 );
const __m256i mask_invert = _mm256_set1_epi64x( 0x0020100800000000 );
// XORed into the looked-up pattern below.
// NOTE(review): presumably this converts the lut values into byte indices that are
// valid within each 128-bit half of the shuffle -- confirm against the linked repository.
const __m256i mask_fixup = _mm256_set_epi32(
    0x08080808, 0x0F0F0F0F, 0x00000000, 0x07070707,
    0x08080808, 0x0F0F0F0F, 0x00000000, 0x07070707
);
// Eight 32-bit patterns selected by _mm256_permutevar8x32_epi32 below.
const __m256i lut = _mm256_set_epi32(
    0x04050607, // 0x03020100', 0x000000'07
    0x04050704, // 0x030200'00, 0x0000'0704
    0x04060705, // 0x030100'00, 0x0000'0705
    0x04070504, // 0x0300'0000, 0x00'070504
    0x05060706, // 0x020100'00, 0x0000'0706
    0x05070604, // 0x0200'0000, 0x00'070604
    0x06070605, // 0x0100'0000, 0x00'070605
    0x07060504  // 0x00'000000, 0x'07060504
);

__m256i r0,r1,r2,r3,r4;

// Load 32 bytes and step src to the next vector (src is presumably a __m256i pointer).
r0 = _mm256_loadu_si256(src++);
// `???` is a placeholder for the value being compared against; matching bytes become 0xFF.
r1 = _mm256_cmpeq_epi8( ???, r0); // detect changes

// Non-matching bytes keep their mask_index byte; matching bytes become 0.
r2 = _mm256_andnot_si256(r1, mask_index);
// Matching bytes among the low four bytes of each 64-bit lane contribute 8 each.
r1 = _mm256_and_si256(r1, mask_shift);
// Per-lane horizontal sums via sum of absolute differences.
r2 = _mm256_sad_epu8(r2, mask_invert); // bitmap[0:5], popcnt[7:15]
r1 = _mm256_sad_epu8(r1, _mm256_setzero_si256()); // shift amount
r3 = _mm256_slli_epi64(r2, 29); // move hi index to 2nd dword
r4 = _mm256_srli_epi64(r2, 7); // popcnt
r2 = _mm256_or_si256(r2, r3);
// Each 32-bit element of r2 selects one 32-bit pattern from lut.
r2 = _mm256_permutevar8x32_epi32(lut, r2);
r2 = _mm256_xor_si256(r2, mask_fixup);
// Shift each 64-bit lane of the pattern right by that lane's shift amount in r1.
r2 = _mm256_srlv_epi64(r2, r1);
// Apply the generated mask to the data.
r0 = _mm256_shuffle_epi8(r0, r2);

// Store each 64-bit lane as 8 bytes and advance dst by that lane's count from r4.
// Every store writes a full 8 bytes regardless of the count.
*((uint64_t*)dst) = _mm256_extract_epi64(r0, 0);
dst += _mm256_extract_epi64(r4, 0);
*((uint64_t*)dst) = _mm256_extract_epi64(r0, 1);
dst += _mm256_extract_epi64(r4, 1);
*((uint64_t*)dst) = _mm256_extract_epi64(r0, 2);
dst += _mm256_extract_epi64(r4, 2);
*((uint64_t*)dst) = _mm256_extract_epi64(r0, 3);
dst += _mm256_extract_epi64(r4, 3);

benchmark & code: https://github.com/aqrit/despacer

回答2:

If one is willing to use BMI2, available on Haswell and later, one can use pext to first compress the unwanted nibbles out of a uint64_t, and then use pdep to scatter the result into a shuffle mask.

// Step 1 -- replicate mask to nibbles
uint64_t change4 = pdep(change1, 0x1111111111111111ULL) * 0x0F;
// Step 2 -- extract index from array of nibbles
uint64_t indices = pext(0xfedcba09876543210, change4);
// Step 3 -- interleave nibbles to octects
uint64_t high = pdep(indices >> 32ULL,0x0F0F0F0F0F0F0F0F);
uint64_t low = pdep(indices, 0x0F0F0F0F0F0F0F0FULL);
// Step 4 -- use these two masks to compress pSrc
__m128i compressed = _mm_shuffle_epi8(pSrc, _mm_set_epi64(high, low));
// Step 5 -- store 16 bytes unaligned
_mm_storeu_si128(pDst, compressed);
// Step 6 -- increment target pointer
pDst += __mm_popcnt(change1);

Other variants (based on a cumulative sum, or on sorting the 'X's (or zero bits) out of XX23456789abXXef) will also first require some technique to spread the bits from the uint16_t evenly into an __m128i (i.e. the reverse of movemask_epi8).

The 64K-entry LUT can, however, be split into top and bottom parts:

int c = change1 & 0xff;
int p = __popcount(c);
uint64_t a = LUT256[c];               // low part of index
uint64_t b = LUT256[change1 >> 8];    // top part of index
b += addlut9[p];                      // 0x0101010101010101 * p
// Then must concatenate b|a at pth position of 'a'
if (p < 8)
{
   a |= b << (8*(8-p));
   b >>= 8*p;
}
__m128i d = _mm_shuffle_epi8(_mm_loadu_si128(pSrc),_mm_set1_epi64(b,a));
// and continue with steps 5 and 6 as before

来源：https://stackoverflow.com/questions/45506309/efficient-sse-shuffle-mask-generation-for-left-packing-byte-elements

标签

performance

x86

sse

shuffle

simd