How to add an AVX2 vector horizontally 3 by 3?

问题

I have a __m256i vector containing 16 16-bit elements. I want to horizontally add each group of three adjacent elements. In scalar mode I use the following code:

unsigned short int temp[16];
__m256i sum_v; // Holds 16 x 16-bit elements:   | 0 | x15 | x14 | x13 | ... | x3 | x2 | x1 |
// temp is a plain array with no 32-byte alignment guarantee, so use the
// unaligned store; the aligned _mm256_store_si256 may fault on such an address.
_mm256_storeu_si256((__m256i *)&temp[0], sum_v);
// Each output is the sum of three adjacent elements.
output1 = (temp[0] + temp[1] + temp[2]);
output2 = (temp[3] + temp[4] + temp[5]);
output3 = (temp[6] + temp[7] + temp[8]);
output4 = (temp[9] + temp[10] + temp[11]);
output5 = (temp[12] + temp[13] + temp[14]);
// temp[15] is intentionally not used.

Because this part is in the bottleneck section of my program, I decided to vectorize it using AVX2. Ideally, I would like to add the elements as in the following pseudocode:

sum_v                                     //|  0  | x15 | x14 | x13 |...| x10 |...| x7 |...| x4 |...| x1 | 
sum_v1 = sum_v >> 1*16                    //|  0  |  0  | x15 | x14 |...| x11 |...| x8 |...| x5 |...| x2 |  
sum_v2 = sum_v >> 2*16                    //|  0  |  0  |  0  | x15 |...| x12 |...| x9 |...| x6 |...| x3 |
result_vec = add_epi16 (sum_v,sum_v1,sum_v2)

//then I should extract the result_vec to outputs

Adding these three vectors vertically gives the answer. Unfortunately, AVX2 has no shift operation across the full 256 bits, because the 256-bit register is treated as two 128-bit lanes, so I need a permutation for this case. However, I could not find an appropriate permute, shuffle, etc. to do this. Is there any suggestion for an implementation that is as fast as possible?

Using gcc, linux mint, intrinsics, skylake.

回答1:

You can do this with two adds and only 2 "shuffles": _mm256_bsrli_epi128 shifts in zeros at positions that are not of interest to the answer. For _mm256_permutevar8x32_epi32 we choose a permutation that duplicates the upper 32 bits, but these bits are also not relevant for the answer.

#include <stdio.h>
#include <x86intrin.h>
/*  gcc -O3 -Wall -m64 -march=haswell hor_sum3x3.c   */
int print_vec_short(__m256i x);
int print_12_9_6_3_0_short(__m256i x);

int main() {
   short x[16];

   /* Test input: x[0..14] = 1..15. x[15] is not part of any 3-element group and is set to 0. */
   for(int i=0; i<15; i++) {
      x[i] = i+1;
   }
   x[15] = 0;

   const __m256i t0 = _mm256_loadu_si256((__m256i*)x);

   /* t1: each 128-bit lane of t0 shifted right by 2 bytes (one element), shifting in zeros.
      The zeros land in positions that are not needed for the result. */
   const __m256i t1 = _mm256_bsrli_epi128(t0,2);

   /* t2: t0 moved right by 4 bytes (two elements) across the whole register.
      The top 32 bits are duplicated, which only affects positions that are not needed. */
   const __m256i shift_4_bytes_idx = _mm256_setr_epi32(1,2,3,4,5,6,7,7);
   const __m256i t2 = _mm256_permutevar8x32_epi32(t0,shift_4_bytes_idx);

   /* Element i of sum is x[i] + x[i+1] + x[i+2] for the positions of interest. */
   const __m256i sum = _mm256_add_epi16(t2,_mm256_add_epi16(t1,t0));

   /* Full vectors, highest element first. */
   printf("t0  = ");
   print_vec_short(t0);
   printf("t1  = ");
   print_vec_short(t1);
   printf("t2  = ");
   print_vec_short(t2);
   printf("sum = ");
   print_vec_short(sum);

   /* Only the elements that carry the five 3-element sums. */
   printf("\nvector elements of interest: columns 12, 9, 6, 3, 0:\n");
   printf("t0[12, 9, 6, 3, 0]  = ");
   print_12_9_6_3_0_short(t0);
   printf("t1[12, 9, 6, 3, 0]  = ");
   print_12_9_6_3_0_short(t1);
   printf("t2[12, 9, 6, 3, 0]  = ");
   print_12_9_6_3_0_short(t2);
   printf("sum[12, 9, 6, 3, 0] = ");
   print_12_9_6_3_0_short(sum);

   return 0;
}


/* Prints the 16 signed 16-bit elements of x on one line, highest element
   (index 15) first, in groups of four separated by '|'. Always returns 0. */
int print_vec_short(__m256i x){
   short int elem[16];

   /* Unaligned store: elem has no particular alignment. */
   _mm256_storeu_si256((__m256i *)elem, x);

   printf("%4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi | %4hi %4hi %4hi %4hi  | %4hi %4hi %4hi %4hi \n",
          elem[15], elem[14], elem[13], elem[12],
          elem[11], elem[10], elem[9],  elem[8],
          elem[7],  elem[6],  elem[5],  elem[4],
          elem[3],  elem[2],  elem[1],  elem[0]);

   return 0;
}

/* Prints only the signed 16-bit elements 12, 9, 6, 3 and 0 of x (in that
   order) on one line. Always returns 0. */
int print_12_9_6_3_0_short(__m256i x){
   short int elem[16];

   /* Unaligned store: elem has no particular alignment. */
   _mm256_storeu_si256((__m256i *)elem, x);

   const short int e12 = elem[12];
   const short int e9  = elem[9];
   const short int e6  = elem[6];
   const short int e3  = elem[3];
   const short int e0  = elem[0];

   printf("%4hi %4hi %4hi %4hi %4hi  \n", e12, e9, e6, e3, e0);

   return 0;
}

The output is:

$ ./a.out
t0  =    0   15   14   13 |   12   11   10    9 |    8    7    6    5  |    4    3    2    1 
t1  =    0    0   15   14 |   13   12   11   10 |    0    8    7    6  |    5    4    3    2 
t2  =    0   15    0   15 |   14   13   12   11 |   10    9    8    7  |    6    5    4    3 
sum =    0   30   29   42 |   39   36   33   30 |   18   24   21   18  |   15   12    9    6 

vector elements of interest: columns 12, 9, 6, 3, 0:
t0[12, 9, 6, 3, 0]  =   13   10    7    4    1  
t1[12, 9, 6, 3, 0]  =   14   11    8    5    2  
t2[12, 9, 6, 3, 0]  =   15   12    9    6    3  
sum[12, 9, 6, 3, 0] =   42   33   24   15    6

回答2:

You can try to use something like this:

#include <immintrin.h>
#include <iostream>

// Prints the contents of a 256-bit vector, interpreted as an array of T,
// lowest element first, each element followed by a space, then a newline.
//
// Elements are printed after integral promotion (unary +) rather than a cast
// to int: char and short element types still print as numbers, while 64-bit
// and large unsigned elements are no longer truncated or sign-mangled.
template<class T> inline void Print(const __m256i & v)
{
    T b[sizeof(v) / sizeof(T)];
    // Unaligned store: b is only aligned for T, not for a 32-byte vector.
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(b), v);
    for (const T & e : b)
        std::cout << +e << " ";
    std::cout << std::endl;
}

// Shifts the whole 256-bit vector towards element 0 by `shift` 16-bit
// elements, filling the vacated top elements with zeros.
template<int shift> inline __m256i Shift(const __m256i & a)
{
    // Lower lane = upper 128-bit lane of a; upper lane = upper lane of the zero vector.
    const __m256i carry = _mm256_permute2x128_si256(a, _mm256_setzero_si256(), 0x31);

    // Per 128-bit lane: concatenate (carry : a) and shift right by shift*2 bytes,
    // so the lower lane pulls in elements from the upper lane and the upper lane pulls in zeros.
    const __m256i shifted = _mm256_alignr_epi8(carry, a, shift * 2);
    return shifted;
}

// Demo: builds the vector 1..15,0, forms its copies shifted by one and two
// elements, adds the three vectors and prints all four.
int main()
{
    const __m256i input = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8,
                                            9, 10, 11, 12, 13, 14, 15, 0);

    const __m256i shiftedBy1 = Shift<1>(input);
    const __m256i shiftedBy2 = Shift<2>(input);

    // Element i of the result is input[i] + input[i+1] + input[i+2].
    const __m256i partial = _mm256_add_epi16(shiftedBy1, shiftedBy2);
    const __m256i sums = _mm256_add_epi16(input, partial);

    Print<short>(input);
    Print<short>(shiftedBy1);
    Print<short>(shiftedBy2);
    Print<short>(sums);
}

Output:

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0
2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 0
3 4 5 6 7 8 9 10 11 12 13 14 15 0 0 0
6 9 12 15 18 21 24 27 30 33 36 39 42 29 15 0

回答3:

You could try something like this

// Byte-shuffle control: within each 128-bit lane, rotate the bytes down by 2 (one 16-bit element).
__m256i idx1 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1);
// 32-bit permutation control: rotate the eight 32-bit elements down by one (two 16-bit elements).
__m256i idx2 = _mm256_setr_epi32(1,2,3,4,5,6,7,0);

// t0 is the input vector of 16 x 16-bit elements (defined outside this snippet).
// t1: each 128-bit lane of t0 rotated by one 16-bit element; the wrapped element stays in its own lane.
__m256i t1 = _mm256_shuffle_epi8 (t0, idx1);
// t2: t1 with its two 128-bit lanes swapped.
__m256i t2 = _mm256_permute2x128_si256(t1, t1, 1);
// t3: 16-bit element 7 of each lane is taken from t2, so the wrapped elements cross lanes;
// the result is t0 rotated by one element over the full 256 bits.
__m256i t3 = _mm256_blend_epi16(t1,t2,0x80);
// t4: t0 rotated by two 16-bit elements over the full 256 bits.
__m256i t4 = _mm256_permutevar8x32_epi32(t0, idx2);
// s[i] = t0[i] + t0[i+1] + t0[i+2], with indices wrapping around modulo 16.
__m256i s = _mm256_add_epi16(t0, _mm256_add_epi16(t3,t4));

I based this example off this question.

Here is a working example

#include <stdio.h>
#include <x86intrin.h>

/* Prints the sixteen 16-bit elements of v in hex, lowest element first,
   each followed by a space, then ends the line. */
static void dump_row(__m256i v) {
  short y[16];
  _mm256_storeu_si256((__m256i*)y, v);
  for(int i=0; i<16; i++) {
    printf("%2x ", y[i]);
  }
  puts("");
}

/* Demo: fills a vector with 0..15, builds its rotations by one and two
   16-bit elements, adds the three vectors and prints each of them. */
int main(void) {
  short x[16];
  for(int i=0; i<16; i++) {
    x[i] = i;
  }

  /* Per 128-bit lane: rotate the bytes down by 2 (one 16-bit element). */
  const __m256i rot1_in_lane_idx = _mm256_setr_epi8(
      2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
      2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1);
  /* Rotate the eight 32-bit elements down by one (two 16-bit elements). */
  const __m256i rot2_idx = _mm256_setr_epi32(1,2,3,4,5,6,7,0);

  const __m256i t0 = _mm256_loadu_si256((__m256i*)x);

  /* Rotation by one element: rotate inside each lane, then fix up the top
     element of each lane with the one from the lane-swapped copy. */
  const __m256i t1 = _mm256_shuffle_epi8 (t0, rot1_in_lane_idx);
  const __m256i t2 = _mm256_permute2x128_si256(t1, t1, 1);
  const __m256i t3 = _mm256_blend_epi16(t1,t2,0x80);

  /* Rotation by two elements. */
  const __m256i t4 = _mm256_permutevar8x32_epi32(t0, rot2_idx);

  const __m256i s = _mm256_add_epi16(t0, _mm256_add_epi16(t3,t4));

  dump_row(t0);
  dump_row(t3);
  dump_row(t4);
  dump_row(s);
}

Output

0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f 
1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0 
2  3  4  5  6  7  8  9  a  b  c  d  e  f  0  1 
3  6  9  c  f 12 15 18 1b 1e 21 24 27 2a 1d 10

来源：https://stackoverflow.com/questions/42108358/how-to-add-an-avx2-vector-horizontally-3-by-3

标签

x86

simd

intrinsics

avx2