Unroll loop and do independent sum with vectorization

守給你的承諾、 提交于 2019-11-28 01:08:22

Using some gcc intrinsics and __builtin_ functions produces this:

/* GCC vector-extension type: a 32-byte vector of float (8 lanes, assuming 4-byte float). */
typedef float v8sf __attribute__((vector_size(32)));
/* 32-byte vector of uint32_t (8 lanes); used below as the index mask for __builtin_shuffle. */
typedef uint32_t v8u32 __attribute__((vector_size(32)));

/*
 * Add the four vectors in arr together, lane by lane, and return the
 * resulting vector. Accumulation starts from an all-zero vector and
 * proceeds in index order (arr[0] first, arr[3] last).
 */
static v8sf sumfvhelper1(v8sf arr[4])
{
  v8sf total = {0};
  const v8sf *const end = arr + 4;

  for (const v8sf *p = arr; p != end; ++p) {
    total = total + *p;
  }
  return total;
}

/*
 * Horizontal sum: collapse the eight lanes of x into a single float.
 *
 * Each step adds to x a permuted copy of itself. The three masks pair lane i
 * with lane i^4, then i^2, then i^1, so after the third add every lane holds
 * the sum of all eight original lanes; lane 0 is returned.
 */
static float sumfvhelper2(v8sf x)
{
  const v8u32 swap_halves     = {4,5,6,7,0,1,2,3};
  const v8u32 swap_pairs      = {2,3,0,1,6,7,4,5};
  const v8u32 swap_neighbours = {1,0,3,2,5,4,7,6};

  x = x + __builtin_shuffle(x, swap_halves);
  x = x + __builtin_shuffle(x, swap_pairs);
  x = x + __builtin_shuffle(x, swap_neighbours);

  return x[0];
}

/*
 * Sum a fixed-size block of floats starting at x.
 *
 * The input is read as 2048/8 v8sf vectors, four per iteration. Each of the
 * four vectors feeds its own accumulator, so the running sums are independent
 * of one another. The accumulators are then merged by sumfvhelper1 and
 * collapsed to a scalar by sumfvhelper2.
 *
 * NOTE(review): x is read through a v8sf pointer, which presumably requires
 * x to be suitably aligned for that type -- confirm with callers. The
 * original carried a disabled hint:
 *   x = __builtin_assume_aligned(x, 64);
 */
float sumfv(float *x)
{
  const v8sf *cur = (const v8sf *)x;
  const v8sf *const stop = cur + 2048/8;
  v8sf partial[4] = {{0}};

  while (cur != stop) {
    partial[0] = partial[0] + cur[0];
    partial[1] = partial[1] + cur[1];
    partial[2] = partial[2] + cur[2];
    partial[3] = partial[3] + cur[3];
    cur += 4;
  }

  return sumfvhelper2(sumfvhelper1(partial));
}

Which gcc 4.8.4, invoked as gcc -Wall -Wextra -Wpedantic -std=gnu11 -march=native -O3 -fno-signed-zeros -fno-trapping-math -freciprocal-math -ffinite-math-only -fassociative-math -S, turns into:

# Annotated compiler output for sumfv; %ymm0-%ymm3 are the four vector accumulators.
sumfv:
    # Zero one register and copy it into the other three accumulators; %rax is the loop counter.
    vxorps  %xmm2, %xmm2, %xmm2
    xorl    %eax, %eax
    vmovaps %ymm2, %ymm3
    vmovaps %ymm2, %ymm0
    vmovaps %ymm2, %ymm1
.L7:
    # Main loop: each pass adds four consecutive 32-byte vectors, one per accumulator.
    addq    $4, %rax
    vaddps  (%rdi), %ymm1, %ymm1
    # Advance the input pointer by 128 bytes; the remaining loads use negative offsets from it.
    subq    $-128, %rdi
    vaddps  -96(%rdi), %ymm0, %ymm0
    vaddps  -64(%rdi), %ymm3, %ymm3
    vaddps  -32(%rdi), %ymm2, %ymm2
    # Repeat until the counter reaches 256.
    cmpq    $256, %rax
    jne .L7
    # Merge the four accumulators into %ymm0.
    vaddps  %ymm2, %ymm3, %ymm2
    vaddps  %ymm0, %ymm1, %ymm0
    vaddps  %ymm0, %ymm2, %ymm0
    # Three permute-and-add steps, matching the three shuffles in sumfvhelper2.
    vperm2f128  $1, %ymm0, %ymm0, %ymm1
    vaddps  %ymm0, %ymm1, %ymm0
    vpermilps   $78, %ymm0, %ymm1
    vaddps  %ymm0, %ymm1, %ymm0
    vpermilps   $177, %ymm0, %ymm1
    vaddps  %ymm0, %ymm1, %ymm0
    # Clear the upper halves of the ymm registers before returning.
    vzeroupper
    ret

The second helper function isn't strictly necessary, but summing over the elements of a vector tends to produce terrible code in gcc. If you're willing to use platform-dependent intrinsics, you can probably replace most of it with __builtin_ia32_haddps256().

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!