How are the gather instructions in AVX2 implemented?

﹥>﹥吖頭↗ 提交于 2019-11-28 06:23:09

I did some benchmarking of the AVX gather instructions and it seems to be a fairly simple brute force implementation - even when the elements to be loaded are contiguous it seems that there is still one read cycle per element, so performance is really no better than just doing scalar loads.

Z boson

Gather was first implemented with Haswell but was not optimized until Broadwell (the first generation after Haswell).

I wrote my own code to test gather (see below). Here is a summary on Skylake, SkylakeX (with a dedicated AVX512 port), and KNL systems.

                 scalar    auto   AVX2   AVX512
Skylake GCC        0.47    0.38   0.38       NA
SkylakeX GCC       0.56    0.23   0.35     0.24
KNL GCC            3.95    1.37   2.11     1.16
KNL ICC            3.92    1.17   2.31     1.17

From the table it's clear that in all cases gather loads are faster than scalar loads (for the benchmark I used).

I'm not sure how Intel implements gather internally. The masks don't seem to have an effect on performance for gather. That's one thing Intel could optimize (if you only read one scalar value to due the mask it should be faster than gathering all values and then using the mask.

The Intel manual shows some nice figures on gather

https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
DCU = L1 Data Cache Unit. MCU = mid-level = L2 cache. LLC = last-level = L3 cache. L3 is shared, L2 and L1d are per-core private.
Intel is just benchmarking gathers, not using the result for anything.

//gather.c
#include <stdio.h>
#include <omp.h>
#include <stdlib.h>

#define N 1024
#define R 1000000

void foo_auto(double * restrict a, double * restrict b, int *idx, int n);
void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n);
void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n);
void foo1(double * restrict a, double * restrict b, int *idx, int n);
void foo2(double * restrict a, double * restrict b, int *idx, int n);
void foo3(double * restrict a, double * restrict b, int *idx, int n);


double test(int *idx, void (*fp)(double * restrict a, double * restrict b, int *idx, int n)) {
  double a[N];
  double b[N];
  double dtime;

  for(int i=0; i<N; i++) a[i] = 1.0*N;
  for(int i=0; i<N; i++) b[i] = 1.0;
  fp(a, b, idx, N);
  dtime = -omp_get_wtime();
  for(int i=0; i<R; i++) fp(a, b, idx, N);
  dtime += omp_get_wtime();
  return dtime;
}

int main(void) {

  //for(int i=0; i<N; i++) idx[i] = N - i - 1;
  //for(int i=0; i<N; i++) idx[i] = i;
  //for(int i=0; i<N; i++) idx[i] = rand()%N;

  //for(int i=0; i<R; i++) foo2(a, b, idx, N);
  int idx[N];
  double dtime;
  int ntests=2;
  void (*fp[4])(double * restrict a, double * restrict b, int *idx, int n);
  fp[0] = foo_auto;
  fp[1] = foo_AVX2;
#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
  fp[2] = foo_AVX512;
  ntests=3;
#endif     

  for(int i=0; i<ntests; i++) { 
    for(int i=0; i<N; i++) idx[i] = 0;
    test(idx, fp[i]);
    dtime = test(idx, fp[i]);
    printf("%.2f      ", dtime);

    for(int i=0; i<N; i++) idx[i] = i;
    test(idx, fp[i]);
    dtime = test(idx, fp[i]);
    printf("%.2f      ", dtime);

    for(int i=0; i<N; i++) idx[i] = N-i-1;
    test(idx, fp[i]);
    dtime = test(idx, fp[i]);
    printf("%.2f      ", dtime);

    for(int i=0; i<N; i++) idx[i] = rand()%N;
    test(idx, fp[i]);
    dtime = test(idx, fp[i]);
    printf("%.2f\n", dtime);
  }

  for(int i=0; i<N; i++) idx[i] = 0;
  test(idx, foo1);
  dtime = test(idx, foo1);
  printf("%.2f      ", dtime);

  for(int i=0; i<N; i++) idx[i] = i;
  test(idx, foo2);
  dtime = test(idx, foo2);
  printf("%.2f      ", dtime);

  for(int i=0; i<N; i++) idx[i] = N-i-1;
  test(idx, foo3);
  dtime = test(idx, foo3);
  printf("%.2f      ", dtime);
  printf("NA\n");
}

//foo2.c
#include <x86intrin.h>
void foo_auto(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i++) b[i] = a[idx[i]];
}

void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i+=4) {
    __m128i vidx = _mm_loadu_si128((__m128i*)&idx[i]);
    __m256d av = _mm256_i32gather_pd(&a[i], vidx, 8);
    _mm256_storeu_pd(&b[i],av);
  }
}

#if defined ( __AVX512F__ ) || defined ( __AVX512__ )
void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i+=8) {
    __m256i vidx = _mm256_loadu_si256((__m256i*)&idx[i]);
    __m512d av = _mm512_i32gather_pd(vidx, &a[i], 8);
    _mm512_storeu_pd(&b[i],av);
  }
}
#endif

void foo1(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i++) b[i] = a[0];
}

void foo2(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i++) b[i] = a[i];
}

void foo3(double * restrict a, double * restrict b, int *idx, int n) {
  for(int i=0; i<n; i++) b[i] = a[n-i-1];
}
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!