How are the gather instructions in AVX2 implemented?

前端 未结 2 1574
故里飘歌
故里飘歌 2020-12-08 10:36

Suppose I\'m using AVX2\'s VGATHERDPS - this should load 8 single-precision floats using 8 DWORD indices.

What happens when the data to be loaded exists in different

相关标签:
2条回答
  • 2020-12-08 11:00

    I did some benchmarking of the AVX gather instructions (on a Haswell CPU) and it seems to be a fairly simple brute force implementation - even when the elements to be loaded are contiguous it seems that there is still one read cycle per element, so performance is really no better than just doing scalar loads.

    NB: this answer is now obsolete as things have changed considerably since Haswell. See the accepted answer for full details (unless you happen to be targeting Haswell CPUs).

    0 讨论(0)
  • 2020-12-08 11:15

    Gather was first implemented with Haswell but was not optimized until Broadwell (the first generation after Haswell).

    I wrote my own code to test gather (see below). Here is a summary on Skylake, SkylakeX (with a dedicated AVX512 port), and KNL systems.

                     scalar    auto   AVX2   AVX512
    Skylake GCC        0.47    0.38   0.38       NA
    SkylakeX GCC       0.56    0.23   0.35     0.24
    KNL GCC            3.95    1.37   2.11     1.16
    KNL ICC            3.92    1.17   2.31     1.17
    

    From the table it's clear that in all cases gather loads are faster than scalar loads (for the benchmark I used).

    I'm not sure how Intel implements gather internally. The masks don't seem to have an effect on performance for gather. That's one thing Intel could optimize (if you only read one scalar value to due the mask it should be faster than gathering all values and then using the mask.

    The Intel manual shows some nice figures on gather

    https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
    DCU = L1 Data Cache Unit. MCU = mid-level = L2 cache. LLC = last-level = L3 cache. L3 is shared, L2 and L1d are per-core private.
    Intel is just benchmarking gathers, not using the result for anything.

    //gather.c
    #include <stdio.h>
    #include <omp.h>
    #include <stdlib.h>
    
    #define N 1024
    #define R 1000000
    
    void foo_auto(double * restrict a, double * restrict b, int *idx, int n);
    void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n);
    void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n);
    void foo1(double * restrict a, double * restrict b, int *idx, int n);
    void foo2(double * restrict a, double * restrict b, int *idx, int n);
    void foo3(double * restrict a, double * restrict b, int *idx, int n);
    
    
    double test(int *idx, void (*fp)(double * restrict a, double * restrict b, int *idx, int n)) {
      double a[N];
      double b[N];
      double dtime;
    
      for(int i=0; i<N; i++) a[i] = 1.0*N;
      for(int i=0; i<N; i++) b[i] = 1.0;
      fp(a, b, idx, N);
      dtime = -omp_get_wtime();
      for(int i=0; i<R; i++) fp(a, b, idx, N);
      dtime += omp_get_wtime();
      return dtime;
    }
    
    int main(void) {
    
      //for(int i=0; i<N; i++) idx[i] = N - i - 1;
      //for(int i=0; i<N; i++) idx[i] = i;
      //for(int i=0; i<N; i++) idx[i] = rand()%N;
    
      //for(int i=0; i<R; i++) foo2(a, b, idx, N);
      int idx[N];
      double dtime;
      int ntests=2;
      void (*fp[4])(double * restrict a, double * restrict b, int *idx, int n);
      fp[0] = foo_auto;
      fp[1] = foo_AVX2;
    #if defined ( __AVX512F__ ) || defined ( __AVX512__ )
      fp[2] = foo_AVX512;
      ntests=3;
    #endif     
    
      for(int i=0; i<ntests; i++) { 
        for(int i=0; i<N; i++) idx[i] = 0;
        test(idx, fp[i]);
        dtime = test(idx, fp[i]);
        printf("%.2f      ", dtime);
    
        for(int i=0; i<N; i++) idx[i] = i;
        test(idx, fp[i]);
        dtime = test(idx, fp[i]);
        printf("%.2f      ", dtime);
    
        for(int i=0; i<N; i++) idx[i] = N-i-1;
        test(idx, fp[i]);
        dtime = test(idx, fp[i]);
        printf("%.2f      ", dtime);
    
        for(int i=0; i<N; i++) idx[i] = rand()%N;
        test(idx, fp[i]);
        dtime = test(idx, fp[i]);
        printf("%.2f\n", dtime);
      }
    
      for(int i=0; i<N; i++) idx[i] = 0;
      test(idx, foo1);
      dtime = test(idx, foo1);
      printf("%.2f      ", dtime);
    
      for(int i=0; i<N; i++) idx[i] = i;
      test(idx, foo2);
      dtime = test(idx, foo2);
      printf("%.2f      ", dtime);
    
      for(int i=0; i<N; i++) idx[i] = N-i-1;
      test(idx, foo3);
      dtime = test(idx, foo3);
      printf("%.2f      ", dtime);
      printf("NA\n");
    }
    
    //foo2.c
    #include <x86intrin.h>
    void foo_auto(double * restrict a, double * restrict b, int *idx, int n) {
      for(int i=0; i<n; i++) b[i] = a[idx[i]];
    }
    
    void foo_AVX2(double * restrict a, double * restrict b, int *idx, int n) {
      for(int i=0; i<n; i+=4) {
        __m128i vidx = _mm_loadu_si128((__m128i*)&idx[i]);
        __m256d av = _mm256_i32gather_pd(&a[i], vidx, 8);
        _mm256_storeu_pd(&b[i],av);
      }
    }
    
    #if defined ( __AVX512F__ ) || defined ( __AVX512__ )
    void foo_AVX512(double * restrict a, double * restrict b, int *idx, int n) {
      for(int i=0; i<n; i+=8) {
        __m256i vidx = _mm256_loadu_si256((__m256i*)&idx[i]);
        __m512d av = _mm512_i32gather_pd(vidx, &a[i], 8);
        _mm512_storeu_pd(&b[i],av);
      }
    }
    #endif
    
    void foo1(double * restrict a, double * restrict b, int *idx, int n) {
      for(int i=0; i<n; i++) b[i] = a[0];
    }
    
    void foo2(double * restrict a, double * restrict b, int *idx, int n) {
      for(int i=0; i<n; i++) b[i] = a[i];
    }
    
    void foo3(double * restrict a, double * restrict b, int *idx, int n) {
      for(int i=0; i<n; i++) b[i] = a[n-i-1];
    }
    
    0 讨论(0)
提交回复
热议问题