Produce loops without cmp instruction in GCC

后端 未结 3 1214
佛祖请我去吃肉
佛祖请我去吃肉 2020-12-03 17:42

I have a number of tight loops I'm trying to optimize with GCC and intrinsics. Consider, for example, the following function.

void triad(float *x, float *y,          


        
相关标签:
3条回答
  • 2020-12-03 18:19

    Final code:

    #define SF (sizeof(float))   //bytes per float; parenthesised so it is safe inside any expression
    #ifndef NO                   //floats per vector, compile with -DNO=1,2,4,8,...
    #define NO 8                 //MUST be a power of two
    #endif

    //Reject a bad -DNO at compile time: the mask in the loop below only rounds
    //the byte count down to whole groups when NO is a positive power of two.
    #if (NO) <= 0 || ((NO) & ((NO) - 1)) != 0
    #error "NO must be a positive power of two"
    #endif
    
    //Computes z[i] = 3.14159f * y[i] + x[i] in groups of NO floats, starting with
    //the highest complete group and walking down to index 0.
    //
    //  x, y  input arrays (read only)
    //  z     output array
    //  n     element count; only the first n - (n % NO) elements are written,
    //        the trailing n % NO elements of z are left untouched
    //
    //restrict states that the three arrays do not overlap, and the
    //__builtin_assume_aligned calls additionally tell gcc that every array is
    //aligned to NO*SF bytes, so callers have to pass arrays that satisfy both.
    void triadfinaler(float const *restrict x, float const *restrict y,
                      float *restrict z, size_t n)
    {
      float *restrict d = __builtin_assume_aligned(z, NO*SF);       //gcc builtin,
      float const *restrict m = __builtin_assume_aligned(y, NO*SF); //optional but produces
      float const *restrict a = __builtin_assume_aligned(x, NO*SF); //better code
      float const k = 3.14159f;
      n*=SF;                               //from here on n is a byte count, not an element count
      while (n &= ~((size_t)(NO*SF)-1))    //round down to whole groups; zero ends the loop
        {
          size_t nl = n/SF;                //one past the last element of the current group
          for (size_t i = 0; i<NO; i++)    //constant trip count, left to the compiler to vectorise
            {
              d[nl-NO+i] = k * m[nl-NO+i] + a[nl-NO+i];
            }
          n -= (NO*SF);                    //step down by one group
        }
    }
    

    I prefer to let the compiler choose the instructions, rather than using intrinsics (not least because you used intel-intrinsics, which gcc doesn't really like). Anyway, the following code produces nice assembly for me on gcc 4.8:

    void triad(float *restrict x, float *restrict y, float *restrict z, size_t n)
    //Computes z[i] = 3.14159f * y[i] + x[i] over whole groups of 8 floats, highest
    //group first. The restrict qualifiers say the three arrays do not overlap.
    {
      float const k = 3.14159f;
      //Alignment promises to gcc: each array is taken to start on a 32-byte boundary.
      float const *restrict addend = __builtin_assume_aligned(x, 32);
      float const *restrict factor = __builtin_assume_aligned(y, 32);
      float *restrict out = __builtin_assume_aligned(z, 32);
      //Clearing the low three bits rounds n down to a multiple of 8, so a trailing
      //partial group is never touched; the loop ends when nothing is left.
      while (n &= ~(size_t)7)
        {
          n -= 8;                                    //n is now the first index of the group
          out[n+0] = k * factor[n+0] + addend[n+0];  //eight statements, unrolled by hand
          out[n+1] = k * factor[n+1] + addend[n+1];
          out[n+2] = k * factor[n+2] + addend[n+2];
          out[n+3] = k * factor[n+3] + addend[n+3];
          out[n+4] = k * factor[n+4] + addend[n+4];
          out[n+5] = k * factor[n+5] + addend[n+5];
          out[n+6] = k * factor[n+6] + addend[n+6];
          out[n+7] = k * factor[n+7] + addend[n+7];
        }
    }
    

    This produces nice code for my corei7avx2, with -O3:

    triad:
        andq    $-8, %rcx
        je  .L8
        vmovaps .LC0(%rip), %ymm1
    
    .L4:
        subq    $8, %rcx
        vmovaps (%rsi,%rcx,4), %ymm0
        vfmadd213ps (%rdi,%rcx,4), %ymm1, %ymm0
        vmovaps %ymm0, (%rdx,%rcx,4)
        andq    $-8, %rcx
        jne .L4
        vzeroupper
    .L8:
        rep ret
        .cfi_endproc
    
    .LC0:
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
    

    Edit: I was a bit disappointed with the compiler not optimizing this code down to the last instruction, so I messed around with it a bit more. Just changing the order of things in the loop got rid of the AND emitted by the compiler, which got me on the right track. I then only had to get it to stop doing unnecessary address calculation inside the loop. Sigh.

    //Computes z[i] = 3.14159f * y[i] + x[i] over whole groups of 8 floats, from the
    //highest group down to index 0. The counter is kept as a byte offset
    //(4 bytes per float, 32 bytes per group) rather than an element index.
    void triadtwo(float *restrict x, float *restrict y, float *restrict z, size_t n)
    {
      float const k = 3.14159f;
      //Alignment promises to gcc: each array is taken to start on a 32-byte boundary.
      float const *restrict addend = __builtin_assume_aligned(x, 32);
      float const *restrict factor = __builtin_assume_aligned(y, 32);
      float *restrict out = __builtin_assume_aligned(z, 32);
      n <<= 2;                        //element count -> byte count
      while (n &= ~(size_t)31)        //round down to whole 32-byte groups; zero ends the loop
        {
          //(n>>2) is one past the last element of the current group
          out[(n>>2)-8] = k * factor[(n>>2)-8] + addend[(n>>2)-8];
          out[(n>>2)-7] = k * factor[(n>>2)-7] + addend[(n>>2)-7];
          out[(n>>2)-6] = k * factor[(n>>2)-6] + addend[(n>>2)-6];
          out[(n>>2)-5] = k * factor[(n>>2)-5] + addend[(n>>2)-5];
          out[(n>>2)-4] = k * factor[(n>>2)-4] + addend[(n>>2)-4];
          out[(n>>2)-3] = k * factor[(n>>2)-3] + addend[(n>>2)-3];
          out[(n>>2)-2] = k * factor[(n>>2)-2] + addend[(n>>2)-2];
          out[(n>>2)-1] = k * factor[(n>>2)-1] + addend[(n>>2)-1];
          n -= 32;                    //step down by one group, done last in the body
        }
    }
    

    Ugly code? Yes. But the assembly:

    triadtwo:
        salq    $2, %rcx
        andq    $-32, %rcx
        je  .L54
        vmovaps .LC0(%rip), %ymm1
    
    .L50:
        vmovaps -32(%rsi,%rcx), %ymm0
        vfmadd213ps -32(%rdi,%rcx), %ymm1, %ymm0
        vmovaps %ymm0, -32(%rdx,%rcx)
        subq    $32, %rcx
        jne .L50
        vzeroupper
    .L54:
        rep ret
        .cfi_endproc
    .LC0:
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
    

    Mmmmhhh, glorious five instructions in the loop, macro-op fusable subtract-and-branch...

    0 讨论(0)
  • 2020-12-03 18:28

    The instruction decoder on Intel Ivy Bridge or later can fuse the cmp and jne into a single operation in the pipeline (called macro-op fusion), so on these recent processors the cmp should disappear anyway.

    0 讨论(0)
  • 2020-12-03 18:30

    How about this. Compiler is gcc 4.9.0 mingw x64:

    // Computes z[j] = x[j] + 3.14159f * y[j] with AVX intrinsics, eight floats
    // (one 256-bit vector) per loop iteration.
    //
    //   x, y  input arrays
    //   z     output array
    //   n     number of elements
    //
    // NOTE(review): the loop only stops once i >= 0, so when n is not a multiple
    // of 8 the final iteration reads and writes past element n-1. Callers
    // presumably always pass a multiple of 8 - confirm.
    // NOTE(review): _mm256_load_ps / _mm256_store_ps look like the aligned
    // variants, so x, y and z presumably must be 32-byte aligned - confirm.
    void triad(float *x, float *y, float *z, const int n) {
        float k = 3.14159f;
        // Pointer-sized signed index; -n is computed as an int and then widened.
        intptr_t i;
        // The scale factor copied into all eight lanes of a vector.
        __m256 k4 = _mm256_set1_ps(k);
    
        // i counts from -n up towards 0 while the arrays are indexed with i+n
        // (0, 8, 16, ...). Ending on i < 0 means the exit test only needs the sign
        // of i after the increment, not a separate comparison against n.
        for(i = -n; i < 0; i += 8) {
            _mm256_store_ps(&z[i+n], _mm256_add_ps(_mm256_load_ps(&x[i+n]), _mm256_mul_ps(k4, _mm256_load_ps(&y[i+n]))));
        }
    }
    

    gcc -c -O3 -march=corei7 -mavx2 triad.c

    0000000000000000 <triad>:
       0:   44 89 c8                mov    eax,r9d
       3:   f7 d8                   neg    eax
       5:   48 98                   cdqe
       7:   48 85 c0                test   rax,rax
       a:   79 31                   jns    3d <triad+0x3d>
       c:   c5 fc 28 0d 00 00 00 00 vmovaps ymm1,YMMWORD PTR [rip+0x0]
      14:   4d 63 c9                movsxd r9,r9d
      17:   49 c1 e1 02             shl    r9,0x2
      1b:   4c 01 ca                add    rdx,r9
      1e:   4c 01 c9                add    rcx,r9
      21:   4d 01 c8                add    r8,r9
    
      24:   c5 f4 59 04 82          vmulps ymm0,ymm1,YMMWORD PTR [rdx+rax*4]
      29:   c5 fc 58 04 81          vaddps ymm0,ymm0,YMMWORD PTR [rcx+rax*4]
      2e:   c4 c1 7c 29 04 80       vmovaps YMMWORD PTR [r8+rax*4],ymm0
      34:   48 83 c0 08             add    rax,0x8
      38:   78 ea                   js     24 <triad+0x24>
    
      3a:   c5 f8 77                vzeroupper
      3d:   c3                      ret
    

    Like your hand-written code, gcc uses 5 instructions for the loop. The gcc code uses scale=4 where yours uses scale=1. I was able to get gcc to use scale=1 with a 5-instruction loop, but the C code is awkward and 2 of the AVX instructions in the loop grow from 5 bytes to 6 bytes.

    0 讨论(0)
提交回复
热议问题