Produce loops without cmp instruction in GCC

后端 未结 3 1220
佛祖请我去吃肉
佛祖请我去吃肉 2020-12-03 17:42

I have a number of tight loops I'm trying to optimize with GCC and intrinsics. Consider, for example, the following function.

void triad(float *x, float *y,          


        
3条回答
  •  北荒
    北荒 (楼主)
    2020-12-03 18:19

    Final code:

    #define SF sizeof(float)
    #ifndef NO                   //floats per vector, compile with -DNO = 1,2,4,8,...
    #define NO 8                 //MUST be power of two
    #endif
    
    void triadfinaler(float const *restrict x, float const *restrict y,   \
                      float *restrict z, size_t n)
    {
      float *restrict d = __builtin_assume_aligned(z, NO*SF);       //gcc builtin,
      float const *restrict m = __builtin_assume_aligned(y, NO*SF); //optional but produces
      float const *restrict a = __builtin_assume_aligned(x, NO*SF); //better code
      float const k = 3.14159f;
      n*=SF;
      while (n &= ~((size_t)(NO*SF)-1))    //this is why NO*SF must be power of two
        {
          size_t nl = n/SF;
          for (size_t i = 0; i

    I prefer to let the compiler choose the instructions, rather than using intrinsics (not least because you used intel-intrinsics, which gcc doesn't really like). Anyway, the following code produces nice assembly for me on gcc 4.8:

    /*
     * triad: stores 3.14159f * y[i] + x[i] into z[i], eight floats per
     * iteration, walking from the high indices down to index 0.
     *
     *   x, y  input arrays (only read by this function)
     *   z     output array
     *   n     element count; the loop condition rounds it down to a multiple
     *         of 8, so the last n % 8 elements of z are never written
     *
     * The pointers are restrict-qualified and passed through
     * __builtin_assume_aligned(..., 32), so the caller must supply
     * non-overlapping, 32-byte-aligned arrays.
     */
    void triad(float *restrict x, float *restrict y, float *restrict z, size_t n)
    // restrict: the arguments must not alias each other. Also, the return type is void, not float.
    {
      float *restrict d = __builtin_assume_aligned(z, 32);  // Promise GCC that each array is
      float *restrict m = __builtin_assume_aligned(y, 32);  // 32-byte aligned; the caller must
      float *restrict a = __builtin_assume_aligned(x, 32);  // actually guarantee it
      float const k = 3.14159f;
      while (n &= ~((size_t)0x7))       // round n down to a multiple of 8 and stop at 0; per the author this makes gcc omit code for non-multiples of 8 floats
        {
          n -= 8;                       // step down to the start of the next group of 8 floats
          d[n+0] = k * m[n+0] + a[n+0]; // manual unrolling: one statement per float in the group
          d[n+1] = k * m[n+1] + a[n+1];
          d[n+2] = k * m[n+2] + a[n+2];
          d[n+3] = k * m[n+3] + a[n+3];
          d[n+4] = k * m[n+4] + a[n+4];
          d[n+5] = k * m[n+5] + a[n+5];
          d[n+6] = k * m[n+6] + a[n+6];
          d[n+7] = k * m[n+7] + a[n+7];
        }
    }
    

    This produces nice code for my corei7avx2, with -O3:

    /* GCC -O3 output for triad(). Register use as seen below: %rcx holds n,
       %rsi indexes the multiplied array, %rdi the added array, %rdx the
       destination (presumably y, x, z under the x86-64 System V ABI). */
    triad:
        andq    $-8, %rcx                   /* n &= ~7 */
        je  .L8                             /* nothing to do when fewer than 8 elements */
        vmovaps .LC0(%rip), %ymm1           /* ymm1 = constant vector from .LC0 (the multiplier) */
    
    .L4:
        subq    $8, %rcx                    /* n -= 8 */
        vmovaps (%rsi,%rcx,4), %ymm0        /* load 8 floats at element index n */
        vfmadd213ps (%rdi,%rcx,4), %ymm1, %ymm0   /* ymm0 = ymm1 * ymm0 + 8 floats from memory */
        vmovaps %ymm0, (%rdx,%rcx,4)        /* store 8 results */
        andq    $-8, %rcx                   /* loop condition re-masks n; this sets the flags for jne */
        jne .L4
        vzeroupper
    .L8:
        rep ret
        .cfi_endproc
    
    /* Eight identical 32-bit words: the 256-bit constant loaded into ymm1
       above (presumably the bit pattern of 3.14159f, replicated per lane). */
    .LC0:
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
    

    Edit: I was a bit disappointed with the compiler not optimizing this code down to the last instruction, so I messed around with it a bit more. Just changing the order of things in the loop got rid of the AND emitted by the compiler, which got me on the right track. After that, I only had to get it to stop doing unnecessary address calculation inside the loop. Sigh.

    /*
     * triadtwo: same computation as triad -- z[i] = 3.14159f * y[i] + x[i],
     * eight floats per iteration, from the high indices down to 0 -- but the
     * loop counter is kept scaled by 4 (a byte offset, assuming 4-byte
     * floats) and is decremented at the end of the loop body.
     *
     *   x, y  input arrays (only read by this function)
     *   z     output array
     *   n     element count; only whole groups of 8 elements are processed,
     *         the last n % 8 elements of z are never written
     *
     * The caller must supply non-overlapping (restrict), 32-byte-aligned
     * (__builtin_assume_aligned) arrays.
     * NOTE(review): n << 2 wraps for n above SIZE_MAX / 4; presumably such
     * sizes never occur -- confirm against callers.
     */
    void triadtwo(float *restrict x, float *restrict y, float *restrict z, size_t n)
    {
      float *restrict d = __builtin_assume_aligned(z, 32);
      float *restrict m = __builtin_assume_aligned(y, 32);
      float *restrict a = __builtin_assume_aligned(x, 32);
      float const k = 3.14159f;
      n<<=2;                  // scale the element count by 4
      while (n &= -32)        // clear the low 5 bits: round down to a multiple of 32 (8 floats); stop at 0
        {
          // n>>2 converts back to an element index; the group handled is
          // the 8 elements just below it.
          d[(n>>2)-8] = k * m[(n>>2)-8] + a[(n>>2)-8];
          d[(n>>2)-7] = k * m[(n>>2)-7] + a[(n>>2)-7];
          d[(n>>2)-6] = k * m[(n>>2)-6] + a[(n>>2)-6];
          d[(n>>2)-5] = k * m[(n>>2)-5] + a[(n>>2)-5];
          d[(n>>2)-4] = k * m[(n>>2)-4] + a[(n>>2)-4];
          d[(n>>2)-3] = k * m[(n>>2)-3] + a[(n>>2)-3];
          d[(n>>2)-2] = k * m[(n>>2)-2] + a[(n>>2)-2];
          d[(n>>2)-1] = k * m[(n>>2)-1] + a[(n>>2)-1];
          n -= 32;            // decrement last, immediately before the loop test; do not reorder
        }
    }
    

    Ugly code? Yes. But the assembly:

    /* GCC -O3 output for triadtwo(). %rcx holds the scaled counter and is
       used directly as a byte offset; the loop ends on the flags set by
       subq, with no separate compare or AND inside the loop. */
    triadtwo:
        salq    $2, %rcx                    /* n <<= 2 */
        andq    $-32, %rcx                  /* n &= -32 */
        je  .L54                            /* nothing to do when fewer than 8 elements */
        vmovaps .LC0(%rip), %ymm1           /* ymm1 = constant vector from .LC0 (the multiplier) */
    
    .L50:
        vmovaps -32(%rsi,%rcx), %ymm0       /* load the 8 floats ending at byte offset n */
        vfmadd213ps -32(%rdi,%rcx), %ymm1, %ymm0   /* ymm0 = ymm1 * ymm0 + 8 floats from memory */
        vmovaps %ymm0, -32(%rdx,%rcx)       /* store 8 results */
        subq    $32, %rcx                   /* n -= 32; result sets the flags tested by jne */
        jne .L50
        vzeroupper
    .L54:
        rep ret
        .cfi_endproc
    /* Eight identical 32-bit words: the 256-bit constant loaded into ymm1
       above (presumably the bit pattern of 3.14159f, replicated per lane). */
    .LC0:
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
    

    Mmmmhhh, glorious five instructions in the loop, macro-op fusable subtract-and-branch...

提交回复
热议问题