I have a number of tight loops I'm trying to optimize with GCC and intrinsics. Consider for example the following function.

void triad(float *x, float *y, float *z, const int n)
{
    float k = 3.14159f;
    for (int i = 0; i < n; i++)
        z[i] = x[i] + k*y[i];
}

I prefer to let the compiler choose the instructions, rather than using intrinsics (not least because you used Intel intrinsics, which gcc doesn't really like). Anyway, the following code produces nice assembly for me on gcc 4.8:
void triad(float *restrict x, float *restrict y, float *restrict z, size_t n)
//I hope you weren't aliasing any function arguments... Oh, and it's void, not float
{
    float *restrict d = __builtin_assume_aligned(z, 32); // Uh, make sure your arrays
    float *restrict m = __builtin_assume_aligned(y, 32); // are aligned? Faster that way
    float *restrict a = __builtin_assume_aligned(x, 32); //
    float const k = 3.14159f;
    while (n &= ~((size_t)0x7)) //black magic: rounds n down to a multiple of 8 (and stops at 0), so gcc omits cleanup code for non-multiples of 8 floats
    {
        n -= 8; //You were always computing on 8 floats at a time, right?
        d[n+0] = k * m[n+0] + a[n+0]; //manual unrolling
        d[n+1] = k * m[n+1] + a[n+1];
        d[n+2] = k * m[n+2] + a[n+2];
        d[n+3] = k * m[n+3] + a[n+3];
        d[n+4] = k * m[n+4] + a[n+4];
        d[n+5] = k * m[n+5] + a[n+5];
        d[n+6] = k * m[n+6] + a[n+6];
        d[n+7] = k * m[n+7] + a[n+7];
    }
}
This produces nice code for my corei7avx2, with -O3:
triad:
        andq    $-8, %rcx
        je      .L8
        vmovaps .LC0(%rip), %ymm1
.L4:
        subq    $8, %rcx
        vmovaps (%rsi,%rcx,4), %ymm0
        vfmadd213ps (%rdi,%rcx,4), %ymm1, %ymm0
        vmovaps %ymm0, (%rdx,%rcx,4)
        andq    $-8, %rcx
        jne     .L4
        vzeroupper
.L8:
        rep ret
        .cfi_endproc
.LC0:
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
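As an aside (my addition, not part of the original answer): __builtin_assume_aligned is a promise to the compiler, not a request, so if the arrays aren't really 32-byte aligned, the vmovaps accesses above can fault. A minimal sketch of a conforming caller, assuming C11 aligned_alloc is available:

#include <stdio.h>
#include <stdlib.h>

void triad(float *restrict x, float *restrict y, float *restrict z, size_t n);

int main(void)
{
    size_t n = 1024; //multiple of 8, so the mask in triad() drops no floats
    //C11 aligned_alloc: the size must be a multiple of the alignment
    float *x = aligned_alloc(32, n * sizeof(float));
    float *y = aligned_alloc(32, n * sizeof(float));
    float *z = aligned_alloc(32, n * sizeof(float));
    if (!x || !y || !z)
        return EXIT_FAILURE;
    for (size_t i = 0; i < n; i++) { x[i] = 1.0f; y[i] = 2.0f; }
    triad(x, y, z, n);
    printf("%f\n", z[0]); //expect 1 + 3.14159*2 = 7.28318
    free(x); free(y); free(z);
    return 0;
}

On pre-C11 systems, posix_memalign gives the same guarantee.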
Edit:
I was a bit disappointed that the compiler didn't optimize this code down to the last instruction, so I messed around with it a bit more. Just changing the order of things in the loop got rid of the AND emitted by the compiler, which got me on the right track. Then I only had to stop it from doing unnecessary address calculation inside the loop. Sigh.
void triadtwo(float *restrict x, float *restrict y, float *restrict z, size_t n)
{
    float *restrict d = __builtin_assume_aligned(z, 32);
    float *restrict m = __builtin_assume_aligned(y, 32);
    float *restrict a = __builtin_assume_aligned(x, 32);
    float const k = 3.14159f;
    n <<= 2;         //count bytes instead of floats
    while (n &= -32) //-32 == ~31: round down to a multiple of 32 bytes (8 floats), stop at 0
    {
        d[(n>>2)-8] = k * m[(n>>2)-8] + a[(n>>2)-8];
        d[(n>>2)-7] = k * m[(n>>2)-7] + a[(n>>2)-7];
        d[(n>>2)-6] = k * m[(n>>2)-6] + a[(n>>2)-6];
        d[(n>>2)-5] = k * m[(n>>2)-5] + a[(n>>2)-5];
        d[(n>>2)-4] = k * m[(n>>2)-4] + a[(n>>2)-4];
        d[(n>>2)-3] = k * m[(n>>2)-3] + a[(n>>2)-3];
        d[(n>>2)-2] = k * m[(n>>2)-2] + a[(n>>2)-2];
        d[(n>>2)-1] = k * m[(n>>2)-1] + a[(n>>2)-1];
        n -= 32;
    }
}
Ugly code? Yes. But the assembly:
triadtwo:
        salq    $2, %rcx
        andq    $-32, %rcx
        je      .L54
        vmovaps .LC0(%rip), %ymm1
.L50:
        vmovaps -32(%rsi,%rcx), %ymm0
        vfmadd213ps -32(%rdi,%rcx), %ymm1, %ymm0
        vmovaps %ymm0, -32(%rdx,%rcx)
        subq    $32, %rcx
        jne     .L50
        vzeroupper
.L54:
        rep ret
        .cfi_endproc
.LC0:
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
Mmmmhhh, glorious five instructions in the loop, macro-op-fusible subtract-and-branch...
Second edit: the same idea generalizes to any power-of-two number of floats per vector, selectable at compile time. Final code:

#define SF sizeof(float)
#ifndef NO   //floats per vector, compile with -DNO=1,2,4,8,...
#define NO 8 //MUST be power of two
#endif

void triadfinaler(float const *restrict x, float const *restrict y,
                  float *restrict z, size_t n)
{
    float *restrict d = __builtin_assume_aligned(z, NO*SF);       //gcc builtin,
    float const *restrict m = __builtin_assume_aligned(y, NO*SF); //optional but produces
    float const *restrict a = __builtin_assume_aligned(x, NO*SF); //better code
    float const k = 3.14159f;
    n *= SF;                          //count bytes instead of floats
    while (n &= ~((size_t)(NO*SF)-1)) //this is why NO*SF must be power of two
    {
        size_t nl = n/SF;             //back to a float index
        for (size_t i = 0; i < NO; i++) //compiler unrolls and vectorizes this
            d[nl-NO+i] = k * m[nl-NO+i] + a[nl-NO+i];
        n -= NO*SF;
    }
}
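For completeness, an invocation along these lines should reproduce the assembly shown above; the -march value and file name are my guesses, since the answer only says gcc 4.8 with -O3 on a Haswell-class corei7avx2:

gcc -std=c99 -O3 -march=core-avx2 -DNO=8 -S triad.c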