I have a number of tight loops I'm trying to optimize with GCC and intrinsics. Consider for example the following function.

void triad(float *x, float *y, float *z, const int n)
{
    float k = 3.14159f;
    for (int i = 0; i < n; i++)
        z[i] = x[i] + k*y[i];
}

I prefer to let the compiler choose the instructions, rather than using intrinsics (not least because you used Intel intrinsics, which gcc doesn't really like). Anyway, the following code produces nice assembly for me on gcc 4.8:
void triad(float *restrict x, float *restrict y, float *restrict z, size_t n)
//I hope you weren't aliasing any function arguments... Oh, and it's void, not float
{
    float *restrict d = __builtin_assume_aligned(z, 32); // Uh, make sure your arrays
    float *restrict m = __builtin_assume_aligned(y, 32); // are aligned? Faster that way
    float *restrict a = __builtin_assume_aligned(x, 32); //
    float const k = 3.14159f;
    while (n &= ~((size_t)0x7)) //black magic: rounds n down to a multiple of 8 (and stops at 0), so gcc omits cleanup code for non-multiples of 8 floats
    {
        n -= 8; //You were always computing on 8 floats at a time, right?
        d[n+0] = k * m[n+0] + a[n+0]; //manual unrolling
        d[n+1] = k * m[n+1] + a[n+1];
        d[n+2] = k * m[n+2] + a[n+2];
        d[n+3] = k * m[n+3] + a[n+3];
        d[n+4] = k * m[n+4] + a[n+4];
        d[n+5] = k * m[n+5] + a[n+5];
        d[n+6] = k * m[n+6] + a[n+6];
        d[n+7] = k * m[n+7] + a[n+7];
    }
}
This produces nice code for my corei7avx2, with -O3:
triad:
        andq    $-8, %rcx
        je      .L8
        vmovaps .LC0(%rip), %ymm1
.L4:
        subq    $8, %rcx
        vmovaps (%rsi,%rcx,4), %ymm0
        vfmadd213ps (%rdi,%rcx,4), %ymm1, %ymm0
        vmovaps %ymm0, (%rdx,%rcx,4)
        andq    $-8, %rcx
        jne     .L4
        vzeroupper
.L8:
        rep ret
        .cfi_endproc
.LC0:
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
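As an aside (my addition, not part of the original answer): __builtin_assume_aligned is a promise to the compiler, not a request, so if the arrays aren't really 32-byte aligned, the vmovaps accesses above can fault. A minimal sketch of a conforming caller, assuming C11 aligned_alloc is available:

#include <stdio.h>
#include <stdlib.h>

void triad(float *restrict x, float *restrict y, float *restrict z, size_t n);

int main(void)
{
    size_t n = 1024; //multiple of 8, so the mask in triad() drops no floats
    //C11 aligned_alloc: the size must be a multiple of the alignment
    float *x = aligned_alloc(32, n * sizeof(float));
    float *y = aligned_alloc(32, n * sizeof(float));
    float *z = aligned_alloc(32, n * sizeof(float));
    if (!x || !y || !z)
        return EXIT_FAILURE;
    for (size_t i = 0; i < n; i++) { x[i] = 1.0f; y[i] = 2.0f; }
    triad(x, y, z, n);
    printf("%f\n", z[0]); //expect 1 + 3.14159*2 = 7.28318
    free(x); free(y); free(z);
    return 0;
}

On pre-C11 systems, posix_memalign gives the same guarantee.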
Edit:
I was a bit disappointed that the compiler didn't optimize this code down to the last instruction, so I messed around with it a bit more. Just changing the order of things in the loop got rid of the AND emitted by the compiler, which got me on the right track. Then I only had to stop it from doing unnecessary address calculation inside the loop. Sigh.
void triadtwo(float *restrict x, float *restrict y, float *restrict z, size_t n)
{
    float *restrict d = __builtin_assume_aligned(z, 32);
    float *restrict m = __builtin_assume_aligned(y, 32);
    float *restrict a = __builtin_assume_aligned(x, 32);
    float const k = 3.14159f;
    n <<= 2;         //count bytes instead of floats
    while (n &= -32) //-32 == ~31: round down to a multiple of 32 bytes (8 floats), stop at 0
    {
        d[(n>>2)-8] = k * m[(n>>2)-8] + a[(n>>2)-8];
        d[(n>>2)-7] = k * m[(n>>2)-7] + a[(n>>2)-7];
        d[(n>>2)-6] = k * m[(n>>2)-6] + a[(n>>2)-6];
        d[(n>>2)-5] = k * m[(n>>2)-5] + a[(n>>2)-5];
        d[(n>>2)-4] = k * m[(n>>2)-4] + a[(n>>2)-4];
        d[(n>>2)-3] = k * m[(n>>2)-3] + a[(n>>2)-3];
        d[(n>>2)-2] = k * m[(n>>2)-2] + a[(n>>2)-2];
        d[(n>>2)-1] = k * m[(n>>2)-1] + a[(n>>2)-1];
        n -= 32;
    }
}
Ugly code? Yes. But the assembly:
triadtwo:
        salq    $2, %rcx
        andq    $-32, %rcx
        je      .L54
        vmovaps .LC0(%rip), %ymm1
.L50:
        vmovaps -32(%rsi,%rcx), %ymm0
        vfmadd213ps -32(%rdi,%rcx), %ymm1, %ymm0
        vmovaps %ymm0, -32(%rdx,%rcx)
        subq    $32, %rcx
        jne     .L50
        vzeroupper
.L54:
        rep ret
        .cfi_endproc
.LC0:
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
        .long   1078530000
Mmmmhhh, glorious five instructions in the loop, macro-op-fusible subtract-and-branch...
Second edit: the same idea generalizes to any power-of-two number of floats per vector, selectable at compile time. Final code:

#define SF sizeof(float)
#ifndef NO   //floats per vector, compile with -DNO=1,2,4,8,...
#define NO 8 //MUST be power of two
#endif

void triadfinaler(float const *restrict x, float const *restrict y,
                  float *restrict z, size_t n)
{
    float *restrict d = __builtin_assume_aligned(z, NO*SF);       //gcc builtin,
    float const *restrict m = __builtin_assume_aligned(y, NO*SF); //optional but produces
    float const *restrict a = __builtin_assume_aligned(x, NO*SF); //better code
    float const k = 3.14159f;
    n *= SF;                          //count bytes instead of floats
    while (n &= ~((size_t)(NO*SF)-1)) //this is why NO*SF must be power of two
    {
        size_t nl = n/SF;             //back to a float index
        for (size_t i = 0; i < NO; i++) //compiler unrolls and vectorizes this
            d[nl-NO+i] = k * m[nl-NO+i] + a[nl-NO+i];
        n -= NO*SF;
    }
}
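For completeness, an invocation along these lines should reproduce the assembly shown above; the -march value and file name are my guesses, since the answer only says gcc 4.8 with -O3 on a Haswell-class corei7avx2:

gcc -std=c99 -O3 -march=core-avx2 -DNO=8 -S triad.c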