问题
Can the compiler optimize loops if the last index of the loops (a
and b
in the following example) are not known at compile time?
Unoptimized:
int* arr = new int[a*b];
for (i = 0; i < a; ++i){
for(j = 0; j < b; ++j){
arr[i*b+j] *= 8;
}
}
//delete arr after done.
More Optimized: (assuming a and b are large...)
int c = a*b;
int* arr = new int[c];
for (i = 0; i < c; ++i){
arr[c] *= 8;
}
//delete arr after done.
回答1:
If you treat the array as linear space, gcc (and presumably others) will optimise even without knowing the extents at compile time.
This code:
void by8(int* arr, int a, int b)
{
auto extent = a * b;
for (int i = 0; i < extent; ++i)
{
arr[i] *= 8;
}
}
compiles to this (notice how the inner part of the loop is vectorised)
by8(int*, int, int):
imull %esi, %edx
testl %edx, %edx
jle .L23
movq %rdi, %rax
andl $31, %eax
shrq $2, %rax
negq %rax
andl $7, %eax
cmpl %edx, %eax
cmova %edx, %eax
cmpl $8, %edx
jg .L26
movl %edx, %eax
.L3:
sall $3, (%rdi)
cmpl $1, %eax
je .L15
sall $3, 4(%rdi)
cmpl $2, %eax
je .L16
sall $3, 8(%rdi)
cmpl $3, %eax
je .L17
sall $3, 12(%rdi)
cmpl $4, %eax
je .L18
sall $3, 16(%rdi)
cmpl $5, %eax
je .L19
sall $3, 20(%rdi)
cmpl $6, %eax
je .L20
sall $3, 24(%rdi)
cmpl $7, %eax
je .L21
sall $3, 28(%rdi)
movl $8, %ecx
.L5:
cmpl %eax, %edx
je .L27
.L4:
leal -1(%rdx), %r8d
movl %edx, %r9d
movl %eax, %r10d
subl %eax, %r9d
subl %eax, %r8d
leal -8(%r9), %esi
shrl $3, %esi
addl $1, %esi
leal 0(,%rsi,8), %r11d
cmpl $6, %r8d
jbe .L7
leaq (%rdi,%r10,4), %r10
xorl %eax, %eax
xorl %r8d, %r8d
.L9:
vmovdqa (%r10,%rax), %ymm0
addl $1, %r8d
vpslld $3, %ymm0, %ymm0
vmovdqa %ymm0, (%r10,%rax)
addq $32, %rax
cmpl %r8d, %esi
ja .L9
addl %r11d, %ecx
cmpl %r11d, %r9d
je .L22
vzeroupper
.L7:
movslq %ecx, %rax
sall $3, (%rdi,%rax,4)
leal 1(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 2(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 3(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 4(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 5(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
addl $6, %ecx
sall $3, (%rdi,%rax,4)
cmpl %ecx, %edx
jle .L28
movslq %ecx, %rcx
sall $3, (%rdi,%rcx,4)
ret
.L22:
vzeroupper
.L23:
ret
.L27:
ret
.L26:
testl %eax, %eax
jne .L3
xorl %ecx, %ecx
jmp .L4
.L28:
ret
.L21:
movl $7, %ecx
jmp .L5
.L15:
movl $1, %ecx
jmp .L5
.L16:
movl $2, %ecx
jmp .L5
.L17:
movl $3, %ecx
jmp .L5
.L18:
movl $4, %ecx
jmp .L5
.L19:
movl $5, %ecx
jmp .L5
.L20:
movl $6, %ecx
jmp .L5
compiler : gcc 5.4 with command line options: -std=c++14 -O3 -march=native
回答2:
Yes it probably can, given that the sizes are constant and do not change in your loop, as it happens here. Read Optimize "for" loop for more please.
FYI, in your fist example, this:
arr[j*a+b] *= 8;
should be this:
arr[j*a+i] *= 8;
回答3:
Modern compilers can definitely change the order of the two array, to prevent unneeded cache misses, from:
for (i = 0; i < a; ++i){
for(j = 0; j < b; ++j){
arr[j*a+i] *= 8;
}
}
to this:
for(j = 0; j < b; ++j){
for (i = 0; i < a; ++i){
arr[j*a+i] *= 8;
}
}
After this optimizations, the two examples (compared to your manual optimization) shouldn't measurably differ in performance.
回答4:
if you are using the visual studio compiler you can use the /Qvec-report command line argument and it will tell you which loops are/are not being vectorized and give you reason codes as to why they are not
vectorization of loops (unlike unrolling) is where the compiler uses the SIMD (SSE,SSE2,AVX) instructions to break the loop into a series of operation which are performed in parallel
https://msdn.microsoft.com/en-us/library/jj658585.aspx
gcc and clang may have similar capabilities
回答5:
You can always unroll a for loop. Even if you don't know the number of iterations it should do with a trick called Duff's device
Also see the explanation here on stackoverflow: How does Duff's device work?
You can have an interleaved switch and while loop, and let the while loop process, say, 4 items at once. if you'd like to process 6 items, you can then cheat by jumping to the second last item in the loop processing 2+4=6 items:
int n = 6;
int it = n / 4;
int check = 0;
switch (n % 4) {
case 0: do { check += 1;
case 3: check += 1;
case 2: check += 1;
case 1: check += 1;
} while (it--);
}
printf("processed %i items\n", check);
来源:https://stackoverflow.com/questions/39724268/can-compiler-optimize-loop-with-variable-length