问题
I am learning how to use SIMD directives with OpenMP/Fortran. I wrote the simple code:
program loop
implicit none
integer :: i,j
real*8 :: x
x = 0.0
do i=1,10000
do j=1,10000000
x = x + 1.0/(1.0*i)
enddo
enddo
print*, x
end program loop
when I compile this code and run it I get:
ifort -O3 -vec-report3 -xhost loop_simd.f90
loop_simd.f90(10): (col. 12) remark: LOOP WAS VECTORIZED
loop_simd.f90(9): (col. 7) remark: loop was not vectorized: not inner loop
time ./a.out
97876060.8355515
real 0m8.940s
user 0m8.937s
sys 0m0.005s
I did what the compiler suggested about the "not inner loop" and added a SIMD collapse(2) directive:
program loop
implicit none
integer :: i,j
real*8 :: x
x = 0.0
!$omp simd collapse(2) reduction(+:x)
do i=1,10000
do j=1,10000000
x = x + 1.0/(1.0*i)
enddo
enddo
print*, x
end program loop
Then I compiled and ran the code again and got the following output:
ifort -O3 -vec-report3 -openmp -xhost loop_simd.f90
loop_simd.f90(8): (col. 7) remark: OpenMP SIMD LOOP WAS VECTORIZED
time ./a.out
97876054.9903757
real 0m26.535s
user 0m26.540s
sys 0m0.003s
What I don't understand is why the performance decreases with the SIMD directive. And when will SIMD be better than standard Fortran code?
ASM output for the OpenMP SIMD code (compiled with -openmp):
.section .text
.LNDBG_TX:
# mark_description "Intel(R) Fortran Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 14.0.2.144 Build 2";
# mark_description "0140120";
# mark_description "-O3 -vec-report3 -openmp -xhost -S";
.file "loop_simd.f90"
.text
..TXTST0:
L__routine_start_MAIN___0:
# -- Begin MAIN__
# mark_begin;
.align 16,0x90
.globl MAIN__
MAIN__:
..B1.1: # Preds ..B1.0
..___tag_value_MAIN__.1: #1.9
..LN0:
.file 1 "loop_simd.f90"
.loc 1 1 is_stmt 1
pushq %rbp #1.9
..___tag_value_MAIN__.3: #
..LN1:
movq %rsp, %rbp #1.9
..___tag_value_MAIN__.4: #
..LN2:
andq $-128, %rsp #1.9
..LN3:
subq $128, %rsp #1.9
..LN4:
movq $0x0000117fe, %rsi #1.9
..LN5:
movl $3, %edi #1.9
..LN6:
call __intel_new_feature_proc_init #1.9
..LN7:
# LOE rbx r12 r13 r14 r15
..B1.12: # Preds ..B1.1
..LN8:
vstmxcsr (%rsp) #1.9
..LN9:
movl $.2.3_2_kmpc_loc_struct_pack.1, %edi #1.9
..LN10:
xorl %esi, %esi #1.9
..LN11:
orl $32832, (%rsp) #1.9
..LN12:
xorl %eax, %eax #1.9
..LN13:
vldmxcsr (%rsp) #1.9
..___tag_value_MAIN__.6: #1.9
..LN14:
call __kmpc_begin #1.9
..___tag_value_MAIN__.7: #
..LN15:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.12
..LN16:
movl $__NLITPACK_0.0.1, %edi #1.9
..LN17:
call for_set_reentrancy #1.9
..LN18:
# LOE rbx r12 r13 r14 r15
..B1.3: # Preds ..B1.2
..LN19:
.loc 1 8 is_stmt 1
movl $4, %eax #8.7
..LN20:
.loc 1 6 is_stmt 1
vxorpd %ymm2, %ymm2, %ymm2 #6.7
..LN21:
.loc 1 8 is_stmt 1
vmovd %eax, %xmm0 #8.7
..LN22:
xorl %eax, %eax #8.7
..LN23:
vpshufd $0, %xmm0, %xmm1 #8.7
..LN24:
vmovdqu .L_2il0floatpacket.19(%rip), %xmm0 #8.7
..LN25:
# LOE rbx r12 r13 r14 r15 eax xmm0 xmm1 ymm2
..B1.4: # Preds ..B1.6 ..B1.3
..LN26:
.loc 1 11 is_stmt 1
vcvtdq2ps %xmm0, %xmm3 #11.34
..LN27:
vrcpps %xmm3, %xmm5 #11.28
..LN28:
vmulps %xmm3, %xmm5, %xmm4 #11.28
..LN29:
vaddps %xmm5, %xmm5, %xmm6 #11.28
..LN30:
vmulps %xmm5, %xmm4, %xmm7 #11.28
..LN31:
.loc 1 10 is_stmt 1
xorl %edx, %edx #10.12
..LN32:
.loc 1 11 is_stmt 1
vsubps %xmm7, %xmm6, %xmm8 #11.28
..LN33:
vcvtps2pd %xmm8, %ymm3 #11.28
..LN34:
# LOE rbx r12 r13 r14 r15 eax edx xmm0 xmm1 ymm2 ymm3
..B1.5: # Preds ..B1.5 ..B1.4
..LN35:
.loc 1 10 is_stmt 1
incl %edx #10.12
..LN36:
.loc 1 11 is_stmt 1
vaddpd %ymm3, %ymm2, %ymm2 #11.17
..LN37:
.loc 1 10 is_stmt 1
cmpl $10000000, %edx #10.12
..LN38:
jb ..B1.5 # Prob 99% #10.12
..LN39:
# LOE rbx r12 r13 r14 r15 eax edx xmm0 xmm1 ymm2 ymm3
..B1.6: # Preds ..B1.5
..LN40:
.loc 1 8 is_stmt 1
addl $4, %eax #8.7
..LN41:
.loc 1 10 is_stmt 1
vpaddd %xmm1, %xmm0, %xmm0 #10.12
..LN42:
.loc 1 8 is_stmt 1
cmpl $10000, %eax #8.7
..LN43:
jb ..B1.4 # Prob 66% #8.7
..LN44:
# LOE rbx r12 r13 r14 r15 eax xmm0 xmm1 ymm2
..B1.7: # Preds ..B1.6
..LN45:
.loc 1 6 is_stmt 1
..LN46:
.loc 1 15 is_stmt 1
lea (%rsp), %rdi #15.7
..LN47:
.loc 1 6 is_stmt 1
vextractf128 $1, %ymm2, %xmm0 #6.7
..LN48:
.loc 1 15 is_stmt 1
movl $-1, %esi #15.7
..LN49:
.loc 1 6 is_stmt 1
vaddpd %xmm0, %xmm2, %xmm1 #6.7
..LN50:
vunpckhpd %xmm1, %xmm1, %xmm3 #6.7
..LN51:
.loc 1 15 is_stmt 1
lea 64(%rsp), %r8 #15.7
..LN52:
movq $0x1208384ff00, %rdx #15.7
..LN53:
movl $__STRLITPACK_0.0.1, %ecx #15.7
..LN54:
xorl %eax, %eax #15.7
..LN55:
.loc 1 6 is_stmt 1
vaddsd %xmm3, %xmm1, %xmm4 #6.7
..LN56:
.loc 1 15 is_stmt 1
vmovsd %xmm4, 64(%rsp) #15.7
..LN57:
movq $0, (%rsp) #15.7
..LN58:
vzeroupper #15.7
..LN59:
call for_write_seq_lis #15.7
..LN60:
# LOE rbx r12 r13 r14 r15
..B1.8: # Preds ..B1.7
..LN61:
.loc 1 18 is_stmt 1
movl $.2.3_2_kmpc_loc_struct_pack.12, %edi #18.1
..LN62:
xorl %eax, %eax #18.1
..___tag_value_MAIN__.8: #18.1
..LN63:
call __kmpc_end #18.1
..___tag_value_MAIN__.9: #
..LN64:
# LOE rbx r12 r13 r14 r15
..B1.9: # Preds ..B1.8
..LN65:
movl $1, %eax #18.1
..LN66:
movq %rbp, %rsp #18.1
..LN67:
popq %rbp #18.1
..___tag_value_MAIN__.10: #
..LN68:
ret #18.1
.align 16,0x90
..___tag_value_MAIN__.12: #
..LN69:
# LOE
..LN70:
# mark_end;
.type MAIN__,@function
.size MAIN__,.-MAIN__
..LNMAIN__.71:
.LNMAIN__:
.data
.align 4
.align 4
.2.3_2_kmpc_loc_struct_pack.1:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.0
.align 4
.2.3_2__kmpc_loc_pack.0:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 77
.byte 65
.byte 73
.byte 78
.byte 95
.byte 95
.byte 59
.byte 49
.byte 59
.byte 49
.byte 59
.byte 59
.space 3, 0x00 # pad
.align 4
.2.3_2_kmpc_loc_struct_pack.12:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.11
.align 4
.2.3_2__kmpc_loc_pack.11:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 77
.byte 65
.byte 73
.byte 78
.byte 95
.byte 95
.byte 59
.byte 49
.byte 56
.byte 59
.byte 49
.byte 56
.byte 59
.byte 59
.section .rodata, "a"
.align 16
.align 8
__NLITPACK_0.0.1:
.long 0x00000002,0x00000000
.align 4
__STRLITPACK_0.0.1:
.byte 48
.byte 1
.byte 1
.byte 0
.byte 0
.data
# -- End MAIN__
.section .rodata, "a"
.space 3, 0x00 # pad
.align 16
.L_2il0floatpacket.19:
.long 0x00000001,0x00000002,0x00000003,0x00000004
.type .L_2il0floatpacket.19,@object
.size .L_2il0floatpacket.19,16
.align 16
.L_2il0floatpacket.20:
.long 0x3f800000,0x3f800000,0x3f800000,0x3f800000
.type .L_2il0floatpacket.20,@object
.size .L_2il0floatpacket.20,16
.data
.section .note.GNU-stack, ""
# End
ASM output for the non-OpenMP code (compiled without -openmp):
.section .text
.LNDBG_TX:
# mark_description "Intel(R) Fortran Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 14.0.2.144 Build 2";
# mark_description "0140120";
# mark_description "-O3 -vec-report3 -xhost -S";
.file "loop_simd.f90"
.text
..TXTST0:
L__routine_start_MAIN___0:
# -- Begin MAIN__
# mark_begin;
.align 16,0x90
.globl MAIN__
MAIN__:
..B1.1: # Preds ..B1.0
..___tag_value_MAIN__.1: #1.9
..LN0:
.file 1 "loop_simd.f90"
.loc 1 1 is_stmt 1
pushq %rbp #1.9
..___tag_value_MAIN__.3: #
..LN1:
movq %rsp, %rbp #1.9
..___tag_value_MAIN__.4: #
..LN2:
andq $-128, %rsp #1.9
..LN3:
subq $128, %rsp #1.9
..LN4:
movq $0x0000117fe, %rsi #1.9
..LN5:
movl $3, %edi #1.9
..LN6:
call __intel_new_feature_proc_init #1.9
..LN7:
# LOE rbx r12 r13 r14 r15
..B1.10: # Preds ..B1.1
..LN8:
vstmxcsr (%rsp) #1.9
..LN9:
movl $__NLITPACK_0.0.1, %edi #1.9
..LN10:
orl $32832, (%rsp) #1.9
..LN11:
vldmxcsr (%rsp) #1.9
..LN12:
call for_set_reentrancy #1.9
..LN13:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.10
..LN14:
.loc 1 6 is_stmt 1
..LN15:
.loc 1 11 is_stmt 1
vmovss .L_2il0floatpacket.0(%rip), %xmm6 #11.28
..LN16:
.loc 1 9 is_stmt 1
xorl %eax, %eax #9.7
..LN17:
.loc 1 6 is_stmt 1
vxorpd %ymm8, %ymm8, %ymm8 #6.7
..LN18:
vmovapd %ymm8, %ymm7 #6.7
..LN19:
vmovapd %ymm8, %ymm0 #6.7
..LN20:
vmovapd %ymm8, %ymm1 #6.7
..LN21:
vmovapd %ymm8, %ymm2 #6.7
..LN22:
vmovapd %ymm8, %ymm3 #6.7
..LN23:
vmovapd %ymm8, %ymm4 #6.7
..LN24:
vmovapd %ymm8, %ymm5 #6.7
..LN25:
# LOE rbx r12 r13 r14 r15 eax xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8
..B1.3: # Preds ..B1.5 ..B1.2
..LN26:
incl %eax #
..LN27:
.loc 1 11 is_stmt 1
vxorps %xmm9, %xmm9, %xmm9 #11.28
..LN28:
vcvtsi2ss %eax, %xmm9, %xmm9 #11.28
..LN29:
vdivss %xmm9, %xmm6, %xmm10 #11.28
..LN30:
vcvtss2sd %xmm10, %xmm10, %xmm10 #11.28
..LN31:
vmovddup %xmm10, %xmm11 #11.28
..LN32:
.loc 1 10 is_stmt 1
xorl %edx, %edx #10.12
..LN33:
.loc 1 11 is_stmt 1
vinsertf128 $1, %xmm11, %ymm11, %ymm9 #11.28
..LN34:
# LOE rbx r12 r13 r14 r15 eax edx xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8 ymm9
..B1.4: # Preds ..B1.4 ..B1.3
..LN35:
.loc 1 10 is_stmt 1
addl $32, %edx #10.12
..LN36:
.loc 1 11 is_stmt 1
vaddpd %ymm9, %ymm8, %ymm8 #11.17
..LN37:
vaddpd %ymm7, %ymm9, %ymm7 #11.17
..LN38:
vaddpd %ymm0, %ymm9, %ymm0 #11.17
..LN39:
vaddpd %ymm1, %ymm9, %ymm1 #11.17
..LN40:
vaddpd %ymm2, %ymm9, %ymm2 #11.17
..LN41:
vaddpd %ymm3, %ymm9, %ymm3 #11.17
..LN42:
vaddpd %ymm4, %ymm9, %ymm4 #11.17
..LN43:
vaddpd %ymm5, %ymm9, %ymm5 #11.17
..LN44:
.loc 1 10 is_stmt 1
cmpl $10000000, %edx #10.12
..LN45:
jb ..B1.4 # Prob 99% #10.12
..LN46:
# LOE rbx r12 r13 r14 r15 eax edx xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8 ymm9
..B1.5: # Preds ..B1.4
..LN47:
.loc 1 9 is_stmt 1
cmpl $10000, %eax #9.7
..LN48:
jb ..B1.3 # Prob 66% #9.7
..LN49:
# LOE rbx r12 r13 r14 r15 eax xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8
..B1.6: # Preds ..B1.5
..LN50:
.loc 1 6 is_stmt 1
vaddpd %ymm7, %ymm8, %ymm6 #6.7
..LN51:
.loc 1 15 is_stmt 1
lea (%rsp), %rdi #15.7
..LN52:
.loc 1 6 is_stmt 1
vaddpd %ymm1, %ymm0, %ymm0 #6.7
..LN53:
vaddpd %ymm3, %ymm2, %ymm1 #6.7
..LN54:
vaddpd %ymm5, %ymm4, %ymm2 #6.7
..LN55:
vaddpd %ymm0, %ymm6, %ymm3 #6.7
..LN56:
vaddpd %ymm2, %ymm1, %ymm4 #6.7
..LN57:
vaddpd %ymm4, %ymm3, %ymm5 #6.7
..LN58:
.loc 1 15 is_stmt 1
movl $-1, %esi #15.7
..LN59:
movq $0x1208384ff00, %rdx #15.7
..LN60:
movl $__STRLITPACK_0.0.1, %ecx #15.7
..LN61:
xorl %eax, %eax #15.7
..LN62:
lea 64(%rsp), %r8 #15.7
..LN63:
movq $0, (%rsp) #15.7
..LN64:
.loc 1 6 is_stmt 1
vextractf128 $1, %ymm5, %xmm7 #6.7
..LN65:
vaddpd %xmm7, %xmm5, %xmm8 #6.7
..LN66:
vunpckhpd %xmm8, %xmm8, %xmm9 #6.7
..LN67:
vaddsd %xmm9, %xmm8, %xmm10 #6.7
..LN68:
.loc 1 15 is_stmt 1
vmovsd %xmm10, 64(%rsp) #15.7
..LN69:
vzeroupper #15.7
..LN70:
call for_write_seq_lis #15.7
..LN71:
# LOE rbx r12 r13 r14 r15
..B1.7: # Preds ..B1.6
..LN72:
.loc 1 18 is_stmt 1
movl $1, %eax #18.1
..LN73:
movq %rbp, %rsp #18.1
..LN74:
popq %rbp #18.1
..___tag_value_MAIN__.6: #
..LN75:
ret #18.1
.align 16,0x90
..___tag_value_MAIN__.8: #
..LN76:
# LOE
..LN77:
# mark_end;
.type MAIN__,@function
.size MAIN__,.-MAIN__
..LNMAIN__.78:
.LNMAIN__:
.section .rodata, "a"
.align 8
.align 8
__NLITPACK_0.0.1:
.long 0x00000000,0x00000000
.align 4
__STRLITPACK_0.0.1:
.byte 48
.byte 1
.byte 1
.byte 0
.byte 0
.data
# -- End MAIN__
.section .rodata, "a"
.space 3, 0x00 # pad
.align 4
.L_2il0floatpacket.0:
.long 0x3f800000
.type .L_2il0floatpacket.0,@object
.size .L_2il0floatpacket.0,4
.data
.section .note.GNU-stack, ""
# End
回答1:
With OpenMP, ifort is using SIMD to vectorize the outer loop (over `i`), so essentially all the time is spent doing
## set up ymm3 with 1.0/(1.0*i) for 4 consecutive values of i (one per lane, since the outer loop is vectorized),
# and j = %edx = 0
..B1.5: do {
incl %edx # j++
vaddpd %ymm3, %ymm2, %ymm2 # ymm3 + ymm2 => ymm2
cmpl $10000000, %edx } while(j<10000000);
jb ..B1.5 # Prob 99%
10M iterations of `vaddpd` will completely dominate the cost of everything outside the loop, so all that matters is that this inner loop is executed 10k / 4 times. (Note the `add $4, %eax` / `cmp $10000, %eax` / `jb`, with a branch target back to before the inner loop.)
Since it's only using a single accumulator, throughput is limited by the loop-carried dependency (3 cycles).
Without OpenMP:
It's still doing the full amount of work, not optimizing away any of the loops.
It auto-vectorizes as it does with the `!$omp simd` directive, but using multiple accumulators for increased parallelism. Multiple add instructions can be in flight at once, instead of each one depending on the previous.
The setup for the inner loop is very similar, and then the inner loop is:
## set up ymm9 with 4 copies of 1.0/(1.0*i),
..B1.4:
addl $32, %edx #10.12
vaddpd %ymm9, %ymm8, %ymm8 # ymm8 + ymm9 => ymm8
vaddpd %ymm7, %ymm9, %ymm7 # ymm7 + ymm9 => ymm7
vaddpd %ymm0, %ymm9, %ymm0 # ymm0 + ymm9 => ymm0
vaddpd %ymm1, %ymm9, %ymm1 # ...
vaddpd %ymm2, %ymm9, %ymm2
vaddpd %ymm3, %ymm9, %ymm3
vaddpd %ymm4, %ymm9, %ymm4
vaddpd %ymm5, %ymm9, %ymm5
cmpl $10000000, %edx
jb ..B1.4 # Prob 99%
# then combine the 8 vector accumulators down to one, and horizontal sum that.
8 accumulators could keep 8 `vaddpd`s in flight at once, but the latency is only 3 cycles on Intel SnB/IvB (see Agner Fog's instruction tables), so at one `vaddpd` per clock, 3 accumulators would already be enough to hide it. You didn't say what microarchitecture you're using, but I could infer Sandybridge/Ivybridge from the fact that `-xhost` uses AVX1 but not AVX2 (broadcast with `vmovddup` / `vinsertf128`, rather than AVX2 `vbroadcastsd %xmm9, %ymm9`).
This perfectly explains the 3x speed ratio: 26.535 / 8.940 = 2.97 ~= 3. (`vaddpd` has a throughput of one per clock on pre-Skylake Intel CPUs, latency=3. The non-OpenMP version is limited by throughput rather than latency, because of the increased instruction-level parallelism.)
Unrolling with this many accumulators will help for Skylake, where FP add has 4 cycle latency and two per cycle throughput. (SKL dropped the lower-latency dedicated vector FP add unit from port 1, and runs it in the improved 4c-latency FMA units on ports 0 and 1.)
回答2:
You may be better off using SIMD for the inner loop only. Then you can use `!$omp parallel` on the outer loop. Since `i` belongs to the outer loop, you could/should probably also swap the outer and inner loops. If you introduced another variable for `1.0/(1.0*i)`, then perhaps that could be vectorized. The reduction would then assume that the new variable is a vector on the heap, and not a value that is private in OpenMP.
These things usually take a bit to work out...
回答3:
SIMD instructions are intended to improve the performance of code that operates on vectors or arrays. Your sample code only operates on a scalar variable; it is therefore unsurprising that forcing vectorization does not improve performance!
来源:https://stackoverflow.com/questions/37361360/why-the-openmp-simd-directive-reduces-performance