-O3 in ICC messes up intrinsics, fine with -O1 or -O2 or with corresponding manual assembly

This is a followup on this question.

The code below for a 4x4 matrix multiplication C = AB compiles fine on ICC on all optimization settings. It executes correctly on -O1 and -O2, but gives an incorrect result on -O3. The problem seems to come from the _mm256_storeu_pd operation, as substituting it (and only it) with the asm statement below gives correct results after execution. Any ideas?

inline void RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(double *A, double *B, double *C)
{
    size_t i;

    /* the registers you use */
    __m256d a0, a1, a2, a3, b0, b1, b2, b3, sum;
    //  __m256d *C256 = (__m256d *)C;

    /* load values from B */
    b0 = _mm256_loadu_pd(&B[0]);
    b1 = _mm256_loadu_pd(&B[4]);
    b2 = _mm256_loadu_pd(&B[8]);
    b3 = _mm256_loadu_pd(&B[12]);

    for (i = 0; i < 4; i++) {
        /* load values from A */
        a0 = _mm256_set1_pd(A[4*i + 0]);
        a1 = _mm256_set1_pd(A[4*i + 1]);
        a2 = _mm256_set1_pd(A[4*i + 2]);
        a3 = _mm256_set1_pd(A[4*i + 3]);

        sum = _mm256_mul_pd(a0, b0);
        sum = _mm256_fmadd_pd(a1, b1, sum);
        sum = _mm256_fmadd_pd(a2, b2, sum);
        sum = _mm256_fmadd_pd(a3, b3, sum);

        // asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
        _mm256_storeu_pd(&C[4*i], sum);

    }

}

Also, here is the assembly generated by ICC. The arrows indicate the line with _mm256_storeu_pd or the asm statement respectively. RunIntrinsics_FMA_UnalignedCopy_Struct is a function that takes stored numbers from SourceMatrix and calls the matrix multiplication routine.

-O2 -xcore-avx2

ICC Test`RunIntrinsics_FMA_UnalignedCopy_Struct:

0x1000053c0 <+0>:    pushq  %rbp
0x1000053c1 <+1>:    movq   %rsp, %rbp
0x1000053c4 <+4>:    andq   $-0x20, %rsp
0x1000053c8 <+8>:    pushq  %r12
0x1000053ca <+10>:   pushq  %r13
0x1000053cc <+12>:   pushq  %r14
0x1000053ce <+14>:   pushq  %r15
0x1000053d0 <+16>:   pushq  %rbx
0x1000053d1 <+17>:   subq   $0x4f8, %rsp              ; imm = 0x4F8 
0x1000053d8 <+24>:   callq  0x10000b538               ; symbol stub for: clock
0x1000053dd <+29>:   movq   %rax, %rbx
0x1000053e0 <+32>:   vmovupd 0x95f8(%rip), %ymm11      ; SourceMatrix + 190
0x1000053e8 <+40>:   xorl   %eax, %eax
0x1000053ea <+42>:   vxorpd %xmm1, %xmm1, %xmm1
0x1000053ee <+46>:   vmovsd %xmm1, 0x40(%rsp)
0x1000053f4 <+52>:   vmovupd 0x9604(%rip), %ymm10      ; SourceMatrix + 222
0x1000053fc <+60>:   vmovupd 0x95bc(%rip), %ymm12      ; SourceMatrix + 158
0x100005404 <+68>:   vmovupd 0x9574(%rip), %ymm13      ; SourceMatrix + 94
0x10000540c <+76>:   vmovupd 0x952c(%rip), %ymm14      ; SourceMatrix + 30
0x100005414 <+84>:   vmovupd 0x9504(%rip), %ymm15      ; c_feature_names + 446
0x10000541c <+92>:   vmovupd 0x95fc(%rip), %ymm8       ; SourceMatrix + 254
0x100005424 <+100>:  vmovupd 0x9614(%rip), %ymm7       ; SourceMatrix + 286
0x10000542c <+108>:  vmovupd 0x962c(%rip), %ymm6       ; SourceMatrix + 318
0x100005434 <+116>:  vmovupd 0x9644(%rip), %ymm5       ; SourceMatrix + 350
0x10000543c <+124>:  vmovupd 0x965c(%rip), %ymm3       ; SourceMatrix + 382
0x100005444 <+132>:  vmovupd 0x9674(%rip), %ymm2       ; SourceMatrix + 414
0x10000544c <+140>:  vmovupd 0x968c(%rip), %ymm1       ; SourceMatrix + 446
0x100005454 <+148>:  vmovupd %ymm10, 0x420(%rsp)
0x10000545d <+157>:  vmovupd %ymm11, 0x440(%rsp)
0x100005466 <+166>:  vmovsd 0x626a(%rip), %xmm10      ; xmm10 = mem[0],zero 
0x10000546e <+174>:  vmovsd 0x40(%rsp), %xmm11        ; xmm11 = mem[0],zero 
0x100005474 <+180>:  vmovupd 0x94e4(%rip), %ymm9       ; SourceMatrix + 62
0x10000547c <+188>:  vmovupd %ymm1, 0x20(%rsp)
0x100005482 <+194>:  vmovupd 0x9516(%rip), %ymm4       ; SourceMatrix + 126
0x10000548a <+202>:  vmovupd %ymm2, 0x360(%rsp)
0x100005493 <+211>:  vmovupd %ymm3, 0x3c0(%rsp)
0x10000549c <+220>:  vmovupd %ymm5, 0x380(%rsp)
0x1000054a5 <+229>:  vmovupd %ymm6, 0x3a0(%rsp)
0x1000054ae <+238>:  vmovupd %ymm7, 0x3e0(%rsp)
0x1000054b7 <+247>:  vmovupd %ymm8, 0x400(%rsp)
0x1000054c0 <+256>:  vmovupd %ymm12, 0x4c0(%rsp)
0x1000054c9 <+265>:  vmovupd %ymm13, 0x4a0(%rsp)
0x1000054d2 <+274>:  vmovupd %ymm14, 0x480(%rsp)
0x1000054db <+283>:  vmovupd %ymm15, 0x460(%rsp)
0x1000054e4 <+292>:  vxorpd %ymm0, %ymm0, %ymm0
0x1000054e8 <+296>:  vmovupd %ymm0, 0x260(%rsp)
0x1000054f1 <+305>:  vmovupd %ymm0, 0x2e0(%rsp)
0x1000054fa <+314>:  vmovupd %ymm0, 0x280(%rsp)
0x100005503 <+323>:  vmovupd %ymm0, 0x300(%rsp)
0x10000550c <+332>:  vmovupd %ymm0, 0x2a0(%rsp)
0x100005515 <+341>:  vmovupd %ymm0, 0x320(%rsp)
0x10000551e <+350>:  vmovupd %ymm0, 0x2c0(%rsp)
0x100005527 <+359>:  vmovupd %ymm0, 0x340(%rsp)
0x100005530 <+368>:  vmovupd 0x95c8(%rip), %ymm0       ; SourceMatrix + 478
0x100005538 <+376>:  vmovupd %ymm0, (%rsp)
0x10000553d <+381>:  incl   %eax
0x10000553f <+383>:  vxorpd %xmm3, %xmm3, %xmm3
0x100005543 <+387>:  vcvtsi2sdl %eax, %xmm3, %xmm3
0x100005547 <+391>:  vdivsd %xmm3, %xmm10, %xmm2
0x10000554b <+395>:  vbroadcastsd %xmm2, %ymm8
0x100005550 <+400>:  vaddpd 0x460(%rsp), %ymm8, %ymm1
0x100005559 <+409>:  vaddpd %ymm4, %ymm8, %ymm3
0x10000555d <+413>:  vaddpd 0x480(%rsp), %ymm8, %ymm0
0x100005566 <+422>:  vaddpd 0x420(%rsp), %ymm8, %ymm2
0x10000556f <+431>:  vaddpd %ymm9, %ymm8, %ymm6
0x100005574 <+436>:  vaddpd 0x4a0(%rsp), %ymm8, %ymm7
0x10000557d <+445>:  vaddpd 0x400(%rsp), %ymm8, %ymm5
0x100005586 <+454>:  vmovupd %ymm1, 0x60(%rsp)
0x10000558c <+460>:  vmovupd %ymm0, 0x80(%rsp)
0x100005595 <+469>:  vmovupd %ymm6, 0xa0(%rsp)
0x10000559e <+478>:  vmovupd %ymm7, 0xc0(%rsp)
0x1000055a7 <+487>:  vmovupd %ymm3, 0xe0(%rsp)
0x1000055b0 <+496>:  vmovupd %ymm5, 0x160(%rsp)
0x1000055b9 <+505>:  vmovupd %ymm2, 0x140(%rsp)
0x1000055c2 <+514>:  vbroadcastsd 0x60(%rsp), %ymm14
0x1000055c9 <+521>:  vbroadcastsd 0x68(%rsp), %ymm13
0x1000055d0 <+528>:  vbroadcastsd 0x70(%rsp), %ymm15
0x1000055d7 <+535>:  vbroadcastsd 0x78(%rsp), %ymm12
0x1000055de <+542>:  vmulpd %ymm14, %ymm3, %ymm14
0x1000055e3 <+547>:  vaddpd 0x4c0(%rsp), %ymm8, %ymm1
0x1000055ec <+556>:  vaddpd 0x440(%rsp), %ymm8, %ymm0
0x1000055f5 <+565>:  vaddpd 0x380(%rsp), %ymm8, %ymm5
0x1000055fe <+574>:  vaddpd 0x3e0(%rsp), %ymm8, %ymm6
0x100005607 <+583>:  vaddpd 0x3a0(%rsp), %ymm8, %ymm7
0x100005610 <+592>:  vfmadd213pd %ymm14, %ymm1, %ymm13
0x100005615 <+597>:  vmovupd %ymm5, 0x1c0(%rsp)
0x10000561e <+606>:  vmovupd %ymm1, 0x100(%rsp)
0x100005627 <+615>:  vmovupd %ymm6, 0x180(%rsp)
0x100005630 <+624>:  vmovupd %ymm7, 0x1a0(%rsp)
0x100005639 <+633>:  vfmadd213pd %ymm13, %ymm0, %ymm15
0x10000563e <+638>:  vmovupd %ymm0, 0x120(%rsp)
0x100005647 <+647>:  vbroadcastsd 0x88(%rsp), %ymm13
0x100005651 <+657>:  vbroadcastsd 0x90(%rsp), %ymm14
0x10000565b <+667>:  vfmadd213pd %ymm15, %ymm2, %ymm12
0x100005660 <+672>:  vbroadcastsd 0x80(%rsp), %ymm15
0x10000566a <+682>:  vaddpd 0x3c0(%rsp), %ymm8, %ymm5
0x100005673 <+691>:  vaddpd 0x360(%rsp), %ymm8, %ymm7
0x10000567c <+700>:  vaddpd 0x20(%rsp), %ymm8, %ymm6
0x100005682 <+706>:  vaddpd (%rsp), %ymm8, %ymm8
0x100005687 <+711>:  vmulpd %ymm15, %ymm3, %ymm15
->  0x10000568c <+716>:  vmovupd %ymm12, 0x260(%rsp)
0x100005695 <+725>:  vmovupd %ymm5, 0x1e0(%rsp)
0x10000569e <+734>:  vmovupd %ymm8, 0x240(%rsp)
0x1000056a7 <+743>:  vmovupd %ymm6, 0x220(%rsp)
0x1000056b0 <+752>:  vfmadd213pd %ymm15, %ymm1, %ymm13
0x1000056b5 <+757>:  vmovupd %ymm7, 0x200(%rsp)

-O3 -xcore-avx2

ICC Test`RunIntrinsics_FMA_UnalignedCopy_Struct:

0x100004c10 <+0>:    pushq  %rbp
0x100004c11 <+1>:    movq   %rsp, %rbp
0x100004c14 <+4>:    andq   $-0x20, %rsp
0x100004c18 <+8>:    pushq  %r12
0x100004c1a <+10>:   pushq  %r13
0x100004c1c <+12>:   pushq  %r14
0x100004c1e <+14>:   pushq  %r15
0x100004c20 <+16>:   pushq  %rbx
0x100004c21 <+17>:   subq   $0x858, %rsp              ; imm = 0x858 
0x100004c28 <+24>:   callq  0x10000b538               ; symbol stub for: clock
0x100004c2d <+29>:   movq   %rax, %rbx
0x100004c30 <+32>:   vbroadcastsd 0xc0(%rsp), %ymm15
0x100004c3a <+42>:   xorl   %eax, %eax
0x100004c3c <+44>:   vxorpd %xmm1, %xmm1, %xmm1
0x100004c40 <+48>:   vmovsd %xmm1, 0x40(%rsp)
0x100004c46 <+54>:   vmovupd 0x9c12(%rip), %ymm2       ; c_feature_names + 446
0x100004c4e <+62>:   vmovupd %ymm15, 0x620(%rsp)
0x100004c57 <+71>:   vmovupd 0x9c41(%rip), %ymm13      ; SourceMatrix + 62
0x100004c5f <+79>:   vmovupd 0x9c19(%rip), %ymm14      ; SourceMatrix + 30
0x100004c67 <+87>:   vmovupd 0x9c51(%rip), %ymm12      ; SourceMatrix + 94
0x100004c6f <+95>:   vmovupd %ymm2, 0x640(%rsp)
0x100004c78 <+104>:  vmovupd 0x9c60(%rip), %ymm11      ; SourceMatrix + 126
0x100004c80 <+112>:  vmovupd 0x9c78(%rip), %ymm10      ; SourceMatrix + 158
0x100004c88 <+120>:  vmovupd 0x9c90(%rip), %ymm9       ; SourceMatrix + 190
0x100004c90 <+128>:  vmovupd %ymm13, 0x680(%rsp)
0x100004c99 <+137>:  vmovupd 0x9c9f(%rip), %ymm8       ; SourceMatrix + 222
0x100004ca1 <+145>:  vmovupd 0x9cb7(%rip), %ymm7       ; SourceMatrix + 254
0x100004ca9 <+153>:  vmovupd 0x9ccf(%rip), %ymm6       ; SourceMatrix + 286
0x100004cb1 <+161>:  vmovupd %ymm9, 0x700(%rsp)
0x100004cba <+170>:  vmovupd 0x9cde(%rip), %ymm5       ; SourceMatrix + 318
0x100004cc2 <+178>:  vmovupd 0x9cf6(%rip), %ymm4       ; SourceMatrix + 350
0x100004cca <+186>:  vmovupd 0x9d0e(%rip), %ymm3       ; SourceMatrix + 382
0x100004cd2 <+194>:  vmovupd %ymm6, 0x760(%rsp)
0x100004cdb <+203>:  vmovupd 0x9d1d(%rip), %ymm2       ; SourceMatrix + 414
0x100004ce3 <+211>:  vmovupd 0x9d35(%rip), %ymm1       ; SourceMatrix + 446
0x100004ceb <+219>:  vmovsd 0x40(%rsp), %xmm13        ; xmm13 = mem[0],zero 
0x100004cf1 <+225>:  vbroadcastsd 0xc8(%rsp), %ymm15
0x100004cfb <+235>:  vmovupd %ymm3, 0x7c0(%rsp)
0x100004d04 <+244>:  vmovupd %ymm2, 0x7e0(%rsp)
0x100004d0d <+253>:  vmovupd %ymm1, 0x800(%rsp)
0x100004d16 <+262>:  vmovupd %ymm15, 0x600(%rsp)
0x100004d1f <+271>:  vmovupd %ymm4, 0x7a0(%rsp)
0x100004d28 <+280>:  vmovupd %ymm5, 0x780(%rsp)
0x100004d31 <+289>:  vmovupd %ymm7, 0x740(%rsp)
0x100004d3a <+298>:  vmovupd %ymm8, 0x720(%rsp)
0x100004d43 <+307>:  vmovupd %ymm10, 0x6e0(%rsp)
0x100004d4c <+316>:  vmovupd %ymm11, 0x6c0(%rsp)
0x100004d55 <+325>:  vmovupd %ymm12, 0x6a0(%rsp)
0x100004d5e <+334>:  vmovupd %ymm14, 0x660(%rsp)
0x100004d67 <+343>:  vbroadcastsd 0xd0(%rsp), %ymm15
0x100004d71 <+353>:  vmovupd %ymm15, 0x5e0(%rsp)
0x100004d7a <+362>:  vbroadcastsd 0xd8(%rsp), %ymm15
0x100004d84 <+372>:  vmovupd %ymm15, 0x5c0(%rsp)
0x100004d8d <+381>:  vbroadcastsd 0xe0(%rsp), %ymm15
0x100004d97 <+391>:  vmovupd %ymm15, 0x5a0(%rsp)
0x100004da0 <+400>:  vbroadcastsd 0xe8(%rsp), %ymm15
0x100004daa <+410>:  vmovupd %ymm15, 0x580(%rsp)
0x100004db3 <+419>:  vbroadcastsd 0xf0(%rsp), %ymm15
0x100004dbd <+429>:  vmovupd %ymm15, 0x560(%rsp)
0x100004dc6 <+438>:  vbroadcastsd 0xf8(%rsp), %ymm15
0x100004dd0 <+448>:  vmovupd %ymm15, 0x540(%rsp)
0x100004dd9 <+457>:  vbroadcastsd 0x100(%rsp), %ymm15
0x100004de3 <+467>:  vmovupd %ymm15, 0x520(%rsp)
0x100004dec <+476>:  vbroadcastsd 0x108(%rsp), %ymm15
0x100004df6 <+486>:  vmovupd %ymm15, 0x500(%rsp)
0x100004dff <+495>:  vbroadcastsd 0x110(%rsp), %ymm15
0x100004e09 <+505>:  vmovupd %ymm15, 0x4e0(%rsp)
0x100004e12 <+514>:  vbroadcastsd 0x118(%rsp), %ymm15
0x100004e1c <+524>:  vmovupd %ymm15, 0x4c0(%rsp)
0x100004e25 <+533>:  vbroadcastsd 0x1c0(%rsp), %ymm15
0x100004e2f <+543>:  vmovupd %ymm15, 0x4a0(%rsp)
0x100004e38 <+552>:  vbroadcastsd 0x1c8(%rsp), %ymm15
0x100004e42 <+562>:  vmovupd %ymm15, 0x480(%rsp)
0x100004e4b <+571>:  vbroadcastsd 0x1d0(%rsp), %ymm15
0x100004e55 <+581>:  vmovupd %ymm15, 0x460(%rsp)
0x100004e5e <+590>:  vbroadcastsd 0x1d8(%rsp), %ymm15
0x100004e68 <+600>:  vmovupd %ymm15, 0x440(%rsp)
0x100004e71 <+609>:  vbroadcastsd 0x1e0(%rsp), %ymm15
0x100004e7b <+619>:  vmovupd %ymm15, 0x420(%rsp)
0x100004e84 <+628>:  vbroadcastsd 0x1e8(%rsp), %ymm15
0x100004e8e <+638>:  vmovupd %ymm15, 0x400(%rsp)
0x100004e97 <+647>:  vbroadcastsd 0x1f0(%rsp), %ymm15
0x100004ea1 <+657>:  vmovupd %ymm15, 0x3e0(%rsp)
0x100004eaa <+666>:  vbroadcastsd 0x1f8(%rsp), %ymm15
0x100004eb4 <+676>:  vmovupd %ymm15, 0x3c0(%rsp)
0x100004ebd <+685>:  vbroadcastsd 0x200(%rsp), %ymm15
0x100004ec7 <+695>:  vmovupd %ymm15, 0x3a0(%rsp)
0x100004ed0 <+704>:  vbroadcastsd 0x208(%rsp), %ymm15
0x100004eda <+714>:  vmovupd %ymm15, 0x80(%rsp)
0x100004ee3 <+723>:  vbroadcastsd 0x210(%rsp), %ymm15
0x100004eed <+733>:  vxorpd %ymm0, %ymm0, %ymm0
0x100004ef1 <+737>:  vmovupd %ymm0, 0x2a0(%rsp)
0x100004efa <+746>:  vmovupd %ymm0, 0x320(%rsp)
0x100004f03 <+755>:  vmovupd %ymm0, 0x2c0(%rsp)
0x100004f0c <+764>:  vmovupd %ymm0, 0x340(%rsp)
0x100004f15 <+773>:  vmovupd %ymm0, 0x2e0(%rsp)
0x100004f1e <+782>:  vmovupd %ymm0, 0x360(%rsp)
0x100004f27 <+791>:  vmovupd %ymm0, 0x300(%rsp)
0x100004f30 <+800>:  vmovupd %ymm0, 0x380(%rsp)
0x100004f39 <+809>:  vmovupd 0x9aff(%rip), %ymm0       ; SourceMatrix + 478
0x100004f41 <+817>:  vmovupd %ymm15, 0x60(%rsp)
0x100004f47 <+823>:  vbroadcastsd 0x218(%rsp), %ymm15
0x100004f51 <+833>:  vmovupd %ymm0, 0x820(%rsp)
0x100004f5a <+842>:  vmovupd %ymm15, 0x20(%rsp)
0x100004f60 <+848>:  incl   %eax
0x100004f62 <+850>:  vxorpd %xmm12, %xmm12, %xmm12
0x100004f67 <+855>:  vcvtsi2sdl %eax, %xmm12, %xmm12
0x100004f6b <+859>:  vmovsd 0x6765(%rip), %xmm11      ; xmm11 = mem[0],zero 
0x100004f73 <+867>:  vdivsd %xmm12, %xmm11, %xmm8
0x100004f78 <+872>:  vbroadcastsd %xmm8, %ymm7
0x100004f7d <+877>:  vaddpd 0x640(%rsp), %ymm7, %ymm9
0x100004f86 <+886>:  vaddpd 0x6c0(%rsp), %ymm7, %ymm0
0x100004f8f <+895>:  vaddpd 0x6e0(%rsp), %ymm7, %ymm1
0x100004f98 <+904>:  vaddpd 0x700(%rsp), %ymm7, %ymm2
0x100004fa1 <+913>:  vaddpd 0x720(%rsp), %ymm7, %ymm3
0x100004faa <+922>:  vaddpd 0x740(%rsp), %ymm7, %ymm8
0x100004fb3 <+931>:  vaddpd 0x7c0(%rsp), %ymm7, %ymm4
0x100004fbc <+940>:  vaddpd 0x7e0(%rsp), %ymm7, %ymm5
0x100004fc5 <+949>:  vaddpd 0x800(%rsp), %ymm7, %ymm6
0x100004fce <+958>:  vaddpd 0x660(%rsp), %ymm7, %ymm10
0x100004fd7 <+967>:  vaddpd 0x680(%rsp), %ymm7, %ymm12
0x100004fe0 <+976>:  vaddpd 0x6a0(%rsp), %ymm7, %ymm11
0x100004fe9 <+985>:  vmovupd %ymm9, 0xa0(%rsp)
0x100004ff2 <+994>:  vmovupd %ymm0, 0x120(%rsp)
0x100004ffb <+1003>: vmovupd %ymm8, 0x1a0(%rsp)
0x100005004 <+1012>: vmovupd %ymm3, 0x180(%rsp)
0x10000500d <+1021>: vmovupd %ymm1, 0x140(%rsp)
0x100005016 <+1030>: vmovupd %ymm2, 0x160(%rsp)
0x10000501f <+1039>: vmovupd %ymm10, (%rsp)
0x100005024 <+1044>: vmovupd %ymm4, 0x220(%rsp)
0x10000502d <+1053>: vmovupd %ymm5, 0x240(%rsp)
0x100005036 <+1062>: vmovupd %ymm6, 0x260(%rsp)
0x10000503f <+1071>: vbroadcastsd 0xa0(%rsp), %ymm14
0x100005049 <+1081>: vbroadcastsd 0xa8(%rsp), %ymm15
0x100005053 <+1091>: vaddpd 0x760(%rsp), %ymm7, %ymm10
0x10000505c <+1100>: vaddpd 0x780(%rsp), %ymm7, %ymm9
0x100005065 <+1109>: vaddpd 0x7a0(%rsp), %ymm7, %ymm8
0x10000506e <+1118>: vaddpd 0x820(%rsp), %ymm7, %ymm7
0x100005077 <+1127>: vmulpd %ymm14, %ymm0, %ymm14
0x10000507c <+1132>: vmovupd %ymm7, 0x280(%rsp)
0x100005085 <+1141>: vfmadd213pd %ymm14, %ymm1, %ymm15
0x10000508a <+1146>: vbroadcastsd 0xb0(%rsp), %ymm14
0x100005094 <+1156>: vfmadd213pd %ymm15, %ymm2, %ymm14
0x100005099 <+1161>: vbroadcastsd 0xb8(%rsp), %ymm15
0x1000050a3 <+1171>: vfmadd213pd %ymm14, %ymm3, %ymm15
0x1000050a8 <+1176>: vmulpd 0x620(%rsp), %ymm0, %ymm14
->  0x1000050b1 <+1185>: vmovupd %ymm15, 0x2a0(%rsp)
0x1000050ba <+1194>: vmulpd 0x5a0(%rsp), %ymm0, %ymm15
0x1000050c3 <+1203>: vmulpd 0x520(%rsp), %ymm0, %ymm0
0x1000050cc <+1212>: vfmadd231pd 0x600(%rsp), %ymm1, %ymm14
0x1000050d6 <+1222>: vfmadd231pd 0x580(%rsp), %ymm1, %ymm15
0x1000050e0 <+1232>: vfmadd231pd 0x500(%rsp), %ymm1, %ymm0

Edit: Working assembly with asm command from before deleted due to space restriction.

Edit: The following code compiles on gcc.godbolt.org with ICC 18:

//
//  main.cpp
//  ICC Test
//

#include <iostream>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <iomanip>
#include <complex>
#include <cmath>
#include <fstream>
#include <immintrin.h>

using namespace std;

#define N 4

#define POWER_FACTOR 4 // Power factor tells us how many matrices need to be multiplied. For the standard Wilson action, this is 4. For the first improvement, 6. But the relative runtime ration is independent of this.

#define ITERATIONS 10000000

#define GENERATE_NEW_RANDOMS false

typedef double FP_TYPE;

FP_TYPE SourceMatrix[POWER_FACTOR][N][N];





void InitialiseSourceMatrices();


void Run3ForLoops_Pointer();

inline void Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3(FP_TYPE *A, FP_TYPE *B, FP_TYPE *C);


void RunIntrinsics_FMA_UnalignedCopy_Struct();

inline void RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(FP_TYPE *A, FP_TYPE *B, FP_TYPE *C);



inline FP_TYPE random(FP_TYPE min, FP_TYPE max) {
    return min + (max-min)*FP_TYPE(rand())/FP_TYPE(RAND_MAX);
}



int main(int argc, const char * argv[]) {


    cout <<"Beginning computation\n\n";



    InitialiseSourceMatrices();

    Run3ForLoops_Pointer();

    RunIntrinsics_FMA_UnalignedCopy_Struct();


    return 0;

}


void InitialiseSourceMatrices() {

    int i, j, k;

    // Assing random numbers to imaginary and real parts
    for(j=0; j<N; j++) {
        for(k=0; k<N; k++) {
            for(i=0; i<POWER_FACTOR; i++) {
                SourceMatrix[i][j][k] = random(-1.0, 1.0);
            }
        }
    }

}





void RunIntrinsics_FMA_UnalignedCopy_Struct() {


    clock_t timer;
    timer = clock();

    // Initialise Variables:


    int i, j, k, n;

    FP_TYPE MatrixDummy1[N][N], MatrixDummy2[N][N], MatrixDummy3[N][N];

    for(k=0; k<N; k++) {
        for(i=0; i<POWER_FACTOR; i++) {
            MatrixDummy1[k][i]=0.;
            MatrixDummy2[k][i]=0.;
            MatrixDummy3[k][i]=0.;
        }
    }


    struct matrix_struct {
        //  int dummy;
        //   __declspec(aligned(32))  double m[N][N];
        FP_TYPE m[N][N] __attribute__ ((aligned (32)));
    };

    // matrix_struct Matrix[POWER_FACTOR];

    matrix_struct Matrix[POWER_FACTOR]; // __attribute__ ((aligned (32)));

    //  double *p1, *p2, *p3, *p0;

    FP_TYPE trace = 0.0;

    // Read source matrices in own data format


    for(n=0; n<ITERATIONS; n++) { // We do the whole process ITERATIONS times to get less error for the runtime .

        // srand (time(NULL));

        for(j=0; j<N; j++) {
            for(k=0; k<N; k++) {
                for(i=0; i<POWER_FACTOR; i++) {
                    if(GENERATE_NEW_RANDOMS) Matrix[i].m[j][k] = random(-1.0, 1.0);
                    else Matrix[i].m[j][k] = SourceMatrix[i][j][k]+1.0/(double)(n+1);
                }
            }
        }


        RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(&Matrix[0].m[0][0], &Matrix[1].m[0][0], (&MatrixDummy1)[0][0]);

        RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(&Matrix[2].m[0][0], &Matrix[3].m[0][0], (&MatrixDummy2)[0][0]);

        RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix((&MatrixDummy1)[0][0], (&MatrixDummy2)[0][0], (&MatrixDummy3)[0][0]);


        for(j=0; j<N; j++) {
            trace += MatrixDummy3[j][j];
        }



    }

    cout << setprecision(15);

    cout << "Trace Intrinsics = \t" << trace / (double) ITERATIONS << "    took " << (double) (clock()-timer) / CLOCKS_PER_SEC << "s" << endl << endl;


}



void Run3ForLoops_Pointer() {


    clock_t timer;
    timer = clock();

    // Initialise Variables:


    int i, j, k, n;

    FP_TYPE MatrixDummy1[N][N], MatrixDummy2[N][N], MatrixDummy3[N][N];;


    struct matrix_struct {
        //  int dummy;
        //   __declspec(aligned(32))  double m[N][N];
        FP_TYPE m[N][N] __attribute__ ((aligned (32)));
    };

    // matrix_struct Matrix[POWER_FACTOR];

    matrix_struct Matrix[POWER_FACTOR]; // __attribute__ ((aligned (32)));

    //  double *p1, *p2, *p3, *p0;

    FP_TYPE trace = 0.0;

    // Read source matrices in own data format


    for(n=0; n<ITERATIONS; n++) { // We do the whole process ITERATIONS times to get less error for the runtime .

        // srand (time(NULL));

        for(j=0; j<N; j++) {
            for(k=0; k<N; k++) {
                for(i=0; i<POWER_FACTOR; i++) {
                    if(GENERATE_NEW_RANDOMS) Matrix[i].m[j][k] = random(-1.0, 1.0);
                    else Matrix[i].m[j][k] = SourceMatrix[i][j][k]+1.0/(double)(n+1);
                }
            }
        }



        Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3(&Matrix[0].m[0][0], &Matrix[1].m[0][0], (&MatrixDummy1)[0][0]);

        Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3(&Matrix[2].m[0][0], &Matrix[3].m[0][0], (&MatrixDummy2)[0][0]);

        Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3((&MatrixDummy1)[0][0], (&MatrixDummy2)[0][0], (&MatrixDummy3)[0][0]);




        for(j=0; j<N; j++) {
            trace += MatrixDummy3[j][j];
        }



    }

    cout << setprecision(15);

    cout << "Trace For Point. = \t\t" << trace / (double) ITERATIONS << "    took " << (double) (clock()-timer) / CLOCKS_PER_SEC << "s" << endl << endl;


}



inline void Run3ForLoops_MultiplyMatrixByMatrix_OutputTo3(FP_TYPE *A, FP_TYPE *B, FP_TYPE *C){

    int i, j, k;

    FP_TYPE dummy[N][N];

    for(j=0; j<N; j++) {
        for(k=0; k<N; k++) {
            dummy[j][k] = 0.0;
            for(i=0; i<N; i++) {
                dummy[j][k] += *(A+j*4+i)*(*(B+i*4+k));
            }
        }
    }

    for(j=0; j<N; j++) {
        for(k=0; k<N; k++) {
            *(C+j*4+k) = dummy[j][k];
        }
    }


}



void RunIntrinsics_FMA_UnalignedCopy_MultiplyMatrixByMatrix(FP_TYPE *A, FP_TYPE *B, FP_TYPE *C)
{
    size_t i;

    // the registers you use
    __m256d a0, a1, a2, a3, b0, b1, b2, b3, sum;
    __m256d *B256 = (__m256d *)B, *C256 = (__m256d *)C;

    // load values from B
    b0 = _mm256_loadu_pd(&B[0]);
    b1 = _mm256_loadu_pd(&B[4]);
    b2 = _mm256_loadu_pd(&B[8]);
    b3 = _mm256_loadu_pd(&B[12]);

    for (i = 0; i < 4; i++) {
        // load values from A
        a0 = _mm256_set1_pd(A[4*i + 0]);
        a1 = _mm256_set1_pd(A[4*i + 1]);
        a2 = _mm256_set1_pd(A[4*i + 2]);
        a3 = _mm256_set1_pd(A[4*i + 3]);

        sum = _mm256_mul_pd(a0, b0);
        sum = _mm256_fmadd_pd(a1, b1, sum);
        sum = _mm256_fmadd_pd(a2, b2, sum);
        sum = _mm256_fmadd_pd(a3, b3, sum);

     //   asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
            _mm256_storeu_pd(&C[4*i], sum);

    }
}

One thing I noticed is that you are passing a FPTYPE* to the multiplication functions which are actually multi dimensional arrays.

Maybe the Intel compiler doesn't like this too much?

In order to better understand your code I did some C++ification of the C constructs and my code is now passing a const struct reference to the multiplication functions.

I don't have a license for the Intel compiler, but maybe you can check if the code now works at -O3:

#include <iostream>
#include <cstdlib>
#include <iomanip>
#include <immintrin.h>

constexpr int N = 4;
// Power factor tells us how many matrices need to be multiplied.
// For the standard Wilson action, this is 4.
// For the first improvement, 6.
// But the relative runtime ration is independent of this.
constexpr int POWER_FACTOR = 4;
constexpr int ITERATIONS = 10 * 1000 * 1000;
constexpr bool GENERATE_NEW_RANDOMS = false;

typedef double FP_TYPE;

struct Matrix
{
  FP_TYPE m[N][N] __attribute__ ((aligned (32)));
};

typedef void (*multiply_method)(const Matrix&, const Matrix&, Matrix&);

Matrix source_matrices[POWER_FACTOR];

FP_TYPE random (FP_TYPE min, FP_TYPE max);
void randomize_source_matrices ();
void test_run (multiply_method method, const std::string &method_name);
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c);
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c);

FP_TYPE random (FP_TYPE min, FP_TYPE max)
{
  return min + (max - min) * FP_TYPE(rand()) / FP_TYPE(RAND_MAX);
}

void randomize_source_matrices ()
{
  // Assign random numbers to imaginary and real parts
  for (int j = 0; j < N; j++)
    {
      for (int k = 0; k < N; k++)
        {
          for (int i = 0; i < POWER_FACTOR; i++)
            {
              source_matrices[i].m[j][k] = random(-1.0, 1.0);
            }
        }
    }
}

void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c)
{
  for (int j = 0; j < N; j++)
    {
      for (int k = 0; k < N; k++)
        {
          c.m[j][k] = 0.0;
          for (int i = 0; i < N; i++)
            {
              c.m[j][k] += a.m[j][i] * b.m[i][k];
            }
        }
    }
}

void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c)
{
  //__m256d *B256 = (__m256d *) B;
  //__m256d *C256 = (__m256d *) C;

  // load values from B
  __m256d b0 = _mm256_loadu_pd (&b.m[0][0]);
  __m256d b1 = _mm256_loadu_pd (&b.m[1][0]);
  __m256d b2 = _mm256_loadu_pd (&b.m[2][0]);
  __m256d b3 = _mm256_loadu_pd (&b.m[3][0]);

  for (size_t i = 0; i < 4; i++)
    {
      // load values from A
      __m256d a0 = _mm256_set1_pd (a.m[i][0]);
      __m256d a1 = _mm256_set1_pd (a.m[i][1]);
      __m256d a2 = _mm256_set1_pd (a.m[i][2]);
      __m256d a3 = _mm256_set1_pd (a.m[i][3]);

      __m256d sum;
      sum = _mm256_mul_pd (a0, b0);
      sum = _mm256_fmadd_pd (a1, b1, sum);
      sum = _mm256_fmadd_pd (a2, b2, sum);
      sum = _mm256_fmadd_pd (a3, b3, sum);

      //   asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
      _mm256_storeu_pd(&c.m[i][0], sum);
    }
}

void test_run (multiply_method method, const std::string &method_name)
{
  clock_t timer = clock ();

  Matrix matrix_dummy1 = {0};
  Matrix matrix_dummy2 = {0};
  Matrix matrix_dummy3 = {0};

  Matrix matrices[POWER_FACTOR];
  FP_TYPE trace = 0.0;

  // Read source matrices in own data format
  // We do the whole process ITERATIONS times to get less error for the runtime .
  for (int n = 0; n < ITERATIONS; n++)
    {
      for (int j = 0; j < N; j++)
        {
          for (int k = 0; k < N; k++)
            {
              for (int i = 0; i < POWER_FACTOR; i++)
                {
                  if (GENERATE_NEW_RANDOMS)
                    {
                      matrices[i].m[j][k] = random (-1.0, 1.0);
                    }
                  else
                    {
                      matrices[i].m[j][k] = source_matrices[i].m[j][k] + 1.0 / (double)(n + 1);
                    }
                }
            }
        }
      method (matrices[0], matrices[1], matrix_dummy1);
      method (matrices[2], matrices[3], matrix_dummy2);
      method (matrix_dummy1, matrix_dummy2, matrix_dummy3);

      for (int j = 0; j < N; j++)
        {
          trace += matrix_dummy3.m[j][j];
        }
    }
  std::cout << std::setprecision(15);
  std::cout << "Trace " << method_name << " = \t";
  std::cout << trace / (double) ITERATIONS;
  std::cout << "    took ";
  std::cout << (double) (clock() - timer) / CLOCKS_PER_SEC << "s\n\n";
}

int main ()
{
  std::cout << "Beginning computation\n\n";
  randomize_source_matrices ();
  test_run (multiply_plain, "For Point");
  test_run (multiply_intrinsics, "Intrinsics");
}

It is a bit slower, because I fused the two test functions into one and removed the inline directives in the process.

(It should be no problem to add them back of course if your willing to tolerate some code duplication.)

There are still some things that are dangerous about this code, for example it only works correctly with N = 4. Be sure to add some static assertions or some similar safety measures before using such code in production.

Another thing is that there are still some C style (double) casts sprayed into it, but I assume that is only because it is test code. I'm also not sure if the code would ever work for a different FP_TYPE (never worked with intrinsics before ...).

Just for completeness here is a further improved version:

#include <iostream>
#include <cstdlib>
#include <iomanip>
#include <vector>
#include <immintrin.h>

using FP_TYPE = double;

constexpr size_t N = 4;
// Power factor tells us how many matrices need to be multiplied.
// For the standard Wilson action, this is 4.
// For the first improvement, 6.
// But the relative runtime ration is independent of this.
constexpr size_t POWER_FACTOR = 4;
constexpr size_t ITERATIONS = 10 * 1000 * 1000;
constexpr bool GENERATE_NEW_RANDOMS = false;

struct Matrix
{
  FP_TYPE m[N][N] __attribute__ ((aligned (32))) = {{0}};
};

using multiply_func = void (*) (const Matrix&, const Matrix&, Matrix&);
using set_func = FP_TYPE (*) ();
using transform_func = FP_TYPE (*) (FP_TYPE value);

FP_TYPE random (FP_TYPE min, FP_TYPE max);
void randomize_matrix (Matrix &matrix);
void test_run (const std::vector<Matrix> &source_matrices,
               const multiply_func &func,
               const std::string &func_name);
void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c);
void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c);
void set_each_matrix_value (Matrix &matrix, const set_func &func);
void init_matrix (const Matrix &source_matrix, Matrix &matrix,
                  size_t iteration);

FP_TYPE random (FP_TYPE min, FP_TYPE max)
{
  return min + (max - min) * FP_TYPE(rand()) / FP_TYPE(RAND_MAX);
}

void set_each_matrix_value (Matrix &matrix, const set_func &func)
{
  for (auto &j : matrix.m)
    {
      for (auto &k : j)
        {
          k = func ();
        }
    }
}

void randomize_matrix (Matrix &matrix)
{
  // Assign random numbers to imaginary and real parts
  set_each_matrix_value (matrix, [] ()
  {
    return random(-1.0, 1.0);
  });
}

void multiply_plain (const Matrix &a, const Matrix &b, Matrix &c)
{
  for (size_t j = 0; j < N; j++)
    {
      for (size_t k = 0; k < N; k++)
        {
          auto &val = c.m[j][k];
          val = 0.0;
          for (size_t i = 0; i < N; i++)
            {
              val += a.m[j][i] * b.m[i][k];
            }
        }
    }
}

void multiply_intrinsics (const Matrix &a, const Matrix &b, Matrix &c)
{
  static_assert (N == 4);
  static_assert (sizeof (FP_TYPE) == 8);
  static_assert (N * sizeof(FP_TYPE) == 256 / 8);
  // In addition the array in Matrix.m must be properly aligned

  //__m256d *B256 = (__m256d *) B;
  //__m256d *C256 = (__m256d *) C;

  // load values from B
  __m256d b0 = _mm256_loadu_pd (&b.m[0][0]);
  __m256d b1 = _mm256_loadu_pd (&b.m[1][0]);
  __m256d b2 = _mm256_loadu_pd (&b.m[2][0]);
  __m256d b3 = _mm256_loadu_pd (&b.m[3][0]);

  for (size_t i = 0; i < 4; i++)
    {
      // load values from A
      __m256d a0 = _mm256_set1_pd (a.m[i][0]);
      __m256d a1 = _mm256_set1_pd (a.m[i][1]);
      __m256d a2 = _mm256_set1_pd (a.m[i][2]);
      __m256d a3 = _mm256_set1_pd (a.m[i][3]);

      __m256d sum;
      sum = _mm256_mul_pd (a0, b0);
      sum = _mm256_fmadd_pd (a1, b1, sum);
      sum = _mm256_fmadd_pd (a2, b2, sum);
      sum = _mm256_fmadd_pd (a3, b3, sum);

      //   asm ("vmovupd %1, %0" : "=m"(C256[i]) : "x"(sum));
      _mm256_storeu_pd(&c.m[i][0], sum);
    }
}

void init_matrix (const Matrix &source_matrix, Matrix &matrix, size_t iteration)
{
  for (size_t j = 0; j < N; j++)
    {
      for (size_t k = 0; k < N; k++)
        {
          matrix.m[j][k] = source_matrix.m[j][k] + 1.0 / static_cast<FP_TYPE>
                           (iteration + 1);
        }
    }
}

void test_run (const std::vector<Matrix> &source_matrices,
               const multiply_func &func, const std::string &func_name)
{
  clock_t timer = clock ();

  Matrix matrix_dummy1;
  Matrix matrix_dummy2;
  Matrix matrix_dummy3;

  std::vector<Matrix> matrices (POWER_FACTOR);
  FP_TYPE trace = 0.0;

  // Read source matrices in own data format
  // We do the whole process ITERATIONS times to get less error for the runtime .
  for (size_t n = 0; n < ITERATIONS; n++)
    {
      if constexpr (GENERATE_NEW_RANDOMS)
        {
          for (auto &matrix : matrices)
            {
              randomize_matrix (matrix);
            }
        }
      else
        {
          for (size_t i = 0; i < POWER_FACTOR; i++)
            {
              init_matrix (source_matrices[i], matrices[i], n);
            }
        }

      func (matrices[0], matrices[1], matrix_dummy1);
      func (matrices[2], matrices[3], matrix_dummy2);
      func (matrix_dummy1, matrix_dummy2, matrix_dummy3);

      for (size_t j = 0; j < N; j++)
        {
          trace += matrix_dummy3.m[j][j];
        }
    }
  std::cout << std::setprecision(15);
  std::cout << "Trace " << func_name << " = \t";
  std::cout << trace / static_cast<FP_TYPE> (ITERATIONS);
  std::cout << "    took ";
  std::cout << static_cast<double> (clock() - timer) / CLOCKS_PER_SEC << "s\n";
  std::cout << std::endl;
}

int main ()
{
  std::vector<Matrix> source_matrices (POWER_FACTOR);
  std::cout << "Beginning computation\n";
  std::cout << std::endl;
  for (auto &matrix : source_matrices)
    {
      randomize_matrix (matrix);
    }
  test_run (source_matrices, multiply_plain, "For Point");
  test_run (source_matrices, multiply_intrinsics, "Intrinsics");
}

BTW: To compile with g++ or clang++ you have to add -march=haswell (or whatever CPU you have).

来源：https://stackoverflow.com/questions/49858977/o3-in-icc-messes-up-intrinsics-fine-with-o1-or-o2-or-with-corresponding-manu

标签

c++

assembly

optimization

intrinsics

icc