Why does GCC generate 15-20% faster code if I optimize for size instead of speed?

后端 未结 6 2157
天涯浪人
天涯浪人 2020-11-28 17:17

I first noticed in 2009 that GCC (at least on my projects and on my machines) has a tendency to generate noticeably faster code if I optimize for size (-Os) instead of for speed (-O2).

6条回答
  •  -上瘾入骨i
    2020-11-28 17:38

    By default, compilers optimize for an "average" processor. Since different processors favor different instruction sequences, the optimizations enabled by -O2 might benefit the average processor but decrease performance on your particular processor (and the same applies to -Os). If you try the same example on different processors, you will find that some of them benefit from -O2 while others respond better to -Os optimizations.

    Here are the results for time ./test 0 0 on several processors (user time reported):

    Processor (System-on-Chip)             Compiler   Time (-O2)  Time (-Os)  Fastest
    AMD Opteron 8350                       gcc-4.8.1    0.704s      0.896s      -O2
    AMD FX-6300                            gcc-4.8.1    0.392s      0.340s      -Os
    AMD E2-1800                            gcc-4.7.2    0.740s      0.832s      -O2
    Intel Xeon E5405                       gcc-4.8.1    0.603s      0.804s      -O2
    Intel Xeon E5-2603                     gcc-4.4.7    1.121s      1.122s       -
    Intel Core i3-3217U                    gcc-4.6.4    0.709s      0.709s       -
    Intel Core i3-3217U                    gcc-4.7.3    0.708s      0.822s      -O2
    Intel Core i3-3217U                    gcc-4.8.1    0.708s      0.944s      -O2
    Intel Core i7-4770K                    gcc-4.8.1    0.296s      0.288s      -Os
    Intel Atom 330                         gcc-4.8.1    2.003s      2.007s      -O2
    ARM 1176JZF-S (Broadcom BCM2835)       gcc-4.6.3    3.470s      3.480s      -O2
    ARM Cortex-A8 (TI OMAP DM3730)         gcc-4.6.3    2.727s      2.727s       -
    ARM Cortex-A9 (TI OMAP 4460)           gcc-4.6.3    1.648s      1.648s       -
    ARM Cortex-A9 (Samsung Exynos 4412)    gcc-4.6.3    1.250s      1.250s       -
    ARM Cortex-A15 (Samsung Exynos 5250)   gcc-4.7.2    0.700s      0.700s       -
    Qualcomm Snapdragon APQ8060A           gcc-4.8       1.53s       1.52s      -Os
    

    In some cases you can alleviate the effect of disadvantageous optimizations by asking gcc to optimize for your particular processor (using options -mtune=native or -march=native):

    Processor            Compiler   Time (-O2 -mtune=native) Time (-Os -mtune=native)
    AMD FX-6300          gcc-4.8.1         0.340s                   0.340s
    AMD E2-1800          gcc-4.7.2         0.740s                   0.832s
    Intel Xeon E5405     gcc-4.8.1         0.603s                   0.803s
    Intel Core i7-4770K  gcc-4.8.1         0.296s                   0.288s
    

    Update: on an Ivy Bridge-based Core i3, three versions of gcc (4.6.4, 4.7.3, and 4.8.1) produce binaries with significantly different performance, even though the assembly code shows only subtle variations. So far, I have no explanation for this.

    Assembly from gcc-4.6.4 -Os (executes in 0.709 secs):

    00000000004004d2 <_ZL3addRKiS0_.isra.0>:
      4004d2:       8d 04 37                lea    eax,[rdi+rsi*1]
      4004d5:       c3                      ret
    
    00000000004004d6 <_ZL4workii>:
      4004d6:       41 55                   push   r13
      4004d8:       41 89 fd                mov    r13d,edi
      4004db:       41 54                   push   r12
      4004dd:       41 89 f4                mov    r12d,esi
      4004e0:       55                      push   rbp
      4004e1:       bd 00 c2 eb 0b          mov    ebp,0xbebc200
      4004e6:       53                      push   rbx
      4004e7:       31 db                   xor    ebx,ebx
      4004e9:       41 8d 34 1c             lea    esi,[r12+rbx*1]
      4004ed:       41 8d 7c 1d 00          lea    edi,[r13+rbx*1+0x0]
      4004f2:       e8 db ff ff ff          call   4004d2 <_ZL3addRKiS0_.isra.0>
      4004f7:       01 c3                   add    ebx,eax
      4004f9:       ff cd                   dec    ebp
      4004fb:       75 ec                   jne    4004e9 <_ZL4workii+0x13>
      4004fd:       89 d8                   mov    eax,ebx
      4004ff:       5b                      pop    rbx
      400500:       5d                      pop    rbp
      400501:       41 5c                   pop    r12
      400503:       41 5d                   pop    r13
      400505:       c3                      ret
    

    Assembly from gcc-4.7.3 -Os (executes in 0.822 secs):

    00000000004004fa <_ZL3addRKiS0_.isra.0>:
      4004fa:       8d 04 37                lea    eax,[rdi+rsi*1]
      4004fd:       c3                      ret
    
    00000000004004fe <_ZL4workii>:
      4004fe:       41 55                   push   r13
      400500:       41 89 f5                mov    r13d,esi
      400503:       41 54                   push   r12
      400505:       41 89 fc                mov    r12d,edi
      400508:       55                      push   rbp
      400509:       bd 00 c2 eb 0b          mov    ebp,0xbebc200
      40050e:       53                      push   rbx
      40050f:       31 db                   xor    ebx,ebx
      400511:       41 8d 74 1d 00          lea    esi,[r13+rbx*1+0x0]
      400516:       41 8d 3c 1c             lea    edi,[r12+rbx*1]
      40051a:       e8 db ff ff ff          call   4004fa <_ZL3addRKiS0_.isra.0>
      40051f:       01 c3                   add    ebx,eax
      400521:       ff cd                   dec    ebp
      400523:       75 ec                   jne    400511 <_ZL4workii+0x13>
      400525:       89 d8                   mov    eax,ebx
      400527:       5b                      pop    rbx
      400528:       5d                      pop    rbp
      400529:       41 5c                   pop    r12
      40052b:       41 5d                   pop    r13
      40052d:       c3                      ret
    

    Assembly from gcc-4.8.1 -Os (executes in 0.994 secs):

    00000000004004fd <_ZL3addRKiS0_.isra.0>:
      4004fd:       8d 04 37                lea    eax,[rdi+rsi*1]
      400500:       c3                      ret
    
    0000000000400501 <_ZL4workii>:
      400501:       41 55                   push   r13
      400503:       41 89 f5                mov    r13d,esi
      400506:       41 54                   push   r12
      400508:       41 89 fc                mov    r12d,edi
      40050b:       55                      push   rbp
      40050c:       bd 00 c2 eb 0b          mov    ebp,0xbebc200
      400511:       53                      push   rbx
      400512:       31 db                   xor    ebx,ebx
      400514:       41 8d 74 1d 00          lea    esi,[r13+rbx*1+0x0]
      400519:       41 8d 3c 1c             lea    edi,[r12+rbx*1]
      40051d:       e8 db ff ff ff          call   4004fd <_ZL3addRKiS0_.isra.0>
      400522:       01 c3                   add    ebx,eax
      400524:       ff cd                   dec    ebp
      400526:       75 ec                   jne    400514 <_ZL4workii+0x13>
      400528:       89 d8                   mov    eax,ebx
      40052a:       5b                      pop    rbx
      40052b:       5d                      pop    rbp
      40052c:       41 5c                   pop    r12
      40052e:       41 5d                   pop    r13
      400530:       c3                      ret
    

提交回复
热议问题