Why does JIT order affect performance?

前端 未结 3 1896
佛祖请我去吃肉
佛祖请我去吃肉 2020-12-24 05:57

Why does the order in which C# methods in .NET 4.0 are just-in-time compiled affect how quickly they execute? For example, consider two equivalent methods:

p         


        
相关标签:
3条回答
  • 2020-12-24 06:34

    Please note that I do not trust the "Suppress JIT optimization on module load" option, I spawn the process without debugging and attach my debugger after the JIT has run.

    In the version where single-line runs faster, this is Main:

            SingleLineTest();
    00000000  push        ebp 
    00000001  mov         ebp,esp 
    00000003  call        dword ptr ds:[0019380Ch] 
                MultiLineTest();
    00000009  call        dword ptr ds:[00193818h] 
                SingleLineTest();
    0000000f  call        dword ptr ds:[0019380Ch] 
                MultiLineTest();
    00000015  call        dword ptr ds:[00193818h] 
                SingleLineTest();
    0000001b  call        dword ptr ds:[0019380Ch] 
                MultiLineTest();
    00000021  call        dword ptr ds:[00193818h] 
    00000027  pop         ebp 
            }
    00000028  ret 
    

    Note that MultiLineTest has been placed on an 8 byte boundary, and SingleLineTest on a 4 byte boundary.

    Here's Main for the version where both run at the same speed:

                MultiLineTest();
    00000000  push        ebp 
    00000001  mov         ebp,esp 
    00000003  call        dword ptr ds:[00153818h] 
    
                SingleLineTest();
    00000009  call        dword ptr ds:[0015380Ch] 
                MultiLineTest();
    0000000f  call        dword ptr ds:[00153818h] 
                SingleLineTest();
    00000015  call        dword ptr ds:[0015380Ch] 
                MultiLineTest();
    0000001b  call        dword ptr ds:[00153818h] 
                SingleLineTest();
    00000021  call        dword ptr ds:[0015380Ch] 
                MultiLineTest();
    00000027  call        dword ptr ds:[00153818h] 
    0000002d  pop         ebp 
            }
    0000002e  ret 
    

    Amazingly, the addresses chosen by the JIT are identical in the last 4 digits, even though it allegedly processed them in the opposite order. Not sure I believe that any more.

    More digging is necessary. I think it was mentioned that the code before the loop wasn't exactly the same in both versions? Going to investigate.

    Here's the "slow" version of SingleLineTest (and I checked, the last digits of the function address haven't changed).

                Stopwatch stopwatch = new Stopwatch();
    00000000  push        ebp 
    00000001  mov         ebp,esp 
    00000003  push        edi 
    00000004  push        esi 
    00000005  push        ebx 
    00000006  mov         ecx,7A5A2C68h 
    0000000b  call        FFF91EA0 
    00000010  mov         esi,eax 
    00000012  mov         dword ptr [esi+4],0 
    00000019  mov         dword ptr [esi+8],0 
    00000020  mov         byte ptr [esi+14h],0 
    00000024  mov         dword ptr [esi+0Ch],0 
    0000002b  mov         dword ptr [esi+10h],0 
                stopwatch.Start();
    00000032  cmp         byte ptr [esi+14h],0 
    00000036  jne         00000047 
    00000038  call        7A22B314 
    0000003d  mov         dword ptr [esi+0Ch],eax 
    00000040  mov         dword ptr [esi+10h],edx 
    00000043  mov         byte ptr [esi+14h],1 
                int count = 0;
    00000047  xor         edi,edi 
                for (uint i = 0; i < 1000000000; ++i) {
    00000049  xor         edx,edx 
                    count += i % 16 == 0 ? 1 : 0;
    0000004b  mov         eax,edx 
    0000004d  and         eax,0Fh 
    00000050  test        eax,eax 
    00000052  je          00000058 
    00000054  xor         eax,eax 
    00000056  jmp         0000005D 
    00000058  mov         eax,1 
    0000005d  add         edi,eax 
                for (uint i = 0; i < 1000000000; ++i) {
    0000005f  inc         edx 
    00000060  cmp         edx,3B9ACA00h 
    00000066  jb          0000004B 
                }
                stopwatch.Stop();
    00000068  mov         ecx,esi 
    0000006a  call        7A23F2C0 
                Console.WriteLine("Single-line test --> Count: {0}, Time: {1}", count, stopwatch.ElapsedMilliseconds);
    0000006f  mov         ecx,797C29B4h 
    00000074  call        FFF91EA0 
    00000079  mov         ecx,eax 
    0000007b  mov         dword ptr [ecx+4],edi 
    0000007e  mov         ebx,ecx 
    00000080  mov         ecx,797BA240h 
    00000085  call        FFF91EA0 
    0000008a  mov         edi,eax 
    0000008c  mov         ecx,esi 
    0000008e  call        7A23ABE8 
    00000093  push        edx 
    00000094  push        eax 
    00000095  push        0 
    00000097  push        2710h 
    0000009c  call        783247EC 
    000000a1  mov         dword ptr [edi+4],eax 
    000000a4  mov         dword ptr [edi+8],edx 
    000000a7  mov         esi,edi 
    000000a9  call        793C6F40 
    000000ae  push        ebx 
    000000af  push        esi 
    000000b0  mov         ecx,eax 
    000000b2  mov         edx,dword ptr ds:[03392034h] 
    000000b8  mov         eax,dword ptr [ecx] 
    000000ba  mov         eax,dword ptr [eax+3Ch] 
    000000bd  call        dword ptr [eax+1Ch] 
    000000c0  pop         ebx 
            }
    000000c1  pop         esi 
    000000c2  pop         edi 
    000000c3  pop         ebp 
    000000c4  ret 
    

    And the "fast" version:

                Stopwatch stopwatch = new Stopwatch();
    00000000  push        ebp 
    00000001  mov         ebp,esp 
    00000003  push        edi 
    00000004  push        esi 
    00000005  push        ebx 
    00000006  mov         ecx,7A5A2C68h 
    0000000b  call        FFE11F70 
    00000010  mov         esi,eax 
    00000012  mov         ecx,esi 
    00000014  call        7A1068BC 
                stopwatch.Start();
    00000019  cmp         byte ptr [esi+14h],0 
    0000001d  jne         0000002E 
    0000001f  call        7A12B3E4 
    00000024  mov         dword ptr [esi+0Ch],eax 
    00000027  mov         dword ptr [esi+10h],edx 
    0000002a  mov         byte ptr [esi+14h],1 
                int count = 0;
    0000002e  xor         edi,edi 
                for (uint i = 0; i < 1000000000; ++i) {
    00000030  xor         edx,edx 
                    count += i % 16 == 0 ? 1 : 0;
    00000032  mov         eax,edx 
    00000034  and         eax,0Fh 
    00000037  test        eax,eax 
    00000039  je          0000003F 
    0000003b  xor         eax,eax 
    0000003d  jmp         00000044 
    0000003f  mov         eax,1 
    00000044  add         edi,eax 
                for (uint i = 0; i < 1000000000; ++i) {
    00000046  inc         edx 
    00000047  cmp         edx,3B9ACA00h 
    0000004d  jb          00000032 
                }
                stopwatch.Stop();
    0000004f  mov         ecx,esi 
    00000051  call        7A13F390 
                Console.WriteLine("Single-line test --> Count: {0}, Time: {1}", count, stopwatch.ElapsedMilliseconds);
    00000056  mov         ecx,797C29B4h 
    0000005b  call        FFE11F70 
    00000060  mov         ecx,eax 
    00000062  mov         dword ptr [ecx+4],edi 
    00000065  mov         ebx,ecx 
    00000067  mov         ecx,797BA240h 
    0000006c  call        FFE11F70 
    00000071  mov         edi,eax 
    00000073  mov         ecx,esi 
    00000075  call        7A13ACB8 
    0000007a  push        edx 
    0000007b  push        eax 
    0000007c  push        0 
    0000007e  push        2710h 
    00000083  call        782248BC 
    00000088  mov         dword ptr [edi+4],eax 
    0000008b  mov         dword ptr [edi+8],edx 
    0000008e  mov         esi,edi 
    00000090  call        792C7010 
    00000095  push        ebx 
    00000096  push        esi 
    00000097  mov         ecx,eax 
    00000099  mov         edx,dword ptr ds:[03562030h] 
    0000009f  mov         eax,dword ptr [ecx] 
    000000a1  mov         eax,dword ptr [eax+3Ch] 
    000000a4  call        dword ptr [eax+1Ch] 
    000000a7  pop         ebx 
            }
    000000a8  pop         esi 
    000000a9  pop         edi 
    000000aa  pop         ebp 
    000000ab  ret 
    

    Just the loops, fast on the left, slow on the right:

    00000030  xor         edx,edx                 00000049  xor         edx,edx 
    00000032  mov         eax,edx                 0000004b  mov         eax,edx 
    00000034  and         eax,0Fh                 0000004d  and         eax,0Fh 
    00000037  test        eax,eax                 00000050  test        eax,eax 
    00000039  je          0000003F                00000052  je          00000058 
    0000003b  xor         eax,eax                 00000054  xor         eax,eax 
    0000003d  jmp         00000044                00000056  jmp         0000005D 
    0000003f  mov         eax,1                   00000058  mov         eax,1 
    00000044  add         edi,eax                 0000005d  add         edi,eax 
    00000046  inc         edx                     0000005f  inc         edx 
    00000047  cmp         edx,3B9ACA00h           00000060  cmp         edx,3B9ACA00h 
    0000004d  jb          00000032                00000066  jb          0000004B 
    

    The instructions are identical (being relative jumps, the machine code is identical even though the disassembly shows different addresses), but the alignment is different. There are three jumps. the je loading a constant 1 is aligned in the slow version and not in the fast version, but it hardly matters, since that jump is only taken 1/16 of the time. The other two jumps ( jmp after loading a constant zero, and jb repeating the entire loop) are taken millions more times, and are aligned in the "fast" version.

    I think this is the smoking gun.

    0 讨论(0)
  • 2020-12-24 06:42

    My time is 2400 and 2600 on i5-2410M 2,3Ghz 4GB ram 64bit Win 7.

    Here is my output: Single first

    After starting the process and then attaching the debugger

                SingleLineTest();
                MultiLineTest();
                SingleLineTest();
                MultiLineTest();
                SingleLineTest();
                MultiLineTest();
    --------------------------------
    SingleLineTest()
               Stopwatch stopwatch = new Stopwatch();
    00000000  push        ebp 
    00000001  mov         ebp,esp 
    00000003  push        edi 
    00000004  push        esi 
    00000005  push        ebx 
    00000006  mov         ecx,685D2C68h 
    0000000b  call        FFD91F70 
    00000010  mov         esi,eax 
    00000012  mov         ecx,esi 
    00000014  call        681D68BC 
                stopwatch.Start();
    00000019  cmp         byte ptr [esi+14h],0 
    0000001d  jne         0000002E 
    0000001f  call        681FB3E4 
    00000024  mov         dword ptr [esi+0Ch],eax 
    00000027  mov         dword ptr [esi+10h],edx 
    0000002a  mov         byte ptr [esi+14h],1 
                int count = 0;
    0000002e  xor         edi,edi 
                for (int i = 0; i < 1000000000; ++i)
    00000030  xor         edx,edx 
                {
                    count += i % 16 == 0 ? 1 : 0;
    00000032  mov         eax,edx 
    00000034  and         eax,8000000Fh 
    00000039  jns         00000040 
    0000003b  dec         eax 
    0000003c  or          eax,0FFFFFFF0h 
    0000003f  inc         eax 
    00000040  test        eax,eax 
    00000042  je          00000048 
    00000044  xor         eax,eax 
    00000046  jmp         0000004D 
    00000048  mov         eax,1 
    0000004d  add         edi,eax 
                for (int i = 0; i < 1000000000; ++i)
    0000004f  inc         edx 
    00000050  cmp         edx,3B9ACA00h 
    00000056  jl          00000032 
                }
                stopwatch.Stop();
    00000058  mov         ecx,esi 
    0000005a  call        6820F390 
                Console.WriteLine("Single-line test --> Count: {0}, Time: {1}", count, stopwatch.ElapsedMilliseconds);
    0000005f  mov         ecx,6A8B29B4h 
    00000064  call        FFD91F70 
    00000069  mov         ecx,eax 
    0000006b  mov         dword ptr [ecx+4],edi 
    0000006e  mov         ebx,ecx 
    00000070  mov         ecx,6A8AA240h 
    00000075  call        FFD91F70 
    0000007a  mov         edi,eax 
    0000007c  mov         ecx,esi 
    0000007e  call        6820ACB8 
    00000083  push        edx 
    00000084  push        eax 
    00000085  push        0 
    00000087  push        2710h 
    0000008c  call        6AFF48BC 
    00000091  mov         dword ptr [edi+4],eax 
    00000094  mov         dword ptr [edi+8],edx 
    00000097  mov         esi,edi 
    00000099  call        6A457010 
    0000009e  push        ebx 
    0000009f  push        esi 
    000000a0  mov         ecx,eax 
    000000a2  mov         edx,dword ptr ds:[039F2030h] 
    000000a8  mov         eax,dword ptr [ecx] 
    000000aa  mov         eax,dword ptr [eax+3Ch] 
    000000ad  call        dword ptr [eax+1Ch] 
    000000b0  pop         ebx 
            }
    000000b1  pop         esi 
    000000b2  pop         edi 
    000000b3  pop         ebp 
    000000b4  ret 
    

    Multi first:

                MultiLineTest();
    
                SingleLineTest();
                MultiLineTest();
                SingleLineTest();
                MultiLineTest();
                SingleLineTest();
                MultiLineTest();
    --------------------------------
    SingleLineTest()
                Stopwatch stopwatch = new Stopwatch();
    00000000  push        ebp 
    00000001  mov         ebp,esp 
    00000003  push        edi 
    00000004  push        esi 
    00000005  push        ebx 
    00000006  mov         ecx,685D2C68h 
    0000000b  call        FFF31EA0 
    00000010  mov         esi,eax 
    00000012  mov         dword ptr [esi+4],0 
    00000019  mov         dword ptr [esi+8],0 
    00000020  mov         byte ptr [esi+14h],0 
    00000024  mov         dword ptr [esi+0Ch],0 
    0000002b  mov         dword ptr [esi+10h],0 
                stopwatch.Start();
    00000032  cmp         byte ptr [esi+14h],0 
    00000036  jne         00000047 
    00000038  call        682AB314 
    0000003d  mov         dword ptr [esi+0Ch],eax 
    00000040  mov         dword ptr [esi+10h],edx 
    00000043  mov         byte ptr [esi+14h],1 
                int count = 0;
    00000047  xor         edi,edi 
                for (int i = 0; i < 1000000000; ++i)
    00000049  xor         edx,edx 
                {
                    count += i % 16 == 0 ? 1 : 0;
    0000004b  mov         eax,edx 
    0000004d  and         eax,8000000Fh 
    00000052  jns         00000059 
    00000054  dec         eax 
    00000055  or          eax,0FFFFFFF0h 
    00000058  inc         eax 
    00000059  test        eax,eax 
    0000005b  je          00000061 
    0000005d  xor         eax,eax 
    0000005f  jmp         00000066 
    00000061  mov         eax,1 
    00000066  add         edi,eax 
                for (int i = 0; i < 1000000000; ++i)
    00000068  inc         edx 
    00000069  cmp         edx,3B9ACA00h 
    0000006f  jl          0000004B 
                }
                stopwatch.Stop();
    00000071  mov         ecx,esi 
    00000073  call        682BF2C0 
                Console.WriteLine("Single-line test --> Count: {0}, Time: {1}", count, stopwatch.ElapsedMilliseconds);
    00000078  mov         ecx,6A8B29B4h 
    0000007d  call        FFF31EA0 
    00000082  mov         ecx,eax 
    00000084  mov         dword ptr [ecx+4],edi 
    00000087  mov         ebx,ecx 
    00000089  mov         ecx,6A8AA240h 
    0000008e  call        FFF31EA0 
    00000093  mov         edi,eax 
    00000095  mov         ecx,esi 
    00000097  call        682BABE8 
    0000009c  push        edx 
    0000009d  push        eax 
    0000009e  push        0 
    000000a0  push        2710h 
    000000a5  call        6B0A47EC 
    000000aa  mov         dword ptr [edi+4],eax 
    000000ad  mov         dword ptr [edi+8],edx 
    000000b0  mov         esi,edi 
    000000b2  call        6A506F40 
    000000b7  push        ebx 
    000000b8  push        esi 
    000000b9  mov         ecx,eax 
    000000bb  mov         edx,dword ptr ds:[038E2034h] 
    000000c1  mov         eax,dword ptr [ecx] 
    000000c3  mov         eax,dword ptr [eax+3Ch] 
    000000c6  call        dword ptr [eax+1Ch] 
    000000c9  pop         ebx 
            }
    000000ca  pop         esi 
    000000cb  pop         edi 
    000000cc  pop         ebp 
    000000cd  ret
    
    0 讨论(0)
  • 2020-12-24 06:46

    So for a definitive answer... I suspect we would need to dig into the dis-assembly.

    However, I have a guess. The compiler for the SingleLineTest() stores each result of the equation on the stack and pops each value as needed. However, the MultiLineTest() may be storing values and having to access them from there. This could cause a few clock cycles to be missed. Where as grabbing the values off the stack will keep it in a register.

    Interestingly, changing the order of the function compilation may be adjusting the garbage collector's actions. Because isMultipleOf16 is defined within the loop, it may be be handled funny. You may want to move the definition outside of the loop and see what that changes...

    0 讨论(0)
提交回复
热议问题