Why is a custom loop faster? Bad compiler? Unsafe custom code? Luck (lucky cache hits)?

后端 未结 5 1052
失恋的感觉
失恋的感觉 2021-01-16 15:08

I just started learning assembly and wrote some custom loops for swapping two variables using C++'s asm{} block with the Digital Mars compiler in C-Free 5.0.

Enabled th

5条回答
  •  [愿得一人]
    2021-01-16 15:16

    It's likely due to the fact that the compiler fails to keep the variables in registers, working on indirect (memory-address) operands instead.

    Switch compilers <-- this is your best optimization.

    Update: I have gone through the trouble of translating the same program to gcc Intel-syntax inline assembly: test.c. It clearly shows how the for-loop and the while-loop are vastly superior to the handwritten assembly.


    That said, with Digital Mars, the following is faster:

    __asm
    {
        // Swaps a and b j times, keeping both values in registers and
        // counting ecx down from j to 0.
        mov ecx,j     // ecx = loop counter; must be mov, because xor would
                      // merge j into whatever ecx happened to hold before

        mov eax,a     // load both variables into registers
        mov ebx,b

    do_it_again3:     // loop head

        // swap eax and ebx with the three-xor idiom
        xor eax,ebx
        xor ebx,eax
        xor eax,ebx

        mov a,eax     // store the swapped values back to the variables
        mov b,ebx

        dec ecx           // j--; sets ZF once the counter reaches 0
        jnz do_it_again3  // loop again while the counter is non-zero
    }
    

    using

    • the XOR swap idiom
    • descending loop
    • implicit comparison flags (with dec ecx)

    My benchmark with Digital Mars Compiler Version 8.42n results in:

    time of for-loop(cycles) 572  
    time of while-loop(cycles)  566  
    time of custom-loop-1(cycles)   355   
    time of custom-loop-2(cycles)  317   
    time of custom-loop-3(cycles)  234   
    

    Full listing:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    
    int main()
    {
        int j=0;
    
        int a=0,b=0,temp=0;
    
        srand(time(0));
        time_t t1=0;
        time_t t2=0;
    
    
        t1=clock();
        for(int i=0; i<200000000; i++)
        {
            temp=a;//instruction 1
            a=b;//instruction 2
            b=temp;//3 instructions total
        }
        t2=clock();
        printf("\n time of for-loop(cycles) %i  \n",(t2-t1));
    
    
        t1=clock();
        while(j<200000000)
        {
            temp=a;//again it is three instructions
            a=b;
            b=temp;
            j++;
        }
        t2=clock();
        printf("\n time of while-loop(cycles)  %i  \n",(t2-t1));
    
    
        t1=clock();
        j=200000000;//setting the count
        __asm
        {
            pushf           //backup
            push eax        //backup
            push ebx        //backup
            push ecx        //backup
            push edx        //backup
    
            mov ecx,0       //init of loop range(0 to 200000000)
            mov edx,j
    
            do_it_again:    //begin to loop
    
    
            mov eax,a       //basic swap steps between cpu and mem(cache)
            mov ebx,b
            mov b,eax
            mov a,ebx       //four instructions total
    
            inc ecx         // j++
            cmp ecx,edx     //i<200000000  ?
            jb do_it_again  // end of loop block
    
            pop edx     //rolling back to history
            pop ecx
            pop ebx
            pop eax
            popf
        }
    
        t2=clock();
        printf("\n time of custom-loop-1(cycles)   %i   \n",(t2-t1));
    
        t1=clock();
        j=200000000;//setting the count
        __asm
        {
            pushf           //backup
                push eax        
                push ebx        
                push ecx        
                push edx        
    
                mov ecx,0       //init of loop range(0 to 200000000)
                mov edx,j
    
                mov eax,a       //getting variables to registers
                mov ebx,b
    
                do_it_again2:   //begin to loop
    
                //swapping with using only 2 variables(only in cpu)
                sub eax,ebx         //a is now a-b
                add ebx,eax         //b is now a
                sub eax,ebx         //a is now -b
                xor eax,80000000h   //a is now b and four instructions total
    
                inc ecx         // j++
                cmp ecx,edx     //i<200000000  ?
                jb do_it_again2  // end of loop block
    
                pop edx         //rollback
                pop ecx         
                pop ebx         
                pop eax         
                popf            
        }
    
        t2=clock();
        printf("\n time of custom-loop-2(cycles)  %i   \n",(t2-t1));
    
        t1=clock();
        j=200000000;//setting the count
        __asm
        {
            xor ecx,j     //init of loop range(200000000 to 0)
    
            mov eax,a     //getting variables to registers
            mov ebx,b
    
        do_it_again3:   //begin to loop
    
            //swapping with using only 2 variables(only in cpu)
            xor eax,ebx
            xor ebx,eax         
            xor eax,ebx         
    
            mov a,eax
            mov b,ebx
    
            dec ecx         // j--
            jnz do_it_again3  // end of loop block
        }
    
        t2=clock();
        printf("\n time of custom-loop-3(cycles)  %i   \n",(t2-t1));
    
        return 0;
    
    }
    

提交回复
热议问题