Why are mov ah, bh and mov al, bl together much faster than the single instruction mov ax, bx?

前端 未结 4 1991
长发绾君心
长发绾君心 2020-12-15 05:58

I've found that

mov al, bl
mov ah, bh

is much faster than

mov ax, bx

Can anyone explain to me why? I'm run…

4条回答
  •  一个人的身影
    2020-12-15 06:45

    It is also faster on my Core 2 Duo CPU L9300 1.60GHz. As I wrote in a comment, I think this is related to the use of partial registers (ah, al, ax). For more, see e.g. here, here and here (pg. 88).

    I've written a small test suite to try to improve on the code. While simply not using the ax version presented in the OP is the smartest fix, eliminating partial register usage altogether improves the speed further (even more so than my quick attempt at freeing up another register).

    Finding out exactly why one version is faster than another will, I think, require a more careful reading of the source material and/or using something like Intel VTune or AMD CodeAnalyst. (It could turn out that I'm wrong.)

    UPDATE: While the oprofile output below doesn't prove anything, it does show that a lot of partial register stalls occur in both versions, with roughly twice as many in the slowest version (triAsm2) as in the 'fast' version (triAsm1).

    $ opreport -l test                            
    CPU: Core 2, speed 1600 MHz (estimated)
    Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 800500
    Counted RAT_STALLS events (Partial register stall cycles) with a unit mask of 0x0f (All RAT) count 1000000
    samples  %        samples  %        symbol name
    21039    27.3767  10627    52.3885  triAsm2.loop
    16125    20.9824  4815     23.7368  triC
    14439    18.7885  4828     23.8008  triAsm1.loop
    12557    16.3396  0              0  triAsm3.loop
    12161    15.8243  8         0.0394  triAsm4.loop
    

    Complete oprofile output.

    Results:

    triC: 7410.000000 ms, a5afb9 (C implementation of the asm code)

    triAsm1: 6690.000000 ms, a5afb9 (Code from OP, using al and ah)

    triAsm2: 9290.000000 ms, a5afb9 (Code from OP, using ax)

    triAsm3: 5760.000000 ms, a5afb9 (Straightforward translation of the OP's code to one without partial register usage)

    triAsm4: 5640.000000 ms, a5afb9 (Quick attempt at making it faster)

    Here is my test suite, compiled with -std=c99 -ggdb -m32 -O3 -march=native -mtune=native:

    test.c:

    #include <stdio.h>   /* printf */
    #include <stdlib.h>  /* srand, rand */
    #include <stdint.h>  /* uint32_t */
    #include <time.h>    /* clock, clock_t, CLOCKS_PER_SEC */
    
    /*
     * Routines under test. All five share one signature:
     *   dest            output buffer, one uint32_t written per iteration
     *   cnt             iteration count
     *   cr, cg, cb      starting accumulator values
     *   dcr, dcg, dcb   per-iteration increments for cr, cg, cb
     * triC is the C reference (tri.c); triAsm1..4 are the NASM variants (atri.asm).
     */
    extern void triC(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);
    extern void triAsm1(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);
    extern void triAsm2(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);
    extern void triAsm3(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);
    extern void triAsm4(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);
    
    /* Shared output buffer that every benchmarked routine writes into. */
    uint32_t scanline[640];
    
    /*
     * test(tri): benchmark one routine.
     * Re-seeds rand() with 60 so every routine sees the same sequence of counts,
     * calls `tri` 5,000,000 times on `scanline` with a count in [0, 639], then
     * prints the routine's name, the elapsed clock() time in milliseconds and
     * scanline[620] in hex as a quick cross-check between implementations.
     */
    #define test(tri) \
        do {\
            const clock_t t0 = clock();\
            srand(60);\
            for (int iter = 0; iter < 5000000; ++iter) {\
                tri(scanline, rand() % 640, 10<<16, 20<<16, 30<<16, 1<<14, 1<<14, 1<<14);\
            }\
            printf(#tri ": %f ms, %x\n",(clock()-t0)*1000.0/CLOCKS_PER_SEC,scanline[620]);\
        } while (0)
    
    /* Runs the C reference first, then each assembly variant in order. */
    int main(void) {
        test(triC);
        test(triAsm1);
        test(triAsm2);
        test(triAsm3);
        test(triAsm4);
        return 0;
    }
    

    tri.c:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>  /* uint32_t, the only header this file actually needs */
    
    /*
     * Reference implementation of the packing loop.
     *
     * For each of the cnt output words: advance cr, cg and cb by dcr, dcg and
     * dcb, then store a word built from
     *   bits 31..16 of cr (kept in place),
     *   bits 23..16 of cg (moved to bits 15..8),
     *   bits 23..16 of cb (moved to bits 7..0).
     * Writes exactly cnt words starting at dest; cnt == 0 writes nothing.
     */
    void triC(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb) {
        for (uint32_t i = 0; i < cnt; ++i) {
            /* Step the accumulators before packing, as the asm variants do. */
            cr += dcr;
            cg += dcg;
            cb += dcb;
    
            const uint32_t top16 = cr & 0xffff0000;
            const uint32_t mid8  = (cg >> 8) & 0xff00;
            const uint32_t low8  = (cb >> 16) & 0xff;
    
            dest[i] = top16 | mid8 | low8;
        }
    }
    

    atri.asm:

    ; atri.asm -- NASM (Intel syntax), 32-bit code.
    ; Exports four variants of the same routine; each is declared in test.c with
    ; the same eight-argument signature as triC:
    ;   (dest, cnt, cr, cg, cb, dcr, dcg, dcb)
        bits 32
        section .text
        global triAsm1
        global triAsm2
        global triAsm3
        global triAsm4
    
    ; Shorthands for stack arguments, valid after `push ebp / mov ebp, esp`.
    ; The routines read the arguments at consecutive dword offsets from ebp:
    ;   +0x08 dest, +0x0c cnt, +0x10 cr, +0x14 cg, +0x18 cb,
    ;   +0x1c dcr,  +0x20 dcg, +0x24 dcb
    ; Only the four operands that the loops use directly from memory get a name.
    %define cr DWORD [ebp+0x10]
    %define dcr DWORD [ebp+0x1c]
    %define dcg DWORD [ebp+0x20]
    %define dcb DWORD [ebp+0x24]
    
    ;-----------------------------------------------------------------------
    ; void triAsm1(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg,
    ;              uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb)
    ; Variant from the question that merges the two low bytes into eax with
    ; separate 8-bit moves (`mov ah, bh` / `mov al, bl`).
    ; In:    all arguments on the stack, addressed through ebp
    ; Out:   exactly cnt dwords written to dest (same count as triC's
    ;        `while (cnt--)`); nothing is written when cnt == 0
    ; Regs:  edx = dest cursor, ecx = iterations left, esi = cg, edi = cb,
    ;        eax/ebx = scratch; cr lives in its stack argument slot
    ; Saves: every general register, via pusha/popa
    ;-----------------------------------------------------------------------
    triAsm1:
        push ebp
        mov ebp, esp
    
        pusha
    
        mov edx, [ebp+0x08] ; dest
        mov ecx, [ebp+0x0c] ; cnt
        mov esi, [ebp+0x14] ; cg
        mov edi, [ebp+0x18] ; cb
    
        test ecx, ecx       ; cnt == 0: skip the loop entirely
        jz .done
    
    .loop:
    
        add esi, dcg
        mov eax, esi
        shr eax, 8          ; ah = bits 23..16 of cg
    
        add edi, dcb
        mov ebx, edi
        shr ebx, 16         ; bl = bits 23..16 of cb
        mov bh, ah          ; bx = low 16 bits of the output word
    
        mov eax, cr
        add eax, dcr
        mov cr, eax         ; write the stepped cr back to its stack slot
    
        mov ah, bh  ; faster
        mov al, bl
    
        mov DWORD [edx], eax
    
        add edx, 4
    
        dec ecx
        jnz .loop           ; loop until the count hits zero: cnt iterations
    
    .done:
        popa
    
        pop ebp
        ret
    
    
    ;-----------------------------------------------------------------------
    ; void triAsm2(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg,
    ;              uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb)
    ; Variant from the question that merges the two low bytes into eax with a
    ; single 16-bit move (`mov ax, bx`); otherwise identical to triAsm1.
    ; In:    all arguments on the stack, addressed through ebp
    ; Out:   exactly cnt dwords written to dest (same count as triC's
    ;        `while (cnt--)`); nothing is written when cnt == 0
    ; Regs:  edx = dest cursor, ecx = iterations left, esi = cg, edi = cb,
    ;        eax/ebx = scratch; cr lives in its stack argument slot
    ; Saves: every general register, via pusha/popa
    ;-----------------------------------------------------------------------
    triAsm2:
        push ebp
        mov ebp, esp
    
        pusha
    
        mov edx, [ebp+0x08] ; dest
        mov ecx, [ebp+0x0c] ; cnt
        mov esi, [ebp+0x14] ; cg
        mov edi, [ebp+0x18] ; cb
    
        test ecx, ecx       ; cnt == 0: skip the loop entirely
        jz .done
    
    .loop:
    
        add esi, dcg
        mov eax, esi
        shr eax, 8          ; ah = bits 23..16 of cg
    
        add edi, dcb
        mov ebx, edi
        shr ebx, 16         ; bl = bits 23..16 of cb
        mov bh, ah          ; bx = low 16 bits of the output word
    
        mov eax, cr
        add eax, dcr
        mov cr, eax         ; write the stepped cr back to its stack slot
    
        mov ax, bx ; slower
    
        mov DWORD [edx], eax
    
        add edx, 4
    
        dec ecx
        jnz .loop           ; loop until the count hits zero: cnt iterations
    
    .done:
        popa
    
        pop ebp
        ret
    
    ;-----------------------------------------------------------------------
    ; void triAsm3(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg,
    ;              uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb)
    ; Same computation as triAsm1/triAsm2, but the output word is assembled
    ; with 32-bit and/or only -- no 8-bit or 16-bit register writes.
    ; In:    all arguments on the stack, addressed through ebp
    ; Out:   exactly cnt dwords written to dest (same count as triC's
    ;        `while (cnt--)`); nothing is written when cnt == 0
    ; Regs:  edx = dest cursor, ecx = iterations left, esi = cg, edi = cb,
    ;        eax = word being built, ebx = scratch; cr lives in its stack slot
    ; Saves: every general register, via pusha/popa
    ;-----------------------------------------------------------------------
    triAsm3:
        push ebp
        mov ebp, esp
    
        pusha
    
        mov edx, [ebp+0x08] ; dest
        mov ecx, [ebp+0x0c] ; cnt
        mov esi, [ebp+0x14] ; cg
        mov edi, [ebp+0x18] ; cb
    
        test ecx, ecx       ; cnt == 0: skip the loop entirely
        jz .done
    
    .loop:
        mov eax, cr
        add eax, dcr
        mov cr, eax         ; write the stepped cr back to its stack slot
    
        and eax, 0xffff0000 ; keep bits 31..16 of cr
    
        add esi, dcg
        mov ebx, esi
        shr ebx, 8
        and ebx, 0x0000ff00 ; bits 23..16 of cg -> bits 15..8
        or eax, ebx
    
        add edi, dcb
        mov ebx, edi
        shr ebx, 16
        and ebx, 0x000000ff ; bits 23..16 of cb -> bits 7..0
        or eax, ebx
    
        mov DWORD [edx], eax
    
        add edx, 4
    
        dec ecx
        jnz .loop           ; loop until the count hits zero: cnt iterations
    
    .done:
        popa
    
        pop ebp
        ret
    
    ;-----------------------------------------------------------------------
    ; void triAsm4(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg,
    ;              uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb)
    ; Like triAsm3 (32-bit and/or only), but cr, cg and cb all stay in
    ; registers for the whole loop.
    ; The register that used to hold the counter is freed by looping on an end
    ; pointer instead. esp is left alone, so the stack stays valid throughout
    ; and no global scratch variable is needed.
    ; In:    all arguments on the stack, addressed through ebp
    ; Out:   exactly cnt dwords written to dest (same count as triC's
    ;        `while (cnt--)`); nothing is written when cnt == 0
    ; Regs:  edi = dest cursor, edx = cr, esi = cg, ecx = cb,
    ;        eax = word being built, ebx = scratch
    ; Stack: the cnt argument slot [ebp+0x0c] is overwritten with the end
    ;        pointer (dest + 4*cnt)
    ; Saves: every general register, via pusha/popa
    ;-----------------------------------------------------------------------
    triAsm4:
        push ebp
        mov ebp, esp
    
        pusha
    
        mov edi, [ebp+0x08] ; dest
        mov eax, [ebp+0x0c] ; cnt
        test eax, eax       ; cnt == 0: skip the loop entirely
        jz .done
    
        lea eax, [edi+eax*4]
        mov [ebp+0x0c], eax ; cnt slot now holds the end pointer
    
        mov edx, [ebp+0x10] ; cr
        mov esi, [ebp+0x14] ; cg
        mov ecx, [ebp+0x18] ; cb
    
    .loop:
        add edx, dcr
        add esi, dcg
        add ecx, dcb
    
        ;*dest++ = (cr & 0xffff0000) | ((cg >> 8) & 0xff00) | ((cb >> 16) & 0xff);
        mov eax, edx ; eax=cr
        and eax, 0xffff0000
    
        mov ebx, esi ; ebx=cg
        shr ebx, 8
        and ebx, 0xff00
        or eax, ebx
    
        mov ebx, ecx ; ebx=cb
        shr ebx, 16
        and ebx, 0xff
        or eax, ebx
    
        mov DWORD [edi], eax
        add edi, 4
    
        cmp edi, [ebp+0x0c] ; reached dest + 4*cnt?
        jne .loop
    
    .done:
        popa
    
        pop ebp
        ret
    

提交回复
热议问题