Why are mov ah,bh and mov al, bl together much faster than single instruction mov ax, bx?

前端未结

关注

 4  1991

长发绾君心 2020-12-15 05:58

I\'ve found that

mov al, bl
mov ah, bh

is much faster than

mov ax, bx

Can anyone explain me why? I\'m run

4条回答

一个人的身影 (楼主)

2020-12-15 06:45

It is also faster on my Core 2 Duo CPU L9300 1.60GHz. As I wrote in a comment I think this is related to the use of partial registers (ah, al, ax). See more e.g. here, here and here (pg. 88).

I've written a small test suite to try and improve on the code, and while not using the ax version presented in the OP is the smartest, trying to eliminate partial register usage does improve on the speed (even more so than my quick attempt at freeing up another register).

To get more information on why one version is faster than another I think requires more careful reading of the source material and/or using something like Intel VTune or AMD CodeAnalyst. (It could turn out that I'm wrong)

UPDATE, while the below output from oprofile doesn't prove anything it does show that there are a lot of partial register stalls occurring in both versions, but roughly twice as many in the slowest version (triAsm2) as in the 'fast' version (triAsm1).

$ opreport -l test                            
CPU: Core 2, speed 1600 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 800500
Counted RAT_STALLS events (Partial register stall cycles) with a unit mask of 0x0f (All RAT) count 1000000
samples  %        samples  %        symbol name
21039    27.3767  10627    52.3885  triAsm2.loop
16125    20.9824  4815     23.7368  triC
14439    18.7885  4828     23.8008  triAsm1.loop
12557    16.3396  0              0  triAsm3.loop
12161    15.8243  8         0.0394  triAsm4.loop

Complete oprofile output.

Results:

triC: 7410.000000 ms, a5afb9 (C implementation of the asm code)

triAsm1: 6690.000000 ms, a5afb9 (Code from OP, using al and ah)

triAsm2: 9290.000000 ms, a5afb9 (Code from OP, using ax)

triAsm3: 5760.000000 ms, a5afb9 (Straight forward translation of OPs code to one without partial register usage)

triAsm4: 5640.000000 ms, a5afb9 (Quick attempt at making it faster)

Here is my test suite, compiled with -std=c99 -ggdb -m32 -O3 -march=native -mtune=native:

test.c:

#include 
#include 
#include 
#include 

extern void triC(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);
extern void triAsm1(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);
extern void triAsm2(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);
extern void triAsm3(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);
extern void triAsm4(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb);

uint32_t scanline[640];

#define test(tri) \
    {\
        clock_t start = clock();\
        srand(60);\
        for (int i = 0; i < 5000000; i++) {\
            tri(scanline, rand() % 640, 10<<16, 20<<16, 30<<16, 1<<14, 1<<14, 1<<14);\
        }\
        printf(#tri ": %f ms, %x\n",(clock()-start)*1000.0/CLOCKS_PER_SEC,scanline[620]);\
    }

int main() {
    test(triC);
    test(triAsm1);
    test(triAsm2);
    test(triAsm3);
    test(triAsm4);
    return 0;
}

tri.c:

#include 
#include 
#include 

void triC(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb) {
    while (cnt--) {
        cr += dcr;
        cg += dcg;
        cb += dcb;
        *dest++ = (cr & 0xffff0000) | ((cg >> 8) & 0xff00) | ((cb >> 16) & 0xff);
    }
}

atri.asm:

    bits 32
    section .text
    global triAsm1
    global triAsm2
    global triAsm3
    global triAsm4

%define cr DWORD [ebp+0x10]
%define dcr DWORD [ebp+0x1c]
%define dcg DWORD [ebp+0x20]
%define dcb DWORD [ebp+0x24]

triAsm1:
    push ebp
    mov ebp, esp

    pusha

    mov edx, [ebp+0x08] ; dest
    mov ecx, [ebp+0x0c] ; cnt
    mov esi, [ebp+0x14] ; cg
    mov edi, [ebp+0x18] ; cb

.loop:

    add esi, dcg
    mov eax, esi
    shr eax, 8

    add edi, dcb
    mov ebx, edi
    shr ebx, 16
    mov bh, ah

    mov eax, cr
    add eax, dcr
    mov cr, eax

    mov ah, bh  ; faster
    mov al, bl

    mov DWORD [edx], eax

    add edx, 4

    dec ecx
    jge .loop

    popa

    pop ebp
    ret


triAsm2:
    push ebp
    mov ebp, esp

    pusha

    mov edx, [ebp+0x08] ; dest
    mov ecx, [ebp+0x0c] ; cnt
    mov esi, [ebp+0x14] ; cg
    mov edi, [ebp+0x18] ; cb

.loop:

    add esi, dcg
    mov eax, esi
    shr eax, 8

    add edi, dcb
    mov ebx, edi
    shr ebx, 16
    mov bh, ah

    mov eax, cr
    add eax, dcr
    mov cr, eax

    mov ax, bx ; slower

    mov DWORD [edx], eax

    add edx, 4

    dec ecx
    jge .loop

    popa

    pop ebp
    ret

triAsm3:
    push ebp
    mov ebp, esp

    pusha

    mov edx, [ebp+0x08] ; dest
    mov ecx, [ebp+0x0c] ; cnt
    mov esi, [ebp+0x14] ; cg
    mov edi, [ebp+0x18] ; cb

.loop:
    mov eax, cr
    add eax, dcr
    mov cr, eax

    and eax, 0xffff0000

    add esi, dcg
    mov ebx, esi
    shr ebx, 8
    and ebx, 0x0000ff00
    or eax, ebx

    add edi, dcb
    mov ebx, edi
    shr ebx, 16
    and ebx, 0x000000ff
    or eax, ebx

    mov DWORD [edx], eax

    add edx, 4

    dec ecx
    jge .loop

    popa

    pop ebp
    ret

triAsm4:
    push ebp
    mov ebp, esp

    pusha

    mov [stackptr], esp

    mov edi, [ebp+0x08] ; dest
    mov ecx, [ebp+0x0c] ; cnt
    mov edx, [ebp+0x10] ; cr
    mov esi, [ebp+0x14] ; cg
    mov esp, [ebp+0x18] ; cb

.loop:
    add edx, dcr
    add esi, dcg
    add esp, dcb

    ;*dest++ = (cr & 0xffff0000) | ((cg >> 8) & 0xff00) | ((cb >> 16) & 0xff);
    mov eax, edx ; eax=cr
    and eax, 0xffff0000

    mov ebx, esi ; ebx=cg
    shr ebx, 8
    and ebx, 0xff00
    or eax, ebx
    ;mov ah, bh

    mov ebx, esp
    shr ebx, 16
    and ebx, 0xff
    or eax, ebx
    ;mov al, bl

    mov DWORD [edi], eax
    add edi, 4

    dec ecx
    jge .loop

    mov esp, [stackptr]

    popa

    pop ebp
    ret

    section .data
stackptr: dd 0

0 讨论(0)

查看其它4个回答