I've found that
mov al, bl
mov ah, bh
is much faster than
mov ax, bx
Can anyone explain to me why? I'm run [...]
It is also faster on my Core 2 Duo CPU L9300 1.60GHz. As I wrote in a comment I think this is related to the use of partial registers (ah, al, ax). See more e.g. here, here and here (pg. 88).
I've written a small test suite to try to improve on the code. While simply avoiding the ax version presented in the OP is the smartest first step, eliminating partial register usage altogether improves the speed further (even more so than my quick attempt at freeing up another register).
Finding out exactly why one version is faster than another requires, I think, a more careful reading of the source material and/or a profiler such as Intel VTune or AMD CodeAnalyst. (It could turn out that I'm wrong.)
UPDATE: While the oprofile output below doesn't prove anything, it does show that a lot of partial register stalls occur in both versions, with roughly twice as many in the slowest version (triAsm2) as in the 'fast' version (triAsm1).
$ opreport -l test
CPU: Core 2, speed 1600 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 800500
Counted RAT_STALLS events (Partial register stall cycles) with a unit mask of 0x0f (All RAT) count 1000000
samples % samples % symbol name
21039 27.3767 10627 52.3885 triAsm2.loop
16125 20.9824 4815 23.7368 triC
14439 18.7885 4828 23.8008 triAsm1.loop
12557 16.3396 0 0 triAsm3.loop
12161 15.8243 8 0.0394 triAsm4.loop
Complete oprofile output.
Results:
triC: 7410.000000 ms, a5afb9 (C implementation of the asm code)
triAsm1: 6690.000000 ms, a5afb9 (Code from OP, using al and ah)
triAsm2: 9290.000000 ms, a5afb9 (Code from OP, using ax)
triAsm3: 5760.000000 ms, a5afb9 (Straight forward translation of OPs code to one without partial register usage)
triAsm4: 5640.000000 ms, a5afb9 (Quick attempt at making it faster)
Here is my test suite, compiled with -std=c99 -ggdb -m32 -O3 -march=native -mtune=native:
test.c:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
/*
 * Signature shared by the C reference routine and the four assembly variants:
 * dest is the output buffer, cnt the span count, cr/cg/cb the channel
 * accumulators and dcr/dcg/dcb the values added to them each step.
 */
typedef void tri_fn(uint32_t* dest, uint32_t cnt,
                    uint32_t cr, uint32_t cg, uint32_t cb,
                    uint32_t dcr, uint32_t dcg, uint32_t dcb);

extern tri_fn triC;     /* defined in tri.c */
extern tri_fn triAsm1;  /* defined in atri.asm */
extern tri_fn triAsm2;  /* defined in atri.asm */
extern tri_fn triAsm3;  /* defined in atri.asm */
extern tri_fn triAsm4;  /* defined in atri.asm */
/*
 * Output buffer shared by every benchmark run. The assembly routines store
 * cnt+1 pixels per call, so the largest cnt used below (639) fills it exactly.
 */
uint32_t scanline[640];

/*
 * test(tri): reseed rand() with a fixed seed, call `tri` 5,000,000 times with
 * pseudo-random span lengths, then print the elapsed CPU time in milliseconds
 * and scanline[620] as a cheap result check (it should be identical for all
 * implementations).
 *
 * Wrapped in do { ... } while (0) so that `test(x);` is a single statement and
 * stays safe inside an unbraced if/else. The checksum is cast to unsigned so
 * that it matches the %x conversion on every platform.
 */
#define test(tri)                                                                      \
    do {                                                                               \
        clock_t start = clock();                                                       \
        srand(60);                                                                     \
        for (int i = 0; i < 5000000; i++) {                                            \
            tri(scanline, rand() % 640, 10<<16, 20<<16, 30<<16, 1<<14, 1<<14, 1<<14);  \
        }                                                                              \
        printf(#tri ": %f ms, %x\n",                                                   \
               (clock() - start) * 1000.0 / CLOCKS_PER_SEC,                            \
               (unsigned)scanline[620]);                                               \
    } while (0)
/*
 * Benchmarks the C reference implementation followed by each assembly
 * variant; every test() line prints one timing/checksum result.
 */
int main(void) {
    test(triC);
    test(triAsm1);
    test(triAsm2);
    test(triAsm3);
    test(triAsm4);
    return 0;
}
tri.c:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/*
 * Reference C version of the span loop.
 *
 * For each of the cnt output words, the three accumulators are first advanced
 * by their deltas and then packed into one 32-bit value:
 *   bits 16..31 come from bits 16..31 of cr,
 *   bits  8..15 come from bits 16..23 of cg,
 *   bits  0..7  come from bits 16..23 of cb.
 * Nothing is written when cnt is 0.
 */
void triC(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb, uint32_t dcr, uint32_t dcg, uint32_t dcb) {
    for (uint32_t remaining = cnt; remaining != 0; --remaining) {
        cr += dcr;
        cg += dcg;
        cb += dcb;

        const uint32_t high = cr & 0xffff0000u;
        const uint32_t mid  = (cg >> 8) & 0x0000ff00u;
        const uint32_t low  = (cb >> 16) & 0x000000ffu;

        *dest = high | mid | low;
        ++dest;
    }
}
atri.asm:
; NASM source, 32-bit code. The four exported routines are declared on the C
; side as
;   void f(uint32_t* dest, uint32_t cnt, uint32_t cr, uint32_t cg, uint32_t cb,
;          uint32_t dcr, uint32_t dcg, uint32_t dcb);
; and read their arguments from the stack through ebp after the
; `push ebp / mov ebp, esp` prologue:
;   [ebp+0x08] dest  [ebp+0x0c] cnt  [ebp+0x10] cr   [ebp+0x14] cg
;   [ebp+0x18] cb    [ebp+0x1c] dcr  [ebp+0x20] dcg  [ebp+0x24] dcb
bits 32
section .text
global triAsm1
global triAsm2
global triAsm3
global triAsm4
; Shorthands for stack arguments used as memory operands inside the loops.
; They are only valid while ebp still holds the frame pointer.
; cr: accumulator argument that triAsm1..3 update in place on the stack.
%define cr DWORD [ebp+0x10]
; dcr / dcg / dcb: the per-iteration increments for cr / cg / cb.
%define dcr DWORD [ebp+0x1c]
%define dcg DWORD [ebp+0x20]
%define dcb DWORD [ebp+0x24]
; triAsm1 -- the OP's loop; the low 16 bits of each pixel are merged into eax
; with two separate 8-bit moves (ah <- bh, al <- bl).
;
; In:    stack arguments dest, cnt, cr, cg, cb, dcr, dcg, dcb (see loads below).
; Out:   each iteration stores
;          (cr & 0xffff0000) | ((cg >> 8) & 0xff00) | ((cb >> 16) & 0xff)
;        to [edx] after adding the deltas, then advances edx by 4.
; Count: `dec ecx / jge` tests the decremented counter as a signed value after
;        the body, so the body runs cnt+1 times for cnt in 0..0x7fffffff.
; Clobb: none visible to the caller (pusha/popa); the cr stack slot is
;        overwritten with the running value.
triAsm1:
push ebp
mov ebp, esp
pusha ; save all general-purpose registers; restored by popa before returning
mov edx, [ebp+0x08] ; dest
mov ecx, [ebp+0x0c] ; cnt
mov esi, [ebp+0x14] ; cg
mov edi, [ebp+0x18] ; cb
.loop:
add esi, dcg ; cg += dcg
mov eax, esi
shr eax, 8 ; ah now holds bits 16..23 of cg
add edi, dcb ; cb += dcb
mov ebx, edi
shr ebx, 16 ; bl now holds bits 16..23 of cb
mov bh, ah ; bx = (cg byte << 8) | cb byte
mov eax, cr
add eax, dcr
mov cr, eax ; cr += dcr, kept in its stack slot
mov ah, bh ; faster: low 16 bits replaced with two 8-bit moves
mov al, bl
mov DWORD [edx], eax ; store the packed pixel
add edx, 4 ; advance to the next 32-bit pixel
dec ecx
jge .loop ; loop while the decremented count is still >= 0 (signed)
popa
pop ebp
ret
; triAsm2 -- the OP's loop; identical to triAsm1 except that the low 16 bits
; of each pixel are merged into eax with a single 16-bit move (ax <- bx).
; This is the variant the benchmark reports as the slowest.
;
; In:    stack arguments dest, cnt, cr, cg, cb, dcr, dcg, dcb (see loads below).
; Out:   each iteration stores
;          (cr & 0xffff0000) | ((cg >> 8) & 0xff00) | ((cb >> 16) & 0xff)
;        to [edx] after adding the deltas, then advances edx by 4.
; Count: `dec ecx / jge` tests the decremented counter as a signed value after
;        the body, so the body runs cnt+1 times for cnt in 0..0x7fffffff.
; Clobb: none visible to the caller (pusha/popa); the cr stack slot is
;        overwritten with the running value.
triAsm2:
push ebp
mov ebp, esp
pusha ; save all general-purpose registers; restored by popa before returning
mov edx, [ebp+0x08] ; dest
mov ecx, [ebp+0x0c] ; cnt
mov esi, [ebp+0x14] ; cg
mov edi, [ebp+0x18] ; cb
.loop:
add esi, dcg ; cg += dcg
mov eax, esi
shr eax, 8 ; ah now holds bits 16..23 of cg
add edi, dcb ; cb += dcb
mov ebx, edi
shr ebx, 16 ; bl now holds bits 16..23 of cb
mov bh, ah ; bx = (cg byte << 8) | cb byte
mov eax, cr
add eax, dcr
mov cr, eax ; cr += dcr, kept in its stack slot
mov ax, bx ; slower: low 16 bits replaced with one 16-bit move
mov DWORD [edx], eax ; store the packed pixel
add edx, 4 ; advance to the next 32-bit pixel
dec ecx
jge .loop ; loop while the decremented count is still >= 0 (signed)
popa
pop ebp
ret
; triAsm3 -- same computation as triAsm1/triAsm2 but built only from full
; 32-bit register operations (shift, mask, or); no al/ah/ax/bh writes.
;
; In:    stack arguments dest, cnt, cr, cg, cb, dcr, dcg, dcb (see loads below).
; Out:   each iteration stores
;          (cr & 0xffff0000) | ((cg >> 8) & 0xff00) | ((cb >> 16) & 0xff)
;        to [edx] after adding the deltas, then advances edx by 4.
; Count: `dec ecx / jge` tests the decremented counter as a signed value after
;        the body, so the body runs cnt+1 times for cnt in 0..0x7fffffff.
; Clobb: none visible to the caller (pusha/popa); the cr stack slot is
;        overwritten with the running value.
triAsm3:
push ebp
mov ebp, esp
pusha ; save all general-purpose registers; restored by popa before returning
mov edx, [ebp+0x08] ; dest
mov ecx, [ebp+0x0c] ; cnt
mov esi, [ebp+0x14] ; cg
mov edi, [ebp+0x18] ; cb
.loop:
mov eax, cr
add eax, dcr
mov cr, eax ; cr += dcr, kept in its stack slot
and eax, 0xffff0000 ; eax = cr & 0xffff0000
add esi, dcg ; cg += dcg
mov ebx, esi
shr ebx, 8
and ebx, 0x0000ff00 ; ebx = (cg >> 8) & 0xff00
or eax, ebx
add edi, dcb ; cb += dcb
mov ebx, edi
shr ebx, 16
and ebx, 0x000000ff ; ebx = (cb >> 16) & 0xff
or eax, ebx
mov DWORD [edx], eax ; store the packed pixel
add edx, 4 ; advance to the next 32-bit pixel
dec ecx
jge .loop ; loop while the decremented count is still >= 0 (signed)
popa
pop ebp
ret
; triAsm4 -- like triAsm3 (32-bit operations only), but cr is kept in edx
; instead of being read from and written back to its stack slot. To free a
; register for that, cb is kept in esp; the real stack pointer is parked in
; the global `stackptr` for the duration of the loop.
;
; In:    stack arguments dest, cnt, cr, cg, cb, dcr, dcg, dcb (see loads below).
; Out:   each iteration stores
;          (cr & 0xffff0000) | ((cg >> 8) & 0xff00) | ((cb >> 16) & 0xff)
;        to [edi] after adding the deltas, then advances edi by 4.
; Count: `dec ecx / jge` tests the decremented counter as a signed value after
;        the body, so the body runs cnt+1 times for cnt in 0..0x7fffffff.
; Clobb: none visible to the caller (esp is reloaded from stackptr, then popa).
; NOTE(review): between the two stackptr accesses esp is not a valid stack
;        pointer, so no push/pop/call may be added inside the loop. The single
;        global save slot also looks unsafe for concurrent or re-entrant use,
;        and for anything that uses the stack asynchronously -- confirm before
;        reusing this outside a benchmark.
triAsm4:
push ebp
mov ebp, esp
pusha ; save all general-purpose registers; restored by popa before returning
mov [stackptr], esp ; remember the real stack pointer (taken after pusha)
mov edi, [ebp+0x08] ; dest
mov ecx, [ebp+0x0c] ; cnt
mov edx, [ebp+0x10] ; cr
mov esi, [ebp+0x14] ; cg
mov esp, [ebp+0x18] ; cb
.loop:
add edx, dcr ; cr += dcr
add esi, dcg ; cg += dcg
add esp, dcb ; cb += dcb (esp is plain data here)
;*dest++ = (cr & 0xffff0000) | ((cg >> 8) & 0xff00) | ((cb >> 16) & 0xff);
mov eax, edx ; eax=cr
and eax, 0xffff0000
mov ebx, esi ; ebx=cg
shr ebx, 8
and ebx, 0xff00
or eax, ebx
;mov ah, bh
mov ebx, esp ; ebx=cb
shr ebx, 16
and ebx, 0xff
or eax, ebx
;mov al, bl
mov DWORD [edi], eax ; store the packed pixel
add edi, 4 ; advance to the next 32-bit pixel
dec ecx
jge .loop ; loop while the decremented count is still >= 0 (signed)
mov esp, [stackptr] ; restore the real stack pointer before touching the stack
popa
pop ebp
ret
section .data
; Save slot for the real stack pointer while triAsm4 uses esp to hold cb.
stackptr: dd 0