I just started learning assembly and am writing some custom loops for swapping two variables, using C++'s asm{} block with the Digital Mars compiler in C-Free 5.0.
Enabled th
It's likely due to the fact that the compiler fails to keep the variables in registers, working on indirect (memory address) operands instead.
Switch compilers <-- this is your best optimization.
Update: I have gone through the trouble of translating the same program to gcc Intel-syntax inline assembly: test.c. It clearly shows how the for-loop and the while-loop are vastly superior to the handwritten assembly.
That said, with Digital Mars, the following is faster:
__asm
{
    // Register roles inside this block:
    //   ecx = remaining iteration count (counts down from j to 0)
    //   eax = working copy of a
    //   ebx = working copy of b
    mov ecx,j            // load the loop count; XOR-ing j into an uninitialised
                         // ecx would leave an unpredictable trip count
    mov eax,a            // bring both variables into registers
    mov ebx,b
do_it_again3:            // loop head
    // three-XOR swap of eax and ebx, no temporary register needed
    xor eax,ebx
    xor ebx,eax
    xor eax,ebx
    mov a,eax            // publish the swapped values back to the variables
    mov b,ebx
    dec ecx              // one iteration done; sets ZF when ecx reaches 0
    jnz do_it_again3     // repeat until the counter hits zero (assumes j > 0)
}
(using `dec ecx` to count the loop down to zero)
My benchmark with Digital Mars Compiler Version 8.42n results in:
time of for-loop(cycles) 572
time of while-loop(cycles) 566
time of custom-loop-1(cycles) 355
time of custom-loop-2(cycles) 317
time of custom-loop-3(cycles) 234
Full listing:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main()
{
int j=0;
int a=0,b=0,temp=0;
srand(time(0));
time_t t1=0;
time_t t2=0;
t1=clock();
for(int i=0; i<200000000; i++)
{
temp=a;//instruction 1
a=b;//instruction 2
b=temp;//3 instructions total
}
t2=clock();
printf("\n time of for-loop(cycles) %i \n",(t2-t1));
t1=clock();
while(j<200000000)
{
temp=a;//again it is three instructions
a=b;
b=temp;
j++;
}
t2=clock();
printf("\n time of while-loop(cycles) %i \n",(t2-t1));
t1=clock();
j=200000000;//setting the count
__asm
{
pushf //backup
push eax //backup
push ebx //backup
push ecx //backup
push edx //backup
mov ecx,0 //init of loop range(0 to 200000000)
mov edx,j
do_it_again: //begin to loop
mov eax,a //basic swap steps between cpu and mem(cache)
mov ebx,b
mov b,eax
mov a,ebx //four instructions total
inc ecx // j++
cmp ecx,edx //i<200000000 ?
jb do_it_again // end of loop block
pop edx //rolling back to history
pop ecx
pop ebx
pop eax
popf
}
t2=clock();
printf("\n time of custom-loop-1(cycles) %i \n",(t2-t1));
t1=clock();
j=200000000;//setting the count
__asm
{
pushf //backup
push eax
push ebx
push ecx
push edx
mov ecx,0 //init of loop range(0 to 200000000)
mov edx,j
mov eax,a //getting variables to registers
mov ebx,b
do_it_again2: //begin to loop
//swapping with using only 2 variables(only in cpu)
sub eax,ebx //a is now a-b
add ebx,eax //b is now a
sub eax,ebx //a is now -b
xor eax,80000000h //a is now b and four instructions total
inc ecx // j++
cmp ecx,edx //i<200000000 ?
jb do_it_again2 // end of loop block
pop edx //rollback
pop ecx
pop ebx
pop eax
popf
}
t2=clock();
printf("\n time of custom-loop-2(cycles) %i \n",(t2-t1));
t1=clock();
j=200000000;//setting the count
__asm
{
xor ecx,j //init of loop range(200000000 to 0)
mov eax,a //getting variables to registers
mov ebx,b
do_it_again3: //begin to loop
//swapping with using only 2 variables(only in cpu)
xor eax,ebx
xor ebx,eax
xor eax,ebx
mov a,eax
mov b,ebx
dec ecx // j--
jnz do_it_again3 // end of loop block
}
t2=clock();
printf("\n time of custom-loop-3(cycles) %i \n",(t2-t1));
return 0;
}