I'm coding a simple primality tester program for Windows in x86 assembly language (MASM32), which involves calculating the square root of a 64-bit integer. My question is:
Here is my original 64-bit square-root routine:
// X:int64
// sqrt64(X)
....
asm
  // Integer square root: result := floor(sqrt(X)), with X treated as an
  // unsigned 64-bit value. Binary digit-by-digit method: each of the 32
  // passes consumes the next two bits of X and yields one bit of the root.
  //
  // Register roles inside the loop:
  //   EDI:ESI = bits of X not yet consumed, left-aligned (high:low)
  //   EAX     = root built so far
  //   EBX     = running remainder
  //   ECX     = passes left
  //   EDX     = scratch
  //
  // Invariant at @next: remainder <= 2*root and root < 2^31, so both fit in
  // 32 bits. The shifted quantities 4*rem+b and 4*root+1 can need up to 34
  // bits, so the "does the trial value fit?" decision is taken on the
  // UNSHIFTED values instead:
  //   4*rem + b >= 4*root + 1   <=>   rem >= root + (1 if b = 0 else 0)
  // Comparing the shifted values in 32-bit registers wraps around for some
  // X >= 2^62 (e.g. X = $4000000000000001 produced $80000001, not $80000000).
  push  esi                     {ESI, EDI and EBX must be preserved}
  push  edi
  push  ebx
  mov   esi, dword ptr X        // low half of X
  mov   edi, dword ptr X+4      // high half of X
  xor   eax, eax                // root = 0
  xor   ebx, ebx                // remainder = 0
  mov   ecx, 32                 // full-width counter: no 16-bit operand prefix
@next:
  mov   edx, edi
  shr   edx, 30                 // EDX = b, the next two bits of X
  cmp   edx, 1                  // CF = 1 iff b = 0
  mov   edx, ebx                // MOV leaves CF untouched
  sbb   edx, eax                // CF = 1 iff rem < root + (b = 0): trial does not fit
  cmc                           // CF = new root bit
  adc   eax, eax                // root = 2*root + bit (root < 2^31, cannot overflow)
  shld  ebx, edi, 2             // rem = 4*rem + b (mod 2^32)
  shld  edi, esi, 2             // discard the two consumed bits of X
  shl   esi, 2
  test  al, 1                   // was the new root bit set?
  jz    @skip
  lea   edx, [eax + eax - 1]    // 2*new_root - 1 = 4*old_root + 1 (mod 2^32)
  sub   ebx, edx                // true difference fits in 32 bits in every pass
                                // whose remainder is used again; after the last
                                // pass the remainder is no longer needed
@skip:
  dec   ecx
  jnz   @next
  pop   ebx
  pop   edi
  pop   esi
  mov   result, eax
end;
....