I have a 128-bit unsigned integer A and a 64-bit unsigned integer B. What\'s the fastest way to calculate A % B - that is the (64-bit) remainder from dividing A
This is almost untested partly speed modificated Mod128by64 'Russian peasant' algorithm function. Unfortunately I'm a Delphi user so this function works under Delphi. :) But the assembler is almost the same so...
function Mod128by64(Dividend: PUInt128; Divisor: PUInt64): UInt64;
//In : eax = @Dividend
// : edx = @Divisor
//Out: eax:edx as Remainder
asm
//Registers inside rutine
//Divisor = edx:ebp
//Dividend = bh:ebx:edx //We need 64 bits + 1 bit in bh
//Result = esi:edi
//ecx = Loop counter and Dividend index
push ebx //Store registers to stack
push esi
push edi
push ebp
mov ebp, [edx] //Divisor = edx:ebp
mov edx, [edx + 4]
mov ecx, ebp //Div by 0 test
or ecx, edx
jz @DivByZero
xor edi, edi //Clear result
xor esi, esi
//Start of 64 bit division Loop
mov ecx, 15 //Load byte loop shift counter and Dividend index
@SkipShift8Bits: //Small Dividend numbers shift optimisation
cmp [eax + ecx], ch //Zero test
jnz @EndSkipShiftDividend
loop @SkipShift8Bits //Skip 8 bit loop
@EndSkipShiftDividend:
test edx, $FF000000 //Huge Divisor Numbers Shift Optimisation
jz @Shift8Bits //This Divisor is > $00FFFFFF:FFFFFFFF
mov ecx, 8 //Load byte shift counter
mov esi, [eax + 12] //Do fast 56 bit (7 bytes) shift...
shr esi, cl //esi = $00XXXXXX
mov edi, [eax + 9] //Load for one byte right shifted 32 bit value
@Shift8Bits:
mov bl, [eax + ecx] //Load 8 bits of Dividend
//Here we can unrole partial loop 8 bit division to increase execution speed...
mov ch, 8 //Set partial byte counter value
@Do65BitsShift:
shl bl, 1 //Shift dividend left for one bit
rcl edi, 1
rcl esi, 1
setc bh //Save 65th bit
sub edi, ebp //Compare dividend and divisor
sbb esi, edx //Subtract the divisor
sbb bh, 0 //Use 65th bit in bh
jnc @NoCarryAtCmp //Test...
add edi, ebp //Return privius dividend state
adc esi, edx
@NoCarryAtCmp:
dec ch //Decrement counter
jnz @Do65BitsShift
//End of 8 bit (byte) partial division loop
dec cl //Decrement byte loop shift counter
jns @Shift8Bits //Last jump at cl = 0!!!
//End of 64 bit division loop
mov eax, edi //Load result to eax:edx
mov edx, esi
@RestoreRegisters:
pop ebp //Restore Registers
pop edi
pop esi
pop ebx
ret
@DivByZero:
xor eax, eax //Here you can raise Div by 0 exception, now function only return 0.
xor edx, edx
jmp @RestoreRegisters
end;
At least one more speed optimisation is possible! After 'Huge Divisor Numbers Shift Optimisation' we can test divisors high bit, if it is 0 we do not need to use extra bh register as 65th bit to store in it. So unrolled part of loop can look like:
shl bl,1 //Shift dividend left for one bit
rcl edi,1
rcl esi,1
sub edi, ebp //Compare dividend and divisor
sbb esi, edx //Subtract the divisor
jnc @NoCarryAtCmpX
add edi, ebp //Return privius dividend state
adc esi, edx
@NoCarryAtCmpX: