问题
If I have a 64 bit number in the EDX-EAX, and I divide it with a relatively small number, the quotient may become a number bigger than 32 bits.
So at that point the div operator only sets the carry flag?
My problem is, that I would like to process a number in the EDX-EAX and write it out digit per digit, so in this case I would have to divide the number in EDX-EAX by 10 to get the last digit.
回答1:
No. A 64b/32b DIV can produce a quotient of at most 2^32-1.
Overflow is indicated with the #DE (divide error) exception rather than with the CF flag.
If there's some limit for the 64b number, so that it does not use the full 64 bits (like 2^61 max), then you may first split it with a div by 10^9 (the nearest power of 10 below 2^32) and then process the two "halves" separately with div by 10. But as Jester noted, the 64b div is so slow that doing the sub with powers of 10 sounds like a better idea, and the code will be simpler too.
because if it's so simple, then why not to add the code, right? Will be ~5min ... ~60min later (and I'm not very happy about it, I think it may be done in a bit more elegant way with shorter code ... not bothering with performance in either case, this one can be optimized for sure, at minimal to align loops where it matters, but it at least works, so it can be your "reference version" to compare/verify with)...
NASM 32b linux executable, save into uint64toascii.asm
to build: nasm -f elf *.asm; ld -m elf_i386 -s -o uint64toascii *.o
section .text
global  _start                          ; exported so ld can resolve the entry point

; Linux i386 (int 0x80) constants used by the test driver
SYS_EXIT        equ     1
SYS_WRITE       equ     4
STDOUT_FD       equ     1
ASCII_BUF_SIZE  equ     24              ; room for 20 digits + newline

;-----------------------------------------------------------------------
; _start - test driver
; Prints every 64-bit value stored in [testnumbers, testnumbersEND) as an
; unsigned decimal number, one per line, then exits with status 0.
; ABI:   Linux i386, raw int 0x80 system calls
; Stack: ASCII_BUF_SIZE bytes of scratch for the digit string
;-----------------------------------------------------------------------
_start:
        sub     esp, ASCII_BUF_SIZE     ; reserve the ASCII buffer on the stack

        mov     esi, testnumbers        ; esi = address of the current test value
testNumbersLoop:
        mov     eax, [esi]              ; edx:eax = value (low dword stored first)
        mov     edx, [esi+4]
        mov     edi, esp                ; edi = start of the output buffer
        call    integer64btoascii       ; edi comes back pointing past the last digit

        mov     byte [edi], 10          ; terminate the line with '\n'
        inc     edi

        mov     edx, edi
        sub     edx, esp                ; edx = number of bytes to write
        mov     ecx, esp                ; ecx = buffer address
        mov     ebx, STDOUT_FD
        mov     eax, SYS_WRITE
        int     0x80

        add     esi, 8                  ; step to the next 64-bit test value
        cmp     esi, testnumbersEND
        jb      testNumbersLoop

        add     esp, ASCII_BUF_SIZE     ; release the ASCII buffer
        xor     ebx, ebx                ; exit status 0 (ebx still held STDOUT_FD = 1,
                                        ; which made the process report failure)
        mov     eax, SYS_EXIT
        int     0x80
;-----------------------------------------------------------------------
; integer64btoascii - unsigned 64-bit integer to decimal ASCII
; In:    edx:eax = number to convert
;        edi     = output buffer (at least 20 bytes)
; Out:   edi     = address just past the last character written
; Saved: eax, edx, esi, ebx, ebp, ecx (restored before return)
; Uses:  pow10table (descending powers of 10, closed by a zero entry)
; Method: find the largest power of 10 not above the number, then for this
;         and every smaller power count how many times it can be subtracted.
;-----------------------------------------------------------------------
integer64btoascii:
        push    eax
        push    edx
        push    esi
        push    ebx
        push    ebp
        push    ecx

        ; zero has no leading power of 10 => emit a single '0'
        mov     esi, edx
        or      esi, eax
        jnz     .findFirstPower
        mov     [edi], byte '0'
        inc     edi
        jmp     .done

        ; skip table entries that are larger than the number
.findFirstPower:
        mov     esi, pow10table
        jmp     .checkPower
.tryNextPower:
        add     esi, 8
.checkPower:
        cmp     edx, [esi+4]            ; compare high dwords first
        jb      .tryNextPower           ; number < 10^k
        ja      .digitStart             ; number > 10^k
        cmp     eax, [esi]              ; high dwords equal => low dwords decide
        jb      .tryNextPower

        ; from here on every remaining power of 10 yields one output digit
.digitStart:
        mov     ebx, [esi]
        mov     ebp, [esi+4]            ; ebp:ebx = 10^k
        test    ebx, ebx
        jz      .done                   ; only the terminator has low dword == 0
        mov     cl, '0'                 ; cl = ASCII digit for this power
        add     esi, 8
.digitLoop:
        cmp     edx, ebp
        jb      .emitDigit              ; remainder < 10^k => digit complete
        ja      .subtract
        cmp     eax, ebx
        jb      .emitDigit
.subtract:
        sub     eax, ebx
        sbb     edx, ebp
        inc     cl
        jmp     .digitLoop
.emitDigit:
        mov     [edi], cl
        inc     edi
        jmp     .digitStart

.done:
        pop     ecx
        pop     ebp
        pop     ebx
        pop     esi
        pop     edx
        pop     eax
        ret
section .rodata
; Powers of ten as 64-bit values in descending order, 10^19 down to 10^0.
; integer64btoascii reads each entry as [esi] = low dword, [esi+4] = high dword
; and stops at the closing zero entry (detected by its low dword being 0).
pow10table:
dq 10000000000000000000 ; 10^19
dq 1000000000000000000 ; 10^18
dq 100000000000000000 ; 10^17
dq 10000000000000000 ; 10^16
dq 1000000000000000 ; 10^15
dq 100000000000000 ; 10^14
dq 10000000000000 ; 10^13
dq 1000000000000 ; 10^12
dq 100000000000 ; 10^11
dq 10000000000 ; 10^10
dq 1000000000 ; 10^9
dq 100000000 ; 10^8
dq 10000000 ; 10^7
dq 1000000 ; 10^6
dq 100000 ; 10^5
dq 10000 ; 10^4
dq 1000 ; 10^3
dq 100 ; 10^2
dq 10 ; 10^1
dq 1 ; 10^0
dq 0 ; terminator
; 64-bit test inputs; _start prints every value from testnumbers up to
; (not including) testnumbersEND.
testnumbers:
dq ~0 ; max 2^64-1 = 18446744073709551615
dq 0 ; looks like zero to me
dd 0, 1 ; 2^32 = 4294967296 (low dword 0, high dword 1)
dq 1234567890 ; < 2^32 (edx = 0)
dq 9999999999999999999 ; every output digit is '9'
dq 101001000100101 ; '1' digits separated by runs of zeroes
testnumbersEND:
You can try it live on web at http://www.tutorialspoint.com/compile_assembly_online.php (copy the source there)
And because I didn't like that first version too much, I kept playing with it a bit, mostly trying to achieve a short (LoC) code, not particularly caring about performance or code size (too lazy to measure anything except number of lines in editor).
Build command line + live demo same as in previous case:
section .text
global  _start                          ; exported so ld can resolve the entry point

; Linux i386 (int 0x80) constants used by the test driver
SYS_EXIT        equ     1
SYS_WRITE       equ     4
STDOUT_FD       equ     1
ASCII_BUF_SIZE  equ     24              ; room for 20 digits + newline

;-----------------------------------------------------------------------
; _start - test driver
; Prints every 64-bit value stored in [testnumbers, testnumbersEND) as an
; unsigned decimal number, one per line, then exits with status 0.
; ABI:   Linux i386, raw int 0x80 system calls
; Stack: ASCII_BUF_SIZE bytes of scratch for the digit string
;-----------------------------------------------------------------------
_start:
        sub     esp, ASCII_BUF_SIZE     ; reserve the ASCII buffer on the stack

        mov     esi, testnumbers        ; esi = address of the current test value
testNumbersLoop:
        mov     eax, [esi]              ; edx:eax = value (low dword stored first)
        mov     edx, [esi+4]
        mov     edi, esp                ; edi = start of the output buffer
        call    integer64btoascii       ; edi comes back pointing past the last digit

        mov     byte [edi], 10          ; terminate the line with '\n'
        inc     edi

        mov     edx, edi
        sub     edx, esp                ; edx = number of bytes to write
        mov     ecx, esp                ; ecx = buffer address
        mov     ebx, STDOUT_FD
        mov     eax, SYS_WRITE
        int     0x80

        add     esi, 8                  ; step to the next 64-bit test value
        cmp     esi, testnumbersEND
        jb      testNumbersLoop

        add     esp, ASCII_BUF_SIZE     ; release the ASCII buffer
        xor     ebx, ebx                ; exit status 0 (ebx still held STDOUT_FD = 1,
                                        ; which made the process report failure)
        mov     eax, SYS_EXIT
        int     0x80
integer64btoascii:
; Convert the unsigned 64-bit number in edx:eax to decimal ASCII by counting
; how many times each table power of 10 can be subtracted from it.
; In:    edx:eax = number to convert, edi = buffer to output (at least 20B)
; Out:   edi pointing after last character
; Saved: eax, edx, esi, ecx are pushed on entry and popped before ret
; Uses:  pow10table .. pow10tableEND
; The digit store and pointer advance are branchless: each cmp sets CF and a
; later sbb/adc consumes it, relying on mov and lea leaving the flags alone.
; Do not reorder the instructions between a cmp and the sbb/adc that follows.
push eax
push edx
push esi
push ecx
mov ch,'1' ; test value for skipping leading zeroes: a digit is kept when cl >= ch
mov esi,pow10table
.nextPow10: ; [esi+4]:[esi] = 10^k
mov cl,'0'-1 ; ASCII digit counter; starts below '0' as the borrowing subtraction is counted too
.countPow10: ; subtract 10^k from edx:eax + count it
sub eax,[esi]
sbb edx,[esi+4]
inc cl ; preserves CF
jnc .countPow10 ; no borrow => 10^k fitted once more
; subtraction overflow, did "one too many" of them
add eax,[esi] ; restore edx:eax to previous value
adc edx,[esi+4]
cmp cl,ch ; CF=1 when cl < ch (leading zero to be skipped)
mov [edi],cl ; write the digit into output (a skipped digit is overwritten by the next write)
sbb edi,-1 ; edi += 1-CF: advance edi as needed (when cl>=ch)
cmp cl,ch ; set CF again, the sbb above changed it
lea esi,[esi+8] ; next power of 10 (lea does not touch CF)
adc ch,-1 ; ch += CF-1: unchanged while skipping, lowered after each kept digit
; => after the first non-zero digit ch <= '0', so later zero digits are kept
cmp esi,pow10tableEND
jb .nextPow10 ; until all powers of 10 were processed
cmp ch,'1' ; all zeroes output => edx:eax == 0, CF=0
sbb edi,-1 ; advance edi when CF=0 (all zeroes), keeping the last '0' written
pop ecx
pop esi
pop edx
pop eax
ret
section .rodata
; Powers of ten as 64-bit values in descending order, 10^19 down to 10^0.
; integer64btoascii reads each entry as [esi] = low dword, [esi+4] = high dword
; and stops when esi reaches pow10tableEND.
pow10table:
dq 10000000000000000000 ; 10^19
dq 1000000000000000000 ; 10^18
dq 100000000000000000 ; 10^17
dq 10000000000000000 ; 10^16
dq 1000000000000000 ; 10^15
dq 100000000000000 ; 10^14
dq 10000000000000 ; 10^13
dq 1000000000000 ; 10^12
dq 100000000000 ; 10^11
dq 10000000000 ; 10^10
dq 1000000000 ; 10^9
dq 100000000 ; 10^8
dq 10000000 ; 10^7
dq 1000000 ; 10^6
dq 100000 ; 10^5
dq 10000 ; 10^4
dq 1000 ; 10^3
dq 100 ; 10^2
dq 10 ; 10^1
dq 1 ; 10^0
pow10tableEND:
; 64-bit test inputs; _start prints every value from testnumbers up to
; (not including) testnumbersEND.
testnumbers:
dq ~0 ; max 2^64-1 = 18446744073709551615
dq 0 ; looks like zero to me
dd 0, 1 ; 2^32 = 4294967296 (eax = 0)
dq 1234567890 ; < 2^32 (edx = 0)
dq 10000000000000000000 ; largest 10^k to fit into 64b
dq 9999999999999999999 ; to verify "9"
dq 10200300040000500000 ; to verify "0" in-between/at-end
testnumbersEND:
BTW, if the CPU will not stall too much over partial-reg ch vs cl clashing (IMHO shouldn't, as the update of value is a bit apart and will collide only rarely), then I believe the second version will perform better than first one, as the branching is lot more simplified.
But "believe" is the keyword here, if you are after performance, profile! (and use "align 8 or 16 (or maybe just 4)" on key loops, verify listing+performance which one is better)
One more version, probably sort of simpler to understand the leading zeroes test logic, plus this can be extended to 128b integers easily (ebx and ebp are spared, so they may hold another 64b of input number), and to any other arbitrary number of bits, when the number + subtracting is in memory (as 256b will not fit into registers in 32b x86 mode). This can be also modified to work in 64b mode with more registers with only few changes (of course all of these require a much larger table of 10^k powers, up to the maximum one for the desired number of bits).
I'm posting this one, as I got really happy about how I solved the "zero" puzzle finally - which was annoying me whole weekend.
In the end an elegant (and faster) solution exists: just stop the subtraction after the 10^1 power, so eax is left with a value 0-9. Then just write that value into the output without any "skip" test, as in the case of a non-zero number it belongs to the proper output, and in the case of edx:eax == 0 input it will create the single '0' char. Duh!
Plus I managed to change "skip leading zeroes" logic to survive any number of digits produced, not only 48.
section .text
global  _start                          ; exported so ld can resolve the entry point

; Linux i386 (int 0x80) constants used by the test driver
SYS_EXIT        equ     1
SYS_WRITE       equ     4
STDOUT_FD       equ     1
ASCII_BUF_SIZE  equ     24              ; room for 20 digits + newline

;-----------------------------------------------------------------------
; _start - test driver
; Prints every 64-bit value stored in [testnumbers, testnumbersEND) as an
; unsigned decimal number, one per line, then exits with status 0.
; ABI:   Linux i386, raw int 0x80 system calls
; Stack: ASCII_BUF_SIZE bytes of scratch for the digit string
;-----------------------------------------------------------------------
_start:
        sub     esp, ASCII_BUF_SIZE     ; reserve the ASCII buffer on the stack

        mov     esi, testnumbers        ; esi = address of the current test value
testNumbersLoop:
        mov     eax, [esi]              ; edx:eax = value (low dword stored first)
        mov     edx, [esi+4]
        mov     edi, esp                ; edi = start of the output buffer
        call    integer64btoascii       ; edi comes back pointing past the last digit

        mov     byte [edi], 10          ; terminate the line with '\n'
        inc     edi

        mov     edx, edi
        sub     edx, esp                ; edx = number of bytes to write
        mov     ecx, esp                ; ecx = buffer address
        mov     ebx, STDOUT_FD
        mov     eax, SYS_WRITE
        int     0x80

        add     esi, 8                  ; step to the next 64-bit test value
        cmp     esi, testnumbersEND
        jb      testNumbersLoop

        add     esp, ASCII_BUF_SIZE     ; release the ASCII buffer
        xor     ebx, ebx                ; exit status 0 (ebx still held STDOUT_FD = 1,
                                        ; which made the process report failure)
        mov     eax, SYS_EXIT
        int     0x80
integer64btoascii:
; Convert the unsigned 64-bit number in edx:eax to decimal ASCII by counting
; how many times each table power of 10 (down to 10^1) can be subtracted; the
; remainder 0..9 left in eax is then written as the final digit.
; In:    edx:eax = number to convert, edi = buffer to output (at least 20B)
; Out:   edi pointing after last character
; Saved: eax, edx, esi, ecx are pushed on entry and popped before ret
; Uses:  pow10table .. pow10tableEND
; The pointer advance is branchless: cmp sets CF and the later sbb consumes it,
; relying on mov and lea leaving the flags alone. Do not reorder the
; instructions between the cmp and the sbb.
push eax
push edx
push esi
push ecx
mov ch,'0' ; test value to detect leading zeroes (stays '0' until a non-zero digit is merged in)
mov esi,pow10table
.nextPow10: ; [esi+4]:[esi] = 10^k
mov cl,'0'-1 ; count of 10^k power in ASCII digit (starts below '0', the borrowing subtraction is counted too)
.countPow10: ; subtract 10^k from edx:eax + count it
sub eax,[esi]
sbb edx,[esi+4]
inc cl ; preserves CF
jnc .countPow10 ; loop till subtraction overflows
; subtraction overflow, did "one too many" of them
or ch,cl ; merge digit into test_leading_zeroes ('0' OR any of '1'..'9' is >= '1')
add eax,[esi] ; restore edx:eax to previous value
adc edx,[esi+4]
cmp ch,'1' ; test is still '0'? => CF=1
mov [edi],cl ; write the digit into output (a skipped digit is overwritten by the next write)
lea esi,[esi+8] ; next power of 10 (lea does not touch CF)
sbb edi,-1 ; edi += 1-CF: advance edi as needed (test value > '0')
cmp esi,pow10tableEND
jb .nextPow10 ; until all table powers of 10 were processed
or al,'0' ; remaining eax = 0..9, convert to ASCII
mov [edi],al ; store last digit
inc edi ; last digit will advance edi always
pop ecx
pop esi
pop edx
pop eax
ret
section .rodata
; Powers of ten as 64-bit values in descending order, 10^19 down to 10^1.
; 10^0 is deliberately absent: integer64btoascii writes the remainder left
; in eax as the last digit once esi reaches pow10tableEND.
pow10table:
dq 10000000000000000000 ; 10^19
dq 1000000000000000000 ; 10^18
dq 100000000000000000 ; 10^17
dq 10000000000000000 ; 10^16
dq 1000000000000000 ; 10^15
dq 100000000000000 ; 10^14
dq 10000000000000 ; 10^13
dq 1000000000000 ; 10^12
dq 100000000000 ; 10^11
dq 10000000000 ; 10^10
dq 1000000000 ; 10^9
dq 100000000 ; 10^8
dq 10000000 ; 10^7
dq 1000000 ; 10^6
dq 100000 ; 10^5
dq 10000 ; 10^4
dq 1000 ; 10^3
dq 100 ; 10^2
dq 10 ; 10^1
pow10tableEND:
; 64-bit test inputs; _start prints every value from testnumbers up to
; (not including) testnumbersEND.
testnumbers:
dq ~0 ; max 2^64-1 = 18446744073709551615
dq 0 ; looks like zero to me
dd 0, 1 ; 2^32 = 4294967296 (eax = 0)
dq 1234567890 ; < 2^32 (edx = 0)
dq 10000000000000000000 ; largest 10^k to fit into 64b
dq 9999999999999999999 ; to verify "9"
dq 10200300040000500000 ; to verify "0" in-between/at-end
testnumbersEND:
For better performance it should be possible to do the inversion-value (reciprocal) multiplications to get div 10 by imul, but that's over my head.
These three versions are probably simple enough for somebody learning Assembly to understand them, plus they illustrate the progress of mind over couple of days.
来源:https://stackoverflow.com/questions/41084951/edx-eax-register-pair-divison-resulting-in-big-quotient