EDX-EAX register pair division resulting in big quotient

核能气质少年 提交于 2019-12-11 05:28:52

问题


If I have a 64 bit number in the EDX-EAX, and I divide it with a relatively small number, the quotient may become a number bigger than 32 bits. So at that point the div operator only sets the carry flag?

My problem is, that I would like to process a number in the EDX-EAX and write it out digit per digit, so in this case I would have to divide the number in EDX-EAX by 10 to get the last digit.


回答1:


No. DIV with a 64b dividend and a 32b divisor has a maximum quotient of 2^32-1.

Overflow is indicated with the #DE (divide error) exception rather than with the CF flag.

If there's some limit for the 64b number, so that it does not use the full 64b (like 2^61 max), then you may first split it by div 10^9 (the nearest power of ten below 2^32) and then do the two "halves" separately by div 10. But as Jester noted, the 64b div is so slow that doing the sub with powers of 10 sounds like a better idea, and the code will be simpler too.


because if it's so simple, then why not to add the code, right? Will be ~5min ... ~60min later (and I'm not very happy about it, I think it may be done in a bit more elegant way with shorter code ... not bothering with performance in either case, this one can be optimized for sure, at minimal to align loops where it matters, but it at least works, so it can be your "reference version" to compare/verify with)...

NASM 32b linux executable, save into uint64toascii.asm
to build: nasm -f elf *.asm; ld -m elf_i386 -s -o uint64toascii *.o

; Linux i386 (int 0x80) constants used by the test driver.
SYS_EXIT    equ 1               ; syscall number: exit
SYS_WRITE   equ 4               ; syscall number: write
STDOUT      equ 1               ; file descriptor of standard output
BUF_SIZE    equ 24              ; room for 20 digits + newline, rounded up

section .text
    global _start               ; export the entry point symbol for the linker

;-----------------------------------------------------------------------
; _start - test driver (NASM, 32-bit Linux, int 0x80 ABI)
; Converts every 64-bit value in [testnumbers, testnumbersEND) to decimal
; with integer64btoascii, prints each one followed by a newline, and
; exits with status 0.
;-----------------------------------------------------------------------
_start:
    ; reserve a temporary ASCII buffer on the stack
    sub     esp,BUF_SIZE
    mov     esi,testnumbers     ; esi = pointer to the current test value
testNumbersLoop:
    mov     eax,[esi]           ; edx:eax = 64-bit value (low dword first)
    mov     edx,[esi+4]
    mov     edi,esp             ; edi = start of output buffer
    call    integer64btoascii   ; returns edi just past the last digit
    ; terminate the line
    mov     byte [edi],10
    inc     edi
    ; write(STDOUT, buffer, length)
    mov     edx,edi
    sub     edx,esp             ; edx = number of bytes produced
    mov     ecx,esp             ; ecx = buffer address
    mov     ebx,STDOUT
    mov     eax,SYS_WRITE
    int     0x80
    ; advance to the next 8-byte test value
    add     esi,8
    cmp     esi,testnumbersEND
    jb      testNumbersLoop
    ; exit(0)
    add     esp,BUF_SIZE        ; release the temporary buffer
    xor     ebx,ebx             ; exit status 0 (ebx still held STDOUT = 1)
    mov     eax,SYS_EXIT
    int     0x80

;-----------------------------------------------------------------------
; integer64btoascii - unsigned 64-bit integer to decimal ASCII
; In:    edx:eax = value to convert
;        edi     = output buffer (at least 20 bytes)
; Out:   edi     = address just past the last digit written
; Saves: eax, edx, esi, ebx, ebp, ecx (all restored before return)
; Uses:  pow10table - descending 64-bit powers of ten ending with a 0 qword
; Method: repeated subtraction of 10^k, counting subtractions per digit.
;-----------------------------------------------------------------------
integer64btoascii:
    push    eax
    push    edx
    push    esi
    push    ebx
    push    ebp
    push    ecx
    ; a zero input produces no digits in the main loop, handle it apart
    mov     esi,eax
    or      esi,edx
    jz      .isZero
    ; find the largest 10^k that is <= edx:eax (skips leading zeroes)
    mov     esi,pow10table
.probePower:
    cmp     edx,[esi+4]         ; compare high dwords first
    jb      .powerTooLarge
    ja      .loadPower
    cmp     eax,[esi]           ; high dwords equal: low dwords decide
    jae     .loadPower
.powerTooLarge:
    add     esi,8
    jmp     .probePower
    ; from here on every remaining power of ten yields one output digit
.loadPower:
    mov     ebx,[esi]
    mov     ebp,[esi+4]         ; ebp:ebx = current 10^k
    test    ebx,ebx
    jz      .done               ; only the terminator has a zero low dword
    add     esi,8               ; esi -> next (smaller) power
    mov     cl,'0'              ; cl = ASCII digit being counted
.fitCheck:
    cmp     edx,ebp
    jb      .emitDigit          ; remainder < 10^k: digit complete
    ja      .subtractOnce
    cmp     eax,ebx
    jb      .emitDigit
.subtractOnce:
    sub     eax,ebx
    sbb     edx,ebp             ; edx:eax -= 10^k
    inc     cl
    jmp     .fitCheck
.emitDigit:
    mov     [edi],cl
    inc     edi
    jmp     .loadPower

.isZero:
    mov     byte [edi],'0'
    inc     edi

.done:
    pop     ecx
    pop     ebp
    pop     ebx
    pop     esi
    pop     edx
    pop     eax
    ret

section .rodata

; Powers of ten as 64-bit little-endian values, largest first.
; The list is closed by a zero qword, which the conversion routine
; recognises by its zero low dword.
pow10table:
    dq  10_000_000_000_000_000_000  ; 10^19
    dq  1_000_000_000_000_000_000   ; 10^18
    dq  100_000_000_000_000_000     ; 10^17
    dq  10_000_000_000_000_000      ; 10^16
    dq  1_000_000_000_000_000       ; 10^15
    dq  100_000_000_000_000         ; 10^14
    dq  10_000_000_000_000          ; 10^13
    dq  1_000_000_000_000           ; 10^12
    dq  100_000_000_000             ; 10^11
    dq  10_000_000_000              ; 10^10
    dq  1_000_000_000               ; 10^9
    dq  100_000_000                 ; 10^8
    dq  10_000_000                  ; 10^7
    dq  1_000_000                   ; 10^6
    dq  100_000                     ; 10^5
    dq  10_000                      ; 10^4
    dq  1_000                       ; 10^3
    dq  100                         ; 10^2
    dq  10                          ; 10^1
    dq  1                           ; 10^0
    dq  0                           ; end-of-table sentinel

; 64-bit test inputs for the driver loop, 8 bytes each;
; testnumbersEND marks the first byte past the list.
testnumbers:
    dq  0xFFFF_FFFF_FFFF_FFFF       ; 2^64-1 = 18446744073709551615
    dq  0                           ; zero
    dq  0x1_0000_0000               ; 2^32 = 4294967296 (low dword 0, high dword 1)
    dq  1_234_567_890               ; fits in 32 bits (edx = 0)
    dq  9_999_999_999_999_999_999   ; nineteen nines
    dq  101_001_000_100_101         ; zeroes between non-zero digits
testnumbersEND:

You can try it live on web at http://www.tutorialspoint.com/compile_assembly_online.php (copy the source there)


And because I didn't like that first version too much, I kept playing with it a bit, mostly trying to achieve a short (LoC) code, not particularly caring about performance or code size (too lazy to measure anything except number of lines in editor).

Build command line + live demo same as in previous case:

; Linux i386 (int 0x80) constants used by the test driver.
SYS_EXIT    equ 1               ; syscall number: exit
SYS_WRITE   equ 4               ; syscall number: write
STDOUT      equ 1               ; file descriptor of standard output
BUF_SIZE    equ 24              ; room for 20 digits + newline, rounded up

section .text
    global _start               ; export the entry point symbol for the linker

;-----------------------------------------------------------------------
; _start - test driver (NASM, 32-bit Linux, int 0x80 ABI)
; Prints every 64-bit value in [testnumbers, testnumbersEND) in decimal,
; one per line, using integer64btoascii, then exits with status 0.
;-----------------------------------------------------------------------
_start:
    ; reserve a temporary ASCII buffer on the stack
    sub     esp,BUF_SIZE
    mov     esi,testnumbers     ; esi = pointer to the current test value
testNumbersLoop:
    mov     eax,[esi]           ; edx:eax = 64-bit value (low dword first)
    mov     edx,[esi+4]
    mov     edi,esp             ; edi = start of output buffer
    call    integer64btoascii   ; returns edi just past the last digit
    ; terminate the line
    mov     byte [edi],10
    inc     edi
    ; write(STDOUT, buffer, length)
    mov     edx,edi
    sub     edx,esp             ; edx = number of bytes produced
    mov     ecx,esp             ; ecx = buffer address
    mov     ebx,STDOUT
    mov     eax,SYS_WRITE
    int     0x80
    ; advance to the next 8-byte test value
    add     esi,8
    cmp     esi,testnumbersEND
    jb      testNumbersLoop
    ; exit(0)
    add     esp,BUF_SIZE        ; release the temporary buffer
    xor     ebx,ebx             ; exit status 0 (ebx still held STDOUT = 1)
    mov     eax,SYS_EXIT
    int     0x80

integer64btoascii:
    ; Convert an unsigned 64-bit integer to decimal ASCII by repeatedly
    ; subtracting powers of ten, suppressing leading zeroes without branches.
    ; In:   edx:eax = number to convert, edi = buffer to output (at least 20B)
    ; Out:  edi pointing after last character
    ; eax, edx, esi and ecx are saved and restored; flags are modified.
    ; Reads pow10table .. pow10tableEND (10^19 down to 10^0, 8 bytes each).
    push    eax
    push    edx
    push    esi
    push    ecx
    mov     ch,'1'          ; threshold: a digit is kept only when cl >= ch
    mov     esi,pow10table
.nextPow10:                 ; [esi+4]:[esi] = 10^k
    mov     cl,'0'-1        ; start one below '0': the loop also counts
                            ; the final subtraction that borrows
.countPow10:                ; subtract 10^k from edx:eax + count it
    sub     eax,[esi]
    sbb     edx,[esi+4]
    inc     cl              ; preserves CF
    jnc     .countPow10     ; repeat until the 64-bit subtraction borrows
    ; subtraction overflow, did "one too many" of them
    add     eax,[esi]       ; restore edx:eax to previous value
    adc     edx,[esi+4]
    cmp     cl,ch           ; CF=1 when cl < ch (a leading zero)
    mov     [edi],cl        ; write the digit into output (mov keeps flags)
    sbb     edi,-1          ; edi += 1 - CF: advance only when cl >= ch,
                            ; otherwise the next digit overwrites this one
    cmp     cl,ch           ; redo the compare, sbb above changed the flags
    lea     esi,[esi+8]     ; next power of 10 (lea keeps flags)
    adc     ch,-1           ; ch += CF - 1: unchanged for a leading zero,
                            ; else decremented so all later digits pass
    cmp     esi,pow10tableEND
    jb      .nextPow10      ; until all powers of 10 were processed
    cmp     ch,'1'          ; ch still '1' => every digit was zero
                            ; (edx:eax == 0), and CF=0
    sbb     edi,-1          ; advance edi when CF=0, keeping the single '0'
                            ; stored last; no change otherwise
    pop     ecx
    pop     esi
    pop     edx
    pop     eax
    ret

section .rodata

; Powers of ten as 64-bit little-endian values, largest first.
; pow10tableEND marks the first byte past the table.
pow10table:
    dq  10_000_000_000_000_000_000  ; 10^19
    dq  1_000_000_000_000_000_000   ; 10^18
    dq  100_000_000_000_000_000     ; 10^17
    dq  10_000_000_000_000_000      ; 10^16
    dq  1_000_000_000_000_000       ; 10^15
    dq  100_000_000_000_000         ; 10^14
    dq  10_000_000_000_000          ; 10^13
    dq  1_000_000_000_000           ; 10^12
    dq  100_000_000_000             ; 10^11
    dq  10_000_000_000              ; 10^10
    dq  1_000_000_000               ; 10^9
    dq  100_000_000                 ; 10^8
    dq  10_000_000                  ; 10^7
    dq  1_000_000                   ; 10^6
    dq  100_000                     ; 10^5
    dq  10_000                      ; 10^4
    dq  1_000                       ; 10^3
    dq  100                         ; 10^2
    dq  10                          ; 10^1
    dq  1                           ; 10^0
pow10tableEND:

; 64-bit test inputs for the driver loop, 8 bytes each;
; testnumbersEND marks the first byte past the list.
testnumbers:
    dq  0xFFFF_FFFF_FFFF_FFFF       ; 2^64-1 = 18446744073709551615
    dq  0                           ; zero
    dq  0x1_0000_0000               ; 2^32 = 4294967296 (eax = 0)
    dq  1_234_567_890               ; fits in 32 bits (edx = 0)
    dq  10_000_000_000_000_000_000  ; largest 10^k that fits into 64 bits
    dq  9_999_999_999_999_999_999   ; checks the digit '9'
    dq  10_200_300_040_000_500_000  ; checks '0' in between and at the end
testnumbersEND:

BTW, if the CPU will not stall too much over partial-reg ch vs cl clashing (IMHO shouldn't, as the update of value is a bit apart and will collide only rarely), then I believe the second version will perform better than first one, as the branching is lot more simplified.

But "believe" is the keyword here, if you are after performance, profile! (and use "align 8 or 16 (or maybe just 4)" on key loops, verify listing+performance which one is better)


One more version, probably sort of simpler to understand the leading zeroes test logic, plus this can be extended to 128b integers easily (ebx and ebp are spared, so they may hold another 64b of input number), and to any other arbitrary number of bits, when the number + subtracting is in memory (as 256b will not fit into registers in 32b x86 mode). This can be also modified to work in 64b mode with more registers with only few changes (of course all of these require a much larger table of 10^k powers, up to the maximum one for the desired number of bits).

I'm posting this one, as I got really happy about how I solved the "zero" puzzle finally - which was annoying me whole weekend.

In the end an elegant (and faster) solution exists: just exit the subtraction loop already after the 10^1 power, so eax is left with a value 0-9. Then just write that value into the output without any "skip" test, as in the case of a non-zero number it belongs to the proper output, and in the case of an edx:eax == 0 input it will create the single '0' char. Duh!

Plus I managed to change "skip leading zeroes" logic to survive any number of digits produced, not only 48.

; Linux i386 (int 0x80) constants used by the test driver.
SYS_EXIT    equ 1               ; syscall number: exit
SYS_WRITE   equ 4               ; syscall number: write
STDOUT      equ 1               ; file descriptor of standard output
BUF_SIZE    equ 24              ; room for 20 digits + newline, rounded up

section .text
    global _start               ; export the entry point symbol for the linker

;-----------------------------------------------------------------------
; _start - test driver (NASM, 32-bit Linux, int 0x80 ABI)
; Prints every 64-bit value in [testnumbers, testnumbersEND) in decimal,
; one per line, using integer64btoascii, then exits with status 0.
;-----------------------------------------------------------------------
_start:
    ; reserve a temporary ASCII buffer on the stack
    sub     esp,BUF_SIZE
    mov     esi,testnumbers     ; esi = pointer to the current test value
testNumbersLoop:
    mov     eax,[esi]           ; edx:eax = 64-bit value (low dword first)
    mov     edx,[esi+4]
    mov     edi,esp             ; edi = start of output buffer
    call    integer64btoascii   ; returns edi just past the last digit
    ; terminate the line
    mov     byte [edi],10
    inc     edi
    ; write(STDOUT, buffer, length)
    mov     edx,edi
    sub     edx,esp             ; edx = number of bytes produced
    mov     ecx,esp             ; ecx = buffer address
    mov     ebx,STDOUT
    mov     eax,SYS_WRITE
    int     0x80
    ; advance to the next 8-byte test value
    add     esi,8
    cmp     esi,testnumbersEND
    jb      testNumbersLoop
    ; exit(0)
    add     esp,BUF_SIZE        ; release the temporary buffer
    xor     ebx,ebx             ; exit status 0 (ebx still held STDOUT = 1)
    mov     eax,SYS_EXIT
    int     0x80

integer64btoascii:
    ; Convert an unsigned 64-bit integer to decimal ASCII by repeatedly
    ; subtracting the powers of ten 10^19 .. 10^1; whatever remains in eax
    ; (0..9) is the final digit and is always written, so a zero input
    ; yields a single '0'.
    ; In:   edx:eax = number to convert, edi = buffer to output (at least 20B)
    ; Out:  edi pointing after last character
    ; eax, edx, esi and ecx are saved and restored; flags are modified.
    ; Reads pow10table .. pow10tableEND (8 bytes per entry).
    push    eax
    push    edx
    push    esi
    push    ecx
    mov     ch,'0'          ; test value to detect leading zeroes: the OR of
                            ; all digits so far, '0' while all were zero
    mov     esi,pow10table
.nextPow10:                 ; [esi+4]:[esi] = 10^k
    mov     cl,'0'-1        ; count of 10^k power in ASCII digit; starts one
                            ; below '0' as the borrowing pass is counted too
.countPow10:                ; subtract 10^k from edx:eax + count it
    sub     eax,[esi]
    sbb     edx,[esi+4]
    inc     cl              ; preserves CF
    jnc     .countPow10     ; loop till subtraction overflows
    ; subtraction overflow, did "one too many" of them
    or      ch,cl           ; merge digit into test_leading_zeroes
    add     eax,[esi]       ; restore edx:eax to previous value
    adc     edx,[esi+4]
    cmp     ch,'1'          ; test is still '0'? => CF=1
    mov     [edi],cl        ; write the digit into output (mov keeps flags)
    lea     esi,[esi+8]     ; next power of 10 (lea keeps flags)
    sbb     edi,-1          ; edi += 1 - CF: advance edi as needed
                            ; (test value > '0'), else digit is overwritten
    cmp     esi,pow10tableEND
    jb      .nextPow10      ; until all table powers of 10 were processed
    or      al,'0'          ; remaining eax = 0..9, convert to ASCII
    mov     [edi],al        ; store last digit
    inc     edi             ; last digit will advance edi always
    pop     ecx
    pop     esi
    pop     edx
    pop     eax
    ret

section .rodata

; Powers of ten as 64-bit little-endian values, largest first.
; The table stops at 10^1: the conversion routine takes the last digit
; directly from the remainder. pow10tableEND marks the end of the table.
pow10table:
    dq  10_000_000_000_000_000_000  ; 10^19
    dq  1_000_000_000_000_000_000   ; 10^18
    dq  100_000_000_000_000_000     ; 10^17
    dq  10_000_000_000_000_000      ; 10^16
    dq  1_000_000_000_000_000       ; 10^15
    dq  100_000_000_000_000         ; 10^14
    dq  10_000_000_000_000          ; 10^13
    dq  1_000_000_000_000           ; 10^12
    dq  100_000_000_000             ; 10^11
    dq  10_000_000_000              ; 10^10
    dq  1_000_000_000               ; 10^9
    dq  100_000_000                 ; 10^8
    dq  10_000_000                  ; 10^7
    dq  1_000_000                   ; 10^6
    dq  100_000                     ; 10^5
    dq  10_000                      ; 10^4
    dq  1_000                       ; 10^3
    dq  100                         ; 10^2
    dq  10                          ; 10^1
pow10tableEND:

; 64-bit test inputs for the driver loop, 8 bytes each;
; testnumbersEND marks the first byte past the list.
testnumbers:
    dq  0xFFFF_FFFF_FFFF_FFFF       ; 2^64-1 = 18446744073709551615
    dq  0                           ; zero
    dq  0x1_0000_0000               ; 2^32 = 4294967296 (eax = 0)
    dq  1_234_567_890               ; fits in 32 bits (edx = 0)
    dq  10_000_000_000_000_000_000  ; largest 10^k that fits into 64 bits
    dq  9_999_999_999_999_999_999   ; checks the digit '9'
    dq  10_200_300_040_000_500_000  ; checks '0' in between and at the end
testnumbersEND:

For better performance it should be possible to do the inversion-value (reciprocal) multiplications to get div 10 by imul, but that's over my head.

These three versions are probably simple enough for somebody learning Assembly to understand them, plus they illustrate the progress of mind over couple of days.



来源:https://stackoverflow.com/questions/41084951/edx-eax-register-pair-divison-resulting-in-big-quotient

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!