Cube root on x87 FPU using Newton-Raphson method

问题

I am trying to write an assembly program using the 8086 processor that will find the cube root of a number. Obviously I am using floating points.

Algorithm based upon Newton-Raphson method:

root := 1.0; 
repeat
     oldRoot := root;
     root := (2.0*root + x/(root*root)) / 3.0 
until ( |root – oldRoot| < 0.001 );

How do I divide (2*root + x) by (root*root)?

.586
.MODEL FLAT
.STACK 4096

.DATA
root    REAL4   1.0
oldRoot REAL4   2.0
Two     REAL4   2.0
inttwo  DWORD   2
itThree DWORD   3
three   REAL4   3.0
x       DOWRD   27


.CODE
; main: first attempt at one Newton-Raphson step for the cube root of x.
; It only forms 2.0*root + x and then starts on root*root; there is no
; division, no store of the result and no loop.
main    PROC
        finit           ; initialize FPU (register stack starts empty)
        fld     root    ; push root: ST(0) = root
        fmul    two     ; ST(0) = root*2.0 (memory operand, stack depth unchanged)
        fadd    x       ; ST(0) = root*2.0 + x; NOTE(review): x is declared as an integer 27 in .DATA, but fadd reads a memory operand as a float -- confirm intent

        fld     root    ; push root again: ST(0) = root, ST(1) = result of the fadd above
        fimul    root    ; NOTE(review): fimul treats its memory operand as an integer, but root is REAL4, so this is not root*root

        mov     eax, 0  ; eax = 0 before returning
        ret
main    ENDP 
END

I guess I don't understand what is in the stack at what location. Does the product for line

fimul root ; root*root

go into ST(1)? EDIT No, it goes into st(0) what was in st(0) got pushed down the stack to st(1)

But I haven't figured out the answer to my question... How do I divide? Now I see I need to divide st(1) by st(0) but I don't know how. I tried this.

finit           ; initialize FPU
fld     root    ; root in ST
fmul    two     ; root*two
fadd    xx      ; root*two+27
; the answer to root*two+x is stored in ST(0) when we load root st(0) moves to ST1 and we will use ST0 for the next operation

fld     root    ; root in ST previous content is now in ST1
fimul   root    ; root*root
fidiv   st(1)

EDIT: I had the formula written wrong. This is what I am looking for.

((2.0*root) + x / (root*root)) / 3.0 -- that's what I need.
STEP 1) (2 * root) 
STEP 2) x / (root * root) 
STEP 3) ADD step one and step 2 
STEP 4) divide step 3 by 3.0

root = ((2.0*1.0) + 27/(1.0*1.0)) / 3.0 ; (2 + 27/1.0) / 3.0 = 29/3 ≈ 9.667 ==> root ≈ 9.667

EDIT2: NEW CODE!!

.586
.MODEL FLAT
.STACK 4096

.DATA
root    REAL4   1.0
oldRoot REAL4   2.0
Two     REAL4   2.0
three   REAL4   3.0
xx      REAL4   27.0


.CODE
; main: second attempt -- evaluates ((2.0*root) + x/(root*root)) / 3.0 on the
; x87 register stack. The stack annotations below are for the first pass,
; with root = 1.0 and xx = 27.0.
; As written, nothing is ever popped and root is never stored back to memory,
; so every pass leaves four more values on the register stack. The jmp is
; unconditional, so the mov/ret after it are never reached.
main    PROC
        finit           ; initialize FPU (register stack starts empty)
                fld     root    ; ST(0)=1.0 (root), loaded once before the loop
repreatAgain:
        ;fld    st(2)

        fmul    two     ; ST(0)=2.0 (2.0*root); on pass 2 ST(0) is the previous pass's result times 2.0 (19.333333 observed)

        ; fld pushes: the 2.0*root product moves down to ST(1) and root becomes ST(0)
        fld     root    ; ST(0)=1.0 ST(1)=2.0
        fmul    st(0), st(0)    ; ST(0)=1.0 (root*root) ST(1)=2.0
        fld     xx                                                          ; ST(0)=27.0 ST(1)=1.0 ST(2)=2.0
        fdiv    st(0), st(1) ; ST(0)=27.0 (x/(root*root)) ST(1)=1.0 ST(2)=2.0
        fadd    st(0), st(2) ; ST(0)=29.0 ((2.0*root) + x/(root*root)) ST(1)=1.0 ST(2)=2.0

        fld     three                                                       ; ST(0)=3.0 ST(1)=29.0 ST(2)=1.0 ST(3)=2.0

        fld     st(1)                                                       ; ST(0)=29.0 ST(1)=3.0 ST(2)=29.0 ST(3)=1.0 ST(4)=2.0
        fdiv    st(0), st(1) ; ST(0)=9.6666667 (new root) ST(1)=3.0 ST(2)=29.0 ST(3)=1.0 ST(4)=2.0



        jmp     repreatAgain    ; unconditional: there is no convergence test yet
        mov     eax, 0  ; exit (unreachable as written)
        ret
main    ENDP 
END

回答1:

Intel's insn reference manual documents all the instructions, including fdiv and fdivr (x/y instead of y/x). If you really need to learn mostly-obsolete x87 (fdiv) instead of SSE2 (divss), then this x87 tutorial is essential reading, esp. the early chapter that explains the register stack. Also see this x87 FP comparison Q&A. See more links in the x86 tag wiki.

re: EDIT2 code dump:

You have 4 fld instructions inside the loop, but no p-suffixed operations, so every pass leaves 4 more values on the stack. Starting from the one value loaded before the loop, the 8-register FP stack overflows during the 2nd iteration, at which point you'll get a NaN (specifically, the indefinite-value NaN, which printf prints as 1#IND).

I'd suggest designing your loop so an iteration starts with root in st(0), and ends with the next iteration's root value in st(0). Don't load or store to/from root inside the loop. Use fld1 to load 1.0 as your initial value outside the loop, and fstp [root] after the loop to pop st(0) into memory.

You picked the most inconvenient way to do tmp / 3.0

                          ; stack = tmp   (and should otherwise be empty once you fix the rest of your code)
    fld     three         ; stack = 3.0, tmp
    fld     st(1)         ; stack = tmp, 3.0, tmp   ; should have used fxch to just swap instead of making the stack deeper
    fdiv    st(0), st(1)  ; stack = tmp/3.0, 3.0, tmp

fdiv, fsub, etc. have multiple register-register forms: one where st(0) is the destination, and one where it's the source. The form with st(0) as the source is also available with a pop, so you could

    fld     three         ; stack = 3.0, tmp
    fdivp                 ; stack = tmp / 3.0  popping the stack back to just one entry
    ; fdivp  st(1), st(0) ; this is what fdivp with no operands means

It's actually even simpler than that if you use a memory operand directly instead of loading it. Since you want st(0) /= 3.0, you can do fdiv [three]. In that case, FP ops are just like integer ops, where you can do div dword ptr [integer_from_memory] to use a memory source operand.

The non-commutative operations (subtract and divide) also have reverse versions (e.g. fdivr), which can save you an fxch or let you use a memory operand even if you'd needed 3.0/tmp instead of tmp/3.0

Dividing by 3 is the same as multiplying by 1/3, and fmul is much faster than fdiv. From a code-simplicity point of view, multiply is commutative, so another way to implement st(0) /= 3 is:

fld    [one_third]
fmulp                  ; shorthand for  fmulp st(1), st(0)

; or
fmul   [one_third]

Note that 1/3.0 has no exact representation in binary floating point, but all integers with magnitude up to 2^24 do (a single-precision REAL4 has a 24-bit significand: 23 stored bits plus an implicit leading 1). You should only care about this if you were expecting to work with exact multiples of three.

Comments on the original code:

You can hoist a division out of the loop by doing 2.0 / 3.0 and x/3.0 ahead of time. This is worth it if you expect the loop to run more than one iteration on average.

You can duplicate the top of the stack with fld st(0), so you don't have to keep loading from memory.

fimul [root] (integer mul) is a bug: Your root is in REAL4 (32bit float) format, not integer. fidiv is similarly a bug, and of course doesn't work with an x87 register as a source operand.

Since you have root at the top of the stack, I think you can just fmul st(0) to use st(0) as both the explicit and implicit operand, resulting in st(0) = st(0) * st(0), with no change in the depth of the stack.

You could also use sqrt as a better initial approximation than 1.0, or maybe +/-1 * sqrtf(fabsf(x)). I don't see an x87 instruction for applying the sign of one float to another, just fchs to unconditionally flip, and fabs to unconditionally clear the sign bit. There is an fcmov, but it requires a P6 or later CPU. You mentioned 8086, but then used .586, so IDK what you're targeting.

Better loop body:

Not debugged or tested, but your code full of repeated loads from the same data was making me crazy. This optimized version is here because I was curious, not because I think it's going to help the OP directly.

Also, hopefully this is a good example of how to comment the data flow in code where it's tricky. (e.g. x87, or vectorized code with shuffles).

; One Newton-Raphson step for the cube root, with the loop-invariant
; constants kept on the x87 register stack instead of being reloaded.
;
; Constants pushed once before the loop:
;   st(1) = 2.0/3.0
;   st(2) = x/3.0
;
; before each iteration: st(0) = root
;  after each iteration: st(0) = root * 2.0/3.0 + (x/3.0 / (root*root)), with constants undisturbed
;
; Stack comments list the registers from st(0) downwards.

loop_body:
    fld     st(0)         ; stack: root, root, 2/3, x/3            (duplicate root)
    fmul    st(0), st(0)  ; stack: root^2, root, 2/3, x/3
    fdivr   st(0), st(3)  ; stack: x/3 / root^2, root, 2/3, x/3     (reverse divide: st(0) = st(3) / st(0))
    fxch    st(1)         ; stack: root, x/3/root^2, 2/3, x/3       (swap only, depth unchanged)
    fmul    st(0), st(2)  ; stack: root*2/3, x/3/root^2, 2/3, x/3
    faddp                 ; stack: root*2/3 + x/3/root^2, 2/3, x/3  (faddp st(1), st(0): add and pop)

; TODO: compare and loop back to loop_body

    fstp    [root]         ; store the result and pop it
    fstp    st(0)          ; pop the two constants off the FP stack to empty it before returning
    fstp    st(0)
    ; finit is very slow, ~80cycles, don't use it if you don't have to.
    ; finit is very slow, ~80cycles, don't use it if you don't have to.

32bit function calling-conventions return FP results in st(0), so you could do that, but then the caller probably has to store it somewhere.

回答2:

I'm going to answer this on a very basic level for those people new to x87 who may be faced with a calculation that needs to be done on the FPU.

There are two things to consider. If you are given a calculation (INFIX notation) like:

root := (2.0*root + x/(root*root)) / 3.0

Is there a way to translate this into basic instructions that can be used by the x87 FPU? Yes, at a very basic level the x87 FPU is a stack that acts like a sophisticated RPN calculator. The equation in your code is INFIX notation. If you convert this to POSTFIX(RPN) notation, it can easily be implemented as a stack with operations.

This document provides some information on converting to POSTFIX notation. Following the rules your POSTFIX equivalent would look like:

2.0 root * x root root * / + 3.0 /

You could literally put that into an old RPN calculator (HPs) like the HP 15C using these keys where root=1 and x=27:

2.0 [enter] root * x [enter] root [enter] root * / + 3.0 /

The online HP 15C should show the result of that calculation being 9.667. Translating this to basic x87:

A number is a push to top of stack (fld)
A variable is a push to top of stack (fld)
* is fmulp (Multiply ST(1) by ST(0), store result in ST(1), and pop the register stack)
/ is fdivp (Divide ST(1) by ST(0), store result in ST(1), and pop the register stack)
+ is faddp (Add ST(0) to ST(1), store result in ST(1), and pop the register stack)
- is fsubp (Subtract ST(0) from ST(1), store result in ST(1), and pop register stack)

You can literally convert 2.0 root * x root root * / + 3.0 / to x87 instructions:

fld Two      ; st(0)=2.0
fld root     ; st(0)=root, st(1)=2.0
fmulp        ; st(0)=(2.0 * root)
fld xx       ; st(0)=x, st(1)=(2.0 * root)
fld root     ; st(0)=root, st(1)=x, st(2)=(2.0 * root)
fld root     ; st(0)=root, st(1)=root, st(2)=x, st(3)=(2.0 * root)
fmulp        ; st(0)=(root * root), st(1)=x, st(2)=(2.0 * root)
fdivp        ; st(0)=(x / (root * root)), st(1)=(2.0 * root)
faddp        ; st(0)=(2.0 * root) + (x / (root * root))
fld Three    ; st(0)=3.0, st(1)=(2.0 * root) + (x / (root * root))
fdivp        ; st(0)=((2.0 * root) + (x / (root * root))) / 3.0

Once you have the basics, you can move on to improving efficiency.

Regarding Edit 2 / Followup question

One thing to keep in mind is that if you don't use instructions that pop values off the stack, each iteration of the loop will consume more FPU stack slots. Generally the FPU instructions ending with P pop values off the stack. Because your loop doesn't use any instructions that remove items from the stack, the FPU stack keeps growing.

Unlike the program stack in user space, the FPU stack is very limited as it only has 8 slots. If you put more than 8 active values on the stack you will get overflow errors in the form of 1#IND. If we analyze your code and view the stack after each instruction we'd find this:

    fld     root            ; st(0)=root  
repreatAgain:
    fmul    two             ; st(0)=(2.0*root)      
    fld     root            ; st(0)=root, st(1)=(2.0*root) 
    fmul    st(0), st(0)    ; st(0)=(root*root), st(1)=(2.0*root)
    fld     xx              ; st(0)=x, st(1)=(root*root), st(2)=(2.0*root)
    fdiv    st(0), st(1)    ; st(0)=(x/(root*root)), st(1)=(root*root), st(2)=(2.0*root)
    fadd    st(0), st(2)    ; st(0)=((2.0*root) + x/(root*root)), st(1)=(root*root), st(2)=(2.0*root)
    fld     three           ; st(0)=3.0, st(1)=((2.0*root) + x/(root*root)), st(2)=(root*root), st(3)=(2.0*root)                                            
    fld     st(1)           ; st(0)=((2.0*root) + x/(root*root)), st(1)=3.0, st(2)=((2.0*root) + x/(root*root)), st(3)=(root*root), st(4)=(2.0*root)
    fdiv    st(0), st(1)    ; st(0)=(((2.0*root) + x/(root*root))/3.0), st(1)=3.0, st(2)=((2.0*root) + x/(root*root)), st(3)=(root*root), st(4)=(2.0*root)
    jmp     repreatAgain

Observe that after the last FDIV instruction and before the JMP we have 5 items on the stack (st(0) through st(4)). When we entered the loop we only had 1 which was root in st(0). The best way to resolve this is to use instructions in such a way that values get popped (removed) from the stack as the calculation progresses.

One other less efficient way is to free up the values we no longer want on the stack before repeating the loop. The FFREE instruction can be used for this purpose by manually marking the entries unused starting from the bottom of the stack. If you add these lines after the code above, and before the jmp repreatAgain, the code should work:

ffree   st(4)           ; st(0)=(((2.0*root) + x/(root*root))/3.0), st(1)=3.0, st(2)=((2.0*root) + x/(root*root)), st(3)=(root*root)
ffree   st(3)           ; st(0)=(((2.0*root) + x/(root*root))/3.0), st(1)=3.0, st(2)=((2.0*root) + x/(root*root))
ffree   st(2)           ; st(0)=(((2.0*root) + x/(root*root))/3.0), st(1)=3.0
ffree   st(1)           ; st(0)=(((2.0*root) + x/(root*root))/3.0)
fst     root            ; Update root variable
jmp     repreatAgain

With the use of the FFREE instruction we end the loop with only the new root in st(0).

I've also added fst root because of the way you did your calculation. Your calculation includes fld root which relies on the value in root being updated when each loop is finished. There is a more efficient way of doing this but I'm providing a fix that works in your current code without much reworking.

If you were to use the inefficient/simple code snippet I provided earlier to do the calculations you'd end up with code like this:

    ; One Newton-Raphson step per pass: root = ((2.0 * root) + (x / (root * root))) / 3.0.
    ; Every value pushed in a pass is popped again before the jmp, so the register
    ; stack is empty at the top of each pass. The jmp is unconditional: as shown,
    ; this loop has no exit test.
    finit        ; initialize FPU
repreatAgain:
    fld Two      ; st(0)=2.0
    fld root     ; st(0)=root, st(1)=2.0
    fmulp        ; st(0)=(2.0 * root)
    fld xx       ; st(0)=x, st(1)=(2.0 * root)
    fld root     ; st(0)=root, st(1)=x, st(2)=(2.0 * root)
    fld root     ; st(0)=root, st(1)=root, st(2)=x, st(3)=(2.0 * root)
    fmulp        ; st(0)=(root * root), st(1)=x, st(2)=(2.0 * root)
    fdivp        ; st(0)=(x / (root * root)), st(1)=(2.0 * root)
    faddp        ; st(0)=(2.0 * root) + (x / (root * root))
    fld Three    ; st(0)=3.0, st(1)=(2.0 * root) + (x / (root * root))
    fdivp        ; newroot = st(0)=((2.0 * root) + (x / (root * root))) / 3.0
    fstp root    ; Store result at top of stack into root and pop value
                 ;     at this point the stack is clear again since
                 ;     all items pushed have been popped.

    jmp repreatAgain

This code doesn't require FFREE because elements are popped off the stack as the calculation progresses. The instructions FADDP, FSUBP, FMULP, and FDIVP pop the value off the top of the stack in addition to performing their operation. This has the side effect of keeping the stack clear of the partial intermediate calculations.

Integrate a Loop

To integrate the loop into the simple/inefficient code I created earlier, you can use a variant of the FCOM (Floating point compare) instruction for the comparison. The result of the floating point compare is then transferred/converted to the regular CPU flags (EFLAGS). One can then use the regular conditional jump instructions to perform the conditional checks. The code could look like this:

epsilon REAL4   0.001

.CODE
; main: Newton-Raphson cube root of xx, one step per pass:
;     newroot = ((2.0 * root) + (x / (root * root))) / 3.0
; The loop repeats while epsilon <= |oldroot - newroot|. root is reloaded
; from and stored back to memory on every pass, and the register stack is
; empty again at the end of each pass.
main PROC
    finit              ; initialize FPU

repeatAgain:
    fld Two            ; st(0)=2.0
    fld root           ; st(0)=root, st(1)=2.0
    fmulp              ; st(0)=(2.0 * root)
    fld xx             ; st(0)=x, st(1)=(2.0 * root)
    fld root           ; st(0)=root, st(1)=x, st(2)=(2.0 * root)
    fld root           ; st(0)=root, st(1)=root, st(2)=x, st(3)=(2.0 * root)
    fmulp              ; st(0)=(root * root), st(1)=x, st(2)=(2.0 * root)
    fdivp              ; st(0)=(x / (root * root)), st(1)=(2.0 * root)
    faddp              ; st(0)=(2.0 * root) + (x / (root * root))
    fld Three          ; st(0)=3.0, st(1)=(2.0 * root) + (x / (root * root))
    fdivp              ; newroot=st(0)=((2.0 * root) + (x / (root * root))) / 3.0
    fld root           ; st(0)=oldroot, st(1)=newroot (root in memory still holds the old value)
    fsub st(0), st(1)  ; st(0)=(oldroot-newroot), st(1)=newroot
    fabs               ; st(0)=(|oldroot-newroot|), st(1)=newroot
    fld epsilon        ; st(0)=0.001, st(1)=(|oldroot-newroot|), st(2)=newroot
    fcompp             ; Compare st(0) with st(1), set the x87 condition codes,
                       ;     and pop both values: st(0)=newroot
    fstsw ax           ; Copy x87 Status Word containing the result to AX
    fwait              ; Ensure previous instruction completed
    sahf               ; Transfer condition codes to the CPU's flags register
                       ;     (C0 -> CF, C2 -> PF, C3 -> ZF)

    fstp root          ; Store result (newroot) at top of stack into root 
                       ;     and pop value. At this point the stack is clear
                       ;     again since all items pushed have been popped.
                       ;     fstp leaves the CPU flags set by sahf untouched.
    jbe repeatAgain    ; If 0.001 <= (|oldroot-newroot|) repeat
                       ;     NOTE(review): an unordered compare (NaN operand) also
                       ;     sets CF and ZF, so the loop would repeat -- confirm
    mov eax, 0         ; exit
    ret
main    ENDP 
END

Note: The usage of FCOMPP and manual transfer of x87 flags to CPU flags is driven by the fact that you have .586 directive at the top of your code. I'm making the assumption that because you didn't specify .686 or later that instructions like FCOMI are not available. If you were using .686 or later, then the bottom part of the code could have looked like:

fld root           ; st(0)=oldroot, st(1)=newroot
fsub st(0), st(1)  ; st(0)=(oldroot-newroot), st(1)=newroot
fabs               ; st(0)=(|oldroot-newroot|), st(1)=newroot
fld epsilon        ; st(0)=0.001, st(1)=(|oldroot-newroot|), st(2)=newroot
fcomip st(0),st(1) ; Do compare & set CPU flags, pop one value off stack
                   ;     st(0)=(|oldroot-newroot|), st(1)=newroot
fstp st(0)         ; Pop temporary value off top of stack
                   ;     st(0)=newroot

fstp root          ; Store result (newroot) at top of stack into root 
                   ;     and pop value. At this point the stack is clear
                   ;     again since all items pushed have been popped.
jbe repeatAgain    ; If 0.001 <= (|oldroot-newroot|) repeat

Quick method to creating RPN/Postfix from Infix notation

If learning to convert Infix notation to RPN/Postfix seems a bit daunting from the document I linked earlier in my answer, there is some relief. There are a number of websites that will do this work for you. One such site is MathBlog. Just enter your equation, click convert and it should show you the RPN/Postfix equivalent. It is limited to +-/*, parentheses and exponents with ^.

Optimizations

A big key to optimizing the code is to optimize the formula by separating the parts that remain constant between each loop from the parts that are variable. The constant parts can be computed before the loop begins.

Your original equation is this:

$$\text{root} := \frac{2.0 \cdot \text{root} + \dfrac{x}{\text{root} \cdot \text{root}}}{3.0}$$

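Separating the constants part we can arrive at:

$$\text{root} := \frac{2.0}{3.0} \cdot \text{root} + \frac{x / 3.0}{\text{root} \cdot \text{root}}$$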

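If we replace the constants with identifiers where twothirds = 2.0/3.0, and xover3 = x/3 then we end up with a simplified equation that looks like this:

$$\text{root} := \text{twothirds} \cdot \text{root} + \frac{\text{xover3}}{\text{root} \cdot \text{root}}$$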

If we convert that to POSTFIX/RPN then we get:

twothirds root * xover3 root root * / +

A similar optimization is what Peter is taking advantage of in his answer under the section Better loop body. He places the constants Twothirds and Xover3 onto the x87 FPU stack outside the loop, and references them as needed inside the loop. This avoids having to reread them unnecessarily from memory each time through the loop.

A more complete example based upon the optimization above:

.586
.MODEL FLAT
.STACK 4096

.DATA
xx        REAL4   27.0
root      REAL4   1.0
Three     REAL4   3.0
epsilon   REAL4   0.001
Twothirds REAL4 0.6666666666666666

.CODE
; main: Newton-Raphson cube root of xx, iterating
;     root = twothirds*root + (x/3)/(root*root)
; while epsilon <= |prevroot - newroot|. The loop-invariant values (x/3),
; (2/3) and epsilon stay on the x87 register stack for the whole loop
; instead of being reloaded from memory on each pass.
; On exit the cube root is in st(0) and the other stack entries have been
; freed; the root variable in memory is not written.
main PROC
    finit               ; Initialize FPU
    fld epsilon         ; st(0)=epsilon
    fld root            ; st(0)=prevroot (Copy of root), st(1)=epsilon
    fld TwoThirds       ; st(0)=(2/3), st(1)=prevroot, st(2)=epsilon 
    fld xx              ; st(0)=x, st(1)=(2/3), st(2)=prevroot, st(3)=epsilon
    fdiv Three          ; st(0)=(x/3), st(1)=(2/3), st(2)=prevroot, st(3)=epsilon
    fld st(2)           ; st(0)=root, st(1)=(x/3), st(2)=(2/3), st(3)=prevroot, st(4)=epsilon

repeatAgain:

    ; twothirds root * xover3 root root * / +
    fld st(0)           ; st(0)=root, st(1)=root, st(2)=(x/3), st(3)=(2/3), st(4)=prevroot, st(5)=epsilon
    fmul st(0), st(3)   ; st(0)=(2/3*root), st(1)=root, st(2)=(x/3), st(3)=(2/3), st(4)=prevroot, st(5)=epsilon           
    fxch                ; st(0)=root, st(1)=(2/3*root), st(2)=(x/3), st(3)=(2/3), st(4)=prevroot, st(5)=epsilon
    fmul st(0), st(0)   ; st(0)=(root*root), st(1)=(2/3*root), st(2)=(x/3), st(3)=(2/3), st(4)=prevroot, st(5)=epsilon
    fdivr st(0), st(2)  ; st(0)=((x/3)/(root*root)), st(1)=(2/3*root), st(2)=(x/3), st(3)=(2/3), st(4)=prevroot, st(5)=epsilon
    faddp               ; st(0)=((2/3*root)+(x/3)/(root*root)), st(1)=(x/3), st(2)=(2/3), st(3)=prevroot, st(4)=epsilon
    fxch st(3)          ; st(0)=prevroot, st(1)=(x/3), st(2)=(2/3), newroot=st(3)=((2/3*root)+(x/3)/(root*root)), st(4)=epsilon 
    fsub st(0), st(3)   ; st(0)=(prevroot-newroot), st(1)=(x/3), st(2)=(2/3), st(3)=newroot, st(4)=epsilon
    fabs                ; st(0)=(|prevroot-newroot|), st(1)=(x/3), st(2)=(2/3), st(3)=newroot, st(4)=epsilon
    fld st(4)           ; st(0)=epsilon, st(1)=(|prevroot-newroot|), st(2)=(x/3), st(3)=(2/3), st(4)=newroot, st(5)=epsilon

    fcompp              ; Compare st(0) with st(1), set the x87 condition codes, and pop both values
                        ;     st(0)=(x/3), st(1)=(2/3), st(2)=newroot, st(3)=epsilon    
    fstsw ax            ; Copy x87 Status Word containing the result to AX
    fwait               ; Ensure previous instruction completed
    sahf                ; Transfer condition codes to the CPU's flags register

    fld st(2)           ; st(0)=newroot, st(1)=(x/3), st(2)=(2/3), st(3)=newroot, st(4)=epsilon
                        ;     st(3) now serves as prevroot for the next pass
    jbe repeatAgain     ; If epsilon <= (|prevroot-newroot|) repeat

    ; Remove temporary values on stack, cubed root in st(0)
    ffree st(4)         ; st(0)=newroot, st(1)=(x/3), st(2)=(2/3), st(3)=newroot
    ffree st(3)         ; st(0)=newroot, st(1)=(x/3), st(2)=(2/3)
    ffree st(2)         ; st(0)=newroot, st(1)=(x/3)
    ffree st(1)         ; st(0)=newroot

    mov     eax, 0  ; exit
    ret
main ENDP 

END

This code places these values on the stack prior to entering the loop: