x86 assembly (MASM) - Square root of a 64-bit integer?

后端未结

关注

 6  2159

陌清茗 2021-01-07 10:12

I\'m coding a simple primality tester program for Windows in x86 assembly language (MASM32), which involves calculating a square root of a (64-bit) integer. My question is:

6条回答

旧巷少年郎 (楼主)

2021-01-07 10:47

Finally I wrote, in addition to last function for 80386+ microprocessor's, that processes a 64 Bit unsigned-integer number's square-root, a new optimized function that works for 8086+ microprocessor, in assembly.

I've tested all these routines successfully.

They works as the first routine written by me (see at bottom of this article), but are more optimized. Because is easy, I shows an example of the 16 Bit's algorithm in Pascal as follows (half-section algorithm):

Function NewSqrt(X:Word):Word;

Var L,H,M:LongInt;

Begin
 L:=1;
 H:=X;
 M:=(X+1) ShR 1;
 Repeat
  If Sqr(M)>X Then
   H:=M
  Else
   L:=M;
  M:=(L+H) ShR 1;
 Until H-L<2;
 NewSqrt:=M;
End;

This my routine works through half-section algorithm and supports the square-root of a 64 Bit unsigned-integer number. Follows the 8086+ CPU new real-mode code:

Function NewSqrt16(XLow,XHigh:LongInt):LongInt; Assembler;

Var X0,X1,X2,X3,
    H0,H1,H2,H3,
    L0,L1,L2,L3,
    M0,M1,M2,M3:Word;

Asm

     LEA   SI,XLow
     Mov   AX,[SS:SI]
     Mov   BX,[SS:SI+2]

     LEA   SI,XHigh
     Mov   CX,[SS:SI]
     Mov   DX,[SS:SI+2]
    {------------------------
     INPUT: DX:CX:BX:AX = X
     OUPUT: DX:AX= Sqrt(X).
     ------------------------
         X0    :    X1    :    X2    :    X3    -> X
         H0    :    H1    :    H2    :    H3    -> H
         L0    :    L1    :    L2    :    L3    -> L
         M0    :    M1    :    M2    :    M3    -> M
         AX    ,    BX    ,    CX    ,    DX    ,
         ES    ,    DI    ,    SI               -> Temp. reg.
     ------------------------}
     Mov   [X0],AX           {  ...}
     Mov   [X1],BX           {  ...}
     Mov   [X2],CX           {  ...}
     Mov   [X3],DX           {  Stack <- X}

     Mov   [H0],AX           {  ...}
     Mov   [H1],BX           {  ...}
     Mov   [H2],CX           {  ...}
     Mov   [H3],DX           {  Stack <- H= X}

     Mov   [L0],Word(1)      {  ...}
     Mov   [L1],Word(0)      {  ...}
     Mov   [L2],Word(0)      {  ...}
     Mov   [L3],Word(0)      {  Stack <- L= 1}

     Add   AX,1              {  ...}
     AdC   Bx,0              {  ...}
     AdC   Cx,0              {  ...}
     AdC   Dx,0              {  X= X+1}

     RCR   Dx,1              {  ...}
     RCR   Cx,1              {  ...}
     RCR   Bx,1              {  ...}
     RCR   Ax,1              {  X= (X+1)/2}

     Mov   [M0],AX           {  ...}
     Mov   [M1],BX           {  ...}
     Mov   [M2],CX           {  ...}
     Mov   [M3],DX           {  Stack <- M= (X+1)/2}
    {------------------------}
@@LoopBegin:                 {Loop restart label}

     Mov   AX,[M3]           {If M is more ...}
     Or    AX,[M2]           {... then 32 bit ...}
     JNE   @@LoadMid         {... then Square(M)>X, jump}
    {DX:AX:CX:SI= 64 Bit square(Low(M))}
     Mov   AX,[M0]           {AX <- A=Low(M)}
     Mov   CX,AX             {CX <- A=Low(M)}

     Mul   AX                {DX:AX <- A*A}

     Mov   SI,AX             {SI <- Low 16 bit of last mul.}
     Mov   BX,DX             {BX:SI <- A*A}

     Mov   AX,[M1]           {AX <- D=High(M)}
     XChg  CX,AX             {AX <- A=Low(M); CX <- D=High(M)}

     Mul   CX                {DX:AX <- A*D=Low(M)*High(M)}

     XOr   DI,DI             {...}
     ShL   AX,1              {...}
     RCL   DX,1              {...}
     RCL   DI,1              {DI:DX:AX <- A*D+D*A= 2*A*D (33 Bit}

     Add   AX,BX             {...}
     AdC   DX,0              {...}
     AdC   DI,0              {DI:DX:AX:SI <- A*(D:A)+(D*A) ShL 16 (49 Bit)}

     XChg  CX,AX             {AX <- D=High(M); CX <- Low 16 bit of last mul.}
     Mov   BX,DX             {DI:BX:CX:SI <- A*(D:A)+(D*A) ShL 16 (49 Bit)}

     Mul   AX                {DX:AX <- D*D}

     Add   AX,BX             {...}
     AdC   DX,DI             {DX:AX:CX:SI <- (D:A)*(D:A)}
    {------------------------}
     Cmp   DX,[X3]           {Compare High(Square(M)):High(X)}

@@LoadMid:                   {Load M in DX:BP:BX:DI}

     Mov   DI,[M0]           {...}
     Mov   BX,[M1]           {...}
     Mov   ES,[M2]           {...}
     Mov   DX,[M3]           {DX:ES:BX:DI <- M}

     JA    @@SqrMIsMoreThenX {If High(Square(M))>High(X) then Square(M)>X, jump}
     JB    @@SqrMIsLessThenX {If High(Square(M))High(X) then Square(M)>X, jump}
     JB    @@SqrMIsLessThenX {If High(Square(M))High(X) then Square(M)>X, jump}
     JB    @@SqrMIsLessThenX {If High(Square(M))Low(X) then Square(M)>X, jump}
    {------------------------}
@@SqrMIsLessThenX:           {Square(M)<=X}

     Mov   [L0],DI           {...}
     Mov   [L1],BX           {...}
     Mov   [L2],ES           {...}
     Mov   [L3],DX           {L= M}

     Jmp   @@ProcessMid      {Go to process the mid value}
    {------------------------}
@@SqrMIsMoreThenX:           {Square(M)>X}

     Mov   [H0],DI           {...}
     Mov   [H1],BX           {...}
     Mov   [H2],ES           {...}
     Mov   [H3],DX           {H= M}
    {------------------------}
@@ProcessMid:                {Process the mid value}

     Mov   SI,[H0]           {...}
     Mov   CX,[H1]           {...}
     Mov   AX,[H2]           {...}
     Mov   DX,[H3]           {DX:AX:CX:SI <- H}

     Mov   DI,SI             {DI <- H0}
     Mov   BX,CX             {BX <- H1}

     Add   SI,[L0]           {...}
     AdC   CX,[L1]           {...}
     AdC   AX,[L2]           {...}
     AdC   DX,[L3]           {DX:AX:CX:SI <- H+L}

     RCR   DX,1              {...}
     RCR   AX,1              {...}
     RCR   CX,1              {...}
     RCR   SI,1              {DX:AX:CX:SI <- (H+L)/2}

     Mov   [M0],SI           {...}
     Mov   [M1],CX           {...}
     Mov   [M2],AX           {...}
     Mov   [M3],DX           {M <- DX:AX:CX:SI}
    {------------------------}
     Mov   AX,[H2]           {...}
     Mov   DX,[H3]           {DX:AX:BX:DI <- H}

     Sub   DI,[L0]           {...}
     SbB   BX,[L1]           {...}
     SbB   AX,[L2]           {...}
     SbB   DX,[L3]           {DX:AX:BX:DI <- H-L}

     Or    BX,AX             {If (H-L) >= 65536 ...}
     Or    BX,DX             {...}
     JNE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}

     Cmp   DI,2              {If (H-L) >= 2 ...}
     JAE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}
    {------------------------}    
     Mov   AX,[M0]           {...}
     Mov   DX,[M1]           {@Result <- Sqrt}

End;

This function in input receives a 64 Bit number (XHigh:XLow) and return the 32 Bit square-root of it. Uses four local variables:

X, the copy of input number, subdivided in four 16 Bit packages (X3:X2:X1:X0).
H, upper limit, subdivided in four 16 Bit packages (H3:H2:H1:H0).
L, lower limit, subdivided in four 16 Bit packages (L3:L2:L1:L0).
M, mid value, subdivided in four 16 Bit packages (M3:M2:M1:M0).

Initializes the lower limit L to 1; initializes the upper limit H to input number X; initializes the mid value M to (H+1)>>1. Test if the square of M is long more then 64 Bit by verifying if (M3 | M2)!=0; if it is true then square(M)>X, sets the upper limit H to M. If it isn't true then processes the square of the lower 32 Bits of M (M1:M0) as follows:

(M1:M0)*(M1:M0)=

M0*M0+((M0*M1)<<16)+((M1*M0)<<16)+((M1*M1)<<32)=
M0*M0+((M0*M1)<<17)+((M1*M1)<<32)

If the square of lower 32 Bits of M is greater then X, sets the upper limit H to M; if the square of lower 32 Bits of M is lower or equal then X value, sets the lower limit L to M. Processes the mid value M setting it to (L+H)>>1. If (H-L)<2 then M is the square root of X, else go to test if the square of M is long more then 64 Bit and runs the follows instructions.

This my routine works through half-section algorithm and supports the square-root of a 64 Bit unsigned-integer number. Follows the 80386+ CPU new protected-mode code:

Procedure NewSqrt32(X:Int64;Var Y:Int64); Assembler;

{INPUT: EDX:EAX = X
  TEMP: ECX
 OUPUT: Y = Sqrt(X)}

Asm

     Push  EBX               {Save EBX register into the stack}
     Push  ESI               {Save ESI register into the stack}
     Push  EDI               {Save EDI register into the stack}
    {------------------------}
     Mov   EAX,Y             {Load address of output var. into EAX reg.}

     Push  EAX               {Save address of output var. into the stack}
    {------------------------}
     LEA   EDX,X             {Load address of input var. into EDX reg.}
     Mov   EAX,[EDX]         {    EAX <- Low 32 Bit of input value (X)}
     Mov   EDX,[EDX+4]       {EDX:EAX <- Input value (X)}
    {------------------------
     [ESP+ 4]:[ESP   ]        -> X
      EBX    : ECX            -> M
      ESI    : EDI            -> L
     [ESP+12]:[ESP+ 8]        -> H
      EAX    , EDX            -> Temporary registers
     ------------------------}
     Mov   EDI,1             {    EDI <-  Low(L)= 1}
     XOr   ESI,ESI           {    ESI <- High(L)= 0}

     Mov   ECX,EAX           {    ECX <- Low 32 Bit of input value (X)}
     Mov   EBX,EDX           {EBX:ECX <-       M= Input value (X)}

     Add   ECX,EDI           {    ECX <- Low 32 Bit of (X+1)}
     AdC   EBX,ESI           {EBX:ECX <-       M= X+1}

     RCR   EBX,1             {    EBX <- High 32 Bit of (X+1)/2}
     RCR   ECX,1             {EBX:ECX <-       M= (X+1)/2}
    {------------------------}
     Push  EDX               {  Stack <- High(H)= High(X)}
     Push  EAX               {  Stack <-  Low(H)=  Low(X)}

     Push  EDX               {  Stack <- High(X)}
     Push  EAX               {  Stack <-  Low(X)}
    {------------------------}    
@@LoopBegin:                 {Loop restart label}

     Cmp   EBX,0             {If M is more then 32 bit ...}
     JNE   @@SqrMIsMoreThenX {... then Square(M)>X, jump}

     Mov   EAX,ECX           {EAX     <- A= Low(M)}

     Mul   ECX               {EDX:EAX <- 64 Bit square(Low(M))}

     Cmp   EDX,[ESP+4]       {Compare High(Square(M)):High(X)}
     JA    @@SqrMIsMoreThenX {If High(Square(M))>High(X) then Square(M)>X, jump}
     JB    @@SqrMIsLessThenX {If High(Square(M))Low(X) then Square(M)>X, jump}
    {------------------------}
@@SqrMIsLessThenX:           {Square(M)<=X}

     Mov   EDI,ECX           {Set Low 32 Bit of L with Low 32 Bit of M}
     Mov   ESI,EBX           {ESI:EDI <- L= M}

     Jmp   @@ProcessMid      {Go to process the mid value}
    {------------------------}
@@SqrMIsMoreThenX:           {Square(M)>X}

     Mov   [ESP+8],ECX       {Set Low 32 Bit of H with Low 32 Bit of M}
     Mov   [ESP+12],EBX      {H= M}
    {------------------------}
@@ProcessMid:                {Process the mid value}

     Mov   EAX,[ESP+8]       {EAX     <- Low 32 Bit of H}
     Mov   EDX,[ESP+12]      {EDX:EAX <- H}

     Mov   ECX,EDI           {Set Low 32 Bit of M with Low 32 Bit of L}
     Mov   EBX,ESI           {EBX:ECX <- M= L}

     Add   ECX,EAX           {Set Low 32 Bit of M with Low 32 Bit of (L+H)}
     AdC   EBX,EDX           {EBX:ECX <- M= L+H}

     RCR   EBX,1             {Set High 32 Bit of M with High 32 Bit of (L+H)/2}
     RCR   ECX,1             {EBX:ECX <- M= (L+H)/2}
    {------------------------}
     Sub   EAX,EDI           {EAX     <- Low 32 Bit of (H-L)}
     SbB   EDX,ESI           {EDX:EAX <- H-L}

     Cmp   EDX,0             {If High 32 Bit of (H-L) aren't zero: (H-L) >= 2 ...}
     JNE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}

     Cmp   EAX,2             {If (H-L) >= 2 ...}
     JAE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}
    {------------------------}
     Add   ESP,16            {Remove 4 local 32 bit variables from stack}
    {------------------------}
     Pop   EDX               {Load from stack the output var. address into EDX reg.}

     Mov   [EDX],ECX         {Set Low 32 Bit of output var. with Low 32 Bit of Sqrt(X)}
     Mov   [EDX+4],EBX       {Y= Sqrt(X)}
    {------------------------}
     Pop   EDI               {Retrieve EDI register from the stack}
     Pop   ESI               {Retrieve ESI register from the stack}
     Pop   EBX               {Retrieve EBX register from the stack}

End;

This my 8086+ CPU real-mode routine works through half-section algorithm and supports the square-root of a 32 Bit unsigned-integer number; is not too fast, but may be optimized:

Procedure _PosLongIMul2; Assembler;

{INPUT:

 DX:AX-> First factor (destroyed).
 BX:CX-> Second factor (destroyed).

 OUTPUT:

 BX:CX:DX:AX-> Multiplication result.

 TEMP:

 BP, Di, Si}

Asm

     Jmp   @Go

 @VR:DD    0      {COPY of RESULT     (LOW)}
     DD    0      {COPY of RESULT    (HIGH)}

 @Go:Push  BP

     Mov   BP,20H {32 Bit Op.}

     XOr   DI,DI  {COPY of first op.  (LOW)}
     XOr   SI,SI  {COPY of first op. (HIGH)}

     Mov   [CS:OffSet @VR  ],Word(0)
     Mov   [CS:OffSet @VR+2],Word(0)
     Mov   [CS:OffSet @VR+4],Word(0)
     Mov   [CS:OffSet @VR+6],Word(0)

 @01:ShR   BX,1
     RCR   CX,1

     JAE   @00

     Add   [CS:OffSet @VR  ],AX
     AdC   [CS:OffSet @VR+2],DX
     AdC   [CS:OffSet @VR+4],DI
     AdC   [CS:OffSet @VR+6],SI

 @00:ShL   AX,1
     RCL   DX,1
     RCL   DI,1
     RCL   SI,1

     Dec   BP
     JNE   @01

     Mov   AX,[CS:OffSet @VR]
     Mov   DX,[CS:OffSet @VR+2]
     Mov   CX,[CS:OffSet @VR+4]
     Mov   BX,[CS:OffSet @VR+6]

     Pop   BP

End;

Function _Sqrt:LongInt; Assembler;

{INPUT:

 Di:Si-> Unsigned integer input number X.

 OUTPUT:

 DX:AX-> Square root Y=Sqrt(X).

 TEMP:

 BX, CX}

Asm

     Jmp   @Go

 @Vr:DD    0 {+0  LOW}
     DD    0 {+4  HIGH}
     DD    0 {+8  Mid}
     DB    0 {+12 COUNT}

 @Go:Mov   [CS:OffSet @Vr],Word(0)
     Mov   [CS:OffSet @Vr+2],Word(0)

     Mov   [CS:OffSet @Vr+4],SI
     Mov   [CS:OffSet @Vr+6],DI

     Mov   [CS:OffSet @Vr+8],Word(0)
     Mov   [CS:OffSet @Vr+10],Word(0)

     Mov   [CS:OffSet @Vr+12],Byte(32)

 @02:Mov   AX,[CS:OffSet @Vr]
     Mov   DX,[CS:OffSet @Vr+2]

     Mov   CX,[CS:OffSet @Vr+4]
     Mov   BX,[CS:OffSet @Vr+6]

     Add   AX,CX
     AdC   DX,BX

     ShR   DX,1
     RCR   AX,1

     Mov   [CS:OffSet @Vr+8],AX
     Mov   [CS:OffSet @Vr+10],DX

     Mov   CX,AX
     Mov   BX,DX

     Push  DI
     Push  SI

     Call  _PosLongIMul2

     Pop   SI
     Pop   DI

     Or    BX,CX
     JNE   @00

     Cmp   DX,DI
     JA    @00
     JB    @01

     Cmp   AX,SI
     JA    @00

 @01:Mov   AX,[CS:OffSet @Vr+8]
     Mov   [CS:OffSet @Vr],AX

     Mov   DX,[CS:OffSet @Vr+10]
     Mov   [CS:OffSet @Vr+2],DX

     Dec   Byte Ptr [CS:OffSet @Vr+12]
     JNE   @02
     JE    @03

 @00:Mov   AX,[CS:OffSet @Vr+8]
     Mov   [CS:OffSet @Vr+4],AX

     Mov   DX,[CS:OffSet @Vr+10]
     Mov   [CS:OffSet @Vr+6],DX

     Dec   Byte Ptr [CS:OffSet @Vr+12]
     JNE   @02

 @03:Mov   AX,[CS:OffSet @Vr+8]
     Mov   DX,[CS:OffSet @Vr+10]

End;

This my 8086+ CPU real-mode routine returns a 32 Bit fixed-point number that is the square-root of a 16 Bit unsigned-integer input number; is not too fast, but may be optimized:

Function _Sqrt2:LongInt; Assembler;

{INPUT:

 Si-> Unsigned integer number X (unaltered).

 OUTPUT:

 AX-> Integer part of Sqrt(X).
 DX-> Decimal part of Sqrt(X).

 TEMP:

 BX, CX, DX, Di}

Asm

     Jmp   @Go

 @Vr:DD    0 {+0  LOW}
     DD    0 {+4  HIGH}
     DD    0 {+8  Mid}
     DB    0 {+12 COUNT}

 @Go:Mov   [CS:OffSet @Vr],Word(0)
     Mov   [CS:OffSet @Vr+2],Word(0)

     Mov   [CS:OffSet @Vr+4],Word(0)
     Mov   [CS:OffSet @Vr+6],SI

     Mov   [CS:OffSet @Vr+8],Word(0)
     Mov   [CS:OffSet @Vr+10],Word(0)

     Mov   [CS:OffSet @Vr+12],Byte(32)

 @02:Mov   AX,[CS:OffSet @Vr]
     Mov   DX,[CS:OffSet @Vr+2]

     Mov   CX,[CS:OffSet @Vr+4]
     Mov   BX,[CS:OffSet @Vr+6]

     Add   AX,CX
     AdC   DX,BX

     ShR   DX,1
     RCR   AX,1

     Mov   [CS:OffSet @Vr+8],AX
     Mov   [CS:OffSet @Vr+10],DX

     Mov   CX,AX
     Mov   BX,DX

     Push  SI

     Call  _PosLongIMul2

     Pop   SI

     Or    BX,BX
     JNE   @00

     Cmp   CX,SI
     JB    @01
     JA    @00

     Or    DX,AX
     JNE   @00

 @01:Mov   AX,[CS:OffSet @Vr+8]
     Mov   [CS:OffSet @Vr],AX

     Mov   DX,[CS:OffSet @Vr+10]
     Mov   [CS:OffSet @Vr+2],DX

     Dec   Byte Ptr [CS:OffSet @Vr+12]
     JNE   @02
     JE    @03

 @00:Mov   AX,[CS:OffSet @Vr+8]
     Mov   [CS:OffSet @Vr+4],AX

     Mov   DX,[CS:OffSet @Vr+10]
     Mov   [CS:OffSet @Vr+6],DX

     Dec   Byte Ptr [CS:OffSet @Vr+12]
     JNE   @02

 @03:Mov   DX,[CS:OffSet @Vr+8]
     Mov   AX,[CS:OffSet @Vr+10]

End;

Hi!

0 讨论(0)

查看其它6个回答