x86 assembly (MASM) - Square root of a 64-bit integer?

后端 未结 6 2159
陌清茗
陌清茗 2021-01-07 10:12

I\'m coding a simple primality tester program for Windows in x86 assembly language (MASM32), which involves calculating a square root of a (64-bit) integer. My question is:

6条回答
  •  旧巷少年郎
    2021-01-07 10:47

    Finally I wrote, in addition to last function for 80386+ microprocessor's, that processes a 64 Bit unsigned-integer number's square-root, a new optimized function that works for 8086+ microprocessor, in assembly.

    I've tested all these routines successfully.

    They works as the first routine written by me (see at bottom of this article), but are more optimized. Because is easy, I shows an example of the 16 Bit's algorithm in Pascal as follows (half-section algorithm):

    Function NewSqrt(X:Word):Word;
    
    Var L,H,M:LongInt;
    
    Begin
     L:=1;
     H:=X;
     M:=(X+1) ShR 1;
     Repeat
      If Sqr(M)>X Then
       H:=M
      Else
       L:=M;
      M:=(L+H) ShR 1;
     Until H-L<2;
     NewSqrt:=M;
    End;
    

    This my routine works through half-section algorithm and supports the square-root of a 64 Bit unsigned-integer number. Follows the 8086+ CPU new real-mode code:

    Function NewSqrt16(XLow,XHigh:LongInt):LongInt; Assembler;
    
    Var X0,X1,X2,X3,
        H0,H1,H2,H3,
        L0,L1,L2,L3,
        M0,M1,M2,M3:Word;
    
    Asm
    
         LEA   SI,XLow
         Mov   AX,[SS:SI]
         Mov   BX,[SS:SI+2]
    
         LEA   SI,XHigh
         Mov   CX,[SS:SI]
         Mov   DX,[SS:SI+2]
        {------------------------
         INPUT: DX:CX:BX:AX = X
         OUPUT: DX:AX= Sqrt(X).
         ------------------------
             X0    :    X1    :    X2    :    X3    -> X
             H0    :    H1    :    H2    :    H3    -> H
             L0    :    L1    :    L2    :    L3    -> L
             M0    :    M1    :    M2    :    M3    -> M
             AX    ,    BX    ,    CX    ,    DX    ,
             ES    ,    DI    ,    SI               -> Temp. reg.
         ------------------------}
         Mov   [X0],AX           {  ...}
         Mov   [X1],BX           {  ...}
         Mov   [X2],CX           {  ...}
         Mov   [X3],DX           {  Stack <- X}
    
         Mov   [H0],AX           {  ...}
         Mov   [H1],BX           {  ...}
         Mov   [H2],CX           {  ...}
         Mov   [H3],DX           {  Stack <- H= X}
    
         Mov   [L0],Word(1)      {  ...}
         Mov   [L1],Word(0)      {  ...}
         Mov   [L2],Word(0)      {  ...}
         Mov   [L3],Word(0)      {  Stack <- L= 1}
    
         Add   AX,1              {  ...}
         AdC   Bx,0              {  ...}
         AdC   Cx,0              {  ...}
         AdC   Dx,0              {  X= X+1}
    
         RCR   Dx,1              {  ...}
         RCR   Cx,1              {  ...}
         RCR   Bx,1              {  ...}
         RCR   Ax,1              {  X= (X+1)/2}
    
         Mov   [M0],AX           {  ...}
         Mov   [M1],BX           {  ...}
         Mov   [M2],CX           {  ...}
         Mov   [M3],DX           {  Stack <- M= (X+1)/2}
        {------------------------}
    @@LoopBegin:                 {Loop restart label}
    
         Mov   AX,[M3]           {If M is more ...}
         Or    AX,[M2]           {... then 32 bit ...}
         JNE   @@LoadMid         {... then Square(M)>X, jump}
        {DX:AX:CX:SI= 64 Bit square(Low(M))}
         Mov   AX,[M0]           {AX <- A=Low(M)}
         Mov   CX,AX             {CX <- A=Low(M)}
    
         Mul   AX                {DX:AX <- A*A}
    
         Mov   SI,AX             {SI <- Low 16 bit of last mul.}
         Mov   BX,DX             {BX:SI <- A*A}
    
         Mov   AX,[M1]           {AX <- D=High(M)}
         XChg  CX,AX             {AX <- A=Low(M); CX <- D=High(M)}
    
         Mul   CX                {DX:AX <- A*D=Low(M)*High(M)}
    
         XOr   DI,DI             {...}
         ShL   AX,1              {...}
         RCL   DX,1              {...}
         RCL   DI,1              {DI:DX:AX <- A*D+D*A= 2*A*D (33 Bit}
    
         Add   AX,BX             {...}
         AdC   DX,0              {...}
         AdC   DI,0              {DI:DX:AX:SI <- A*(D:A)+(D*A) ShL 16 (49 Bit)}
    
         XChg  CX,AX             {AX <- D=High(M); CX <- Low 16 bit of last mul.}
         Mov   BX,DX             {DI:BX:CX:SI <- A*(D:A)+(D*A) ShL 16 (49 Bit)}
    
         Mul   AX                {DX:AX <- D*D}
    
         Add   AX,BX             {...}
         AdC   DX,DI             {DX:AX:CX:SI <- (D:A)*(D:A)}
        {------------------------}
         Cmp   DX,[X3]           {Compare High(Square(M)):High(X)}
    
    @@LoadMid:                   {Load M in DX:BP:BX:DI}
    
         Mov   DI,[M0]           {...}
         Mov   BX,[M1]           {...}
         Mov   ES,[M2]           {...}
         Mov   DX,[M3]           {DX:ES:BX:DI <- M}
    
         JA    @@SqrMIsMoreThenX {If High(Square(M))>High(X) then Square(M)>X, jump}
         JB    @@SqrMIsLessThenX {If High(Square(M))High(X) then Square(M)>X, jump}
         JB    @@SqrMIsLessThenX {If High(Square(M))High(X) then Square(M)>X, jump}
         JB    @@SqrMIsLessThenX {If High(Square(M))Low(X) then Square(M)>X, jump}
        {------------------------}
    @@SqrMIsLessThenX:           {Square(M)<=X}
    
         Mov   [L0],DI           {...}
         Mov   [L1],BX           {...}
         Mov   [L2],ES           {...}
         Mov   [L3],DX           {L= M}
    
         Jmp   @@ProcessMid      {Go to process the mid value}
        {------------------------}
    @@SqrMIsMoreThenX:           {Square(M)>X}
    
         Mov   [H0],DI           {...}
         Mov   [H1],BX           {...}
         Mov   [H2],ES           {...}
         Mov   [H3],DX           {H= M}
        {------------------------}
    @@ProcessMid:                {Process the mid value}
    
         Mov   SI,[H0]           {...}
         Mov   CX,[H1]           {...}
         Mov   AX,[H2]           {...}
         Mov   DX,[H3]           {DX:AX:CX:SI <- H}
    
         Mov   DI,SI             {DI <- H0}
         Mov   BX,CX             {BX <- H1}
    
         Add   SI,[L0]           {...}
         AdC   CX,[L1]           {...}
         AdC   AX,[L2]           {...}
         AdC   DX,[L3]           {DX:AX:CX:SI <- H+L}
    
         RCR   DX,1              {...}
         RCR   AX,1              {...}
         RCR   CX,1              {...}
         RCR   SI,1              {DX:AX:CX:SI <- (H+L)/2}
    
         Mov   [M0],SI           {...}
         Mov   [M1],CX           {...}
         Mov   [M2],AX           {...}
         Mov   [M3],DX           {M <- DX:AX:CX:SI}
        {------------------------}
         Mov   AX,[H2]           {...}
         Mov   DX,[H3]           {DX:AX:BX:DI <- H}
    
         Sub   DI,[L0]           {...}
         SbB   BX,[L1]           {...}
         SbB   AX,[L2]           {...}
         SbB   DX,[L3]           {DX:AX:BX:DI <- H-L}
    
         Or    BX,AX             {If (H-L) >= 65536 ...}
         Or    BX,DX             {...}
         JNE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}
    
         Cmp   DI,2              {If (H-L) >= 2 ...}
         JAE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}
        {------------------------}    
         Mov   AX,[M0]           {...}
         Mov   DX,[M1]           {@Result <- Sqrt}
    
    End;
    

    This function in input receives a 64 Bit number (XHigh:XLow) and return the 32 Bit square-root of it. Uses four local variables:

    X, the copy of input number, subdivided in four 16 Bit packages (X3:X2:X1:X0).
    H, upper limit, subdivided in four 16 Bit packages (H3:H2:H1:H0).
    L, lower limit, subdivided in four 16 Bit packages (L3:L2:L1:L0).
    M, mid value, subdivided in four 16 Bit packages (M3:M2:M1:M0).
    

    Initializes the lower limit L to 1; initializes the upper limit H to input number X; initializes the mid value M to (H+1)>>1. Test if the square of M is long more then 64 Bit by verifying if (M3 | M2)!=0; if it is true then square(M)>X, sets the upper limit H to M. If it isn't true then processes the square of the lower 32 Bits of M (M1:M0) as follows:

    (M1:M0)*(M1:M0)=
    
    M0*M0+((M0*M1)<<16)+((M1*M0)<<16)+((M1*M1)<<32)=
    M0*M0+((M0*M1)<<17)+((M1*M1)<<32)
    

    If the square of lower 32 Bits of M is greater then X, sets the upper limit H to M; if the square of lower 32 Bits of M is lower or equal then X value, sets the lower limit L to M. Processes the mid value M setting it to (L+H)>>1. If (H-L)<2 then M is the square root of X, else go to test if the square of M is long more then 64 Bit and runs the follows instructions.

    This my routine works through half-section algorithm and supports the square-root of a 64 Bit unsigned-integer number. Follows the 80386+ CPU new protected-mode code:

    Procedure NewSqrt32(X:Int64;Var Y:Int64); Assembler;
    
    {INPUT: EDX:EAX = X
      TEMP: ECX
     OUPUT: Y = Sqrt(X)}
    
    Asm
    
         Push  EBX               {Save EBX register into the stack}
         Push  ESI               {Save ESI register into the stack}
         Push  EDI               {Save EDI register into the stack}
        {------------------------}
         Mov   EAX,Y             {Load address of output var. into EAX reg.}
    
         Push  EAX               {Save address of output var. into the stack}
        {------------------------}
         LEA   EDX,X             {Load address of input var. into EDX reg.}
         Mov   EAX,[EDX]         {    EAX <- Low 32 Bit of input value (X)}
         Mov   EDX,[EDX+4]       {EDX:EAX <- Input value (X)}
        {------------------------
         [ESP+ 4]:[ESP   ]        -> X
          EBX    : ECX            -> M
          ESI    : EDI            -> L
         [ESP+12]:[ESP+ 8]        -> H
          EAX    , EDX            -> Temporary registers
         ------------------------}
         Mov   EDI,1             {    EDI <-  Low(L)= 1}
         XOr   ESI,ESI           {    ESI <- High(L)= 0}
    
         Mov   ECX,EAX           {    ECX <- Low 32 Bit of input value (X)}
         Mov   EBX,EDX           {EBX:ECX <-       M= Input value (X)}
    
         Add   ECX,EDI           {    ECX <- Low 32 Bit of (X+1)}
         AdC   EBX,ESI           {EBX:ECX <-       M= X+1}
    
         RCR   EBX,1             {    EBX <- High 32 Bit of (X+1)/2}
         RCR   ECX,1             {EBX:ECX <-       M= (X+1)/2}
        {------------------------}
         Push  EDX               {  Stack <- High(H)= High(X)}
         Push  EAX               {  Stack <-  Low(H)=  Low(X)}
    
         Push  EDX               {  Stack <- High(X)}
         Push  EAX               {  Stack <-  Low(X)}
        {------------------------}    
    @@LoopBegin:                 {Loop restart label}
    
         Cmp   EBX,0             {If M is more then 32 bit ...}
         JNE   @@SqrMIsMoreThenX {... then Square(M)>X, jump}
    
         Mov   EAX,ECX           {EAX     <- A= Low(M)}
    
         Mul   ECX               {EDX:EAX <- 64 Bit square(Low(M))}
    
         Cmp   EDX,[ESP+4]       {Compare High(Square(M)):High(X)}
         JA    @@SqrMIsMoreThenX {If High(Square(M))>High(X) then Square(M)>X, jump}
         JB    @@SqrMIsLessThenX {If High(Square(M))Low(X) then Square(M)>X, jump}
        {------------------------}
    @@SqrMIsLessThenX:           {Square(M)<=X}
    
         Mov   EDI,ECX           {Set Low 32 Bit of L with Low 32 Bit of M}
         Mov   ESI,EBX           {ESI:EDI <- L= M}
    
         Jmp   @@ProcessMid      {Go to process the mid value}
        {------------------------}
    @@SqrMIsMoreThenX:           {Square(M)>X}
    
         Mov   [ESP+8],ECX       {Set Low 32 Bit of H with Low 32 Bit of M}
         Mov   [ESP+12],EBX      {H= M}
        {------------------------}
    @@ProcessMid:                {Process the mid value}
    
         Mov   EAX,[ESP+8]       {EAX     <- Low 32 Bit of H}
         Mov   EDX,[ESP+12]      {EDX:EAX <- H}
    
         Mov   ECX,EDI           {Set Low 32 Bit of M with Low 32 Bit of L}
         Mov   EBX,ESI           {EBX:ECX <- M= L}
    
         Add   ECX,EAX           {Set Low 32 Bit of M with Low 32 Bit of (L+H)}
         AdC   EBX,EDX           {EBX:ECX <- M= L+H}
    
         RCR   EBX,1             {Set High 32 Bit of M with High 32 Bit of (L+H)/2}
         RCR   ECX,1             {EBX:ECX <- M= (L+H)/2}
        {------------------------}
         Sub   EAX,EDI           {EAX     <- Low 32 Bit of (H-L)}
         SbB   EDX,ESI           {EDX:EAX <- H-L}
    
         Cmp   EDX,0             {If High 32 Bit of (H-L) aren't zero: (H-L) >= 2 ...}
         JNE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}
    
         Cmp   EAX,2             {If (H-L) >= 2 ...}
         JAE   @@LoopBegin       {... Repeat @LoopBegin else goes forward}
        {------------------------}
         Add   ESP,16            {Remove 4 local 32 bit variables from stack}
        {------------------------}
         Pop   EDX               {Load from stack the output var. address into EDX reg.}
    
         Mov   [EDX],ECX         {Set Low 32 Bit of output var. with Low 32 Bit of Sqrt(X)}
         Mov   [EDX+4],EBX       {Y= Sqrt(X)}
        {------------------------}
         Pop   EDI               {Retrieve EDI register from the stack}
         Pop   ESI               {Retrieve ESI register from the stack}
         Pop   EBX               {Retrieve EBX register from the stack}
    
    End;
    

    This my 8086+ CPU real-mode routine works through half-section algorithm and supports the square-root of a 32 Bit unsigned-integer number; is not too fast, but may be optimized:

    Procedure _PosLongIMul2; Assembler;
    
    {INPUT:
    
     DX:AX-> First factor (destroyed).
     BX:CX-> Second factor (destroyed).
    
     OUTPUT:
    
     BX:CX:DX:AX-> Multiplication result.
    
     TEMP:
    
     BP, Di, Si}
    
    Asm
    
         Jmp   @Go
    
     @VR:DD    0      {COPY of RESULT     (LOW)}
         DD    0      {COPY of RESULT    (HIGH)}
    
     @Go:Push  BP
    
         Mov   BP,20H {32 Bit Op.}
    
         XOr   DI,DI  {COPY of first op.  (LOW)}
         XOr   SI,SI  {COPY of first op. (HIGH)}
    
         Mov   [CS:OffSet @VR  ],Word(0)
         Mov   [CS:OffSet @VR+2],Word(0)
         Mov   [CS:OffSet @VR+4],Word(0)
         Mov   [CS:OffSet @VR+6],Word(0)
    
     @01:ShR   BX,1
         RCR   CX,1
    
         JAE   @00
    
         Add   [CS:OffSet @VR  ],AX
         AdC   [CS:OffSet @VR+2],DX
         AdC   [CS:OffSet @VR+4],DI
         AdC   [CS:OffSet @VR+6],SI
    
     @00:ShL   AX,1
         RCL   DX,1
         RCL   DI,1
         RCL   SI,1
    
         Dec   BP
         JNE   @01
    
         Mov   AX,[CS:OffSet @VR]
         Mov   DX,[CS:OffSet @VR+2]
         Mov   CX,[CS:OffSet @VR+4]
         Mov   BX,[CS:OffSet @VR+6]
    
         Pop   BP
    
    End;
    
    Function _Sqrt:LongInt; Assembler;
    
    {INPUT:
    
     Di:Si-> Unsigned integer input number X.
    
     OUTPUT:
    
     DX:AX-> Square root Y=Sqrt(X).
    
     TEMP:
    
     BX, CX}
    
    Asm
    
         Jmp   @Go
    
     @Vr:DD    0 {+0  LOW}
         DD    0 {+4  HIGH}
         DD    0 {+8  Mid}
         DB    0 {+12 COUNT}
    
     @Go:Mov   [CS:OffSet @Vr],Word(0)
         Mov   [CS:OffSet @Vr+2],Word(0)
    
         Mov   [CS:OffSet @Vr+4],SI
         Mov   [CS:OffSet @Vr+6],DI
    
         Mov   [CS:OffSet @Vr+8],Word(0)
         Mov   [CS:OffSet @Vr+10],Word(0)
    
         Mov   [CS:OffSet @Vr+12],Byte(32)
    
     @02:Mov   AX,[CS:OffSet @Vr]
         Mov   DX,[CS:OffSet @Vr+2]
    
         Mov   CX,[CS:OffSet @Vr+4]
         Mov   BX,[CS:OffSet @Vr+6]
    
         Add   AX,CX
         AdC   DX,BX
    
         ShR   DX,1
         RCR   AX,1
    
         Mov   [CS:OffSet @Vr+8],AX
         Mov   [CS:OffSet @Vr+10],DX
    
         Mov   CX,AX
         Mov   BX,DX
    
         Push  DI
         Push  SI
    
         Call  _PosLongIMul2
    
         Pop   SI
         Pop   DI
    
         Or    BX,CX
         JNE   @00
    
         Cmp   DX,DI
         JA    @00
         JB    @01
    
         Cmp   AX,SI
         JA    @00
    
     @01:Mov   AX,[CS:OffSet @Vr+8]
         Mov   [CS:OffSet @Vr],AX
    
         Mov   DX,[CS:OffSet @Vr+10]
         Mov   [CS:OffSet @Vr+2],DX
    
         Dec   Byte Ptr [CS:OffSet @Vr+12]
         JNE   @02
         JE    @03
    
     @00:Mov   AX,[CS:OffSet @Vr+8]
         Mov   [CS:OffSet @Vr+4],AX
    
         Mov   DX,[CS:OffSet @Vr+10]
         Mov   [CS:OffSet @Vr+6],DX
    
         Dec   Byte Ptr [CS:OffSet @Vr+12]
         JNE   @02
    
     @03:Mov   AX,[CS:OffSet @Vr+8]
         Mov   DX,[CS:OffSet @Vr+10]
    
    End;
    

    This my 8086+ CPU real-mode routine returns a 32 Bit fixed-point number that is the square-root of a 16 Bit unsigned-integer input number; is not too fast, but may be optimized:

    Function _Sqrt2:LongInt; Assembler;
    
    {INPUT:
    
     Si-> Unsigned integer number X (unaltered).
    
     OUTPUT:
    
     AX-> Integer part of Sqrt(X).
     DX-> Decimal part of Sqrt(X).
    
     TEMP:
    
     BX, CX, DX, Di}
    
    Asm
    
         Jmp   @Go
    
     @Vr:DD    0 {+0  LOW}
         DD    0 {+4  HIGH}
         DD    0 {+8  Mid}
         DB    0 {+12 COUNT}
    
     @Go:Mov   [CS:OffSet @Vr],Word(0)
         Mov   [CS:OffSet @Vr+2],Word(0)
    
         Mov   [CS:OffSet @Vr+4],Word(0)
         Mov   [CS:OffSet @Vr+6],SI
    
         Mov   [CS:OffSet @Vr+8],Word(0)
         Mov   [CS:OffSet @Vr+10],Word(0)
    
         Mov   [CS:OffSet @Vr+12],Byte(32)
    
     @02:Mov   AX,[CS:OffSet @Vr]
         Mov   DX,[CS:OffSet @Vr+2]
    
         Mov   CX,[CS:OffSet @Vr+4]
         Mov   BX,[CS:OffSet @Vr+6]
    
         Add   AX,CX
         AdC   DX,BX
    
         ShR   DX,1
         RCR   AX,1
    
         Mov   [CS:OffSet @Vr+8],AX
         Mov   [CS:OffSet @Vr+10],DX
    
         Mov   CX,AX
         Mov   BX,DX
    
         Push  SI
    
         Call  _PosLongIMul2
    
         Pop   SI
    
         Or    BX,BX
         JNE   @00
    
         Cmp   CX,SI
         JB    @01
         JA    @00
    
         Or    DX,AX
         JNE   @00
    
     @01:Mov   AX,[CS:OffSet @Vr+8]
         Mov   [CS:OffSet @Vr],AX
    
         Mov   DX,[CS:OffSet @Vr+10]
         Mov   [CS:OffSet @Vr+2],DX
    
         Dec   Byte Ptr [CS:OffSet @Vr+12]
         JNE   @02
         JE    @03
    
     @00:Mov   AX,[CS:OffSet @Vr+8]
         Mov   [CS:OffSet @Vr+4],AX
    
         Mov   DX,[CS:OffSet @Vr+10]
         Mov   [CS:OffSet @Vr+6],DX
    
         Dec   Byte Ptr [CS:OffSet @Vr+12]
         JNE   @02
    
     @03:Mov   DX,[CS:OffSet @Vr+8]
         Mov   AX,[CS:OffSet @Vr+10]
    
    End;
    

    Hi!

提交回复
热议问题