Visual C++ x64 add with carry

问题

Since there doesn't seem to be an intrinsic for ADC and I can't use inline assembler for x64 architecture with Visual C++, what should I do if I want to write a function using add with carry but include it in a C++ namespace?

(Emulating with comparison operators is not an option. This 256 megabit add is performance critical.)

回答1:

There is now an intrinsic for ADC in MSVC: _addcarry_u64. The following code

#include <inttypes.h>
#include <intrin.h>
#include <stdio.h>

/* 256-bit unsigned integer stored as four 64-bit limbs.
   x1 is the least-significant limb, x4 the most significant. */
typedef struct {
    uint64_t x1, x2, x3, x4;
} uint256;

/*
 * Adds the 256-bit value *y into *x in place (x += y).
 *
 * x: augend, overwritten with the sum.
 * y: addend, only read.
 *
 * Limbs are processed from x1 (least significant) up to x4, feeding each
 * _addcarry_u64 carry-out into the next call as carry-in. The carry out of
 * the top limb is discarded, so the result wraps modulo 2^256.
 */
void add256(uint256 *x, const uint256 *y) {
    unsigned char carry = 0;
    carry = _addcarry_u64(carry, x->x1, y->x1, &x->x1);
    carry = _addcarry_u64(carry, x->x2, y->x2, &x->x2);
    carry = _addcarry_u64(carry, x->x3, y->x3, &x->x3);
    /* Final carry-out intentionally ignored: there is no fifth limb. */
    (void)_addcarry_u64(carry, x->x4, y->x4, &x->x4);
}

/* Demo: prints x, y and x + y as 64-digit hexadecimal numbers. The operands
   are chosen so the carry ripples through all three low limbs into x4. */
int main() {
    uint256 x, y;

    /* x = 2^192 - 1: the three low limbs are all ones, the top limb is zero. */
    x.x4 = 0;
    x.x3 = UINT64_MAX;
    x.x2 = UINT64_MAX;
    x.x1 = UINT64_MAX;

    /* y = 2 */
    y.x4 = 0;
    y.x3 = 0;
    y.x2 = 0;
    y.x1 = 2;

    /* Most-significant limb first; the leading space lines x up with the
       "+" and "=" prefixes printed on the following rows. */
    printf(" %016" PRIx64 "%016" PRIx64 "%016" PRIx64 "%016" PRIx64 "\n", x.x4, x.x3, x.x2, x.x1);

    printf("+");
    printf("%016" PRIx64 "%016" PRIx64 "%016" PRIx64 "%016" PRIx64 "\n", y.x4, y.x3, y.x2, y.x1);

    add256(&x, &y);

    printf("=");
    printf("%016" PRIx64 "%016" PRIx64 "%016" PRIx64 "%016" PRIx64 "\n", x.x4, x.x3, x.x2, x.x1);
}

produces the following assembly output from Visual Studio Express 2013

; Load the four 64-bit limbs of x from the stack, lowest offset (x1) first.
mov rdx, QWORD PTR x$[rsp]
mov r8, QWORD PTR x$[rsp+8] 
mov r9, QWORD PTR x$[rsp+16]
mov rax, QWORD PTR x$[rsp+24]
; Add the matching limbs of y: ADD starts the carry chain and each ADC
; folds in the carry flag left by the previous addition.
add rdx, QWORD PTR y$[rsp]
adc r8, QWORD PTR y$[rsp+8]
adc r9, QWORD PTR y$[rsp+16]
adc rax, QWORD PTR y$[rsp+24]

which has one add and three adc as expected.

Edit:

There seems to be some confusion as to what _addcarry_u64 does. If you look at Microsoft's documentation for this which I linked to at the start of this answer it shows that it does not require any special hardware. This produces adc and it will work on all x86-64 processors (and _addcarry_u32 would work on even older processors). It works fine on the Ivy Bridge system I tested it on.

However, _addcarryx_u64 does require adx (as shown in MSFT's documentation) and indeed it fails to run on my Ivy Bridge System.

回答2:

VS2010 has built-in support for compiling and linking code written in assembly and translated by MASM (ml64.exe). You just have to jump through a few hoops to enable it:

Right-click the project in the Solution Explorer window, Build Customizations, tick "masm".
Project + Add New Item, pick the C++ File template but name it something.asm
Ensure you've got the x64 platform target for the project. Build + Configuration Manager, select "x64" in the "Active solution platform" combo. If missing, select <New> and pick x64 from the first combo. If missing you'll have to re-run setup and add support for 64-bit compilers.

Write assembly code using MASM syntax, reference is here. Quick start tutorial is here.

The skeleton for the assembly code looks like this:

; Minimal MASM x64 source file that defines a single public procedure, Foo.
.CODE
; Export the symbol so code in other object files can link against it.
PUBLIC Foo
Foo PROC
  ret                    ; TODO: make useful
Foo ENDP
END

And called from C++ code like this:

// Foo is implemented in the assembly source; extern "C" gives it C linkage so
// the unmangled name matches the PUBLIC symbol emitted by the assembler.
extern "C" void Foo();

// Calls the assembly routine once and exits with status 0.
int main(int argc, char* argv[])
{
    Foo();
    return 0;
}

Full debugging support is available, you'll typically want to at least use the Debug + Windows + Registers window.

回答3:

I've implemented a 256 bit integer using an array of unsigned long long and used x64 assembly to implement the add with carry. Here's the C++ caller:

#include "stdafx.h"

extern "C" void add256(unsigned long long *a, unsigned long long * b, unsigned long long *c);

// Driver for the assembly add256: adds two 256-bit values held as arrays of
// four 64-bit limbs (index 0 is the least-significant limb).
int _tmain(int argc, _TCHAR* argv[])
{
    // The low limbs both have the top bit set, so their sum carries into limb 1.
    unsigned long long lhs[4] = {0x8000000000000001, 2, 3, 4};
    unsigned long long rhs[4] = {0x8000000000000005, 6, 7, 8};
    unsigned long long sum[4] = {0};

    add256(lhs, rhs, sum); // sum[] == {6, 9, 10, 12}

    return 0;
}

The add256 is implemented in assembly:

    ; void add256(unsigned long long *a, unsigned long long * b, unsigned long long *c)

.CODE
PUBLIC add256

; add256 -- c = a + b for 256-bit unsigned integers (result modulo 2^256).
;
; Each operand is an array of four 64-bit limbs, least-significant limb first.
; Arguments arrive per the Microsoft x64 calling convention:
;   rcx = a (first addend), rdx = b (second addend), r8 = c (destination).
;
; This is a leaf routine: it touches only the volatile register rax, never
; moves rsp and calls nothing, so it needs no prologue/epilogue and no unwind
; data. (The earlier version pushed rdi, which it never used, without
; declaring a frame.)
;
; MOV does not modify the flags, so the carry produced by each ADD/ADC
; survives the intervening store and load and feeds the next ADC.
add256 PROC

    ; c[0] = a[0] + b[0];
    mov     rax, qword ptr [rcx]
    add     rax, qword ptr [rdx]
    mov     qword ptr [r8], rax

    ; c[1] = a[1] + b[1] + CARRY;
    mov     rax, qword ptr [rcx+8]
    adc     rax, qword ptr [rdx+8]
    mov     qword ptr [r8+8], rax

    ; c[2] = a[2] + b[2] + CARRY;
    mov     rax, qword ptr [rcx+10h]
    adc     rax, qword ptr [rdx+10h]
    mov     qword ptr [r8+10h], rax

    ; c[3] = a[3] + b[3] + CARRY;  (carry out of the top limb is dropped)
    mov     rax, qword ptr [rcx+18h]
    adc     rax, qword ptr [rdx+18h]
    mov     qword ptr [r8+18h], rax

    ret

add256 ENDP

END

I know you indicated that you didn't want an emulated add-with-carry solution and wanted a high-performance one, but you may still consider the following C++-only solution, which has a nice way of simulating 256-bit numbers:

#include "stdafx.h"

// Portable 256-bit addition: c = a + b (mod 2^256).
//
// a, b and c each point to four 64-bit limbs, least-significant limb first.
// The carry out of every limb is detected in two steps, because comparing
// only the final sum against a[i] misses a carry when b[i] is all ones and
// the incoming carry is 1 (the sum wraps around to exactly a[i]).
static void add256_emulated(const unsigned long long *a,
                            const unsigned long long *b,
                            unsigned long long *c)
{
    unsigned long long carry = 0;
    for (int i = 0; i < 4; ++i) {
        const unsigned long long partial = a[i] + b[i];     // may wrap
        const unsigned long long total = partial + carry;   // may wrap
        // At most one of the two additions can wrap for a given limb.
        carry = (partial < a[i] || total < partial) ? 1u : 0u;
        c[i] = total;
    }
}

// Demonstrates the emulated add on two sample 256-bit values.
int _tmain(int argc, _TCHAR* argv[])
{
    unsigned long long a[4] = {0x8000000000000001, 2, 3, 4};
    unsigned long long b[4] = {0x8000000000000005, 6, 7, 8};
    unsigned long long c[4] = {0, 0, 0, 0};
    add256_emulated(a, b, c); // c[] == {6, 9, 10, 12}
    return 0;
}

来源：https://stackoverflow.com/questions/9145644/visual-c-x64-add-with-carry

标签

c++

visual-c++

64-bit

inline-assembly

intrinsics