What's the efficient way to swap two register variables in CUDA?

限于喜欢 提交于 2020-01-24 07:20:46

问题


I'm starting to write some CUDA code, and I want to do the equivalent of std::swap() for two variables within a kernel; they're in the register file (no spillage, not in some buffer, etc.). Suppose I have the following device code:

__device__ foo(/* some args here */) {

    /* etc. */

    int x = /* value v1 */;
    int y = /* value v2 */;

    /* etc. */

    swap(x,y);

    /* etc. */
}        

Now, I could just write

template <typename T> void swap ( T& a, T& b )
{
  T c(a); a=b; b=c;
}

but I wonder - isn't there some CUDA built-in for this functionality?

Notes:

  • Yes, I want this to run for all threads.
  • Never mind about whether I have enough registers or not. Assume that I have them.

回答1:


To the best of my knowledge, this is all completely irrelevant.

x and y are not "real" objects: they only exist in the abstract machine described by the C++ standard. In particular, they do not correspond to registers.

You might imagine that the compiler when creating your program would assign them to registers, but that's really not how things work. The things being stored in registers can get shuffled around, duplicated, changed into something else, or even eliminated entirely.

In particular, unconditionally swapping two variables that are stored in registers usually doesn't generate any code at all — its only effect is for the compiler to adjust its internal tables of what objects are being stored in what registers at that point in time.

(even for a conditional swap, you're still usually better off letting the compiler do its thing)




回答2:


I have considered the following test program

template <typename T> __device__ void inline swap_test_device1(T& a, T& b)
{
    T c(a); a=b; b=c;
}

template <typename T> __device__ void inline swap_test_device2(T a, T b)
{
    T c(a); a=b; b=c;
}

__global__ void swap_test_global(const int* __restrict__ input1, const int* __restrict__ input2, int* output1, int* output2) {

    int tx = threadIdx.x + blockIdx.x * blockDim.x;

    int x = input1[tx]*input1[tx];
    int y = input2[tx]*input2[tx];

    //swap_test_device2(x,y);
    swap_test_device1(x,y);

    output1[tx] = x;
    output2[tx] = y;

} 

and I have disassembled it. The result when using swap_test_device1 and swap_test_device2 is the same. The common disassembled code is the following

MOV R1, c[0x1][0x100];
S2R R0, SR_CTAID.X;
S2R R2, SR_TID.X;
MOV32I R9, 0x4;  
IMAD R3, R0, c[0x0][0x8], R2;
IMAD R6.CC, R3, R9, c[0x0][0x28];
IMAD.HI.X R7, R3, R9, c[0x0][0x2c];
IMAD R10.CC, R3, R9, c[0x0][0x20];
LD.E R2, [R6];                         loads input1[tx] and stores it in R2
IMAD.HI.X R11, R3, R9, c[0x0][0x24];
IMAD R4.CC, R3, R9, c[0x0][0x30];
LD.E R0, [R10];                        loads input2[tx] and stores it in R0
IMAD.HI.X R5, R3, R9, c[0x0][0x34];
IMAD R8.CC, R3, R9, c[0x0][0x38];
IMAD.HI.X R9, R3, R9, c[0x0][0x3c];
IMUL R2, R2, R2;                       R2 = R2 * R2
ST.E [R4], R2;                         stores input1[tx]*input1[tx] in global memory
IMUL R0, R0, R0;                       R0 = R0 * R0
ST.E [R8], R0;                         stores input2[tx]*input2[tx] in global memory
EXIT ;

It seems that the there is not an explicit swap in the disassembled code. In other words, the compiler, for this simple example, is capable to optimize the code directly writing x and y in the proper global memory locations.

EDIT

I have now considered the following more involved test case

__global__ void swap_test_global(const char* __restrict__ input1, const char* __restrict__ input2, char* output1, char* output2) {

    int tx = threadIdx.x + blockIdx.x * blockDim.x;

    char x = input1[tx];
    char y = input2[tx];

    //swap_test_device2(x,y);
    swap_test_device1(x,y);

    output1[tx] = (x >> 3) & y;
    output2[tx] = (y >> 5) & x;

 }

with the same above __device__ functions. The disassembled code is

MOV R1, c[0x1][0x100];              
S2R R0, SR_CTAID.X;                 
S2R R2, SR_TID.X;           
IMAD R0, R0, c[0x0][0x8], R2;       R0 = threadIdx.x + blockIdx.x * blockDim.x
BFE R7, R0, 0x11f;
IADD R8.CC, R0, c[0x0][0x28];
IADD.X R9, R7, c[0x0][0x2c];
IADD R10.CC, R0, c[0x0][0x20];
LD.E.S8 R4, [R8];                   R4 = x = input1[tx]
IADD.X R11, R7, c[0x0][0x24];
IADD R2.CC, R0, c[0x0][0x30];
LD.E.S8 R5, [R10];                  R5 = y = input2[tx]
IADD.X R3, R7, c[0x0][0x34];
IADD R12.CC, R0, c[0x0][0x38];
IADD.X R13, R7, c[0x0][0x3c];
SHR.U32 R0, R4, 0x3;                R0 = x >> 3
SHR.U32 R6, R5, 0x5;                R6 = y >> 5
LOP.AND R5, R0, R5;                 R5 = (x >> 3) & y
LOP.AND R0, R6, R4;                 R0 = (y >> 5) & x
ST.E.U8 [R2], R5;                   global memory store
ST.E.U8 [R12], R0;                  global memory store
EXIT ;

As it can be seen, there is still no apparent register swap.



来源:https://stackoverflow.com/questions/19559745/whats-the-efficient-way-to-swap-two-register-variables-in-cuda

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!