Is there an equivalent to memcpy() that works inside a CUDA kernel?

前端 未结 3 1987
一个人的身影
一个人的身影 2020-12-23 22:05

I\'m trying to break apart and reshape the structure of an array asynchronously using the CUDA kernel. memcpy() doesn\'t work inside the kernel, and neither doe

3条回答
  •  离开以前
    2020-12-23 22:38

    Yes, there is an equivalent to memcpy that works inside cuda kernels. It is called memcpy. As an example:

    __global__ void kernel(int **in, int **out, int len, int N)
    {
        int idx = threadIdx.x + blockIdx.x*blockDim.x;
    
        for(; idx

    which compiles without error like this:

    $ nvcc -Xptxas="-v" -arch=sm_20 -c memcpy.cu 
    ptxas info    : Compiling entry function '_Z6kernelPPiS0_ii' for 'sm_20'
    ptxas info    : Function properties for _Z6kernelPPiS0_ii
        0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
    ptxas info    : Used 11 registers, 48 bytes cmem[0]
    

    and emits PTX:

    .version 3.0
    .target sm_20
    .address_size 32
    
        .file   1 "/tmp/tmpxft_00000407_00000000-9_memcpy.cpp3.i"
        .file   2 "memcpy.cu"
        .file   3 "/usr/local/cuda/nvvm/ci_include.h"
    
    .entry _Z6kernelPPiS0_ii(
        .param .u32 _Z6kernelPPiS0_ii_param_0,
        .param .u32 _Z6kernelPPiS0_ii_param_1,
        .param .u32 _Z6kernelPPiS0_ii_param_2,
        .param .u32 _Z6kernelPPiS0_ii_param_3
    )
    {
        .reg .pred  %p<4>;
        .reg .s32   %r<32>;
        .reg .s16   %rc<2>;
    
    
        ld.param.u32    %r15, [_Z6kernelPPiS0_ii_param_0];
        ld.param.u32    %r16, [_Z6kernelPPiS0_ii_param_1];
        ld.param.u32    %r2, [_Z6kernelPPiS0_ii_param_3];
        cvta.to.global.u32  %r3, %r15;
        cvta.to.global.u32  %r4, %r16;
        .loc 2 4 1
        mov.u32     %r5, %ntid.x;
        mov.u32     %r17, %ctaid.x;
        mov.u32     %r18, %tid.x;
        mad.lo.s32  %r30, %r5, %r17, %r18;
        .loc 2 6 1
        setp.ge.s32     %p1, %r30, %r2;
        @%p1 bra    BB0_5;
    
        ld.param.u32    %r26, [_Z6kernelPPiS0_ii_param_2];
        shl.b32     %r7, %r26, 2;
        .loc 2 6 54
        mov.u32     %r19, %nctaid.x;
        .loc 2 4 1
        mov.u32     %r29, %ntid.x;
        .loc 2 6 54
        mul.lo.s32  %r8, %r29, %r19;
    
    BB0_2:
        .loc 2 7 1
        shl.b32     %r21, %r30, 2;
        add.s32     %r22, %r4, %r21;
        ld.global.u32   %r11, [%r22];
        add.s32     %r23, %r3, %r21;
        ld.global.u32   %r10, [%r23];
        mov.u32     %r31, 0;
    
    BB0_3:
        add.s32     %r24, %r10, %r31;
        ld.u8   %rc1, [%r24];
        add.s32     %r25, %r11, %r31;
        st.u8   [%r25], %rc1;
        add.s32     %r31, %r31, 1;
        setp.lt.u32     %p2, %r31, %r7;
        @%p2 bra    BB0_3;
    
        .loc 2 6 54
        add.s32     %r30, %r8, %r30;
        ld.param.u32    %r27, [_Z6kernelPPiS0_ii_param_3];
        .loc 2 6 1
        setp.lt.s32     %p3, %r30, %r27;
        @%p3 bra    BB0_2;
    
    BB0_5:
        .loc 2 9 2
        ret;
    }
    

    The code block at BB0_3 is a byte sized memcpy loop emitted automagically by the compiler. It might not be a great idea from a performance point-of-view to use it, but it is fully supported (and has been for a long time on all architectures).


    Edited four years later to add that since the device side runtime API was released as part of the CUDA 6 release cycle, it is also possible to directly call something like

    cudaMemcpyAsync(void *to, void *from, size, cudaMemcpyDeviceToDevice)
    

    in device code for all architectures which support it (Compute Capability 3.5 and newer hardware using separate compilation and device linking).

提交回复
热议问题