CUDA cudaMemcpy Struct of Arrays

后端 未结 1 1587
走了就别回头了
走了就别回头了 2020-12-16 05:59

I'd like to clean up the parameters of CUDA kernels in my project.


Now, a kernel needs 3 uint32_t arrays, which leads to pretty ugly code:

相关标签:
1条回答
  • 2020-12-16 06:34

    You have at least two options. One excellent choice was already given by talonmies, but I'll introduce you to the "learn the hard way" approach.

    First, your struct definition:

    // Structure-of-Arrays parameter bundle: three separately-allocated
    // arrays plus their common element count, so a kernel can take one
    // pointer instead of four arguments. In this example all three
    // pointers are bound to device allocations before the struct is used.
    typedef struct S {
        uint32_t *arr1;   // first array  (size elements)
        uint32_t *arr2;   // second array (size elements)
        uint32_t *arr3;   // third array  (size elements)
        uint32_t size;    // element count shared by arr1/arr2/arr3
    } S;
    
    

    ...and the kernel definition (using a global variable for the size, but you don't need to follow that pattern):

    const int size = 10000;
    
    // Writes a constant value into each element of the struct's three
    // arrays (one thread per element, 1D launch). The guard below makes
    // any over-provisioned threads in the last block exit immediately.
    __global__ void some_kernel(S *s)
    {
        const int idx = blockDim.x * blockIdx.x + threadIdx.x;
        if (idx >= size)
            return;               // out-of-bounds guard for the tail block
    
        s->arr1[idx] = 1; // val1
        s->arr2[idx] = 2; // val2
        s->arr3[idx] = 3; // val3
    }
    
    

    Notice that the `if` guard protects you from running out of bounds.

    Next, we write some host code that prepares the data, executes the kernel, and prints some results. Part one is data allocation:

    uint32_t *host_arr1, *host_arr2, *host_arr3;   // host-side source/result buffers
    uint32_t *dev_arr1, *dev_arr2, *dev_arr3;      // device-side counterparts
    
    // Allocate and fill host data
    // new uint32_t[size]() value-initializes, i.e. zero-fills each array.
    host_arr1 = new uint32_t[size]();
    host_arr2 = new uint32_t[size]();
    host_arr3 = new uint32_t[size]();
    
    // Allocate device data
    // NOTE(review): return codes are ignored throughout for brevity; real
    // code should check every CUDA API call (see the closing remark).
    cudaMalloc((void **) &dev_arr1, size * sizeof(*dev_arr1));
    cudaMalloc((void **) &dev_arr2, size * sizeof(*dev_arr2));
    cudaMalloc((void **) &dev_arr3, size * sizeof(*dev_arr3));
    
    // Allocate helper struct on the device
    // dev_s points to device memory; its pointer fields stay uninitialized
    // until the "binding" copies performed below.
    S *dev_s;
    cudaMalloc((void **) &dev_s, sizeof(*dev_s));
    
    

    It's nothing special: you just allocate three arrays and the struct. What looks more interesting is how to copy such data to the device:

    // Copy data from host to device (full array contents, size elements each)
    cudaMemcpy(dev_arr1, host_arr1, size * sizeof(*dev_arr1), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_arr2, host_arr2, size * sizeof(*dev_arr2), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_arr3, host_arr3, size * sizeof(*dev_arr3), cudaMemcpyHostToDevice);
    
    // NOTE: Binding pointers with dev_s
    // &(dev_s->arr1) only computes an address (base + field offset); it does
    // NOT dereference the device pointer on the host, so this is legal. Each
    // copy writes one pointer value (sizeof a pointer, not the array data)
    // into the corresponding field of the device-resident struct.
    cudaMemcpy(&(dev_s->arr1), &dev_arr1, sizeof(dev_s->arr1), cudaMemcpyHostToDevice);
    cudaMemcpy(&(dev_s->arr2), &dev_arr2, sizeof(dev_s->arr2), cudaMemcpyHostToDevice);
    cudaMemcpy(&(dev_s->arr3), &dev_arr3, sizeof(dev_s->arr3), cudaMemcpyHostToDevice);
    
    

    Besides the ordinary copies of the arrays, notice that it is also necessary to "bind" them to the struct. For that you pass the address of each pointer field; as a result, only the pointer values themselves are copied.

    Next comes the kernel call, copying the data back to the host, and printing the results:

    // Call kernel: one thread per element, 256 threads per block.
    // (size + 255) / 256 is the ceil-division launch config: it rounds the
    // grid up so the tail elements are covered, and the kernel's bounds
    // check discards the surplus threads. This also reuses the `size`
    // constant instead of duplicating the magic number 10000 and, unlike
    // size/256 + 1, does not launch a needless extra block when size is an
    // exact multiple of 256.
    some_kernel<<<(size + 255) / 256, 256>>>(dev_s);
    
    // Copy result to host.
    // cudaMemcpy on the default stream blocks until the kernel has
    // finished, so no explicit cudaDeviceSynchronize() is needed here.
    cudaMemcpy(host_arr1, dev_arr1, size * sizeof(*host_arr1), cudaMemcpyDeviceToHost);
    cudaMemcpy(host_arr2, dev_arr2, size * sizeof(*host_arr2), cudaMemcpyDeviceToHost);
    cudaMemcpy(host_arr3, dev_arr3, size * sizeof(*host_arr3), cudaMemcpyDeviceToHost);
    
    // Print some result (last element of each array: expect 1, 2, 3)
    std::cout << host_arr1[size-1] << std::endl;
    std::cout << host_arr2[size-1] << std::endl;
    std::cout << host_arr3[size-1] << std::endl;
    
    

    Keep in mind that in any serious code you should always check for errors from CUDA API calls.

    0 讨论(0)
提交回复
热议问题