Replicate a vector multiple times using CUDA Thrust

前端 未结 3 1401
广开言路
广开言路 2020-12-17 05:36

I am trying to solve a problem using CUDA Thrust.

I have a host array with 3 elements. Is it possible, using Thrust, to create a device array of 3

3条回答
  •  没有蜡笔的小新
    2020-12-17 05:56

    As an apparently simpler alternative to CUDA Thrust, I'm posting below a worked example that implements Matlab's classic meshgrid function in CUDA.

    In Matlab

    x = [1 2 3];
    y = [4 5 6 7];
    [X, Y] = meshgrid(x, y);
    

    produces

    X =
    
         1     2     3
         1     2     3
         1     2     3
         1     2     3
    

    and

    Y =
    
         4     4     4
         5     5     5
         6     6     6
         7     7     7
    

    X is exactly the four-fold replication of the x array, which is what the OP asked for and the first interpretation in Robert Crovella's answer, while Y is the three-fold consecutive replication of each element of the y array, which is the second interpretation in Robert Crovella's answer.

    Here is the code:

    #include <cstdio>
    #include <cstdlib>

    #include <thrust/pair.h>

    #include "Utilities.cuh"
    
    #define BLOCKSIZE_MESHGRID_X    16
    #define BLOCKSIZE_MESHGRID_Y    16
    
    #define DEBUG
    
    /*******************/
    /* MESHGRID KERNEL */
    /*******************/
    // Writes the two row-major Ny x Nx meshgrid matrices:
    //   X[row * Nx + col] = x[col]   (every row of X is a copy of x)
    //   Y[row * Nx + col] = y[row]   (every column of Y is a copy of y)
    //
    // Launch layout: 2D grid of 2D blocks; the x-dimension indexes columns
    // (0..Nx-1) and the y-dimension indexes rows (0..Ny-1). Threads that fall
    // outside the Ny x Nx rectangle do nothing, so the grid may overshoot.
    //
    // x : input of Nx elements        y : input of Ny elements
    // X : output of Nx * Ny elements  Y : output of Nx * Ny elements
    // All four pointers are dereferenced on the device and, being declared
    // __restrict__, must not alias each other.
    template <class T>
    __global__ void meshgrid_kernel(const T * __restrict__ x, size_t Nx, const T * __restrict__ y, size_t Ny, T * __restrict__ X, T * __restrict__ Y)
    {
        // Widen before multiplying so the global index cannot wrap in 32 bits.
        const size_t col = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
        const size_t row = (size_t)blockIdx.y * blockDim.y + threadIdx.y;

        // Bounds guard: the grid rarely divides the matrix evenly.
        if ((col < Nx) && (row < Ny)) {
            const size_t idx = row * Nx + col;
            X[idx] = x[col];
            Y[idx] = y[row];
        }
    }
    
    /************/
    /* MESHGRID */
    /************/
    // Builds the meshgrid of two device vectors.
    //
    // x, y   : device pointers to Nx and Ny elements respectively.
    // returns: pair (X, Y) of device pointers, each holding Nx * Ny elements
    //          laid out row-major as Ny rows by Nx columns, with
    //          X[j * Nx + i] = x[i] and Y[j * Nx + i] = y[j].
    //
    // The two output buffers are allocated here with cudaMalloc and are never
    // freed by this function; the caller must cudaFree both. When Nx or Ny is
    // zero nothing is allocated or launched and both pointers are NULL.
    template <class T>
    thrust::pair<T *, T *> meshgrid(const T *x, const unsigned int Nx, const T *y, const unsigned int Ny) {

        T *X = NULL;
        T *Y = NULL;

        // Widen before multiplying: Nx * Ny can exceed the 32-bit range.
        const size_t numElements = (size_t)Nx * (size_t)Ny;

        // A grid with a zero dimension is an invalid launch configuration.
        if (numElements == 0) return thrust::make_pair(X, Y);

        gpuErrchk(cudaMalloc((void**)&X, numElements * sizeof(T)));
        gpuErrchk(cudaMalloc((void**)&Y, numElements * sizeof(T)));

        dim3 BlockSize(BLOCKSIZE_MESHGRID_X, BLOCKSIZE_MESHGRID_Y);
        // One thread per output element: grid x covers the Nx columns and
        // grid y covers the Ny rows.
        dim3 GridSize (iDivUp(Nx, BLOCKSIZE_MESHGRID_X), iDivUp(Ny, BLOCKSIZE_MESHGRID_Y));

        meshgrid_kernel<<<GridSize, BlockSize>>>(x, Nx, y, Ny, X, Y);
    #ifdef DEBUG
        // Surface launch-configuration errors, then in-kernel faults.
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    #endif

        return thrust::make_pair(X, Y);
    }
    
    /********/
    /* MAIN */
    /********/
    // Demo driver: builds x = {0, 1, 2} and y = {4, 5, 6, 7} on the host,
    // computes their meshgrid on the device, copies both result matrices back
    // and prints one line per (i, j) entry.
    int main()
    {
        const int Nx = 3;
        const int Ny = 4;

        // Host inputs.
        float *h_x = (float *)malloc(Nx * sizeof(float));
        float *h_y = (float *)malloc(Ny * sizeof(float));

        // Host copies of the Ny x Nx results.
        float *h_X = (float *)malloc(Nx * Ny * sizeof(float));
        float *h_Y = (float *)malloc(Nx * Ny * sizeof(float));

        if (h_x == NULL || h_y == NULL || h_X == NULL || h_Y == NULL) {
            fprintf(stderr, "Host memory allocation failed\n");
            free(h_x); free(h_y); free(h_X); free(h_Y);   // free(NULL) is a no-op
            return 1;
        }

        for (int i = 0; i < Nx; i++) h_x[i] = (float)i;
        for (int i = 0; i < Ny; i++) h_y[i] = i + 4.f;

        float *d_x; gpuErrchk(cudaMalloc(&d_x, Nx * sizeof(float)));
        float *d_y; gpuErrchk(cudaMalloc(&d_y, Ny * sizeof(float)));

        gpuErrchk(cudaMemcpy(d_x, h_x, Nx * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_y, h_y, Ny * sizeof(float), cudaMemcpyHostToDevice));

        // meshgrid allocates the two device outputs; they are released below.
        thrust::pair<float *, float *> meshgrid_pointers = meshgrid(d_x, Nx, d_y, Ny);
        float *d_X = meshgrid_pointers.first;
        float *d_Y = meshgrid_pointers.second;

        // Blocking copies, so the results are complete before they are printed.
        gpuErrchk(cudaMemcpy(h_X, d_X, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));
        gpuErrchk(cudaMemcpy(h_Y, d_Y, Nx * Ny * sizeof(float), cudaMemcpyDeviceToHost));

        for (int j = 0; j < Ny; j++) {
            for (int i = 0; i < Nx; i++) {
                printf("i = %i; j = %i; x = %f; y = %f\n", i, j, h_X[j * Nx + i], h_Y[j * Nx + i]);
            }
        }

        // Release device buffers, including the two returned by meshgrid.
        gpuErrchk(cudaFree(d_x));
        gpuErrchk(cudaFree(d_y));
        gpuErrchk(cudaFree(d_X));
        gpuErrchk(cudaFree(d_Y));

        // Release host buffers.
        free(h_x);
        free(h_y);
        free(h_X);
        free(h_Y);

        return 0;

    }
    

提交回复
热议问题