Copying array of pointers into device memory and back (CUDA)

前端 未结 1 1774
一向
一向 2020-12-17 03:06

I am trying to use cublas function cublasSgemmBatched in my toy example. In this example I first allocate 2D arrays: h_AA, h_BB of the

相关标签:
1条回答
  • 2020-12-17 03:59

    So, I figured out the answer (thanks to @Robert Crovella): to create a device array of pointers to device arrays (as required by the batched functions), one should first create a host array of pointers to device arrays, and then copy it into a device array of pointers to device arrays. The same is true of transferring back to the host: one should use an intermediate host array of pointers to device arrays.

    // Batched 1x1 GEMMs with cublasSgemmBatched: each of the `batch` problems
    // multiplies a 1 x len matrix by a len x 1 matrix, producing one float.
    //
    // The batched API needs *device* arrays of *device* pointers. They are
    // built in two steps: a host table (h_AA/h_BB/h_CC) is filled with
    // pointers returned by cudaMalloc, then the table itself is copied into
    // device memory (d_AA/d_BB/d_CC).
    //
    // Error handling: cudaerr keeps the first CUDA runtime failure and stat
    // the cuBLAS status; later GPU steps are skipped once a failure has been
    // recorded. Inspect both before trusting the contents of h_C.
    const int batch = 6;  // number of independent GEMMs
    const int len = 5;    // inner dimension k shared by every GEMM

    cublasHandle_t handle;
    cudaError_t cudaerr = cudaSuccess;
    cublasStatus_t stat = CUBLAS_STATUS_NOT_INITIALIZED;
    const float alpha = 1.0f;
    const float beta = 0.0f;

    // Host inputs (the same vector is reused for every batch entry) and the
    // host buffer receiving one result per batch entry.
    float *h_A = new float[len];
    float *h_B = new float[len];
    float *h_C = new float[batch];
    for (int i = 0; i < len; i++)
    {
        h_A[i] = i;
        h_B[i] = i;
    }
    for (int i = 0; i < batch; i++)
        h_C[i] = 0.0f;

    // Host tables of device pointers. Zero them first so the cleanup code
    // below can call cudaFree on every slot even after a partial failure.
    float **h_AA, **h_BB, **h_CC;
    h_AA = (float**)malloc(batch * sizeof(float*));
    h_BB = (float**)malloc(batch * sizeof(float*));
    h_CC = (float**)malloc(batch * sizeof(float*));
    for (int i = 0; i < batch; i++)
    {
        h_AA[i] = 0;
        h_BB[i] = 0;
        h_CC[i] = 0;
    }

    // Allocate and fill the per-batch device buffers.
    for (int i = 0; i < batch && cudaerr == cudaSuccess; i++)
    {
        cudaerr = cudaMalloc((void **)&h_AA[i], len * sizeof(float));
        if (cudaerr == cudaSuccess)
            cudaerr = cudaMalloc((void **)&h_BB[i], len * sizeof(float));
        if (cudaerr == cudaSuccess)
            cudaerr = cudaMalloc((void **)&h_CC[i], sizeof(float));
        if (cudaerr == cudaSuccess)
            cudaerr = cudaMemcpy(h_AA[i], h_A, len * sizeof(float), cudaMemcpyHostToDevice);
        if (cudaerr == cudaSuccess)
            cudaerr = cudaMemcpy(h_BB[i], h_B, len * sizeof(float), cudaMemcpyHostToDevice);
    }

    // Device copies of the pointer tables; these are what cuBLAS receives.
    float **d_AA = 0, **d_BB = 0, **d_CC = 0;
    if (cudaerr == cudaSuccess)
        cudaerr = cudaMalloc((void **)&d_AA, batch * sizeof(float*));
    if (cudaerr == cudaSuccess)
        cudaerr = cudaMalloc((void **)&d_BB, batch * sizeof(float*));
    if (cudaerr == cudaSuccess)
        cudaerr = cudaMalloc((void **)&d_CC, batch * sizeof(float*));
    if (cudaerr == cudaSuccess)
        cudaerr = cudaMemcpy(d_AA, h_AA, batch * sizeof(float*), cudaMemcpyHostToDevice);
    if (cudaerr == cudaSuccess)
        cudaerr = cudaMemcpy(d_BB, h_BB, batch * sizeof(float*), cudaMemcpyHostToDevice);
    if (cudaerr == cudaSuccess)
        cudaerr = cudaMemcpy(d_CC, h_CC, batch * sizeof(float*), cudaMemcpyHostToDevice);

    if (cudaerr == cudaSuccess)
    {
        stat = cublasCreate(&handle);
        if (stat == CUBLAS_STATUS_SUCCESS)
        {
            // m = 1, n = 1, k = len; lda = 1, ldb = len, ldc = 1 (column-major).
            stat = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, 1, 1, len, &alpha,
                     (const float**)d_AA, 1, (const float**)d_BB, len, &beta, d_CC, 1, batch);
            if (stat == CUBLAS_STATUS_SUCCESS)
            {
                // Bring the whole pointer table back (batch pointers, not
                // sizeof(float) bytes), then fetch each scalar result through
                // the device pointer stored in the host table.
                cudaerr = cudaMemcpy(h_CC, d_CC, batch * sizeof(float*), cudaMemcpyDeviceToHost);
                for (int i = 0; i < batch && cudaerr == cudaSuccess; i++)
                    cudaerr = cudaMemcpy(h_C + i, h_CC[i], sizeof(float), cudaMemcpyDeviceToHost);
            }
            cublasDestroy(handle);
        }
    }

    // h_C[i] now holds the result of batch entry i (valid only when cudaerr
    // is cudaSuccess and stat is CUBLAS_STATUS_SUCCESS); consume it here,
    // before the buffers are released.

    // Release device buffers, pointer tables and host arrays.
    for (int i = 0; i < batch; i++)
    {
        cudaFree(h_AA[i]);
        cudaFree(h_BB[i]);
        cudaFree(h_CC[i]);
    }
    cudaFree(d_AA);
    cudaFree(d_BB);
    cudaFree(d_CC);
    free(h_AA);
    free(h_BB);
    free(h_CC);
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    
    0 讨论(0)
提交回复
热议问题