Cuda object copy

一世执手 提交于 2019-12-02 11:46:28

Use of the __device__ variable is causing difficulty. It is intended to be used for static allocations, known at compile time.

Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.

Some problems with your approach:

  1. You are using an incorrect API for modifying a __device__ variable. A __device__ variable is not written with cudaMemcpy; it is written with cudaMemcpyToSymbol (and read back with cudaMemcpyFromSymbol).

  2. You are not allowed to take the address of a device entity in host code:

    e = cudaMalloc((void**)&d_a, sizeof(A));
                           ^
    

    cudaMalloc expects to store the allocated pointer value in host memory, not in device memory. It will point to a location in device memory, but it should be stored in a host variable.

If you want to stay with your method, the following modifications should make it correct:

$ cat t89.cu
#include <iostream>
#include <stdio.h>

// Minimal payload type for the example: a single public integer field `s`.
struct A {
    int s;  // value set by the host in main and printed by the kernel
};

// Device-side global holding a pointer to an A object.
// The host fills it in with cudaMemcpyToSymbol before launching the kernel.
__device__ A *d_a;

// Kernel: prints the `s` field of the object that d_a points to.
// Takes no parameters; it reads the pointer from the __device__ variable above.
__global__ void MethodA() {
    const A *obj = d_a;
    printf("%d\n", obj->s);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Copies a host A object to the device, publishes its device address through
// the __device__ pointer d_a, runs MethodA to print the field, and copies the
// object back. Returns 0 on success and 1 if any CUDA call failed.
int main() {
    A *a = new A();
    a->s = 10;

    // Host variable that receives the device address; initialised so that the
    // cudaFree below is a harmless no-op if cudaMalloc fails.
    A *temp_d_a = NULL;

    // && short-circuits: later steps are skipped once one has failed.
    bool ok = cudaOk(cudaMalloc((void**)&temp_d_a, sizeof(A)), "cudaMalloc")
           && cudaOk(cudaMemcpy(temp_d_a, a, sizeof(A), cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)")
           // Copies the pointer value itself (sizeof(A *) bytes) into d_a,
           // not the object it points to.
           && cudaOk(cudaMemcpyToSymbol(d_a, &temp_d_a, sizeof(A *)),
                     "cudaMemcpyToSymbol");

    if (ok) {
        MethodA<<<1, 1>>>();
        // Peek (rather than get) so the summary line below still shows a launch error.
        ok = cudaOk(cudaPeekAtLastError(), "MethodA launch")
          // Blocking copy: errors raised while the kernel ran are reported here.
          && cudaOk(cudaMemcpy(a, temp_d_a, sizeof(A), cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    // Summary line; prints "no error" when everything succeeded.
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;

    ok = cudaOk(cudaFree(temp_d_a), "cudaFree") && ok;
    delete a;
    return ok ? 0 : 1;
}
$ nvcc t89.cu -o t89
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$

EDIT: Regarding my previous statement:

Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.

which was asked about in the comments below, here is a worked example showing that approach:

$ cat t89.cu
#include <iostream>
#include <stdio.h>

// Minimal payload type for the example: a single public integer field `s`.
struct A {
    int s;  // value set by the host in main and printed by the kernel
};

// Kernel: prints the `s` field of the A object passed in.
// `a` is dereferenced on the device; main passes a pointer obtained from cudaMalloc.
__global__ void MethodA(A *a) {
    const int value = a->s;
    printf("%d\n", value);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Copies a host A object to the device, passes its device address to MethodA
// as a kernel parameter, and copies the object back.
// Returns 0 on success and 1 if any CUDA call failed.
int main() {
    A *a = new A();
    a->s = 10;

    // An ordinary host-based pointer; initialised so that the cudaFree below
    // is a harmless no-op if cudaMalloc fails.
    A *d_a = NULL;

    // Dynamic allocation created at runtime, then filled from the host object.
    // && short-circuits: the copy is skipped if the allocation failed.
    bool ok = cudaOk(cudaMalloc((void**)&d_a, sizeof(A)), "cudaMalloc")
           && cudaOk(cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");

    if (ok) {
        MethodA<<<1, 1>>>(d_a);  // passed to kernel via parameter
        // Peek (rather than get) so the summary line below still shows a launch error.
        ok = cudaOk(cudaPeekAtLastError(), "MethodA launch")
          // Blocking copy: errors raised while the kernel ran are reported here.
          && cudaOk(cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");
    }

    // Summary line; prints "no error" when everything succeeded.
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;

    ok = cudaOk(cudaFree(d_a), "cudaFree") && ok;
    delete a;
    return ok ? 0 : 1;
}
$ nvcc -o t89 t89.cu
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$
标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!