Question:
I'm trying to use CUDA with objects. This is a little test code I put together to try things out, but I ran into a problem. When I do anything to the device version of the variable, the copy back to the host fails with "cudaErrorIllegalAddress", but if I just copy the object to the device and back, it works. If I comment out the printf... line, it works.
#include <iostream>
#include <stdio.h>

class A {
public:
    int s;
};

__device__ A *d_a;

__global__ void MethodA() {
    printf("%d\n", d_a->s);
}

int main() {
    A *a = new A();
    a->s = 10;
    cudaError e;
    e = cudaMalloc((void**)&d_a, sizeof(A));
    e = cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    MethodA<<<1, 1>>>();
    e = cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost);
    std::cout << cudaGetErrorName(e) << std::endl;
    delete(a);
    std::getchar();
    return 0;
}
Answer 1:
Use of the __device__ variable is causing difficulty. It is intended to be used for static allocations, known at compile time.
Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.
Some problems with your approach:

1. You are using an incorrect API for modifying a __device__ variable. We don't use cudaMemcpy for that; we use cudaMemcpyToSymbol, etc.

2. You are not allowed to take the address of a device entity in host code, which is what &d_a does here:

   e = cudaMalloc((void**)&d_a, sizeof(A));

3. cudaMalloc expects to store the allocated pointer value in host memory, not in device memory. The pointer will point to a location in device memory, but the pointer itself should be stored in a host variable.
If you want to stay with your method, the following modifications should make it correct:
$ cat t89.cu
#include <iostream>
#include <stdio.h>
class A {
public:
    int s;
};

__device__ A *d_a;

__global__ void MethodA() {
    printf("%d\n", d_a->s);
}

int main() {
    A *a = new A();
    a->s = 10;
    A *temp_d_a;  // ordinary host-based pointer to hold the device allocation's address
    cudaMalloc((void**)&temp_d_a, sizeof(A));
    cudaMemcpy(temp_d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(d_a, &temp_d_a, sizeof(A *));  // point the __device__ symbol at the allocation
    MethodA<<<1, 1>>>();
    cudaMemcpy(a, temp_d_a, sizeof(A), cudaMemcpyDeviceToHost);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    cudaFree(temp_d_a);
    delete(a);
    return 0;
}
$ nvcc t89.cu -o t89
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$
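A side note on the original symptom: the illegal address is generated while the kernel runs (it dereferences the uninitialized __device__ pointer), but kernel launches are asynchronous, so the error is only reported by the next CUDA API call. That is why it appeared to come from the cudaMemcpy back to the host, and why commenting out the printf line made it "work". A minimal sketch of error checking that makes this visible (the checkLast helper is just an illustration, not part of the original code):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void EmptyKernel() { }

// Illustrative helper: report the most recent CUDA error, if any.
static void checkLast(const char *where) {
    cudaError_t e = cudaGetLastError();
    if (e != cudaSuccess)
        printf("%s: %s\n", where, cudaGetErrorString(e));
}

int main() {
    EmptyKernel<<<1, 1>>>();
    checkLast("launch");            // catches launch-configuration errors immediately
    cudaDeviceSynchronize();        // wait for the kernel to actually execute
    checkLast("after synchronize"); // execution errors (e.g. illegal address) surface here
    return 0;
}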
EDIT: Regarding my previous statement:
Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.
which was asked about in the comments below, here is a worked example showing that approach:
$ cat t89.cu
#include <iostream>
#include <stdio.h>
class A {
public:
    int s;
};

__global__ void MethodA(A *a) {
    printf("%d\n", a->s);
}

int main() {
    A *a = new A();
    a->s = 10;
    A *d_a;                               // an ordinary host-based pointer
    cudaMalloc((void**)&d_a, sizeof(A));  // dynamic allocation created at runtime
    cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    MethodA<<<1, 1>>>(d_a);               // passed to kernel via parameter
    cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    cudaFree(d_a);
    delete(a);
    return 0;
}
$ nvcc -o t89 t89.cu
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$
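For completeness, the round trip the question was actually testing (modify the object on the device, then read the change back on the host) can be sketched as follows. The Increment kernel and the printed check are additions for illustration, not part of the original answer:

#include <iostream>
#include <cuda_runtime.h>

class A {
public:
    int s;
};

// Modify the object on the device so the copy back has something to show.
__global__ void Increment(A *a) {
    a->s += 1;
}

int main() {
    A *a = new A();
    a->s = 10;
    A *d_a;
    cudaMalloc((void**)&d_a, sizeof(A));
    cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    Increment<<<1, 1>>>(d_a);
    cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost); // synchronizes with the kernel
    std::cout << a->s << std::endl; // expected: 11
    cudaFree(d_a);
    delete a;
    return 0;
}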
Source: https://stackoverflow.com/questions/49878410/cuda-object-copy