Question:
I'm trying to use CUDA with objects. This is a little test code I put together to try things out, but I ran into a problem. When I do anything to the device version of the variable, the copy back to the host fails with "cudaErrorIllegalAddress", but if I just copy the object to the device and back, it works. If I comment out the printf... line, it works.
#include <iostream>
#include <stdio.h>

class A {
public:
    int s;
};

__device__ A *d_a;

__global__ void MethodA() {
    printf("%d\n", d_a->s);
}

int main() {
    A *a = new A();
    a->s = 10;
    cudaError e;
    e = cudaMalloc((void**)&d_a, sizeof(A));
    e = cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    MethodA<<<1, 1>>>();
    e = cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost);
    std::cout << cudaGetErrorName(e) << std::endl;
    delete(a);
    std::getchar();
    return 0;
}
Answer 1:
Use of the __device__ variable is causing difficulty. It is intended to be used for static allocations, known at compile time.
Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.
Some problems with your approach:

1. You are using an incorrect API for modifying a __device__ variable. We don't use cudaMemcpy for that; we use cudaMemcpyToSymbol, etc.

2. You are not allowed to take the address of a device entity in host code, which is what &d_a does here:

   e = cudaMalloc((void**)&d_a, sizeof(A));

3. cudaMalloc expects to store the allocated pointer value in host memory, not in device memory. The pointer will point to a location in device memory, but the pointer itself should be stored in a host variable.
If you want to stay with your method, the following modifications should make it correct:
$ cat t89.cu
#include <iostream>
#include <stdio.h>
class A {
public:
    int s;
};

__device__ A *d_a;

__global__ void MethodA() {
    printf("%d\n", d_a->s);
}

int main() {
    A *a = new A();
    a->s = 10;
    A *temp_d_a;  // ordinary host-based pointer to hold the device allocation's address
    cudaMalloc((void**)&temp_d_a, sizeof(A));
    cudaMemcpy(temp_d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(d_a, &temp_d_a, sizeof(A *));  // point the __device__ symbol at the allocation
    MethodA<<<1, 1>>>();
    cudaMemcpy(a, temp_d_a, sizeof(A), cudaMemcpyDeviceToHost);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    cudaFree(temp_d_a);
    delete(a);
    return 0;
}
$ nvcc t89.cu -o t89
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$
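A side note on the original symptom: the illegal address is generated while the kernel runs (it dereferences the uninitialized __device__ pointer), but kernel launches are asynchronous, so the error is only reported by the next CUDA API call. That is why it appeared to come from the cudaMemcpy back to the host, and why commenting out the printf line made it "work". A minimal sketch of error checking that makes this visible (the checkLast helper is just an illustration, not part of the original code):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void EmptyKernel() { }

// Illustrative helper: report the most recent CUDA error, if any.
static void checkLast(const char *where) {
    cudaError_t e = cudaGetLastError();
    if (e != cudaSuccess)
        printf("%s: %s\n", where, cudaGetErrorString(e));
}

int main() {
    EmptyKernel<<<1, 1>>>();
    checkLast("launch");            // catches launch-configuration errors immediately
    cudaDeviceSynchronize();        // wait for the kernel to actually execute
    checkLast("after synchronize"); // execution errors (e.g. illegal address) surface here
    return 0;
}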
EDIT: Regarding my previous statement:
Your methodology would be simplified if you used an ordinary host-based pointer, pointing to a dynamic allocation created at runtime (which you are doing anyway), and then pass that host-based pointer to the device, via a kernel parameter.
which was asked about in the comments below, here is a worked example showing that approach:
$ cat t89.cu
#include <iostream>
#include <stdio.h>
class A {
public:
    int s;
};

__global__ void MethodA(A *a) {
    printf("%d\n", a->s);
}

int main() {
    A *a = new A();
    a->s = 10;
    A *d_a;                               // an ordinary host-based pointer
    cudaMalloc((void**)&d_a, sizeof(A));  // dynamic allocation created at runtime
    cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    MethodA<<<1, 1>>>(d_a);               // passed to kernel via parameter
    cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost);
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
    cudaFree(d_a);
    delete(a);
    return 0;
}
$ nvcc -o t89 t89.cu
$ cuda-memcheck ./t89
========= CUDA-MEMCHECK
10
no error
========= ERROR SUMMARY: 0 errors
$
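For completeness, the round trip the question was actually testing (modify the object on the device, then read the change back on the host) can be sketched as follows. The Increment kernel and the printed check are additions for illustration, not part of the original answer:

#include <iostream>
#include <cuda_runtime.h>

class A {
public:
    int s;
};

// Modify the object on the device so the copy back has something to show.
__global__ void Increment(A *a) {
    a->s += 1;
}

int main() {
    A *a = new A();
    a->s = 10;
    A *d_a;
    cudaMalloc((void**)&d_a, sizeof(A));
    cudaMemcpy(d_a, a, sizeof(A), cudaMemcpyHostToDevice);
    Increment<<<1, 1>>>(d_a);
    cudaMemcpy(a, d_a, sizeof(A), cudaMemcpyDeviceToHost); // synchronizes with the kernel
    std::cout << a->s << std::endl; // expected: 11
    cudaFree(d_a);
    delete a;
    return 0;
}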
Source: https://stackoverflow.com/questions/49878410/cuda-object-copy