I've been trying to transform some CUDA/C code into more object-oriented code, but my goal doesn't seem easy to achieve with my current understanding of how CUDA functions.
Your approach should be workable. When you pass an object by value as a kernel parameter (as you have indicated), there really isn't much setup that needs to be done for the transfer from host to device.
You still need to properly allocate data on the host and the device, and use cudaMemcpy-type operations at the appropriate points to move the data, just as you would in an ordinary CUDA program.
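For illustration, here is a minimal, self-contained sketch of that pattern (the Params and touch names are placeholders I made up, not from your code): the object carries a device pointer, the pointer is allocated and populated on the host side, and then the object itself is passed by value to the kernel.

#include <cstdio>

struct Params {
    int n;
    int *d_vals;   // device pointer carried inside the object
};

__global__ void touch(Params p)    // the object itself is passed by value
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < p.n) p.d_vals[i] += 1;
}

int main()
{
    const int N = 8;
    int h_vals[N] = {0};
    Params p;
    p.n = N;
    cudaMalloc(&p.d_vals, N*sizeof(int));                                  // device allocation
    cudaMemcpy(p.d_vals, h_vals, N*sizeof(int), cudaMemcpyHostToDevice);   // host -> device
    touch<<<1,N>>>(p);                                                     // pass object by value
    cudaMemcpy(h_vals, p.d_vals, N*sizeof(int), cudaMemcpyDeviceToHost);   // device -> host
    for (int i = 0; i < N; i++) printf("%d ", h_vals[i]);
    printf("\n");
    cudaFree(p.d_vals);
    return 0;
}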
One thing to be aware of when declaring an object at global scope, as you have done, is that it is recommended not to use CUDA API calls in the object's constructor or destructor. The reasons are covered here, so I won't repeat them. Although that treatment mostly focuses on kernels launched before main, CUDA lazy initialization can also impact any CUDA API call that executes outside of main scope, which applies to the constructors and destructors of objects instantiated at global scope.
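A minimal sketch of the safe pattern (DeviceBuffer, init, and release are hypothetical names, not from your code): keep the constructor and destructor trivial, and move all CUDA API activity into explicit methods that are called from inside main.

class DeviceBuffer
{
public:
    DeviceBuffer() : d_ptr(nullptr) {}   // trivial: no CUDA API calls here
    ~DeviceBuffer() {}                   // likewise, no cudaFree here
    void init(int n) { cudaMalloc(&d_ptr, n*sizeof(int)); }   // call from inside main
    void release()   { cudaFree(d_ptr); }                     // call before main returns
private:
    int *d_ptr;
};

The example below follows the same idea: the constructor and destructor of myClass are empty, and allocation and deallocation happen in setValues and export_data, both of which are called from main.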
What follows is a fleshed-out example built around what you have shown. I mostly didn't change the code you had already written; I just added definitions for the methods you hadn't written yet. There are obviously many possible approaches here; for more examples, you might want to look at the CUDA C++ integration sample code. Here's the worked example:
$ cat t1236.cu
#include <cstdio>
#include <cstdlib>
#include <cstring>
class myClass
{
public:
    bool bool_var;    // Set from host and readable from device
    int data_size;    // Set from host
    __host__ myClass();
    __host__ ~myClass();
    __host__ void setValues(bool iftrue, int size);
    __device__ void dosomething(int device_parameter);
    __host__ void export_data();
    // completely unknown methods
    __host__ void prepareDeviceObj();
    __host__ void retrieveDataToHost();
private:
    int *data;        // Filled in device, shared between threads, at the end copied back to host for data output
    int *h_data;
};

__host__ myClass::myClass()
{
}

__host__ myClass::~myClass()
{
}

__host__ void myClass::prepareDeviceObj(){
    cudaMemcpy(data, h_data, data_size*sizeof(h_data[0]), cudaMemcpyHostToDevice);
}

__host__ void myClass::retrieveDataToHost(){
    cudaMemcpy(h_data, data, data_size*sizeof(h_data[0]), cudaMemcpyDeviceToHost);
}

__host__ void myClass::setValues(bool iftrue, int size)
{
    bool_var = iftrue;
    data_size = size;
    cudaMalloc(&data, data_size*sizeof(data[0]));
    h_data = (int *)malloc(data_size*sizeof(h_data[0]));
    memset(h_data, 0, data_size*sizeof(h_data[0]));
}

__device__ void myClass::dosomething(int idx)
{
    int toadd = idx+data_size;
    atomicAdd(&(data[idx]), toadd);    // data should be unique among threads
}

__host__ void myClass::export_data(){
    for (int i = 0; i < data_size; i++) printf("%d ", h_data[i]);
    printf("\n");
    cudaFree(data);
    free(h_data);
}

__global__ void myKernel(myClass obj)
{
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx < obj.data_size)
    {
        if(!obj.bool_var)
            printf("Object is not up to any task here!");
        else
        {
            //printf("Object is ready!");
            obj.dosomething(idx);
        }
    }
}

myClass globalInstance;

int main(int argc, char** argv)
{
    int some_number = 40;
    globalInstance.setValues(true, some_number);
    globalInstance.prepareDeviceObj();
    myKernel<<<1,some_number>>>(globalInstance);
    globalInstance.retrieveDataToHost();
    globalInstance.export_data();
    exit(EXIT_SUCCESS);
}
$ nvcc -o t1236 t1236.cu
$ cuda-memcheck ./t1236
========= CUDA-MEMCHECK
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
========= ERROR SUMMARY: 0 errors
$
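As an aside, for anything beyond a demonstrator like this you would also want to check the return status of every CUDA API call. One common approach (this macro is a sketch, not part of the code above) is:

#define cudaCheck(call) do { \
    cudaError_t err = (call); \
    if (err != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s at %s:%d\n", \
                cudaGetErrorString(err), __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } } while (0)

// usage, e.g.:
// cudaCheck(cudaMemcpy(h_data, data, data_size*sizeof(h_data[0]), cudaMemcpyDeviceToHost));

Running under cuda-memcheck, as shown above, is also a good habit during development.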