There are several invalid memory accesses in the provided code.
- Accessing device memory (allocated using cudaMalloc) from the host, e.g. dereferencing d_data->a, causes undefined behavior (segmentation fault etc.).
- cudaMemcpy takes the pointers themselves as arguments, not the addresses of the pointers. So cudaMemcpy(&d_data, &h_data, ...) should be replaced with cudaMemcpy(d_data, h_data, ...).
Allocating a device object that has a device pointer as a member is a bit tricky. It can be achieved as follows:
- Allocate a temporary host object (MyStruct temp).
- Allocate device memory for the member we want on the device (cudaMalloc(&temp.a, bytes)).
- Allocate the device object (cudaMalloc(&d_data, sizeof(MyStruct))).
- Copy the temporary host object to the device object (cudaMemcpy(d_data, &temp, sizeof(MyStruct), cudaMemcpyHostToDevice)).
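Keep in mind that d_data->a on the device and temp.a on the host hold the same device address. When the kernel modifies the contents of d_data->a, the data that temp.a points to changes as well (the pointer value itself does not change), because both refer to the same memory location on the device. This is why the results can be copied back to the host through temp.a.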
Your final main function will look something like this:
// Demonstrates passing a struct that contains a device pointer to a kernel.
//
// Steps:
//   1. Build the input on the host (h_data) and an output buffer (out_data).
//   2. Stage a host-side MyStruct (temp) whose member `a` holds a *device*
//      address, and copy the input array into that device buffer.
//   3. Allocate the device-side MyStruct (d_data) and copy `temp` into it, so
//      d_data->a (read on the device) and temp.a (held on the host) carry the
//      same device address.
//   4. Launch structOperation with one block of `count` threads, then copy the
//      array back to the host through temp.a and print it.
//
// Returns 0 on success, 1 if a host allocation fails. CUDA failures are
// handed to checkCuda (defined elsewhere in this file).
int main() {
    // Single source of truth for the element count; it is also used as the
    // block size of the launch below.
    const int count = 32;
    const size_t arrayBytes = count * sizeof(int);

    MyStruct *h_data = (MyStruct *)malloc(sizeof(MyStruct));
    MyStruct *out_data = (MyStruct *)malloc(sizeof(MyStruct));
    if (h_data == NULL || out_data == NULL) {
        fprintf(stderr, "Host allocation of MyStruct failed\n");
        free(h_data);  // free(NULL) is a no-op
        free(out_data);
        return 1;
    }

    h_data->b = count;
    h_data->a = (int *)malloc(arrayBytes);
    out_data->b = count;
    out_data->a = (int *)malloc(arrayBytes);
    if (h_data->a == NULL || out_data->a == NULL) {
        fprintf(stderr, "Host allocation of data arrays failed\n");
        free(h_data->a);
        free(out_data->a);
        free(h_data);
        free(out_data);
        return 1;
    }

    for (int i = 0; i < count; i++) {
        h_data->a[i] = i;
    }

    // Temporary host-side struct: temp itself lives on the host, but temp.a
    // receives a device address and must never be dereferenced on the host.
    MyStruct temp;
    temp.b = h_data->b;
    checkCuda(cudaMalloc(&temp.a, arrayBytes));

    // Copy the host input array into the device buffer addressed by temp.a.
    checkCuda(cudaMemcpy(temp.a, h_data->a, arrayBytes, cudaMemcpyHostToDevice));

    // Allocate the device-side struct and copy temp into it, device pointer
    // included.
    MyStruct *d_data = NULL;
    checkCuda(cudaMalloc(&d_data, sizeof(MyStruct)));
    checkCuda(cudaMemcpy(d_data, &temp, sizeof(MyStruct), cudaMemcpyHostToDevice));

    structOperation<<<1, count>>>(d_data);
    // A kernel launch returns no status itself: fetch launch-configuration
    // errors explicitly, then synchronize so that execution errors are
    // reported here rather than by a later, unrelated call.
    checkCuda(cudaGetLastError());
    checkCuda(cudaDeviceSynchronize());

    // The pointer temp.a is unchanged; the device memory it addresses now
    // holds whatever the kernel wrote through d_data->a.
    checkCuda(cudaMemcpy(out_data->a, temp.a, arrayBytes, cudaMemcpyDeviceToHost));

    printf("\nDataElements : ");
    for (int i = 0; i < count; i++) {
        printf(" %d", out_data->a[i]);
    }
    printf("\n");

    checkCuda(cudaFree(temp.a));
    checkCuda(cudaFree(d_data));

    free(h_data->a);
    free(out_data->a);
    free(h_data);
    free(out_data);
    return 0;
}