Question
I am new to CUDA/GPU and I am having problems copying data from my device back to the host. I am developing for Jetson TK1 with CUDA Toolkit 6.5. It builds successfully, but gives an error during runtime. My code is below:
//main.cu
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size);
int main () {
int data_length = 1024000;
const int length=512;
const size_t size= length;
double signalA[length], signalB[length], signalC[length];
for (int i=0; i<data_length; i++)
{
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
if(i==0)
{
for(int k=0; k<length; k++)
{
signalA[k]=v_ia[k];
signalB[k]=v_ib[k];
signalC[k]=v_ic[k];
}
i=length-1;
}
else
{
//allocate memory in GPU and kernel call for phase A
allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalA[length-1]=v_ia[i];
//allocate memory in GPU and kernel call for phase B
allocate(d_inputCurrentIb, signalB, d_outputCurrentIb, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalB, d_outputCurrentIb, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalB[length-1]=v_ib[i];
//allocate memory in GPU and kernel call for phase C;
allocate(d_inputCurrentIc, signalC, d_outputCurrentIc, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalC, d_outputCurrentIc, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalC[length-1]=v_ic[i];
//memory cleaning
checkCudaErrors(cudaFree(d_inputCurrentIa));
checkCudaErrors(cudaFree(d_inputCurrentIb));
checkCudaErrors(cudaFree(d_inputCurrentIc));
checkCudaErrors(cudaFree(d_outputCurrentIa));
checkCudaErrors(cudaFree(d_outputCurrentIb));
checkCudaErrors(cudaFree(d_outputCurrentIc));
}
And my kernel and function are simple, they are just moving the array elements to the left each time:
// Shifts the input array one element to the left: d_out[i] = d_in[i+1].
// Expects a single block with at least `size` threads (the host wrapper
// launches <<<1, 512>>>); `size` must not exceed the 512-slot shared buffer.
__global__ void allocate_kernel(double* const d_in, double* const d_out, const size_t size) {
// Stage the input in shared memory so each thread can read its right-hand
// neighbour's value after the barrier.
__shared__ double shared[512];
int tid = threadIdx.x;
if(tid < size)
shared[tid] = d_in[tid];
// Barrier: all of shared[] must be written before any thread reads a
// neighbouring slot. Correctly placed outside the divergent `if` above.
__syncthreads();
if(tid < size-1)
d_out[tid]=shared[tid+1];
// d_out[size-1] is never written here: the host memsets the output to 0
// beforehand and overwrites the last element with the newest sample after
// copying back. (This trailing __syncthreads() is redundant but harmless.)
__syncthreads();
}
// Allocates device buffers, copies `signal` to the device, and launches the
// left-shift kernel on a single 512-thread block.
// NOTE(review): BUG — `d_inputCurrent` and `d_outputCurrent` are passed BY
// VALUE, so the device addresses written by cudaMalloc below are lost when
// this function returns; the caller's cudaMemcpy/cudaFree then operate on
// uninitialized pointers. This is exactly the defect the answer explains.
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size) {
const dim3 blockSize(512);
const dim3 gridSize(1);
// cudaFree(0) frees nothing; it only forces lazy CUDA context creation.
checkCudaErrors(cudaFree(0));
// These writes modify only the local copies of the pointer parameters.
checkCudaErrors(cudaMalloc((void **)&d_inputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMalloc((void **)&d_outputCurrent, sizeof(double) * size));
// Zero the output so the element the kernel never writes is deterministic.
checkCudaErrors(cudaMemset(d_outputCurrent, 0, sizeof(double) * size));
checkCudaErrors(cudaMemcpy(d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));
allocate_kernel<<<gridSize, blockSize>>>(d_inputCurrent, d_outputCurrent, size);
// Synchronize so any in-kernel fault surfaces here, then check for launch
// errors (kernel launches do not return an error code directly).
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
This is a small part of my doctoral thesis, I was practicing CUDA with this code, I know it is not so meaningful for now, but I couldn't move any further because I am so stuck with this problem. Any help would be appreciated, thanks in advance.
Answer 1:
In C, you cannot pass a pointer to a function by value, have that function modify the pointer, and then expect the modification of that pointer to show up in the calling environment:
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
...
//allocate memory in GPU and kernel call for phase A
// at this point, d_inputCurrentIa and d_outputCurrentIa are pointing to nothing
allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size);
// allocate modified those pointers internally, but the modified values don't show up here
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost));
// therefore you will get an error here, because d_outputCurrentIa still points to nothing
There are a variety of ways to make this work. One approach is to pass the address of the pointers you wish to modify and use:
void allocate(double** d_inputCurrent, double* signal, double **d_outputCurrent, const size_t size);
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
...
//allocate memory in GPU and kernel call for phase A
allocate(&d_inputCurrentIa, signalA, &d_outputCurrentIa, size);
...
// Corrected version: the device-pointer variables are taken BY ADDRESS
// (double**), so the cudaMalloc results propagate back to the caller.
// Launches the left-shift kernel with one 512-thread block; `size` must be
// <= 512 to fit the kernel's shared-memory buffer.
void allocate(double** d_inputCurrent, double* signal, double** d_outputCurrent, const size_t size) {
const dim3 blockSize(512);
const dim3 gridSize(1);
// cudaFree(0) frees nothing; it only forces lazy CUDA context creation.
checkCudaErrors(cudaFree(0));
// Write the freshly allocated device addresses through the out-pointers —
// note: no `&` here, the caller already passed the pointers' addresses.
checkCudaErrors(cudaMalloc((void **)d_inputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMalloc((void **)d_outputCurrent, sizeof(double) * size));
// Zero the output so the element the kernel never writes is deterministic.
checkCudaErrors(cudaMemset(*d_outputCurrent, 0, sizeof(double) * size));
checkCudaErrors(cudaMemcpy(*d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));
// Dereference to pass the actual device pointers to the kernel.
allocate_kernel<<<gridSize, blockSize>>>(*d_inputCurrent, *d_outputCurrent, size);
// Synchronize so any in-kernel fault surfaces here, then check for launch
// errors (kernel launches do not return an error code directly).
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
Notes:
Not sure why you would mark those pointers `const`. They are not in any way `const` (the function will modify the pointer value as well as the data it points to).
Coded in browser. You may have to fix some other things up. Since you haven't provided complete code to work with, I haven't provided complete code either. But this should be a roadmap.
Allocating in functions can be a memory leak waiting to happen. You might want to give this some thought. Be sure to have a plan to free up those pointers if you will be reusing them or creating a lot of them.
Source: https://stackoverflow.com/questions/34007319/invalid-argument-error-in-cudamemcpy-from-device-to-host