问题
I found the following program at http://llpanorama.wordpress.com/2008/05/21/my-first-cuda-program/
Unfortunately, I can't copy and paste it here because the formatting of the code gets mangled.
It takes a vector of numbers as input and outputs the same vector with every element squared. When I run it on the emulator installed on my computer, it gives the following output:
0 0.000000
1 1.000000
2 4.000000
3 9.000000
4 16.000000
5 25.000000
6 36.000000
7 49.000000
8 64.000000
9 81.000000
However, if I run it on a remote computer that runs Debian and has a CUDA-compatible GPU, by entering
nvcc test.cu -lcudart -o test
./test
it gives me the following output
0 0.000000
1 1.000000
2 2.000000
3 3.000000
4 4.000000
5 5.000000
6 6.000000
7 7.000000
8 8.000000
9 9.000000
Why does this happen? Thank you in advance!
回答1:
The problem is that the code has no error checking, and something is wrong on the remote computer. Add error checking to the code (it's not hard to do), re-run it, and see what it reports. If you still have trouble, report back.
Here is the code suitably modified with error checking:
// example1.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
// cudaCheckErrors(msg): fetches the CUDA runtime's last recorded error with
// cudaGetLastError(). When that status is not cudaSuccess, it prints `msg`,
// the runtime's description of the error, and the file/line of the macro
// invocation to stderr, then terminates the process with exit status 1.
// When the status is cudaSuccess the macro does nothing.
// Wrapped in do { ... } while (0) so it can be used like a single statement.
#define cudaCheckErrors(msg)                                               \
    do {                                                                   \
        const cudaError_t cuda_check_status_ = cudaGetLastError();         \
        if (cuda_check_status_ == cudaSuccess)                             \
            break;                                                         \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",                 \
                msg, cudaGetErrorString(cuda_check_status_),               \
                __FILE__, __LINE__);                                       \
        fprintf(stderr, "*** FAILED - ABORTING\n");                        \
        exit(1);                                                           \
    } while (0)
// Kernel that executes on the CUDA device.
// Squares, in place, the elements of `a` whose index is below N.
// Each thread computes one flat index from the x components of its block
// index, block size and thread index, and touches at most that one element;
// threads whose index is N or greater return without accessing memory.
__global__ void square_array(float *a, int N)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N) return;

    const float v = a[i];
    a[i] = v * v;
}
// main routine that executes on the host.
// Fills a 10-element host array with the values 0..9, copies it to the
// device, squares every element with square_array, copies the result back and
// prints one "index value" line per element.
// Returns EXIT_SUCCESS on success. A failed host allocation returns
// EXIT_FAILURE; any CUDA error is reported by cudaCheckErrors, which
// terminates the process with exit status 1.
int main(void)
{
    float *a_h = NULL; // Host array
    float *a_d = NULL; // Device array
    const int N = 10;  // Number of elements in arrays
    const size_t size = N * sizeof(float);

    // Allocate array on host; bail out before it is ever dereferenced
    // if the allocation failed.
    a_h = (float *)malloc(size);
    if (a_h == NULL) {
        fprintf(stderr, "Fatal error: host malloc of %lu bytes failed\n",
                (unsigned long)size);
        return EXIT_FAILURE;
    }

    // Allocate array on device
    cudaMalloc((void **) &a_d, size);
    cudaCheckErrors("cudaMalloc fail");

    // Initialize host array and copy it to CUDA device
    for (int i = 0; i < N; i++) a_h[i] = (float)i;
    cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy 1 fail");

    // Do calculation on device. The block count is rounded up so that the
    // last, partially filled block still covers the tail of the array.
    const int block_size = 4;
    const int n_blocks = (N + block_size - 1) / block_size;
    square_array <<< n_blocks, block_size >>> (a_d, N);
    // A kernel launch returns no status itself; check right away so that a
    // rejected launch is reported separately from a failure during execution.
    cudaCheckErrors("kernel launch fail");
    // Wait for the kernel to finish so that execution errors surface here.
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");

    // Retrieve result from device and store it in host array
    cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy 2 fail");

    // Print results
    for (int i = 0; i < N; i++) printf("%d %f\n", i, a_h[i]);

    // Cleanup
    free(a_h);
    cudaFree(a_d);
    cudaCheckErrors("cudaFree fail");

    return EXIT_SUCCESS;
}
来源:https://stackoverflow.com/questions/15177019/cuda-program-does-not-give-the-correct-output-when-using-a-cuda-compatible-gpu