问题
I'm implementing a function to find the sum of an array by using reduction, my array have 32*32 elements and its values is 0 ... 1023. The my expected sum value is 523776, but my reult is 15872, it wrong. Here is my code:
#include <stdio.h>
#include <cuda.h>
#define w 32
#define h 32
#define N w*h
__global__ void reduce(int *g_idata, int *g_odata);
void fill_array (int *a, int n);
int main( void ) {
int a[N], b[N]; // copies of a, b, c
int *dev_a, *dev_b; // device copies of a, b, c
int size = N * sizeof( int ); // we need space for 512 integers
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
fill_array( a, N );
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(w+blocksize.x-1)/blocksize.x;
gridsize.y=(h+blocksize.y-1)/blocksize.y;
reduce<<<gridsize, blocksize>>>(dev_a, dev_b);
// copy device result back to host copy of c
cudaMemcpy( b, dev_b, sizeof( int ) , cudaMemcpyDeviceToHost );
printf("Reduced sum of Array elements = %d \n", b[0]);
cudaFree( dev_a );
cudaFree( dev_b );
return 0;
}
__global__ void reduce(int *g_idata, int *g_odata) {
__shared__ int sdata[256];
// each thread loads one element from global to shared mem
int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[threadIdx.x] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s=1; s < blockDim.x; s *=2)
{
int index = 2 * s * threadIdx.x;;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (threadIdx.x == 0)
atomicAdd(g_odata,sdata[0]);
}
// CPU function to generate a vector of random integers
void fill_array (int *a, int n)
{
for (int i = 0; i < n; i++)
a[i] = i;
}
回答1:
There are at least 2 problems in your code
You are doing
atomicAddto the first element in yourdev_barray, but you are not initializing that element to a known value (i.e. 0). Sure, before you run the kernel, you are copyingbtodev_b, but since you haven't initializedbto any known values, that won't help. The arraybis not automatically initialized to zero in C or C++, if that is what you were thinking. We can fix this by settingb[0]to zero, before copyingbtodev_b.Your reduction kernel is written to handle a 1D case (i.e. the only thread index used is a 1D thread index based on the
.xvalues), but you are launching a kernel with 2D threadblocks and grids. This mismatch won't work properly and we either need to launch a 1D threadblock and grid, or else re-write the kernel to work with 2D indices (i.e..xand.y). I've chosen the former (1D).
Here is a worked example with those changes to your code, it seems to produce the correct result:
$ cat t1218.cu
#include <stdio.h>
#define w 32
#define h 32
#define N w*h
__global__ void reduce(int *g_idata, int *g_odata);
void fill_array (int *a, int n);
int main( void ) {
int a[N], b[N]; // copies of a, b, c
int *dev_a, *dev_b; // device copies of a, b, c
int size = N * sizeof( int ); // we need space for 512 integers
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
fill_array( a, N );
b[0] = 0; //initialize the first value of b to zero
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );
dim3 blocksize(256); // create 1D threadblock
dim3 gridsize(N/blocksize.x); //create 1D grid
reduce<<<gridsize, blocksize>>>(dev_a, dev_b);
// copy device result back to host copy of c
cudaMemcpy( b, dev_b, sizeof( int ) , cudaMemcpyDeviceToHost );
printf("Reduced sum of Array elements = %d \n", b[0]);
printf("Value should be: %d \n", ((N-1)*(N/2)));
cudaFree( dev_a );
cudaFree( dev_b );
return 0;
}
__global__ void reduce(int *g_idata, int *g_odata) {
__shared__ int sdata[256];
// each thread loads one element from global to shared mem
// note use of 1D thread indices (only) in this kernel
int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[threadIdx.x] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s=1; s < blockDim.x; s *=2)
{
int index = 2 * s * threadIdx.x;;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (threadIdx.x == 0)
atomicAdd(g_odata,sdata[0]);
}
// CPU function to generate a vector of random integers
void fill_array (int *a, int n)
{
for (int i = 0; i < n; i++)
a[i] = i;
}
$ nvcc -o t1218 t1218.cu
$ cuda-memcheck ./t1218
========= CUDA-MEMCHECK
Reduced sum of Array elements = 523776
Value should be: 523776
========= ERROR SUMMARY: 0 errors
$
Notes:
The kernel and your code as written depend on
Nbeing an exact multiple of the threadblock size (256). That is satisfied for this case, but things will break if it is not.I don't see any evidence of proper cuda error checking. It wouldn't have turned up anything here, but its good practice. As a quick test, run your code with
cuda-memcheckas I have done here.
来源:https://stackoverflow.com/questions/38654754/how-to-find-the-sum-of-array-in-cuda-by-reduction