问题
I'm implementing a function to find the sum of an array using reduction. My array has 32*32 elements, with values 0 ... 1023. My expected sum is 523776, but my result is 15872, which is wrong. Here is my code:
#include <stdio.h>
#include <cuda.h>
#define w 32
#define h 32
#define N w*h
__global__ void reduce(int *g_idata, int *g_odata);
void fill_array (int *a, int n);
// Host driver as posted in the question.
// NOTE(review): two defects here, which the answer below explains:
//   1. b[] is copied to dev_b without ever being initialized, so the
//      kernel's atomicAdd accumulates onto an indeterminate value.
//   2. The launch uses 2D 16x16 blocks over a 2D grid, but reduce()
//      indexes with .x only, so the launch shape and kernel disagree.
int main( void ) {
int a[N], b[N]; // copies of a, b, c
int *dev_a, *dev_b; // device copies of a, b, c
int size = N * sizeof( int ); // we need space for 512 integers
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
fill_array( a, N );
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
// BUG: b[] still holds indeterminate values here, so dev_b[0] is garbage
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );
// BUG: 2D launch configuration, but the kernel only uses .x indices
dim3 blocksize(16,16);
dim3 gridsize;
gridsize.x=(w+blocksize.x-1)/blocksize.x;
gridsize.y=(h+blocksize.y-1)/blocksize.y;
reduce<<<gridsize, blocksize>>>(dev_a, dev_b);
// copy device result back to host copy of c
cudaMemcpy( b, dev_b, sizeof( int ) , cudaMemcpyDeviceToHost );
printf("Reduced sum of Array elements = %d \n", b[0]);
cudaFree( dev_a );
cudaFree( dev_b );
return 0;
}
// Block sum reduction as posted: one element per thread into shared memory,
// an interleaved-addressing tree reduction, then thread 0 atomically adds
// the block's partial sum into *g_odata.
// NOTE(review): only .x thread/block indices are used, so under the 2D
// 16x16 launch from main() every threadIdx.y "row" of a block writes the
// same sdata[threadIdx.x] slot (a data race) and most inputs are never read.
__global__ void reduce(int *g_idata, int *g_odata) {
__shared__ int sdata[256]; // fixed size: assumes blockDim.x <= 256
// each thread loads one element from global to shared mem
int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[threadIdx.x] = g_idata[i];
__syncthreads();
// do reduction in shared mem (interleaved addressing: stride doubles each pass)
for (int s=1; s < blockDim.x; s *=2)
{
int index = 2 * s * threadIdx.x;; // (stray second ';' is harmless)
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads(); // barrier outside the divergent if: all threads reach it
}
// write result for this block to global mem
if (threadIdx.x == 0)
atomicAdd(g_odata,sdata[0]);
}
// CPU function to fill an array with the sequential integers 0 .. n-1
// Store the sequence 0, 1, ..., n-1 into a[0..n-1]; a no-op when n <= 0.
void fill_array (int *a, int n)
{
    for (int *p = a; p < a + n; ++p)
        *p = (int)(p - a);
}
回答1:
There are at least 2 problems in your code
1. You are doing an `atomicAdd` to the first element of your `dev_b` array, but you are not initializing that element to a known value (i.e. 0). Sure, before you run the kernel, you are copying `b` to `dev_b`, but since you haven't initialized `b` to any known values, that won't help. The array `b` is not automatically initialized to zero in C or C++, if that is what you were thinking. We can fix this by setting `b[0]` to zero before copying `b` to `dev_b`.
2. Your reduction kernel is written to handle a 1D case (i.e. the only thread indices used are 1D indices based on the `.x` values), but you are launching the kernel with 2D threadblocks and grids. This mismatch won't work properly, and we either need to launch a 1D threadblock and grid, or else re-write the kernel to work with 2D indices (i.e. `.x` and `.y`). I've chosen the former (1D).
Here is a worked example with those changes to your code, it seems to produce the correct result:
$ cat t1218.cu
#include <stdio.h>
#define w 32
#define h 32
#define N w*h
__global__ void reduce(int *g_idata, int *g_odata);
void fill_array (int *a, int n);
// Host driver: fills a[] with 0..N-1, reduces it on the GPU, and prints
// the sum alongside the closed-form expected value (N-1)*(N/2).
// Precondition: N is an exact multiple of the 256-thread block size.
int main( void ) {
    int a[N], b[N];               // host input and host result
    int *dev_a, *dev_b;           // device input array and device accumulator
    int size = N * sizeof( int );

    // allocate device copies
    cudaMalloc( (void**)&dev_a, size );
    cudaMalloc( (void**)&dev_b, size );
    fill_array( a, N );

    // Copy the input to the device. Zero only the accumulator word that the
    // kernel's atomicAdd targets: copying the whole (mostly uninitialized)
    // host b[] array, as before, reads indeterminate host memory.
    cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
    cudaMemset( dev_b, 0, sizeof( int ) );

    dim3 blocksize(256);            // 1D threadblock to match the 1D kernel
    dim3 gridsize(N/blocksize.x);   // 1D grid; exact because N % 256 == 0

    reduce<<<gridsize, blocksize>>>(dev_a, dev_b);
    cudaError_t err = cudaGetLastError();   // catch launch-config errors
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Blocking copy of the single result word back to the host; this also
    // synchronizes with the kernel, so b[0] is valid afterwards.
    cudaMemcpy( b, dev_b, sizeof( int ), cudaMemcpyDeviceToHost );
    printf("Reduced sum of Array elements = %d \n", b[0]);
    printf("Value should be: %d \n", ((N-1)*(N/2)));

    cudaFree( dev_a );
    cudaFree( dev_b );
    return 0;
}
// Block-level sum reduction: each block loads 256 ints into shared memory,
// reduces them with a sequential-addressing tree, and thread 0 atomically
// adds the block's partial sum into *g_odata.
//
// Preconditions: launched 1D with blockDim.x == 256 (a power of two);
// gridDim.x * blockDim.x elements readable in g_idata; *g_odata must be
// zero-initialized by the caller before the first launch.
__global__ void reduce(int *g_idata, int *g_odata) {
    __shared__ int sdata[256];

    // each thread loads one element from global to shared mem (coalesced)
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = g_idata[i];
    __syncthreads();

    // Sequential addressing: the active threads form a contiguous prefix,
    // which avoids the shared-memory bank conflicts and the per-iteration
    // index arithmetic of the interleaved (2*s*threadIdx.x) scheme, while
    // computing the same integer sum.
    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (threadIdx.x < s)
            sdata[threadIdx.x] += sdata[threadIdx.x + s];
        __syncthreads();   // barrier outside the divergent branch
    }

    // publish this block's partial sum
    if (threadIdx.x == 0)
        atomicAdd(g_odata, sdata[0]);
}
// CPU function to fill an array with the sequential integers 0 .. n-1
// Host-side initializer: write the values 0, 1, ..., n-1 into a[0..n-1].
void fill_array (int *a, int n)
{
    int idx = 0;
    while (idx < n) {
        a[idx] = idx;
        ++idx;
    }
}
$ nvcc -o t1218 t1218.cu
$ cuda-memcheck ./t1218
========= CUDA-MEMCHECK
Reduced sum of Array elements = 523776
Value should be: 523776
========= ERROR SUMMARY: 0 errors
$
Notes:
1. The kernel and your code as written depend on `N` being an exact multiple of the threadblock size (256). That is satisfied for this case, but things will break if it is not.
2. I don't see any evidence of proper CUDA error checking. It wouldn't have turned up anything here, but it's good practice. As a quick test, run your code with `cuda-memcheck` as I have done here.
来源:https://stackoverflow.com/questions/38654754/how-to-find-the-sum-of-array-in-cuda-by-reduction