cuda shared memory - inconsistent results

问题

I'm trying to do a parallel reduction to sum an array in CUDA. Currently i pass an array in which to store the sum of the elements in each block. This is my code:

#include <cstdlib>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <helper_cuda.h>
#include <host_config.h>
#define THREADS_PER_BLOCK 256
#define CUDA_ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

using namespace std;

inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

struct double3c {
    double x; 
    double y;
    double z;
    __host__ __device__ double3c() : x(0), y(0), z(0) {}
    __host__ __device__ double3c(int x_, int y_, int z_) : x(x_), y(y_), z(z_) {}
    __host__ __device__ double3c& operator+=(const double3c& rhs) { x += rhs.x; y += rhs.y; z += rhs.z;}
    __host__ __device__ double3c& operator/=(const double& rhs) { x /= rhs; y /= rhs; z /= rhs;}

};

class VectorField {
public:
    double3c *data;
    int size_x, size_y, size_z;
    bool is_copy;  

    __host__ VectorField () {}

    __host__ VectorField (int x, int y, int z) {
        size_x = x; size_y = y; size_z = z;
        is_copy = false;
        CUDA_ERROR_CHECK (cudaMalloc(&data, x * y * z * sizeof(double3c))); 
    }

    __host__ VectorField (const VectorField& other) {
        size_x = other.size_x; size_y = other.size_y; size_z = other.size_z;
        this->data = other.data;
        is_copy = true;
    }

    __host__ ~VectorField() {     
        if (!is_copy) CUDA_ERROR_CHECK (cudaFree(data));
    }
};

__global__ void KernelCalculateMeanFieldBlock (VectorField m, double3c* result) {
    __shared__ double3c blockmean[THREADS_PER_BLOCK];    
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);
    else blockmean[threadIdx.x] = double3c(0,0,0);
    __syncthreads();
    for(int s = THREADS_PER_BLOCK / 2; s > 0; s /= 2) {
        if (threadIdx.x < s) blockmean[threadIdx.x] += blockmean[threadIdx.x + s];
        __syncthreads();
    }


    if(threadIdx.x == 0) result[blockIdx.x] = blockmean[0];   
}

double3c CalculateMeanField (VectorField& m) { 
    int blocknum = (m.size_x * m.size_y * m.size_z - 1) / THREADS_PER_BLOCK + 1;
    double3c *mean = new double3c[blocknum]();
    double3c *cu_mean;
    CUDA_ERROR_CHECK (cudaMalloc(&cu_mean, sizeof(double3c) * blocknum));
    CUDA_ERROR_CHECK (cudaMemset (cu_mean, 0, sizeof(double3c) * blocknum));

        KernelCalculateMeanFieldBlock <<<blocknum, THREADS_PER_BLOCK>>> (m, cu_mean);
        CUDA_ERROR_CHECK (cudaPeekAtLastError());
        CUDA_ERROR_CHECK (cudaDeviceSynchronize());
        CUDA_ERROR_CHECK (cudaMemcpy(mean, cu_mean, sizeof(double3c) * blocknum, cudaMemcpyDeviceToHost));

    CUDA_ERROR_CHECK (cudaFree(cu_mean));
    for (int i = 1; i < blocknum; i++) {mean[0] += mean[i];}
    mean[0] /= m.size_x * m.size_y * m.size_z;
    double3c aux = mean[0];
    delete[] mean;
    return aux;
}



int main() {
    VectorField m(100,100,100);
    double3c sum = CalculateMeanField (m);
    cout <<  sum.x << '\t' << sum.y << '\t' <<sum.z;  


    return 0;
}

EDIT

Posted a functional code. Constructing a VectorField with 10x10x10 elements works fine and gives mean 1, but constructing it with 100x100x100 elements gives mean ~0.97 (it varies from run to run). Is this a right way to do a parallel reduction, or should I stick to one kernel launch per block?

回答1:

When I compile the code you have now on linux, I get the following warning:

t614.cu(55): warning: __shared__ memory variable with non-empty constructor or destructor (potential race between threads)

This type of warning should not be ignored. It is associated with this line of code:

__shared__ double3c blockmean[THREADS_PER_BLOCK];

Since the initialization of these objects stored in shared memory (by the constructor) will happen in some arbitrary order, and you have no barrier between that and the subsequent code that will also set these values, unpredictable things (*) can happen.

If I insert a __syncthreads() in the code to isolate the constructor activity from the subsequent code, I get expected results:

__shared__ double3c blockmean[THREADS_PER_BLOCK];    
int index = threadIdx.x + blockIdx.x * blockDim.x;
__syncthreads();  // add this line
if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);
else blockmean[threadIdx.x] = double3c(0,0,0);
__syncthreads();

This still leaves us with the warning, however. A modification to fix this and make the warning go away would be to allocate the necessary __shared__ size dynamically. Change your shared memory declaration to this:

extern __shared__ double3c blockmean[];

and modify your kernel call:

KernelCalculateMeanFieldBlock <<<blocknum, THREADS_PER_BLOCK, THREADS_PER_BLOCK*sizeof(double3c)>>> (m, cu_mean);

This will eliminate the warning, produce the correct result, and avoid the unnecessary constructor traffic on the shared memory variable. (And the additional __syncthreads() described above is no longer necessary.)

*regarding "unpredictable things", if you look under the hood by inspecting either the generated SASS (cuobjdump -sass ...) or the PTX (**) (nvcc -ptx ...), you will see that each thread initializes the entire __shared__ array of objects to zero (the behavior of the default constructor). As a result of this, some of the threads (i.e. warps) can race ahead and begin populating the shared memory area according to this line:

if (index < m.size_x * m.size_y * m.size_z) blockmean[threadIdx.x] = m.data[index] = double3c(0, 1, 0);

Then, when other warps begin executing, those threads will clear out the entire shared memory array again. This racing behavior leads to unpredictable results.

** I don't normally suggest judging code behavior by inspecting the PTX, but in this case it is equally instructive. The final compile stages will not optimize away the constructor behavior.

来源：https://stackoverflow.com/questions/27230621/cuda-shared-memory-inconsistent-results

标签

c++

memory

cuda

shared