CUDA performance of atomic operation on different address in warp


Question


To my knowledge, if atomic operations are performed on the same memory address within a warp, the warp's performance could be 32 times slower.

But what if the atomic operations of the threads in a warp target 32 different memory locations? Is there any performance penalty at all, or will it be as fast as a normal operation?

My use case is that I have 32 different positions, and each thread in a warp needs one of these positions, but which one is data dependent. So each thread could use atomicCAS to check whether the desired location is empty. If it is not empty, the thread scans the next position.

If I am lucky, the 32 threads will atomicCAS to 32 different memory locations; is there any performance penalty in this case?
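
A minimal sketch of what I have in mind is below (the 32-slot table, the EMPTY sentinel, and the desired/claimed arrays are just illustrative assumptions, not part of my actual code):

// Illustrative sketch only: the slot table, the EMPTY sentinel and the
// desired/claimed arrays are assumptions made for this example.
#define EMPTY (-1)

__global__ void claim_slot(int *slots, const int *desired, int *claimed, int N) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;

    int pos = desired[i];                       // data-dependent starting position
    for (int probe = 0; probe < 32; probe++) {
        // atomicCAS succeeds only if the slot still holds EMPTY; the return
        // value is the old content, so seeing EMPTY means this thread claimed it.
        if (atomicCAS(&slots[pos], EMPTY, i) == EMPTY) {
            claimed[i] = pos;
            return;
        }
        pos = (pos + 1) % 32;                   // slot occupied: scan the next one
    }
    claimed[i] = -1;                            // no free slot found
}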

Assume the Kepler architecture is used.


Answer 1:


In the code below, I'm adding a constant value to the elements of an array (dev_input). I'm comparing two kernels, one using atomicAdd and one using regular addition. This is an extreme example in which the atomicAdds operate on completely different addresses, so there is no need for serialization of the operations.

#include <stdio.h>

#define BLOCK_SIZE 1024

int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)  
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void regular_addition(float *dev_input, float val, int N) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;  

    if (i < N) dev_input[i] = dev_input[i] + val;
}

__global__ void atomic_operations(float *dev_input, float val, int N) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;  

    if (i < N) atomicAdd(&dev_input[i],val);
}

int main(){

    int N = 8192*32;

    float* output = (float*)malloc(N*sizeof(float));
    float* dev_input; gpuErrchk(cudaMalloc((void**)&dev_input, N*sizeof(float)));

    gpuErrchk(cudaMemset(dev_input, 0, N*sizeof(float)));

    int NumBlocks = iDivUp(N,BLOCK_SIZE);

    float time, timing1 = 0.f, timing2 = 0.f;
    cudaEvent_t start, stop;

    int niter = 32;

    for (int i=0; i<niter; i++) {

        gpuErrchk(cudaEventCreate(&start));
        gpuErrchk(cudaEventCreate(&stop));
        gpuErrchk(cudaEventRecord(start,0));

        atomic_operations<<<NumBlocks,BLOCK_SIZE>>>(dev_input,3,N);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        gpuErrchk(cudaEventRecord(stop,0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));

        timing1 = timing1 + time;

        gpuErrchk(cudaEventDestroy(start));
        gpuErrchk(cudaEventDestroy(stop));
    }

    printf("Time for atomic operations:  %3.5f ms \n", timing1/(float)niter);

    for (int i=0; i<niter; i++) {

        gpuErrchk(cudaEventCreate(&start));
        gpuErrchk(cudaEventCreate(&stop));
        gpuErrchk(cudaEventRecord(start,0));

        regular_addition<<<NumBlocks,BLOCK_SIZE>>>(dev_input,3,N);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());

        gpuErrchk(cudaEventRecord(stop,0));
        gpuErrchk(cudaEventSynchronize(stop));
        gpuErrchk(cudaEventElapsedTime(&time, start, stop));

        timing2 = timing2 + time;

        gpuErrchk(cudaEventDestroy(start));
        gpuErrchk(cudaEventDestroy(stop));
    }

    printf("Time for regular addition:  %3.5f ms \n", timing2/(float)niter);

}

Testing this code on my NVIDIA GeForce GT540M with CUDA 5.5 on Windows 7, I obtain approximately the same timing for the two kernels, i.e., about 0.7 ms.

Now change the instruction

if (i < N) atomicAdd(&dev_input[i],val);

to

if (i < N) atomicAdd(&dev_input[i%32],val);

which is closer to the case you are interested in, namely, each atomicAdd within a warp operating on a different address. The result I obtain is that no performance penalty is observed.

Finally, change the above instruction to

if (i < N) atomicAdd(&dev_input[0],val);

This is the other extreme, in which atomicAdd always operates on the same address. In this case, the execution time rises to 5.1 ms.

The above tests have been performed on a Fermi architecture. You can try to run the above code on your Kepler card.
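
As a side note (my assumption, not part of the original answer): to run it on a Kepler card you would compile for that card's compute capability, for example nvcc -arch=sm_30 atomic_test.cu for a GK104-based GPU or -arch=sm_35 for GK110, where atomic_test.cu is just a placeholder file name.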



Source: https://stackoverflow.com/questions/22342685/cuda-performance-of-atomic-operation-on-different-address-in-warp
