Why doesn't CUDA result in a speedup in C++ code?

Submitted by 笑着哭i on 2021-01-28 09:29:23

Question


I'm using VS2019 and have an NVIDIA GeForce GPU. I tried the code from this link: https://towardsdatascience.com/writing-lightning-fast-code-with-cuda-c18677dcdd5f

The author of that post claims to get a speedup when using CUDA. However, for me, the serial version takes around 7 milliseconds while the CUDA version takes around 28 milliseconds. Why is CUDA slower for this code? The code I used is below:

#include <iostream>
#include <chrono>
#include <cmath>
#include <cuda_runtime.h>

__global__
void add(int n, float* x, float* y)
{

    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
}

void addSerial(int n, float* x, float* y)
{
    for (int i = 0; i < n; i++)
        y[i] = x[i] + y[i];
}

int main()
{
    int NSerial = 1 << 20;   
    float* xSerial = new float[NSerial];
    float* ySerial = new float[NSerial];
    for (int i = 0; i < NSerial; i++) {
        xSerial[i] = 1.0f;
        ySerial[i] = 2.0f;
    }
    auto t1Serial = std::chrono::high_resolution_clock::now();
    addSerial(NSerial, xSerial, ySerial);
    auto t2Serial = std::chrono::high_resolution_clock::now(); 
    auto durationSerial = std::chrono::duration_cast<std::chrono::milliseconds>(t2Serial - t1Serial).count(); 
    float maxErrorSerial = 0.0f;
    for (int i = 0; i < NSerial; i++)
        maxErrorSerial = fmax(maxErrorSerial, fabs(ySerial[i] - 3.0f));
    std::cout << "Max error Serial: " << maxErrorSerial << std::endl;
    std::cout << "durationSerial: "<<durationSerial << std::endl;
    delete[] xSerial;
    delete[] ySerial;


    int N = 1 << 20;   

    float* x, * y;
    cudaMallocManaged(&x, N * sizeof(float));
    cudaMallocManaged(&y, N * sizeof(float));

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }


    int device = -1;
    cudaGetDevice(&device);
    cudaMemPrefetchAsync(x, N * sizeof(float), device, NULL);
    cudaMemPrefetchAsync(y, N * sizeof(float), device, NULL);


    int blockSize = 1024;
    int numBlocks = (N + blockSize - 1) / blockSize;
    auto t1 = std::chrono::high_resolution_clock::now();
    add<<<numBlocks, blockSize>>>(N, x, y);

    cudaDeviceSynchronize();
    auto t2 = std::chrono::high_resolution_clock::now(); 
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count(); 

    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;
    std::cout << "duration CUDA: "<<duration; 

    cudaFree(x);
    cudaFree(y);



    return 0;
}

Answer 1:


There are several observations to make here:

  1. The first call of a CUDA kernel can accumulate a lot of one-time latency associated with setup on the GPU, so the normal approach is to include a "warm-up" call before any timed launch.
  2. The kernel in your question uses a grid-stride ("resident") design, so optimal execution should occur when you launch only as many blocks as are required to fully occupy your GPU. There is an API you can use to obtain this launch configuration for your GPU (a minimal sketch of points 1 and 2 follows this list).
  3. Perform timing in microseconds, not milliseconds.
  4. Build your code in release mode.
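
A minimal sketch of points 1 and 2, assuming the same add kernel and managed buffers x and y from the question (the explicit warm-up launch and the occupancy query shown here are illustrative, not part of the asker's code):

    // Sketch only: assumes the add kernel and the managed x/y buffers above.
    int blockSize = 0, minGridSize = 0;
    // Ask the runtime for a block size (and minimum grid size) that maximizes occupancy for add().
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, add);

    // Untimed warm-up launch to absorb one-time GPU setup cost (note that it also updates y).
    add<<<minGridSize, blockSize>>>(N, x, y);
    cudaDeviceSynchronize();

    // ...only launches after this point are timed.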

Doing all of this to your CUDA code gets me this:

    int N = 1 << 20;   
    int device = -1;
    cudaGetDevice(&device);

    float* x, * y;
    cudaMallocManaged(&x, N * sizeof(float));
    cudaMallocManaged(&y, N * sizeof(float));

    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    cudaMemPrefetchAsync(x, N * sizeof(float), device, NULL);
    cudaMemPrefetchAsync(y, N * sizeof(float), device, NULL);

    int blockSize, numBlocks;
    // Query a block size (and minimum grid size) that maximizes occupancy for this kernel
    cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, add);

    for (int rep = 0; rep < 10; rep++) {   // the first iteration doubles as the warm-up call
        auto t1 = std::chrono::high_resolution_clock::now();
        add<<<numBlocks, blockSize>>>(N, x, y);
        cudaDeviceSynchronize();
        auto t2 = std::chrono::high_resolution_clock::now(); 
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count(); 
        std::cout << rep << " duration CUDA: " << duration <<std::endl; 
    }

    // After the 10 launches above, each y[i] should be 2 + 10 * 1 = 12
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 12.0f));
    std::cout << "Max error: " << maxError << std::endl;

    cudaFree(x);
    cudaFree(y);

And building it and running it:

$ nvcc -arch=sm_52 -std=c++11 -o not_so_fast not_so_fast.cu 
$ ./not_so_fast 
Max error Serial: 0
durationSerial: 2762
0 duration CUDA: 1074
1 duration CUDA: 150
2 duration CUDA: 151
3 duration CUDA: 158
4 duration CUDA: 152
5 duration CUDA: 152
6 duration CUDA: 147
7 duration CUDA: 124
8 duration CUDA: 112
9 duration CUDA: 113
Max error: 0

On my system, the first GPU run is close to three times as fast as the serial loop, and the second and subsequent runs are almost 10 times faster again. Your results can (and probably will) vary.
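
As a side note that is not part of the answer above, kernel time can also be measured with CUDA events, which bracket the work on the GPU itself rather than relying on host clocks. A minimal sketch, reusing the launch configuration from the code above:

    // Sketch only: times a single kernel launch with CUDA events.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    add<<<numBlocks, blockSize>>>(N, x, y);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);   // elapsed GPU time in milliseconds
    std::cout << "kernel time (ms): " << ms << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);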



Source: https://stackoverflow.com/questions/60085669/why-cuda-doesnt-result-in-speedup-in-c-code
