False dependency issue for the Fermi architecture


Question


I am trying to achieve "3-way overlapping" using 3 streams, as in the examples from the CUDA streams and concurrency webinar, but I couldn't achieve it.

I have a GeForce GT 550M (Fermi architecture with one copy engine) and I am using Windows 7 (64 bit).

Here is the code that I have written.

#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions 

#include <stdio.h>

using namespace std;

#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

__global__ void kernel(const int *in, int *out, int dataSize)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end =  dataSize;
    for (int i = start; i < end; i += blockDim.x * gridDim.x) 
    {
        out[i] = in[i] * in[i];
    }
}

int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;

    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );

    int *d_in = 0;
    int *d_out = 0;
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;

    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );

    // register host memory as page-locked (required for asynchronous cudaMemcpyAsync)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    cudaEventRecord(start, 0);

    // overlapped execution using version 2

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    }

    //cudaMemcpy(d_in, h_in, memSize, cudaMemcpyHostToDevice);

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        //kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
    }

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }



    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    cudaEventRecord(stop, 0);

    checkCudaErrors(cudaStreamSynchronize(0));

    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));


    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));

    cudaDeviceReset();  

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;


    // GPU output check
    int sum = 0;
    for(int i = 0; i < dataSize; i++)       
        sum += h_groundTruth[i] - h_out[i];

    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;

    return 0;
}

Using Nsight for profiling, I get this result:

It may seem correct, but why does the D2H transfer in stream #1 start only after the last kernel launch of stream #2 has finished, and not before? I also tried using 8 streams (just by changing NUM_STREAMS to 8) to achieve such a "3-way overlap"; here is the result:

The interesting thing is that, when I use 8 streams, the overlapping between computation and memory transfers seems to be much better.

What is the reason for this problem? Is it due to the WDDM driver, or is there something wrong with my program?


Answer 1:


From the comments above, it seems that the OP's problem is a false dependency issue, suffered by the Fermi architecture and solved by the Hyper-Q feature of the Kepler architecture. On Fermi, work issued to different streams is funneled into a single hardware work queue, so an operation can end up waiting behind unrelated operations issued earlier from other streams; Hyper-Q removes this limitation by providing multiple hardware work queues.

To summarize, the OP is highlighting the fact that the first D2H transfer (stream #1) does not start immediately after the last H2D (stream #3) finishes, while in principle it could. The time gap is highlighted by the red circle in the following figure (henceforth, and unless otherwise specified, all the tests refer to a GeForce GT540M belonging to the Fermi family):

The OP's approach is a breadth-first approach, which operates according to the following scheme:

for(int i = 0; i < NUM_STREAMS; i++)
    cudaMemcpyAsync(..., cudaMemcpyHostToDevice, streams[i]);

for(int i = 0; i < NUM_STREAMS; i++)
{
    kernel_launch_1<<<..., 0, streams[i]>>>(...);
    kernel_launch_2<<<..., 0, streams[i]>>>(...);
}

for(int i = 0; i < NUM_STREAMS; i++)
    cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, streams[i]);

Using a depth-first approach, operating according to the following scheme

for(int i = 0; i < NUM_STREAMS; i++)
{
    cudaMemcpyAsync(...., cudaMemcpyHostToDevice, streams[i]);

    kernel_launch_1<<<...., 0, streams[i]>>>(....);
    kernel_launch_2<<<...., 0, streams[i]>>>(....);

    cudaMemcpyAsync(...., cudaMemcpyDeviceToHost,   streams[i]);
}

does not seem to improve the situation, according to the following timeline (the depth-first code is reported at the bottom of the answer); if anything, it shows worse overlapping:

Under the breadth-first approach, and with the second kernel launch commented out, the first D2H copy starts as soon as it can, as reported by the following timeline:
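
For clarity, this is the only change made for that test: in the breadth-first launch loop of the OP's code, the second per-stream kernel launch is simply commented out, and everything else is left unchanged:

for(int i = 0; i < NUM_STREAMS; i++)
{
    int offset = i * streamSize;
    dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

    // only the first half of each per-stream chunk is processed in this test
    kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
    //kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
}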

Finally, running the code on a Kepler K20c, the problem does not show up, as illustrated by the following figure:
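As a side note (this query is not part of the OP's code, just a minimal sketch), the number of copy engines and the compute capability of the installed device can be checked with cudaGetDeviceProperties; Hyper-Q requires a compute capability 3.5 device such as the K20c:

#include <cstdio>
#include "cuda_runtime.h"

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // asyncEngineCount reports the number of copy engines
    // (typically 1 on GeForce Fermi parts such as the GT 550M / GT 540M)
    printf("Device: %s\n", prop.name);
    printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    printf("Copy engines (asyncEngineCount): %d\n", prop.asyncEngineCount);
    printf("Concurrent kernels supported: %d\n", prop.concurrentKernels);

    return 0;
}
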

Here is the code for the depth-first approach:

#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions

#include <stdio.h>

using namespace std;

#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

__global__ void kernel(const int *in, int *out, int dataSize)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end =  dataSize;
    for (int i = start; i < end; i += blockDim.x * gridDim.x)
    {
        out[i] = in[i] * in[i];
    }
}

int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;

    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );

    int *d_in = 0;
    int *d_out = 0;
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;

    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );

    // register host memory as page-locked (required for asynchronous cudaMemcpyAsync)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    cudaEventRecord(start, 0);

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;

        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);

        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);

        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }



    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    cudaEventRecord(stop, 0);

    checkCudaErrors(cudaStreamSynchronize(0));

    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));


    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));

    cudaDeviceReset();  

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;


    // GPU output check
    int sum = 0;
    for(int i = 0; i < dataSize; i++)      
        sum += h_groundTruth[i] - h_out[i];

    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;

    return 0;
}


Source: https://stackoverflow.com/questions/24754878/false-dependency-issue-for-the-fermi-architecture
