can't enter into __global__ function using cuda

末鹿安然 提交于 2019-12-20 03:58:21

问题


I have written a code on Nsight that compiles and can be executed but the first launch can't be completed.

The strange thing is that when I run it in debug mode, it works perfectly but it is too slow.

Here is the part of the code before entering the function that access the GPU (where i think there is an error I can't find) :

void parallelAction (int * dataReturned, char * data, unsigned char * descBase, int range, int cardBase, int streamIdx)
{
    size_t inputBytes = range*128*sizeof(unsigned char);
    size_t baseBytes = cardBase*128*sizeof(unsigned char);
    size_t outputBytes = range*sizeof(int);

    unsigned char * data_d;
    unsigned char * descBase_d;
    int * cardBase_d;
    int * dataReturned_d;

    cudaMalloc((void **) &data_d, inputBytes);  
    cudaMalloc((void **) &descBase_d, baseBytes);
    cudaMalloc((void **) &cardBase_d, sizeof(int));
    cudaMalloc((void **) &dataReturned_d, outputBytes);

    int blockSize = 196;
    int nBlocks = range/blockSize + (range%blockSize == 0?0:1);

    cudaMemcpy(data_d, data, inputBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(descBase_d, descBase, baseBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(cardBase_d, &cardBase, sizeof(int), cudaMemcpyHostToDevice);

    FindClosestDescriptor<<< nBlocks, blockSize >>>(dataReturned_d, data_d, descBase_d, cardBase_d);

    cudaMemcpy(dataReturned, dataReturned_d, outputBytes, cudaMemcpyDeviceToHost);

    cudaFree(data_d);
    cudaFree(descBase_d);
    cudaFree(cardBase_d);
    cudaFree(dataReturned_d);
}

And the function entering the GPU (I don't think the error is here) :

__global__ void FindClosestDescriptor(int * dataReturned, unsigned char * data, unsigned char * base, int *cardBase)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned char descriptor1[128], descriptor2[128];
    int part = 0;
    int result = 0;
    int winner = 0;
    int minDistance = 0;
    int itelimit = *cardBase;
    for (int k = 0; k < 128; k++)
    {
        descriptor1[k] = data[idx*128+k];

    }
    // initialize minDistance
    for (int k = 0; k < 128; k++)
    {
        descriptor2[k] = base[k];
    }

    for (int k = 0; k < 128; k++)
    {
        part = (descriptor1[k]-descriptor2[k]);
        part *= part;
        minDistance += part;
    }

    // test all descriptors in the base :
    for (int i = 1; i < itelimit; i++)
    {
        result = 0;
        for (int k = 0; k < 128; k++)
        {
            descriptor2[k] = base[i*128+k];
            // Calculate squared l2 distance :
            part = (descriptor1[k]-descriptor2[k]);
            part *= part;
            result += part;
        }

        // Compare to minDistance
        if (result < minDistance)
        {
            minDistance = result;
            winner = i;
        }
    }

    // Write the result in dataReturned
    dataReturned[idx] = winner;
}

Thank you in advance if you can help me.

EDIT : the last cudaMemcpy returns the error "the launch timed out and was terminated".


回答1:


linux has a watchdog mechanism. If your kernel runs for a long time (you say it is slow in debug mode) you can hit the linux watchdog, and receive the "launch timed out and was terminated" error.

In this case you have several things you might try. The options are covered here.



来源:https://stackoverflow.com/questions/18357265/cant-enter-into-global-function-using-cuda

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!