Simple CUDA Test always fails with “an illegal memory access was encountered” error

前端 未结 1 1716
清酒与你
清酒与你 2020-12-17 19:15

If I run this program, I get an "an illegal memory access was encountered in matrixMulti.cu at line 48" error. I have searched and tried a lot without success, so I hope somebody can help.

相关标签:
1条回答
  • 2020-12-17 19:40

    In general, your method of allocating and copying a doubly-subscripted C array won't work. cudaMemcpy expects flat, contiguously allocated, single-pointer, single-subscript arrays.

    As a result of this confusion, the pointers being passed to your kernel (int** a, int** b) cannot be properly (safely) dereferenced twice:

    b[0][0]=4;
    

    When you try to do the above in kernel code, you get an illegal memory access, because you have not properly allocated a pointer-to-pointer style allocation on the device.

    If you ran your code with cuda-memcheck (or with compute-sanitizer, its replacement in newer CUDA toolkits), you would get another indication of the illegal memory access in the kernel code.

    The usual suggestion in these cases is to "flatten" your 2D arrays to a single dimension, and use appropriate pointer or index arithmetic to simulate 2D access. It is possible to allocate 2D arrays (i.e. double-subscript, double-pointer) on the device, but it is fairly involved (due in part to the need for a "deep copy": every row pointer must itself be a device pointer). If you'd like to learn more about that, just search this site for "CUDA 2D array" (the search box is in the upper right-hand corner).

    Here's a version of your code that has the array flattening for the device-side array:

    $ cat t60.cu
    #include <iostream>
    #include <cuda.h>
    #include <stdio.h>
    
    using namespace std;
    
    // Wraps an expression yielding a cudaError_t (typically a CUDA runtime call):
    // forwards the result to HandleError together with the file name and line
    // number of the call site.
    #define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )
    // Forward declaration so main() can call printVec before its definition.
    void printVec(int** a, int n);
    
    // Terminates the program when a CUDA runtime call did not succeed.
    //   err  - status code returned by the CUDA call being checked
    //   file - source file name of the call site (supplied by HANDLE_ERROR)
    //   line - source line number of the call site (supplied by HANDLE_ERROR)
    // On failure the CUDA error string and the call-site location are printed
    // to stdout and the process exits with EXIT_FAILURE; on success this is a
    // no-op.
    static void HandleError( cudaError_t err, const char *file, int line )
    {
        if (err == cudaSuccess)
            return;  // nothing to report

        const char *description = cudaGetErrorString( err );
        printf( "%s in %s at line %d\n", description, file, line );
        exit( EXIT_FAILURE );
    }
    
    // Checks the CUDA runtime's last-error state (via cudaGetLastError) and
    // aborts if an error is pending.
    //   msg - caller-supplied context string that prefixes the diagnostic
    // On error, writes "Cuda error: <msg>: <error string>." to stderr and exits
    // with EXIT_FAILURE; otherwise returns normally.
    void checkCUDAError(const char *msg)
    {
        const cudaError_t status = cudaGetLastError();
        if (status == cudaSuccess)
            return;

        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
    
    // Fills a flattened, row-major n x n matrix so that element (row, col)
    // holds the value col, i.e. b[row*n + col] = col.
    //   b - device pointer to at least n*n ints
    //   n - matrix dimension; n == 0 is a no-op
    // Launch layout: any 1D grid of 1D blocks. Threads cooperate through a
    // grid-stride loop over the flat index, so a <<<1,1>>> launch yields the
    // same result as a larger one. Only the x dimensions are used for indexing.
    // No shared memory is required.
    __global__ void MatrixMulti(int* b, unsigned n) {
        // size_t arithmetic keeps n*n and the flat index from wrapping in 32 bits.
        const size_t total  = (size_t)n * n;
        const size_t stride = (size_t)gridDim.x * blockDim.x;
        for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
             idx < total;
             idx += stride)
            b[idx] = (int)(idx % n);  // column of flat index idx in a row-major layout
    }
    
    // Host driver: builds an N x N int matrix on the host as an array of row
    // pointers layered over one contiguous buffer, copies that buffer to a flat
    // device allocation, lets MatrixMulti fill it, copies it back and prints it.
    // Returns 0 on success; any CUDA failure terminates the process through
    // HANDLE_ERROR / checkCUDAError.
    int main() {
        int N =10;
        int** array, *devarray;  // flatten device-side array
        array = new int*[N];
        // Host allocation needs to be contiguous so one cudaMemcpy can move it;
        // value-initialize it so the host-to-device copy never reads
        // indeterminate memory.
        array[0] = new int[N*N]();
        for (int i = 1; i < N; i++) array[i] = array[i-1]+N; //2D on top of contiguous allocation

        const size_t bytes = (size_t)N * N * sizeof(int);

        HANDLE_ERROR ( cudaMalloc((void**)&devarray, bytes ) );
        HANDLE_ERROR ( cudaMemcpy(devarray, array[0], bytes, cudaMemcpyHostToDevice) );
        MatrixMulti<<<1,1>>>(devarray, N);
        // A kernel launch returns no status, so query the runtime explicitly to
        // catch launch failures. Faults raised while the kernel runs surface at
        // the blocking cudaMemcpy below.
        checkCUDAError("MatrixMulti launch");
        HANDLE_ERROR ( cudaMemcpy(array[0], devarray, bytes, cudaMemcpyDeviceToHost) );
        HANDLE_ERROR ( cudaFree(devarray) );
        printVec(array,N);

        // Release the host storage: the contiguous data buffer first, then the
        // table of row pointers that aliases it.
        delete[] array[0];
        delete[] array;

        return 0;
    }
    
    // Prints an n x n matrix to standard output, one row per line.
    //   a - array of n row pointers, each addressing at least n ints
    //   n - number of rows and columns
    // Every element is followed by a single space, and each row is terminated
    // by one more space and a flushed newline.
    void printVec(int** a , int n) {
        for (int r = 0; r < n; ++r) {
            const int* rowData = a[r];
            for (int c = 0; c < n; ++c)
                std::cout << rowData[c] << " ";
            std::cout << " " << std::endl;
        }
    }
    $ nvcc -arch=sm_20 -o t60 t60.cu
    $ ./t60
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    0 1 2 3 4 5 6 7 8 9
    $
    
    0 讨论(0)
提交回复
热议问题