2D CUDA median filter optimization

清酒与你 · 2021-01-01 04:12

I have implemented a 2D median filter in CUDA and the whole program is shown below.

#include \"cuda_runtime.h\"
#include \"cuda_runtime_api.h\"
#include \"de         


        
4 Answers
  •  盖世英雄少女心
    2021-01-01 04:28

    I'm answering your last question on the use of shared memory.

    As Eric already noticed, your use of shared memory does not really lead to thread collaboration: each thread writes to and reads back only its own row surround[tid][...], so the shared memory is effectively used as per-thread private storage rather than as a staging area whose contents are reused by neighbouring threads.

    For the 3x3 case, I'm comparing your solution with a variant of your kernel that does not use shared memory at all, as well as with the Accelereyes solution discussed in 2D median filtering in CUDA: how to efficiently copy global memory to shared memory.

    Here is the complete code:

    #include <iostream>
    #include <fstream>
    #include <cstdio>
    
    using namespace std;
    
    #define BLOCK_WIDTH 16 
    #define BLOCK_HEIGHT 16
    
    /*******************/
    /* iDivUp FUNCTION */
    /*******************/
    int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
    
    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess) 
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) exit(code);
        }
    }
    
    /**********************************************/
    /* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
    /**********************************************/
    __global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height)
    {
        const int tx_l = threadIdx.x;                           // --- Local thread x index
        const int ty_l = threadIdx.y;                           // --- Local thread y index
    
        const int tx_g = blockIdx.x * blockDim.x + tx_l;        // --- Global thread x index
        const int ty_g = blockIdx.y * blockDim.y + ty_l;        // --- Global thread y index
    
        __shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2];
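        // --- Each block stages a (BLOCK_WIDTH+2) x (BLOCK_HEIGHT+2) tile: the BLOCK_WIDTH x BLOCK_HEIGHT pixels
        // --- assigned to the block plus a one-pixel halo, so neighbouring threads can reuse each other's loads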
    
        // --- Fill the shared memory border with zeros
        if (tx_l == 0)                      smem[tx_l]  [ty_l+1]    = 0;    // --- left border
        else if (tx_l == BLOCK_WIDTH-1)     smem[tx_l+2][ty_l+1]    = 0;    // --- right border
        if (ty_l == 0) {                    smem[tx_l+1][ty_l]      = 0;    // --- upper border
            if (tx_l == 0)                  smem[tx_l]  [ty_l]      = 0;    // --- top-left corner
            else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l]      = 0;    // --- top-right corner
        } else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2]    = 0;    // --- bottom border
            if (tx_l == 0)                  smem[tx_l]  [ty_l+2]    = 0;    // --- bottom-left corner
            else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2]    = 0;    // --- bottom-right corner
        }
    
        // --- Fill shared memory
        smem[tx_l+1][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g];                                                                        // --- center
        if ((tx_l == 0)&&(tx_g > 0))                                        smem[tx_l]  [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1];          // --- left border
        else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))         smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1];          // --- right border
        if ((ty_l == 0)&&(ty_g > 0)) {                                      smem[tx_l+1][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g];        // --- upper border
            if ((tx_l == 0)&&(tx_g > 0))                                    smem[tx_l]  [ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g-1];      // --- top-left corner
            else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))     smem[tx_l+2][ty_l]   = Input_Image[(ty_g-1)*Image_Width + tx_g+1];      // --- top-right corner
        } else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) {   smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g];        // --- bottom border
            if ((tx_l == 0)&&(tx_g > 0))                                    smem[tx_l]  [ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g-1];      // --- bottom-left corner
            else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))     smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1];      // --- bottom-right corner
        }
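        // --- Wait until all threads have finished staging their pixels before any thread reads its 3x3 window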
        __syncthreads();
    
        // --- Pull the 3x3 window in a local array
        unsigned short v[9] = { smem[tx_l][ty_l],   smem[tx_l+1][ty_l],     smem[tx_l+2][ty_l],
                                smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1],   smem[tx_l+2][ty_l+1],
                                smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2],   smem[tx_l+2][ty_l+2] };    
    
        // --- Bubble-sort
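        // --- Only the first five passes are needed: once the five smallest values are in place, v[4] is the median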
        for (int i = 0; i < 5; i++) {
            for (int j = i + 1; j < 9; j++) {
                if (v[i] > v[j]) { // swap?
                    unsigned short tmp = v[i];
                    v[i] = v[j];
                    v[j] = tmp;
                }
             }
        }
    
        // --- Pick the middle one
        Output_Image[ty_g*Image_Width + tx_g] = v[4];
    }
    
    /****************************/
    /* ORIGINAL KERNEL FUNCTION */
    /****************************/
    __global__ void Original_Kernel_Function(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
    
        __shared__ unsigned short surround[BLOCK_WIDTH*BLOCK_HEIGHT][9];
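        // --- Note: each thread writes to and reads back only its own row surround[tid][...], so no data is
        // --- actually exchanged between threads; the buffer behaves like per-thread private storage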
    
        int iterator;
    
        const int x     = blockDim.x * blockIdx.x + threadIdx.x;
        const int y     = blockDim.y * blockIdx.y + threadIdx.y;
        const int tid   = threadIdx.y * blockDim.x + threadIdx.x;   
    
        if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
    
        // --- Fill shared memory
        iterator = 0;
        for (int r = x - 1; r <= x + 1; r++) {
            for (int c = y - 1; c <= y + 1; c++) {
                surround[tid][iterator] = Input_Image[c*Image_Width+r];
                iterator++;
            }
        }
    
        // --- Sort shared memory to find the median using a partial selection sort
        for (int i=0; i<5; ++i) {
    
            // --- Find the position of the minimum element
            int minval=i;
            for (int l=i+1; l<9; ++l) if (surround[tid][l] < surround[tid][minval]) minval=l;
    
            // --- Put found minimum element in its place
            unsigned short temp = surround[tid][i];
            surround[tid][i]=surround[tid][minval];
            surround[tid][minval]=temp;
        }
    
        // --- Pick the middle one
        Output_Image[(y*Image_Width)+x]=surround[tid][4]; 
    
        __syncthreads();
    
    }
    
    /***********************************************/
    /* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
    /***********************************************/
    __global__ void Original_Kernel_Function_no_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {
    
        unsigned short surround[9];
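        // --- The 3x3 window is private to each thread (held in registers or thread-local memory); no shared memory is used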
    
        int iterator;
    
        const int x     = blockDim.x * blockIdx.x + threadIdx.x;
        const int y     = blockDim.y * blockIdx.y + threadIdx.y;
        const int tid   = threadIdx.y * blockDim.x + threadIdx.x;   
    
        if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;
    
        // --- Fill array private to the threads
        iterator = 0;
        for (int r = x - 1; r <= x + 1; r++) {
            for (int c = y - 1; c <= y + 1; c++) {
                surround[iterator] = Input_Image[c*Image_Width+r];
                iterator++;
            }
        }
    
        // --- Sort the private array to find the median using a partial selection sort
        for (int i=0; i<5; ++i) {
    
            // --- Find the position of the minimum element
            int minval=i;
            for (int l=i+1; l<9; ++l) if (surround[l] < surround[minval]) minval=l;
    
            // --- Put found minimum element in its place
            unsigned short temp = surround[i];
            surround[i]=surround[minval];
            surround[minval]=temp;
        }
    
        // --- Pick the middle one
        Output_Image[(y*Image_Width)+x]=surround[4]; 
    
    }
    
    /********/
    /* MAIN */
    /********/
    int main()
    {
        const int Image_Width = 1580;
        const int Image_Height = 1050;
    
        // --- Open data file
        ifstream is; is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary );
    
        // --- Get file length
        is.seekg(0, ios::end);
        int dataLength = is.tellg();
        is.seekg(0, ios::beg);
    
        // --- Read data from file and close file
        unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char) / sizeof(unsigned short)];
        is.read((char*)Input_Image_Host,dataLength);
        is.close();
    
        // --- CUDA warm up
        unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short)));
        gpuErrchk(cudaFree(forFirstCudaMalloc));
    
        // --- Allocate host and device memory spaces 
        unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength);
        unsigned short *Input_Image; gpuErrchk(cudaMalloc( (void**)&Input_Image, dataLength * sizeof(unsigned short))); 
        unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short))); 
    
        // --- Copy data from host to device
        gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering
    
        // --- Grid and block sizes
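        // --- iDivUp rounds up so that the grid of blocks covers the entire image even when the image
        // --- dimensions are not multiples of the block dimensions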
        const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);      
        const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); 
    
        /****************************/
        /* ORIGINAL KERNEL FUNCTION */
        /****************************/
        float time;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
    
        cudaFuncSetCacheConfig(Original_Kernel_Function, cudaFuncCachePreferShared);
        Original_Kernel_Function<<<grid, block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Original kernel function - elapsed time:  %3.3f ms \n", time);
    
        /***********************************************/
        /* ORIGINAL KERNEL FUNCTION - NO SHARED MEMORY */
        /***********************************************/
        cudaEventRecord(start, 0);
    
        cudaFuncSetCacheConfig(Original_Kernel_Function_no_shared, cudaFuncCachePreferL1);
        Original_Kernel_Function_no_shared<<<grid, block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Original kernel function - no shared - elapsed time:  %3.3f ms \n", time);
    
        /**********************************************/
        /* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */
        /**********************************************/
        cudaEventRecord(start, 0);
    
        cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared);
        Optimized_Kernel_Function_shared<<<grid, block>>>(Input_Image, Output_Image, Image_Width, Image_Height);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&time, start, stop);
        printf("Optimized kernel function - shared - elapsed time:  %3.3f ms \n", time);
    
        // --- Copy results back to the host
        gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost));
    
        // --- Open results file, write results and close the file
        ofstream of2;     of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw",  ios::binary);
        of2.write((char*)Output_Image_Host, dataLength);
        of2.close();
    
        cout << "\n Press Any Key To Exit..!!";
        gpuErrchk(cudaFree(Input_Image));
        gpuErrchk(cudaFree(Output_Image));

        delete[] Input_Image_Host;
        free(Output_Image_Host);
    
        return 0;
    }
    

    Here are the timing results on a Kepler K20c:

    1580 x 1050
    Original_Kernel_Function             = 1.588ms
    Original_Kernel_Function_no_shared   = 1.278ms
    Optimized_Kernel_Function_shared     = 1.455ms
    
    2048 x 2048
    Original_Kernel_Function             = 3.94ms
    Original_Kernel_Function_no_shared   = 3.118ms
    Optimized_Kernel_Function_shared     = 3.709ms
    
    4096 x 4096
    Original_Kernel_Function             = 16.003ms
    Original_Kernel_Function_no_shared   = 13.735ms
    Optimized_Kernel_Function_shared     = 14.526ms
    
    8192 x 8192
    Original_Kernel_Function             = 62.278ms
    Original_Kernel_Function_no_shared   = 47.484ms
    Optimized_Kernel_Function_shared     = 57.474ms
    

    Here are the timing results on a GT540M, which is more similar to your card:

    1580 x 1050
    Original_Kernel_Function             = 10.332 ms
    Original_Kernel_Function_no_shared   =  9.294 ms
    Optimized_Kernel_Function_shared     = 10.301 ms
    
    2048 x 2048
    Original_Kernel_Function             = 25.256 ms
    Original_Kernel_Function_no_shared   = 23.567 ms
    Optimized_Kernel_Function_shared     = 23.876 ms
    
    4096 x 4096
    Original_Kernel_Function             = 99.791 ms
    Original_Kernel_Function_no_shared   = 93.919 ms
    Optimized_Kernel_Function_shared     = 95.464 ms
    
    8192 x 8192
    Original_Kernel_Function             = 399.259 ms
    Original_Kernel_Function_no_shared   = 375.634 ms
    Optimized_Kernel_Function_shared     = 383.121 ms
    

    As can be seen, the version that does not use shared memory turns out to be (slightly) faster in all cases: for a 3x3 window on these cards, the data reuse enabled by shared memory does not pay for the cost of staging the halo and synchronizing.
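
    A further direction, not covered by the timings above, concerns the sort itself rather than the memory accesses: for a fixed 3x3 window, the data-dependent partial sort can be replaced by a fixed sequence of 19 compare-and-swap operations (the classic median-of-9 exchange network). The kernel below is only an illustrative sketch in the style of Original_Kernel_Function_no_shared; it was not benchmarked here, and the names Median_3x3_exchange_network and cswap are made up for the example.

    // --- Order a pair so that a <= b
    __device__ __forceinline__ void cswap(unsigned short &a, unsigned short &b) {
        if (a > b) { unsigned short t = a; a = b; b = t; }
    }

    __global__ void Median_3x3_exchange_network(const unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) {

        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if( (x >= (Image_Width - 1)) || (y >= Image_Height - 1) || (x == 0) || (y == 0)) return;

        // --- Load the 3x3 neighborhood into a thread-private array
        unsigned short v[9];
        int iterator = 0;
        for (int r = x - 1; r <= x + 1; r++)
            for (int c = y - 1; c <= y + 1; c++)
                v[iterator++] = Input_Image[c*Image_Width + r];

        // --- 19-exchange median-of-9 network: after these swaps, v[4] holds the median
        cswap(v[1], v[2]); cswap(v[4], v[5]); cswap(v[7], v[8]);
        cswap(v[0], v[1]); cswap(v[3], v[4]); cswap(v[6], v[7]);
        cswap(v[1], v[2]); cswap(v[4], v[5]); cswap(v[7], v[8]);
        cswap(v[0], v[3]); cswap(v[5], v[8]); cswap(v[4], v[7]);
        cswap(v[3], v[6]); cswap(v[1], v[4]); cswap(v[2], v[5]);
        cswap(v[4], v[7]); cswap(v[4], v[2]); cswap(v[6], v[4]);
        cswap(v[4], v[2]);

        Output_Image[(y*Image_Width)+x] = v[4];
    }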
