How can I make IDCT run faster on my GPU?


Question


I am trying to optimize the IDCT from this code for the GPU. The GPU on my system is an NVIDIA Tesla K20c.

The IDCT function as written in the original code looks like this:

void IDCT(int32_t *input, uint8_t *output) {
    int32_t Y[64];
    int32_t k, l;

    // Row pass: scale the input coefficients and run a 1-D IDCT on each row.
    for (k = 0; k < 8; k++) {
        for (l = 0; l < 8; l++) Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        idct_1d(&Y(k, 0));
    }

    // Column pass: 1-D IDCT on each column, then descale and clamp to [0, 255].
    for (l = 0; l < 8; l++) {
        int32_t Yc[8];

        for (k = 0; k < 8; k++) Yc[k] = Y(k, l);

        idct_1d(Yc);

        for (k = 0; k < 8; k++) {
            int32_t r = 128 + DESCALE(Yc[k], S_BITS + 3);
            r = r > 0 ? (r < 255 ? r : 255) : 0;
            X(k, l) = r;
        }
    }
}
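
For reference, Y(), X(), SCALE and DESCALE are macros defined in the linked .c file; their exact definitions are not reproduced here, but they are along these lines (a sketch of typical fixed-point helpers only, so the real shift and rounding details may differ):

// Sketch of the helper macros used above (assumed typical definitions;
// the authoritative versions are in the linked .c file).
#define Y(i, j)       Y[((i) << 3) + (j)]        // row-major access into the 8x8 work array
#define X(i, j)       output[((i) << 3) + (j)]   // row-major access into the 8x8 output block
#define SCALE(x, n)   ((x) << (n))               // fixed-point scale up by n bits
#define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))  // scale back down with rounding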

More details about the .c file can be found in this link.

This function is called from main.c in the following way:

for (index_X = 0; index_X < nb_MCU_X; index_X++) {
    for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
        for (index = 0; index < SOS_section.n; index++) {
            uint32_t component_index = component_order[index];
            int nb_MCU = ((SOF_component[component_index].HV >> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);

            for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {
                // unpack block function
                // iqzz function
                IDCT(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
            }
            // other function
        }
        // Code continues...

Details of main.c can be found in this link.

I made a kernel out of the IDCT function in the following way:

__kernel void IDCT(__global int* input, __global uchar* output)
{
    unsigned int kid = get_global_id(0);   // work-item id (not used in the body yet)

    int Y[64];
    int k, l;
    int Yc[8];

    // Row pass: scale the coefficients and run a 1-D IDCT on each row.
    for (k = 0; k < 8; k++) {
        for (l = 0; l < 8; l++) {
            Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        }
        idct_1D(&Y(k, 0));
    }

    // Column pass: 1-D IDCT on each column, then descale and clamp to [0, 255].
    for (l = 0; l < 8; l++) {
        for (k = 0; k < 8; k++) {
            Yc[k] = Y(k, l);
        }

        idct_1D(Yc);

        for (k = 0; k < 8; k++) {
            int r = 128 + DESCALE(Yc[k], S_BITS + 3);
            r = r > 0 ? (r < 255 ? r : 255) : 0;
            X(k, l) = r;
        }
    }
}

I am calling this kernel from main.c in this way:

for (index_X = 0; index_X < nb_MCU_X; index_X++) {
    for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
        for (index = 0; index < SOS_section.n; index++) {
            uint32_t component_index = component_order[index];
            int nb_MCU = ((SOF_component[component_index].HV >> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);

            for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {
                cl_mem DCT_Input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 64 * sizeof(cl_int), unZZ_MCU, &ret);

                // Output buffer
                cl_mem DCT_Output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), &ret);
                chk(ret, "clCreateBuffer");

                ret = clSetKernelArg(cos_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
                ret |= clSetKernelArg(cos_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);

                // Timing - start
                start_time = wtime();

                size_t globalForInverseDCT = 1024;
                size_t localForInverseDCT = 256;

                ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 1, NULL, &globalForInverseDCT, &localForInverseDCT, 0, NULL, NULL);

                // Timing - end
                run_time = wtime() - start_time;

                ret = clEnqueueReadBuffer(command_queue, DCT_Output, CL_TRUE, 0, (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4, YCbCr_MCU_ds[component_index] + (64 * chroma_ss), 0, NULL, NULL);
            }

            // other function
        }
        // code continues...

I am using this function for timing IDCT:

double wtime()
{
    /* Use a generic timer */
    static int sec = -1;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    if (sec < 0) sec = tv.tv_sec;
    return (tv.tv_sec - sec) + 1.0e-6 * tv.tv_usec;
}

The execution time for IDCT on the CPU with my timing method is 5 microseconds. The execution time on the GPU is 20 microseconds. (I also used OpenCL's built-in event profiling, which gave an execution time of 31 microseconds.)
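
For reference, the event-based profiling mentioned above looks roughly like this (a minimal sketch; it assumes the command queue was created with CL_QUEUE_PROFILING_ENABLE, and idct_event, t_start and t_end are illustrative names, not from the original code):

cl_event idct_event;
cl_ulong t_start, t_end;

// Attach an event to the kernel launch so the runtime records timestamps.
ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 1, NULL,
                             &globalForInverseDCT, &localForInverseDCT,
                             0, NULL, &idct_event);

// Wait for the kernel to finish before reading the timestamps.
clWaitForEvents(1, &idct_event);

clGetEventProfilingInfo(idct_event, CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &t_start, NULL);
clGetEventProfilingInfo(idct_event, CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &t_end, NULL);

// Timestamps are in nanoseconds.
printf("IDCT kernel time: %.3f us\n", (t_end - t_start) * 1.0e-3);

clReleaseEvent(idct_event);

Note that clEnqueueNDRangeKernel is asynchronous, so wall-clock timing around the enqueue call alone mostly measures submission overhead rather than kernel execution; the event timestamps measure the kernel itself, which may account for the gap between the two numbers above.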

How can I modify my code so that it runs faster on the GPU?

MY WORK:

I tried to break my kernel down into smaller kernels, but I am getting static (garbage) as my screen output.

I profiled the code again and I got the following output:

The execution time for my IDCT kernel according to Nvidia's Visual Profiler is 17.6 microseconds (approx.).

EDIT:

I profiled my code again with the command-line profiler, and the execution time for the IDCT kernel is now 4.9 microseconds.

Please suggest how I can further optimize my kernel.

Source: https://stackoverflow.com/questions/45927692/how-can-i-make-idct-run-faster-on-my-gpu
