Question
I am trying to optimize the IDCT from this code for the GPU. The GPU on my system is an NVIDIA Tesla K20c. The IDCT function as written in the original code looks like this:
void IDCT(int32_t *input, uint8_t *output) {
    int32_t Y[64];
    int32_t k, l;

    for (k = 0; k < 8; k++) {
        for (l = 0; l < 8; l++) Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        idct_1d(&Y(k, 0));
    }
    for (l = 0; l < 8; l++) {
        int32_t Yc[8];
        for (k = 0; k < 8; k++) Yc[k] = Y(k, l);
        idct_1d(Yc);
        for (k = 0; k < 8; k++) {
            int32_t r = 128 + DESCALE(Yc[k], S_BITS + 3);
            r = r > 0 ? (r < 255 ? r : 255) : 0;
            X(k, l) = r;
        }
    }
}
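The function relies on helper macros (Y, X, SCALE, DESCALE) and the constant S_BITS that are defined elsewhere in the .c file and are not shown here. Judging only from how they are used above, in this kind of fixed-point 2-D IDCT they presumably look roughly like the following (an assumption inferred from the usage, not the original definitions):

/* Assumed definitions, inferred from the usage above -- the real ones are in the linked .c file. */
#define S_BITS        3                                  /* fixed-point scaling bits (assumed value) */
#define Y(k, l)       Y[((k) << 3) + (l)]                /* row-major access into the 8x8 work array */
#define X(k, l)       output[((k) << 3) + (l)]           /* row-major access into the 8x8 output block */
#define SCALE(x, n)   ((x) << (n))                       /* scale up into fixed point */
#define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))  /* scale back down with rounding */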
More details about the .c file can be found in this link. This function is called from main.c in the following way:
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
    for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
        for (index = 0; index < SOS_section.n; index++) {
            uint32_t component_index = component_order[index];
            int nb_MCU = ((SOF_component[component_index].HV >> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);
            for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {
                // unpack block function
                // iqzz function
                IDCT(unZZ_MCU, YCbCr_MCU_ds[component_index] + (64 * chroma_ss));
            }
            // other function
        }
        // Code continues...
Details of main.c can be found in this link.
I made a kernel out of the IDCT function in the following way:
__kernel void IDCT(__global int *input, __global uchar *output)
{
    unsigned int kid = get_global_id(0);
    int Y[64];
    int k, l;
    int Yc[8];

    for (k = 0; k < 8; k++) {
        for (l = 0; l < 8; l++) {
            Y(k, l) = SCALE(input[(k << 3) + l], S_BITS);
        }
        idct_1D(&Y(k, 0));
    }
    for (l = 0; l < 8; l++) {
        for (k = 0; k < 8; k++) {
            Yc[k] = Y(k, l);
        }
        idct_1D(Yc);
        for (k = 0; k < 8; k++) {
            int r = 128 + DESCALE(Yc[k], S_BITS + 3);
            r = r > 0 ? (r < 255 ? r : 255) : 0;
            X(k, l) = r;
        }
    }
}
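Note that the kernel above never actually uses kid, so every work-item in the NDRange recomputes the same full 8x8 block. Purely as an illustration of how one block could instead be split across a work-group of 8 work-items (one row per work-item, then one column per work-item), a sketch might look roughly like this; it assumes the same SCALE/DESCALE macros, S_BITS, and the same idct_1D helper, and the kernel name is hypothetical:

/* Illustrative sketch only: one work-group of 8 work-items handles one 8x8 block.
   Each work-item transforms one row, then (after a barrier) one column. */
__kernel void IDCT_block(__global const int *input, __global uchar *output)
{
    __local int Y[64];                 /* block shared inside the work-group */
    int row[8], col[8];
    int i;
    int lid = get_local_id(0);         /* 0..7: this work-item's row, then its column */

    /* Row pass: scale and run the 1-D IDCT on row 'lid'. */
    for (i = 0; i < 8; i++)
        row[i] = SCALE(input[(lid << 3) + i], S_BITS);
    idct_1D(row);
    for (i = 0; i < 8; i++)
        Y[(lid << 3) + i] = row[i];

    barrier(CLK_LOCAL_MEM_FENCE);      /* all rows finished before the column pass */

    /* Column pass: run the 1-D IDCT on column 'lid', then clamp and write out. */
    for (i = 0; i < 8; i++)
        col[i] = Y[(i << 3) + lid];
    idct_1D(col);
    for (i = 0; i < 8; i++) {
        int r = 128 + DESCALE(col[i], S_BITS + 3);
        output[(i << 3) + lid] = (uchar)(r > 0 ? (r < 255 ? r : 255) : 0);
    }
}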
I am calling this kernel from main.c in this way:
for (index_X = 0; index_X < nb_MCU_X; index_X++) {
    for (index_Y = 0; index_Y < nb_MCU_Y; index_Y++) {
        for (index = 0; index < SOS_section.n; index++) {
            uint32_t component_index = component_order[index];
            int nb_MCU = ((SOF_component[component_index].HV >> 4) & 0xf) * (SOF_component[component_index].HV & 0x0f);
            for (chroma_ss = 0; chroma_ss < nb_MCU; chroma_ss++) {
                cl_mem DCT_Input = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                  64 * sizeof(cl_int), unZZ_MCU, &ret);
                // Output buffer
                cl_mem DCT_Output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                   (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4,
                                                   YCbCr_MCU_ds[component_index] + (64 * chroma_ss), &ret);
                chk(ret, "clCreateBuffer");

                ret  = clSetKernelArg(cos_kernel, 0, sizeof(cl_mem), (void *)&DCT_Input);
                ret |= clSetKernelArg(cos_kernel, 1, sizeof(cl_mem), (void *)&DCT_Output);

                // Timing - start
                start_time = wtime();
                size_t globalForInverseDCT = 1024;
                size_t localForInverseDCT = 256;
                ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 1, NULL,
                                             &globalForInverseDCT, &localForInverseDCT, 0, NULL, NULL);
                // Timing - end
                run_time = wtime() - start_time;

                ret = clEnqueueReadBuffer(command_queue, DCT_Output, CL_TRUE, 0,
                                          (MCU_sx * MCU_sy * max_ss_h * max_ss_v) + 4,
                                          YCbCr_MCU_ds[component_index] + (64 * chroma_ss), 0, NULL, NULL);
            }
            // other function
        }
        // code continues...
I am using this function for timing the IDCT:
double wtime()
{
    /* Use a generic timer */
    static int sec = -1;
    struct timeval tv;
    gettimeofday(&tv, NULL);
    if (sec < 0) sec = tv.tv_sec;
    return (tv.tv_sec - sec) + 1.0e-6 * tv.tv_usec;
}
The execution time for IDCT on the CPU with my timing method is 5 microseconds. The execution time on the GPU is 20 microseconds. (I also used OpenCL's built-in profiling, and it reported an execution time of 31 microseconds.)
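(OpenCL's built-in profiling works via event timestamps, whereas the wtime() pair above only brackets the clEnqueueNDRangeKernel call, which returns as soon as the command is queued. A generic sketch of the event-based timing, not my exact code, looks roughly like this; it requires the command queue to have been created with CL_QUEUE_PROFILING_ENABLE:)

/* Generic sketch of OpenCL event profiling.
   The queue must be created with profiling enabled, e.g.
   clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &ret); */
cl_event evt;
cl_ulong t_start, t_end;

ret = clEnqueueNDRangeKernel(command_queue, cos_kernel, 1, NULL,
                             &globalForInverseDCT, &localForInverseDCT, 0, NULL, &evt);
clWaitForEvents(1, &evt);                        /* wait until the kernel has actually finished */

clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,   sizeof(t_end),   &t_end,   NULL);
double kernel_us = (t_end - t_start) * 1.0e-3;   /* timestamps are in nanoseconds */
clReleaseEvent(evt);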
How can I modify my code so that it runs faster on the GPU?
MY WORK:
I tried to break my kernel down into smaller kernels in this way, but I am getting static as my screen output.
I profiled the code again and I got the following output:
The execution time for my IDCT kernel according to Nvidia's Visual Profiler is 17.6 microseconds (approx.).
EDIT:
I profiled my code again with the command line profiler and the execution time for idct is now 4.9 microseconds.
PLEASE SUGGEST HOW I CAN FURTHER OPTIMIZE MY KERNEL.
Source: https://stackoverflow.com/questions/45927692/how-can-i-make-idct-run-faster-on-my-gpu