Strategy for doing final reduction

岁酱吖の 提交于 2019-12-01 16:18:17

If that float's order of magnitude is smaller than exa scale, then:

Instead of

if (local_id == 0)
  partialSums[get_group_id(0)] = localSums[0];

You could use

if (local_id == 0)
{
    if(strategy==ATOMIC)
    {
        long integer_part=getIntegerPart(localSums[0]);
        atom_add (&totalSumIntegerPart[0] ,integer_part);
        long float_part=1000000*getFloatPart(localSums[0]);
         // 1000000 for saving meaningful 7 digits as integer
        atom_add (&totalSumFloatPart[0] ,float_part);
    }
}

this will overflow float part so when you divide it by 1000000 in another kernel, it may have more than 1000000 value so you get its integer part and add it to the real integer part:

   float value=0;
   if(strategy==ATOMIC)
   {
       float float_part=getFloatPart_(totalSumFloatPart[0]);
       float integer_part=getIntegerPart_(totalSumFloatPart[0])
       + totalSumIntegerPart[0];
       value=integer_part+float_part;
   }

just a few atomic operations shouldn't be effective on whole kernel time.

Some of these get___part can be written easily already using floor and similar functions. Some need a divide by 1M.

Sorry for previous code. also It has problem.

CLK_GLOBAL_MEM_FENCE effects only current workgroup. I confused. =[

If you want to reduction sum by GPU, you should enqueue reduction kernel by NDRangeKernel function after clFinish(commandQueue).

Plaese just take concept.

__kernel void sumGPU ( __global const double *input,
                       __global double *partialSums,
               __local double *localSums)
  {
 uint local_id = get_local_id(0);
 uint group_size = get_local_size(0);

  // Copy from global memory to local memory
  localSums[local_id] = input[get_global_id(0)];

  // Loop for computing localSums
  for (uint stride = group_size/2; stride>0; stride /=2)
     {
      // Waiting for each 2x2 addition into given workgroup
      barrier(CLK_LOCAL_MEM_FENCE);

      // Divide WorkGroup into 2 parts and add elements 2 by 2
      // between local_id and local_id + stride
      if (local_id < stride)
        localSums[local_id] += localSums[local_id + stride];
     }

  // Write result into partialSums[nWorkGroups]
  if (local_id == 0)
    partialSums[get_group_id(0)] = localSums[0];

    barrier(CLK_GLOBAL_MEM_FENCE);

      if(get_group_id(0)==0){
          if(local_id < get_num_groups(0)){  // 16384
            for(int n=0 ; n<get_num_groups(0) ; n+= group_size )
               localSums[local_id] += partialSums[local_id+n];
            barrier(CLK_LOCAL_MEM_FENCE);

            for(int s=group_size/2;s>0;s/=2){
               if(local_id < s)
                  localSums[local_id] += localSums[local_id+s];
               barrier(CLK_LOCAL_MEM_FENCE);
            }
            if(local_id == 0)
               partialSums[0] = localSums[0];
          }
       }
 }

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!