Cuda - Multiple sums in each vector element


Question


The product of two series of Chebyshev polynomials with coefficients a and b can be represented by the formula

c_k = 0.5 * ( sum_{j=0..k} a_j*b_{k-j} + sum_{j=1..n-1-k} ( a_j*b_{j+k} + a_{j+k}*b_j ) ),   for k = 0, ..., n-1
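For example, with n = 3 the middle coefficient expands to c_1 = 0.5*(a_0*b_1 + a_1*b_0 + a_1*b_2 + a_2*b_1).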

The problem is to parallelize this as much as possible.

I have managed to use CUDA to parallelize the formula above by simply assigning one thread per vector element, so each thread performs all of the sums/multiplications for its element.

#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <time.h>

__global__ void chebyprod(int n, float *a, float *b, float *c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   float sum;
   if (i < n) {
      sum = 0.f;
      for (int j = 0; j<=i; j++){
         sum += a[j]*b[i-j];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
   /*
   if (i < n)
      c[i] = a[i] + b[i];
   */  
}

int main(void){
  clock_t tStart = clock();
  int N = 10000;
  float *a, *b, *c, *d_a, *d_b, *d_c;
  a = (float*)malloc(N*sizeof(float));
  b = (float*)malloc(N*sizeof(float));
  c = (float*)malloc(N*sizeof(float));

  cudaMalloc(&d_a, N*sizeof(float)); 
  cudaMalloc(&d_b, N*sizeof(float));
  cudaMalloc(&d_c, N*sizeof(float));

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice);

  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  printf("Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);

  cudaMemcpy(c, d_c, N*sizeof(float), cudaMemcpyDeviceToHost);

  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
}

In another code I have managed to use sum reduction to sum all the elements in a vector (someone else's code that I copied from an NVIDIA presentation). The problem now is: how do I combine the two approaches? I want a bunch of threads to compute all of the sums/multiplications in every element of c. Any tips? Or perhaps a similar problem that I can learn from?

Row reduction in a matrix is possibly similar to this problem. However, I have multiple sums of different lengths, plus the multiplications.

This is the code provided by an NVIDIA employee (I think):

template <unsigned int blockSize>
__global__ void parreduc(float *array_in, float *reduct, size_t array_len)
{
    extern volatile __shared__ float sdata[];
    size_t tid      = threadIdx.x,
           gridSize = blockSize * gridDim.x,
           i        = blockIdx.x * blockSize + tid;
    sdata[tid] = 0;
    // grid-stride loop: each thread accumulates its own partial sum
    while (i < array_len) {
        sdata[tid] += array_in[i];
        i += gridSize;
    }
    __syncthreads();
    // tree reduction in shared memory
    if (blockSize >= 512) { if (tid < 256) sdata[tid] += sdata[tid + 256]; __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) sdata[tid] += sdata[tid + 128]; __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) sdata[tid] += sdata[tid +  64]; __syncthreads(); }
    // final steps within a single warp; volatile shared memory keeps the accesses ordered
    if (tid < 32) {
        if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
        if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
        if (blockSize >= 16) sdata[tid] += sdata[tid +  8];
        if (blockSize >=  8) sdata[tid] += sdata[tid +  4];
        if (blockSize >=  4) sdata[tid] += sdata[tid +  2];
        if (blockSize >=  2) sdata[tid] += sdata[tid +  1];
    }
    // thread 0 writes this block's partial sum to the output array
    if (tid == 0) reduct[blockIdx.x] = sdata[0];
}
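For reference, a kernel written this way produces one partial sum per block, and it has to be launched with the block size supplied both as the template argument and as the dynamic shared-memory size. A minimal launch sketch (the names d_in, d_partial, len and the grid-size choice are assumptions for illustration, not from the original code):

const unsigned int threads = 512;                    // must match the template argument
int blocks = (int)((len + threads - 1) / threads);   // one block per chunk of the input (assumption)
// third launch parameter = dynamic shared memory, one float per thread
parreduc<threads><<<blocks, threads, threads * sizeof(float)>>>(d_in, d_partial, len);
// d_partial now holds 'blocks' partial sums; a second reduction pass
// (or a small host-side loop) is still needed to obtain the final total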

Answer 1:


The code provided in the question is a sensible first realization. The thread strategy is the most common/typical one: assign one thread per output point (N output points here), with each thread performing all calculations necessary to compute its particular output point. Efforts to improve the performance of CUDA code should always address at least two CUDA optimization priorities:

  1. Expose enough parallelism (roughly: create enough threads)
  2. Make efficient use of memory (roughly: for global memory access, strive for coalescing)

With respect to item 1, the effectiveness of the code provided in the question will depend on the GPU. As a rough rule of thumb, we seek to launch at least 2048 threads per SM (1024 on Turing) on the GPU we are running on, to have a chance to "saturate" the GPU. For N = 10000, 10,000 threads can saturate a GPU that has about 5 SMs. For a Tesla V100, with 80 SMs, there is no hope of saturating that GPU with 10,000 threads.
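As a quick sanity check, the number of threads needed to "saturate" a particular GPU can be estimated at runtime from the device properties. A minimal sketch (illustrative only, not part of the posted code):

#include <iostream>
#include <cuda_runtime.h>

int main(){
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);   // query device 0
  // rough saturation target: maximum resident threads across all SMs
  int target = prop.multiProcessorCount * prop.maxThreadsPerMultiProcessor;
  std::cout << "SMs: " << prop.multiProcessorCount
            << ", threads to saturate: " << target << std::endl;
  return 0;
}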

With respect to item 2, the code provided also falls short to some degree; it has issues with coalescing: in many cases, adjacent threads are not reading adjacent values in memory. To pick just one example, the first global load I see is a[j]. For a given loop iteration, this loads the same value/location in every thread, rather than adjacent values in adjacent threads.
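To illustrate the distinction (schematic lines, not taken verbatim from the kernel): with i computed as blockIdx.x*blockDim.x + threadIdx.x and j a per-thread loop counter, the two loads below behave very differently across a warp.

float x = a[j];      // every thread in the warp reads the same element of a
float y = b[i - j];  // adjacent threads read adjacent elements of b (coalesced)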

Can we come up with an alternate realization that will possibly improve on both of these? We will consider the following change in thread strategy: assign one threadblock per output point, rather than one thread per output point. The calculations needed for each output point will be visualized as one "row" of a matrix. A threadblock will "stride" along the row, performing the calculations needed, and eventually doing a threadblock-level reduction to produce a single result for that row. This will allow us to address both items: adjacent threads in a warp will be able to read adjacent values from a and b, and we will also immediately be able to increase our total number of threads by a factor of up to 1024 (so, instead of 10 thousand threads, we could spin up ~10 million threads. 10 million is enough to saturate any current CUDA GPU).

This thread strategy also has another nice feature: the "rows" of calculations mentioned above have varying length. The first and last rows will be the longest, with approximately N calculation elements, whereas the rows in the middle will have closer to N/2 calculation elements. By choosing a block-stride loop (conceptually similar to a grid-stride loop) we can efficiently handle varying row lengths. Each threadblock will "stride" along the row, only as far as needed, accumulating results.

Here is a worked example of that realization:

$ cat t1497.cu
#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   mt sum;
   if (i < n) {
      sum = 0.f;
      for (int j = 0; j<=i; j++){
         sum += a[j]*b[i-j];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}
// assume one threadblock per c_k coefficient
// assume a power-of-2 threadblock size
const int tpb_p2 = 8;
const int nTPB = 1<<tpb_p2;
const unsigned row_mask = ~((0xFFFFFFFFU>>tpb_p2)<<tpb_p2);

__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
#ifndef NO_WS
  __shared__ mt sd[32];
  if (threadIdx.x < 32) sd[threadIdx.x] = 0;
  __syncthreads();
#else
  __shared__ mt sd[nTPB];
#endif
  int k = blockIdx.x;
  mt sum = 0.0f;
  int row_width = (((k)>(n-k))?(k):(n-k))+1;
  int strides = (row_width>>tpb_p2)+ ((row_width&row_mask)?1:0);
  int j = threadIdx.x;
  mt tmp_a;
  for (int s=0; s < strides; s++){ // block-stride loop
    if (j < n) tmp_a = a[j];
    if (j <= k) sum += tmp_a*b[k-j];
    if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
    j += nTPB;
    }
#ifndef NO_WS
  // 1st warp-shuffle reduction
  int lane = threadIdx.x & (warpSize-1);
  int warpID = threadIdx.x >> 5; // assumes warpSize == 32
  unsigned mask = 0xFFFFFFFFU;
  for (int offset = warpSize>>1; offset > 0; offset >>= 1)
    sum += __shfl_down_sync(mask, sum, offset);
  if (lane == 0) sd[warpID] = sum;
  __syncthreads(); // put warp results in shared mem
  // hereafter, just warp 0
  if (warpID == 0){
  // reload sum from shared mem (lanes beyond the number of warps read the zeros written at initialization)
    sum = sd[lane];
  // final warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(mask, sum, offset);
  }
#else
  sd[threadIdx.x] = sum;
  for (int s = nTPB>>1; s > 0; s>>=1){ // sweep reduction
    __syncthreads();
    if (threadIdx.x < s) sd[threadIdx.x] += sd[threadIdx.x+s];}
  if (!threadIdx.x) sum = sd[0];
#endif
  if (!threadIdx.x) c[k] = sum*0.5f;
}

int main(int argc, char *argv[]){
  int N = 10000;
  if (argc>1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(N*sizeof(mt));
  b  = (mt*)malloc(N*sizeof(mt));
  c  = (mt*)malloc(N*sizeof(mt));
  ic = (mt*)malloc(N*sizeof(mt));

  cudaMalloc(&d_a, N*sizeof(mt));
  cudaMalloc(&d_b, N*sizeof(mt));
  cudaMalloc(&d_c, N*sizeof(mt));

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);
  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  unsigned long long  dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);

  cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";
  dt = dtime_usec(0);
  chebyprod_imp<<< N, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << ic[k] << " ";
  std::cout <<"]\n";
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
}
$ nvcc -arch=sm_52 -o t1497 t1497.cu
$ ./t1497
blockSize: 1024
gridSize: 10
Time taken: 0.001687s
no error
Vector c: [ 199.996 199.986 199.976 199.966 199.956 199.946 199.936 199.926 199.916 199.906 ]
Time taken: 0.000350s
no error
Vector c: [ 199.99 199.98 199.97 199.96 199.95 199.94 199.93 199.92 199.91 199.9 ]
Max error = 0.0137787
$

(change the -arch switch to match your GPU)

The above example shows that the modified algorithm runs about 5x faster (on a Tesla V100). Although there are numerical differences, these are due to floating-point resolution and order of operations. To convince yourself that the algorithm gives the correct result, switch the typedef from float to double. There is then essentially no numerical difference in the results (suggesting that the two algorithms are logically the same), and the improved version in float resolution gives answers for the first 10 elements that are numerically closer to the "more accurate" result produced with double arithmetic.

As discussed in the comments, this algorithm transformation may not be beneficial in every case. The principal benefit will come from exploiting GPUs with a larger thread capacity (larger than N threads). Relatively smaller GPUs (perhaps 8 SMs or fewer, for N = 10000) may not benefit from this, and in fact the code may run slower than with the original algorithm.

Although I mention coalescing, for N = 10000 the input data here is quite small (two input vectors of 10,000 floats, roughly 80K bytes), which will fit in the L2 cache of most GPUs. Once the data is in the L2 cache, inefficient access patterns are much less of an issue. So the primary benefit of this algorithm in this case is probably attributable to item 1. If item 1 cannot be exploited, the algorithm shows little or no benefit.

For test purposes, I created another version using a warp-stride loop. However, it doesn't seem to be significantly faster on small GPUs, and it is actually slower on a V100:

#include <stdio.h>
#include <iostream>
#include <cuda.h>
typedef float mt;
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
const bool sync = true;
const bool nosync = false;
unsigned long long dtime_usec(unsigned long long start, bool use_sync = nosync){
  if (use_sync == sync) cudaDeviceSynchronize();
  timeval tv;
  gettimeofday(&tv, 0);
  return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
__global__ void chebyprod(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
   int i = blockIdx.x *blockDim.x + threadIdx.x;
   mt sum;
   if (i < n) {
      sum = 0.f;
      for (int j = 0; j<=i; j++){
         sum += a[j]*b[i-j];
      }
      for (int j = 1; j < n-i; j++){
         sum += a[j]*b[j+i]+a[j+i]*b[j];
      }
      c[i] = 0.5f*sum;
   }
}
// assume one warp per c_k coefficient
// assume a multiple-of-32 threadblock size
const int nTPB = 32*8;
const int warpSize_p2 = 5; // assumes warpSize == 32
const int nWarps = nTPB>>warpSize_p2;
const unsigned row_mask = ~((0xFFFFFFFFU>>warpSize_p2)<<warpSize_p2);
__global__ void chebyprod_imp(int n, const mt * __restrict__ a, const mt * __restrict__ b, mt * __restrict__ c){
  int warpID = threadIdx.x >> warpSize_p2;
  int k = blockIdx.x*(nWarps)+warpID;
  if (k < n){
    mt sum = 0.0f;
    int lane = threadIdx.x & (warpSize-1);
    int row_width = (((k)>(n-k))?(k):(n-k))+1;
    int strides = (row_width>>warpSize_p2)+ ((row_width&row_mask)?1:0);
    int j = lane;
    mt tmp_a;
    for (int s=0; s < strides; s++){ // warp-stride loop
      if (j < n) tmp_a = a[j];
      if (j <= k) sum += tmp_a*b[k-j];
      if ((j > 0) && (j < (n-k))) sum += tmp_a*b[j+k] + a[j+k]*b[j];
      j += warpSize;
      }
  // warp-shuffle reduction
    for (int offset = warpSize>>1; offset > 0; offset >>= 1)
      sum += __shfl_down_sync(0xFFFFFFFFU, sum, offset);
    if (lane==0) c[k] = sum*0.5f;}
}

int main(int argc, char *argv[]){
  int N = 10000;
  if (argc>1) N = atoi(argv[1]);
  std::cout << "N = " << N << std::endl;
  mt *a, *b, *c, *ic, *d_a, *d_b, *d_c;
  a  = (mt*)malloc(N*sizeof(mt));
  b  = (mt*)malloc(N*sizeof(mt));
  c  = (mt*)malloc(N*sizeof(mt));
  ic = (mt*)malloc(N*sizeof(mt));

  cudaMalloc(&d_a, N*sizeof(mt));
  cudaMalloc(&d_b, N*sizeof(mt));
  cudaMalloc(&d_c, N*sizeof(mt));

  for (int i = 0; i < N; i++) {
    a[i] = 0.1f;
    b[i] = 0.2f;
  }

  cudaMemcpy(d_a, a, N*sizeof(mt), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, N*sizeof(mt), cudaMemcpyHostToDevice);
  int blockSize, gridSize;
  // Number of threads in each thread block
  blockSize = 1024;

  // Number of thread blocks in grid
  gridSize = (int)ceil((float)N/blockSize);

  std::cout << "blockSize: " << blockSize << "\ngridSize: " << gridSize << "\n";

  // Perform chebyprod on N elements
  unsigned long long  dt = dtime_usec(0);
  chebyprod<<< gridSize, blockSize >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);

  cudaMemcpy(c, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << c[k] << " ";
  std::cout <<"]\n";
  dt = dtime_usec(0);
  chebyprod_imp<<< (N/nWarps)+1, nTPB >>>(N, d_a, d_b, d_c);
  dt = dtime_usec(dt,sync);
  cudaMemcpy(ic, d_c, N*sizeof(mt), cudaMemcpyDeviceToHost);
  printf("Time taken: %fs\n", dt/(float)USECPSEC);
  std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;
  std::cout << "Vector c: [ ";
  for (int k = 0; k < 10; ++k)
    std::cout << ic[k] << " ";
  std::cout <<"]\n";
  mt max_error = 0;
  for (int k = 0; k < N; k++)
    max_error = fmax(max_error, fabs(c[k] - ic[k]));
  std::cout << "Max error = " << max_error << std::endl;
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  free(a);
  free(b);
  free(c);
  free(ic);
}


Source: https://stackoverflow.com/questions/57644701/cuda-multiple-sums-in-each-vector-element
