问题
I intend to compute, in a parallel fashion, a lot of numerical quadratures that at the end of the day use a common set of data for all the computations (fairly big arrays of roots and weights occupying about 25 KB of memory). The Gauss-Legendre quadrature method is simple enough to start with. I want to make the roots and weights available to all the threads in the device through the declaration __device__ double *d_droot, *d_dweight. But I am missing something, because I have to pass the pointers to the arrays explicitly to make my kernel work well. How can I do it properly? Even more, aiming to have more free memory available on the device, is it possible to burn the roots and weights into some constant portion of the device's memory?
The code is attached
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
__device__ double *d_droot, *d_dweight;
__device__ __host__
/* Integrand for the Gauss-Legendre quadrature: f(alpha, x) = exp(alpha*x).
Compiled for both host and device so it can be called from kernels. */
double f(double alpha,double x)
{
/*function to be integrated via gauss-legendre quadrature. */
return exp(alpha*x);
}
__global__
/* Computes n identical 5-point Gauss-Legendre quadratures of f over [a,b],
one quadrature per thread; roots and weights are passed explicitly as
kernel arguments. */
void lege_inte2(int n, double alpha, double a, double b, double *lroots, double *weight, double *result)
{
/*
Parameters:
n: Total number of quadratures
alpha: parameter forwarded to the integrand f
a: Lower integration limit
b: Upper integration limit
lroots[]: roots for the quadrature (5 nodes; loop bound below is hard-coded)
weight[]: weights for the quadrature
result[]: allocate the results for N quadratures.
*/
/* c1/c2 implement the affine map from the reference interval [-1,1] to [a,b];
note: `sum` is declared but never used. */
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x; /* flat global thread index */
if (i < n) /* bounds guard: grid may overshoot n */
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
result[i] += weight[dummy] * f(alpha,c1 * lroots[dummy] + c2)*c1;
}
}
__global__
/* BROKEN kernel (the failure the question asks about; see the answer below):
the `extern __shared__` declarations inside the body re-declare
d_droot/d_dweight as dynamic shared-memory symbols, shadowing the file-scope
__device__ pointers. No dynamic shared-memory size is passed as the 3rd
kernel-launch parameter, so these pointers are never initialized and the
reads below access garbage. */
void lege_inte2_shared(int n,double alpha, double a, double b, double *result)
{
extern __shared__ double *d_droot;
extern __shared__ double *d_dweight;
/*
Parameters:
n: Total number of quadratures
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (intended source)
d_dweight[]: weights for the quadrature (intended source)
result[]: allocate the results for N quadratures.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0; /* sum is unused */
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
{
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
printf(" Vale: %f \n", d_dweight[dummy]); /* debug print only */
}
}
}
int main(void)
{
int N = 1<<23; /* number of independent quadratures (8M) */
int N_nodes = 5; /* nodes per quadrature */
double *droot, *dweight, *dresult, *d_dresult;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures!*/
/*double version in device*/
/* ERROR (explained in the answer below): d_droot/d_dweight are file-scope
__device__ variables; taking their address in host code and passing it
to cudaMalloc is invalid. */
cudaMalloc(&d_droot, N_nodes*sizeof(double));
cudaMalloc(&d_dweight, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights (5-point Gauss-Legendre nodes on [-1,1])*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
/* ERROR (explained in the answer below): cudaMemcpy is not the right API
for a __device__ symbol; cudaMemcpyToSymbol is required. */
cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
// Launch one 5-point quadrature per thread, 256 threads per block
lege_inte2<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_droot, d_dweight, d_dresult); /*This kernel works OK*/
//lege_inte2_shared<<<(N+255)/256, 256>>>(N, -3.0, 3.0, d_dresult); /*why this one does not work? */
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
double maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = max(maxError, abs(dresult[i]-20.03574985)); /* reference value for the integral over [-3,3] */
printf("Max error: %f in %i quadratures \n", maxError, N);
printf("integral: %f \n" ,dresult[0]);
/* NOTE(review): dresult was allocated with malloc, so cudaFree is the wrong
deallocator here (should be free(dresult)); d_dresult is never freed. */
cudaFree(dresult);
cudaFree(d_droot);
cudaFree(d_dweight);
}
and a makefile to compile it:
objects = main.o
all: $(objects)
nvcc -Xcompiler -std=c99 -arch=sm_20 $(objects) -o gauss
%.o: %.cpp
nvcc -x cu -arch=sm_20 -I. -dc $< -o $@
clean:
rm -f *.o gauss
Thanks in advance for any suggestion
回答1:
Your handling of d_droot
and d_dweight
has a variety of errors. When I compile your code, I get various warnings like this:
t640.cu(86): warning: address of a __shared__ variable "d_droot" cannot be directly taken in a host function
t640.cu(87): warning: address of a __shared__ variable "d_dweight" cannot be directly taken in a host function
t640.cu(108): warning: a __shared__ variable "d_droot" cannot be directly read in a host function
t640.cu(109): warning: a __shared__ variable "d_dweight" cannot be directly read in a host function
which should not be ignored.
These declarations:
__device__ double *d_droot, *d_dweight;
do not define
__shared__
variables, so these lines:extern __shared__ double *d_droot; extern __shared__ double *d_dweight;
make no sense. Furthermore, if you did want these to be dynamically allocated shared variables (what
extern __shared__
is used for), you would need to pass the allocation size as the 3rd kernel launch parameter, which you are not doing. These statements are incorrect:
cudaMalloc(&d_droot, N_nodes*sizeof(double)); cudaMalloc(&d_dweight, N_nodes*sizeof(double));
You cannot take the address of a
__device__
variable in host code, and we don't usecudaMalloc
to allocate a__device__
variable anyway; it is a static allocation by definition. I recommend doing proper CUDA error checking. As a quick test, you can also run your code with
cuda-memcheck
. Either method would indicate the presence of a runtime error in your code (albeit not the crux of any issue). These statements are also incorrect:
cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy
is not the correct API to use with a__device__
variable. UsecudaMemcpyToSymbol
instead.
The following code has these various usage errors fixed, will compile cleanly, and seems to run correctly. It demonstrates that it is not necessary to pass a __device__
variable as a kernel parameter:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
__device__ double *d_droot, *d_dweight;
__device__ __host__
/* Integrand for the Gauss-Legendre quadrature: f(alpha, x) = exp(alpha*x).
Compiled for both host and device so it can be called from kernels. */
double f(double alpha,double x)
{
/*function to be integrated via gauss-legendre quadrature. */
return exp(alpha*x);
}
__global__
/* Computes n identical 5-point Gauss-Legendre quadratures of f over [a,b],
one per thread. Roots and weights are read through the file-scope
__device__ pointers d_droot/d_dweight, which must be set from the host
with cudaMemcpyToSymbol before launch. */
void lege_inte2(int n, double alpha, double a, double b, double *result)
{
/*
Parameters:
n: Total number of quadratures
alpha: parameter forwarded to the integrand f
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (file-scope __device__ pointer)
d_dweight[]: weights for the quadrature (file-scope __device__ pointer)
result[]: allocate the results for N quadratures.
*/
/* c1/c2 map the reference interval [-1,1] onto [a,b]; `sum` is unused. */
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x; /* flat global thread index */
if (i < n) /* bounds guard: grid may overshoot n */
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
}
}
__global__
/* Same computation as lege_inte2 (the bogus `extern __shared__`
re-declarations have been removed, so the file-scope __device__ pointers
are used directly), plus a per-iteration debug printf. */
void lege_inte2_shared(int n,double alpha, double a, double b, double *result)
{
/*
Parameters:
n: Total number of quadratures
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (file-scope __device__ pointer)
d_dweight[]: weights for the quadrature (file-scope __device__ pointer)
result[]: allocate the results for N quadratures.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0; /* sum is unused */
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
{
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
printf(" Vale: %f \n", d_dweight[dummy]); /* debug print only; very slow at this scale */
}
}
}
int main(void)
{
int N = 1<<23; /* number of independent quadratures (8M) */
int N_nodes = 5; /* nodes per quadrature */
double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures!*/
/*double version in device*/
/* Ordinary device allocations; their addresses will be published to the
__device__ pointer symbols below. */
cudaMalloc(&d_droot_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights (5-point Gauss-Legendre nodes on [-1,1])*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
/* Publish the device pointers into the __device__ symbols: only the
pointer VALUES (sizeof(double *)) are copied, not the array contents. */
cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *));
cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *));
// Launch one 5-point quadrature per thread, 256 threads per block
lege_inte2<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_dresult); /*This kernel works OK*/
//lege_inte2_shared<<<(N+255)/256, 256>>>(N, -3.0, 3.0, d_dresult); /*why this one does not work? */
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
double maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = max(maxError, abs(dresult[i]-20.03574985)); /* reference value for the integral over [-3,3] */
printf("Max error: %f in %i quadratures \n", maxError, N);
printf("integral: %f \n" ,dresult[0]);
/* NOTE(review): host buffers droot/dweight/dresult are never free()d. */
cudaFree(d_dresult);
cudaFree(d_droot_temp);
cudaFree(d_dweight_temp);
}
(I can't vouch for the results.)
Now, regarding this question:
Even more, aiming to have available more free memory on the device, is it possible to burn the roots and weights to some constant portion of the memory of the device?
Since your accesses of d_dweight
and d_droot
appear to be uniform:
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
Then it may be useful to define these as __constant__
memory space variables. Constant memory access is optimum when every thread in a warp is requesting the same value (same location) in constant memory. However, __constant__
memory cannot be allocated dynamically, and it makes no sense to store a pointer (only) in constant memory; this doesn't provide any of the benefits of the constant cache mechanism.
Therefore, the following further modification to your code demonstrates how to store these values in __constant__
memory, but it requires a static allocation. Furthermore, this doesn't really "save" any device memory. Whether you allocate dynamically using cudaMalloc
, statically with a __device__
variable, or via a __constant__
variable definition (also a static allocation), all of these methods require global memory backing store in device memory (on-board DRAM).
Code demonstrating possible constant memory usage:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define N_nodes 5
__constant__ double d_droot[N_nodes], d_dweight[N_nodes];
__device__ __host__
/* Integrand for the Gauss-Legendre quadrature: f(alpha, x) = exp(alpha*x).
Compiled for both host and device so it can be called from kernels. */
double f(double alpha,double x)
{
/*function to be integrated via gauss-legendre quadrature. */
return exp(alpha*x);
}
__global__
/* Computes n identical 5-point Gauss-Legendre quadratures of f over [a,b],
one per thread. Roots and weights are read from the __constant__ arrays
d_droot/d_dweight; since every thread reads the same element each
iteration, the accesses are uniform across the warp, the favorable case
for the constant cache (per the discussion above). */
void lege_inte2(int n, double alpha, double a, double b, double *result)
{
/*
Parameters:
n: Total number of quadratures
alpha: parameter forwarded to the integrand f
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (__constant__ array)
d_dweight[]: weights for the quadrature (__constant__ array)
result[]: allocate the results for N quadratures.
*/
/* c1/c2 map the reference interval [-1,1] onto [a,b]; `sum` is unused. */
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x; /* flat global thread index */
if (i < n) /* bounds guard: grid may overshoot n */
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
}
}
__global__
/* Same computation as lege_inte2, reading the __constant__ arrays directly,
plus a per-iteration debug printf. */
void lege_inte2_shared(int n,double alpha, double a, double b, double *result)
{
/*
Parameters:
n: Total number of quadratures
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (__constant__ array)
d_dweight[]: weights for the quadrature (__constant__ array)
result[]: allocate the results for N quadratures.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0; /* sum is unused */
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
{
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
printf(" Vale: %f \n", d_dweight[dummy]); /* debug print only; very slow at this scale */
}
}
}
int main(void)
{
int N = 1<<23; /* number of independent quadratures (8M) */
// int N_nodes = 5; /* now a compile-time #define: __constant__ arrays need static sizes */
double *droot, *dweight, *dresult, *d_dresult;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures!*/
/*double version in device*/
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights (5-point Gauss-Legendre nodes on [-1,1])*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
/* Copy the array CONTENTS straight into the __constant__ symbols; no
cudaMalloc is needed because __constant__ storage is statically allocated. */
cudaMemcpyToSymbol(d_droot, droot, N_nodes*sizeof(double));
cudaMemcpyToSymbol(d_dweight, dweight, N_nodes*sizeof(double));
// Launch one 5-point quadrature per thread, 256 threads per block
lege_inte2<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_dresult); /*This kernel works OK*/
//lege_inte2_shared<<<(N+255)/256, 256>>>(N, -3.0, 3.0, d_dresult); /*why this one does not work? */
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
double maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = max(maxError, abs(dresult[i]-20.03574985)); /* reference value for the integral over [-3,3] */
printf("Max error: %f in %i quadratures \n", maxError, N);
printf("integral: %f \n" ,dresult[0]);
/* NOTE(review): host buffers droot/dweight/dresult are never free()d. */
cudaFree(d_dresult);
}
来源:https://stackoverflow.com/questions/28821743/sharing-roots-and-weights-for-many-gauss-legendre-quadrature-in-gpus