问题
I intend to compute, in a parallel fashion, a lot of numerical quadratures that at the end of the day use a common set of data for all the computations (fairly big arrays of roots and weights occupying about 25 KB of memory). The Gauss-Legendre quadrature method is simple enough to start with. I want to make the roots and weights available to all the threads in the device through the declaration __device__ double *d_droot, *d_dweight. But I am missing something, because I have to pass the pointers to the arrays explicitly to make my kernel work well. How can I do it properly? Even more, aiming to have more free memory available on the device, is it possible to burn the roots and weights into some constant portion of the device's memory?
The code is attached
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
__device__ double *d_droot, *d_dweight;
__device__ __host__
/* Integrand for the Gauss-Legendre quadrature: f(alpha, x) = exp(alpha*x).
Compiled for both host and device so it can be called from kernels. */
double f(double alpha,double x)
{
/*function to be integrated via gauss-legendre quadrature. */
return exp(alpha*x);
}
__global__
/* Computes n identical 5-point Gauss-Legendre quadratures of f over [a,b],
one quadrature per thread; roots and weights are passed explicitly as
kernel arguments. */
void lege_inte2(int n, double alpha, double a, double b, double *lroots, double *weight, double *result)
{
/*
Parameters:
n: Total number of quadratures
alpha: parameter forwarded to the integrand f
a: Lower integration limit
b: Upper integration limit
lroots[]: roots for the quadrature (5 nodes; loop bound below is hard-coded)
weight[]: weights for the quadrature
result[]: allocate the results for N quadratures.
*/
/* c1/c2 implement the affine map from the reference interval [-1,1] to [a,b];
note: `sum` is declared but never used. */
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x; /* flat global thread index */
if (i < n) /* bounds guard: grid may overshoot n */
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
result[i] += weight[dummy] * f(alpha,c1 * lroots[dummy] + c2)*c1;
}
}
__global__
/* BROKEN kernel (the failure the question asks about; see the answer below):
the `extern __shared__` declarations inside the body re-declare
d_droot/d_dweight as dynamic shared-memory symbols, shadowing the file-scope
__device__ pointers. No dynamic shared-memory size is passed as the 3rd
kernel-launch parameter, so these pointers are never initialized and the
reads below access garbage. */
void lege_inte2_shared(int n,double alpha, double a, double b, double *result)
{
extern __shared__ double *d_droot;
extern __shared__ double *d_dweight;
/*
Parameters:
n: Total number of quadratures
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (intended source)
d_dweight[]: weights for the quadrature (intended source)
result[]: allocate the results for N quadratures.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0; /* sum is unused */
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
{
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
printf(" Vale: %f \n", d_dweight[dummy]); /* debug print only */
}
}
}
int main(void)
{
int N = 1<<23; /* number of independent quadratures (8M) */
int N_nodes = 5; /* nodes per quadrature */
double *droot, *dweight, *dresult, *d_dresult;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures!*/
/*double version in device*/
/* ERROR (explained in the answer below): d_droot/d_dweight are file-scope
__device__ variables; taking their address in host code and passing it
to cudaMalloc is invalid. */
cudaMalloc(&d_droot, N_nodes*sizeof(double));
cudaMalloc(&d_dweight, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights (5-point Gauss-Legendre nodes on [-1,1])*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
/* ERROR (explained in the answer below): cudaMemcpy is not the right API
for a __device__ symbol; cudaMemcpyToSymbol is required. */
cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
// Launch one 5-point quadrature per thread, 256 threads per block
lege_inte2<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_droot, d_dweight, d_dresult); /*This kernel works OK*/
//lege_inte2_shared<<<(N+255)/256, 256>>>(N, -3.0, 3.0, d_dresult); /*why this one does not work? */
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
double maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = max(maxError, abs(dresult[i]-20.03574985)); /* reference value for the integral over [-3,3] */
printf("Max error: %f in %i quadratures \n", maxError, N);
printf("integral: %f \n" ,dresult[0]);
/* NOTE(review): dresult was allocated with malloc, so cudaFree is the wrong
deallocator here (should be free(dresult)); d_dresult is never freed. */
cudaFree(dresult);
cudaFree(d_droot);
cudaFree(d_dweight);
}
and a makefile to compile it:
objects = main.o
all: $(objects)
nvcc -Xcompiler -std=c99 -arch=sm_20 $(objects) -o gauss
%.o: %.cpp
nvcc -x cu -arch=sm_20 -I. -dc $< -o $@
clean:
rm -f *.o gauss
Thanks in advance for any suggestion
回答1:
Your handling of d_droot
and d_dweight
has a variety of errors. When I compile your code, I get various warnings like this:
t640.cu(86): warning: address of a __shared__ variable "d_droot" cannot be directly taken in a host function
t640.cu(87): warning: address of a __shared__ variable "d_dweight" cannot be directly taken in a host function
t640.cu(108): warning: a __shared__ variable "d_droot" cannot be directly read in a host function
t640.cu(109): warning: a __shared__ variable "d_dweight" cannot be directly read in a host function
which should not be ignored.
These declarations:
__device__ double *d_droot, *d_dweight;
do not define
__shared__
variables, so these lines:extern __shared__ double *d_droot; extern __shared__ double *d_dweight;
make no sense. Furthermore, if you did want these to be dynamically allocated shared variables (what
extern __shared__
is used for), you would need to pass the allocation size as the 3rd kernel launch parameter, which you are not doing. These statements are incorrect:
cudaMalloc(&d_droot, N_nodes*sizeof(double)); cudaMalloc(&d_dweight, N_nodes*sizeof(double));
You cannot take the address of a
__device__
variable in host code, and we don't usecudaMalloc
to allocate a__device__
variable anyway; it is a static allocation by definition. I recommend doing proper CUDA error checking. As a quick test, you can also run your code with
cuda-memcheck
. Either method would indicate the presence of a runtime error in your code (albeit not the crux of any issue). These statements are also incorrect:
cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice); cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy
is not the correct API to use with a__device__
variable. UsecudaMemcpyToSymbol
instead.
The following code has these various usage errors fixed, will compile cleanly, and seems to run correctly. It demonstrates that it is not necessary to pass a __device__
variable as a kernel parameter:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
__device__ double *d_droot, *d_dweight;
__device__ __host__
/* Integrand for the Gauss-Legendre quadrature: f(alpha, x) = exp(alpha*x).
Compiled for both host and device so it can be called from kernels. */
double f(double alpha,double x)
{
/*function to be integrated via gauss-legendre quadrature. */
return exp(alpha*x);
}
__global__
/* Computes n identical 5-point Gauss-Legendre quadratures of f over [a,b],
one per thread. Roots and weights are read through the file-scope
__device__ pointers d_droot/d_dweight, which must be set from the host
with cudaMemcpyToSymbol before launch. */
void lege_inte2(int n, double alpha, double a, double b, double *result)
{
/*
Parameters:
n: Total number of quadratures
alpha: parameter forwarded to the integrand f
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (file-scope __device__ pointer)
d_dweight[]: weights for the quadrature (file-scope __device__ pointer)
result[]: allocate the results for N quadratures.
*/
/* c1/c2 map the reference interval [-1,1] onto [a,b]; `sum` is unused. */
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x; /* flat global thread index */
if (i < n) /* bounds guard: grid may overshoot n */
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
}
}
__global__
/* Same computation as lege_inte2 (the bogus `extern __shared__`
re-declarations have been removed, so the file-scope __device__ pointers
are used directly), plus a per-iteration debug printf. */
void lege_inte2_shared(int n,double alpha, double a, double b, double *result)
{
/*
Parameters:
n: Total number of quadratures
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (file-scope __device__ pointer)
d_dweight[]: weights for the quadrature (file-scope __device__ pointer)
result[]: allocate the results for N quadratures.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0; /* sum is unused */
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
{
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
printf(" Vale: %f \n", d_dweight[dummy]); /* debug print only; very slow at this scale */
}
}
}
int main(void)
{
int N = 1<<23; /* number of independent quadratures (8M) */
int N_nodes = 5; /* nodes per quadrature */
double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures!*/
/*double version in device*/
/* Ordinary device allocations; their addresses will be published to the
__device__ pointer symbols below. */
cudaMalloc(&d_droot_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights (5-point Gauss-Legendre nodes on [-1,1])*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
/* Publish the device pointers into the __device__ symbols: only the
pointer VALUES (sizeof(double *)) are copied, not the array contents. */
cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *));
cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *));
// Launch one 5-point quadrature per thread, 256 threads per block
lege_inte2<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_dresult); /*This kernel works OK*/
//lege_inte2_shared<<<(N+255)/256, 256>>>(N, -3.0, 3.0, d_dresult); /*why this one does not work? */
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
double maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = max(maxError, abs(dresult[i]-20.03574985)); /* reference value for the integral over [-3,3] */
printf("Max error: %f in %i quadratures \n", maxError, N);
printf("integral: %f \n" ,dresult[0]);
/* NOTE(review): host buffers droot/dweight/dresult are never free()d. */
cudaFree(d_dresult);
cudaFree(d_droot_temp);
cudaFree(d_dweight_temp);
}
(I can't vouch for the results.)
Now, regarding this question:
Even more, aiming to have available more free memory on the device, is it possible to burn the roots and weights to some constant portion of the memory of the device?
Since your accesses of d_dweight
and d_droot
appear to be uniform:
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
Then it may be useful to define these as __constant__
memory space variables. Constant memory access is optimum when every thread in a warp is requesting the same value (same location) in constant memory. However, __constant__
memory cannot be allocated dynamically, and it makes no sense to store a pointer (only) in constant memory; this doesn't provide any of the benefits of the constant cache mechanism.
Therefore, the following further modification to your code demonstrates how to store these values in __constant__
memory, but it requires a static allocation. Furthermore, this doesn't really "save" any device memory. Whether you allocate dynamically using cudaMalloc
, statically with a __device__
variable, or via a __constant__
variable definition (also a static allocation), all of these methods require global memory backing store in device memory (on-board DRAM).
Code demonstrating possible constant memory usage:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#define N_nodes 5
__constant__ double d_droot[N_nodes], d_dweight[N_nodes];
__device__ __host__
/* Integrand for the Gauss-Legendre quadrature: f(alpha, x) = exp(alpha*x).
Compiled for both host and device so it can be called from kernels. */
double f(double alpha,double x)
{
/*function to be integrated via gauss-legendre quadrature. */
return exp(alpha*x);
}
__global__
/* Computes n identical 5-point Gauss-Legendre quadratures of f over [a,b],
one per thread. Roots and weights are read from the __constant__ arrays
d_droot/d_dweight; since every thread reads the same element each
iteration, the accesses are uniform across the warp, the favorable case
for the constant cache (per the discussion above). */
void lege_inte2(int n, double alpha, double a, double b, double *result)
{
/*
Parameters:
n: Total number of quadratures
alpha: parameter forwarded to the integrand f
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (__constant__ array)
d_dweight[]: weights for the quadrature (__constant__ array)
result[]: allocate the results for N quadratures.
*/
/* c1/c2 map the reference interval [-1,1] onto [a,b]; `sum` is unused. */
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x; /* flat global thread index */
if (i < n) /* bounds guard: grid may overshoot n */
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
}
}
__global__
/* Same computation as lege_inte2, reading the __constant__ arrays directly,
plus a per-iteration debug printf. */
void lege_inte2_shared(int n,double alpha, double a, double b, double *result)
{
/*
Parameters:
n: Total number of quadratures
a: Lower integration limit
b: Upper integration limit
d_droot[]: roots for the quadrature (__constant__ array)
d_dweight[]: weights for the quadrature (__constant__ array)
result[]: allocate the results for N quadratures.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0; /* sum is unused */
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
{
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
printf(" Vale: %f \n", d_dweight[dummy]); /* debug print only; very slow at this scale */
}
}
}
int main(void)
{
int N = 1<<23; /* number of independent quadratures (8M) */
// int N_nodes = 5; /* now a compile-time #define: __constant__ arrays need static sizes */
double *droot, *dweight, *dresult, *d_dresult;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will receive the results of N quadratures!*/
/*double version in device*/
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights (5-point Gauss-Legendre nodes on [-1,1])*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
/* Copy the array CONTENTS straight into the __constant__ symbols; no
cudaMalloc is needed because __constant__ storage is statically allocated. */
cudaMemcpyToSymbol(d_droot, droot, N_nodes*sizeof(double));
cudaMemcpyToSymbol(d_dweight, dweight, N_nodes*sizeof(double));
// Launch one 5-point quadrature per thread, 256 threads per block
lege_inte2<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_dresult); /*This kernel works OK*/
//lege_inte2_shared<<<(N+255)/256, 256>>>(N, -3.0, 3.0, d_dresult); /*why this one does not work? */
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
double maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = max(maxError, abs(dresult[i]-20.03574985)); /* reference value for the integral over [-3,3] */
printf("Max error: %f in %i quadratures \n", maxError, N);
printf("integral: %f \n" ,dresult[0]);
/* NOTE(review): host buffers droot/dweight/dresult are never free()d. */
cudaFree(d_dresult);
}
来源:https://stackoverflow.com/questions/28821743/sharing-roots-and-weights-for-many-gauss-legendre-quadrature-in-gpus