CUDA: Copy dynamically created array of function pointers on the CPU to GPU memory

跟風遠走 提交于 2019-11-29 18:14:55

If you can use C++11 (supported since CUDA 7), you could use the following to auto-generate the function table:

// Builds a function table at compile time from the template parameter pack.
// Each thread whose threadIdx.x is smaller than the number of functions calls
// the table entry at that index with (a, b); all other threads do nothing.
template <fptr_t... Functions>
__global__ void kernel(int a, int b)
{
  constexpr fptr_t table[] = { Functions... };
  constexpr auto num_f = sizeof...(Functions);

  // Threads beyond the end of the table have no function to call.
  if (threadIdx.x >= num_f)
    return;

  table[threadIdx.x](a,b);
}

You would then call this kernel using

// One block of three threads: thread 0 calls Add, thread 1 Subtract, thread 2 Multiply.
kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);

Inspired by m.s.'s answer, I chose to pass the function pointer as a template parameter (this was in fact the key to solving my problem) and discovered that it is indeed possible to fill a __device__ array of function pointers, dev_fList, iteratively from the main() function without the help of static function pointers. Moreover, C++11 support is not even needed!

Here is a working example on a __device__ array in global memory. I have not tried its constant memory counterpart yet, but once a global memory array has been satisfactorily created, my guess is that a cudaMemcpyToSymbol(..., cudaMemcpyDeviceToDevice) should do the trick.

The kernel kernel() obtains the device address of the function f that was passed as a template argument, stores it in the function pointer dev_f, and writes that pointer into dev_fList. Since the list is filled iteratively from the CPU, only one thread (thread 0) is involved in this kernel, which is launched with configuration <<<1,1>>>. The static variable count_f keeps track of the next index in dev_fList.

common.h:

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

// gpuErrchk(call): evaluates a CUDA runtime call and hands its status to
// gpuAssert together with the file and line of the call site.
// The do { } while (0) wrapper makes the macro expand to exactly one statement,
// so `if (cond) gpuErrchk(x); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA status on stderr and, unless abort is false, terminates
// the process with the error code as exit status.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (for the message)
//   line  - source line of the call site (for the message)
//   abort - when true (default) the process exits on any error
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess) return;

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

// fptr_t: pointer to a function that returns void and takes two int references
typedef void (*fptr_t)(int&, int&);

// Sample device functions matching fptr_t. Each one only prints its two
// operands and the result of the operation; the arguments are not modified.
__device__ void Add(int &a, int &b)
{
  printf("Add... %i + %i = %i\n", a, b, a+b);
}

__device__ void Subtract(int &a, int &b)
{
  printf("Subtract... %i - %i = %i\n", a, b, a-b);
}

__device__ void Multiply(int &a, int &b)
{
  printf("Multiply... %i * %i = %i\n", a, b, a*b);
}

// Table of device function pointers with num_functions slots.
// Declared __device__, so in this example it lives in global memory space,
// not in constant memory.
__device__ fptr_t dev_fList[num_functions];

#endif

main.cu:

#include "common.h"

// Next free index in dev_fList[]; incremented by addFunc<>() after each
// registration kernel has completed, so it also counts the registered functions.
static int count_f = 0;

// Kernel that stores the device address of function f in dev_fList[idx] and
// then calls it through the table with (a, b) to confirm the entry is usable.
//   f   - device function to register (template argument)
//   a,b - sample operands forwarded to the self-check call
//   idx - slot in dev_fList to populate; must be in [0, num_functions)
// Every thread of the launch performs the same store and call; addFunc<>()
// launches this kernel with a <<<1,1>>> configuration.
template<fptr_t f>
__global__ void kernel(int a, int b, int idx) {
  // Refuse slots outside the statically sized table instead of writing out of bounds.
  if (idx < 0 || idx >= num_functions) return;

  dev_fList[idx] = f;   // Taking f's address in device code yields a device function pointer
  dev_fList[idx](a,b);  // Make sure that the array was populated correctly
}

// Registers device function f in the next free slot of dev_fList.
// Prints an error and exits if all num_functions slots are already taken.
// Otherwise launches kernel<f> with a single thread, checks both the launch
// and the execution for errors, and advances count_f to the next slot.
template<fptr_t f>
void addFunc(const int &a, const int &b) {
  const bool table_full = !(count_f < num_functions);
  if (table_full) {
    std::cout << "Error: not enough memory statically allocated on device!\n";
    exit(EXIT_FAILURE);
  }

  const int slot = count_f;
  kernel<f><<<1,1>>>(a, b, slot);
  gpuErrchk(cudaGetLastError());       // launch configuration errors
  gpuErrchk(cudaDeviceSynchronize());  // errors raised while the kernel ran

  count_f = slot + 1;
}

// Registers the three sample functions one after another; each registration
// also calls the function once on the device with the operands below.
int main() {
  int a = 12;
  int b = 15;

  addFunc<Add>(a, b);
  addFunc<Subtract>(a, b);
  addFunc<Multiply>(a, b);

  return 0;
}

Edit: Added copy of the array of function pointers to constant memory

For what it's worth, here is how to copy our dev_fList array to constant memory:

In common.h:

// Constant-memory counterpart of dev_fList, with the same number of slots.
__constant__ fptr_t cst_fList[num_functions];

// Calls the functions stored in cst_fList: thread i calls entry i with (a, b).
//   a,b - operands forwarded to each function
//   idx - number of populated entries at the front of cst_fList
// The count is clamped to the table capacity and compared as a signed value,
// so a negative or oversized idx cannot lead to an out-of-bounds table read.
__global__ void cst_test(int a, int b, int idx) {
   const int count = (idx < num_functions) ? idx : num_functions;
   const int tid = static_cast<int>(threadIdx.x);
   if (tid < count) cst_fList[tid](a,b);
}

In main.cu main() function, after all desired functions have been added:

  // Host staging buffer sized for the whole table. A single pointer variable is
  // too small: count_f * sizeof(fptr_t) bytes are copied, one entry per function.
  fptr_t temp[num_functions];

  // Device global memory -> host, then host -> device constant memory.
  // The symbols are passed as the arrays themselves, not as their first element.
  gpuErrchk( cudaMemcpyFromSymbol(temp, dev_fList, count_f * sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyToSymbol(cst_fList, temp, count_f * sizeof(fptr_t)) );

  // One thread per registered function; each calls its entry from constant memory.
  cst_test<<<1,count_f>>>(a,b, count_f);
  gpuErrchk(cudaGetLastError());
  gpuErrchk(cudaDeviceSynchronize());

It may look ugly as I understand that memory is transferred to the host via temp and then back to the device; more elegant suggestions are welcome.

It is impossible to use dynamically created CUDA device function pointers (at least not without a crash or UB). The template-based solutions work at compile time (not dynamically). The CUDA device function pointer approaches you see everywhere need device symbols in global scope. This means that for every function a device function pointer must already be declared. This also means you cannot start from normal C function pointers that are, for example, set at runtime. In summary, using CUDA device function pointers is questionable. Template-based approaches look user-friendly, but are by definition not dynamic.

Example showing structure with function pointers:

This example shows a structure containing some function pointers. In normal C++ code, you can set and change the function pointers while the program is running (dynamically). With CUDA the example below is impossible, because the function pointers in the struct are not valid device symbols. This means they cannot be used with "cudaMemcpyFromSymbol". To circumvent this, either the original function (the target of the function pointers) or global CUDA device function pointers must be declared. Neither approach is dynamic.

This is dynamic assignment:

// pDistanceFu: pointer to a function taking two floats and returning float.
// pDecayFu:    pointer to a function taking three floats and returning float.
typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu)    (float, float, float);

// Bundle of three function pointers: one distance function and two decay functions.
// In plain C++ these members can be set and reset at any time during execution.
struct DistFunction {
  /*__host__ __device__*/ pDistanceFu distance; // uncomment for NVCC ..
  /*__host__ __device__*/ pDecayFu rad_decay;
  /*__host__ __device__*/ pDecayFu lrate_decay;
};

// Illustration of run-time reassignment on the host: one struct's pointer is
// copied into another. (Fragment only; bar is not initialized in this snippet.)
DistFunction foo, bar;
foo.distance = bar.distance;
// ..

This is how it should be with CUDA, but it will fail, because there is no valid device symbol :(

// Host-side variables used by DeviceAssign() as destinations of
// cudaMemcpyFromSymbol, one per member of DistFunction.
pDistanceFu hDistance; 
pDecayFu hRadDay; 
pDecayFu hLRateDecay; 

// Attempts to replace each function pointer in dist with the value read back
// through cudaMemcpyFromSymbol. This is the article's counter-example: the
// struct members passed as the "symbol" argument are ordinary host variables,
// not device symbols, so the copies are expected to fail.
// NOTE(review): the return codes of cudaMemcpyFromSymbol are ignored here, so
// dist is overwritten with whatever the h* variables contain afterwards.
void DeviceAssign(DistFunction &dist) {      
  cudaMemcpyFromSymbol(&hDistance, dist.distance, sizeof(pDistanceFu) );
  cudaMemcpyFromSymbol(&hRadDay, dist.rad_decay, sizeof(pDecayFu) );
  cudaMemcpyFromSymbol(&hLRateDecay, dist.lrate_decay, sizeof(pDecayFu) );

  // Write the (intended) device addresses back into the caller's struct.
  dist.distance = hDistance;
  dist.rad_decay = hRadDay;
  dist.lrate_decay = hLRateDecay;
} 

Here is the classical way, but notice that it is not dynamic anymore, because the device symbol must refer to the function itself, not to a pointer which may change during run-time.

// .. and this would work
// Computes sigma0 * exp(-T / lambda), adds 0.5 and takes the floor of the sum.
// Compiled for both host and device when built with NVCC.
#ifdef __CUDACC__
  __host__ __device__
#endif
inline float fcn_rad_decay (float sigma0, float T, float lambda) {
  const float exponent = -T/lambda;
  return std::floor(sigma0*exp(exponent) + 0.5f);
}

// Device symbol holding the device address of fcn_rad_decay. It is declared as
// pDecayFu because fcn_rad_decay takes three floats; pDistanceFu (two floats)
// does not match its signature. The initializer must name the function itself;
// a host-side function pointer variable cannot be used here.
__device__ pDecayFu pFoo = fcn_rad_decay;

// Reads the device address of fcn_rad_decay from the device symbol pFoo and
// stores it in dist.lrate_decay.
// cudaMemcpyFromSymbol needs a __device__/__constant__ variable as its symbol
// argument, so the pointer variable pFoo is passed rather than the address of
// the function. If the copy fails, dist is left unchanged.
void DeviceAssign2(DistFunction &dist) {
  if (cudaMemcpyFromSymbol(&hLRateDecay, pFoo, sizeof(pDecayFu)) != cudaSuccess) {
    return; // do not install a stale or invalid pointer
  }
  // ..

  dist.lrate_decay = hLRateDecay;
  // ..
}
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!