CUDA: Copy dynamically created array of function pointers on the CPU to GPU memory

I would like to create a list of function pointers dynamically on the CPU (with some sort of push_back() method called from main()) and copy it to a GPU __constant__ or __device__ array, without needing to resort to static __device__ function pointers. I believe this question is related to my problem; however, my goal is to create the __host__ function pointer array iteratively and then copy it to the __constant__ function pointer array instead of initialising the latter on declaration.

A working code example with static function pointers (as seen here or here) would be:

common.h:

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

// Wraps a CUDA runtime call and hands its cudaError_t, together with the call
// site, to gpuAssert(). The do/while(0) wrapper makes the expansion a single
// statement, so `if (x) gpuErrchk(call); else ...` parses as written.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call.
//   code  - status returned by the CUDA runtime call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default) the process exits with `code` as status
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// Sample device functions with the fptr_t signature void(int&, int&).
// Each one prints its two operands and the result of the operation.
__device__ void Add(int &a, int &b)
{
  const int sum = a + b;
  printf("Add... %i + %i = %i\n", a, b, sum);
}

__device__ void Subtract(int &a, int &b)
{
  const int difference = a - b;
  printf("Subtract... %i - %i = %i\n", a, b, difference);
}

__device__ void Multiply(int &a, int &b)
{
  const int product = a * b;
  printf("Multiply... %i * %i = %i\n", a, b, product);
}

// List of function pointers in device memory
__constant__ fptr_t constant_fList[num_functions];

// Dispatch kernel launched from main(): thread i calls entry i of
// constant_fList with (a, b). Threads whose index is num_functions or
// larger return without doing anything.
__global__ void kernel(int a, int b) {
  const unsigned int slot = threadIdx.x;
  if (slot >= num_functions) return;
  constant_fList[slot](a, b);
}

#endif

main.cu:

#include "common.h"

// Static device function pointers
__device__ fptr_t p_Add = Add;
__device__ fptr_t p_Sub = Subtract;
__device__ fptr_t p_Mul = Multiply;

// Fill constant_fList: read the value of each static __device__ pointer
// (p_Add, p_Sub, p_Mul) back into a host staging array, then upload the
// whole array to constant memory in a single copy.
void loadList_staticpointers() {
  fptr_t staged[num_functions];
  gpuErrchk( cudaMemcpyFromSymbol(staged + 0, p_Add, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyFromSymbol(staged + 1, p_Sub, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyFromSymbol(staged + 2, p_Mul, sizeof(fptr_t)) );
  // sizeof(staged) == num_functions * sizeof(fptr_t)
  gpuErrchk( cudaMemcpyToSymbol(constant_fList, staged, sizeof(staged)) );
}

int main() {
  const int a = 12;
  const int b = 15;

  // Populate the constant-memory table before the kernel reads it
  loadList_staticpointers();

  // One block of three threads: one thread per table entry
  kernel<<<1,3>>>(a, b);
  gpuErrchk(cudaGetLastError());       // launch errors
  gpuErrchk(cudaDeviceSynchronize());  // errors raised while the kernel ran

  return 0;
}

Specs: GeForce GTX 670, compiled for -arch=sm_30, CUDA 6.5, Ubuntu 14.04

I wish to avoid the use of static device function pointers, as appending each function would require code maintenance on the user side - declaration of a new static pointer like p_Add or p_Mul, manipulation of void loadList_staticpointers(), etc. To make it clear, I am trying something like the following (crashing) code:

main_wrong.cu:

#include "common.h"
#include <vector>

// Global variable: list of function pointers in host memory
std::vector<fptr_t> vec_fList;

// Append f to the host-side list vec_fList
void addFunc(fptr_t f)
{
  vec_fList.push_back(f);
}

// Copy the contents of vec_fList into constant_fList on the GPU.
// The entries are the values stored on the host by addFunc(), i.e. CPU-side
// pointers, which is why the subsequent kernel call crashes.
void UploadVector() {
  const size_t bytes = vec_fList.size() * sizeof(fptr_t);
  gpuErrchk( cudaMemcpyToSymbol(constant_fList, vec_fList.data(), bytes) );
}

int main() {
  const int a = 12;
  const int b = 15;

  // Build the list on the host, one push per function...
  addFunc(Add);
  addFunc(Subtract);
  addFunc(Multiply);

  // ...and send it to constant memory
  UploadVector();

  kernel<<<1,3>>>(a, b); // Wrong to call a host-side function pointer from a kernel
  gpuErrchk(cudaGetLastError());
  gpuErrchk(cudaDeviceSynchronize());

  return 0;
}

My understanding is that function pointers pointing to host addresses are copied to the GPU and are unusable by the kernel, which needs pointers pointing to GPU addresses when the function f(a,b) is called. Populating a host-side array with device-side pointers would work for me with raw data (see this question) but not with function pointers. Trivial attempts with Unified Memory have failed as well... so far, I have only found static device-side pointers to work. Is there no other way to copy a dynamically created CPU array of function pointers onto the GPU?

If you can use C++11 (supported since CUDA 7), you could use the following to auto-generate the function table:

// Dispatch kernel whose function table is generated at compile time from the
// template arguments: thread i calls the i-th function in Functions... with
// (a, b); threads beyond the end of the pack do nothing.
template <fptr_t... Functions>
__global__ void kernel(int a, int b)
{
  constexpr fptr_t table[] = { Functions... };
  constexpr auto num_f = sizeof...(Functions);

  if (threadIdx.x >= num_f)
    return;

  table[threadIdx.x](a, b);
}

You would then call this kernel using

kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);

Inspired by m.s.'s answer, I chose to pass the function pointer as a template parameter -this was in fact the key to solve my problem- and discovered that filling a __device__ array of function pointers dev_fList from the main() function iteratively without the help of static function pointers is indeed possible, plus C++11 compatibility is not even needed!

Here is a working example on a __device__ array in global memory. I have not tried its constant memory counterpart yet, but once a global memory array has been satisfactorily created, my guess is that a cudaMemcpyToSymbol(..., cudaMemcpyDeviceToDevice) should do the trick.

A kernel kernel() creates a GPU address for function pointer dev_f and copies the function f that was passed as a template argument. Since this is an iterative process from the CPU, only one thread (thread 0) is involved in this kernel, which is launched with configuration <<<1,1>>>. The static variable count_f takes care of indexing in dev_fList.

common.h:

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

// Wraps a CUDA runtime call and hands its cudaError_t, together with the call
// site, to gpuAssert(). The do/while(0) wrapper makes the expansion a single
// statement, so `if (x) gpuErrchk(call); else ...` parses as written.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call.
//   code  - status returned by the CUDA runtime call
//   file  - source file of the call site (supplied by gpuErrchk)
//   line  - source line of the call site (supplied by gpuErrchk)
//   abort - when true (the default) the process exits with `code` as status
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// Sample device functions with the fptr_t signature void(int&, int&).
// Each one prints its two operands and the result of the operation.
__device__ void Add(int &a, int &b)
{
  const int sum = a + b;
  printf("Add... %i + %i = %i\n", a, b, sum);
}

__device__ void Subtract(int &a, int &b)
{
  const int difference = a - b;
  printf("Subtract... %i - %i = %i\n", a, b, difference);
}

__device__ void Multiply(int &a, int &b)
{
  const int product = a * b;
  printf("Multiply... %i * %i = %i\n", a, b, product);
}

// List of function pointers in device memory
// Note that, in my example, it resides in global memory space, not constant memory
__device__ fptr_t dev_fList[num_functions];

#endif

main.cu:

#include "common.h"

// Index in dev_fList[] == number of times addFunc<>() was launched
static int count_f = 0;

// Registration kernel: writes the function passed as template argument f into
// dev_fList[idx], then calls it through the table with (a, b) to confirm the
// entry is usable. addFunc<>() launches it with a single thread.
template<fptr_t f>
__global__ void kernel(int a, int b, int idx) {
  fptr_t &slot = dev_fList[idx];
  slot = f;    // f is evaluated in device code, so this is a device-side pointer
  slot(a, b);  // call through the entry that was just written
}

// Register function f in the next free slot of dev_fList.
// Launches kernel<f> with one thread, waits for it to finish, and advances
// count_f. Exits the program when all num_functions slots are already used.
template<fptr_t f>
void addFunc(const int &a, const int &b) {
  if (count_f < num_functions) {
    kernel<f><<<1,1>>>(a,b,count_f);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());
    ++count_f;
    return;
  }

  std::cout << "Error: not enough memory statically allocated on device!\n";
  exit(EXIT_FAILURE);
}

int main() {
  const int a = 12;
  const int b = 15;

  // Each call fills the next slot of dev_fList and runs the function once
  addFunc<Add>(a,b);
  addFunc<Subtract>(a,b);
  addFunc<Multiply>(a,b);

  return 0;
}

Edit: Added copy of the array of function pointers to constant memory

For what it's worth, here is how to copy our dev_fList array to constant memory:

In common.h:

__constant__ fptr_t cst_fList[num_functions];

// Test kernel for the constant-memory table: every thread whose index is
// below idx calls the matching entry of cst_fList with (a, b).
__global__ void cst_test(int a, int b, int idx) {
   const unsigned int tid = threadIdx.x;
   if (tid >= idx) return;
   cst_fList[tid](a, b);
}

In main.cu main() function, after all desired functions have been added:

  // Host staging buffer with room for every table entry. Only the count_f
  // entries that were registered are copied; addFunc<>() keeps count_f at or
  // below num_functions, so the copies stay inside the buffer.
  fptr_t temp[num_functions];
  const size_t bytes = count_f * sizeof(fptr_t);

  // The symbol arguments are the arrays themselves, not one of their elements
  gpuErrchk( cudaMemcpyFromSymbol(temp, dev_fList, bytes) );  // device -> host
  gpuErrchk( cudaMemcpyToSymbol(cst_fList, temp, bytes) );    // host -> constant

  cst_test<<<1,count_f>>>(a,b, count_f);
  gpuErrchk(cudaGetLastError());
  gpuErrchk(cudaDeviceSynchronize());

It may look ugly as I understand that memory is transferred to the host via temp and then back to the device; more elegant suggestions are welcome.

It is impossible to use dynamically created CUDA device function pointers (at least not without a crash or UB). The template-based solutions work at compile time (they are not dynamic). The CUDA device function pointer approaches you see everywhere need device symbols in global space. This means that for every function a device function pointer must already be declared. This also means you cannot use normal C function pointers, which are e.g. set at runtime, as a reference. In summary, using CUDA device function pointers is questionable. Template-based approaches look user-friendly, but are by definition not dynamic.

Example showing structure with function pointers:

This example shows a structure having some function pointers. In normal C++ code, you can set and change the function pointers while the program is running (dynamically). With CUDA the example below is impossible, because the function pointers in the struct are not valid device symbols. This means they cannot be used with "cudaMemcpyFromSymbol". To circumvent this, either the original function (the target of the function pointers) or global CUDA device function pointers must be created. Neither is dynamic.

This is dynamic assignment:

typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu)    (float, float, float);

// Bundle of three function pointers: one distance function (pDistanceFu) and
// two decay functions (pDecayFu).
// In C++ you can set and reset the function pointer during run time whenever you want ..
struct DistFunction {
  /*__host__ __device__*/ pDistanceFu distance; // uncomment for NVCC ..
  /*__host__ __device__*/ pDecayFu rad_decay;
  /*__host__ __device__*/ pDecayFu lrate_decay;
};

// you can do what you want ..
DistFunction foo, bar;
foo.distance = bar.distance;
// ..

This is how it should be with CUDA, but it will fail, because there is no valid device symbol :(

pDistanceFu hDistance; 
pDecayFu hRadDay; 
pDecayFu hLRateDecay; 

// Attempts to replace every function pointer stored in `dist` with a
// device-side one: each member is passed to cudaMemcpyFromSymbol as the symbol,
// the result is read into the matching host global (hDistance, hRadDay,
// hLRateDecay) and then written back into `dist`.
// NOTE(review): according to the surrounding text this fails because the struct
// members are not valid device symbols. The return codes of the three copies
// are not checked here.
void DeviceAssign(DistFunction &dist) {      
  // Read the three pointers through the CUDA symbol API
  cudaMemcpyFromSymbol(&hDistance, dist.distance, sizeof(pDistanceFu) );
  cudaMemcpyFromSymbol(&hRadDay, dist.rad_decay, sizeof(pDecayFu) );
  cudaMemcpyFromSymbol(&hLRateDecay, dist.lrate_decay, sizeof(pDecayFu) );

  // Overwrite the members of dist with whatever the copies produced
  dist.distance = hDistance;
  dist.rad_decay = hRadDay;
  dist.lrate_decay = hLRateDecay;
}

Here is the classical way, but note that it is no longer dynamic, because the device symbol must refer to the function itself, not to a pointer which may change at run time.

// .. and this would work
// Returns sigma0 * exp(-T / lambda), rounded half-up to a whole number by
// adding 0.5f and taking the floor.
// When compiled by NVCC (__CUDACC__ defined) the function is marked
// __host__ __device__; otherwise it is a plain inline host function.
#ifdef __CUDACC__
  __host__ __device__
#endif
inline float fcn_rad_decay (float sigma0, float T, float lambda) {
  return std::floor(sigma0*exp(-T/lambda) + 0.5f);
}

// Device-side pointer to fcn_rad_decay. Declared as pDecayFu because the
// function takes three floats; pDistanceFu (two floats) does not match its
// signature. The pointer must target the function itself, no host pointer possible.
__device__ pDecayFu pFoo = fcn_rad_decay;

// Replaces dist.lrate_decay with the pointer stored in the device variable
// pFoo.
// cudaMemcpyFromSymbol expects a __device__ / __constant__ variable as its
// symbol argument, so the value is read from pFoo rather than from the address
// of the function. If the copy fails, `dist` is left unchanged.
void DeviceAssign2(DistFunction &dist) {
  if (cudaMemcpyFromSymbol(&hLRateDecay, pFoo, sizeof(pDecayFu)) != cudaSuccess) {
    return; // keep the old pointer instead of publishing an invalid one
  }
  // ..

  dist.lrate_decay = hLRateDecay;
  // ..
}

来源：https://stackoverflow.com/questions/31694730/cuda-copy-dynamically-created-array-of-function-pointers-on-the-cpu-to-gpu-memo

标签

cuda

function-pointers