Polymorphism and derived classes in CUDA / CUDA Thrust

问题

This is my first question on Stack Overflow, and it's quite a long question. The tl;dr version is: How do I work with a thrust::device_vector<BaseClass> if I want it to store objects of different types DerivedClass1, DerivedClass2, etc, simultaneously?

I want to take advantage of polymorphism with CUDA Thrust. I'm compiling for an -arch=sm_30 GPU (GeForce GTX 670).

Let us take a look at the following problem: Suppose there are 80 families in town. 60 of them are married couples, 20 of them are single-parent households. Each family has, therefore, a different number of members. It's census time and households have to state the parents' ages and the number of children they have. Therefore, an array of Family objects is constructed by the government, namely thrust::device_vector<Family> familiesInTown(80), such that information of families familiesInTown[0] to familiesInTown[59] corresponds to married couples, the rest (familiesInTown[60] to familiesInTown[79]) being single-parent households.

Family is the base class - the number of parents in the household (1 for single parents and 2 for couples) and the number of children they have are stored here as members.
SingleParent, derived from Family, includes a new member - the single parent's age, unsigned int ageOfParent.

MarriedCouple, also derived from Family, however, introduces two new members - both parents' ages, unsigned int ageOfParent1 and unsigned int ageOfParent2.

#include <iostream>
#include <stdio.h>
#include <thrust/device_vector.h>

// Base record for one household: the number of parents and the number of children.
// The destructor is virtual, so this class (and everything derived from it) is polymorphic.
class Family
{
protected:
  unsigned int numParents;   // 1 for single parents, 2 for couples
  unsigned int numChildren;
public:
  // Default-construct with zero counts so that no member is ever read uninitialized.
  __host__ __device__ Family() : numParents(0), numChildren(0) {}
  // nPars: number of parents in the household; nChil: number of children.
  __host__ __device__ Family(const unsigned int& nPars, const unsigned int& nChil) : numParents(nPars), numChildren(nChil) {}
  __host__ __device__ virtual ~Family() {}

  // Read-only accessors; const-qualified so they can also be called on const objects.
  __host__ __device__ unsigned int showNumOfParents() const {return numParents;}
  __host__ __device__ unsigned int showNumOfChildren() const {return numChildren;}
};

// Household with exactly one parent: extends Family with that parent's age.
class SingleParent : public Family
{
protected:
  unsigned int ageOfParent;
public:
  // Default: one parent, no children, age 0 -- every member has a defined value.
  __host__ __device__ SingleParent() : Family(1, 0), ageOfParent(0) {}
  // nChil: number of children; age: the parent's age. The parent count is fixed at 1.
  __host__ __device__ SingleParent(const unsigned int& nChil, const unsigned int& age) : Family(1, nChil), ageOfParent(age) {}

  // Read-only accessor; const-qualified so it can also be called on const objects.
  __host__ __device__ unsigned int showAgeOfParent() const {return ageOfParent;}
};

// Household with two parents: extends Family with both parents' ages.
class MarriedCouple : public Family
{
protected:
  unsigned int ageOfParent1;
  unsigned int ageOfParent2;
public:
  // Default: two parents, no children, both ages 0 -- every member has a defined value.
  __host__ __device__ MarriedCouple() : Family(2, 0), ageOfParent1(0), ageOfParent2(0) {}
  // nChil: number of children; age1/age2: the two parents' ages. The parent count is fixed at 2.
  __host__ __device__ MarriedCouple(const unsigned int& nChil, const unsigned int& age1, const unsigned int& age2) : Family(2, nChil), ageOfParent1(age1), ageOfParent2(age2) {}

  // Read-only accessors; const-qualified so they can also be called on const objects.
  __host__ __device__ unsigned int showAgeOfParent1() const {return ageOfParent1;}
  __host__ __device__ unsigned int showAgeOfParent2() const {return ageOfParent2;}
};

If I were to naïvely initialize the objects in my thrust::device_vector<Family> with the following functors:

// Device functor: builds the idx-th married couple and returns it by value.
struct initSlicedCouples : public thrust::unary_function<unsigned int, MarriedCouple>
{
  // idx is expected to come from a thrust::counting_iterator.
  __device__ MarriedCouple operator()(const unsigned int& idx) const
  {
    // Children cycle through 0, 1, 2; both ages grow with idx:
    //   Couple 0: ages 20 and 19, no children
    //   Couple 1: ages 21 and 20, 1 child
    //   Couple 2: ages 22 and 21, 2 children
    //   Couple 3: ages 23 and 22, no children
    //   ...
    const unsigned int children  = idx % 3;
    const unsigned int firstAge  = 20 + idx;
    const unsigned int secondAge = 19 + idx;
    return MarriedCouple(children, firstAge, secondAge);
  }
};

// Device functor: builds the idx-th single-parent household and returns it by value.
struct initSlicedSingles : public thrust::unary_function<unsigned int, SingleParent>
{
  __device__ SingleParent operator()(const unsigned int& idx) const
  {
    // Children cycle through 0, 1, 2; the parent's age is 25 + idx.
    const unsigned int children  = idx % 3;
    const unsigned int parentAge = 25 + idx;
    return SingleParent(children, parentAge);
  }
};

int main()
{
  unsigned int Num_couples = 60;
  unsigned int Num_single_parents = 20;

  thrust::device_vector<Family> familiesInTown(Num_couples + Num_single_parents);
  // Families [0] to [59] are couples. Families [60] to [79] are single-parent households.
  thrust::transform(thrust::counting_iterator<unsigned int>(0),
                    thrust::counting_iterator<unsigned int>(Num_couples),
                    familiesInTown.begin(),
                    initSlicedCouples());
  thrust::transform(thrust::counting_iterator<unsigned int>(Num_couples),
                    thrust::counting_iterator<unsigned int>(Num_couples + Num_single_parents),
                    familiesInTown.begin() + Num_couples,
                    initSlicedSingles());
  return 0;
}

I would definitely be guilty of some classic object slicing...

So, I asked myself, what about a vector of pointers that may give me some sweet polymorphism? Smart pointers in C++ are a thing, and thrust iterators can do some really impressive things, so let's give it a shot, I figured. The following code compiles.

// Device functor: allocates the idx-th married couple with device-side new and returns the pointer.
struct initCouples : public thrust::unary_function<unsigned int, MarriedCouple*>
{
  __device__ MarriedCouple* operator()(const unsigned int& idx) const
  {
    const unsigned int children  = idx % 3;
    const unsigned int firstAge  = 20 + idx;
    const unsigned int secondAge = 19 + idx;
    // Memory issues? Nothing in this functor ever deletes the allocation.
    MarriedCouple* const couple = new MarriedCouple(children, firstAge, secondAge);
    return couple;
  }
};
// Device functor: allocates the idx-th single-parent household with device-side new and returns the pointer.
struct initSingles : public thrust::unary_function<unsigned int, SingleParent*>
{
  __device__ SingleParent* operator()(const unsigned int& idx) const
  {
    const unsigned int children  = idx % 3;
    const unsigned int parentAge = 25 + idx;
    // Nothing in this functor ever deletes the allocation.
    SingleParent* const household = new SingleParent(children, parentAge);
    return household;
  }
};

// Second attempt: the vector stores Family* values produced by the pointer-returning
// functors (initCouples / initSingles), and one element is then read back in host code.
int main()
{
  unsigned int Num_couples = 60;
  unsigned int Num_single_parents = 20;

  // One pointer slot per household.
  thrust::device_vector<Family*> familiesInTown(Num_couples + Num_single_parents);
  // Families [0] to [59] are couples. Families [60] to [79] are single-parent households.
  thrust::transform(thrust::counting_iterator<unsigned int>(0),
                    thrust::counting_iterator<unsigned int>(Num_couples),
                    familiesInTown.begin(),
                    initCouples()); 
  thrust::transform(thrust::counting_iterator<unsigned int>(Num_couples),
                    thrust::counting_iterator<unsigned int>(Num_couples + Num_single_parents),
                    familiesInTown.begin() + Num_couples,
                    initSingles());

  // The pointer stored in slot 2 came from `new` inside a __device__ functor, yet it is
  // dereferenced here in host code.
  // NOTE(review): presumably this host-side dereference of a device-heap pointer is what
  // produces the crash reported below -- confirm.
  Family A = *(familiesInTown[2]); // Compiles, but object slicing takes place (in theory)
  std::cout << A.showNumOfParents() << "\n"; // Segmentation fault
 return 0;
}

Seems like I've hit a wall here. Am I understanding memory management correctly? (VTables, etc). Are my objects being instantiated and populated on the device? Am I leaking memory like there is no tomorrow?

For what it's worth, in order to avoid object slicing, I tried with a dynamic_cast<DerivedPointer*>(basePointer). That's why I made my Family destructor virtual.

// Read the base-class pointer stored in slot 2, then attempt a checked downcast to MarriedCouple.
// NOTE(review): pA was produced by device-side new; confirm whether it is usable in host code at all.
Family *pA = familiesInTown[2];
MarriedCouple *pB = dynamic_cast<MarriedCouple*>(pA);

The following lines compile, but, unfortunately, a segfault is thrown again. CUDA-Memcheck won't tell me why.

  // Variant 1: call the derived-class accessors through the downcast pointer.
  std::cout << "Ages " << (pB -> showAgeOfParent1()) << ", " << (pB -> showAgeOfParent2()) << "\n";

and

  // Variant 2: copy the pointed-to object into a local MarriedCouple first, then print from the copy.
  MarriedCouple B = *pB;
  std::cout << "Ages " << B.showAgeOfParent1() << ", " << B.showAgeOfParent2() << "\n";

In short, what I need is a class interface for objects that will have different properties, with different numbers of members among each other, but that I can store in one common vector (that's why I want a base class) that I can manipulate on the GPU. My intention is to work with them both in thrust transformations and in CUDA kernels via thrust::raw_pointer_cast, which has worked flawlessly for me until I've needed to branch out my classes into a base one and several derived ones. What is the standard procedure for that?

Thanks in advance!

回答1:

I am not going to attempt to answer everything in this question, it is just too large. Having said that here are some observations about the code you posted which might help:

1. The GPU-side new operator allocates memory from a private runtime heap. As of CUDA 6, that memory cannot be accessed by the host-side CUDA APIs. You can access the memory from within kernels and device functions, but that memory cannot be accessed by the host. So using new inside a thrust device functor is a broken design that can never work. That is why your "vector of pointers" model fails.
2. Thrust is fundamentally intended to allow data-parallel versions of typical STL algorithms to be applied to POD types. Building a codebase using complex polymorphic objects and trying to cram those through Thrust containers and algorithms might be made to work, but it isn't what Thrust was designed for, and I wouldn't recommend it. Don't be surprised if you break thrust in unexpected ways if you do.
3. CUDA supports a lot of C++ features, but the compilation and object models are much simpler than even the C++98 standard upon which they are based. CUDA lacks several key features (RTTI for example) which make complex polymorphic object designs workable in C++. My suggestion is to use C++ features sparingly. Just because you can do something in CUDA doesn't mean you should. The GPU is a simple architecture, and simple data structures and code are almost always more performant than functionally similar complex objects.

Having skim read the code you posted, my overall recommendation is to go back to the drawing board. If you want to look at some very elegant CUDA/C++ designs, spend some time reading the code bases of CUB and CUSP. They are both very different, but there is a lot to learn from both (and CUSP is built on top of Thrust, which makes it even more relevant to your usage case, I suspect).

回答2:

I completely agree with @talonmies' answer. (e.g. I don't know that thrust has been extensively tested with polymorphism.) Furthermore, I have not fully parsed your code. I post this answer to add additional info, in particular that I believe some level of polymorphism can be made to work with thrust.

A key observation I would make is that it is not allowed to pass as an argument to a __global__ function an object of a class with virtual functions. This means that polymorphic objects created on the host cannot be passed to the device (via thrust, or in ordinary CUDA C++). (One basis for this limitation is the requirement for virtual function tables in the objects, which will necessarily be different between host and device, coupled with the fact that it is illegal to directly take the address of a device function in host code).

However, polymorphism can work in device code, including thrust device functions.

The following example demonstrates this idea, restricting ourselves to objects created on the device although we can certainly initialize them with host data. I have created two classes, Triangle and Rectangle, derived from a base class Polygon which includes a virtual function area. Triangle and Rectangle inherit the function set_values from the base class but replace the virtual area function.

We can then manipulate objects of those classes polymorphically as demonstrated here:

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/copy.h>
#define N 4


// Base shape: stores an integer width and height; area() is the virtual hook
// that derived shapes replace.
class Polygon {
  protected:
    int width, height;
  public:
  // Start as a 0 x 0 shape so the dimensions are never read uninitialized
  // when area() is called before set_values().
  __host__ __device__  Polygon() : width(0), height(0) {}
  // Store the two dimensions that area() implementations read.
  __host__ __device__  void set_values (int a, int b)
      { width=a; height=b; }
  // Base implementation: always reports an area of 0.
  __host__ __device__  virtual int area ()
      { return 0; }
};

// Rectangle: area is the full width-by-height product.
class Rectangle: public Polygon {
  public:
  // Replaces Polygon::area().
  __host__ __device__  virtual int area ()
      {
        const int product = width * height;
        return product;
      }
};

// Triangle: area is half the width-by-height product, using integer division.
class Triangle: public Polygon {
  public:
  // Replaces Polygon::area().
  __host__ __device__  virtual int area ()
      {
        const int product = width * height;
        return product / 2;
      }
};


// Functor for a zipped (shape, width, height) element: calls set_values on the shape
// (element 0) with elements 1 and 2 as its two arguments.
struct init_f {
  template <typename Tuple>
  __host__ __device__ void operator()(const Tuple &item) {
    thrust::get<0>(item).set_values(thrust::get<1>(item),
                                    thrust::get<2>(item));
  }
};

// Functor for a zipped (selector, pointer slot, object A, object B) element:
// stores the address of A in the slot when the selector is 0, otherwise the address of B.
struct setup_f {
  template <typename Tuple>
  __host__ __device__ void operator()(const Tuple &item) {
    if (thrust::get<0>(item) != 0) {
      // Non-zero selector: point at element 3.
      thrust::get<1>(item) = &(thrust::get<3>(item));
    } else {
      // Zero selector: point at element 2.
      thrust::get<1>(item) = &(thrust::get<2>(item));
    }
  }
};

// Functor for a zipped (shape pointer, output) element: calls area() through the
// pointer (element 0) and writes the result to element 1.
struct area_f {
  template <typename Tuple>
  __host__ __device__ void operator()(const Tuple &item) {
    const int computed = (thrust::get<0>(item))->area();
    thrust::get<1>(item) = computed;
  }
};


int main () {

  thrust::device_vector<int>  widths(N);
  thrust::device_vector<int> heights(N);
  thrust::sequence( widths.begin(),  widths.end(), 2);
  thrust::sequence(heights.begin(), heights.end(), 3);
  thrust::device_vector<Rectangle> rects(N);
  thrust::device_vector<Triangle>  trgls(N);
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(rects.begin(), widths.begin(), heights.begin())), thrust::make_zip_iterator(thrust::make_tuple(rects.end(), widths.end(), heights.end())), init_f());
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(trgls.begin(), widths.begin(), heights.begin())), thrust::make_zip_iterator(thrust::make_tuple(trgls.end(), widths.end(), heights.end())), init_f());
  thrust::device_vector<Polygon *> polys(N);
  thrust::device_vector<int> selector(N);
  for (int i = 0; i<N; i++) selector[i] = i%2;
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(selector.begin(), polys.begin(), rects.begin(), trgls.begin())), thrust::make_zip_iterator(thrust::make_tuple(selector.end(), polys.end(), rects.end(), trgls.end())), setup_f());
  thrust::device_vector<int> areas(N);
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(polys.begin(), areas.begin())), thrust::make_zip_iterator(thrust::make_tuple(polys.end(), areas.end())), area_f());
  thrust::copy(areas.begin(), areas.end(), std::ostream_iterator<int>(std::cout, "\n"));
  return 0;
}

I suggest compiling the above code for a cc2.0 or newer architecture. I tested with CUDA 6 on RHEL 5.5.

(The polymorphic example idea, and some of the code, was taken from here.)

来源：https://stackoverflow.com/questions/22988244/polymorphism-and-derived-classes-in-cuda-cuda-thrust

标签

cuda

polymorphism

thrust