How to implement nested loops in cuda thrust

吃可爱长大的小学妹 提交于 2019-12-03 21:51:07
m.s.

EDIT: Now that you have posted an example, here is how you could solve it:

You have n 2D points stored in a linear array like this (here n=4)

points = [p0 p1 p2 p3]

Based on your code I assume you want to calculate:

result = f(p0, p1) + f(p0, p2) + f(p0, p3) +
         f(p1, p2) + f(p1, p3) +
         f(p2, p3)

Where f() is your distance function which needs to be executed m times in total:

m = (n-1)*n/2

in this example: m=6

You can look at this problem as a triangular matrix:

[ p0 p1 p2 p3 ] 
[    p1 p2 p3 ]
[       p2 p3 ]
[          p3 ]

Transforming this matrix into a linear vector with m elements while leaving out the diagonal elements results in:

[p1 p2 p3 p2 p3 p3]

Each element of this vector has an index k in the range [0, m-1]. The index k can be mapped back to a row and a column of the triangular matrix, k -> (i, j):

i = n - 2 - floor(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5)
j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2

i is the row and j is the column.

In our example:

0 -> (0, 1)
1 -> (0, 2)
2 -> (0, 3)
3 -> (1, 2)
4 -> (1, 3)
5 -> (2, 3)

Now you can put all this together: execute a modified distance functor m times, once for each index k. The functor applies the mapping above to find the pair (i, j) that belongs to k and evaluates f for the points p_i and p_j; the m results are then summed up.

I modified your code accordingly:

#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/random.h>
#include <math.h>
#include <iostream>
#include <stdio.h>
#include <stdint.h>

// When defined, sqrt_dis_new::operator() prints the "k -> (i, j)" index
// mapping for every pair index it processes.
#define PRINT_DEBUG

// Scalar type used for the point coordinates and for the reduction result.
typedef float Float;

// A 2D point, stored as an (x, y) tuple.
typedef thrust::tuple<Float, Float> Point;

// Generator for thrust::generate: each call yields the next pseudo-random
// Point, with both coordinates drawn from a uniform distribution over [0,1).
Point make_point(void)
{
    // Function-local statics: the engine (fixed seed 12345) and the
    // distribution are created once and keep their state between calls.
    static thrust::default_random_engine engine(12345);
    static thrust::uniform_real_distribution<Float> unit_interval(0.0, 1.0);

    // Draw the first coordinate strictly before the second so the generated
    // sequence does not depend on argument evaluation order.
    const Float first_coord = unit_interval(engine);
    const Float second_coord = unit_interval(engine);

    return Point(first_coord, second_coord);
}


// Functor that maps a linear pair index k to a pair of distinct points
// (i, j) with i < j and returns that pair's contribution to the sum.
//
// The n points are viewed as the strict upper triangle of an n x n matrix,
// flattened row by row:
//     k = 0 .. n*(n-1)/2 - 1   ->   (0,1), (0,2), ..., (0,n-1), (1,2), ...
//
// Preconditions (not checked): n >= 2, k < n*(n-1)/2, `points` refers to at
// least n elements, and 4*n*(n-1) fits into a uint64_t.
struct sqrt_dis_new
{
    typedef thrust::device_ptr<Point> DevPtr;

    DevPtr points;      // first of the n points
    const uint64_t n;   // number of points

    // n: number of points, p: pointer to the first point.
    // The initializer list follows the member declaration order.
    __host__
    sqrt_dis_new(uint64_t n, DevPtr p) : points(p), n(n)
    {
    }

    // Linear index of the first pair in row r of the flattened triangle,
    // i.e. the number of pairs in rows 0 .. r-1. Valid for r <= n-1.
    // The product is always even, so the division is exact.
    __host__ __device__
    uint64_t row_start(uint64_t r) const
    {
        return r*(2*n - r - 1)/2;
    }

    // Returns 1.0/(-1.0 * d), where d is the Euclidean distance between
    // points i and j of pair k. The negative sign is kept from the original
    // formula; two coincident points yield a division by zero.
    __device__ 
    Float operator()(uint64_t k) const
    {
        // calculate indices in triangular matrix

        // Closed-form estimate of the row. The discriminant equals
        // 8*(pairs remaining from k to the end) - 7, which is >= 1 for every
        // valid k, so the unsigned subtraction cannot wrap.
        const uint64_t last_row = n - 2;
        const double discriminant = static_cast<double>(4*n*(n-1) - 8*k - 7);
        const double estimate = static_cast<double>(last_row)
                              - floor(sqrt(discriminant)/2.0 - 0.5);

        // Clamp before converting: casting a negative or too large double to
        // an unsigned integer would be undefined.
        uint64_t i = 0;
        if (estimate >= static_cast<double>(last_row))
            i = last_row;
        else if (estimate > 0.0)
            i = static_cast<uint64_t>(estimate);

        // A double cannot represent the discriminant exactly once it exceeds
        // 2^53, so the estimate may be off by a row for very large n. Correct
        // it with exact integer arithmetic; both loops do nothing when the
        // estimate is already right.
        while (i > 0 && row_start(i) > k)
            --i;
        while (i + 2 < n && row_start(i + 1) <= k)
            ++i;

        // Column: offset of k inside row i, shifted past the diagonal.
        const uint64_t j = k - row_start(i) + i + 1;

#ifdef PRINT_DEBUG
        // uint64_t is not unsigned long long on every platform; cast so that
        // the arguments match the %llu conversions.
        printf("%llu -> (%llu, %llu)\n",
               static_cast<unsigned long long>(k),
               static_cast<unsigned long long>(i),
               static_cast<unsigned long long>(j));
#endif

        const Point& p1 = *(points.get()+j);
        const Point& p2 = *(points.get()+i);

        const Float xm = thrust::get<0>(p1)-thrust::get<0>(p2);
        const Float ym = thrust::get<1>(p1)-thrust::get<1>(p2);

        return 1.0/(-1.0 * sqrt(xm*xm + ym*ym));
    }
};


int main()
{
    const uint64_t N = 4;

    // allocate some random points in the unit square on the host
    thrust::host_vector<Point> h_points(N);
    thrust::generate(h_points.begin(), h_points.end(), make_point);

    // transfer to device
    thrust::device_vector<Point> d_points = h_points;

    const uint64_t count = (N-1)*N/2;

    std::cout << count << std::endl;


    thrust::plus<Float> binary_op;
    const Float init = 0.0;

    Float result = thrust::transform_reduce(thrust::make_counting_iterator((uint64_t)0),
                                            thrust::make_counting_iterator(count),
                                            sqrt_dis_new(N, d_points.data()),
                                            init,
                                            binary_op);

    std::cout.precision(10);  
    std::cout<<"result: " << result << std::endl;

    return 0;
}    

It depends on your compute function which you do not specify. Usually you unroll the loops and launch the kernel in a 2D manner for every combination of i and j if the computations are independent. Have a look at the Thrust examples and identify similar use cases to your problem.

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!