Determining the least element and its position in each matrix column with CUDA Thrust

前端 未结 3 1081
清歌不尽
清歌不尽 2020-12-17 02:13

I have a fairly simple problem but I cannot figure out an elegant solution to it.

I have a Thrust code which produces c vectors of same size containing

相关标签:
3条回答
  • 2020-12-17 03:02

    I had the curiosity to test which one of the previous approaches was faster. So, I implemented Robert Crovella's idea in the code below which reports, for the sake of completeness, also Eric's approach.

    #include <iterator>
    #include <algorithm>
    
    #include <thrust/random.h>
    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    #include <thrust/sort.h>
    
    #include "TimingGPU.cuh"
    
    using namespace thrust::placeholders;
    
    template <typename Iterator>
    class strided_range
    {
        public:
    
        typedef typename thrust::iterator_difference<Iterator>::type difference_type;
    
        struct stride_functor : public thrust::unary_function<difference_type,difference_type>
        {
            difference_type stride;
    
            stride_functor(difference_type stride)
                : stride(stride) {}
    
            __host__ __device__
            difference_type operator()(const difference_type& i) const
            { 
                return stride * i;
            }
        };
    
        typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
        typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
        typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;
    
        // type of the strided_range iterator
        typedef PermutationIterator iterator;
    
        // construct strided_range for the range [first,last)
        strided_range(Iterator first, Iterator last, difference_type stride)
            : first(first), last(last), stride(stride) {}
    
        iterator begin(void) const
        {
            return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
        }
    
        iterator end(void) const
        {
            return begin() + ((last - first) + (stride - 1)) / stride;
        }
    
        protected:
        Iterator first;
        Iterator last;
        difference_type stride;
    };
    
    
    /**************************************************************/
    /* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
    /**************************************************************/
    template< typename T >
    struct mod_functor {
        __host__ __device__ T operator()(T a, T b) { return a % b; }
    };
    
    /********/
    /* MAIN */
    /********/
    int main()
    {
        /***********************/
        /* SETTING THE PROBLEM */
        /***********************/
        const int Nrows = 200;
        const int Ncols = 200;
    
        // --- Random uniform integer distribution between 10 and 99
        thrust::default_random_engine rng;
        thrust::uniform_int_distribution<int> dist(10, 99);
    
        // --- Matrix allocation and initialization
        thrust::device_vector<float> d_matrix(Nrows * Ncols);
        for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);
    
        TimingGPU timerGPU;
    
        /******************/
        /* APPROACH NR. 1 */
        /******************/
        timerGPU.StartCounter();
    
        thrust::device_vector<float>    d_min_values(Ncols);
        thrust::device_vector<int>      d_min_indices_1(Ncols);
    
        thrust::reduce_by_key(
                thrust::make_transform_iterator(
                        thrust::make_counting_iterator((int) 0),
                        _1 / Nrows),
                thrust::make_transform_iterator(
                        thrust::make_counting_iterator((int) 0),
                        _1 / Nrows) + Nrows * Ncols,
                thrust::make_zip_iterator(
                        thrust::make_tuple(
                                thrust::make_permutation_iterator(
                                        d_matrix.begin(),
                                        thrust::make_transform_iterator(
                                                thrust::make_counting_iterator((int) 0), (_1 % Nrows) * Ncols + _1 / Nrows)),
                                thrust::make_transform_iterator(
                                        thrust::make_counting_iterator((int) 0), _1 % Nrows))),
                thrust::make_discard_iterator(),
                thrust::make_zip_iterator(
                        thrust::make_tuple(
                                d_min_values.begin(),
                                d_min_indices_1.begin())),
                thrust::equal_to<int>(),
                thrust::minimum<thrust::tuple<float, int> >()
        );
    
        printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());
    
        /******************/
        /* APPROACH NR. 2 */
        /******************/
        timerGPU.StartCounter();
    
        // --- Computing row indices vector
        thrust::device_vector<int> d_row_indices(Nrows * Ncols);
        thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(Nrows * Ncols), thrust::make_constant_iterator(Ncols), d_row_indices.begin(), thrust::divides<int>() );
    
        // --- Computing column indices vector
        thrust::device_vector<int> d_column_indices(Nrows * Ncols);
        thrust::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(Nrows * Ncols), thrust::make_constant_iterator(Ncols), d_column_indices.begin(), mod_functor<int>());
    
        // --- int and float iterators
        typedef thrust::device_vector<int>::iterator        IntIterator;
        typedef thrust::device_vector<float>::iterator      FloatIterator;
    
        // --- Relevant tuples of int and float iterators
        typedef thrust::tuple<IntIterator, IntIterator>     IteratorTuple1;
        typedef thrust::tuple<FloatIterator, IntIterator>   IteratorTuple2;
    
        // --- zip_iterator of the relevant tuples
        typedef thrust::zip_iterator<IteratorTuple1>        ZipIterator1;
        typedef thrust::zip_iterator<IteratorTuple2>        ZipIterator2;
    
        // --- zip_iterator creation
        ZipIterator1 iter1(thrust::make_tuple(d_column_indices.begin(), d_row_indices.begin()));
    
        thrust::stable_sort_by_key(d_matrix.begin(), d_matrix.end(), iter1);
    
        ZipIterator2 iter2(thrust::make_tuple(d_matrix.begin(), d_row_indices.begin()));
    
        thrust::stable_sort_by_key(d_column_indices.begin(), d_column_indices.end(), iter2);
    
        typedef thrust::device_vector<int>::iterator Iterator;
    
        // --- Strided access to the sorted array
        strided_range<Iterator> d_min_indices_2(d_row_indices.begin(), d_row_indices.end(), Nrows);
    
        printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());
    
        printf("\n\n");
        std::copy(d_min_indices_2.begin(), d_min_indices_2.end(), std::ostream_iterator<int>(std::cout, " "));
        std::cout << std::endl;
    
        return 0;
    }
    

    Testing the two approaches for the case of 2000x2000 sized matrices, this has been the result on a Kepler K20c card:

    Eric's             :  8.4s
    Robert Crovella's  : 33.4s
    
    0 讨论(0)
  • 2020-12-17 03:04

    Since the length of your vectors has to be the same. It's better to concatenate them together and treat them as a matrix C.

    Then your problem becomes finding the indices of the min element of each column in a row-major matrix. It can be solved as follows.

    1. change the row-major to col-major;
    2. find indices for each column.

    In step 1, you proposed to use stable_sort_by_key to rearrange the element order, which is not a effective method. Since the rearrangement can be directly calculated given the #row and #col of the matrix. In thrust, it can be done with permutation iterators as:

    thrust::make_permutation_iterator(
        c.begin(),
        thrust::make_transform_iterator(
            thrust::make_counting_iterator((int) 0),
            (_1 % row) * col + _1 / row)
    )
    

    In step 2, reduce_by_key can do exactly what you want. In your case the reduction binary-op functor is easy, since comparison on tuple (element of your zipped vector) has already been defined to compare the 1st element of the tuple, and it's supported by thrust as

    thrust::minimum< thrust::tuple<float, int> >()
    

    The whole program is shown as follows. Thrust 1.6.0+ is required since I use placeholders in fancy iterators.

    #include <iterator>
    #include <algorithm>
    
    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    
    using namespace thrust::placeholders;
    
    int main()
    {
    
        const int row = 2;
        const int col = 5;
        float initc[] =
                { 0, 10, 20, 3, 40, 1, 2, 3, 5, 10 };
        thrust::device_vector<float> c(initc, initc + row * col);
    
        thrust::device_vector<float> minval(col);
        thrust::device_vector<int> minidx(col);
    
        thrust::reduce_by_key(
                thrust::make_transform_iterator(
                        thrust::make_counting_iterator((int) 0),
                        _1 / row),
                thrust::make_transform_iterator(
                        thrust::make_counting_iterator((int) 0),
                        _1 / row) + row * col,
                thrust::make_zip_iterator(
                        thrust::make_tuple(
                                thrust::make_permutation_iterator(
                                        c.begin(),
                                        thrust::make_transform_iterator(
                                                thrust::make_counting_iterator((int) 0), (_1 % row) * col + _1 / row)),
                                thrust::make_transform_iterator(
                                        thrust::make_counting_iterator((int) 0), _1 % row))),
                thrust::make_discard_iterator(),
                thrust::make_zip_iterator(
                        thrust::make_tuple(
                                minval.begin(),
                                minidx.begin())),
                thrust::equal_to<int>(),
                thrust::minimum<thrust::tuple<float, int> >()
        );
    
        std::copy(minidx.begin(), minidx.end(), std::ostream_iterator<int>(std::cout, " "));
        std::cout << std::endl;
        return 0;
    }
    

    Two remaining issues may affect the performance.

    1. min values have to be outputted, which is not required;
    2. reduce_by_key is designed for segments with variant lengths, it may not be the fastest algorithm for reduction on segments with same length.

    Writing your own kernel could be the best solution for highest performance.

    0 讨论(0)
  • 2020-12-17 03:05

    One possible idea, building on the vectorized sort idea here

    1. Suppose I have vectors like this:

      values:    C =      ( 0,10,20, 3,40, 1, 2, 3, 5,10)
      keys:      K =      ( 0, 1, 2, 3, 4, 0, 1, 2, 3, 4)
      segments:  S =      ( 0, 0, 0, 0, 0, 1, 1, 1, 1, 1)
      
    2. zip together K and S to create KS

    3. stable_sort_by_key using C as the keys, and KS as the values:

      stable_sort_by_key(C.begin(), C.end(), KS_begin);
      
    4. zip together the reordered C and K vectors, to create CK

    5. stable_sort_by_key using the reordered S as the keys, and CK as the values:

      stable_sort_by_key(S.begin(), S.end(), CK_begin);
      
    6. use a permutation iterator or a strided range iterator to access every Nth element (0, N, 2N, ...) of the newly re-ordered K vector, to retrieve a vector of the indices of the min element in each segment, where N is the length of the segments.

    I haven't actually implemented this, right now it's just an idea. Maybe it won't work for some reason I haven't observed yet.

    segments (S) and keys (K) are effectively row and column indices.

    And your question seems wierd to me, because your title mentions "find index of max value" but most of your question seems to be referring to "lowest value". Regardless, with a change to step 6 of my algorithm, you can find either value.

    0 讨论(0)
提交回复
热议问题