CUDA Thrust: reduce_by_key on only some values in an array, based off values in a “key” array

后端 未结 2 995
無奈伤痛
無奈伤痛 2020-12-09 14:22

Let's say I have two device_vector arrays, d_keys and d_data.

If d_data is, for example, a flattened 2D 3x5 array

相关标签:
2条回答
  • 2020-12-09 14:36

    Here is some sample code that does something like what you are after, using the approach I outlined in my comment below your question. In fact we want to use 4-tuples, to pick up your key value. Reproducing the suitably modified comment here:

    You could make a zip iterator that zips your 3 rows together plus the key "row" and passes a 4-tuple to a special functor. Your special functor would then do a reduction on the array of 3-tuples (using the key also) and return a result that is a 4-tuple. The thrust dot product example may give you some ideas.

    This is one possible approach:

    #include <iostream>
    #include <thrust/fill.h>
    #include <thrust/host_vector.h>
    #include <thrust/iterator/zip_iterator.h>
    #include <thrust/sequence.h>
    #include <thrust/transform_reduce.h>
    #include <thrust/tuple.h>
    
    // Total number of values in the flattened array; main() views it as 3 rows of N/3 values.
    #define N 30  // make this evenly divisible by 3 for this example
    
    // One zipped element: (row 0 value, row 1 value, row 2 value, key).
    typedef thrust::tuple<int, int, int, int>  tpl4int;
    // Iterator into a thrust::host_vector<int>.
    typedef thrust::host_vector<int>::iterator intiter;
    // The four iterators (three row cursors plus the key cursor) that get zipped together.
    typedef thrust::tuple<intiter, intiter, intiter, intiter>  tpl4intiter;
    // Zip iterator over the three rows and the key row; main() passes its
    // dereferenced value to functors that take a tpl4int.
    typedef thrust::zip_iterator<tpl4intiter>  int4zip;
    
    
    
    // Transform step: multiplies each of the three row values of a zipped
    // element by that element's key (tuple slot 3) and sets the key slot of
    // the result to 1.
    struct r3key_unary_op : public thrust::unary_function<tpl4int, tpl4int>
    {
      __host__ __device__
      tpl4int operator()(const tpl4int& x) const
      {
        const int key = thrust::get<3>(x);
        return thrust::make_tuple(thrust::get<0>(x) * key,
                                  thrust::get<1>(x) * key,
                                  thrust::get<2>(x) * key,
                                  1);
      }
    };
    
    // Reduction step: for each of the three row slots, adds the two operands'
    // values after weighting each by its own key slot (slot 3). The key slot
    // of the result is always 1.
    struct r3key_binary_op : public thrust::binary_function<tpl4int, tpl4int, tpl4int>
    {
      __host__ __device__
      tpl4int operator()(const tpl4int& x, const tpl4int& y) const
      {
        const int kx = thrust::get<3>(x);
        const int ky = thrust::get<3>(y);
        return thrust::make_tuple(thrust::get<0>(x) * kx + thrust::get<0>(y) * ky,
                                  thrust::get<1>(x) * kx + thrust::get<1>(y) * ky,
                                  thrust::get<2>(x) * kx + thrust::get<2>(y) * ky,
                                  1);
      }
    };
    
    
    int main() {
    
      thrust::host_vector<int> A(N);  // values, in 3 "rows" flattened
      thrust::sequence(A.begin(), A.end());
      thrust::host_vector<int> K(N/3);   // keys in one row
      thrust::fill(K.begin(), K.end(), 1);  // set some keys to 1
      K[9] = 0;  // set some keys to zero
    
      int4zip first = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), A.begin() + N/3, A.begin() + 2*N/3, K.begin()));
      int4zip  last = thrust::make_zip_iterator(thrust::make_tuple(A.begin() + N/3, A.begin() + 2*N/3, A.end(), K.end()));
      r3key_unary_op my_unary_op;
      r3key_binary_op my_binary_op;
      tpl4int init = my_unary_op(*first);
      // init = thrust::make_tuple((int) 0, (int) 0, (int) 0, (int) 0);
      tpl4int result = thrust::transform_reduce(first, last, my_unary_op, init, my_binary_op);
      std::cout << "row 0 = " << result.get<0>() << std::endl;
      std::cout << "row 1 = " << result.get<1>() << std::endl;
      std::cout << "row 2 = " << result.get<2>() << std::endl;
      return 0;
    
    }
    

    Notes:

    1. This is just using host_vector. Extending it to work with device_vector, or templatizing it to work with something other than int should be straightforward.
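    2. For completeness, I am using the unary functor to provide an init value other than zero for the sum reduction of each row. Because the first element is also part of the reduced range, its contribution is then counted twice, so for a plain sum you will want to change the init value to zero (a 4-tuple of zeros), as in the commented-out line.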
    0 讨论(0)
  • 2020-12-09 14:40

    Based on the additional comment that instead of 3 rows there are thousands of rows, we can write a transform functor that sums an entire row. Based on the fact that there are thousands of rows, this should keep the machine pretty busy:

    #include <iostream>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/transform.h>
    #include <thrust/sequence.h>
    #include <thrust/fill.h>
    
    #define ROW   20
    #define COL   10
    
    // Device-side global pointers read by test_functor. main() sets them with
    // cudaMemcpyToSymbol to the raw addresses of the d_vals and d_keys storage.
    __device__ int *vals;
    __device__ int *keys;
    
    // Binary functor for thrust::transform that computes the key-weighted sum
    // of one row of the flattened `vals` array.
    //
    // `a` is the number of values per row, and also the number of entries read
    // from `keys`. Row y occupies vals[y*a] .. vals[y*a + a - 1]. The global
    // `vals` and `keys` pointers must be set before the functor runs on the
    // device.
    struct test_functor
    {
      const int a;  // values per row
    
      test_functor(int _a) : a(_a) {}
    
      // First argument: element of the first input range; not used.
      // y: index of the row to sum.
      // Returns the sum over i in [0, a) of vals[i + y*a] * keys[i].
      //
      // Inputs are taken by const reference and the call operator is const so
      // the functor also works through a const object or with iterators that
      // dereference to temporaries.
      __device__
      int operator()(const int& /* x: unused */, const int& y) const {
        const int* row = vals + (y*a);
        int temp = 0;
        for (int i = 0; i<a; i++)
          temp += row[i] * keys[i];
        return temp;
      }
    };
    
    int main(){
      int *s_vals, *s_keys;
      thrust::host_vector<int> h_vals(ROW*COL);
      thrust::host_vector<int> h_keys(COL);
      thrust::sequence(h_vals.begin(), h_vals.end());
      thrust::fill(h_keys.begin(), h_keys.end(), 1);
      h_keys[0] = 0;
      thrust::device_vector<int> d_vals = h_vals;
      thrust::device_vector<int> d_keys = h_keys;
      thrust::device_vector<int> d_sums(ROW);
      thrust::fill(d_sums.begin(), d_sums.end(), 0);
      s_vals = thrust::raw_pointer_cast(&d_vals[0]);
      s_keys = thrust::raw_pointer_cast(&d_keys[0]);
      cudaMemcpyToSymbol(vals, &s_vals, sizeof(int *));
      cudaMemcpyToSymbol(keys, &s_keys, sizeof(int *));
      thrust::device_vector<int> d_idx(ROW);
      thrust::sequence(d_idx.begin(), d_idx.end());
      thrust::transform(d_sums.begin(), d_sums.end(), d_idx.begin(),  d_sums.begin(), test_functor(COL));
      thrust::host_vector<int> h_sums = d_sums;
      std::cout << "Results :" << std::endl;
      for (unsigned i = 0; i<ROW; i++)
        std::cout<<"h_sums["<<i<<"] = " << h_sums[i] << std::endl;
      return 0;
    }
    

    This approach has the drawback that in general accesses to the vals array will not be coalesced. However, for a few thousand rows the cache may offer significant relief. We can fix this problem by re-ordering the data to be stored in column-major form in the flattened array, and changing our indexing method in the loop in the functor to be like this:

    // Column-major indexing: value i of row y is read from vals[i*ROW + y].
    for (int i=0; i<a; i++)
      temp += vals[(i*ROW)+y]*keys[i];
    

    If preferred, you can pass ROW as an additional parameter to the functor.

    0 讨论(0)
提交回复
热议问题