Given two key-value lists, I am trying to combine the two sides by matching the keys and applying a function to the two values when the keys match. In my case I want to multiply the two values.
You can actually do all you want using one thrust::set_intersection_by_key call.
However, some prerequisites need to be met:
First, the easy one:
You need to zip Lvalsv and Rvalsv into a single thrust::zip_iterator and pass this as the values to thrust::set_intersection_by_key.
You could already run this:
// The intersection can never contain more elements than the shorter input,
// so min(Lsize, Rsize) is a safe upper bound for the output buffers.
std::size_t min_size = std::min(Lsize, Rsize);
thrust::device_vector<int> result_keys(min_size);
thrust::device_vector<int> result_values_left(min_size);
thrust::device_vector<int> result_values_right(min_size);
// Zip the two value ranges so that a single values iterator carries both sides.
// NOTE(review): the zip pairs Lvalsv[i] with Rvalsv[i] by position - confirm this
// is what you want, since the values are selected via the first key range.
auto zipped_input_values = thrust::make_zip_iterator(thrust::make_tuple(Lvalsv.begin(), Rvalsv.begin()));
auto zipped_output_values = thrust::make_zip_iterator(thrust::make_tuple(result_values_left.begin(), result_values_right.begin()));
// result_pair.first / result_pair.second mark the ends of the written key and value ranges.
auto result_pair = thrust::set_intersection_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), zipped_input_values, result_keys.begin(), zipped_output_values);
This would yield two result vectors, which you would need to multiply element-wise to get your final result.
But wait, wouldn't it be great if you could avoid having to store these two vectors as the result, then read each element again for multiplying them and then store the final result in a third vector?
Let's do that. The concept I adapted is from here.
The transform_output_iterator is an iterator that wraps another OutputIterator. When a value is written to the transform_output_iterator, a UnaryFunction is applied to that value first, and the result is then written to the wrapped OutputIterator.
This allows us to pass each result of thrust::set_intersection_by_key through the Multiplier functor and store the product directly in a single result_values vector.
The following code implements this idea:
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/iterator_adaptor.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/set_operations.h>
#include <thrust/tuple.h>
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#define PRINTER(name) print(#name, (name))
template class V, typename T, typename ...Args>
void print(const char* name, const V & v)
{
std::cout << name << ":\t";
thrust::copy(v.begin(), v.end(), std::ostream_iterator(std::cout, "\t"));
std::cout << std::endl;
}
// Assignment proxy returned when a transform_output_iterator is dereferenced.
//
// Assigning a value `x` to the proxy writes `fun(x)` through the wrapped
// output iterator. The proxy only holds references to the functor and the
// iterator, so it must not outlive the objects it was created from.
template <typename OutputIterator, typename UnaryFunction>
class Proxy
{
    UnaryFunction& fun;
    OutputIterator& out;

public:
    __host__ __device__
    Proxy(UnaryFunction& fun, OutputIterator& out) : fun(fun), out(out) {}

    // Applies the functor to `x` and stores the result at the wrapped
    // iterator's current position. Const because the proxy itself is never
    // modified; only the element the iterator points at is.
    template <typename T>
    __host__ __device__
    Proxy operator=(const T& x) const
    {
        *out = fun(x);
        return *this;
    }
};
// This iterator is a wrapper around another OutputIterator which
// applies a UnaryFunction before writing to the OutputIterator.
//
// Dereferencing yields a Proxy; assigning to that proxy performs
// `*wrapped = fun(value)`. Increment, comparison and the other iterator
// operations are supplied by thrust::iterator_adaptor and act on the
// wrapped iterator.
template <typename OutputIterator, typename UnaryFunction>
class transform_output_iterator
    : public thrust::iterator_adaptor<
          transform_output_iterator<OutputIterator, UnaryFunction>
        , OutputIterator
        , thrust::use_default
        , thrust::use_default
        , thrust::use_default
        , Proxy<const OutputIterator, const UnaryFunction> >
{
    UnaryFunction fun;

public:
    // iterator_adaptor reaches the private dereference() through this class.
    friend class thrust::iterator_core_access;

    // shorthand for the name of the iterator_adaptor we're deriving from
    typedef thrust::iterator_adaptor<
        transform_output_iterator<OutputIterator, UnaryFunction>,
        OutputIterator,
        thrust::use_default,
        thrust::use_default,
        thrust::use_default,
        Proxy<const OutputIterator, const UnaryFunction>
    > super_t;

    // Wraps `out`; every value written through this iterator is first passed
    // through `fun`.
    __host__ __device__
    transform_output_iterator(OutputIterator out, UnaryFunction fun) : super_t(out), fun(fun)
    {
    }

private:
    // dereference() is const, so both `fun` and base_reference() are seen as
    // const here; that is why the proxy is instantiated with const-qualified
    // types.
    __host__ __device__
    typename super_t::reference dereference() const
    {
        return Proxy<const OutputIterator, const UnaryFunction>(fun, this->base_reference());
    }
};
// Functor returning the product of the first two elements of a tuple.
struct Multiplier
{
    // `t` may be any type for which thrust::get<0> and thrust::get<1> are
    // valid; the return type is whatever multiplying those two elements yields.
    template <typename Tuple>
    __host__ __device__
    auto operator()(Tuple t) const -> decltype(thrust::get<0>(t) * thrust::get<1>(t))
    {
        return thrust::get<0>(t) * thrust::get<1>(t);
    }
};
template
transform_output_iterator
__host__ __device__
make_transform_output_iterator(OutputIterator out, UnaryFunction fun)
{
return transform_output_iterator(out, fun);
}
int main()
{
int Lkeys[] = { 1, 2, 4, 5, 6 };
int Lvals[] = { 3, 4, 1, 2, 1 };
int Rkeys[] = { 1, 3, 4, 5, 6, 7 };
int Rvals[] = { 2, 1, 1, 4, 1, 2 };
size_t Lsize = sizeof(Lkeys)/sizeof(int);
size_t Rsize = sizeof(Rkeys)/sizeof(int);
thrust::device_vector Lkeysv(Lkeys, Lkeys+Lsize);
thrust::device_vector Lvalsv(Lvals, Lvals+Lsize);
thrust::device_vector Rkeysv(Rkeys, Rkeys+Rsize);
thrust::device_vector Rvalsv(Rvals, Rvals+Rsize);
std::size_t min_size = std::min(Lsize, Rsize);
thrust::device_vector result_keys(min_size);
thrust::device_vector result_values(min_size);
auto zipped_values = thrust::make_zip_iterator(thrust::make_tuple(Lvalsv.begin(), Rvalsv.begin()));
auto output_it = make_transform_output_iterator(result_values.begin(), Multiplier());
auto result_pair = thrust::set_intersection_by_key(Lkeysv.begin(), Lkeysv.end(), Rkeysv.begin(), Rkeysv.end(), zipped_values, result_keys.begin(), output_it);
std::size_t new_size = result_pair.first - result_keys.begin();
result_keys.resize(new_size);
result_values.resize(new_size);
PRINTER(result_keys);
PRINTER(result_values);
}
output
$ nvcc -std=c++11 main.cu && ./a.out
result_keys: 1 4 5 6
result_values: 6 1 8 1