Reduce matrix columns with CUDA

后端 未结 2 513
北恋
北恋 2020-12-16 08:20

I have a matrix and I would like to use CUDA and in the fastest possible way compute the column-wise mean (boils down to be simply the sum), i.e., return a row vector contai

2条回答
  •  伪装坚强ぢ
    2020-12-16 09:13

    As alternatives to the answer already provided by talonmies, I'm here reporting 4 other approaches for column reduction, 3 of them based on using CUDA Thrust and 1 based on using cublasgemv() with a column of 1's, as suggested in my comment above.

    The CUDA Thrust approaches are the analogous of Reduce matrix rows with CUDA with an implicit transposition obtained by

    thrust::make_permutation_iterator(d_matrix.begin(),                 
            thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                    (_1 % Nrows) * Ncols + _1 / Nrows))
    

    Here is the full code:

    #include 
    
    #include 
    #include 
    #include 
    #include 
    #include 
    #include 
    #include 
    
    #include 
    #include 
    
    #include "Utilities.cuh"
    #include "TimingGPU.cuh"
    
    using namespace thrust::placeholders;
    
    // --- Required for approach #2
    __device__ float *vals;
    
    /**************************************************************/
    /* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
    /**************************************************************/
    template 
    struct linear_index_to_row_index : public thrust::unary_function {
    
        T Ncols; // --- Number of columns
    
        __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
    
        __host__ __device__ T operator()(T i) { return i / Ncols; }
    };
    
    /******************************************/
    /* ROW_REDUCTION - NEEDED FOR APPROACH #2 */
    /******************************************/
    struct col_reduction {
    
        const int Nrows;    // --- Number of rows
        const int Ncols;    // --- Number of cols
    
        col_reduction(int _Nrows, int _Ncols) : Nrows(_Nrows), Ncols(_Ncols) {}
    
        __device__ float operator()(float& x, int& y ) {
            float temp = 0.f;
            for (int i = 0; i
    struct MulC: public thrust::unary_function
    {
        T C;
        __host__ __device__ MulC(T c) : C(c) { }
        __host__ __device__ T operator()(T x) { return x * C; }
    };
    
    /********/
    /* MAIN */
    /********/
    int main()
    {
        const int Nrows = 5;     // --- Number of rows
        const int Ncols = 8;     // --- Number of columns
    
        // --- Random uniform integer distribution between 10 and 99
        thrust::default_random_engine rng;
        thrust::uniform_int_distribution dist(10, 99);
    
        // --- Matrix allocation and initialization
        thrust::device_vector d_matrix(Nrows * Ncols);
        for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);
    
        TimingGPU timerGPU;
    
        /***************/
        /* APPROACH #1 */
        /***************/
        timerGPU.StartCounter();
        // --- Allocate space for row sums and indices
        thrust::device_vector d_col_sums(Ncols);
        thrust::device_vector d_col_indices(Ncols);
    
        // --- Compute row sums by summing values with equal row indices
        thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator(0), linear_index_to_row_index(Nrows)),
                              thrust::make_transform_iterator(thrust::counting_iterator(0), linear_index_to_row_index(Nrows)) + (Nrows*Ncols),
                              thrust::make_permutation_iterator(
                                    d_matrix.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                              d_col_indices.begin(),
                              d_col_sums.begin(),
                              thrust::equal_to(),
                              thrust::plus());
    
        //thrust::reduce_by_key(
     //               thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index(Nrows)),
     //               thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index(Nrows)) + (Nrows*Ncols),
     //               thrust::make_permutation_iterator(
        //              d_matrix.begin(),
        //              thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
     //               thrust::make_discard_iterator(),
     //               d_col_sums.begin());
    
        printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());
    
        // --- Print result
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums[j] << "\n";
        }
    
        /***************/
        /* APPROACH #2 */
        /***************/
        timerGPU.StartCounter();
        thrust::device_vector d_col_sums_2(Ncols, 0);
        float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]);
        gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
        thrust::transform(d_col_sums_2.begin(), d_col_sums_2.end(), thrust::counting_iterator(0), d_col_sums_2.begin(), col_reduction(Nrows, Ncols));
    
        printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());
    
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_2[j] << "\n";
        }
    
        /***************/
        /* APPROACH #3 */
        /***************/
    
        timerGPU.StartCounter();
        thrust::device_vector d_col_sums_3(Ncols, 0);
        thrust::device_vector d_temp(Nrows * Ncols);
        thrust::inclusive_scan_by_key(
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index(Nrows)),
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index(Nrows)) + (Nrows*Ncols),
                    thrust::make_permutation_iterator(
                            d_matrix.begin(),
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                    d_temp.begin());
        thrust::copy(
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Nrows - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC(Nrows))),
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Nrows - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC(Nrows))) + Ncols,
                    d_col_sums_3.begin());
    
        printf("Timing for approach #3 = %f\n", timerGPU.GetCounter());
    
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_3[j] << "\n";
        }
    
        /***************/
        /* APPROACH #4 */
        /***************/
        cublasHandle_t handle;
    
        timerGPU.StartCounter();
        cublasSafeCall(cublasCreate(&handle));
    
        thrust::device_vector d_col_sums_4(Ncols);
        thrust::device_vector d_ones(Nrows, 1.f);
    
        float alpha = 1.f;
        float beta  = 0.f;
        cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols, 
                                   thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_col_sums_4.data()), 1));
    
        printf("Timing for approach #4 = %f\n", timerGPU.GetCounter());
    
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_4[j] << "\n";
        }
    
        return 0;
    }
    

提交回复
热议问题