Reduce matrix columns with CUDA

Backend · unresolved · 2 answers · 512 views

Asked by 北恋 on 2020-12-16 08:20

I have a matrix and I would like to use CUDA to compute, in the fastest possible way, the column-wise mean (which boils down to simply the sum), i.e., to return a row vector containing the mean of every column of the matrix.

2 Answers
  • talonmies · 2020-12-16 09:07

    What is stopping you from doing something like this:

    template<typename T>
    __global__ void kernelSum(const T* __restrict__ input, 
                              T* __restrict__ per_block_results, 
                              const size_t lda, const size_t n)
    {
        extern __shared__ T sdata[];
    
        // Accumulate per thread partial sum
        T x = 0.0;
        const T * p = &input[blockIdx.x * lda];
        for(int i=threadIdx.x; i < n; i += blockDim.x) {
            x += p[i];
        }
    
        // load partial sum into __shared__ memory
        sdata[threadIdx.x] = x;
        __syncthreads();
    
        // contiguous range pattern
        for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
            if(threadIdx.x < offset) {
                // add a partial sum upstream to our own
                sdata[threadIdx.x] += sdata[threadIdx.x + offset];
            }
            // wait until all threads in the block have
            // updated their partial sums
            __syncthreads();
        }
    
        // thread 0 writes the final result
        if(threadIdx.x == 0) {
            per_block_results[blockIdx.x] = sdata[0];
        }
    }
    

    [standard disclaimer: written in browser, never compiled or tested, use at own risk]

    i.e. you only need one entry in sdata for each thread in the block for the shared memory reduction. Each thread sums as many values as required to cover the full column input. Then there is no shared memory restriction, and you can sum any size column with the same block size.
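
    Note that the question asks for the column-wise mean rather than the sum, so one extra scaling step is needed. A minimal sketch, assuming the kernel above has already written the per-column sums (the kernelScale name and signature are hypothetical, not part of the original answer):

    // Hypothetical follow-up kernel: convert per-column sums into means
    // by dividing each entry by the column length n. Launch with enough
    // threads to cover ncols, e.g. <<<1, ncols>>> for small column counts.
    template<typename T>
    __global__ void kernelScale(T* __restrict__ sums, const size_t ncols, const size_t n)
    {
        size_t col = blockIdx.x * blockDim.x + threadIdx.x;
        if (col < ncols) sums[col] /= (T)n;
    }

    Equivalently, the division by n can be folded into the final write in kernelSum itself.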


    EDIT: Apparently the idea of using per thread partial sums is new to you, so here is a complete example to study:

    #include <iostream>
    
    template<typename T>
    __global__ 
    void kernelSum(const T* __restrict__ input, 
            const size_t lda, // pitch of input in words of sizeof(T)
            T* __restrict__ per_block_results, 
                    const size_t n)
    {
        extern __shared__ T sdata[];
    
        T x = 0.0;
        const T * p = &input[blockIdx.x * lda];
        // Accumulate per thread partial sum
        for(int i=threadIdx.x; i < n; i += blockDim.x) {
            x += p[i];
        }
    
        // load thread partial sum into shared memory
        sdata[threadIdx.x] = x;
        __syncthreads();
    
        for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
            if(threadIdx.x < offset) {
                sdata[threadIdx.x] += sdata[threadIdx.x + offset];
            }
            __syncthreads();
        }
    
        // thread 0 writes the final result
        if(threadIdx.x == 0) {
            per_block_results[blockIdx.x] = sdata[0];
        }
    }
    
    
    int main(void)
    {
        const int m = 10000, n = 16;
        float * a = new float[m*n];
    
        for(int i=0; i<(m*n); i++) { a[i] = (float)(i%10); }
    
        float *a_;
        size_t size_a = m * n * sizeof(float);
        cudaMalloc((void **)&a_, size_a);
        cudaMemcpy(a_, a, size_a, cudaMemcpyHostToDevice);
    
        float *b_;
        size_t size_b = n * sizeof(float);
        cudaMalloc((void **)&b_, size_b);
    
        // select number of warps per block according to size of the
        // column and launch one block per column. Probably makes sense
        // to have at least a 4:1 column size to block size ratio
        dim3 blocksize(256);
        dim3 gridsize(n);
        size_t shmsize = sizeof(float) * (size_t)blocksize.x;
        kernelSum<float><<<gridsize, blocksize, shmsize>>>(a_, m, b_, m);
    
        float * b = new float[n];
        cudaMemcpy(b, b_, size_b, cudaMemcpyDeviceToHost);
    
        for(int i=0; i<n; i++) {
           std::cout << i << " " << b[i] << std::endl;
        }
    
        cudaDeviceReset();
    
        return 0;
    } 
    

    You should experiment with the block size relative to your matrix size for optimal performance, but in general the more work per thread the kernel does, the better the overall performance will be (because the shared memory reduction is quite expensive). You can see one approach to block and grid size heuristics for a similarly memory bandwidth bound problem in this answer.
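
    As a concrete starting point for that experimentation, here is a minimal heuristic sketch (pickBlockSize is a hypothetical helper, not part of the answer's code) that just encodes the 4:1 column-size-to-block-size suggestion from the comments above:

    // Hypothetical heuristic: grow the block size in powers of two from
    // one warp up to 256 threads while keeping at least 4 column entries
    // per thread. The kernel's reduction loop assumes a power-of-two
    // block size, which this preserves.
    unsigned int pickBlockSize(size_t n)
    {
        unsigned int b = 32;                       // one warp minimum
        while (b < 256 && (size_t)b * 2 * 4 <= n)  // keep n / b >= 4
            b <<= 1;
        return b;
    }

    It would be used as dim3 blocksize(pickBlockSize(m)); before the launch above.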

  • 2020-12-16 09:13

    As alternatives to the answer already provided by talonmies, here I report 4 other approaches to column reduction: 3 of them based on CUDA Thrust and 1 based on cublas<t>gemv() with a column of 1's, as suggested in my comment above.

    The CUDA Thrust approaches are analogous to those of Reduce matrix rows with CUDA, with an implicit transposition obtained by

    thrust::make_permutation_iterator(d_matrix.begin(),                 
            thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                    (_1 % Nrows) * Ncols + _1 / Nrows))
    

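    To see concretely what this iterator chain does, here is a minimal host-only sketch (illustrative, not part of the answer's code) of the index map: walking the counting index k through (_1 % Nrows) * Ncols + _1 / Nrows visits a row-major matrix column by column, which is the implicit transposition:

    #include <cstdio>

    int main()
    {
        // For a row-major Nrows x Ncols matrix, mapping the counting
        // index k to (k % Nrows) * Ncols + k / Nrows gives a column-major
        // traversal, i.e. an implicit transposition.
        const int Nrows = 2, Ncols = 3;
        for (int k = 0; k < Nrows * Ncols; k++)
            std::printf("k = %d -> element %d\n", k, (k % Nrows) * Ncols + k / Nrows);
        // prints 0 3 1 4 2 5: the two entries of each of the 3 columns
        return 0;
    }
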
    Here is the full code:

    #include <cublas_v2.h>
    
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/generate.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    #include <thrust/random.h>
    #include <thrust/sequence.h>
    
    #include <stdio.h>
    #include <iostream>
    
    #include "Utilities.cuh"
    #include "TimingGPU.cuh"
    
    using namespace thrust::placeholders;
    
    // --- Required for approach #2
    __device__ float *vals;
    
    /**************************************************************/
    /* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
    /**************************************************************/
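    // --- Note: for the column reduction below, this functor is
    //     instantiated with Nrows rather than Ncols, because the
    //     permutation iterator already presents the matrix in
    //     transposed (column-major) order.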
    template <typename T>
    struct linear_index_to_row_index : public thrust::unary_function<T,T> {
    
        T Ncols; // --- Number of columns
    
        __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
    
        __host__ __device__ T operator()(T i) { return i / Ncols; }
    };
    
    /*********************************************/
    /* COLUMN REDUCTION - NEEDED FOR APPROACH #2 */
    /*********************************************/
    struct col_reduction {
    
        const int Nrows;    // --- Number of rows
        const int Ncols;    // --- Number of cols
    
        col_reduction(int _Nrows, int _Ncols) : Nrows(_Nrows), Ncols(_Ncols) {}
    
        __device__ float operator()(float& x, int& y ) {
            float temp = 0.f;
            for (int i = 0; i<Nrows; i++) {
                temp += vals[y + (i*Ncols)];
            }
            return temp;
        }
    };
    
    /**************************/
    /* NEEDED FOR APPROACH #3 */
    /**************************/
    template<typename T>
    struct MulC: public thrust::unary_function<T, T>
    {
        T C;
        __host__ __device__ MulC(T c) : C(c) { }
        __host__ __device__ T operator()(T x) { return x * C; }
    };
    
    /********/
    /* MAIN */
    /********/
    int main()
    {
        const int Nrows = 5;     // --- Number of rows
        const int Ncols = 8;     // --- Number of columns
    
        // --- Random uniform integer distribution between 10 and 99
        thrust::default_random_engine rng;
        thrust::uniform_int_distribution<int> dist(10, 99);
    
        // --- Matrix allocation and initialization
        thrust::device_vector<float> d_matrix(Nrows * Ncols);
        for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);
    
        TimingGPU timerGPU;
    
        /***************/
        /* APPROACH #1 */
        /***************/
        timerGPU.StartCounter();
        // --- Allocate space for column sums and indices
        thrust::device_vector<float> d_col_sums(Ncols);
        thrust::device_vector<int> d_col_indices(Ncols);
    
        // --- Compute column sums by summing values with equal column indices
        thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)),
                              thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                              thrust::make_permutation_iterator(
                                    d_matrix.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                              d_col_indices.begin(),
                              d_col_sums.begin(),
                              thrust::equal_to<int>(),
                              thrust::plus<float>());
    
        //thrust::reduce_by_key(
        //            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
        //            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
        //            thrust::make_permutation_iterator(
        //                    d_matrix.begin(),
        //                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
        //            thrust::make_discard_iterator(),
        //            d_col_sums.begin());
    
        printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());
    
        // --- Print result
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums[j] << "\n";
        }
    
        /***************/
        /* APPROACH #2 */
        /***************/
        timerGPU.StartCounter();
        thrust::device_vector<float> d_col_sums_2(Ncols, 0);
        float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]);
        gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
        thrust::transform(d_col_sums_2.begin(), d_col_sums_2.end(), thrust::counting_iterator<int>(0), d_col_sums_2.begin(), col_reduction(Nrows, Ncols));
    
        printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());
    
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_2[j] << "\n";
        }
    
        /***************/
        /* APPROACH #3 */
        /***************/
    
        timerGPU.StartCounter();
        thrust::device_vector<float> d_col_sums_3(Ncols, 0);
        thrust::device_vector<float> d_temp(Nrows * Ncols);
        thrust::inclusive_scan_by_key(
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                    thrust::make_permutation_iterator(
                            d_matrix.begin(),
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                    d_temp.begin());
        thrust::copy(
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Nrows - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))),
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Nrows - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))) + Ncols,
                    d_col_sums_3.begin());
    
        printf("Timing for approach #3 = %f\n", timerGPU.GetCounter());
    
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_3[j] << "\n";
        }
    
        /***************/
        /* APPROACH #4 */
        /***************/
        cublasHandle_t handle;
    
        timerGPU.StartCounter();
        cublasSafeCall(cublasCreate(&handle));
    
        thrust::device_vector<float> d_col_sums_4(Ncols);
        thrust::device_vector<float> d_ones(Nrows, 1.f);
    
        float alpha = 1.f;
        float beta  = 0.f;
        cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols, 
                                   thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_col_sums_4.data()), 1));
    
        printf("Timing for approach #4 = %f\n", timerGPU.GetCounter());
    
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_4[j] << "\n";
        }
    
        return 0;
    }
    
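    A note on why approach #4 works: cuBLAS assumes column-major storage, so it sees the row-major Nrows x Ncols matrix as its transpose, a column-major Ncols x Nrows matrix with leading dimension Ncols. Multiplying that view (CUBLAS_OP_N) by a ones vector of length Nrows therefore yields the Ncols column sums directly. A host-only sketch of the same arithmetic (illustrative, not part of the answer's code):

    #include <cstdio>

    int main()
    {
        // Row-major 2 x 3 matrix; cuBLAS would see it as a column-major
        // 3 x 2 matrix B with B(i,j) = A(j,i) and leading dimension Ncols.
        const int Nrows = 2, Ncols = 3;
        const float A[Nrows * Ncols] = {1.f, 2.f, 3.f,
                                        4.f, 5.f, 6.f};
        for (int i = 0; i < Ncols; i++) {      // one output per column of A
            float y = 0.f;
            for (int j = 0; j < Nrows; j++)
                y += A[i + j * Ncols] * 1.f;   // B(i,j) * ones[j]
            std::printf("column %d sum = %g\n", i, y);  // prints 5, 7, 9
        }
        return 0;
    }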