What is the most efficient way to transpose a matrix in CUDA?

后端未结

关注

 3  1785

予麋鹿 2021-01-06 16:27

I have a M*N host memory matrix, and upon copying into a device memory, I need it to be transposed into a N*M matrix. Is there any cuda (cuBLAS...)

3条回答

遥遥无期 (楼主)

2021-01-06 17:10

To answer your question on efficiency, I have compared two ways to perform matrix transposition, one using the Thrust library and one using cublasgeam, as suggested by Robert Crovella. The result of the comparison is the following on a Kepler K20c card:

| Matrix size   | Thrust [ms]   | cuBLAS [ms]   |
|               |               |               |
| 32x32         | 0.015         | 0.016         |
| 64x64         | 0.015         | 0.017         |
| 128x128       | 0.019         | 0.017         |
| 256x256       | 0.028         | 0.017         |
| 512x512       | 0.088         | 0.042         |
| 1024x1024     | 0.34          | 0.13          |
| 2048x2048     | 1.24          | 0.48          |
| 4096x4096     | 11.02         | 1.98          |

As it can be seen, the cublasgeam outperforms the version using Thrust. Below is the code to perform the comparison.

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err)     __cublasSafeCall(err, __FILE__, __LINE__)
#endif

inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
    if( CUBLAS_STATUS_SUCCESS != err) {
        fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err); 
        getch(); cudaDeviceReset(); assert(0); 
    }
}

// convert a linear index to a linear index in the transpose 
struct transpose_index : public thrust::unary_function
{
    size_t m, n;

    __host__ __device__
    transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}

    __host__ __device__
    size_t operator()(size_t linear_index)
    {
        size_t i = linear_index / n;
        size_t j = linear_index % n;

        return m * j + i;
    }
};

// convert a linear index to a row index
struct row_index : public thrust::unary_function
{
    size_t n;

    __host__ __device__
    row_index(size_t _n) : n(_n) {}

    __host__ __device__

    size_t operator()(size_t i)
    {
        return i / n;
    }
};

// transpose an M-by-N array
template 
void transpose(size_t m, size_t n, thrust::device_vector& src, thrust::device_vector& dst)
{
    thrust::counting_iterator indices(0);

    thrust::gather
    (thrust::make_transform_iterator(indices, transpose_index(n, m)),
    thrust::make_transform_iterator(indices, transpose_index(n, m)) + dst.size(),
    src.begin(),dst.begin());
}

// print an M-by-N array
template 
void print(size_t m, size_t n, thrust::device_vector& d_data)
{
    thrust::host_vector h_data = d_data;

    for(size_t i = 0; i < m; i++)
    {
        for(size_t j = 0; j < n; j++)
            std::cout << std::setw(8) << h_data[i * n + j] << " ";
            std::cout << "\n";
    }
}

int main(void)
{
    size_t m = 5; // number of rows
    size_t n = 4; // number of columns

    // 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
    thrust::device_vector data(m * n, 1.);
    data[1] = 2.;
    data[3] = 3.;

    std::cout << "Initial array" << std::endl;
    print(m, n, data);

    std::cout << "Transpose array - Thrust" << std::endl;
    thrust::device_vector transposed_thrust(m * n);
    transpose(m, n, data, transposed_thrust);
    print(n, m, transposed_thrust);

    std::cout << "Transpose array - cuBLAS" << std::endl;
    thrust::device_vector transposed_cuBLAS(m * n);
    double* dv_ptr_in  = thrust::raw_pointer_cast(data.data());
    double* dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data());
    double alpha = 1.;
    double beta  = 0.;
    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));
    cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, dv_ptr_in, n, &beta, dv_ptr_in, n, dv_ptr_out, m)); 
    print(n, m, transposed_cuBLAS);

    getch();

    return 0;
}

0 讨论(0)

查看其它3个回答