I have a M*N host memory matrix, and upon copying into a device memory, I need it to be transposed into a N*M matrix. Is there any cuda (cuBLAS...)
To answer your question on efficiency, I have compared two ways to perform matrix transposition, one using the Thrust library and one using cublas, as suggested by Robert Crovella. The result of the comparison is the following on a Kepler K20c card:
| Matrix size | Thrust [ms] | cuBLAS [ms] |
| | | |
| 32x32 | 0.015 | 0.016 |
| 64x64 | 0.015 | 0.017 |
| 128x128 | 0.019 | 0.017 |
| 256x256 | 0.028 | 0.017 |
| 512x512 | 0.088 | 0.042 |
| 1024x1024 | 0.34 | 0.13 |
| 2048x2048 | 1.24 | 0.48 |
| 4096x4096 | 11.02 | 1.98 |
As it can be seen, the cublas outperforms the version using Thrust. Below is the code to perform the comparison.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/**********************/
/* cuBLAS ERROR CHECK */
/**********************/
#ifndef cublasSafeCall
#define cublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
#endif
inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
{
if( CUBLAS_STATUS_SUCCESS != err) {
fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err);
getch(); cudaDeviceReset(); assert(0);
}
}
// convert a linear index to a linear index in the transpose
struct transpose_index : public thrust::unary_function
{
size_t m, n;
__host__ __device__
transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {}
__host__ __device__
size_t operator()(size_t linear_index)
{
size_t i = linear_index / n;
size_t j = linear_index % n;
return m * j + i;
}
};
// convert a linear index to a row index
struct row_index : public thrust::unary_function
{
size_t n;
__host__ __device__
row_index(size_t _n) : n(_n) {}
__host__ __device__
size_t operator()(size_t i)
{
return i / n;
}
};
// transpose an M-by-N array
template
void transpose(size_t m, size_t n, thrust::device_vector& src, thrust::device_vector& dst)
{
thrust::counting_iterator indices(0);
thrust::gather
(thrust::make_transform_iterator(indices, transpose_index(n, m)),
thrust::make_transform_iterator(indices, transpose_index(n, m)) + dst.size(),
src.begin(),dst.begin());
}
// print an M-by-N array
template
void print(size_t m, size_t n, thrust::device_vector& d_data)
{
thrust::host_vector h_data = d_data;
for(size_t i = 0; i < m; i++)
{
for(size_t j = 0; j < n; j++)
std::cout << std::setw(8) << h_data[i * n + j] << " ";
std::cout << "\n";
}
}
int main(void)
{
size_t m = 5; // number of rows
size_t n = 4; // number of columns
// 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ]
thrust::device_vector data(m * n, 1.);
data[1] = 2.;
data[3] = 3.;
std::cout << "Initial array" << std::endl;
print(m, n, data);
std::cout << "Transpose array - Thrust" << std::endl;
thrust::device_vector transposed_thrust(m * n);
transpose(m, n, data, transposed_thrust);
print(n, m, transposed_thrust);
std::cout << "Transpose array - cuBLAS" << std::endl;
thrust::device_vector transposed_cuBLAS(m * n);
double* dv_ptr_in = thrust::raw_pointer_cast(data.data());
double* dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data());
double alpha = 1.;
double beta = 0.;
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, dv_ptr_in, n, &beta, dv_ptr_in, n, dv_ptr_out, m));
print(n, m, transposed_cuBLAS);
getch();
return 0;
}