I know it sound weird, but here is my scenario:
I need to do a matrix-matrix multiplication (A(n*k)*B(k*n)), but I only needs the diagonal elements to be evaluated f
Here has a code example for your problem, I think this code link could help you. Thanks the Github's author.
__global__ void invokeDeviceCublasSgemm(cublasStatus_t *returnValue,
int n,
const float *d_alpha,
const float *d_A,
const float *d_B,
const float *d_beta,
float *d_C)
{
cublasHandle_t cnpHandle;
cublasStatus_t status = cublasCreate(&cnpHandle);
if (status != CUBLAS_STATUS_SUCCESS){
*returnValue = status;
return;
}
/* Perform operation using cublas */
status = cublasSgemm(cnpHandle,
CUBLAS_OP_N, CUBLAS_OP_N,
n, n, n,
d_alpha,
d_A, n,
d_B, n,
d_beta,
d_C, n);
cublasDestroy(cnpHandle);
*returnValue = status;
}