Singular values calculation only with CUDA

I'm trying to use the new cusolverDnSgesvd routine of CUDA 7.0 for the calculation of the singular values. The full code is reported below:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <iostream>

#include <cusolverDn.h>
#include <cuda_runtime_api.h>

/*
 * Checks the result of a CUDA runtime call.
 *
 * code  : status returned by the CUDA runtime call
 * file  : source file of the call site (normally __FILE__)
 * line  : source line of the call site (normally __LINE__)
 * abort : when true (default), the process exits with `code` as exit status
 *
 * On failure the error string, file and line are printed to stderr.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) { exit(code); }
   }
}

/*
 * Wraps a CUDA runtime call with gpuAssert. This has to be a macro: __FILE__ and
 * __LINE__ must expand at the call site, otherwise every failure would be
 * reported at the location of this helper instead of the failing call.
 */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/* MAIN */
/*
 * Computes the singular values of an M x N single-precision matrix with
 * cusolverDnSgesvd (jobu = 'N', jobvt = 'N'), prints the returned status,
 * devInfo and the singular values, then releases all resources.
 *
 * Returns 0 on completion, 1 if a host allocation or the cuSOLVER handle
 * creation fails. CUDA runtime failures terminate through gpuErrchk.
 */
int main(){

    const int M = 10;
    const int N = 10;

    // --- Setting the host matrix (element (i, j) is stored at j*M + i, i.e. leading dimension M)
    float *h_A = (float *)malloc(M * N * sizeof(float));

    // --- host side SVD results space
    float *h_U = (float *)malloc(M * M * sizeof(float));
    float *h_V = (float *)malloc(N * N * sizeof(float));
    float *h_S = (float *)malloc(N *     sizeof(float));

    if (h_A == NULL || h_U == NULL || h_V == NULL || h_S == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A); free(h_U); free(h_V); free(h_S);
        return 1;
    }

    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            h_A[j*M + i] = (float)((i + j) * (i + j));
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A;         gpuErrchk(cudaMalloc(&d_A,      M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo;       gpuErrchk(cudaMalloc(&devInfo,          sizeof(int)));
    float *d_U;         gpuErrchk(cudaMalloc(&d_U,      M * M * sizeof(float)));
    float *d_V;         gpuErrchk(cudaMalloc(&d_V,      N * N * sizeof(float)));
    float *d_S;         gpuErrchk(cudaMalloc(&d_S,      N *     sizeof(float)));

    cusolverStatus_t stat;

    // --- CUDA solver initialization: the handle must be created before any other cuSOLVER call
    cusolverDnHandle_t solver_handle;
    stat = cusolverDnCreate(&solver_handle);
    if (stat != CUSOLVER_STATUS_SUCCESS) {
        std::cout << "Initialization of cuSolver failed. \n";
        cudaFree(d_A); cudaFree(d_U); cudaFree(d_V); cudaFree(d_S); cudaFree(devInfo);
        free(h_A); free(h_U); free(h_V); free(h_S);
        return 1;
    }

    stat = cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size);
    if (stat != CUSOLVER_STATUS_SUCCESS) std::cout << "Initialization of cuSolver failed. \n";

    float *work;    gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));
    //float *rwork; gpuErrchk(cudaMalloc(&rwork, work_size * sizeof(float)));

    // --- CUDA SVD execution
    //stat = cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);
    stat = cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo);

    // The blocking copy below also waits for the solver to finish writing devInfo
    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << "devInfo = " << devInfo_h << "\n";

    switch (stat) {
        case CUSOLVER_STATUS_SUCCESS:           std::cout << "SVD computation success\n";                       break;
        case CUSOLVER_STATUS_NOT_INITIALIZED:   std::cout << "Library cuSolver not initialized correctly\n";    break;
        case CUSOLVER_STATUS_INVALID_VALUE:     std::cout << "Invalid parameters passed\n";                     break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:    std::cout << "Internal operation failed\n";                     break;
        default:                                std::cout << "cuSolver returned status " << (int)stat << "\n"; break;
    }

    if (devInfo_h == 0 && stat == CUSOLVER_STATUS_SUCCESS) std::cout    << "SVD successful\n\n";

    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));

    for(int i = 0; i < N; i++) std::cout << "d_S["<<i<<"] = " << h_S[i] << std::endl;

    // --- Releasing the solver handle, device buffers and host buffers
    cusolverDnDestroy(solver_handle);
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);

    return 0;
}


If I ask for the computation of the full SVD (commented line with jobu = 'A' and jobvt = 'A'), everything works fine. If I ask for the computation of the singular values only (line with jobu = 'N' and jobvt = 'N'), cusolverDnSgesvd returns CUSOLVER_STATUS_INVALID_VALUE ("Invalid parameters passed").


Please note that, in this case devInfo = 0, so I cannot spot the invalid parameter.

Please also note that the documentation PDF lacks information about the rwork parameter, so I have treated it as a dummy parameter and passed NULL.


At this time the cuSolver gesvd function only supports jobu = 'A' and jobvt = 'A'

So the error when you specify other combinations is expected. From the documentation:

Remark 2: gesvd only supports jobu='A' and jobvt='A' and returns matrix U and VH


USE OF cusolverDn<t>gesvd (cusolverDnSgesvd)

As remarked by lebedov, as of CUDA 8.0, it is now possible to calculate the singular values only by cusolverDnSgesvd. I report below a slightly modified version of your code with two calls to cusolverDnSgesvd, one performing the singular values calculation only

cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo)

and one performing the full SVD calculation

cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo)

As you already remarked, the two 'A' fields of the full SVD case are changed to 'N' in the singular-values-only case. Please note that, in the singular-values-only case, there is no need to allocate space for the singular vector matrices U and V; a NULL pointer is passed instead.

The singular values calculation only is faster than the full SVD calculation. On a GTX 960, for a 1000x1000 matrix, the timing has been the following:

Singular values only: 559 ms
Full SVD: 2239 ms

Here is the full code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>


#include <cusolverDn.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

/* MAIN */
/*
 * Times two cusolverDnSgesvd runs on the same M x N single-precision matrix:
 * first the singular values only (jobu = 'N', jobvt = 'N'), then the full SVD
 * (jobu = 'A', jobvt = 'A'), and prints the devInfo outcome and elapsed time
 * of each run.
 *
 * gpuErrchk, cusolveSafeCall and TimingGPU come from the project headers
 * Utilities.cuh and TimingGPU.cuh, which are not part of this listing.
 */
int main(){

    const int M = 1000;
    const int N = 1000;

    TimingGPU timerGPU;
    float     elapsedTime;

    // --- Setting the host matrix (element (i, j) is stored at j*M + i, i.e. leading dimension M)
    float *h_A = (float *)malloc(M * N * sizeof(float));

    // --- host side SVD results space
    float *h_U = (float *)malloc(M * M * sizeof(float));
    float *h_V = (float *)malloc(N * N * sizeof(float));
    float *h_S = (float *)malloc(N *     sizeof(float));

    if (h_A == NULL || h_U == NULL || h_V == NULL || h_S == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A); free(h_U); free(h_V); free(h_S);
        return 1;
    }

    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            h_A[j*M + i] = (float)((i + j) * (i + j));
        }
    }

    // --- Setting the device matrix and moving the host matrix to the device
    float *d_A;         gpuErrchk(cudaMalloc(&d_A, M * N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    int work_size = 0;

    int *devInfo;       gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));
    float *d_U;         gpuErrchk(cudaMalloc(&d_U, M * M * sizeof(float)));
    float *d_V;         gpuErrchk(cudaMalloc(&d_V, N * N * sizeof(float)));
    float *d_S;         gpuErrchk(cudaMalloc(&d_S, N *     sizeof(float)));

    // --- CUDA solver initialization: the handle must be created before any other cuSOLVER call
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    cusolveSafeCall(cusolverDnSgesvd_bufferSize(solver_handle, M, N, &work_size));

    float *work;    gpuErrchk(cudaMalloc(&work, work_size * sizeof(float)));

    // --- CUDA SVD execution - Singular values only
    // NOTE(review): assumes TimingGPU exposes StartCounter() as the counterpart of GetCounter() - confirm against TimingGPU.cuh
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'N', 'N', M, N, d_A, M, d_S, NULL, M, NULL, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    int devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the singular values calculation only\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the singular values calculation only. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the singular values calculation only. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

    printf("Calculation of the singular values only: %f ms\n\n", elapsedTime);

    // --- Moving the results from device to host
    //gpuErrchk(cudaMemcpy(h_S, d_S, N * sizeof(float), cudaMemcpyDeviceToHost));
    //for (int i = 0; i < N; i++) std::cout << "d_S[" << i << "] = " << h_S[i] << std::endl;

    // --- gesvd destroys the contents of A on exit, so the input is restored
    //     before the second run; otherwise the full SVD would be timed on a different matrix
    gpuErrchk(cudaMemcpy(d_A, h_A, M * N * sizeof(float), cudaMemcpyHostToDevice));

    // --- CUDA SVD execution - Full SVD
    timerGPU.StartCounter();
    cusolveSafeCall(cusolverDnSgesvd(solver_handle, 'A', 'A', M, N, d_A, M, d_S, d_U, M, d_V, N, work, work_size, NULL, devInfo));
    elapsedTime = timerGPU.GetCounter();

    devInfo_h = 0;
    gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h == 0)
        printf("SVD successfull for the full SVD calculation\n\n");
    else if (devInfo_h < 0)
        printf("SVD unsuccessfull for the full SVD calculation. Parameter %i is wrong\n", -devInfo_h);
    else
        printf("SVD unsuccessfull for the full SVD calculation. A number of %i superdiagonals of an intermediate bidiagonal form did not converge to zero\n", devInfo_h);

    printf("Calculation of the full SVD calculation: %f ms\n\n", elapsedTime);

    // --- Releasing the solver handle, device buffers and host buffers
    cusolveSafeCall(cusolverDnDestroy(solver_handle));
    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(devInfo));
    gpuErrchk(cudaFree(d_A));
    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);

    return 0;
}



I have compared the performance of the singular-values-only calculation and the full SVD computation for CUDA 8.0, CUDA 9.1 and CUDA 10.0, for a 5000x5000 matrix. Here are the results on a GTX 960.

Computation type               CUDA 8.0     CUDA 9.1     CUDA 10.0     

Singular values only           17s          15s          15s
Full SVD                       161s         159s         457s

