Varying results from cuBlas

六月ゝ 毕业季﹏ 提交于 2019-12-02 14:53:28

问题


I have implemented the following CUDA code but i am a little bit confused about the behavior.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <ctime>
#include <chrono>
#include <string>

#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1)) 

void PrintMatrix(float* a, int n)
{
    int j, i;
    for (j = 1; j <= n; j++)
    {
        for (i = 1; i <= n; i++)
        {
            printf("%7.0f", a[IDX2F(i, j, n)]);
        }
        printf("\n");
    }
}

float* CreateMatrix(int n)
{
    float* matrix = static_cast<float *>(malloc(n * n * sizeof(float)));
    if (!matrix)
    {
        printf("host memory allocation failed");
        return nullptr;
    }

    for (int j = 1; j <= n; j++)
    {
        for (int i = 1; i <= n; i++)
        {
            matrix[IDX2F(i, j, n)] = 2;
        }
    }

    return matrix;
}

long CudaMatrixMultiply(float* matrix, int n)
{
    cudaError_t cudaStat;
    cublasStatus_t status;
    cublasHandle_t handle;
    float* deviceMatrix;

    cudaStat = cudaMalloc(reinterpret_cast<void**>(&deviceMatrix), n * n * sizeof(float));
    if (cudaStat != cudaSuccess)
    {
        printf("device memory allocation failed");
        return EXIT_FAILURE;
    }

    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("CUBLAS initialization failed\n");
        return EXIT_FAILURE;
    }

    status = cublasSetMatrix(n, n, sizeof(float), matrix, n, deviceMatrix, n);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("data download failed");
        cudaFree(deviceMatrix);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    float alpha = 1;
    float beta = 0;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, deviceMatrix, n, deviceMatrix, n, &beta, deviceMatrix, n);

    status = cublasGetMatrix(n, n, sizeof(float), deviceMatrix, n, matrix, n);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("data upload failed");
        cudaFree(deviceMatrix);
        cublasDestroy(handle);
        return EXIT_FAILURE;
    }

    cudaFree(deviceMatrix);
    cublasDestroy(handle);
    return EXIT_SUCCESS;
}

float* CpuMatrixMultiply(float* matrix, int size)
{
    float* result = new float[size * size]();

    // Copied from https://msdn.microsoft.com/en-us/library/hh873134.aspx
    for (int row = 1; row <= size; row++) 
    {
        for (int col = 1; col <= size; col++) 
        {
            // Multiply the row of A by the column of B to get the row, column of product.
            for (int inner = 1; inner <= size; inner++) 
            {
                // result[row][col] += matrix[row][inner] * matrix[inner][col];
                result[IDX2F(col, row, size)] += matrix[IDX2F(inner, row, size)] * matrix[IDX2F(col, inner, size)];
            }
        }
    }

    free(matrix);
    return result;
}

int main(void)
{
    // printf("Matrix * Matrix Test\n");
    int size = 1000;
    int runs = 10;

    for (int run = 0; run != runs; run++)
    {
        printf("=== Test %d (Matrix * Matrix, Size = %d) ===\n\n", run + 1, size);
        printf("RAM usage is: %f GB\n", size * size * sizeof(float) / 1000000000.0);

        float* cpuMatrix = CreateMatrix(size);
        cpuMatrix = CpuMatrixMultiply(cpuMatrix, size);

        PrintMatrix(cpuMatrix, 5);

        float* gpuMatrix = CreateMatrix(size);
        CudaMatrixMultiply(gpuMatrix, size);
        PrintMatrix(gpuMatrix, 5);

        free(cpuMatrix);
        free(gpuMatrix);
    }
    getchar();
    return EXIT_SUCCESS;
}

The ouput of the CPU version of the MatrixMultiplication is the following as expected:

4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000
4000 4000 4000 4000 4000

but the result of the GPU computed is sometimes the right one (see above) or a wrong random(?) one. When the loop is executed the first time then the result was always the right one.

I am not able to find a mistake in my code and it would be great if you could help me.


Additionally if i set size (int the main method) to e.g. 16000 then my driver is crashing and i get an error message. For this i have written a bug report to NVidea because my pc crashed twice. But maybe it is a programming fault by me?

Driver: 364.72 (newest one)
SDK: CUDA Toolkit 7.5
Graphics Card: NVidia GeForce GTX 960 (4GB)
Windows 10 64Bit

Driver Error

Display driver NVIDIA Windows kernel Mode Driver, Version 362.72 stopped responding and has successfully recovered.

Edit: With the help of the community i found out that this is a problem with the watchdog timer. See answer below.


回答1:


Regarding the second part of the question, following njuffa's remark, you may change the settings for driver behavior to avoid the error when increasing size. Open NSIGHT Monitor and in Options, General, Microsoft Display Driver, change to False the WDDM TDR enabled field.

From spec, the 32bits FPU flops should be around 2.4 TFLOPS in single precision, hence your operation for a 16000 sized matrix should take at the minimum 3.5 seconds. Hence the Driver Recovery after 2 seconds.



来源:https://stackoverflow.com/questions/36748979/varying-results-from-cublas

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!