Cuda Performance measuring - Elapsed time returns zero

风格不统一 提交于 2019-12-12 02:56:39

问题


I wrote a few kernel function and wonder how many miliseconds to process these functions.

using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#define N 8000

void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

__global__ void add(int* a, int *b) {
    int add = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        add = a[tid] + b[tid];
    }
}

__global__ void subtract(int* a, int *b) {
    int subtract = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        subtract = a[tid] - b[tid];
    }
}

__global__ void multiply(int* a, int *b) {
    int multiply = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        multiply = a[tid] * b[tid];
    }
}

__global__ void divide(int* a, int *b) {
    int divide = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        divide = a[tid] / b[tid];
    }
}

__global__ void modu(int* a, int *b) {
    int modulus = 0;

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        modulus = a[tid] % b[tid];
    }
}

__global__ void neg(int *data) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        data[tid] = -data[tid];
    }
}

float duration(int *devA, int *devB, int blocksPerGrid, int threadsPerBlock) {

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devA);
    neg<<<blocksPerGrid, threadsPerBlock>>>(devB);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void) {

    int a[N], b[N];
    float dur = 0;



    int *devA, *devB;

    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));

    fillArray(a, N);
    fillArray(b, N);

    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devA, b, N * sizeof(int), cudaMemcpyHostToDevice);



    dur = duration(a, b, N, 1);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";

    return 0;
}

Milisecond always return zero. Why? What I'm missing here? If a i remove the neg functions from the duration duration function. It returns 0.15687 ms. I think it is a small number to process these functions. whats wrong with that program?

After edit, I did this:

using namespace std;
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

const int N = 8000;

void fillArray(int *data, int count) {
    for (int i = 0; i < count; i++)
        data[i] = rand() % 100;
}

__global__ void add(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}

__global__ void subtract(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] - b[tid];
    }
}

__global__ void multiply(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] * b[tid];
    }
}

__global__ void divide(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] / b[tid];
    }
}

__global__ void modu(int* a, int *b, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = a[tid] % b[tid];
    }
}

__global__ void neg(int *data, int *c) {

    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid < N) {
        c[tid] = -data[tid];
    }
}

float duration(int *devA, int *devB, int *devC, int blocksPerGrid, int threadsPerBlock) {

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    double hArrayC[N];

    add<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    subtract<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    multiply<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    divide<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    modu<<<blocksPerGrid, threadsPerBlock>>>(devA, devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devA,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    neg<<<blocksPerGrid, threadsPerBlock>>>(devB,devC);
    cudaMemcpy(hArrayC,devC,N*sizeof(int),cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return elapsedTime;
}

int main(void) {

    int a[N], b[N],c[N];
    float dur = 0;

    int *devA, *devB,*devC;

    cudaMalloc((void**) &devA, N * sizeof(int));
    cudaMalloc((void**) &devB, N * sizeof(int));
    cudaMalloc((void**) &devC, N * sizeof(int));

    fillArray(a, N);
    fillArray(b, N);

    cudaMemcpy(devA, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devB, b, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devC, c, N * sizeof(int), cudaMemcpyHostToDevice);




    dur = duration(devA, devB, devC,N, 1);

    cout << "Global memory version:\n";
    cout << "Process completed in " << dur;
    cout << " for a data set of " << N << " integers.";



    cudaFree(devA);
    cudaFree(devB);
    return 0;
}

回答1:


Your kernels are not doing anything, since you only store results in registers. When compiling, you get some warnings:

kernel.cu(13): warning: variable "add" was set but never used

Also, if you want to see some better timings, use NVIDIA's profiler: either nvprof (CLI) or nvvp (GUI).

$ nvprof ./kernel

======== NVPROF is profiling kernel...
======== Command: kernel
Global memory version: Process completed in 0 for a data set of 8000 integers.
======== Profiling result:
  Time(%)     Time   Calls       Avg       Min       Max  Name
  100.00   18.46us       2    9.23us    6.02us   12.45us  [CUDA memcpy HtoD]
    0.00       0ns       1       0ns       0ns       0ns  multiply(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  add(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  modu(int*, int*)
    0.00       0ns       2       0ns       0ns       0ns  neg(int*)
    0.00       0ns       1       0ns       0ns       0ns  subtract(int*, int*)
    0.00       0ns       1       0ns       0ns       0ns  divide(int*, int*)

You are also using N blocks per grid, and 1 thread per block. You should consider reading the answer to this question.

UPDATE

Concerning the vector addition (and the other simple operations) in itself, you should either study the vectorAdd sample of the CUDA SDK, or use Thrust. The first option will teach you how to use CUDA, and the second option will show you the kind of high-level operations you can do with Thrust. If I were you, I would do both.




回答2:


Cuda tasks are running on device without blocking the CPU thread. So the cuda call will block only when you try to get computed data from device memory and it's not ready yet. Or when you explicitly synchronize you CPU thread with GPU using cudaDeviceSyncronize() call. If you want to measure the calculation time you need to synchronize before stopping the timer.

If you will be interested in measuring memory copy time you need to synchronize after calculation started and before the copy timer have started or the calculation time will be shown as copy time.

You can use the profiler that is included in cuda SDK to measure the time of all cuda calls.




回答3:


Try use float (or double) variables and arrays instead of int to store all arithmetic variables and operations. Sometimes the time interval is too small that integer value will always round to zero.



来源:https://stackoverflow.com/questions/16515894/cuda-performance-measuring-elapsed-time-returns-zero

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!