Timing CUDA operations

后端 未结 5 1932
走了就别回头了
走了就别回头了 2020-12-02 23:48

I need to time a CUDA kernel execution. The Best Practices Guide says that we can use either events or standard timing functions like clock() in Windows. My pro

相关标签:
5条回答
  • 2020-12-03 00:20

    A satisfactory answer has already been given to your question.

    I have written classes for timing C/C++ as well as CUDA operations and want to share them with others, hoping they will be helpful to future users. You just need to add the 4 files reported below to your project and #include the two header files as follows:

    // --- Timing includes
    #include "TimingCPU.h"
    #include "TimingGPU.cuh"
    

    The two classes can be used as follows.

    Timing CPU section

    TimingCPU timer_CPU;
    
    timer_CPU.StartCounter();
    CPU perations to be timed
    std::cout << "CPU Timing = " << timer_CPU.GetCounter() << " ms" << std::endl;
    

    Timing GPU section

    // Time a device-side region: StartCounter() records the start event and
    // GetCounter() records the stop event, waits for it, and returns the
    // elapsed milliseconds between the two events.
    TimingGPU timer_GPU;
    timer_GPU.StartCounter();
    // ... GPU operations to be timed ...
    std::cout << "GPU Timing = " << timer_GPU.GetCounter() << " ms" << std::endl;
    

    In both cases, the timing is in milliseconds. Both classes can be used under Linux or Windows.

    Here are the 4 files:

    TimingCPU.cpp

    /**************/
    /* TIMING CPU */
    /**************/
    
    #include "TimingCPU.h"
    
    #ifdef __linux__
    
        #include <sys/time.h>
        #include <stdio.h>
    
        // Builds a timer and immediately takes a first reference timestamp,
        // so GetCounter() is meaningful even without an explicit StartCounter().
        TimingCPU::TimingCPU()
            : cur_time_(0)
        {
            StartCounter();
        }
    
        // Nothing to release here: the destructor body is intentionally empty.
        TimingCPU::~TimingCPU()
        {
        }
    
        // Stores the current wall-clock time, in microseconds, as the reference
        // point for the next GetCounter() call. If the clock cannot be read the
        // previous reference point is left untouched.
        void TimingCPU::StartCounter()
        {
            struct timeval now;
            if (gettimeofday(&now, 0) != 0) {
                return;
            }

            cur_time_ = 1000000 * now.tv_sec + now.tv_usec;
        }
    
        double TimingCPU::GetCounter()
        {
            struct timeval time;
            if(gettimeofday( &time, 0 )) return -1;
    
            long cur_time = 1000000 * time.tv_sec + time.tv_usec;
            double sec = (cur_time - cur_time_) / 1000000.0;
            if(sec < 0) sec += 86400;
            cur_time_ = cur_time;
    
            return 1000.*sec;
        }
    
    #elif _WIN32 || _WIN64
        #include <windows.h>
        #include <iostream>
    
        // Windows-only timer state, kept out of the header behind a pointer.
        struct PrivateTimingCPU
        {
            // Performance-counter ticks per millisecond, set by StartCounter().
            double PCFreq;

            // Raw performance-counter reading taken by StartCounter().
            __int64 CounterStart;
        };
    
        // --- Default constructor: allocates the private state and zeroes it.
        TimingCPU::TimingCPU()
        {
            privateTimingCPU = new PrivateTimingCPU;
            privateTimingCPU->PCFreq = 0.0;
            privateTimingCPU->CounterStart = 0;
        }
    
        // --- Destructor: releases the private state allocated by the constructor
        // (the original left it allocated, leaking one PrivateTimingCPU per timer).
        // NOTE(review): the class declares no copy constructor or assignment
        // operator, so a copied TimingCPU shares this pointer; avoid copying
        // timer objects.
        TimingCPU::~TimingCPU()
        {
            delete privateTimingCPU;
            privateTimingCPU = 0;
        }
    
        // --- Starts the timing
        void TimingCPU::StartCounter()
        {
            LARGE_INTEGER li;
            if(!QueryPerformanceFrequency(&li)) std::cout << "QueryPerformanceFrequency failed!\n";
    
            (*privateTimingCPU).PCFreq = double(li.QuadPart)/1000.0;
    
            QueryPerformanceCounter(&li);
            (*privateTimingCPU).CounterStart = li.QuadPart;
        }
    
        // --- Gets the timing counter in ms
        double TimingCPU::GetCounter()
        {
            LARGE_INTEGER li;
            QueryPerformanceCounter(&li);
            return double(li.QuadPart-(*privateTimingCPU).CounterStart)/(*privateTimingCPU).PCFreq;
        }
    #endif
    

    TimingCPU.h

    // 1 micro-second accuracy
    // Returns the time in milliseconds
    
    #ifndef __TIMINGCPU_H__
    #define __TIMINGCPU_H__
    
    #ifdef __linux__
    
        // Wall-clock CPU timer (Linux variant).
        class TimingCPU {

            public:

                // Creates the timer and takes an initial reference timestamp.
                TimingCPU();

                ~TimingCPU();

                // Stores the current time as the new reference point.
                void StartCounter();

                // Returns the milliseconds elapsed since the reference point and
                // moves the reference point to the current time; returns -1 if
                // the clock cannot be read.
                double GetCounter();

            private:

                // Reference timestamp, in microseconds.
                long cur_time_;
        };
    
    #elif _WIN32 || _WIN64
    
        // Defined in TimingCPU.cpp so that <windows.h> stays out of this header.
        struct PrivateTimingCPU;

        // Wall-clock CPU timer (Windows variant).
        class TimingCPU
        {
            public:

                // Allocates the private timer state.
                TimingCPU();

                ~TimingCPU();

                // Records the current counter reading as the reference point.
                void StartCounter();

                // Returns the milliseconds elapsed since StartCounter().
                double GetCounter();

            private:

                // Opaque pointer to the platform-specific state.
                PrivateTimingCPU *privateTimingCPU;

        }; // TimingCPU class
    
    #endif
    
    #endif
    

    TimingGPU.cu

    /**************/
    /* TIMING GPU */
    /**************/
    
    #include "TimingGPU.cuh"
    
    #include <cuda.h>
    #include <cuda_runtime.h>
    
    // Private state of TimingGPU, hidden from the header behind a pointer.
    struct PrivateTimingGPU {
        cudaEvent_t     start;      // recorded by StartCounter()/StartCounterFlags()
        cudaEvent_t     stop;       // recorded by GetCounter()
        bool            created;    // true while start/stop hold live events
    };

    // Destroys the event pair if it currently exists, so events are never leaked.
    static void destroyTimingEvents(PrivateTimingGPU *state)
    {
        if (state == 0 || !state->created) return;

        cudaEventDestroy(state->start);
        cudaEventDestroy(state->stop);
        state->created = false;
    }

    // (Re)creates the event pair with the given creation flags and records the
    // start event in stream 0. On any failure no events are kept, which makes a
    // later GetCounter() report -1 instead of reading invalid handles.
    static void beginTiming(PrivateTimingGPU *state, unsigned int flags)
    {
        // A previous Start* call may have left events behind; release them first.
        destroyTimingEvents(state);

        if (cudaEventCreateWithFlags(&state->start, flags) != cudaSuccess) return;
        if (cudaEventCreateWithFlags(&state->stop, flags) != cudaSuccess) {
            cudaEventDestroy(state->start);
            return;
        }
        state->created = true;

        if (cudaEventRecord(state->start, 0) != cudaSuccess) {
            destroyTimingEvents(state);
        }
    }

    // default constructor: allocates the private state with no events yet
    TimingGPU::TimingGPU()
    {
        privateTimingGPU = new PrivateTimingGPU;
        privateTimingGPU->start   = 0;
        privateTimingGPU->stop    = 0;
        privateTimingGPU->created = false;
    }

    // default destructor: releases the events (if any) and the private state.
    // NOTE(review): the class declares no copy constructor or assignment
    // operator, so a copied TimingGPU shares this pointer; avoid copying timers.
    TimingGPU::~TimingGPU()
    {
        destroyTimingEvents(privateTimingGPU);
        delete privateTimingGPU;
        privateTimingGPU = 0;
    }

    // Starts the timing with default events (cudaEventDefault is what plain
    // cudaEventCreate() uses).
    void TimingGPU::StartCounter()
    {
        beginTiming(privateTimingGPU, cudaEventDefault);
    }

    // Starts the timing with events created with cudaEventBlockingSync.
    void TimingGPU::StartCounterFlags()
    {
        beginTiming(privateTimingGPU, cudaEventBlockingSync);
    }

    // Gets the counter in ms: records the stop event in stream 0, waits for it
    // and returns the time between the start and stop events. May be called
    // repeatedly after a single Start* call. Returns -1 if timing was never
    // started or if any CUDA call fails.
    float TimingGPU::GetCounter()
    {
        PrivateTimingGPU *state = privateTimingGPU;
        if (!state->created) return -1.0f;

        float elapsedMs = 0.0f;
        if (cudaEventRecord(state->stop, 0) != cudaSuccess) return -1.0f;
        if (cudaEventSynchronize(state->stop) != cudaSuccess) return -1.0f;
        if (cudaEventElapsedTime(&elapsedMs, state->start, state->stop) != cudaSuccess) return -1.0f;

        return elapsedMs;
    }
    

    TimingGPU.cuh

    #ifndef __TIMING_CUH__
    #define __TIMING_CUH__
    
    /**************/
    /* TIMING GPU */
    /**************/
    
    // Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5
    // microsecond precision.
    
    // Defined in TimingGPU.cu so that the CUDA headers stay out of this header.
    struct PrivateTimingGPU;

    // Event-based GPU timer.
    class TimingGPU
    {
        public:

            // Allocates the private timer state.
            TimingGPU();

            ~TimingGPU();

            // Creates the start/stop events and records the start event.
            void StartCounter();

            // Same as StartCounter(), but the events are created with the
            // cudaEventBlockingSync flag.
            void StartCounterFlags();

            // Records the stop event, waits for it and returns the elapsed
            // time between the start and stop events in milliseconds.
            float GetCounter();

        private:

            // Opaque pointer to the CUDA-specific state.
            PrivateTimingGPU *privateTimingGPU;

    }; // TimingGPU class
    
    #endif
    
    0 讨论(0)
  • 2020-12-03 00:23

    You could do something along these lines:

    #include <sys/time.h>
    
    // Host wall-clock timing of a kernel. The launch returns before the kernel
    // has run, so the device is synchronized before the second timestamp.
    struct timeval t1, t2;

    gettimeofday(&t1, 0);

    kernel_call<<<dimGrid, dimBlock, 0>>>();
    HANDLE_ERROR( cudaGetLastError() );        // reports launch errors

    // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize();
    // the macro argument must not contain a trailing ';'.
    HANDLE_ERROR( cudaDeviceSynchronize() );

    gettimeofday(&t2, 0);

    double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000.0;

    printf("Time to generate:  %3.1f ms \n", time);
    

    or:

    // Event-based timing of a kernel: both events are recorded in stream 0 and
    // the elapsed time between them is reported in milliseconds.
    float time = 0.0f;
    cudaEvent_t start, stop;

    HANDLE_ERROR( cudaEventCreate(&start) );
    HANDLE_ERROR( cudaEventCreate(&stop) );
    HANDLE_ERROR( cudaEventRecord(start, 0) );

    kernel_call<<<dimGrid, dimBlock, 0>>>();
    HANDLE_ERROR( cudaGetLastError() );        // reports launch errors

    HANDLE_ERROR( cudaEventRecord(stop, 0) );
    HANDLE_ERROR( cudaEventSynchronize(stop) );
    HANDLE_ERROR( cudaEventElapsedTime(&time, start, stop) );

    // Release the events once the measurement has been read.
    HANDLE_ERROR( cudaEventDestroy(start) );
    HANDLE_ERROR( cudaEventDestroy(stop) );

    printf("Time to generate:  %3.1f ms \n", time);
    
    0 讨论(0)
  • 2020-12-03 00:27

    There is an out-of-the-box GpuTimer struct you can use:

    #ifndef __GPU_TIMER_H__
    #define __GPU_TIMER_H__
    
    // Event-based GPU stopwatch. Start() and Stop() record events in stream 0;
    // Elapsed() waits for the stop event and returns the milliseconds between
    // the two events.
    // NOTE(review): the struct is copyable while its destructor destroys the
    // events, so a copy would destroy the same events twice; avoid copying.
    struct GpuTimer
    {
          cudaEvent_t start;
          cudaEvent_t stop;

          // Creates both events. The handles are zeroed first so they hold a
          // defined value even if creation fails.
          GpuTimer() : start(0), stop(0)
          {
                cudaEventCreate(&start);
                cudaEventCreate(&stop);
          }

          // Releases both events.
          ~GpuTimer()
          {
                cudaEventDestroy(start);
                cudaEventDestroy(stop);
          }

          // Marks the beginning of the timed region.
          void Start()
          {
                cudaEventRecord(start, 0);
          }

          // Marks the end of the timed region.
          void Stop()
          {
                cudaEventRecord(stop, 0);
          }

          // Returns the elapsed time in milliseconds, or -1 if waiting for the
          // stop event or reading the elapsed time fails (the original returned
          // an uninitialized value in that case).
          float Elapsed()
          {
                float elapsed = 0.0f;
                if (cudaEventSynchronize(stop) != cudaSuccess) return -1.0f;
                if (cudaEventElapsedTime(&elapsed, start, stop) != cudaSuccess) return -1.0f;
                return elapsed;
          }
    };
    
    #endif  /* __GPU_TIMER_H__ */
    
    0 讨论(0)
  • 2020-12-03 00:39

    You can use the Compute Visual Profiler, which will be great for your purpose. It measures the time of every CUDA function and tells you how many times you called it.

    0 讨论(0)
  • 2020-12-03 00:42

    If you want to measure GPU time you pretty much have to use events. There's a great discussion thread on the do's and don'ts of timing your application over on the NVIDIA forums here.

    0 讨论(0)
提交回复
热议问题