I have searched for and tried many approaches to measuring elapsed time, and there are many questions on this topic. For example, this question is very good, but when you nee
I recommend using this method for x86 micro-architecture.
NOTE:
NUM_LOOP is the number of times your code is repeated; a larger value
improves accuracy, because the best (smallest) time is the one recorded.
The initial value of ttbest_rdtsc must be bigger than the worst possible
time, so I recommend making it as large as possible.
I also used OVERAL_TIME as a second stopping rule (you might not want it): I used this code for many kernels, and in some cases NUM_LOOP was very big and I didn't want to change it. OVERAL_TIME limits the iterations so that the loop stops after a specific amount of accumulated time.
UPDATE: The whole program is this:
#include <stdio.h>
#include <x86intrin.h>
#define NUM_LOOP 100 /* number of timed repetitions; the smallest time is kept to filter out overheads such as cache misses */

/*
 * Times the code placed between the two _rdtsc() reads NUM_LOOP times and
 * prints the smallest counter difference that was observed.
 *
 * Returns 0.
 */
int main(void)
{
    /* Start above any realistic measurement so the first pass always wins. */
    long long ttbest_rdtsc = 99999999999999999;

    /*
     * A counted loop runs the body exactly NUM_LOOP times, which is the
     * number reported below. A do/while that tests `i++ < NUM_LOOP` would
     * execute NUM_LOOP + 1 passes.
     */
    for (int rep = 0; rep < NUM_LOOP; rep++) {
        long long t1_rdtsc = _rdtsc();
        //put your code here
        long long t2_rdtsc = _rdtsc();
        long long ttotal_rdtsc = t2_rdtsc - t1_rdtsc;

        /* store the smallest time */
        if (ttotal_rdtsc < ttbest_rdtsc) {
            ttbest_rdtsc = ttotal_rdtsc;
        }
    }

    printf("\nthe best is %lld in %d repetitions\n", ttbest_rdtsc, NUM_LOOP );
    return 0;
}
I then changed it to the following and put it in a header of my own, so that I can use it easily in my programs.
#include <x86intrin.h>
/*
 * Configuration and shared state for the begin_rdtsc / end_rdtsc macro pair.
 *
 * NUM_LOOP may be defined by the including file before this point; when it
 * is not, a default is supplied so that this header compiles on its own.
 *
 * The variables below are definitions, not extern declarations, so this
 * header is meant to be included from a single translation unit.
 */
#ifndef NUM_LOOP
#define NUM_LOOP 100 /* default repetition count */
#endif
#define do_while NUM_LOOP        /* starting value of the pass countdown */
#define OVERAL_TIME 999999999    /* budget for the accumulated measured ticks */

long long t1_rdtsc;                          /* counter value read by begin_rdtsc */
long long t2_rdtsc;                          /* counter value read by end_rdtsc */
long long ttotal_rdtsc;                      /* t2_rdtsc - t1_rdtsc for the current pass */
long long ttbest_rdtsc = 99999999999999999;  /* smallest ttotal_rdtsc seen so far */
long long elapsed;                           /* zero-based pass index that produced ttbest_rdtsc */
long long elapsed_rdtsc = do_while;          /* countdown, decremented once per pass */
long long overal_time = OVERAL_TIME;         /* the loop stops once ttime reaches this */
long long ttime = 0;                         /* running sum of ttotal_rdtsc */
/*
 * begin_rdtsc: opens the measurement loop (`do {`) and stores the current
 * _rdtsc() value in t1_rdtsc. The loop is left open on purpose; it is closed
 * by end_rdtsc, so the two macros must be used as a pair in the same scope
 * with the code to be measured between them.
 */
#define begin_rdtsc\
do{\
t1_rdtsc=_rdtsc();
/*
 * end_rdtsc: closes the loop opened by begin_rdtsc and prints the result.
 *
 * Per pass: reads the counter into t2_rdtsc, stores the difference from
 * t1_rdtsc in ttotal_rdtsc and, when that is a new minimum, records it in
 * ttbest_rdtsc together with the pass index (do_while - elapsed_rdtsc) in
 * elapsed. The difference is then added to ttime.
 *
 * The loop repeats while the post-decremented elapsed_rdtsc is non-zero and
 * ttime is still below overal_time, so it runs at least once. After the loop
 * the macro prints the best time, the pass index that produced it, and
 * do_while - elapsed_rdtsc as the number of repetitions.
 *
 * NOTE(review): nothing in this macro restores ttbest_rdtsc, elapsed_rdtsc
 * or ttime, so a second begin_rdtsc/end_rdtsc pair in the same program
 * appears to continue from the previous state -- confirm before reusing.
 */
#define end_rdtsc\
t2_rdtsc=_rdtsc();\
ttotal_rdtsc=t2_rdtsc-t1_rdtsc;\
if (ttotal_rdtsc<ttbest_rdtsc){\
ttbest_rdtsc = ttotal_rdtsc;\
elapsed=(do_while-elapsed_rdtsc);}\
ttime+=ttotal_rdtsc;\
}while (elapsed_rdtsc-- && (ttime<overal_time));\
printf("\nthe best is %lld in %lldth iteration and %lld repetitions\n", ttbest_rdtsc, elapsed, (do_while-elapsed_rdtsc));
How to use this method? Well, it is very simple!
/*
 * Usage example: the code to be measured goes between begin_rdtsc and
 * end_rdtsc, which together form the timing loop and print the result.
 */
int main(void)
{
    //before the section
    begin_rdtsc
    //put your code here to measure the clocks.
    end_rdtsc
    return 0; /* the statement terminator was missing here */
}
Be creative: you can change it to measure the speedup in your program, etc. An example of the output is:
the best is 9600 in 384751th iteration and 569179 repetitions
That is, my tested code took 9600 clock cycles at best; the best time was recorded in iteration 384751, and the code was run 569179 times.
I have tested them on GCC and Clang.
If you have a problem with the auto-vectorizer and want to limit it, just add an asm("#something"); after your begin_rdtsc; it will separate the do-while loop from your code. I just checked, and with this in place it vectorized your posted code, which the auto-vectorizer had been unable to vectorize.
I changed your macro you can use it....
/*
 * Shared state for the array-based begin_rdtsc / end_rdtsc macros: every
 * pass stores its measurement in ttotal_rdtsc[ii], and the minimum is
 * extracted afterwards.
 */
long long t1_rdtsc;
long long t2_rdtsc;
long long ttotal_rdtsc[do_while];
long long ttbest_rdtsc = 99999999999999999;
long long elapsed;
long long elapsed_rdtsc = do_while;
long long overal_time = OVERAL_TIME;
long long ttime = 0;
int ii = 0;
/*
 * begin_rdtsc: opens the measurement loop (`do {`), emits an assembler
 * comment as a marker, and stores the current _rdtsc() value in t1_rdtsc.
 * The loop is closed by end_rdtsc, so both macros must be used as a pair.
 *
 * __asm__ is used rather than asm because the plain keyword is not
 * available when compiling in a strict ISO mode such as -std=c11.
 */
#define begin_rdtsc \
    do { \
        __asm__("#mmmmmmmmmmm"); \
        t1_rdtsc = _rdtsc();
/*
 * end_rdtsc: closes the loop opened by begin_rdtsc and prints the result.
 *
 * Per pass: reads the counter into t2_rdtsc, emits an assembler comment as
 * a marker, and stores t2_rdtsc - t1_rdtsc in ttotal_rdtsc[ii].
 *
 * The loop condition pre-increments ii, so the last index written is
 * do_while - 1 and the store stays inside ttotal_rdtsc[do_while]. A
 * post-increment test would allow one more pass with ii == do_while and
 * write past the end of the array.
 *
 * After the loop, the recorded samples are scanned for the minimum, which
 * is kept in ttbest_rdtsc and printed together with elapsed_rdtsc.
 *
 * __asm__ is used rather than asm because the plain keyword is not
 * available when compiling in a strict ISO mode such as -std=c11.
 */
#define end_rdtsc \
        t2_rdtsc = _rdtsc(); \
        __asm__("#mmmmmmmmmmm"); \
        ttotal_rdtsc[ii] = t2_rdtsc - t1_rdtsc; \
    } while (++ii < do_while); \
    for (ii = 0; ii < do_while; ii++) { \
        if (ttotal_rdtsc[ii] < ttbest_rdtsc) { \
            ttbest_rdtsc = ttotal_rdtsc[ii]; \
        } \
    } \
    printf("\nthe best is %lld in %lld iteration\n", ttbest_rdtsc, elapsed_rdtsc);
I developed my first answer further and arrived at this solution. However, I am still looking for a solution, because it is very important to measure the time accurately and with the least possible impact on the measured code. I put this part in a header file and include it in the main program files.
//Header file header.h
/*
 * Configuration and shared state for the begin_rdtsc / end_rdtsc macros.
 *
 * OVERAL_TIME initialises overal_time below. It may be defined by the
 * including file first; when it is not, a default is supplied so that this
 * header compiles on its own.
 *
 * The macros call _rdtsc() and printf(), so the including file must also
 * include the headers that declare them. The variables are definitions,
 * not extern declarations, so this header is meant to be included from a
 * single translation unit.
 */
#define count 1000 // number of repetition
#ifndef OVERAL_TIME
#define OVERAL_TIME 999999999 /* default budget for overal_time */
#endif

long long t1_rdtsc;                          /* counter value read by begin_rdtsc */
long long t2_rdtsc;                          /* counter value read by end_rdtsc */
long long ttotal_rdtsc[count];               /* one measurement per pass */
long long ttbest_rdtsc = 99999999999999999;  /* smallest measurement found */
long long elapsed;
long long elapsed_rdtsc = count;             /* printed by end_rdtsc */
long long overal_time = OVERAL_TIME;
long long ttime = 0;
int ii = 0;                                  /* index of the current pass */
/*
 * begin_rdtsc: opens the measurement loop (`do {`) and stores the current
 * _rdtsc() value in t1_rdtsc. The loop is left open on purpose; it is closed
 * by end_rdtsc, so the two macros must be used as a pair in the same scope
 * with the code to be measured between them.
 */
#define begin_rdtsc\
do{\
t1_rdtsc=_rdtsc();
/*
 * end_rdtsc: closes the loop opened by begin_rdtsc and prints the result.
 *
 * Per pass: reads the counter into t2_rdtsc and stores t2_rdtsc - t1_rdtsc
 * in ttotal_rdtsc[ii].
 *
 * The loop condition pre-increments ii, so the last index written is
 * count - 1 and the store stays inside ttotal_rdtsc[count]. A
 * post-increment test would allow one more pass with ii == count and write
 * past the end of the array.
 *
 * The scan that follows is bounded by count, the same constant that sizes
 * the array, and keeps the minimum in ttbest_rdtsc. The minimum is then
 * printed together with elapsed_rdtsc.
 */
#define end_rdtsc \
        t2_rdtsc = _rdtsc(); \
        ttotal_rdtsc[ii] = t2_rdtsc - t1_rdtsc; \
    } while (++ii < count); \
    for (ii = 0; ii < count; ii++) { \
        if (ttotal_rdtsc[ii] < ttbest_rdtsc) { \
            ttbest_rdtsc = ttotal_rdtsc[ii]; \
        } \
    } \
    printf("\nthe best is %lld in %lldth iteration \n", ttbest_rdtsc, elapsed_rdtsc);
//Main program
#include "header.h"
.
.
.
/*
 * Usage example: the code to be measured goes between begin_rdtsc and
 * end_rdtsc, which together form the timing loop and print the result.
 */
int main(void)
{
    //before the section
    begin_rdtsc
    //put your code here to measure the clocks.
    end_rdtsc
    return 0; /* the statement terminator was missing here */
}