I\'m tried to improve performance of copy operation via SSE and AVX:
#include
const int sz = 1024;
float *mas = (float *)_mm_
I think this is because measuring not accurate for kinda short operations.
When measuring performance on Intel CPU
Disable "Turbo Boost" and "SpeedStep". You can to this on system BIOS.
Change Process/Thread priority to High or Realtime. This will keep your thread running.
Set Process CPU Mask to only one core. CPU Masking with Higher priority will minimize context switching.
use __rdtsc() intrinsic function. Intel Core series returns CPU internal clock counter with __rdtsc(). You will get 3400000000 counts/second from 3.4Ghz CPU. And __rdtsc() flushes all scheduled operations in CPU so it can measure timing more accurate.
This is my test-bed startup code for testing SSE/AVX codes.
int GetMSB(DWORD_PTR dwordPtr)
{
if(dwordPtr)
{
int result = 1;
#if defined(_WIN64)
if(dwordPtr & 0xFFFFFFFF00000000) { result += 32; dwordPtr &= 0xFFFFFFFF00000000; }
if(dwordPtr & 0xFFFF0000FFFF0000) { result += 16; dwordPtr &= 0xFFFF0000FFFF0000; }
if(dwordPtr & 0xFF00FF00FF00FF00) { result += 8; dwordPtr &= 0xFF00FF00FF00FF00; }
if(dwordPtr & 0xF0F0F0F0F0F0F0F0) { result += 4; dwordPtr &= 0xF0F0F0F0F0F0F0F0; }
if(dwordPtr & 0xCCCCCCCCCCCCCCCC) { result += 2; dwordPtr &= 0xCCCCCCCCCCCCCCCC; }
if(dwordPtr & 0xAAAAAAAAAAAAAAAA) { result += 1; }
#else
if(dwordPtr & 0xFFFF0000) { result += 16; dwordPtr &= 0xFFFF0000; }
if(dwordPtr & 0xFF00FF00) { result += 8; dwordPtr &= 0xFF00FF00; }
if(dwordPtr & 0xF0F0F0F0) { result += 4; dwordPtr &= 0xF0F0F0F0; }
if(dwordPtr & 0xCCCCCCCC) { result += 2; dwordPtr &= 0xCCCCCCCC; }
if(dwordPtr & 0xAAAAAAAA) { result += 1; }
#endif
return result;
}
else
{
return 0;
}
}
int _tmain(int argc, _TCHAR* argv[])
{
// Set Core Affinity
DWORD_PTR processMask, systemMask;
GetProcessAffinityMask(GetCurrentProcess(), &processMask, &systemMask);
SetProcessAffinityMask(GetCurrentProcess(), 1 << (GetMSB(processMask) - 1) );
// Set Process Priority. you can use REALTIME_PRIORITY_CLASS.
SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
DWORD64 start, end;
start = __rdtsc();
// your code here.
end = __rdtsc();
printf("%I64d\n", end - start);
return 0;
}