SSE-copy, AVX-copy and std::copy performance

前端 未结 5 1619
庸人自扰
庸人自扰 2020-12-08 03:28

I tried to improve the performance of a copy operation via SSE and AVX:

    #include 

    const int sz = 1024;
    float *mas = (float *)_mm_         


        
5条回答
  •  甜味超标
    2020-12-08 04:18

    I think this is because the measurement is not accurate for such short operations.

    When measuring performance on an Intel CPU:

    1. Disable "Turbo Boost" and "SpeedStep". You can do this in the system BIOS.

    2. Change Process/Thread priority to High or Realtime. This will keep your thread running.

    3. Set Process CPU Mask to only one core. CPU Masking with Higher priority will minimize context switching.

    4. Use the __rdtsc() intrinsic function. Intel Core series CPUs return the CPU's internal clock counter with __rdtsc(). You will get 3400000000 counts/second from a 3.4 GHz CPU. And __rdtsc() flushes all scheduled operations in the CPU, so it can measure timing more accurately.

    This is my test-bed startup code for testing SSE/AVX code.

        /*
         * Returns the 1-based position of the most significant set bit in
         * dwordPtr (bit 0 yields 1), or 0 when no bit is set.
         *
         * Works as a binary search over bit positions: each mask selects the
         * upper half of every group at the current granularity. If any
         * surviving bit lies in an upper half, the position grows by that
         * half's width and only those upper-half bits are kept for the next,
         * finer step.
         */
        int GetMSB(DWORD_PTR dwordPtr)
        {
        #if defined(_WIN64)
            static const DWORD_PTR upperHalfMasks[] = {
                0xFFFFFFFF00000000, 0xFFFF0000FFFF0000, 0xFF00FF00FF00FF00,
                0xF0F0F0F0F0F0F0F0, 0xCCCCCCCCCCCCCCCC, 0xAAAAAAAAAAAAAAAA
            };
            int weight = 32;
        #else
            static const DWORD_PTR upperHalfMasks[] = {
                0xFFFF0000, 0xFF00FF00, 0xF0F0F0F0, 0xCCCCCCCC, 0xAAAAAAAA
            };
            int weight = 16;
        #endif
            const unsigned stepCount = sizeof upperHalfMasks / sizeof upperHalfMasks[0];
            unsigned step;
            int position = 1;

            // Guard clause: an empty mask has no most significant bit.
            if (!dwordPtr)
            {
                return 0;
            }

            // The weight halves on every step, tracking the mask granularity.
            for (step = 0; step < stepCount; ++step, weight >>= 1)
            {
                const DWORD_PTR kept = dwordPtr & upperHalfMasks[step];
                if (kept)
                {
                    position += weight;
                    dwordPtr = kept;
                }
            }

            return position;
        }
    
        /*
         * Test-bed entry point for timing a code fragment with the time-stamp
         * counter.
         *
         * Pins the process to a single core (the highest one in its current
         * affinity mask), raises the process priority class, then prints the
         * number of __rdtsc() ticks elapsed across the marked region.
         *
         * argc and argv are unused. Always returns 0.
         */
        int _tmain(int argc, _TCHAR* argv[])
        {
            // Set Core Affinity
            DWORD_PTR processMask = 0, systemMask = 0;
            if (GetProcessAffinityMask(GetCurrentProcess(), &processMask, &systemMask) && processMask != 0)
            {
                // Shift a DWORD_PTR-wide 1, not an int: with 32 or more cores
                // the highest bit index exceeds what a 32-bit int can hold.
                // processMask is non-zero here, so GetMSB() >= 1 and the shift
                // count is never negative.
                DWORD_PTR singleCoreMask = (DWORD_PTR)1 << (GetMSB(processMask) - 1);
                if (!SetProcessAffinityMask(GetCurrentProcess(), singleCoreMask))
                {
                    fprintf(stderr, "warning: could not pin process to a single core\n");
                }
            }
            else
            {
                // Pinning is best-effort: still measure, but say the result may be noisier.
                fprintf(stderr, "warning: could not query process affinity mask\n");
            }

            // Set Process Priority. you can use REALTIME_PRIORITY_CLASS.
            SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);

            DWORD64 start, end;
            start = __rdtsc();
        // your code here.
            end = __rdtsc();
            // DWORD64 is unsigned, so print it with the unsigned conversion.
            printf("%I64u\n", end - start);
            return 0;
        }
    

提交回复
热议问题