Faster way to zero memory than with memset?

后端 未结 9 1479
刺人心
刺人心 2020-12-07 12:02

I learned that memset(ptr, 0, nbytes) is really fast, but is there a faster way (at least on x86)?

I assume that memset uses mov, however w

9条回答
  •  小蘑菇
    小蘑菇 (楼主)
    2020-12-07 12:59

    That's an interesting question. I made this implementation that is just slightly faster (but hardly measurable) when 32-bit release compiling on VC++ 2012. It probably can be improved on a lot. Adding this in your own class in a multithreaded environment would probably give you even more performance gains since there are some reported bottleneck problems with memset() in multithreaded scenarios.

    // MemsetSpeedTest.cpp : Defines the entry point for the console application.
    //
    
    #include "stdafx.h"
    #include 
    #include "Windows.h"
    #include 
    
    #pragma comment(lib, "Winmm.lib") 
    using namespace std;
    
    /*
     * Fixed-width integer aliases used by the benchmark below.
     * Each macro expands to one of the sized integer keywords __int8 .. __int64
     * (compiler-specific keywords; the accompanying text mentions VC++ 2012),
     * and the unsigned variants simply prefix the signed alias with `unsigned`.
     */

    /** a signed 64-bit integer value type */
    #define _INT64 __int64
    
    /** a signed 32-bit integer value type */
    #define _INT32 __int32
    
    /** a signed 16-bit integer value type */
    #define _INT16 __int16
    
    /** a signed 8-bit integer value type */
    #define _INT8 __int8
    
    /** an unsigned 64-bit integer value type */
    #define _UINT64 unsigned _INT64
    
    /** an unsigned 32-bit integer value type */
    #define _UINT32 unsigned _INT32
    
    /** an unsigned 16-bit integer value type */
    #define _UINT16 unsigned _INT16
    
    /** an unsigned 8-bit integer value type */
    #define _UINT8 unsigned _INT8
    
    /** maximum allowed value in an unsigned 64-bit integer value type (2^64 - 1) */
        #define _UINT64_MAX 18446744073709551615ULL
    
    #ifdef _WIN32
    
    /**
     * Use to init the clock.
     * Declares the locals `frequency`, `t1`, `t2` and `elapsedTime` in the
     * enclosing scope and fills `frequency` via QueryPerformanceFrequency().
     * Must appear once, before TIMER_START / TIMER_STOP, in the same scope,
     * because those macros refer to these locals by name.
     */
    #define TIMER_INIT LARGE_INTEGER frequency;LARGE_INTEGER t1, t2;double elapsedTime;QueryPerformanceFrequency(&frequency);
    
    /**
     * Use to start the performance timer.
     * Stores the current performance-counter value in `t1` (declared by TIMER_INIT).
     */
    #define TIMER_START QueryPerformanceCounter(&t1);
    
    /** Use to stop the performance timer and output the result to the standard stream. Less verbose than \c TIMER_STOP_VERBOSE */
    #define TIMER_STOP QueryPerformanceCounter(&t2);elapsedTime=(t2.QuadPart-t1.QuadPart)*1000.0/frequency.QuadPart;wcout<> 3;
        size_t bytesLeft = count - (blocks << 3);
        _UINT64 cUll = 
            c 
            | (((_UINT64)c) << 8 )
            | (((_UINT64)c) << 16 )
            | (((_UINT64)c) << 24 )
            | (((_UINT64)c) << 32 )
            | (((_UINT64)c) << 40 )
            | (((_UINT64)c) << 48 )
            | (((_UINT64)c) << 56 );
    
        _UINT64 *destPtr8 = (_UINT64*)dest;
        for (blockIdx = 0; blockIdx < blocks; blockIdx++) destPtr8[blockIdx] = cUll;
    
        if (!bytesLeft) return dest;
    
        blocks = bytesLeft >> 2;
        bytesLeft = bytesLeft - (blocks << 2);
    
        _UINT32 *destPtr4 = (_UINT32*)&destPtr8[blockIdx];
        for (blockIdx = 0; blockIdx < blocks; blockIdx++) destPtr4[blockIdx] = (_UINT32)cUll;
    
        if (!bytesLeft) return dest;
    
        blocks = bytesLeft >> 1;
        bytesLeft = bytesLeft - (blocks << 1);
    
        _UINT16 *destPtr2 = (_UINT16*)&destPtr4[blockIdx];
        for (blockIdx = 0; blockIdx < blocks; blockIdx++) destPtr2[blockIdx] = (_UINT16)cUll;
    
        if (!bytesLeft) return dest;
    
        _UINT8 *destPtr1 = (_UINT8*)&destPtr2[blockIdx];
        for (blockIdx = 0; blockIdx < bytesLeft; blockIdx++) destPtr1[blockIdx] = (_UINT8)cUll;
    
        return dest;
    }
    
    int _tmain(int argc, _TCHAR* argv[])
    {
        TIMER_INIT
    
        const size_t n = 10000000;
        const _UINT64 m = _UINT64_MAX;
        const _UINT64 o = 1;
        char test[n];
        {
            cout << "memset()" << endl;
            TIMER_START;
    
            for (int i = 0; i < m ; i++)
                for (int j = 0; j < o ; j++)
                    memset((void*)test, 0, n);  
    
            TIMER_STOP;
        }
        {
            cout << "MemSet() took:" << endl;
            TIMER_START;
    
            for (int i = 0; i < m ; i++)
                for (int j = 0; j < o ; j++)
                    MemSet((void*)test, 0, n);
    
            TIMER_STOP;
        }
    
        cout << "Done" << endl;
        int wait;
        cin >> wait;
        return 0;
    }
    

    Output is as follows when release compiling for 32-bit systems:

    memset() took:
    5.569000
    MemSet() took:
    5.544000
    Done
    

    Output is as follows when release compiling for 64-bit systems:

    memset() took:
    2.781000
    MemSet() took:
    2.765000
    Done
    

    Here you can find the source code of Berkeley's memset(), which I think is the most common implementation.

提交回复
热议问题