I learned that memset(ptr, 0, nbytes) is really fast, but is there a faster way (at least on x86)?
I assume that memset uses mov; however, w[...] (the remainder of the question is truncated in this copy)
That's an interesting question. I wrote this implementation, which is just slightly faster (though the difference is hardly measurable) in a 32-bit release build with VC++ 2012. It can probably be improved a lot. Using it in your own class in a multithreaded environment would probably give you even larger performance gains, since there are some reported bottleneck problems with memset() in multithreaded scenarios.
// MemsetSpeedTest.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include
#include "Windows.h"
#include
#pragma comment(lib, "Winmm.lib")
using namespace std;
/**
 * Fixed-width integer aliases used by the benchmark, implemented as macros
 * over the compiler's __intN keywords (presumably the Microsoft-specific
 * sized integer types, since this file targets VC++ — confirm before porting).
 *
 * The unsigned variants rely on textual expansion: `unsigned _INT64` becomes
 * `unsigned __int64`, so they only work while the signed aliases stay macros.
 *
 * NOTE(review): names starting with an underscore followed by an upper-case
 * letter are reserved for the implementation in C++; confirm nothing else
 * depends on these spellings before renaming them.
 */
/** a signed 64-bit integer value type */
#define _INT64 __int64
/** a signed 32-bit integer value type */
#define _INT32 __int32
/** a signed 16-bit integer value type */
#define _INT16 __int16
/** a signed 8-bit integer value type */
#define _INT8 __int8
/** an unsigned 64-bit integer value type */
#define _UINT64 unsigned _INT64
/** an unsigned 32-bit integer value type */
#define _UINT32 unsigned _INT32
/** an unsigned 16-bit integer value type */
#define _UINT16 unsigned _INT16
/** an unsigned 8-bit integer value type */
#define _UINT8 unsigned _INT8
/** maximum allowed value in an unsigned 64-bit integer value type (2^64 - 1) */
#define _UINT64_MAX 18446744073709551615ULL
#ifdef _WIN32
// Timing helpers built on QueryPerformanceFrequency / QueryPerformanceCounter.
// Each macro already ends with ';', so `TIMER_START;` expands to the call
// followed by an empty statement.
// NOTE(review): no matching #endif (or #else branch) is visible in this file
// as given — verify against the original source.
/**
 * Use to init the clock.
 * Declares the locals `frequency`, `t1`, `t2` and `elapsedTime` in the
 * enclosing scope and stores the performance-counter frequency in
 * `frequency`. Because it declares variables, it can appear only once per
 * scope and must precede TIMER_START / TIMER_STOP.
 */
#define TIMER_INIT LARGE_INTEGER frequency;LARGE_INTEGER t1, t2;double elapsedTime;QueryPerformanceFrequency(&frequency);
/** Use to start the performance timer: records the current counter value in `t1` (declared by TIMER_INIT) */
#define TIMER_START QueryPerformanceCounter(&t1);
/** Use to stop the performance timer and output the result to the standard stream. Less verbose than \c TIMER_STOP_VERBOSE */
#define TIMER_STOP QueryPerformanceCounter(&t2);elapsedTime=(t2.QuadPart-t1.QuadPart)*1000.0/frequency.QuadPart;wcout<> 3;
// NOTE(review): the line above appears to be damaged. Everything between
// `wcout<` and `> 3;` is missing — presumably the rest of the TIMER_STOP
// output expression, the directive closing the `#ifdef _WIN32` conditional,
// and the signature plus first declarations of the fill routine whose body
// follows (it is called further down as `MemSet((void*)test, 0, n)`).
// Restore these from the original before compiling.
//
// The surviving part of TIMER_STOP reads the counter into `t2` and converts
// the tick difference to milliseconds (ticks * 1000 / frequency).
//
// Visible remainder of the fill routine: it writes the byte value `c` to
// `count` bytes starting at `dest`, using 8-byte stores for the bulk and
// 4-, 2- and 1-byte stores for whatever is left, then returns `dest`.

// Bytes left over once `blocks` whole 8-byte words are accounted for.
// Presumably `blocks` was initialised to `count >> 3` (only the trailing
// "> 3;" of that statement survives) — TODO confirm.
size_t bytesLeft = count - (blocks << 3);
// 64-bit pattern carrying `c` in each of its eight byte positions.
// Assumes `c` holds a single byte value; its declaration is not visible —
// TODO confirm.
_UINT64 cUll =
c
| (((_UINT64)c) << 8 )
| (((_UINT64)c) << 16 )
| (((_UINT64)c) << 24 )
| (((_UINT64)c) << 32 )
| (((_UINT64)c) << 40 )
| (((_UINT64)c) << 48 )
| (((_UINT64)c) << 56 );
// Bulk phase: store the pattern one 64-bit word at a time.
// NOTE(review): `dest` is written through _UINT64* (and later _UINT32* /
// _UINT16*) whatever its alignment or the type of the underlying object —
// verify that this is acceptable for the intended targets and callers.
_UINT64 *destPtr8 = (_UINT64*)dest;
for (blockIdx = 0; blockIdx < blocks; blockIdx++) destPtr8[blockIdx] = cUll;
// Nothing left over: the byte count was a multiple of 8.
if (!bytesLeft) return dest;
// 4-byte phase: split the remainder into 4-byte words plus a new remainder.
blocks = bytesLeft >> 2;
bytesLeft = bytesLeft - (blocks << 2);
// blockIdx equals the number of 64-bit words written above, so this is the
// address of the first byte not yet filled.
_UINT32 *destPtr4 = (_UINT32*)&destPtr8[blockIdx];
// The cast keeps the low 32 bits of the pattern.
for (blockIdx = 0; blockIdx < blocks; blockIdx++) destPtr4[blockIdx] = (_UINT32)cUll;
if (!bytesLeft) return dest;
// 2-byte phase: same scheme with 16-bit stores.
blocks = bytesLeft >> 1;
bytesLeft = bytesLeft - (blocks << 1);
// Continue right after the 32-bit words written in the previous phase.
_UINT16 *destPtr2 = (_UINT16*)&destPtr4[blockIdx];
for (blockIdx = 0; blockIdx < blocks; blockIdx++) destPtr2[blockIdx] = (_UINT16)cUll;
if (!bytesLeft) return dest;
// Final phase: whatever is still left is written byte by byte.
_UINT8 *destPtr1 = (_UINT8*)&destPtr2[blockIdx];
for (blockIdx = 0; blockIdx < bytesLeft; blockIdx++) destPtr1[blockIdx] = (_UINT8)cUll;
return dest;
}
int _tmain(int argc, _TCHAR* argv[])
{
TIMER_INIT
const size_t n = 10000000;
const _UINT64 m = _UINT64_MAX;
const _UINT64 o = 1;
char test[n];
{
cout << "memset()" << endl;
TIMER_START;
for (int i = 0; i < m ; i++)
for (int j = 0; j < o ; j++)
memset((void*)test, 0, n);
TIMER_STOP;
}
{
cout << "MemSet() took:" << endl;
TIMER_START;
for (int i = 0; i < m ; i++)
for (int j = 0; j < o ; j++)
MemSet((void*)test, 0, n);
TIMER_STOP;
}
cout << "Done" << endl;
int wait;
cin >> wait;
return 0;
}
Output of a release build for 32-bit systems (times in milliseconds):
memset() took:
5.569000
MemSet() took:
5.544000
Done
Output of a release build for 64-bit systems (times in milliseconds):
memset() took:
2.781000
MemSet() took:
2.765000
Done
Here you can find the source code of Berkeley's memset(), which I think is the most common implementation.