I am thinking on how to implement the conversion of an integer (4byte, unsigned) to string with SSE instructions. The usual routine is to divide the number and store it in a loc
Interesting problem. If you're interested in a 10 radix only itoa() then I have made a 10 times as fast example and a 3 times as fast example as the typical itoa() implementation.
First example (3x performance)
The first, which is 3 times as fast as itoa(), uses a single-pass non-reversal software design pattern and is based on the open source itoa() implementation found in groff.
// itoaSpeedTest.cpp : Defines the entry point for the console application.
//
#pragma comment(lib, "Winmm.lib")
#include "stdafx.h"
#include "Windows.h"
#include
#include
using namespace std;
#ifdef _WIN32
/** a signed 32-bit integer value type */
#define _INT32 __int32
#else
/** a signed 32-bit integer value type */
#define _INT32 long int // Guess what a 32-bit integer is
#endif
/** minimum allowed value in a signed 32-bit integer value type */
#define _INT32_MIN -2147483647
/** maximum allowed value in a signed 32-bit integer value type */
#define _INT32_MAX 2147483647
/** maximum allowed number of characters in a signed 32-bit integer value type including a '-' */
#define _INT32_MAX_LENGTH 11
#ifdef _WIN32
/** Use to init the clock */
#define TIMER_INIT LARGE_INTEGER frequency;LARGE_INTEGER t1, t2;double elapsedTime;QueryPerformanceFrequency(&frequency);
/** Use to start the performance timer */
#define TIMER_START QueryPerformanceCounter(&t1);
/** Use to stop the performance timer and output the result to the standard stream */
#define TIMER_STOP QueryPerformanceCounter(&t2);elapsedTime=(t2.QuadPart-t1.QuadPart)*1000.0/frequency.QuadPart;wcout<, 1989-1992
\author Inge Eivind Henriksen, 2013
\note Function was originally a part of \a groff, and was refactored & optimized in 2013.
\relates itoa()
*/
const char *Int32ToStr(_INT32 i)
{
// Make room for a 32-bit signed integers digits and the '\0'
char buf[_INT32_MAX_LENGTH + 2];
char *p = buf + _INT32_MAX_LENGTH + 1;
*--p = '\0';
if (i >= 0)
{
do
{
*--p = numbersIn10Radix[i % 10];
i /= 10;
} while (i);
}
else
{
// Negative integer
do
{
*--p = reverseArrayEndPtr[i % 10];
i /= 10;
} while (i);
*--p = '-';
}
return p;
}
int _tmain(int argc, _TCHAR* argv[])
{
TIMER_INIT
// Make sure we are playing fair here
if (sizeof(int) != sizeof(_INT32))
{
cerr << "Error: integer size mismatch; test would be invalid." << endl;
return -1;
}
const int steps = 100;
{
char intBuffer[20];
cout << "itoa() took:" << endl;
TIMER_START;
for (int i = _INT32_MIN; i < i + steps ; i += steps)
itoa(i, intBuffer, 10);
TIMER_STOP;
}
{
cout << "Int32ToStr() took:" << endl;
TIMER_START;
for (int i = _INT32_MIN; i < i + steps ; i += steps)
Int32ToStr(i);
TIMER_STOP;
}
cout << "Done" << endl;
int wait;
cin >> wait;
return 0;
}
On 64-bit Windows the result from running this example is:
itoa() took:
2909.84 ms.
Int32ToStr() took:
991.726 ms.
Done
On 32-bit Windows the result from running this example is:
itoa() took:
3119.6 ms.
Int32ToStr() took:
1031.61 ms.
Done
Second example (10x performance)
If you don't mind spending some time initializing some buffers then it's possible to optimize the function above to be 10x faster than the typical itoa() implementation. What you need to do is to create string buffers rather than character buffers, like this:
// itoaSpeedTest.cpp : Defines the entry point for the console application.
//
#pragma comment(lib, "Winmm.lib")
#include "stdafx.h"
#include "Windows.h"
#include
#include
using namespace std;
#ifdef _WIN32
/** a signed 32-bit integer value type */
#define _INT32 __int32
/** a signed 8-bit integer value type */
#define _INT8 __int8
/** an unsigned 8-bit integer value type */
#define _UINT8 unsigned _INT8
#else
/** a signed 32-bit integer value type */
#define _INT32 long int // Guess what a 32-bit integer is
/** a signed 8-bit integer value type */
#define _INT8 char
/** an unsigned 8-bit integer value type */
#define _UINT8 unsigned _INT8
#endif
/** minimum allowed value in a signed 32-bit integer value type */
#define _INT32_MIN -2147483647
/** maximum allowed value in a signed 32-bit integer value type */
#define _INT32_MAX 2147483647
/** maximum allowed number of characters in a signed 32-bit integer value type including a '-' */
#define _INT32_MAX_LENGTH 11
#ifdef _WIN32
/** Use to init the clock */
#define TIMER_INIT LARGE_INTEGER frequency;LARGE_INTEGER t1, t2;double elapsedTime;QueryPerformanceFrequency(&frequency);
/** Use to start the performance timer */
#define TIMER_START QueryPerformanceCounter(&t1);
/** Use to stop the performance timer and output the result to the standard stream. Less verbose than \c TIMER_STOP_VERBOSE */
#define TIMER_STOP QueryPerformanceCounter(&t2);elapsedTime=(t2.QuadPart-t1.QuadPart)*1000.0/frequency.QuadPart;wcout<= 1, setting it smaller will
make the buffers smaller but the performance slower. If you want to set it larger than 100000 then you
must add some more cases to the switch blocks. Try to make it smaller to see the difference in
performance. It does however seem to become slower if larger than 100000 */
static const _INT32 numElem10Radix = 100000;
/** Array used for fast lookup number character lookup */
const char *numbersIn10Radix[numElem10Radix] = {};
_UINT8 numbersIn10RadixLen[numElem10Radix] = {};
/** Array used for fast lookup number character lookup */
const char *reverseNumbersIn10Radix[numElem10Radix] = {};
_UINT8 reverseNumbersIn10RadixLen[numElem10Radix] = {};
void InitBuffers()
{
char intBuffer[20];
for (int i = 0; i < numElem10Radix; i++)
{
itoa(i, intBuffer, 10);
size_t numLen = strlen(intBuffer);
char *intStr = new char[numLen + 1];
strcpy(intStr, intBuffer);
numbersIn10Radix[i] = intStr;
numbersIn10RadixLen[i] = numLen;
reverseNumbersIn10Radix[numElem10Radix - 1 - i] = intStr;
reverseNumbersIn10RadixLen[numElem10Radix - 1 - i] = numLen;
}
}
/*!
\brief Converts a 32-bit signed integer to a string
\param i [in] Integer
\par Software design pattern
Uses a single pass non-reversing algorithm with string buffers and is 10x as fast as \c itoa().
\returns Integer as a string
\copyright GNU General Public License
\copyright 1989-1992 Free Software Foundation, Inc.
\date 1989-1992, 2013
\author James Clark, 1989-1992
\author Inge Eivind Henriksen, 2013
\note This file was originally a part of \a groff, and was refactored & optimized in 2013.
\relates itoa()
*/
const char *Int32ToStr(_INT32 i)
{
/* Room for INT_DIGITS digits, - and '\0' */
char buf[_INT32_MAX_LENGTH + 2];
char *p = buf + _INT32_MAX_LENGTH + 1;
_INT32 modVal;
*--p = '\0';
if (i >= 0)
{
do
{
modVal = i % numElem10Radix;
switch(numbersIn10RadixLen[modVal])
{
case 5:
*--p = numbersIn10Radix[modVal][4];
case 4:
*--p = numbersIn10Radix[modVal][3];
case 3:
*--p = numbersIn10Radix[modVal][2];
case 2:
*--p = numbersIn10Radix[modVal][1];
default:
*--p = numbersIn10Radix[modVal][0];
}
i /= numElem10Radix;
} while (i);
}
else
{
// Negative integer
const char **reverseArray = &reverseNumbersIn10Radix[numElem10Radix - 1];
const _UINT8 *reverseArrayLen = &reverseNumbersIn10RadixLen[numElem10Radix - 1];
do
{
modVal = i % numElem10Radix;
switch(reverseArrayLen[modVal])
{
case 5:
*--p = reverseArray[modVal][4];
case 4:
*--p = reverseArray[modVal][3];
case 3:
*--p = reverseArray[modVal][2];
case 2:
*--p = reverseArray[modVal][1];
default:
*--p = reverseArray[modVal][0];
}
i /= numElem10Radix;
} while (i);
*--p = '-';
}
return p;
}
int _tmain(int argc, _TCHAR* argv[])
{
InitBuffers();
TIMER_INIT
// Make sure we are playing fair here
if (sizeof(int) != sizeof(_INT32))
{
cerr << "Error: integer size mismatch; test would be invalid." << endl;
return -1;
}
const int steps = 100;
{
char intBuffer[20];
cout << "itoa() took:" << endl;
TIMER_START;
for (int i = _INT32_MIN; i < i + steps ; i += steps)
itoa(i, intBuffer, 10);
TIMER_STOP;
}
{
cout << "Int32ToStr() took:" << endl;
TIMER_START;
for (int i = _INT32_MIN; i < i + steps ; i += steps)
Int32ToStr(i);
TIMER_STOP;
}
cout << "Done" << endl;
int wait;
cin >> wait;
return 0;
}
On 64-bit Windows the result from running this example is:
itoa() took:
2914.12 ms.
Int32ToStr() took:
306.637 ms.
Done
On 32-bit Windows the result from running this example is:
itoa() took:
3126.12 ms.
Int32ToStr() took:
299.387 ms.
Done
Why do you use reverse string lookup buffers?
It's possible to do this without the reverse string lookup buffers (thus saving 1/2 the internal memory), but this makes it significantly slower (timed at about 850 ms on 64-bit and 380 ms on 32-bit systems). It's not clear to me exactly why it's so much slower - especially on 64-bit systems, to test this further yourself you can change simply the following code:
#define _UINT32 unsigned _INT32
...
static const _UINT32 numElem10Radix = 100000;
...
void InitBuffers()
{
char intBuffer[20];
for (int i = 0; i < numElem10Radix; i++)
{
_itoa(i, intBuffer, 10);
size_t numLen = strlen(intBuffer);
char *intStr = new char[numLen + 1];
strcpy(intStr, intBuffer);
numbersIn10Radix[i] = intStr;
numbersIn10RadixLen[i] = numLen;
}
}
...
const char *Int32ToStr(_INT32 i)
{
char buf[_INT32_MAX_LENGTH + 2];
char *p = buf + _INT32_MAX_LENGTH + 1;
_UINT32 modVal;
*--p = '\0';
_UINT32 j = i;
do
{
modVal = j % numElem10Radix;
switch(numbersIn10RadixLen[modVal])
{
case 5:
*--p = numbersIn10Radix[modVal][4];
case 4:
*--p = numbersIn10Radix[modVal][3];
case 3:
*--p = numbersIn10Radix[modVal][2];
case 2:
*--p = numbersIn10Radix[modVal][1];
default:
*--p = numbersIn10Radix[modVal][0];
}
j /= numElem10Radix;
} while (j);
if (i < 0) *--p = '-';
return p;
}