I suppose I could offer this gem:
unsigned long isqrt(unsigned long value)
{
unsigned long tmp = 1, root = 0;
#define ISQRT_INNER(shift) \
{ \
if (value >= (tmp = ((root << 1) + (1 << (shift))) << (shift))) \
{ \
root += 1 << shift; \
value -= tmp; \
} \
}
// Find out how many bytes our value uses
// so we don't do any uneeded work.
if (value & 0xffff0000)
{
if ((value & 0xff000000) == 0)
tmp = 3;
else
tmp = 4;
}
else if (value & 0x0000ff00)
tmp = 2;
switch (tmp)
{
case 4:
ISQRT_INNER(15);
ISQRT_INNER(14);
ISQRT_INNER(13);
ISQRT_INNER(12);
case 3:
ISQRT_INNER(11);
ISQRT_INNER(10);
ISQRT_INNER( 9);
ISQRT_INNER( 8);
case 2:
ISQRT_INNER( 7);
ISQRT_INNER( 6);
ISQRT_INNER( 5);
ISQRT_INNER( 4);
case 1:
ISQRT_INNER( 3);
ISQRT_INNER( 2);
ISQRT_INNER( 1);
ISQRT_INNER( 0);
}
#undef ISQRT_INNER
return root;
}
Since the square-root was calculated at a very sensitive place, I got the task of looking into a way to make it faster. This small refactoring reduced the execution time by a third (for the combination of hardware and compiler used, YMMV):
unsigned long isqrt(unsigned long value)
{
unsigned long tmp = 1, root = 0;
#define ISQRT_INNER(shift) \
{ \
if (value >= (tmp = ((root << 1) + (1 << (shift))) << (shift))) \
{ \
root += 1 << shift; \
value -= tmp; \
} \
}
ISQRT_INNER (15);
ISQRT_INNER (14);
ISQRT_INNER (13);
ISQRT_INNER (12);
ISQRT_INNER (11);
ISQRT_INNER (10);
ISQRT_INNER ( 9);
ISQRT_INNER ( 8);
ISQRT_INNER ( 7);
ISQRT_INNER ( 6);
ISQRT_INNER ( 5);
ISQRT_INNER ( 4);
ISQRT_INNER ( 3);
ISQRT_INNER ( 2);
ISQRT_INNER ( 1);
ISQRT_INNER ( 0);
#undef ISQRT_INNER
return root;
}
Of course there are both faster AND better ways to do this, but I think it's a pretty neat example of a pessimization.
Edit: Come to think of it, the unrolled loop was actually also a neat pessimization. Digging though the version control, I can present the second stage of refactoring as well, which performed even better than the above:
unsigned long isqrt(unsigned long value)
{
unsigned long tmp = 1 << 30, root = 0;
while (tmp != 0)
{
if (value >= root + tmp) {
value -= root + tmp;
root += tmp << 1;
}
root >>= 1;
tmp >>= 2;
}
return root;
}
This is exactly the same algorithm, albeit a slightly different implementation, so I suppose it qualifies.