问题
I'm looking at code generated by GCC-4.8 for x86_64 and wondering if there is a better (faster) way to compute the minimum of three values.
Here's an excerpt from Python's collections module that computes the minimum of m
, rightindex+1
, and leftindex
:
ssize_t m = n;
if (m > rightindex + 1)
m = rightindex + 1;
if (m > leftindex)
m = leftindex;
GCC generates serially dependent code with CMOVs:
leaq 1(%rbp), %rdx
cmpq %rsi, %rdx
cmovg %rsi, %rdx
cmpq %rbx, %rdx
cmovg %rbx, %rdx
Is there faster code that can take advantage of processor out-of-order parallel execution by removing the data dependencies? I'm wondering if there are known tricks for computing the minimum of multiple values without using conditionals or predicated instructions. Am also wondering if there are some saturating arithmetic intrinsics that would help in this situation.
EDITS:
- As shown the code uses signed arithmetic, but an unsigned arithmetic answer would help as well.
- I asked about a minimum-of-three but also am interested in minimum-of-n where n is small.
- Linus's admonitions on CMOV: http://ondioline.org/mail/cmov-a-bad-idea-on-out-of-order-cpus
回答1:
Minimum of two unsigned numbers has classical solution:
; eax = min(eax, ebx), ecx - scratch register.
.min2:
sub ebx, eax
sbb ecx, ecx
and ecx, ebx
add eax, ecx
This approach is probably faster than the solution with cmov, but for higher speed the instructions have to be separated by other instructions for parallel execution.
Implementation of this method for three numbers is possible:
; eax = min(eax, ebx, edx), ecx - scratch register.
.min3:
sub ebx, eax
sbb ecx, ecx
and ecx, ebx
add eax, ecx
sub edx, eax
sbb ecx, ecx
and ecx, edx
add eax, ecx
Another try is to test the variant with conditional jumps. For the modern processors, it might be even faster, especially if the jumps are highly predictable:
.min3:
cmp eax, ebx
jle @f
mov eax, ebx
@@:
cmp eax, edx
jle @f
mov eax, edx
@@:
回答2:
Here are benchmark results for the methods suggested. Processor is Intel Core-i7 2600K running at 4GHz. Units are nanoseconds per loop (smaller is better). Measurements include pseudo-random test data generation and function call overhead, in addition to the actual min3 code under test.
gcc cmov conditional classical
sequence branches branchless
pseudo-random data 2.24 6.31 2.39
fixed data 2.24 2.99 2.39
Benchmark source code
functions.s: the 3 solutions evaluated:
//----------------------------------------------------------------------------
.text
.intel_syntax noprefix
//-----------------------------------------------------------------------------
// uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_a
min3_a:
mov rax, rcx
cmp rax, rdx
cmovg rax, rdx
cmp rax, r8
cmovg rax, r8
retq
//-----------------------------------------------------------------------------
// uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_b
min3_b:
mov rax, rcx
cmp rax, rdx
jle skip1
mov rax, rdx
skip1:
cmp rax, r8
jle skip2
mov rax, r8
skip2:
retq
//-----------------------------------------------------------------------------
// uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_c
min3_c:
sub rdx, rcx
sbb rax, rax
and rax, rdx
add rcx, rax
sub r8, rcx
sbb rax, rax
and rax, r8
add rax, rcx
retq
//-----------------------------------------------------------------------------
min3test.c, main program (mingw64/windows):
#define __USE_MINGW_ANSI_STDIO 1
#include <windows.h>
#include <stdio.h>
#include <stdint.h>
uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8);
uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8);
uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8);
//-----------------------------------------------------------------------------
//
// queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
// count directly.
uint64_t queryPerformanceCounter (void)
{
LARGE_INTEGER int64;
QueryPerformanceCounter (&int64);
return int64.QuadPart;
}
//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count direcly.
uint64_t queryPerformanceFrequency (void)
{
LARGE_INTEGER int64;
QueryPerformanceFrequency (&int64);
return int64.QuadPart;
}
//----------------------------------------------------------------------------
//
// lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation
//
static uint64_t lfsr64gpr (uint64_t data, uint64_t mask)
{
uint64_t carryOut = data >> 63;
uint64_t maskOrZ = -carryOut;
return (data << 1) ^ (maskOrZ & mask);
}
//---------------------------------------------------------------------------
uint64_t min3 (uint64_t a, uint64_t b, uint64_t c)
{
uint64_t smallest;
smallest = a;
if (smallest > b) smallest = b;
if (smallest > c) smallest = c;
return smallest;
}
//---------------------------------------------------------------------------
static void runtests (uint64_t pattern, uint64_t mask)
{
uint64_t startCount, elapsed, index, loops = 800000000;
double ns;
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_a (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns\n", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_b (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns\n", ns);
startCount = queryPerformanceCounter ();
for (index = 0; index < loops; index++)
{
pattern = lfsr64gpr (pattern, mask);
min3_c (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
}
elapsed = queryPerformanceCounter () - startCount;
ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
printf ("%7.2f ns\n", ns);
}
//---------------------------------------------------------------------------
int main (void)
{
uint64_t index, pattern, mask;
uint64_t count_a = 0, count_b = 0, count_c = 0;
mask = 0xBEFFFFFFFFFFFFFF;
pattern = 1;
// sanity check the asm functions
for (index = 0; index < 1000000; index++)
{
uint64_t expected, result_a, result_b, result_c;
uint64_t pattern1 = (pattern >> 0) & 0xFFFFF;
uint64_t pattern2 = (pattern >> 20) & 0xFFFFF;
uint64_t pattern3 = (pattern >> 40) & 0xFFFFF;
pattern = lfsr64gpr (pattern, mask);
expected = min3 (pattern1, pattern2, pattern3);
result_a = min3_a (pattern1, pattern2, pattern3);
result_b = min3_b (pattern1, pattern2, pattern3);
result_c = min3_c (pattern1, pattern2, pattern3);
if (result_a != expected) printf ("min3_a fail, %llu %llu %llu %llu %llu\n", expected, result_a, pattern1, pattern2, pattern3);
if (result_b != expected) printf ("min3_b fail, %llu %llu %llu %llu %llu\n", expected, result_b, pattern1, pattern2, pattern3);
if (result_c != expected) printf ("min3_c fail, %llu %llu %llu %llu %llu\n", expected, result_c, pattern1, pattern2, pattern3);
if (expected == pattern1) count_a++;
if (result_b == pattern2) count_b++;
if (result_c == pattern3) count_c++;
}
printf ("pseudo-random distribution: %llu, %llu, %llu\n", count_a, count_b, count_c);
// raise our priority to increase measurement accuracy
SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);
printf ("using pseudo-random data\n");
runtests (1, mask);
printf ("using fixed data\n");
runtests (0, mask);
return 0;
}
//---------------------------------------------------------------------------
build command line:
gcc -Wall -Wextra -O3 -omin3test.exe min3test.c functions.s
回答3:
The function min(x,y,z)
is continuous, but its derivative is not. That derivative has norm 1 everywhere it's defined. There's just no way to express that as an arithmetic function.
Saturating arithmetic has its own discontinuities, so the previous reasoning can't be used in that case. However, the saturation point is independent from the input. That in turn implies you'd need to scale the inputs, at which point I'm confident the resulting code won't be faster.
That's of course not a full proof of the non-existance of faster code, but you'd probably need an exhaustive search for that.
来源:https://stackoverflow.com/questions/18306615/best-assembly-or-compilation-for-minimum-of-three-values