Best assembly or compilation for minimum of three values

问题

I'm looking at code generated by GCC-4.8 for x86_64 and wondering if there is a better (faster) way to compute the minimum of three values.

Here's an excerpt from Python's collections module that computes the minimum of m, rightindex+1, and leftindex:

    ssize_t m = n;
    if (m > rightindex + 1)
        m = rightindex + 1;
    if (m > leftindex)
        m = leftindex;

GCC generates serially dependent code with CMOVs:

leaq    1(%rbp), %rdx
cmpq    %rsi, %rdx
cmovg   %rsi, %rdx
cmpq    %rbx, %rdx
cmovg   %rbx, %rdx

Is there faster code that can take advantage of processor out-of-order parallel execution by removing the data dependencies? I'm wondering if there are known tricks for computing the minimum of multiple values without using conditionals or predicated instructions. Am also wondering if there are some saturating arithmetic intrinsics that would help in this situation.

EDITS:

As shown the code uses signed arithmetic, but an unsigned arithmetic answer would help as well.
I asked about a minimum-of-three but also am interested in minimum-of-n where n is small.
Linus's admonitions on CMOV: http://ondioline.org/mail/cmov-a-bad-idea-on-out-of-order-cpus

回答1:

Minimum of two unsigned numbers has classical solution:

; eax = min(eax, ebx), ecx - scratch register.
.min2:
    sub     ebx, eax
    sbb     ecx, ecx
    and     ecx, ebx
    add     eax, ecx

This approach is probably faster than the solution with cmov, but for higher speed the instructions have to be separated by other instructions for parallel execution.

Implementation of this method for three numbers is possible:

; eax = min(eax, ebx, edx), ecx - scratch register.
.min3:
    sub     ebx, eax
    sbb     ecx, ecx
    and     ecx, ebx
    add     eax, ecx

    sub     edx, eax
    sbb     ecx, ecx
    and     ecx, edx
    add     eax, ecx

Another try is to test the variant with conditional jumps. For the modern processors, it might be even faster, especially if the jumps are highly predictable:

.min3:
    cmp     eax, ebx
    jle     @f
    mov     eax, ebx
@@:
    cmp     eax, edx
    jle     @f
    mov     eax, edx
@@:

回答2:

Here are benchmark results for the methods suggested. Processor is Intel Core-i7 2600K running at 4GHz. Units are nanoseconds per loop (smaller is better). Measurements include pseudo-random test data generation and function call overhead, in addition to the actual min3 code under test.

                          gcc cmov      conditional      classical
                          sequence      branches         branchless

pseudo-random data        2.24          6.31             2.39
fixed data                2.24          2.99             2.39

Benchmark source code

functions.s: the 3 solutions evaluated:

//----------------------------------------------------------------------------

.text
.intel_syntax noprefix

//-----------------------------------------------------------------------------
// uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_a
min3_a:
   mov   rax, rcx
   cmp   rax, rdx
   cmovg rax, rdx
   cmp   rax, r8
   cmovg rax, r8
   retq

//-----------------------------------------------------------------------------
// uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_b
min3_b:
   mov   rax, rcx
   cmp   rax, rdx
   jle   skip1
   mov   rax, rdx
skip1:
   cmp   rax, r8
   jle   skip2
   mov   rax, r8
skip2:
   retq

//-----------------------------------------------------------------------------
// uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8)
.globl min3_c
min3_c:
   sub   rdx, rcx
   sbb   rax, rax
   and   rax, rdx
   add   rcx, rax
   sub   r8,  rcx
   sbb   rax, rax
   and   rax, r8
   add   rax, rcx
   retq

//-----------------------------------------------------------------------------

min3test.c, main program (mingw64/windows):

#define __USE_MINGW_ANSI_STDIO 1
#include <windows.h>
#include <stdio.h>
#include <stdint.h>

 uint64_t min3_a (uint64_t rcx, uint64_t rdx, uint64_t r8);
 uint64_t min3_b (uint64_t rcx, uint64_t rdx, uint64_t r8);
 uint64_t min3_c (uint64_t rcx, uint64_t rdx, uint64_t r8);

//-----------------------------------------------------------------------------
//
//  queryPerformanceCounter - similar to QueryPerformanceCounter, but returns
//                            count directly.

uint64_t queryPerformanceCounter (void)
    {
    LARGE_INTEGER int64;
    QueryPerformanceCounter (&int64);
    return int64.QuadPart;
    }

//-----------------------------------------------------------------------------
//
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns  count direcly.

uint64_t queryPerformanceFrequency (void)
    {
    LARGE_INTEGER int64;

    QueryPerformanceFrequency (&int64);
    return int64.QuadPart;
    }

//----------------------------------------------------------------------------
//
// lfsr64gpr - left shift galois type lfsr for 64-bit data, general purpose register implementation
//
static uint64_t lfsr64gpr (uint64_t data, uint64_t mask)
   {
   uint64_t carryOut = data >> 63;
   uint64_t maskOrZ = -carryOut; 
   return (data << 1) ^ (maskOrZ & mask);
   }

//---------------------------------------------------------------------------

uint64_t min3 (uint64_t a, uint64_t b, uint64_t c)
    {
    uint64_t smallest;

    smallest = a;
    if (smallest > b) smallest = b;
    if (smallest > c) smallest = c;
    return smallest;
    }

//---------------------------------------------------------------------------

static void runtests (uint64_t pattern, uint64_t mask)
    {
    uint64_t startCount, elapsed, index, loops = 800000000;
    double ns;

    startCount = queryPerformanceCounter ();
    for (index = 0; index < loops; index++)
        {
        pattern = lfsr64gpr (pattern, mask);
        min3_a (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
        }
    elapsed = queryPerformanceCounter () - startCount;
    ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
    printf ("%7.2f ns\n", ns);

    startCount = queryPerformanceCounter ();
    for (index = 0; index < loops; index++)
        {
        pattern = lfsr64gpr (pattern, mask);
        min3_b (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
        }
    elapsed = queryPerformanceCounter () - startCount;
    ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
    printf ("%7.2f ns\n", ns);

    startCount = queryPerformanceCounter ();
    for (index = 0; index < loops; index++)
        {
        pattern = lfsr64gpr (pattern, mask);
        min3_c (pattern & 0xFFFFF, (pattern >> 20) & 0xFFFFF, (pattern >> 40) & 0xFFFFF);
        }
    elapsed = queryPerformanceCounter () - startCount;
    ns = (double) elapsed / queryPerformanceFrequency () * 1000000000 / loops;
    printf ("%7.2f ns\n", ns);
    }

//---------------------------------------------------------------------------

int main (void)
    {
    uint64_t index, pattern, mask;
    uint64_t count_a = 0, count_b = 0, count_c = 0;

    mask = 0xBEFFFFFFFFFFFFFF;
    pattern = 1;

    // sanity check the asm functions
    for (index = 0; index < 1000000; index++)
        {
        uint64_t expected, result_a, result_b, result_c;
        uint64_t pattern1 = (pattern >>  0) & 0xFFFFF;
        uint64_t pattern2 = (pattern >> 20) & 0xFFFFF;
        uint64_t pattern3 = (pattern >> 40) & 0xFFFFF;
        pattern = lfsr64gpr (pattern, mask);
        expected = min3 (pattern1, pattern2, pattern3);
        result_a = min3_a (pattern1, pattern2, pattern3);
        result_b = min3_b (pattern1, pattern2, pattern3);
        result_c = min3_c (pattern1, pattern2, pattern3);
        if (result_a != expected) printf ("min3_a fail, %llu %llu %llu %llu %llu\n", expected, result_a, pattern1, pattern2, pattern3);
        if (result_b != expected) printf ("min3_b fail, %llu %llu %llu %llu %llu\n", expected, result_b, pattern1, pattern2, pattern3);
        if (result_c != expected) printf ("min3_c fail, %llu %llu %llu %llu %llu\n", expected, result_c, pattern1, pattern2, pattern3);
        if (expected == pattern1) count_a++;
        if (result_b == pattern2) count_b++;
        if (result_c == pattern3) count_c++;
        }
    printf ("pseudo-random distribution: %llu, %llu, %llu\n", count_a, count_b, count_c);


    // raise our priority to increase measurement accuracy
    SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);

    printf ("using pseudo-random data\n");
    runtests (1, mask);
    printf ("using fixed data\n");
    runtests (0, mask);
    return 0;
    }

//---------------------------------------------------------------------------

build command line:

gcc -Wall -Wextra -O3 -omin3test.exe min3test.c functions.s

回答3:

The function min(x,y,z) is continuous, but its derivative is not. That derivative has norm 1 everywhere it's defined. There's just no way to express that as an arithmetic function.

Saturating arithmetic has its own discontinuities, so the previous reasoning can't be used in that case. However, the saturation point is independent from the input. That in turn implies you'd need to scale the inputs, at which point I'm confident the resulting code won't be faster.

That's of course not a full proof of the non-existance of faster code, but you'd probably need an exhaustive search for that.

来源：https://stackoverflow.com/questions/18306615/best-assembly-or-compilation-for-minimum-of-three-values

标签

optimization

assembly

x86-64

intrinsics