I'm looking to optimize this linear search:
static int
linear (const int *arr, int n, int key)
{
    int i = 0;
    while (i < n) {
        if (arr[i] >= key) break;
        ++i;
    }
    return i;
}
In reality, the answer to this question is 100% dependent on the platform you're writing the code for. For example:
CPU : Memory speed | Example CPU | Type of optimisation
========================================================================
Equal              | 8086        | (1) Loop unrolling
------------------------------------------------------------------------
CPU > RAM          | Pentium     | (2) None
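For the CPU-bound case, "(1) Loop unrolling" simply means testing several elements per iteration to amortise the loop overhead. A minimal sketch of the idea (illustrative only; a tuned fixed-index variant appears in a later answer below):

static int linear_unrolled(const int *arr, int n, int key)
{
    int i = 0;
    /* test four elements per iteration to reduce loop overhead */
    for (; i + 4 <= n; i += 4) {
        if (arr[i]     >= key) return i;
        if (arr[i + 1] >= key) return i + 1;
        if (arr[i + 2] >= key) return i + 2;
        if (arr[i + 3] >= key) return i + 3;
    }
    /* finish any remaining elements */
    for (; i < n; ++i)
        if (arr[i] >= key) return i;
    return -1;
}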
You can avoid the n end-of-array checks, much as loop unrolling does, by planting a sentinel at the end of the array:
static int linear(int *array, int arraySize, int key)
{
    /* assumes the caller reserves one extra slot at index arraySize
       for the sentinel, so the real data occupies indices 0..arraySize-1 */
    array[arraySize] = key;
    int i = 0;
    for (; ; ++i)
    {
        if (array[i] == key) return i;
    }
}
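To make the precondition concrete, here is a hedged usage sketch (the buffer contents are purely illustrative): the caller reserves one spare slot for the sentinel, and a returned index equal to the logical size means the key was only found in the sentinel, i.e. not present in the real data.

#include <stdio.h>

int main(void)
{
    int data[8] = { 2, 3, 5, 7, 11, 13, 17 };   /* 7 real elements + 1 spare slot */
    int idx = linear(data, 7, 11);               /* writes the sentinel into data[7] */
    if (idx == 7)
        printf("not found\n");
    else
        printf("found at index %d\n", idx);
    return 0;
}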
If a target-specific solution is acceptable then you can quite easily use SIMD (SSE, AltiVec, or whatever you have available) to get ~ 4x speed-up by testing 4 elements at a time rather than just 1.
Out of interest I put together a simple SIMD implementation as follows:
#include <stdint.h>
#include <emmintrin.h>   /* SSE2 intrinsics */

int linear_search_ref(const int32_t *A, int32_t key, int n)
{
    int result = -1;
    int i;
    for (i = 0; i < n; ++i)
    {
        if (A[i] >= key)
        {
            result = i;
            break;
        }
    }
    return result;
}
int linear_search(const int32_t *A, int32_t key, int n)
{
#define VEC_INT_ELEMS 4
#define BLOCK_SIZE (VEC_INT_ELEMS * 32)
    const __m128i vkey = _mm_set1_epi32(key);
    int vresult = -1;
    int result = -1;
    int i, j;

    for (i = 0; i <= n - BLOCK_SIZE; i += BLOCK_SIZE)
    {
        __m128i vmask0 = _mm_set1_epi32(-1);
        __m128i vmask1 = _mm_set1_epi32(-1);
        int mask0, mask1;

        for (j = 0; j < BLOCK_SIZE; j += VEC_INT_ELEMS * 2)
        {
            __m128i vA0 = _mm_load_si128((const __m128i *)&A[i + j]);
            __m128i vA1 = _mm_load_si128((const __m128i *)&A[i + j + VEC_INT_ELEMS]);
            __m128i vcmp0 = _mm_cmpgt_epi32(vkey, vA0);
            __m128i vcmp1 = _mm_cmpgt_epi32(vkey, vA1);
            vmask0 = _mm_and_si128(vmask0, vcmp0);
            vmask1 = _mm_and_si128(vmask1, vcmp1);
        }
        mask0 = _mm_movemask_epi8(vmask0);
        mask1 = _mm_movemask_epi8(vmask1);
        if ((mask0 & mask1) != 0xffff)
        {
            vresult = i;
            break;
        }
    }
    if (vresult > -1)
    {
        result = vresult + linear_search_ref(&A[vresult], key, BLOCK_SIZE);
    }
    else if (i < n)
    {
        result = i + linear_search_ref(&A[i], key, n - i);
    }
    return result;
#undef BLOCK_SIZE
#undef VEC_INT_ELEMS
}
On a 2.67 GHz Core i7, using OpenSUSE x86-64 and gcc 4.3.2, I get around a 7x - 8x improvement across a fairly broad "sweet spot" where n = 100000, with the key being found at the midpoint of the array (i.e. result = n / 2). Performance drops off to around 3.5x when n gets large and the array therefore exceeds cache size (presumably becoming memory bandwidth-limited in this case). Performance also drops off when n is small, due to the inefficiency of the SIMD implementation (it was optimised for large n, of course).
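Since the inner loop uses aligned loads (_mm_load_si128), the array passed to linear_search must be 16-byte aligned. A minimal caller sketch, assuming _mm_malloc/_mm_free are available (the data and key here are purely illustrative):

#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>

int main(void)
{
    const int n = 100000;
    int32_t *A = (int32_t *)_mm_malloc(n * sizeof(int32_t), 16);  /* 16-byte aligned */
    if (A == NULL)
        return 1;
    for (int i = 0; i < n; ++i)
        A[i] = 2 * i;                      /* sorted ascending data */
    int idx = linear_search(A, 12345, n);  /* index of first element >= 12345 */
    printf("index = %d\n", idx);
    _mm_free(A);
    return 0;
}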
If you're on an Intel platform:
int linear (const int *array, int n, int key)
{
    /* 32-bit x86, MSVC-style inline assembly */
    __asm
    {
        mov edi,array       ; edi -> start of array
        mov ecx,n           ; ecx = number of dwords to scan
        mov eax,key         ; eax = value to look for
        repne scasd         ; scan until a match or ecx reaches 0
        mov eax,-1          ; assume "not found" (mov leaves the flags intact)
        jne end             ; no match -> return -1
        mov eax,n
        sub eax,ecx         ; number of elements consumed
        dec eax             ; scasd stops one element past the match
end:
    }
}
However, that only finds exact matches, not greater-than-or-equal matches.
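For readers who do not speak x86 string instructions, a rough C equivalent of what the repne scasd loop computes (exact match, -1 if absent) is simply:

/* rough C equivalent of the repne scasd version above */
int linear_exact(const int *array, int n, int key)
{
    for (int i = 0; i < n; ++i)
        if (array[i] == key)
            return i;
    return -1;
}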
In C, you can also use Duff's Device:
int linear (const int *array, int n, int key)
{
    /* assumes n > 0 */
    const int *end = &array[n];
    int result = 0;

    switch (n % 8)
    {
        do {
    case 0:
            if (*(array++) >= key) break;
            ++result;
    case 7:
            if (*(array++) >= key) break;
            ++result;
    case 6:
            if (*(array++) >= key) break;
            ++result;
    case 5:
            if (*(array++) >= key) break;
            ++result;
    case 4:
            if (*(array++) >= key) break;
            ++result;
    case 3:
            if (*(array++) >= key) break;
            ++result;
    case 2:
            if (*(array++) >= key) break;
            ++result;
    case 1:
            if (*(array++) >= key) break;
            ++result;
        } while (array < end);
    }

    return result;
}
First of all, any fast solution must use vectorization to compare many elements at once.
However, all the vectorized implementations posted so far suffer from a common problem: they have branches. As a result, they have to introduce blockwise processing of the array (to reduce overhead of branching), which leads to low performance for small arrays. For large arrays linear search is worse than a well-optimized binary search, so there is no point in optimizing it.
However, linear search can be implemented without branches at all. The idea is very simple: since the array is sorted, the index you want is precisely the number of elements in the array that are less than the key you search for. So you can compare each element of the array to the key value and sum up all the comparison results:
static int linear_stgatilov_scalar (const int *arr, int n, int key) {
    int cnt = 0;
    for (int i = 0; i < n; i++)
        cnt += (arr[i] < key);
    return cnt;
}
A fun thing about this solution is that it would return the same answer even if you shuffle the array =) Although this solution seems to be rather slow, it can be vectorized elegantly. The implementation provided below requires the array to be 16-byte aligned. Also, the array must be padded with INT_MAX elements because it consumes 16 elements at once.
#include <assert.h>
#include <stddef.h>
#include <emmintrin.h>    /* SSE2 */
#include <tmmintrin.h>    /* SSSE3: _mm_hadd_epi32 */

static int linear_stgatilov_vec (const int *arr, int n, int key) {
    assert((size_t)arr % 16 == 0);
    __m128i vkey = _mm_set1_epi32(key);
    __m128i cnt = _mm_setzero_si128();
    for (int i = 0; i < n; i += 16) {
        __m128i mask0 = _mm_cmplt_epi32(_mm_load_si128((const __m128i *)&arr[i+0]), vkey);
        __m128i mask1 = _mm_cmplt_epi32(_mm_load_si128((const __m128i *)&arr[i+4]), vkey);
        __m128i mask2 = _mm_cmplt_epi32(_mm_load_si128((const __m128i *)&arr[i+8]), vkey);
        __m128i mask3 = _mm_cmplt_epi32(_mm_load_si128((const __m128i *)&arr[i+12]), vkey);
        __m128i sum = _mm_add_epi32(_mm_add_epi32(mask0, mask1), _mm_add_epi32(mask2, mask3));
        cnt = _mm_sub_epi32(cnt, sum);      /* each mask lane is 0 or -1, so subtract to count */
    }
    cnt = _mm_hadd_epi32(cnt, cnt);
    cnt = _mm_hadd_epi32(cnt, cnt);
    //int ans = _mm_extract_epi32(cnt, 0);  //SSE4.1
    int ans = _mm_extract_epi16(cnt, 0);    //correct only for n < 32K
    return ans;
}
The final horizontal reduction of the single SSE register can be implemented with SSE2-only instructions if necessary; it should not really affect the overall performance.
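For instance, a hedged SSE2-only variant of that reduction (replacing the two _mm_hadd_epi32 calls and the 16-bit extract) could look like this:

/* SSE2-only horizontal sum of the four 32-bit lanes of cnt */
cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, _MM_SHUFFLE(1, 0, 3, 2)));
cnt = _mm_add_epi32(cnt, _mm_shuffle_epi32(cnt, _MM_SHUFFLE(2, 3, 0, 1)));
int ans = _mm_cvtsi128_si32(cnt);   /* full 32-bit result, no n < 32K restriction */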
I have tested it with the Visual C++ 2013 x64 compiler on an Intel Core 2 Duo E4700 (quite old, yeah). An array of size 197 is generated with elements provided by rand(). The full code with all the testing is here. Here is the time to perform 32M searches:
[OP]
Time = 3.155 (-896368640) //the original OP's code
[Paul R]
Time = 2.933 (-896368640)
[stgatilov]
Time = 1.139 (-896368640) //the code suggested
The OP's original code processes 10.6 million arrays per second (2.1 billion elements per second). The suggested code processes 29.5 million arrays per second (5.8 billion elements per second). Also, the suggested code works well for smaller arrays: even for arrays of 15 elements, it is still almost three times faster than the OP's original code.
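To satisfy the alignment and padding requirements stated above, a caller might do something along these lines (a hedged sketch; alloc_padded is an illustrative name, not part of the original code):

#include <limits.h>
#include <xmmintrin.h>   /* _mm_malloc / _mm_free */

int *alloc_padded(int n)
{
    int padded = (n + 15) & ~15;                        /* round up to a multiple of 16 */
    int *arr = (int *)_mm_malloc(padded * sizeof(int), 16);
    if (arr != NULL)
        for (int i = n; i < padded; ++i)
            arr[i] = INT_MAX;                           /* padding is never counted as < key */
    return arr;
}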
Here is the generated assembly:
$LL56@main:
    movdqa   xmm2, xmm4
    movdqa   xmm0, xmm4
    movdqa   xmm1, xmm4
    lea      rcx, QWORD PTR [rcx+64]
    pcmpgtd  xmm0, XMMWORD PTR [rcx-80]
    pcmpgtd  xmm2, XMMWORD PTR [rcx-96]
    pcmpgtd  xmm1, XMMWORD PTR [rcx-48]
    paddd    xmm2, xmm0
    movdqa   xmm0, xmm4
    pcmpgtd  xmm0, XMMWORD PTR [rcx-64]
    paddd    xmm1, xmm0
    paddd    xmm2, xmm1
    psubd    xmm3, xmm2
    dec      r8
    jne      SHORT $LL56@main
$LN54@main:
    phaddd   xmm3, xmm3
    inc      rdx
    phaddd   xmm3, xmm3
    pextrw   eax, xmm3, 0
Finally, I'd like to note that a well-optimized binary search can be made faster by switching to the described vectorized linear search as soon as the interval becomes small.
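As a hedged illustration of that idea (the threshold of 64 is a guess, not a tuned value), the hybrid could look like the sketch below. The scalar counting search is used here for simplicity; the vectorized one can be dropped in once the starting offset is aligned down to a 16-element boundary and the tail is padded as described above.

static int hybrid_search(const int *arr, int n, int key)
{
    int lo = 0, hi = n;
    /* ordinary binary search until the interval is small */
    while (hi - lo > 64) {
        int mid = lo + (hi - lo) / 2;
        if (arr[mid] < key)
            lo = mid + 1;
        else
            hi = mid;
    }
    /* finish with the branchless linear search over [lo, hi) */
    return lo + linear_stgatilov_scalar(arr + lo, hi - lo, key);
}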
UPDATE: More information can be found in my blog post on the matter.
Unroll with fixed array indices:
int linear( const int *array, int n, int key ) {
    int i = 0;
    if ( array[n-1] >= key ) {   /* guarantees the unrolled loop will hit a match before the end */
        do {
            if ( array[0] >= key ) return i+0;
            if ( array[1] >= key ) return i+1;
            if ( array[2] >= key ) return i+2;
            if ( array[3] >= key ) return i+3;
            array += 4;
            i += 4;
        } while ( 1 );
    }
    return -1;
}