I'm looking to optimize this linear search:
static int
linear (const int *arr, int n, int key)
{
int i = 0;
while (i < n) {
/**
 * Find the first occurrence of a byte sequence inside a buffer.
 *
 * The buffer is scanned 16 bytes at a time: every byte of a chunk is compared
 * against the first needle byte with _mm_cmpeq_epi8, and _mm_movemask_epi8
 * turns the result into a 16-bit mask with one bit per candidate offset.
 * Only offsets whose bit is set are confirmed with memcmp over the full
 * needle. Offsets that do not fit into a complete 16-byte chunk are handled
 * by a scalar loop, so no byte outside data[0 .. data_len-1] or
 * finddata[0 .. finddatalen-1] is ever read.
 *
 * Only SSE2 intrinsics are used; they are available through any of the SSE4
 * intrinsic headers.
 *
 * @param data        Buffer to search; no alignment requirement
 *                    (an unaligned load is used).
 * @param data_len    Number of valid bytes in data.
 * @param finddata    Byte sequence to look for.
 * @param finddatalen Number of valid bytes in finddata.
 * @return Offset of the first match, truncated to uint32. Returns 0 when
 *         there is no match, when either pointer is NULL, when the needle is
 *         empty, or when the needle is longer than the buffer. A match at
 *         offset 0 is therefore indistinguishable from "not found"; this is
 *         the historical contract and is kept so existing callers still work.
 */
uint32 LinearFindSse4( uint8* data, size_t data_len, uint8* finddata, size_t finddatalen )
{
    if (data == NULL || finddata == NULL || finddatalen == 0 || data_len < finddatalen)
    {
        return 0;
    }

    /* Highest offset at which the whole needle still fits inside data. */
    const size_t last_start = data_len - finddatalen;
    const __m128i first_byte = _mm_set1_epi8( (char)finddata[0] );
    size_t i = 0;

    /* Vector phase: runs only while a full 16-byte load stays inside data. */
    for (; i + 16 <= data_len && i <= last_start; i += 16)
    {
        const __m128i chunk = _mm_loadu_si128( (const __m128i *)&data[i] );
        /* Bit k of mask is set when data[i + k] equals the first needle byte. */
        unsigned int mask = (unsigned int)_mm_movemask_epi8( _mm_cmpeq_epi8( chunk, first_byte ) );

        for (size_t pos = i; mask != 0 && pos <= last_start; ++pos, mask >>= 1)
        {
            if ((mask & 1u) != 0 && memcmp( &data[pos], finddata, finddatalen ) == 0)
            {
                return (uint32)pos;
            }
        }
    }

    /* Scalar phase: remaining offsets that do not fill a whole chunk. */
    for (; i <= last_start; ++i)
    {
        if (data[i] == finddata[0] && memcmp( &data[i], finddata, finddatalen ) == 0)
        {
            return (uint32)i;
        }
    }

    return 0;
}