I am trying to find sum reduction of 32 elements (each 1 byte data) on an Intel i3 processor. I did this:
// Scalar reduction: accumulate all 32 one-byte elements of a[] into s,
// one addition per loop iteration.
s=0;
for (i=0; i<32; i++)
{
s = s + a[i];
}
There is another way to find the sum of all elements of an array, using SIMD intrinsics — note that the code below actually uses 256-bit AVX (`__m256`), not just SSE. It works for a float array of any size.
// Vectorized sum of a[0..size-1] using 256-bit AVX, 8 floats per step.
//
// Fixes over the original:
//  * no posix_memalign buffer that was never freed (memory leak) and whose
//    return value was never checked — a small stack array is used instead;
//  * no dereference of (__m256*)a, which is undefined behavior unless `a`
//    is 32-byte aligned — _mm256_loadu_ps works for any alignment;
//  * target("avx") lets this translation unit compile without a global
//    -mavx flag (GCC/Clang extension).
//
// Note: float addition is not associative, so the result may differ from
// the sequential sum in the last bits for general inputs.
__attribute__((target("avx")))
float sse_array_sum(float *a, int size)
{
    float sum = 0;
    if (size >= 8)
    {
        // Seed the accumulator with the first 8 elements, then fold in
        // each following group of 8.
        __m256 vsum = _mm256_loadu_ps(a);
        int itrs = size / 8 - 1;
        a += 8;
        for (int i = 0; i < itrs; i++, a += 8)
            vsum = _mm256_add_ps(vsum, _mm256_loadu_ps(a));
        // Spill the 8 lanes and reduce them horizontally in scalar code.
        float lanes[8];
        _mm256_storeu_ps(lanes, vsum);
        for (int i = 0; i < 8; i++) sum += lanes[i];
    }
    // Handle the size % 8 remainder; `a` now points at the leftover elements.
    int rmd_itrs = size % 8;
    for (int i = 0; i < rmd_itrs; i++) sum += a[i];
    return sum;
}
// Plain scalar reduction: returns a[0] + a[1] + ... + a[size-1].
float seq_array_sum(float *a, int size)
{
    float total = 0;
    int idx = 0;
    while (idx < size)
        total += a[idx++];
    return total;
}
Benchmark:
size = 64000000
a[i] = 3141592.65358 for all i in domain
sequential version time: 194ms
SSE version time: 49ms
Machine specification:
Thread(s) per core: 2
Core(s) per socket: 2
Socket(s): 1
CPU MHz: 1700.072
OS: Ubuntu
This is a bit long-winded but it should still be at least 2x faster than the scalar code:
// Sum 32 unsigned bytes with SSE2: widen to 16-bit lanes via unpack with
// zero, add vertically, then reduce horizontally with shifts.
// Max result is 32 * 255 = 8160, which fits comfortably in uint16_t.
//
// Requirement: `a` must be 16-byte aligned (we use aligned loads).
//
// Fix over the original: _mm_load_si128 takes `const __m128i*`, so passing
// `a` / `&a[16]` directly does not compile in C++ — the casts below are
// required.
uint16_t sum_32(const uint8_t a[32])
{
    const __m128i vk0 = _mm_setzero_si128();  // all-zero vector for unpacking
    // First 16 bytes: unpack into two vectors of eight 16-bit values.
    __m128i v  = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
    __m128i vl = _mm_unpacklo_epi8(v, vk0);
    __m128i vh = _mm_unpackhi_epi8(v, vk0);
    __m128i vsum = _mm_add_epi16(vl, vh);
    // Second 16 bytes, same treatment, accumulated into vsum.
    v  = _mm_load_si128(reinterpret_cast<const __m128i*>(&a[16]));
    vl = _mm_unpacklo_epi8(v, vk0);
    vh = _mm_unpackhi_epi8(v, vk0);
    vsum = _mm_add_epi16(vsum, vl);
    vsum = _mm_add_epi16(vsum, vh);
    // Horizontal sum: fold the 8 lanes down to lane 0 by byte shifts.
    vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
    vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
    vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
    return static_cast<uint16_t>(_mm_extract_epi16(vsum, 0));
}
Note that `a[]` needs to be 16-byte aligned (the code uses aligned loads). You can probably improve on the above code by using `_mm_hadd_epi16` for the horizontal sum.
You can abuse PSADBW
to calculate small horizontal sums quickly.
Something like this: (not tested)
; PSADBW computes sums of absolute differences of 8 bytes against the other
; operand; with an all-zero operand that is simply the sum of the bytes,
; producing one 16-bit partial sum in the low word of each 64-bit half.
pxor xmm0, xmm0              ; zero the accumulator / comparison operand
psadbw xmm0, [a + 0]         ; byte sums of first 16 bytes (two partial sums)
pxor xmm1, xmm1
psadbw xmm1, [a + 16]        ; byte sums of second 16 bytes
paddw xmm0, xmm1             ; combine; max total 8160 fits in 16 bits
pshufd xmm1, xmm0, 2         ; move the high 64-bit half down to the low half
paddw xmm0, xmm1 ; low word in xmm0 is the total sum
Attempted intrinsics version:
I never use intrinsics so this code probably makes no sense whatsoever. The disassembly looked OK though.
// Sum 32 unsigned bytes using PSADBW (_mm_sad_epu8): the sum of absolute
// differences against an all-zero vector is just the sum of the bytes,
// yielding a 16-bit partial sum in each 64-bit half of the register.
//
// Requirement: `a` must be 16-byte aligned (aligned loads).
//
// Fixes over the original:
//  * `__m128i zero = _mm_xor_si128(zero, zero);` read `zero` before it was
//    initialized — undefined behavior in C++ (the xor-with-self idiom is
//    only safe in assembly). Use _mm_setzero_si128() instead.
//  * `totalsum.m128i_u16[0]` is an MSVC-only union member; the portable way
//    to read lane 0 is _mm_extract_epi16.
uint16_t sum_32(const uint8_t a[32])
{
    const __m128i zero = _mm_setzero_si128();
    __m128i sum0 = _mm_sad_epu8(
        zero,
        _mm_load_si128(reinterpret_cast<const __m128i*>(a)));
    __m128i sum1 = _mm_sad_epu8(
        zero,
        _mm_load_si128(reinterpret_cast<const __m128i*>(&a[16])));
    __m128i sum2 = _mm_add_epi16(sum0, sum1);
    // Fold the high 64-bit half onto the low one; lane 0 holds the total.
    __m128i totalsum = _mm_add_epi16(sum2, _mm_shuffle_epi32(sum2, 2));
    return static_cast<uint16_t>(_mm_extract_epi16(totalsum, 0));
}