I have a special question. I will try to describe it as accurately as possible.
I am doing a very important "micro-optimization": a loop that runs for days at a time.
This is the untested C# version of @PeterCordes' answer.
/// <summary>
/// Horizontally sums each of the first four vectors in <paramref name="counts"/>
/// and packs the four totals into a single 128-bit vector.
/// </summary>
/// <param name="counts">
/// At least four vectors of 32-bit counters; only indices 0 through 3 are read.
/// </param>
/// <returns>
/// A vector whose element <c>i</c> is the sum of all eight lanes of <c>counts[ i ]</c>.
/// </returns>
private static Vector128<int> HsumTranspose( ReadOnlySpan<Vector256<int>> counts )
{
    // The 256-bit horizontal add works inside each 128-bit half independently,
    // so after two rounds every half holds one partial sum per input vector,
    // already ordered as [ counts0, counts1, counts2, counts3 ].
    var sum01 = Avx2.HorizontalAdd( counts[ 0 ], counts[ 1 ] );
    var sum23 = Avx2.HorizontalAdd( counts[ 2 ], counts[ 3 ] );
    var sum0123 = Avx2.HorizontalAdd( sum01, sum23 );

    // Adding the upper half onto the lower half completes each eight-lane total.
    var sumHigh = Avx2.ExtractVector128( sum0123, 1 );
    var sumLow = Avx2.ExtractVector128( sum0123, 0 );
    return Sse2.Add( sumHigh, sumLow );
}
/// <summary>
/// Counts how many elements of <paramref name="input"/> are equal to 0, 1 and 2,
/// and how many hold any other value.
/// </summary>
/// <param name="input">
/// Values to classify. May be empty; the length does not have to be a multiple of 8.
/// </param>
/// <returns>
/// A new four-element array: indices 0, 1 and 2 hold the number of elements equal
/// to that index, and index 3 holds the number of all remaining elements.
/// </returns>
private static unsafe int[ ] CountElements( ReadOnlySpan<int> input )
{
    // Number of 32-bit elements consumed by one 256-bit load.
    const int LanesPerVector = 8;

    var outputCounts = new int[ 4 ];
    var size = input.Length;

    // Only whole groups of eight elements take the AVX2 path, so no load ever
    // reaches past the end of the input. Without AVX2 support the scalar loop
    // below handles everything.
    var vectorizedLength = Avx2.IsSupported ? size - ( size % LanesPerVector ) : 0;

    if ( vectorizedLength > 0 )
    {
        // Loop-invariant comparands, one per counted value.
        var zeros = Vector256<int>.Zero;
        var ones = Vector256.Create( 1 );
        var twos = Vector256.Create( 2 );

        // One vector of counters per bucket, horizontally summed after the loop.
        var count0 = Vector256<int>.Zero;
        var count1 = Vector256<int>.Zero;
        var count2 = Vector256<int>.Zero;

        fixed ( int* fixedInput = input )
        {
            for ( var i = 0; i < vectorizedLength; i += LanesPerVector )
            {
                var v = Avx.LoadVector256( fixedInput + i );

                // A matching lane compares to all ones (-1), so subtracting the
                // mask adds one to that lane's counter.
                count0 = Avx2.Subtract( count0, Avx2.CompareEqual( v, zeros ) );
                count1 = Avx2.Subtract( count1, Avx2.CompareEqual( v, ones ) );
                count2 = Avx2.Subtract( count2, Avx2.CompareEqual( v, twos ) );
            }
        }

        // The fourth bucket is derived by subtraction at the end, so its vector stays zero.
        ReadOnlySpan<Vector256<int>> counts = stackalloc Vector256<int>[ 4 ]
        {
            count0,
            count1,
            count2,
            Vector256<int>.Zero
        };

        Vector128<int> summedCounts = HsumTranspose( counts );

        fixed ( int* fixedOutputCounts = outputCounts )
        {
            Sse2.Store( fixedOutputCounts, summedCounts );
        }
    }

    // Scalar pass over whatever the vector loop did not cover (the last size % 8
    // elements, or the whole input when AVX2 is unavailable).
    for ( var i = vectorizedLength; i < size; i++ )
    {
        var value = input[ i ];

        // The unsigned comparison rejects negative values and values above 2 in one test.
        if ( (uint)value < 3u )
        {
            outputCounts[ value ]++;
        }
    }

    // Everything that is not 0, 1 or 2 lands in the last bucket.
    outputCounts[ 3 ] = size - outputCounts[ 0 ] -
        outputCounts[ 1 ] - outputCounts[ 2 ];

    return outputCounts;
}