问题
/// <summary>
/// Console benchmark that compares element-wise addition of two <c>int</c> arrays
/// using a plain scalar loop against a <see cref="Vector{T}"/>-based loop.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        Console.WriteLine(Vector.IsHardwareAccelerated ? "SIMD supported" : "SIMD not supported.");
        var rand = new Random();
        var numNums = 10000000;
        var arr1 = CreateRandomArray(rand, numNums);
        var arr2 = CreateRandomArray(rand, numNums);
        var simdResult = new int[numNums];
        var conventionalResult = new int[numNums];

        // Untimed warm-up pass: the first call to each method would otherwise be
        // measured together with its one-time start-up cost (e.g. JIT compilation),
        // which distorts a single-shot Stopwatch measurement.
        ConventionalArrayAddition(arr1, arr2, conventionalResult);
        SIMDArrayAddition(arr1, arr2, simdResult);

        var watch = System.Diagnostics.Stopwatch.StartNew();
        ConventionalArrayAddition(arr1, arr2, conventionalResult);
        watch.Stop();
        Console.WriteLine("Conventional time :" + watch.ElapsedMilliseconds);

        var watch2 = System.Diagnostics.Stopwatch.StartNew();
        SIMDArrayAddition(arr1, arr2, simdResult);
        watch2.Stop();
        Console.WriteLine("Simd time :" + watch2.ElapsedMilliseconds);

        Console.ReadKey();
    }

    /// <summary>
    /// Computes <c>result[i] = lhs[i] + rhs[i]</c> for every index of <paramref name="lhs"/>,
    /// processing <c>Vector&lt;int&gt;.Count</c> elements per iteration and finishing the
    /// remaining tail with a scalar loop.
    /// </summary>
    /// <param name="lhs">First operand; its length determines how many elements are added.</param>
    /// <param name="rhs">Second operand; must hold at least <c>lhs.Length</c> elements.</param>
    /// <param name="result">Destination; must hold at least <c>lhs.Length</c> elements.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    /// <exception cref="ArgumentException"><paramref name="rhs"/> or <paramref name="result"/> is shorter than <paramref name="lhs"/>.</exception>
    public static void SIMDArrayAddition(int[] lhs, int[] rhs, int[] result)
    {
        ValidateOperands(lhs, rhs, result);

        var simdLength = Vector<int>.Count;
        // Last start index at which a full vector still fits inside lhs.
        var lastVectorStart = lhs.Length - simdLength;
        var i = 0;
        for (; i <= lastVectorStart; i += simdLength)
        {
            var va = new Vector<int>(lhs, i);
            var vb = new Vector<int>(rhs, i);
            (va + vb).CopyTo(result, i);
        }

        // Scalar tail for the elements that do not fill a whole vector.
        for (; i < lhs.Length; ++i)
        {
            result[i] = lhs[i] + rhs[i];
        }
    }

    /// <summary>
    /// Computes <c>result[i] = lhs[i] + rhs[i]</c> for every index of <paramref name="lhs"/>
    /// with a plain scalar loop.
    /// </summary>
    /// <param name="lhs">First operand; its length determines how many elements are added.</param>
    /// <param name="rhs">Second operand; must hold at least <c>lhs.Length</c> elements.</param>
    /// <param name="result">Destination; must hold at least <c>lhs.Length</c> elements.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    /// <exception cref="ArgumentException"><paramref name="rhs"/> or <paramref name="result"/> is shorter than <paramref name="lhs"/>.</exception>
    public static void ConventionalArrayAddition(int[] lhs, int[] rhs, int[] result)
    {
        ValidateOperands(lhs, rhs, result);

        for (int i = 0; i < lhs.Length; i++)
        {
            result[i] = lhs[i] + rhs[i];
        }
    }

    /// <summary>Creates an array of <paramref name="count"/> random integers in the range 0..99.</summary>
    private static int[] CreateRandomArray(Random rand, int count)
    {
        var values = new int[count];
        for (int i = 0; i < values.Length; i++)
        {
            values[i] = rand.Next(100);
        }

        return values;
    }

    /// <summary>
    /// Rejects null or too-short arrays before any element is written, so a bad call
    /// cannot leave <paramref name="result"/> partially filled.
    /// </summary>
    private static void ValidateOperands(int[] lhs, int[] rhs, int[] result)
    {
        if (lhs == null)
        {
            throw new ArgumentNullException(nameof(lhs));
        }

        if (rhs == null)
        {
            throw new ArgumentNullException(nameof(rhs));
        }

        if (result == null)
        {
            throw new ArgumentNullException(nameof(result));
        }

        if (rhs.Length < lhs.Length)
        {
            throw new ArgumentException("rhs must contain at least as many elements as lhs.", nameof(rhs));
        }

        if (result.Length < lhs.Length)
        {
            throw new ArgumentException("result must contain at least as many elements as lhs.", nameof(result));
        }
    }
}
This code is adapted from one of the examples on https://instil.co/2016/03/21/parallelism-on-a-single-core-simd-with-c/.
I am compiling this as a .NET Framework console app (I've tried 4.6.1 and 4.7), with 'Optimize code' selected, targeting x64.
The results I get are along the lines of:
Conventional time :22
Simd time :23
If I do a similar test in .NET Core, I do get faster results using the vector method, but only because the naïve implementation is much slower under .NET Core (taking about 55 ms). The vectorised implementation in .NET Core is generally slightly slower (say, 24 ms) than the results I get in .NET Framework.
My processor is an i5-7500T, and I have had similar results on an i5-7200.
Is there likely to be some other simple setting I'm neglecting? Or could it be that the compiler is somehow optimising the naïve code to use SIMD instructions anyway?
UPDATE: Following the instructions in https://blogs.msdn.microsoft.com/clrcodegeneration/2007/10/19/how-to-see-the-assembly-code-generated-by-the-jit-using-visual-studio/, here is the disassembly for ConventionalArrayAddition():
for (int i = 0; i < lhs.Length; i++)
00000000 sub rsp,28h
00000004 xor eax,eax
00000006 mov r9d,dword ptr [rcx+8]
0000000a test r9d,r9d
0000000d jle 000000000000008A
0000000f test rdx,rdx
00000012 setne r10b
00000016 movzx r10d,r10b
0000001a and r10d,1
0000001e test r8,r8
00000021 setne r11b
00000025 movzx r11d,r11b
00000029 test r11d,r10d
0000002c je 0000000000000066
0000002e cmp dword ptr [rdx+8],r9d
00000032 setge r10b
00000036 movzx r10d,r10b
0000003a cmp dword ptr [r8+8],r9d
0000003e setge r11b
00000042 movzx r11d,r11b
00000046 test r11d,r10d
00000049 je 0000000000000066
{
result[i] = lhs[i] + rhs[i];
0000004b movsxd r10,eax
0000004e mov r11d,dword ptr [rcx+r10*4+10h]
00000053 add r11d,dword ptr [rdx+r10*4+10h]
00000058 mov dword ptr [r8+r10*4+10h],r11d
for (int i = 0; i < lhs.Length; i++)
0000005d inc eax
0000005f cmp r9d,eax
00000062 jg 000000000000004B
00000064 jmp 000000000000008A
00000066 movsxd r10,eax
00000069 mov r11d,dword ptr [rcx+r10*4+10h]
0000006e cmp eax,dword ptr [rdx+8]
00000071 jae 000000000000008F
00000073 add r11d,dword ptr [rdx+r10*4+10h]
00000078 cmp eax,dword ptr [r8+8]
0000007c jae 000000000000008F
0000007e mov dword ptr [r8+r10*4+10h],r11d
00000083 inc eax
00000085 cmp r9d,eax
00000088 jg 0000000000000066
0000008a add rsp,28h
}
}
0000008e ret
0000008f call 000000005FA91300
00000094 int 3
and for SIMDArrayAddition():
var simdLength = Vector<int>.Count;
00000000 push rdi
00000001 push rsi
00000002 sub rsp,28h
00000006 vzeroupper
00000009 xor eax,eax
for (; i <= lhs.Length - simdLength; i += simdLength)
0000000b mov r9d,dword ptr [rcx+8]
0000000f mov r10d,r9d
00000012 sub r10d,8
00000016 test r10d,r10d
00000019 jl 0000000000000064
0000001b mov r11d,dword ptr [rdx+8]
0000001f mov esi,dword ptr [r8+8]
00000023 cmp eax,r9d
00000026 jae 00000000000000A2
00000028 lea edi,[rax+7]
0000002b cmp edi,r9d
0000002e jae 00000000000000A2
00000030 vmovupd ymm0,ymmword ptr [rcx+rax*4+10h]
var vb = new Vector<int>(rhs, i);
00000037 cmp eax,r11d
0000003a jae 00000000000000A2
0000003c cmp edi,r11d
0000003f jae 00000000000000A2
00000041 vmovupd ymm1,ymmword ptr [rdx+rax*4+10h]
(va + vb).CopyTo(result, i);
00000048 vpaddd ymm0,ymm0,ymm1
0000004d cmp eax,esi
0000004f jae 00000000000000A7
00000051 cmp edi,esi
00000053 jae 00000000000000AC
00000055 vmovupd ymmword ptr [r8+rax*4+10h],ymm0
for (; i <= lhs.Length - simdLength; i += simdLength)
0000005c add eax,8
0000005f cmp r10d,eax
00000062 jge 0000000000000023
}
for (; i < lhs.Length; ++i)
00000064 cmp r9d,eax
00000067 jle 0000000000000098
00000069 mov r11d,dword ptr [rdx+8]
0000006d mov esi,dword ptr [r8+8]
{
result[i] = lhs[i] + rhs[i];
00000071 cmp eax,r9d
00000074 jae 00000000000000A2
00000076 movsxd r10,eax
00000079 mov edi,dword ptr [rcx+r10*4+10h]
0000007e cmp eax,r11d
00000081 jae 00000000000000A2
00000083 add edi,dword ptr [rdx+r10*4+10h]
00000088 cmp eax,esi
0000008a jae 00000000000000A2
0000008c mov dword ptr [r8+r10*4+10h],edi
for (; i < lhs.Length; ++i)
00000091 inc eax
00000093 cmp r9d,eax
00000096 jg 0000000000000071
00000098 vzeroupper
}
}
0000009b add rsp,28h
0000009f pop rsi
000000a0 pop rdi
000000a1 ret
000000a2 call 000000005FA91250
000000a7 call 000000005FA91B00
000000ac call 000000005FA91A50
000000b1 int 3
These were obtained from a different machine (i7-4790), which is producing similar timings.
回答1:
Changing the implementation to an in-place AddTo, which reduces the number of sources and destinations, improves performance by about 70%.
This form of addition is useful in many cases, and it is how most additions work inside the CPU; it reduces memory bandwidth and cache requirements.
/// <summary>
/// Adds <paramref name="rhs"/> to <paramref name="lhs"/> element-wise and in place
/// (<c>lhs[i] += rhs[i]</c>), processing <c>Vector&lt;int&gt;.Count</c> elements per
/// iteration and finishing the remaining tail with a scalar loop.
/// </summary>
/// <param name="lhs">Array that is read and overwritten with the sums.</param>
/// <param name="rhs">Values to add; must hold at least <c>lhs.Length</c> elements.</param>
/// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
/// <exception cref="ArgumentException"><paramref name="rhs"/> is shorter than <paramref name="lhs"/>.</exception>
public static void SIMDArrayAddTo(int[] lhs, int[] rhs)
{
    if (lhs == null)
    {
        throw new ArgumentNullException(nameof(lhs));
    }

    if (rhs == null)
    {
        throw new ArgumentNullException(nameof(rhs));
    }

    // Validate before the first write: lhs is modified in place, so failing
    // part-way through would leave it half-updated.
    if (rhs.Length < lhs.Length)
    {
        throw new ArgumentException("rhs must contain at least as many elements as lhs.", nameof(rhs));
    }

    var simdLength = Vector<int>.Count;
    // Last start index at which a full vector still fits inside lhs.
    var end = lhs.Length - simdLength;
    var i = 0;
    for (; i <= end; i += simdLength)
    {
        var va = new Vector<int>(lhs, i);
        var vb = new Vector<int>(rhs, i);
        (va + vb).CopyTo(lhs, i);
    }

    // Scalar tail for the elements that do not fill a whole vector.
    for (; i < lhs.Length; ++i)
    {
        lhs[i] += rhs[i];
    }
}
I also tried unrolling the SSE loop, but it didn't seem to help. I added a version similar to this one to the HPCsharp NuGet package, including a multi-core version.
I also added multi-core parallelism on top of the above function, which didn't improve performance. If anyone has access to a CPU with more than 2 memory channels, it would be interesting to see how this code scales when more system memory bandwidth is available.
来源:https://stackoverflow.com/questions/51553540/why-might-this-simd-array-adding-sample-not-be-demonstrating-any-performance-gai