I\'ve been trying to determine what the true cost of using the fixed statement within C# for managed unsafe structs that contain fixed arrays. Please note I am not referring
That was actually interesting question that I had myself.
The results I managed to obtain suggest slightly different reasons for performance loss than 'fixed' statement itself.
You can see the tests I run and the results below but there are following observations I draw from those:
Running the tests multiple times, gives slightly different but broadly consistent results. Probably I should have ran many series of tests and take the average times - but had no time for that :)
The test class first:
class Test {
public static void NormalAccess (float[] array, int index) {
array[index] = array[index] + 2;
}
public static void NormalRefAccess (ref float[] array, int index) {
array[index] = array[index] + 2;
}
public static void IntPtrAccess (IntPtr arrayPtr, int index) {
unsafe {
var array = (float*) IntPtr.Add (arrayPtr, index << 2);
(*array) = (*array) + 2;
}
}
public static void IntPtrMisalignedAccess (IntPtr arrayPtr, int index) {
unsafe {
var array = (float*) IntPtr.Add (arrayPtr, index); // getting bits of a float
(*array) = (*array) + 2;
}
}
public static void FixedAccess (float[] array, int index) {
unsafe {
fixed (float* ptr = &array[index])
(*ptr) = (*ptr) + 2;
}
}
public unsafe static void PtrAccess (float* ptr) {
(*ptr) = (*ptr) + 2;
}
}
And the tests themselves:
static int runs = 1000*1000*100;
public static void Print (string name, Stopwatch sw) {
Console.WriteLine ("{0}, items/sec = {1:N} \t {2}", sw.Elapsed, (runs / sw.ElapsedMilliseconds) * 1000, name);
}
static void Main (string[] args) {
var buffer = new float[1024*1024*100];
var len = buffer.Length;
var sw = new Stopwatch();
for (int i = 0; i < 1000; i++) {
Test.FixedAccess (buffer, 55);
Test.NormalAccess (buffer, 66);
}
Console.WriteLine ("Starting {0:N0} items", runs);
sw.Restart ();
for (int i = 0; i < runs; i++)
Test.NormalAccess (buffer, i % len);
sw.Stop ();
Print ("Normal access", sw);
sw.Restart ();
for (int i = 0; i < runs; i++)
Test.NormalRefAccess (ref buffer, i % len);
sw.Stop ();
Print ("Normal Ref access", sw);
sw.Restart ();
unsafe {
fixed (float* ptr = &buffer[0])
for (int i = 0; i < runs; i++) {
Test.IntPtrAccess ((IntPtr) ptr, i % len);
}
}
sw.Stop ();
Print ("IntPtr access (fixed outside loop)", sw);
sw.Restart ();
unsafe {
fixed (float* ptr = &buffer[0])
for (int i = 0; i < runs; i++) {
Test.IntPtrMisalignedAccess ((IntPtr) ptr, i % len);
}
}
sw.Stop ();
Print ("IntPtr Misaligned access (fixed outside loop)", sw);
sw.Restart ();
for (int i = 0; i < runs; i++)
Test.FixedAccess (buffer, i % len);
sw.Stop ();
Print ("Fixed access (fixed inside loop)", sw);
sw.Restart ();
unsafe {
fixed (float* ptr = &buffer[0]) {
for (int i = 0; i < runs; i++) {
Test.PtrAccess (ptr + (i % len));
}
}
}
sw.Stop ();
Print ("float* access (fixed outside loop)", sw);
sw.Restart ();
unsafe {
for (int i = 0; i < runs; i++) {
fixed (float* ptr = &buffer[i % len]) {
Test.PtrAccess (ptr);
}
}
}
sw.Stop ();
Print ("float* access (fixed in loop)", sw);
and finally the results:
Debug mode
Starting 100,000,000 items
00:00:01.0373583, items/sec = 96,432,000.00 Normal access
00:00:00.8582307, items/sec = 116,550,000.00 Normal Ref access
00:00:01.8822085, items/sec = 53,134,000.00 IntPtr access (fixed outside loop)
00:00:10.5356369, items/sec = 9,492,000.00 IntPtr Misaligned access (fixed outside loop)
00:00:01.6860701, items/sec = 59,311,000.00 Fixed access (fixed inside loop)
00:00:00.7577868, items/sec = 132,100,000.00 float* access (fixed outside loop)
00:00:01.0387792, items/sec = 96,339,000.00 float* access (fixed in loop)
Release mode
Starting 100,000,000 items
00:00:00.7454832, items/sec = 134,228,000.00 Normal access
00:00:00.6619090, items/sec = 151,285,000.00 Normal Ref access
00:00:00.9859089, items/sec = 101,522,000.00 IntPtr access (fixed outside loop)
00:00:10.1289018, items/sec = 9,873,000.00 IntPtr Misaligned access (fixed outside loop)
00:00:00.7899355, items/sec = 126,742,000.00 Fixed access (fixed inside loop)
00:00:00.5718507, items/sec = 175,131,000.00 float* access (fixed outside loop)
00:00:00.6842333, items/sec = 146,198,000.00 float* access (fixed in loop)
Empirically, the overhead appears to be, in the best case, ~270% on 32 bit JIT and ~200% on 64 bit (and the overhead gets worse the more times you "call" fixed
). So I'd try to minimise your fixed
blocks if performance is really critical.
Sorry, I'm not familiar enough with fixed / unsafe code to know why that's the case
Details
I also added some TestMore
methods which call your two test methods 10 times instead of 2
to give a more real world scenario of multiple methods being called on your fixed
struct.
The code I used:
class Program
{
static void Main(string[] args)
{
var someData = new ByteArray();
int iterations = 1000000000;
var multiple = new MultipleFixed();
var single = new SingleFixed();
// Warmup.
for (int i = 0; i < 100; i++)
{
multiple.Test(ref someData);
single.Test(ref someData);
multiple.TestMore(ref someData);
single.TestMore(ref someData);
}
// Environment.
if (Debugger.IsAttached)
Console.WriteLine("Debugger is attached!!!!!!!!!! This run is invalid!");
Console.WriteLine("CLR Version: " + Environment.Version);
Console.WriteLine("Pointer size: {0} bytes", IntPtr.Size);
Console.WriteLine("Iterations: " + iterations);
Console.Write("Starting run for Single... ");
var sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
single.Test(ref someData);
}
sw.Stop();
Console.WriteLine("Completed in {0:N3}ms - {1:N2}/sec", sw.Elapsed.TotalMilliseconds, iterations / sw.Elapsed.TotalSeconds);
Console.Write("Starting run for More Single... ");
sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
single.Test(ref someData);
}
sw.Stop();
Console.WriteLine("Completed in {0:N3}ms - {1:N2}/sec", sw.Elapsed.TotalMilliseconds, iterations / sw.Elapsed.TotalSeconds);
Console.Write("Starting run for Multiple... ");
sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
multiple.Test(ref someData);
}
sw.Stop();
Console.WriteLine("Completed in {0:N3}ms - {1:N2}/sec", sw.Elapsed.TotalMilliseconds, iterations / sw.Elapsed.TotalSeconds);
Console.Write("Starting run for More Multiple... ");
sw = Stopwatch.StartNew();
for (int i = 0; i < iterations; i++)
{
multiple.TestMore(ref someData);
}
sw.Stop();
Console.WriteLine("Completed in {0:N3}ms - {1:N2}/sec", sw.Elapsed.TotalMilliseconds, iterations / sw.Elapsed.TotalSeconds);
Console.ReadLine();
}
}
unsafe struct ByteArray
{
public fixed byte Data[1024];
}
class MultipleFixed
{
unsafe void SetValue(ref ByteArray bytes, int index, byte value)
{
fixed (byte* data = bytes.Data)
{
data[index] = value;
}
}
unsafe bool Validate(ref ByteArray bytes, int index, byte expectedValue)
{
fixed (byte* data = bytes.Data)
{
return data[index] == expectedValue;
}
}
public void Test(ref ByteArray bytes)
{
SetValue(ref bytes, 0, 1);
Validate(ref bytes, 0, 1);
}
public void TestMore(ref ByteArray bytes)
{
SetValue(ref bytes, 0, 1);
Validate(ref bytes, 0, 1);
SetValue(ref bytes, 0, 2);
Validate(ref bytes, 0, 2);
SetValue(ref bytes, 0, 3);
Validate(ref bytes, 0, 3);
SetValue(ref bytes, 0, 4);
Validate(ref bytes, 0, 4);
SetValue(ref bytes, 0, 5);
Validate(ref bytes, 0, 5);
}
}
class SingleFixed
{
unsafe void SetValue(byte* data, int index, byte value)
{
data[index] = value;
}
unsafe bool Validate(byte* data, int index, byte expectedValue)
{
return data[index] == expectedValue;
}
public unsafe void Test(ref ByteArray bytes)
{
fixed (byte* data = bytes.Data)
{
SetValue(data, 0, 1);
Validate(data, 0, 1);
}
}
public unsafe void TestMore(ref ByteArray bytes)
{
fixed (byte* data = bytes.Data)
{
SetValue(data, 0, 1);
Validate(data, 0, 1);
SetValue(data, 0, 2);
Validate(data, 0, 2);
SetValue(data, 0, 3);
Validate(data, 0, 3);
SetValue(data, 0, 4);
Validate(data, 0, 4);
SetValue(data, 0, 5);
Validate(data, 0, 5);
}
}
}
And the results in .NET 4.0, 32 bit JIT:
CLR Version: 4.0.30319.239
Pointer size: 4 bytes
Iterations: 1000000000
Starting run for Single... Completed in 2,092.350ms - 477,931,580.94/sec
Starting run for More Single... Completed in 2,236.767ms - 447,073,934.63/sec
Starting run for Multiple... Completed in 5,775.922ms - 173,132,528.92/sec
Starting run for More Multiple... Completed in 26,637.862ms - 37,540,550.36/sec
And in .NET 4.0, 64 bit JIT:
CLR Version: 4.0.30319.239
Pointer size: 8 bytes
Iterations: 1000000000
Starting run for Single... Completed in 2,907.946ms - 343,885,316.72/sec
Starting run for More Single... Completed in 2,904.903ms - 344,245,585.63/sec
Starting run for Multiple... Completed in 5,754.893ms - 173,765,185.93/sec
Starting run for More Multiple... Completed in 18,679.593ms - 53,534,358.13/sec