I am running into a huge performance bottleneck when using Azure table storage. My desire is to use tables as a sort of cache, so a long process may result in anywhere from hundreds ...
For more fun, here's a new answer: an isolated, independent test that's pulling some amazing numbers for write performance in production, and does a hell of a lot better at avoiding IO blocking and connection management. I'm very interested to see how this works for you, as we are getting ridiculous write speeds (> 7k writes per second).
web.config
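The connection settings applied in code during SetupTest below can also be expressed in web.config. A rough sketch (the connection string value and the limits are placeholders, adjust to your own account and load):

<configuration>
  <appSettings>
    <!-- Placeholder - point this at your own storage account -->
    <add key="DataConnectionString"
         value="DefaultEndpointsProtocol=https;AccountName=youraccount;AccountKey=yourkey" />
  </appSettings>
  <system.net>
    <settings>
      <!-- Same effect as ServicePointManager.Expect100Continue / UseNagleAlgorithm in SetupTest -->
      <servicePointManager expect100Continue="false" useNagleAlgorithm="false" />
    </settings>
    <connectionManagement>
      <!-- Equivalent of ServicePointManager.DefaultConnectionLimit -->
      <add address="*" maxconnection="256" />
    </connectionManagement>
  </system.net>
</configuration>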
For the test I was using parameters based on volume, so 25,000 items, 24 partitions, and a batch size of 100 (which seems to always be the best, and is also the hard cap for an entity group transaction in a single partition), with a ref count of 20. This uses TPL Dataflow (http://www.nuget.org/packages/Microsoft.Tpl.Dataflow/) and its BufferBlock, which gives a nice awaitable, thread-safe way to pull table references from a pool.
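The core of the trick is just a BufferBlock used as an awaitable pool of CloudTable references. Stripped out of the test harness, the pattern looks roughly like this; the names (RunBatchAsync, maxRefCount, tableClient, tableName) are only illustrative, and the full test class follows:

// Minimal sketch of the pooled-table-reference pattern (illustrative, not the full test).
var tableRefs = new BufferBlock<CloudTable>();
for (int i = 0; i < maxRefCount; i++)
{
    tableRefs.Post(tableClient.GetTableReference(tableName));
}

// Each worker borrows a reference, executes its batch, and puts the reference back.
async Task RunBatchAsync(TableBatchOperation batchOp)
{
    var tableRef = await tableRefs.ReceiveAsync();   // awaitable, thread-safe pull
    try
    {
        tableRef.ExecuteBatch(batchOp);              // same sync call the test uses
    }
    finally
    {
        tableRefs.Post(tableRef);                    // release back to the pool
    }
}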
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Threading;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
using Microsoft.WindowsAzure;
using Microsoft.WindowsAzure.Storage;
using Microsoft.WindowsAzure.Storage.Table;

public class DyanmicBulkInsertTestPooledRefsAndAsynch : WebTest, IDynamicWebTest
{
    private int _itemCount;
    private int _partitionCount;
    private int _batchSize;
    private List<TestTableEntity> _items;
    private SimpleGuidIdPartitionSplitter<TestTableEntity> _partitionSplitter;
    private string _tableName;
    private CloudStorageAccount _account;
    private CloudTableClient _tableClient;
    private Dictionary<string, List<TestTableEntity>> _itemsByParition;
    private int _maxRefCount;
    private BufferBlock<CloudTable> _tableRefs;

    public DyanmicBulkInsertTestPooledRefsAndAsynch()
    {
        Properties = new List<ItemProp>();
        Properties.Add(new ItemProp("ItemCount", typeof(int)));
        Properties.Add(new ItemProp("PartitionCount", typeof(int)));
        Properties.Add(new ItemProp("BatchSize", typeof(int)));
        Properties.Add(new ItemProp("MaxRefs", typeof(int)));
    }

    public List<ItemProp> Properties { get; set; }

    public void SetProps(Dictionary<string, object> propValuesByPropName)
    {
        _itemCount = (int)propValuesByPropName["ItemCount"];
        _partitionCount = (int)propValuesByPropName["PartitionCount"];
        _batchSize = (int)propValuesByPropName["BatchSize"];
        _maxRefCount = (int)propValuesByPropName["MaxRefs"];
    }

    protected override void SetupTest()
    {
        base.SetupTest();
        // Make sure the thread pool and the HTTP stack don't throttle us before storage does.
        ThreadPool.SetMinThreads(1024, 256);
        ServicePointManager.DefaultConnectionLimit = 256;
        ServicePointManager.UseNagleAlgorithm = false;
        ServicePointManager.Expect100Continue = false;

        _account = CloudStorageAccount.Parse(CloudConfigurationManager.GetSetting("DataConnectionString"));
        _tableClient = _account.CreateCloudTableClient();
        _tableName = "testtable" + new Random().Next(100000);

        // Create the pooled table references.
        _tableRefs = new BufferBlock<CloudTable>();
        for (int i = 0; i < _maxRefCount; i++)
        {
            _tableRefs.Post(_tableClient.GetTableReference(_tableName));
        }

        var tableRefTask = GetTableRef();
        tableRefTask.Wait();
        var tableRef = tableRefTask.Result;
        tableRef.CreateIfNotExists();
        ReleaseRef(tableRef);

        _items = TestUtils.GenerateTableItems(_itemCount);
        _partitionSplitter = new SimpleGuidIdPartitionSplitter<TestTableEntity>();
        _partitionSplitter.BuildPartitions(_partitionCount);
        _items.ForEach(o =>
        {
            o.ETag = "*";
            o.Timestamp = DateTime.Now;
            o.PartitionKey = _partitionSplitter.GetPartition(o);
        });
        _itemsByParition = _partitionSplitter.SplitIntoPartitionedSublists(_items);
    }

    private async Task<CloudTable> GetTableRef()
    {
        return await _tableRefs.ReceiveAsync();
    }

    private void ReleaseRef(CloudTable tableRef)
    {
        _tableRefs.Post(tableRef);
    }

    protected override void ExecuteTest()
    {
        Task.WaitAll(_itemsByParition.Keys.Select(parition => Task.Factory.StartNew(() => InsertParitionItems(_itemsByParition[parition]))).ToArray());
    }

    private void InsertParitionItems(List<TestTableEntity> items)
    {
        var tasks = new List<Task>();
        for (int i = 0; i < items.Count; i += _batchSize)
        {
            int i1 = i;
            // Unwrap() so we wait on the inner async work, not just the outer StartNew task.
            var task = Task.Factory.StartNew(async () =>
            {
                var batchItems = items.Skip(i1).Take(_batchSize).ToList();
                if (batchItems.Select(o => o.PartitionKey).Distinct().Count() > 1)
                {
                    throw new Exception("Multiple partitions batch");
                }
                var batchOp = new TableBatchOperation();
                batchItems.ForEach(batchOp.InsertOrReplace);
                var tableRef = await GetTableRef();
                tableRef.ExecuteBatch(batchOp);
                ReleaseRef(tableRef);
            }).Unwrap();
            tasks.Add(task);
        }
        Task.WaitAll(tasks.ToArray());
    }

    protected override void CleanupTest()
    {
        var tableRefTask = GetTableRef();
        tableRefTask.Wait();
        var tableRef = tableRefTask.Result;
        tableRef.DeleteIfExists();
        ReleaseRef(tableRef);
    }
}
We are currently working on a version that can handle multiple storage accounts to hopefully get some insane speeds. Also, we are running these on 8-core virtual machines for large datasets, but with the new non-blocking IO it should run great on a limited VM. Good luck!
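As a rough illustration of where the multiple-storage-account idea would slot in (this is just a sketch of the direction, not the version we're working on): since the pool is only a BufferBlock of CloudTable references, you can seed it from several accounts and leave the rest of the test untouched.

// Sketch: seed the pool from several storage accounts (the connection string names are placeholders).
var connectionStringNames = new[] { "DataConnectionString1", "DataConnectionString2" };
_tableRefs = new BufferBlock<CloudTable>();
foreach (var connectionStringName in connectionStringNames)
{
    var account = CloudStorageAccount.Parse(CloudConfigurationManager.GetSetting(connectionStringName));
    var client = account.CreateCloudTableClient();
    // Remember the table has to exist in every account you spread the writes across.
    for (int i = 0; i < _maxRefCount; i++)
    {
        _tableRefs.Post(client.GetTableReference(_tableName));
    }
}

The helper classes the test relies on (the partition splitter and the test item generator) follow.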
public class SimpleGuidIdPartitionSplitter<T> where T : IUniqueId
{
    private ConcurrentDictionary<string, string> _partitionByKey = new ConcurrentDictionary<string, string>();
    private List<string> _partitions;
    private bool _bPartitionsBuilt;

    public SimpleGuidIdPartitionSplitter()
    {
    }

    public void BuildPartitions(int iPartCount)
    {
        BuildPartitionIndentifiers(iPartCount);
    }

    public string GetPartition(T item)
    {
        if (_bPartitionsBuilt == false)
        {
            throw new Exception("Partitions Not Built");
        }
        // The last two hex characters of the Guid pick the partition bucket.
        var partKey = item.Id.ToString().Substring(34, 2);
        return _partitionByKey[partKey];
    }

    public string GetPartition(Guid id)
    {
        if (_bPartitionsBuilt == false)
        {
            throw new Exception("Partitions Not Built");
        }
        var partKey = id.ToString().Substring(34, 2);
        return _partitionByKey[partKey];
    }

    #region Helpers
    private void BuildPartitionIndentifiers(int partitonCount)
    {
        var chars = new char[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }.ToList();
        var keys = new List<string>();
        // All 256 two-character hex combinations.
        for (int i = 0; i < chars.Count; i++)
        {
            var keyA = chars[i];
            for (int j = 0; j < chars.Count; j++)
            {
                var keyB = chars[j];
                keys.Add(string.Concat(keyA, keyB));
            }
        }

        var keySetMaxSize = Math.Max(1, (int)Math.Floor((double)keys.Count / ((double)partitonCount)));
        var keySets = new List<List<string>>();
        if (partitonCount > keys.Count)
        {
            partitonCount = keys.Count;
        }

        // Build the key sets
        var index = 0;
        while (index < keys.Count)
        {
            var keysSet = keys.Skip(index).Take(keySetMaxSize).ToList();
            keySets.Add(keysSet);
            index += keySetMaxSize;
        }

        // Build the lookup and partition name for each key set
        _partitions = new List<string>();
        for (int i = 0; i < keySets.Count; i++)
        {
            var partitionName = String.Concat("subSet_", i);
            foreach (var key in keySets[i])
            {
                _partitionByKey[key] = partitionName;
            }
            _partitions.Add(partitionName);
        }

        _bPartitionsBuilt = true;
    }
    #endregion
}
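The test also calls SplitIntoPartitionedSublists, which isn't shown above; it just groups the items by their computed partition key. Something along these lines inside the splitter class would do it (a sketch, not the original implementation):

// Sketch of the missing method: bucket items by partition so each sublist
// can be batch-inserted against a single PartitionKey.
public Dictionary<string, List<T>> SplitIntoPartitionedSublists(IEnumerable<T> items)
{
    return items
        .GroupBy(item => GetPartition(item))
        .ToDictionary(group => group.Key, group => group.ToList());
}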
internal static List<TestTableEntity> GenerateTableItems(int count)
{
    var items = new List<TestTableEntity>();
    var random = new Random();
    for (int i = 0; i < count; i++)
    {
        var itemId = Guid.NewGuid();
        items.Add(new TestTableEntity()
        {
            Id = itemId,
            TestGuid = Guid.NewGuid(),
            RowKey = itemId.ToString(),
            TestBool = true,
            TestDateTime = DateTime.Now,
            TestDouble = random.Next() * 1000000,
            TestInt = random.Next(10000),
            TestString = Guid.NewGuid().ToString(),
        });
    }

    // Sanity check: every RowKey must be unique.
    var dupRowKeys = items.GroupBy(o => o.RowKey).Where(o => o.Count() > 1).Select(o => o.Key).ToList();
    if (dupRowKeys.Count > 0)
    {
        throw new Exception("Duplicate Row Keys");
    }
    return items;
}
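TestTableEntity itself isn't shown above either; judging from how it's used, it's just a TableEntity with an Id (so it satisfies the splitter's IUniqueId constraint) plus the test properties. Roughly, and reconstructed from usage rather than the original class:

// Sketch of the entity used in the test (reconstructed from usage).
public interface IUniqueId
{
    Guid Id { get; }
}

public class TestTableEntity : TableEntity, IUniqueId
{
    public Guid Id { get; set; }
    public Guid TestGuid { get; set; }
    public bool TestBool { get; set; }
    public DateTime TestDateTime { get; set; }
    public double TestDouble { get; set; }
    public int TestInt { get; set; }
    public string TestString { get; set; }
}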
And one more thing: your timing, and how your framework was affected, point to this: http://blogs.msdn.com/b/windowsazurestorage/archive/2013/08/08/net-clients-encountering-port-exhaustion-after-installing-kb2750149-or-kb2805227.aspx