Painfully slow Azure table insert and delete batch operations


OK, third answer's the charm?

http://blogs.msdn.com/b/windowsazurestorage/archive/2010/11/06/how-to-get-most-out-of-windows-azure-tables.aspx

A couple of things about the storage emulator, from a friend who did some serious digging into it:

"Everything is hitting a single table in a single database (more partitions doesn't affect anything). Each table insert operation is at least 3 sql operations. Every batch is inside a transaction. Depending on transaction isolation level, those batches will have limited ability to execute in parallel.

Serial batches should be faster than individual inserts due to sql server behavior. (Individual inserts are essentially little transactions that each flush to disk, while a real transaction flushes to disk as a group)."

I.e., using multiple partitions doesn't affect performance on the emulator, while it does against real Azure Storage.

Also enable logging and check your logs a little - c:\users\username\appdata\local\developmentstorage

A batch size of 100 seems to offer the best real performance; turn off Nagle, turn off Expect: 100-Continue, and beef up the connection limit (a sketch of those settings follows).
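A minimal sketch of those client-side settings (the same calls show up in the test setup further down; the exact values here are illustrative, not prescriptive):

//set once at startup, before any requests go out
ServicePointManager.DefaultConnectionLimit = 256;   //beef up the connection limit
ServicePointManager.UseNagleAlgorithm = false;      //turn off Nagle
ServicePointManager.Expect100Continue = false;      //turn off Expect: 100-Continue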

Also make damn sure you are not accidentally inserting duplicates; that will cause an error and slow everything way, way down.

And test against real storage. There's a pretty decent library out there that handles most of this for you - http://www.nuget.org/packages/WindowsAzure.StorageExtensions/ - just make sure you actually call ToList on the adds and such, as it won't really execute until enumerated. Also, that library uses DynamicTableEntity, so there's a small perf hit for the serialization, but it does allow you to use pure POCO objects with no TableEntity stuff.

~ JT

Basic concept - use parallelism to speed this up.

Step 1 - give your thread pool enough threads to pull this off: ThreadPool.SetMinThreads(1024, 256);

Step 2 - use partitions. I use GUIDs as IDs and use the last two characters to split into 256 unique partitions (actually I group those into N subsets, in my case 48 partitions).

Step 3 - insert using tasks; I use object pooling for table refs.

public List<T> InsertOrUpdate(List<T> items)
        {
            var subLists = SplitIntoPartitionedSublists(items);

            var tasks = new List<Task>();

            foreach (var subList in subLists)
            {
                List<T> list = subList;
                var task = Task.Factory.StartNew(() =>
                    {
                        var batchOp = new TableBatchOperation();
                        var tableRef = GetTableRef();

                        foreach (var item in list)
                        {
                            batchOp.Add(TableOperation.InsertOrReplace(item));
                        }

                        tableRef.ExecuteBatch(batchOp);
                        ReleaseTableRef(tableRef);
                    });
                tasks.Add(task);
            }

            Task.WaitAll(tasks.ToArray());

            return items;
        }
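A minimal usage sketch, assuming the method above sits in a repository class - MyTableRepository and MyEntity are hypothetical names here; the entity just needs the Id, RowKey, PartitionKey and ETag members the code relies on:

//hypothetical repository wrapping the InsertOrUpdate method shown above
var repo = new MyTableRepository<MyEntity>(storageAccount, "mytable");

var items = Enumerable.Range(0, 25000)
    .Select(_ => { var id = Guid.NewGuid(); return new MyEntity { Id = id, RowKey = id.ToString() }; })
    .ToList();

//splits the items into per-partition batches of at most MaxBatch entities and writes them in parallel
repo.InsertOrUpdate(items);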

private IEnumerable<List<T>> SplitIntoPartitionedSublists(IEnumerable<T> items)
        {
            var itemsByPartion = new Dictionary<string, List<T>>();

            //split items into partitions
            foreach (var item in items)
            {
                var partition = GetPartition(item);
                if (itemsByPartion.ContainsKey(partition) == false)
                {
                    itemsByPartion[partition] = new List<T>();
                }
                item.PartitionKey = partition;
                item.ETag = "*";
                itemsByPartion[partition].Add(item);
            }

            //split into subsets
            var subLists = new List<List<T>>();
            foreach (var partition in itemsByPartion.Keys)
            {
                var partitionItems = itemsByPartion[partition];
                for (int i = 0; i < partitionItems.Count; i += MaxBatch)
                {
                    subLists.Add(partitionItems.Skip(i).Take(MaxBatch).ToList());
                }
            }

            return subLists;
        }

        private void BuildPartitionIndentifiers(int partitonCount)
        {
            var chars = new char[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }.ToList();
            var keys = new List<string>();

            for (int i = 0; i < chars.Count; i++)
            {
                var keyA = chars[i];
                for (int j = 0; j < chars.Count; j++)
                {
                    var keyB = chars[j];
                    keys.Add(string.Concat(keyA, keyB));
                }
            }


            var keySetMaxSize = Math.Max(1, (int)Math.Floor((double)keys.Count / ((double)partitonCount)));
            var keySets = new List<List<string>>();

            if (partitonCount > keys.Count)
            {
                partitonCount = keys.Count;
            }

            //Build the key sets
            var index = 0;
            while (index < keys.Count)
            {
                var keysSet = keys.Skip(index).Take(keySetMaxSize).ToList();
                keySets.Add(keysSet);
                index += keySetMaxSize;
            }

            //build the lookups and datatable for each key set
            _partitions = new List<string>();
            for (int i = 0; i < keySets.Count; i++)
            {
                var partitionName = String.Concat("subSet_", i);
                foreach (var key in keySets[i])
                {
                    _partitionByKey[key] = partitionName;
                }
                _partitions.Add(partitionName);
            }

        }

        private string GetPartition(T item)
        {
            var partKey = item.Id.ToString().Substring(34,2);
            return _partitionByKey[partKey];
        }

        private string GetPartition(Guid id)
        {
            var partKey = id.ToString().Substring(34, 2);
            return _partitionByKey[partKey];
        }

        private CloudTable GetTableRef()
        {
            CloudTable tableRef = null;
            //try to pop a table ref out of the stack
            var foundTableRefInStack = _tableRefs.TryPop(out tableRef);
            if (foundTableRefInStack == false)
            {
                //no table ref available must create a new one
                var client = _account.CreateCloudTableClient();
                client.RetryPolicy = new ExponentialRetry(TimeSpan.FromSeconds(1), 4);
                tableRef = client.GetTableReference(_sTableName);
            }

            //ensure table is created
            if (_bTableCreated != true)
            {
                tableRef.CreateIfNotExists();
                _bTableCreated = true;
            }

            return tableRef;
        }
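The ReleaseTableRef counterpart isn't shown; a minimal sketch, assuming _tableRefs is the ConcurrentStack<CloudTable> that GetTableRef pops from:

private void ReleaseTableRef(CloudTable tableRef)
{
    //return the table ref to the pool so another batch task can reuse it
    _tableRefs.Push(tableRef);
}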

Result: 19-22k ops/sec - the storage account maximum.

Hit me up if you're interested in the full source.

Need more? Use multiple storage accounts!

This is from months of trial and error, testing, and beating my head against a desk. I really hope it helps.

Ashraf Alam

After going through lots of pain and experiments, I was finally able to get optimal throughput for a single table partition (2,000+ batch write operations per second) and much better throughput per storage account (3,500+ batch write operations per second) with Azure Table storage. I tried all the different approaches, but setting the .NET connection limit programmatically (the configuration sample didn't work for me) solved the problem, based on a white paper provided by Microsoft, as shown below:

ServicePoint tableServicePoint = ServicePointManager
    .FindServicePoint(_StorageAccount.TableEndpoint);

//This is a notorious issue that has affected many developers. By default, the value 
//for the number of .NET HTTP connections is 2.
//This implies that only 2 concurrent connections can be maintained. This manifests itself
//as "underlying connection was closed..." when the number of concurrent requests is
//greater than 2.

tableServicePoint.ConnectionLimit = 1000;

Anyone else who got 20K+ batch write operations per storage account, please share your experience.

For more fun, here's a new answer - an isolated, independent test that's pulling some amazing numbers for write performance on production and does a hell of a lot better at avoiding IO blocking and connection management. I'm very interested to see how this works for you, as we are getting ridiculous write speeds (> 7k per second).

web.config

<system.net>
  <connectionManagement>
    <add address="*" maxconnection="48"/>
  </connectionManagement>
</system.net>

For the test I was using parameters based on volume, so e.g. 25,000 items and 24 partitions; a batch size of 100 seems to always be the best, and a ref count of 20. This is using TPL Dataflow (http://www.nuget.org/packages/Microsoft.Tpl.Dataflow/) for BufferBlock, which provides nice awaitable, thread-safe table reference pooling.

public class DyanmicBulkInsertTestPooledRefsAndAsynch : WebTest, IDynamicWebTest
{
    private int _itemCount;
    private int _partitionCount;
    private int _batchSize;
    private List<TestTableEntity> _items;
    private GuidIdPartitionSplitter<TestTableEntity> _partitionSplitter;
    private string _tableName;
    private CloudStorageAccount _account;
    private CloudTableClient _tableClient;
    private Dictionary<string, List<TestTableEntity>> _itemsByParition;
    private int _maxRefCount;
    private BufferBlock<CloudTable> _tableRefs;


    public DyanmicBulkInsertTestPooledRefsAndAsynch()
    {
        Properties = new List<ItemProp>();    
        Properties.Add(new ItemProp("ItemCount", typeof(int)));
        Properties.Add(new ItemProp("PartitionCount", typeof(int)));
        Properties.Add(new ItemProp("BatchSize", typeof(int)));
        Properties.Add(new ItemProp("MaxRefs", typeof(int)));


    }

    public List<ItemProp> Properties { get; set; }

    public void SetProps(Dictionary<string, object> propValuesByPropName)
    {
        _itemCount = (int)propValuesByPropName["ItemCount"];
        _partitionCount = (int)propValuesByPropName["PartitionCount"];
        _batchSize = (int)propValuesByPropName["BatchSize"];
        _maxRefCount = (int)propValuesByPropName["MaxRefs"];
    }

    protected override void SetupTest()
    {
        base.SetupTest();

        ThreadPool.SetMinThreads(1024, 256);
        ServicePointManager.DefaultConnectionLimit = 256;
        ServicePointManager.UseNagleAlgorithm = false;
        ServicePointManager.Expect100Continue = false;


        _account = CloudStorageAccount.Parse(CloudConfigurationManager.GetSetting("DataConnectionString"));
        _tableClient = _account.CreateCloudTableClient();
        _tableName = "testtable" + new Random().Next(100000);

        //create the refs
        _tableRefs = new BufferBlock<CloudTable>();
        for (int i = 0; i < _maxRefCount; i++)
        {
            _tableRefs.Post(_tableClient.GetTableReference(_tableName));
        }

        var tableRefTask = GetTableRef();
        tableRefTask.Wait();
        var tableRef = tableRefTask.Result;

        tableRef.CreateIfNotExists();
        ReleaseRef(tableRef);

        _items = TestUtils.GenerateTableItems(_itemCount);
        _partitionSplitter = new GuidIdPartitionSplitter<TestTableEntity>();
        _partitionSplitter.BuildPartitions(_partitionCount);

        _items.ForEach(o =>
            {
                o.ETag = "*";
                o.Timestamp = DateTime.Now;
                o.PartitionKey = _partitionSplitter.GetPartition(o);
            });

        _itemsByParition = _partitionSplitter.SplitIntoPartitionedSublists(_items);
    }

    private async Task<CloudTable> GetTableRef()
    {
        return await _tableRefs.ReceiveAsync();            
    }

    private void ReleaseRef(CloudTable tableRef)
    {
        _tableRefs.Post(tableRef);
    }

    protected override void ExecuteTest()
    {
        Task.WaitAll(_itemsByParition.Keys.Select(parition => Task.Factory.StartNew(() => InsertParitionItems(_itemsByParition[parition]))).ToArray());
    }

    private void InsertParitionItems(List<TestTableEntity> items)
    {

        var tasks = new List<Task>();

        for (int i = 0; i < items.Count; i += _batchSize)
        {
            int i1 = i;

            //Task.Run (rather than Task.Factory.StartNew) unwraps the async lambda,
            //so the Task.WaitAll below actually waits for the batch writes to finish
            var task = Task.Run(async () =>
            {
                var batchItems = items.Skip(i1).Take(_batchSize).ToList();

                if (batchItems.Select(o => o.PartitionKey).Distinct().Count() > 1)
                {
                    throw new Exception("Multiple partitions batch");
                }

                var batchOp = new TableBatchOperation();
                batchItems.ForEach(batchOp.InsertOrReplace);   

                var tableRef = await GetTableRef();
                tableRef.ExecuteBatch(batchOp);
                ReleaseRef(tableRef);
            });

            tasks.Add(task);

        }

        Task.WaitAll(tasks.ToArray());


    }

    protected override void CleanupTest()
    {
        var tableRefTask = GetTableRef();
        tableRefTask.Wait();
        var tableRef = tableRefTask.Result;
        tableRef.DeleteIfExists();
        ReleaseRef(tableRef);
    }
}

We are currently working on a version that can handle multiple storage accounts to hopefully get some insane speeds. Also, we are running these on 8-core virtual machines for large datasets, but with the new non-blocking IO it should run great on a limited VM. Good luck!
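That multi-account version isn't shown here; a purely hypothetical sketch of the idea - rotate batches across several clients (the field and helper names are made up):

//one CloudTableClient per storage account, picked round-robin per batch
private CloudTableClient[] _clients;
private int _nextClient;

private void InitClients(IEnumerable<string> connectionStrings)
{
    _clients = connectionStrings
        .Select(cs => CloudStorageAccount.Parse(cs).CreateCloudTableClient())
        .ToArray();
}

private CloudTable GetTableRefRoundRobin(string tableName)
{
    //thread-safe rotation through the clients (the mask keeps the index non-negative)
    var i = (Interlocked.Increment(ref _nextClient) & int.MaxValue) % _clients.Length;
    return _clients[i].GetTableReference(tableName);
}

This only spreads write load; a real version would presumably map partitions to accounts deterministically so reads know which account to query.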

 public class SimpleGuidIdPartitionSplitter<T> where T : IUniqueId
{
    private ConcurrentDictionary<string, string> _partitionByKey = new ConcurrentDictionary<string, string>();
    private List<string> _partitions;
    private bool _bPartitionsBuilt;

    public SimpleGuidIdPartitionSplitter()
    {

    }

    public void BuildPartitions(int iPartCount)
    {
        BuildPartitionIndentifiers(iPartCount);
    }

    public string GetPartition(T item)
    {
        if (_bPartitionsBuilt == false)
        {
            throw new Exception("Partitions Not Built");
        }

        var partKey = item.Id.ToString().Substring(34, 2);
        return _partitionByKey[partKey];
    }

    public string GetPartition(Guid id)
    {
        if (_bPartitionsBuilt == false)
        {
            throw new Exception("Partitions Not Built");
        }

        var partKey = id.ToString().Substring(34, 2);
        return _partitionByKey[partKey];
    }

    #region Helpers
    private void BuildPartitionIndentifiers(int partitonCount)
    {
        var chars = new char[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }.ToList();
        var keys = new List<string>();

        for (int i = 0; i < chars.Count; i++)
        {
            var keyA = chars[i];
            for (int j = 0; j < chars.Count; j++)
            {
                var keyB = chars[j];
                keys.Add(string.Concat(keyA, keyB));
            }
        }


        var keySetMaxSize = Math.Max(1, (int)Math.Floor((double)keys.Count / ((double)partitonCount)));
        var keySets = new List<List<string>>();

        if (partitonCount > keys.Count)
        {
            partitonCount = keys.Count;
        }

        //Build the key sets
        var index = 0;
        while (index < keys.Count)
        {
            var keysSet = keys.Skip(index).Take(keySetMaxSize).ToList();
            keySets.Add(keysSet);
            index += keySetMaxSize;
        }

        //build the lookups and datatable for each key set
        _partitions = new List<string>();
        for (int i = 0; i < keySets.Count; i++)
        {
            var partitionName = String.Concat("subSet_", i);
            foreach (var key in keySets[i])
            {
                _partitionByKey[key] = partitionName;
            }
            _partitions.Add(partitionName);
        }

        _bPartitionsBuilt = true;
    }
    #endregion
}
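A minimal usage sketch of the splitter (TestTableEntity is assumed to implement IUniqueId, i.e. expose the Guid Id that GetPartition reads):

var splitter = new SimpleGuidIdPartitionSplitter<TestTableEntity>();
splitter.BuildPartitions(48);   //build the lookup from two-character guid suffix to partition name (48 requested buckets)

var entity = new TestTableEntity { Id = Guid.NewGuid() };
//maps the last two characters of the guid to one of the "subSet_N" partition names
entity.PartitionKey = splitter.GetPartition(entity);
entity.RowKey = entity.Id.ToString();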



internal static List<TestTableEntity> GenerateTableItems(int count)
        {
            var items = new List<TestTableEntity>();
            var random = new Random();

            for (int i = 0; i < count; i++)
            {
                var itemId = Guid.NewGuid();

                items.Add(new TestTableEntity()
                {
                    Id = itemId,
                    TestGuid = Guid.NewGuid(),
                    RowKey = itemId.ToString(),
                    TestBool = true,
                    TestDateTime = DateTime.Now,
                    TestDouble = random.Next() * 1000000,
                    TestInt = random.Next(10000),
                    TestString = Guid.NewGuid().ToString(),
                });
            }

            var dupRowKeys = items.GroupBy(o => o.RowKey).Where(o => o.Count() > 1).Select(o => o.Key).ToList();
            if (dupRowKeys.Count > 0)
            {
                throw new Exception("Duplicate Row Keys");
            }

            return items;
        }

And one more thing - your timing and how our framework was affected point to this: http://blogs.msdn.com/b/windowsazurestorage/archive/2013/08/08/net-clients-encountering-port-exhaustion-after-installing-kb2750149-or-kb2805227.aspx
