How to remove duplicates with a certain condition in MongoDB?

Asked by 一生所求 on 2021-01-21 20:31

For example, I have the following documents in my collection:

{
    "_id" : "GuqXmAkkARqhBDqhy",
    "beatmapset_id" : "342537",
    "version" : "MX"
}
2 Answers
  •  既然无缘
    2021-01-21 21:19

    One approach is to use the aggregation framework to get, for each duplicated beatmapset_id, the list of _id values of the documents that share it:

    db.collection.aggregate([
        {
            "$group": {
                "_id": "$beatmapset_id",
                "count": { "$sum": 1 },
                "uniqueIds": { "$addToSet": "$_id" },
                "maxRating": { "$max": "$difficultyrating" }
            }
        },
        { 
            "$match": { 
                "count": { "$gte": 2 } 
            } 
        },
        { 
            "$sort" : { "count" : -1 } 
        }
    ]);
    

    In the first stage of this example pipeline, the $group operator aggregates documents by beatmapset_id and records each grouped document's _id in the uniqueIds array via $addToSet. The $sum operator adds up the values passed to it, here the constant 1, so the count field ends up holding the number of documents in each group. Finally, the $max operator captures the group's highest difficultyrating value in the maxRating field.

    In the second stage, the $match operator discards every group with a count of 1; those groups correspond to unique index keys.

    The remaining groups identify documents in the collection that share duplicate keys.

    Sample Output:

    /* 0 */
    {
        "result" : [ 
            {
                "_id" : "342537",
                "count" : 3,
                "uniqueIds" : [ 
                    "GbotZfrPEwW69FkGD", 
                    "oHLT7KqsB7bztBGvu", 
                    "GuqXmAkkARqhBDqhy"
                ],
                "maxRating" : "3.5552737712860107"
            }
        ],
        "ok" : 1
    }
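To make the two stages concrete, the same grouping logic can be sketched in plain JavaScript over an in-memory array, with no MongoDB required. The findDuplicates function and the field names it reads are illustrative assumptions mirroring the pipeline above, not a driver API:

```javascript
// In-memory sketch of the $group + $match + $sort stages above.
// Note: difficultyrating is stored as a string in the sample data,
// so the max comparison here is lexicographic, as it would be in MongoDB.
function findDuplicates(docs) {
    const groups = {};
    for (const doc of docs) {
        const key = doc.beatmapset_id;
        if (!groups[key]) {
            groups[key] = { _id: key, count: 0, uniqueIds: [], maxRating: null };
        }
        const g = groups[key];
        g.count += 1;                  // "$sum": 1
        g.uniqueIds.push(doc._id);     // "$addToSet": "$_id"
        if (g.maxRating === null || doc.difficultyrating > g.maxRating) {
            g.maxRating = doc.difficultyrating;   // "$max": "$difficultyrating"
        }
    }
    return Object.values(groups)
        .filter(g => g.count >= 2)           // "$match": { "count": { "$gte": 2 } }
        .sort((a, b) => b.count - a.count);  // "$sort": { "count": -1 }
}
```

Feeding it the three sample documents plus one unrelated document yields a single group with count 3, matching the sample output above.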
    

    Since the db.collection.aggregate() method returns a cursor and can produce result sets of any size, use the cursor's forEach() method to iterate over the result documents and queue a Bulk API remove operation for each group:

    var pipeline = [
            {
                "$group": {
                    "_id": "$beatmapset_id",
                    "count": { "$sum": 1 },
                    "uniqueIds": { "$addToSet": "$_id" },
                    "maxRating": { "$max": "$difficultyrating" }
                }
            },
            { 
                "$match": { 
                    "count": { "$gte": 2 } 
                } 
            },
            { 
                "$sort" : { "count" : -1 } 
            }
        ],
        counter = 0,
        bulk = db.collection.initializeOrderedBulkOp();
    
    db.collection.aggregate(pipeline).forEach(function(doc) {
        bulk.find({ 
            "_id": { "$in": doc.uniqueIds },
            "difficultyrating": { "$lt": doc.maxRating }    
        }).remove();
    
        counter++;
        if ( counter % 500 == 0 ) {
            // Execute per 500 operations and re-init.
            bulk.execute(); 
            bulk = db.collection.initializeOrderedBulkOp(); 
        }
    });
    
    // Flush any remaining queued operations (fewer than 500)
    if (counter % 500 != 0)
        bulk.execute(); 
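
The removal criterion used in bulk.find(...).remove() above — delete every document in a duplicate group whose difficultyrating falls below the group's maximum, keeping the top-rated one — can also be sketched in plain JavaScript. The idsToRemove helper below is a hypothetical illustration, not part of any driver:

```javascript
// Sketch of the bulk.find() selector: within one duplicate group, select the
// _ids of all documents rated strictly below the group's maxRating.
// (Ratings are strings in the sample data, so "<" compares lexicographically,
// matching MongoDB's ordering for strings.)
function idsToRemove(docs, group) {
    return docs
        .filter(d => group.uniqueIds.includes(d._id) &&
                     d.difficultyrating < group.maxRating)
        .map(d => d._id);
}
```

The document holding the group's maximum rating never satisfies the strict "$lt" comparison, so exactly one document per group survives (assuming the maximum is unique within the group).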
    
