"""Maintenance helpers for a MongoDB-backed crawl seed collection.

Provides the ``Mongo`` helper class (status reset, index management and
duplicate removal) and a small ``click`` command, ``run``, that exposes the
status-reset and index operations on the command line.
"""
import click
import pymongo
from pymongo.errors import PyMongoError

# Basic database connection settings.
db_configs = {
    'type': 'mongo',
    'host': '127.0.0.1',
    'port': '27017',
    "user": "",
    "password": "",
    'db_name': 'mafengwo'
}


class Mongo:
    """Thin wrapper around a PyMongo database handle built from ``db_configs``."""

    def __init__(self):
        """Create the client and select the configured database.

        Credentials are only sent when both ``user`` and ``password`` are
        non-empty in ``db_configs``.
        """
        self.db_name = db_configs.get("db_name")
        self.host = db_configs.get("host")
        self.port = db_configs.get("port")
        self.username = db_configs.get("user")
        # The config key is "password"; reading "passwd" always yielded None,
        # which silently disabled authentication.
        self.password = db_configs.get("password")

        client_options = {"connect": False, "maxPoolSize": 10}
        if self.username and self.password:
            # Database.authenticate() was removed in PyMongo 4; credentials are
            # passed to the client instead. authSource keeps authentication
            # against the working database, as the old call did.
            client_options.update(
                username=self.username,
                password=self.password,
                authSource=self.db_name,
            )
        self.client = pymongo.MongoClient(
            f'mongodb://{self.host}:{self.port}', **client_options
        )
        self.db = self.client[self.db_name]

    def reset_status(self, col="dianping_seed_data"):
        """Set ``status`` back to 0 for documents whose status is 1 or 3.

        :param col: name of the collection to update.
        """
        self.db[col].update_many(
            {'$or': [{'status': 1}, {'status': 3}]},
            {'$set': {"status": 0}},
        )

    def reset_all_status(self, col="dianping_seed_data"):
        """Set ``status`` to 0 for every document in the collection.

        :param col: name of the collection to update.
        """
        self.db[col].update_many({}, {'$set': {"status": 0}})

    def add_index(self, col="dianping_seed_data"):
        """Create an ascending index on ``status``.

        Status codes: 0 = initial, 1 = download started, 2 = download finished.

        :param col: name of the collection to index.
        """
        # The index must not be unique: many documents share the same status
        # (reset_all_status sets every one of them to 0), so a unique index
        # could never be built on real data.
        self.db[col].create_index([('status', pymongo.ASCENDING)])

    def get_index(self, col="dianping_seed_data"):
        """Print every index defined on the collection.

        :param col: name of the collection to inspect.
        """
        for index in self.db[col].list_indexes():
            print(index)

    def find_duplicate(self, col="dianping_seed_data"):
        """Group documents by ``url`` and write the duplicated groups to ``result``.

        The final ``$out`` stage writes the aggregation output to the
        ``result`` collection; each output document carries the grouped url,
        the list of ``_id`` values sharing it (``_id_list``) and their count.

        :param col: name of the collection to scan.
        :return: the cursor returned by ``aggregate``.
        """
        pipeline = [
            # NOTE(review): the original put this `$match` in the same stage
            # dict as `$group`, which the server rejects (one field per
            # stage). `status` no longer exists after grouping, so the filter
            # is applied before it -- confirm this is the intended scope.
            {'$match': {'status': {'$lte': 0}}},
            {'$group': {
                '_id': {'url': "$url"},
                # Collect the _id of every document in the group.
                '_id_list': {'$addToSet': "$_id"},
                'count': {'$sum': 1},
            }},
            # Only urls that occur more than once are duplicates.
            {'$match': {'count': {'$gt': 1}}},
            {'$out': 'result'},
        ]
        result = self.db[col].aggregate(pipeline, allowDiskUse=True)
        # A pipeline ending in $out is expected to yield no documents on the
        # cursor; the loop is kept so any returned item is still printed.
        for item in result:
            print(item)
        return result

    def delete_dup(self, col="dianping_seed_data"):
        """Delete duplicates listed in ``result``, keeping one document per group.

        For every document in the ``result`` collection all ids in
        ``_id_list`` except the first are removed from ``col``; ``result`` is
        dropped afterwards. Database errors are reported with ``print`` and
        not re-raised, in which case ``result`` is left in place.

        :param col: name of the collection to delete duplicates from.
        """
        try:
            for group in self.db.result.find():
                # Keep one document per group. $addToSet does not guarantee an
                # order, so which document survives is arbitrary.
                surplus_ids = (group.get("_id_list") or [])[1:]
                if surplus_ids:
                    self.db[col].delete_many({'_id': {'$in': surplus_ids}})
            self.db.result.drop()
        except PyMongoError as e:
            print("删除的时候出现问题", e.args)


@click.command()
@click.option('--s', type=str, default="two", help="状态:all表示全部重置为0,two:表示重置状态为1、3的重置为0")
@click.option('--i', type=str, default="a", help="a:增加索引 g:获取索引")
def run(s, i):
    """Reset document statuses and manage the status index.

    :param s: ``all`` resets every status to 0; ``two`` resets only statuses
        1 and 3 to 0.
    :param i: ``a`` adds the status index; ``g`` prints the existing indexes.
    """
    m = Mongo()
    if s:
        print("获取参数为:", s)
        if s == "all":
            print("所有数据状态重置为0:", s)
            m.reset_all_status()
        elif s == "two":
            print("部分数据状态重置为0:", s)
            # The original printed the message but never performed the reset.
            m.reset_status()
    if i:
        if i == "a":
            m.add_index()
        elif i == "g":
            m.get_index()


if __name__ == '__main__':
    # Running the file directly only removes duplicates recorded in `result`;
    # the `run` command above is not invoked from here.
    m = Mongo()
    m.delete_dup()
# Source: https://www.cnblogs.com/c-x-a/p/11844881.html