# coding=utf-8
'''
Created on 2016-03-31 10:32:20

Check the URLs that have already been downloaded and flag duplicates:
every document in ``spider.detail_todo`` whose ``url`` was already seen
earlier in the scan gets its ``status`` set to ``Constant.DISTINCT``.

@author: ChenHao
'''
from pymongo.mongo_client import MongoClient
from util_common import Constant
import time

cli = MongoClient(Constant.MONGODB_URL)
db = cli.spider
timeStart = time.time()

url_repeat_set = set()   # URLs that were encountered more than once
url_list = list()        # each distinct URL, in first-seen order
url_seen = set()         # same contents as url_list; gives O(1) membership tests
# NOTE(review): assumes every "url" value is hashable (e.g. a string) -- confirm.

try:
    # Save bandwidth: only fetch the two fields that are actually used.
    rs = db.detail_todo.find({}, {"_id": True, "url": True}, no_cursor_timeout=True)
    try:
        # NOTE(review): Cursor.count() is unavailable in newer PyMongo releases;
        # switch to Collection.count_documents({}) when upgrading the driver.
        print ("共计:", rs.count())
        for r in rs:
            url = r["url"]
            if url in url_seen:
                print (r)
                url_repeat_set.add(url)
                # Update the status of this (duplicate) record.
                db.detail_todo.update_one({"_id": r["_id"]}, {'$set': {'status': Constant.DISTINCT}})
            else:
                url_seen.add(url)
                url_list.append(url)
    finally:
        # The cursor was opened with no_cursor_timeout=True, so the server will
        # not reap it on its own; close it explicitly even if the scan fails.
        rs.close()

    timeEnd = time.time()
    print ("重复:", len(url_repeat_set))
    print ("有效:", len(url_list))
    print ("耗时:", timeEnd-timeStart)
finally:
    # Always release the client connection, including on error paths.
    cli.close()