| 1234567891011121314151617181920212223242526272829303132333435363738 |
# coding=utf-8
"""
Created on 2016-03-31 10:32:20

Scan the URLs stored in the ``spider.detail_todo`` collection and flag repeats.

The first document seen for each URL is left untouched; every later document
carrying the same URL gets its ``status`` field set to ``Constant.DISTINCT``.
Totals and the elapsed time are printed at the end.

@author: ChenHao
"""
import time

from pymongo.mongo_client import MongoClient
from util_common import Constant

cli = MongoClient(Constant.MONGODB_URL)
try:
    db = cli.spider
    timeStart = time.time()
    url_repeat_set = set()
    url_list = list()
    # Mirrors url_list so the membership test below is O(1) instead of a
    # linear scan of the list for every document.
    url_seen = set()
    # Cursor.count() is deprecated/removed in newer PyMongo releases;
    # count_documents({}) returns the same total for an unfiltered query.
    print("共计:", db.detail_todo.count_documents({}))
    # To save bandwidth, fetch only the two fields that are actually used.
    rs = db.detail_todo.find(
        {}, {"_id": True, "url": True}, no_cursor_timeout=True
    )
    try:
        for r in rs:
            url = r["url"]
            if url in url_seen:
                print(r)
                url_repeat_set.add(url)
                # Mark this repeated document's status.
                db.detail_todo.update_one(
                    {"_id": r["_id"]},
                    {"$set": {"status": Constant.DISTINCT}},
                )
            else:
                url_seen.add(url)
                url_list.append(url)
    finally:
        # A no_cursor_timeout cursor is not reaped by the server on idle,
        # so it must be closed explicitly even when the loop fails.
        rs.close()
    timeEnd = time.time()
    print("重复:", len(url_repeat_set))
    print("有效:", len(url_list))
    print("耗时:", timeEnd - timeStart)
finally:
    cli.close()
|