distinct_detail.py 902 B

1234567891011121314151617181920212223242526272829303132333435363738
  1. # coding=utf-8
  2. '''
  3. Created on 2016年3月31日10:32:20
  4. 先检查已经下载好的网址进行
  5. @author: ChenHao
  6. '''
  7. from pymongo.mongo_client import MongoClient
  8. from util_common import Constant
  9. import time
  10. cli = MongoClient(Constant.MONGODB_URL)
  11. db = cli.spider
  12. timeStart = time.time()
  13. url_repeat_set = set()
  14. url_list = list()
  15. # 节约流量,这里只取2个字段
  16. rs = db.detail_todo.find({}, {"_id": True, "url": True}, no_cursor_timeout=True)
  17. print ("共计:", rs.count())
  18. for r in rs:
  19. if r["url"] in url_list:
  20. print (r)
  21. url_repeat_set.add(r["url"])
  22. # 更新那一条的状态
  23. db.detail_todo.update_one({"_id": r["_id"]}, {'$set': {'status': Constant.DISTINCT}})
  24. else:
  25. url_list.append(r["url"])
  26. timeEnd = time.time()
  27. print ("重复:", len(url_repeat_set))
  28. print ("有效:", len(url_list))
  29. print ("耗时:", timeEnd-timeStart)
  30. cli.close()