# distinct_kindlist.py 604 B
  # coding=utf-8
'''
Created on 2016-03-29

Remove duplicates that were not accounted for earlier.

Reads ../spider_download/Other/RepeatKindUrlStart.csv, takes the fourth
comma-separated field of every row as a URL prefix, and sets ``status`` to
``Constant.DISTINCT`` on every ``spider.kindlist_todo`` document whose ``url``
starts with that prefix.

@author: ChenHao
'''
import re
import sys

from pymongo.mongo_client import MongoClient

from util_common import Constant

# Zero-based index of the CSV field that holds the URL prefix.
URL_COLUMN = 3


def _url_prefix(line):
    """Return the trimmed URL-prefix field of a CSV row, or None if absent.

    The row is split on plain commas; quoted fields are not supported.
    None is returned when the row has too few fields or the field is empty
    after stripping surrounding whitespace (including a trailing newline).
    """
    fields = line.split(",")
    if len(fields) <= URL_COLUMN:
        return None
    return fields[URL_COLUMN].strip() or None


with open("../spider_download/Other/RepeatKindUrlStart.csv") as fin:
    lines = fin.readlines()

cli = MongoClient(Constant.MONGODB_URL)
# For a local instance use: MongoClient("mongodb://localhost:27017/")
try:
    db = cli.spider
    for line_no, line in enumerate(lines, 1):
        if not line.strip():
            # Blank lines (e.g. a trailing newline at end of file) carry no data.
            continue
        url = _url_prefix(line)
        if url is None:
            # An empty prefix would compile to a pattern matching every
            # document, so such rows must never reach update_many.
            sys.stderr.write(
                "skipping line %d: no URL in column %d\n" % (line_no, URL_COLUMN))
            continue
        # Escape the URL so characters such as '.' and '?' are matched
        # literally; anchoring with '^' alone already gives a prefix match.
        prefix_re = re.compile('^' + re.escape(url))
        db.kindlist_todo.update_many(
            {'url': prefix_re},
            {'$set': {'status': Constant.DISTINCT}})
finally:
    # Release the connection even if a row or an update fails.
    cli.close()