| 12345678910111213141516171819202122232425 |
# coding=utf-8
'''
Created on 2016-03-29

Flag previously overlooked duplicates: every document in the
``kindlist_todo`` collection whose ``url`` starts with one of the URL
prefixes listed in RepeatKindUrlStart.csv gets its ``status`` set to
``Constant.DISTINCT``.

@author: ChenHao
'''
import re

from pymongo.mongo_client import MongoClient

from util_common import Constant

# Read every row up front so the file is closed before the database is used.
with open("../spider_download/Other/RepeatKindUrlStart.csv") as fin:
    lines = fin.readlines()

cli = MongoClient(Constant.MONGODB_URL)
# For a local database use: cli = MongoClient("mongodb://localhost:27017/")
try:
    db = cli.spider
    for line in lines:
        # Drop the line terminator so it cannot leak into the pattern when
        # the URL happens to be the last column of the row.
        fields = line.rstrip("\r\n").split(",")

        # The URL prefix lives in the fourth column. Skip blank or short
        # rows, and rows with an empty prefix: an empty prefix would match
        # (and therefore flag) every document that has a url.
        if len(fields) < 4 or not fields[3]:
            continue
        url = fields[3]

        # Escape the URL so characters such as '.', '?' and '+' are matched
        # literally instead of being interpreted as regex operators. The
        # pattern is anchored at the start only; a trailing '.*' is not
        # needed for a prefix search.
        prefix_pattern = re.compile('^' + re.escape(url))
        db.kindlist_todo.update_many(
            {'url': prefix_pattern},
            {'$set': {'status': Constant.DISTINCT}},
        )
finally:
    # Release the connection even if an update fails part-way through.
    cli.close()
|