# coding=utf-8
'''
Created on 2016-03-29

Mark previously overlooked duplicates in the ``kindlist_todo`` collection.

Each row of RepeatKindUrlStart.csv carries a URL prefix in its fourth
comma-separated column. Every ``kindlist_todo`` document whose ``url`` starts
with that prefix gets its ``status`` set to ``Constant.DISTINCT``.

@author: ChenHao
'''
from pymongo.mongo_client import MongoClient
import re
from util_common import Constant

# Zero-based index of the CSV column that holds the URL prefix.
URL_COLUMN = 3

# Read the whole list up front; the context manager closes the file even if
# reading fails.
with open("../spider_download/Other/RepeatKindUrlStart.csv") as fin:
    lines = fin.readlines()

cli = MongoClient(Constant.MONGODB_URL)
# For a local run, connect with MongoClient("mongodb://localhost:27017/") instead.
try:
    db = cli.spider
    for line in lines:
        # Drop the line terminator first: if the URL column is the last one,
        # the newline would otherwise become part of the pattern.
        fields = line.rstrip("\r\n").split(",")
        if len(fields) <= URL_COLUMN:
            # Blank or truncated row: nothing to match against.
            continue
        url = fields[URL_COLUMN].strip()
        if not url:
            # An empty prefix would match (and re-status) every document.
            continue
        # Escape the URL so characters such as '.', '?' and '+' are matched
        # literally. A bare '^prefix' selects the same documents as
        # '^prefix.*' and lets the server stop scanning after the prefix.
        prefix_pattern = re.compile('^' + re.escape(url))
        db.kindlist_todo.update_many(
            {'url': prefix_pattern},
            {'$set': {'status': Constant.DISTINCT}},
        )
finally:
    # Release the connection even when an update raises.
    cli.close()