# coding=utf-8
'''
Created on 2016-04-06

Category translation mapping: mouser -> uu.

Reads (mouser id, uu id) pairs from a CSV file, stores the de-duplicated
mapping in the ``kind_translate`` MongoDB collection, then reports how many
leaf categories in ``kind_temp_2`` are not covered by the mapping.

@author: ChenHao
'''
from pymongo.mongo_client import MongoClient
# NOTE(review): CONSTRAINTS appears unused in this script; kept as-is in case
# something outside this view relies on the import.
from pip._vendor.distlib.util import CONSTRAINTS
from util_common import Constant

cli = MongoClient(Constant.MONGODB_URL)
db = cli.spider

file_path = "../spider_download/Other/kind_translate.csv"

# 1. Store the translation mapping in MongoDB.
# The context manager guarantees the file is closed even if reading fails.
with open(file_path, "r") as fin:
    lines = fin.readlines()

# Drop rows that are not valid: the first two columns must both parse as
# integers. Rows with non-numeric values (ValueError) or fewer than two
# columns (IndexError) are skipped on purpose.
relation_list = list()
for line in lines:
    arr_line = line.replace("\n", "").split(",")
    try:
        arr_line[0] = int(arr_line[0])
        arr_line[1] = int(arr_line[1])
    except (ValueError, IndexError):
        continue
    relation_list.append(arr_line)

# De-duplicate by mouser id; the first occurrence of each id wins.
temp_set = set()
relation_final_list = list()
for rl in relation_list:
    if rl[0] in temp_set:
        continue
    temp_set.add(rl[0])
    relation_final_list.append({"mouserid": rl[0], "uuid": rl[1]})

print(relation_final_list)

# insert_many() rejects an empty document list, so only insert when there is
# at least one valid relation.
if relation_final_list:
    db.kind_translate.insert_many(relation_final_list)

# Walk the leaf categories and collect the ones the mapping does not cover.
# The leaves are counted while iterating instead of calling Cursor.count(),
# which was deprecated in PyMongo 3.7 and removed in 4.0.
not_relate_list = list()
leaf_count = 0
rs = db.kind_temp_2.find({"isLeaf": True})
for r in rs:
    leaf_count += 1
    if r["id"] not in temp_set:
        not_relate_list.append(r)

print(len(temp_set))          # number of distinct mouser ids in the mapping
print(leaf_count)             # number of leaf categories
print(len(not_relate_list))   # number of leaf categories without a mapping