# coding=utf-8
"""
Created on 2016-04-05

Builds all of the related (linked) data collections from the raw
``component_original`` data.

@author: ChenHao
"""
from pymongo.mongo_client import MongoClient
from util_common import Constant
from analysis import detailAnalysis_kind


class CreatMain(object):
    """Drives the derivation of the ``*_temp`` collections from ``component_original``."""

    def __init__(self):
        self.analysis_kind = detailAnalysis_kind.DetailAnalysisKind()
        # In-memory copies of lookup collections, filled by
        # _copy_data_form_original().
        self.kind = []
        self.property = []
        self.kindtranslate = []

    def craw(self):
        """Placeholder; intentionally does nothing."""
        pass

    def create_component_distinct(self, db):
        """Flag documents of ``component_original`` that share a URL.

        Every document is first reset to ``Constant.DONE``.  The collection is
        then scanned once; the first document seen for each URL keeps that
        status, and every later document with the same URL is set to
        ``Constant.DISTINCT``.

        :param db: database handle exposing the ``component_original``
            collection.
        :raises KeyError: if a document has no ``url`` field.
        """
        # 2016-04-05: the data set is too large for MongoDB's own ``distinct``,
        # so (2016-04-14) the de-duplication is done client-side in memory.
        seen_urls = set()
        duplicate_ids = []

        # Reset the status of every document before re-flagging duplicates.
        db.component_original.update_many(
            {}, {'$set': {'status': Constant.DONE}})

        # Only _id and url are needed, so only those are fetched.
        rs = db.component_original.find(
            {}, {"_id": True, "url": True}, no_cursor_timeout=True)
        try:
            for component in rs:
                url = component["url"]
                if url in seen_urls:
                    duplicate_ids.append(component["_id"])
                else:
                    # Record the URL of the document being examined (the
                    # original code recorded the URL of the last loaded
                    # document here, so duplicates were almost never found).
                    seen_urls.add(url)
        finally:
            # A cursor opened with no_cursor_timeout is never reaped by the
            # server, so it must be closed explicitly.
            rs.close()

        # Write the duplicate flag back to MongoDB.
        if duplicate_ids:
            db.component_original.update_many(
                {"_id": {"$in": duplicate_ids}},
                {'$set': {'status': Constant.DISTINCT}})

    def create_property(self, db):
        """Build ``property_temp`` from existing and newly discovered properties.

        Starts from every document in ``property`` and appends one new
        document (``id``, ``labelCn``) for each distinct ``properties.lable``
        value found in ``component_original`` documents whose status is
        ``Constant.DONE`` and whose label is not already known.

        :param db: database handle exposing ``property``,
            ``component_original`` and ``property_temp``.
        :raises KeyError: if an existing property document has no ``labelCn``.
        """
        # Seed the result with the properties that already exist.
        property_list = []
        known_labels = set()
        for existing in db.property.find({}):
            property_list.append(existing)
            known_labels.add(existing["labelCn"])

        # Collect the labels in use and append those not seen before.
        rs_new = db.component_original.find(
            {"status": Constant.DONE}).distinct("properties.lable")
        for label in rs_new:
            if label not in known_labels:
                # NOTE(review): ids continue from the current list length,
                # which presumably assumes existing ids are 1..n - confirm.
                property_list.append({
                    "id": len(property_list) + 1,
                    "labelCn": label,
                })
                known_labels.add(label)

        # insert_many rejects an empty document list, so skip it when there is
        # nothing to write.
        if property_list:
            db.property_temp.insert_many(property_list)

    def _copy_data_form_original(self, db):
        """Load lookup data and write one ``propertyvalue_temp`` row per property value.

        Appends the contents of ``kind_temp_2``, ``property_temp`` and
        ``kind_translate`` to ``self.kind``, ``self.property`` and
        ``self.kindtranslate``.  Then, for every ``component_original``
        document (numbered from 1 in cursor order), inserts one document per
        entry of its ``properties`` list into ``propertyvalue_temp`` with the
        keys ``componentid``, ``propertyid``, ``detno`` (1-based position) and
        ``value``.  ``propertyid`` is ``None`` when the label is unknown.

        :param db: database handle exposing the collections named above.
        :raises KeyError: if a document lacks one of the fields read here.
        """
        for kind in db.kind_temp_2.find():
            self.kind.append(kind)
        for pro in db.property_temp.find():
            self.property.append(pro)
        for translate in db.kind_translate.find():
            self.kindtranslate.append(translate)

        # Index properties by label once instead of scanning the list for
        # every property value.  setdefault keeps the first document per
        # label, matching the first-match behaviour of a linear scan.
        property_by_label = {}
        for pro in self.property:
            property_by_label.setdefault(pro["labelCn"], pro)

        rs = db.component_original.find({}, no_cursor_timeout=True)
        try:
            for index, r in enumerate(rs):
                # NOTE(review): this component record is assembled but never
                # written anywhere in this file; only its id is used below.
                # It is still built so that missing fields and the kind
                # lookup behave as before - confirm the intended target.
                component = dict()
                component["id"] = index + 1
                component["code"] = r["code"]
                component["description"] = r["description"]
                component["company"] = r["company"]
                component["kindid"] = \
                    self.analysis_kind._get_uu_kindid_by_kindName(
                        r['lastkind'], self.kind)
                if r["img_url_uu"]:
                    component["img"] = r["img_url_uu"]
                else:
                    component["img"] = Constant.COMPONENT_DEFAULT_IMG_URL

                componentid = component["id"]
                for position, property_value in enumerate(r["properties"]):
                    matched = property_by_label.get(property_value["lable"])
                    row = dict()
                    row["componentid"] = componentid
                    row["propertyid"] = (
                        matched["id"] if matched is not None else None)
                    row["detno"] = position + 1
                    row["value"] = property_value["value"]
                    db.propertyvalue_temp.insert_one(row)
        finally:
            # Cursors opened with no_cursor_timeout must be closed explicitly.
            rs.close()

    def _handle_brand(self, db, brand_property_id=40):
        """Build ``brand_temp`` from the distinct values of the brand property.

        Reads every ``propertyvalue_temp`` document whose ``propertyid``
        equals ``brand_property_id`` and inserts one brand document per
        distinct ``value``.  Ids are assigned from 1 in first-seen order, and
        the value is used for both ``nameEn`` and ``nameCn``.

        :param db: database handle exposing ``propertyvalue_temp`` and
            ``brand_temp``.
        :param brand_property_id: property id whose values are brand names;
            defaults to 40, the value previously hard-coded here.
        """
        seen = set()
        brand_names = []
        rs_brand = db.propertyvalue_temp.find(
            {"propertyid": brand_property_id})
        for propertyvalue in rs_brand:
            name = propertyvalue["value"]
            if name not in seen:
                seen.add(name)
                # A list keeps first-seen order so ids are reproducible;
                # iterating the set directly gave arbitrary ids per run.
                brand_names.append(name)

        brand_list = []
        for index, brand_name in enumerate(brand_names):
            brand_list.append({
                "id": index + 1,
                "nameEn": brand_name,
                # No separate Chinese name is available; reuse the same value.
                "nameCn": brand_name,
            })

        # insert_many rejects an empty document list.
        if brand_list:
            db.brand_temp.insert_many(brand_list)


if __name__ == '__main__':
    main = CreatMain()
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider
        # 1. Flag duplicate components by URL.
        main.create_component_distinct(db)
        # 2. Build the property collection.
        main.create_property(db)
        # 3. Copy property values out of the raw data, then derive brands.
        main._copy_data_form_original(db)
        main._handle_brand(db)
    finally:
        # Always release the connection, even if a step fails.
        cli.close()