| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
# coding=utf-8
'''
Created on 2016-04-05
Builds all of the related (linked) data sets from the raw data stored in
component_original.
@author: ChenHao
'''
- from pymongo.mongo_client import MongoClient
- from util_common import Constant
- from analysis import detailAnalysis_kind
class CreatMain(object):
    '''
    Derives the temporary working collections (property_temp,
    propertyvalue_temp, brand_temp) from the raw ``component_original``
    collection of a MongoDB database.

    Every public step takes the database handle as ``db``; the instance only
    caches the kind / property / kind-translation documents loaded by
    ``_copy_data_form_original``.
    '''

    def __init__(self):
        self.analysis_kind = detailAnalysis_kind.DetailAnalysisKind()
        # Caches filled by _copy_data_form_original().
        self.kind = []
        self.property = []
        self.kindtranslate = []

    def craw(self):
        '''Placeholder; intentionally does nothing.'''
        pass

    @staticmethod
    def _find_duplicate_ids(components):
        '''
        Return the ``_id`` of every document whose ``url`` was already seen.

        The first document carrying a given url is kept; each later document
        with the same url is reported, in iteration order.

        :param components: iterable of mappings with ``_id`` and ``url`` keys.
        :return: list of ``_id`` values of the duplicate documents.
        :raises KeyError: if a document lacks ``url`` (or a duplicate lacks
            ``_id``).
        '''
        seen_urls = set()
        duplicate_ids = []
        for component in components:
            url = component["url"]
            if url in seen_urls:
                duplicate_ids.append(component["_id"])
            else:
                seen_urls.add(url)
        return duplicate_ids

    def create_component_distinct(self, db):
        '''
        Flag url-duplicates inside ``component_original``.

        All documents are first reset to ``Constant.DONE``; afterwards every
        document whose url already occurred earlier in the scan is set to
        ``Constant.DISTINCT``.

        Original note (2016-04-05): the data set is too large for MongoDB's
        ``distinct``, so the de-duplication is done client side
        (2016-04-14: by scanning ``_id``/``url`` of all documents).

        :param db: database handle exposing ``component_original``.
        '''
        # Reset the status of every document before re-flagging duplicates.
        db.component_original.update_many({}, {'$set': {'status': Constant.DONE}})

        # Only _id and url are needed, which keeps the scan light.
        cursor = db.component_original.find(
            {}, {"_id": True, "url": True}, no_cursor_timeout=True)
        try:
            duplicate_ids = self._find_duplicate_ids(cursor)
        finally:
            # A no_cursor_timeout cursor is never reaped by the server, so it
            # has to be closed explicitly even when the scan fails.
            cursor.close()

        # Nothing to flag when there are no duplicates.
        if duplicate_ids:
            db.component_original.update_many(
                {"_id": {"$in": duplicate_ids}},
                {'$set': {'status': Constant.DISTINCT}})

    def create_property(self, db):
        '''
        Build ``property_temp`` from the existing ``property`` documents plus
        one new document per not-yet-known ``properties.lable`` value found in
        the ``Constant.DONE`` documents of ``component_original``.

        :param db: database handle exposing ``property``,
            ``component_original`` and ``property_temp``.
        '''
        # Seed with the properties that already exist.
        property_list = []
        known_labels = set()
        for existing in db.property.find({}):
            property_list.append(existing)
            known_labels.add(existing["labelCn"])

        # "lable" (sic) is the key actually used in the stored documents.
        new_labels = db.component_original.find(
            {"status": Constant.DONE}).distinct("properties.lable")
        for label in new_labels:
            if label in known_labels:
                continue
            # NOTE(review): ids are positional (list length + 1); this assumes
            # the existing property ids are contiguous from 1 - confirm.
            property_list.append({"id": len(property_list) + 1, "labelCn": label})
            known_labels.add(label)

        # insert_many rejects an empty document list.
        if property_list:
            db.property_temp.insert_many(property_list)

    def _copy_data_form_original(self, db):
        '''
        Load the kind / property / kind-translation caches and write one
        ``propertyvalue_temp`` document per property value of every
        ``component_original`` document.

        Component ids are the 1-based position in the scan; ``detno`` is the
        1-based position of the value inside the component's ``properties``.
        A label with no matching property yields ``propertyid`` None.

        :param db: database handle exposing ``kind_temp_2``,
            ``property_temp``, ``kind_translate``, ``component_original`` and
            ``propertyvalue_temp``.
        '''
        self.kind.extend(db.kind_temp_2.find())
        self.property.extend(db.property_temp.find())
        self.kindtranslate.extend(db.kind_translate.find())

        # Index the first property document per label once, instead of
        # scanning self.property for every single property value.
        property_by_label = {}
        for pro in self.property:
            label = pro["labelCn"]
            if label not in property_by_label:
                property_by_label[label] = pro

        cursor = db.component_original.find({}, no_cursor_timeout=True)
        try:
            for index, r in enumerate(cursor):
                component = dict()
                component["id"] = index + 1
                component["code"] = r["code"]
                component["description"] = r["description"]
                component["company"] = r["company"]
                component["kindid"] = self.analysis_kind._get_uu_kindid_by_kindName(
                    r['lastkind'], self.kind)
                if r["img_url_uu"]:
                    component["img"] = r["img_url_uu"]
                else:
                    component["img"] = Constant.COMPONENT_DEFAULT_IMG_URL
                # NOTE(review): the assembled component document is never
                # written anywhere; only its id is used below. Looks like an
                # insert into a component collection is missing - confirm.

                componentid = component["id"]
                for inde, propertyValue in enumerate(r["properties"]):
                    matched = property_by_label.get(propertyValue["lable"])
                    row = dict()
                    row["componentid"] = componentid
                    row["propertyid"] = matched["id"] if matched is not None else None
                    row["detno"] = inde + 1
                    row["value"] = propertyValue["value"]
                    db.propertyvalue_temp.insert_one(row)
        finally:
            # See create_component_distinct: such cursors must be closed.
            cursor.close()

    def _handle_brand(self, db, brand_property_id=40):
        '''
        Build ``brand_temp`` from the distinct values of the brand property
        in ``propertyvalue_temp``.

        :param db: database handle exposing ``propertyvalue_temp`` and
            ``brand_temp``.
        :param brand_property_id: ``propertyid`` whose values are treated as
            brand names (defaults to 40, the id previously hard-coded here).
        '''
        brand_set = set()
        for propertyvalue in db.propertyvalue_temp.find({"propertyid": brand_property_id}):
            brand_set.add(propertyvalue["value"])

        brand_list = []
        for index, brandName in enumerate(brand_set):
            # No separate Chinese name is available; the same name is stored
            # in both fields.
            brand_list.append(
                {"id": index + 1, "nameEn": brandName, "nameCn": brandName})

        # insert_many rejects an empty document list.
        if brand_list:
            db.brand_temp.insert_many(brand_list)
-
if __name__ == '__main__':
    # Script entry point: run the whole derivation pipeline against the
    # "spider" database.
    main = CreatMain()
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider

        # 1. Flag duplicate documents by url.
        main.create_component_distinct(db)
        # 2. Build the property collection.
        main.create_property(db)
        # 3. Copy the property values out of component_original.
        main._copy_data_form_original(db)
        # 4. Derive the brand collection from the copied property values.
        main._handle_brand(db)
    finally:
        # Release the connection even when one of the steps fails.
        cli.close()
|