create_main.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. # coding=utf-8
'''
Created on 2016-04-05

Derives all of the related (linked) data sets from the raw records stored in
component_original.

@author: ChenHao
'''
  7. from pymongo.mongo_client import MongoClient
  8. from util_common import Constant
  9. from analysis import detailAnalysis_kind
  10. class CreatMain(object):
  11. def __init__(self):
  12. self.analysis_kind = detailAnalysis_kind.DetailAnalysisKind()
  13. self.kind = []
  14. self.property = []
  15. self.kindtranslate = []
  16. def craw(self):
  17. pass
  18. # 对component_original进行去重操作,将得到的结果存入component_distinct中
  19. def create_component_distinct(self, db):
  20. # 1、根据url去重
  21. pass
  22. '''
  23. @date 2016年4月5日14:56:25
  24. @todo 因为数据量过大不支持mongodb distinct的方式,需要另想办法,所以展示不考虑对component_original的去重
  25. '''
  26. '''
  27. @date 2016-4-14
  28. load data in RAM
  29. '''
  30. component_list = list()
  31. url_list = list()
  32. mgdb_id_list = list()
  33. # reset Status
  34. db.component_original.update_many({}, {'$set': {'status': Constant.DONE}})
  35. # load all component data in RAM
  36. rs = db.component_original.find({}, {"_id": True, "url": True}, no_cursor_timeout=True)
  37. for r in rs:
  38. component_list.append(r)
  39. # distinct data
  40. for component in component_list:
  41. if component["url"] in url_list:
  42. mgdb_id_list.append(component["_id"])
  43. else:
  44. url_list.append(r["url"])
  45. # updata status in mongodb
  46. db.component_original.update_many({"_id": {"$in": mgdb_id_list}}, {'$set': {'status': Constant.DISTINCT}})
  47. # 生成property
  48. def create_property(self, db):
  49. # 用现有的初始化
  50. rs_old = db.property.find({})
  51. property_list = list()
  52. labelCn_set = set()
  53. for r in rs_old:
  54. property_list.append(r)
  55. labelCn_set.add(r["labelCn"])
  56. # 得到新的并组装后加入
  57. rs_new = db.component_original.find({"status": Constant.DONE}).distinct("properties.lable")
  58. for r in rs_new:
  59. if r not in labelCn_set:
  60. d = dict()
  61. d["id"] = len(property_list) + 1
  62. d["labelCn"] = r
  63. property_list.append(d)
  64. labelCn_set.add(r)
  65. db.property_temp.insert_many(property_list)
  66. # copy data from component_original
  67. def _copy_data_form_original(self, db):
  68. rs_kind = db.kind_temp_2.find()
  69. for r in rs_kind:
  70. self.kind.append(r)
  71. temp_property = db.property_temp.find()
  72. for pro in temp_property:
  73. self.property.append(pro)
  74. temp_translate = db.kind_translate.find()
  75. for translate in temp_translate:
  76. self.kindtranslate.append(translate)
  77. def _get_propertyid_by_label(label):
  78. for pro in self.property:
  79. if pro["labelCn"] == label:
  80. return pro["id"]
  81. rs = db.component_original.find({}, no_cursor_timeout=True)
  82. for index, r in enumerate(rs):
  83. d = dict()
  84. d["id"] = index + 1
  85. d["code"] = r["code"]
  86. d["description"] = r["description"]
  87. d["company"] = r["company"]
  88. d["kindid"] = self.analysis_kind._get_uu_kindid_by_kindName(r['lastkind'], self.kind)
  89. if r["img_url_uu"]:
  90. d["img"] = r["img_url_uu"]
  91. else:
  92. d["img"] = Constant.COMPONENT_DEFAULT_IMG_URL
  93. componentid = d["id"]
  94. propertyValues = r["properties"]
  95. for inde, propertyValue in enumerate(propertyValues):
  96. d = dict()
  97. d["componentid"] = componentid
  98. d["propertyid"] = _get_propertyid_by_label(propertyValue["lable"])
  99. d["detno"] = inde + 1
  100. d["value"] = propertyValue["value"]
  101. db.propertyvalue_temp.insert_one(d)
  102. def _handle_brand(self, db):
  103. brand_set = set()
  104. rs_brand = db.propertyvalue_temp.find({"propertyid" : 40})
  105. for propertyvalue in rs_brand:
  106. brand_set.add(propertyvalue["value"])
  107. brand_list = list()
  108. for index, brandName in enumerate(brand_set):
  109. d = dict()
  110. d["id"] = index + 1
  111. d["nameEn"] = brandName
  112. # Cn
  113. d["nameCn"] = brandName
  114. brand_list.append(d)
  115. db.brand_temp.insert_many(brand_list)
  116. if __name__ == '__main__':
  117. main = CreatMain()
  118. cli = MongoClient(Constant.MONGODB_URL)
  119. db = cli.spider
  120. # 1、根据url去重
  121. main.create_component_distinct(db)
  122. # 2、得到property
  123. main.create_property(db)
  124. # 3
  125. main._copy_data_form_original(db)
  126. main._handle_brand(db)
  127. cli.close()