# init_kind_translate.py
  1. # coding=utf-8
  2. '''
  3. Created on 2016年4月6日
  4. 类目转换关系
  5. mouser->uu
  6. @author: ChenHao
  7. '''
  8. from pymongo.mongo_client import MongoClient
  9. from pip._vendor.distlib.util import CONSTRAINTS
  10. from util_common import Constant
  11. cli = MongoClient(Constant.MONGODB_URL)
  12. db = cli.spider
  13. file_path = "../spider_download/Other/kind_translate.csv"
  14. '''
  15. 1、将转换关系存入mongodb
  16. '''
  17. fin = open(file_path, "r")
  18. lines = fin.readlines()
  19. fin.close()
  20. '''
  21. 去除不正确的数据
  22. '''
  23. relation_list = list()
  24. for line in lines:
  25. str_line = line.replace("\n", "")
  26. arr_line = str_line.split(",")
  27. try:
  28. # 这里只用检查第一个
  29. arr_line[0] = int(arr_line[0])
  30. arr_line[1] = int(arr_line[1])
  31. relation_list.append(arr_line)
  32. except:
  33. pass
  34. temp_set = set()
  35. relation_final_list = list()
  36. for rl in relation_list:
  37. if rl[0] not in temp_set:
  38. temp_set.add(rl[0])
  39. d = dict()
  40. d["mouserid"] = rl[0]
  41. d["uuid"] = rl[1]
  42. relation_final_list.append(d)
  43. print (relation_final_list)
  44. db.kind_translate.insert_many(relation_final_list)
  45. '''
  46. 遍历叶子节点,看哪一些没有覆盖到
  47. '''
  48. not_relate_list = list()
  49. rs = db.kind_temp_2.find({"isLeaf": True})
  50. for r in rs:
  51. if r["id"] not in temp_set:
  52. not_relate_list.append(r)
  53. print (len(temp_set))
  54. print (rs.count())
  55. print (len(not_relate_list))
  56. # cli = MongoClient(Constant.MONGODB_URL)
  57. # db = cli.spider
  58. # for line in lines:
  59. # temp_str = line.replace("\n", "")
  60. # temp_list = temp_str.split(",")
  61. # d = dict()
  62. # d["mouserid"] = temp_list[0]
  63. # d["uuid"] = temp_list[1]
  64. # db.kind_translate.insert_one(d)
  65. #
  66. cli.close()