html_detail_parser.py.svn-base

# coding=utf-8
'''
@author: ch
'''
import codecs
import json
import random
import re

from bs4 import BeautifulSoup

from util_common import Constant
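
# util_common is not included with this file. A minimal stand-in for the
# piece used below (an assumption: Constant.TODO is just a "not yet
# crawled" status flag consumed by a downstream task queue) would be:
#
#     class Constant(object):
#         TODO = 0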

class Parser(object):

    def _get_img_url(self, imgSrc):
        # Returns (small image URL, large image URL). Items without a
        # "zoom" icon on the list page have no large image, so the image
        # crawler should try the large URL first and fall back to the
        # small one when it does not exist.
        domain = "http://www.mouser.cn/"
        result = re.search(r"(\.\./)*(images/.+)(/images/)(.+)", imgSrc)
        small = domain + result.group(2) + result.group(3) + result.group(4)
        large = domain + result.group(2) + "/lrg/" + result.group(4)
        return small, large
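
    # Worked example for _get_img_url (the src value is hypothetical; the
    # path layout is an assumption read off the regex above):
    #   src   =  "../../images/ti/images/lm358.jpg"
    #   small -> "http://www.mouser.cn/images/ti/images/lm358.jpg"
    #   large -> "http://www.mouser.cn/images/ti/lrg/lm358.jpg"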

    def craw(self, html_str, url_mouser):
        componentAbstract = dict()
        componentAbstract["random"] = random.random()
        componentAbstract["url_mouser"] = url_mouser
        soup = BeautifulSoup(html_str, 'html.parser')
        # Mouser product image
        img_mainimage = soup.find(class_="default-img")
        # Skip parts for which Mouser has no image
        if img_mainimage is not None:
            # The number of leading "../" segments in src varies, so the
            # URL cannot be built by plain concatenation; see _get_img_url.
            imgurl, lrgImgurl = self._get_img_url(img_mainimage["src"])
            componentAbstract["img_url_mouser"] = imgurl
            componentAbstract["img_url_lrg_mouser"] = lrgImgurl
            componentAbstract["imgTask"] = Constant.TODO
        # Category breadcrumb
        breadcrumb = soup.find(id="breadcrumb")
        breadcrumbList = list()
        bL = list()
        crumbs = [line for line in breadcrumb.div.children
                  if line is not None and line.string is not None
                  and len(line.string.strip()) > 0]
        for index, s in enumerate(crumbs):
            # Even-indexed entries are category names; odd-indexed are separators
            if index % 2 == 0:
                breadcrumbList.append(s.string)
                bL.append(s.string)
        # Drop the first and last crumbs, keeping only the category levels
        breadcrumbList.pop(0)
        breadcrumbList.pop(-1)
        componentAbstract["kinds"] = breadcrumbList
        componentAbstract["lastkind"] = bL.pop(-2)
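        # Hypothetical breadcrumb markup the loop above assumes (an
        # assumption, not verified against a live page): category nodes
        # alternating with separator nodes, e.g.
        #   <div id="breadcrumb"><div>
        #     <a>Home</a> <span>></span> <a>Amplifier ICs</a> <span>></span> <b>LM358</b>
        #   </div></div>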
        # Manufacturer part number
        divManufacturerPartNum = soup.find(id="divManufacturerPartNum")
        for s in divManufacturerPartNum.h1.stripped_strings:
            componentAbstract["code"] = s
        # Manufacturer. Note: this is not the same as the brand; the
        # brand lives in the part attributes table.
        ctl00_ContentMain_hlnk10 = soup.find(id="ctl00_ContentMain_hlnk10")
        componentAbstract["company"] = ctl00_ContentMain_hlnk10.string
        componentAbstract["companyUrl"] = ctl00_ContentMain_hlnk10["href"]
        # Description
        divDes = soup.find(id="divDes")
        divDesTemp = ""
        for s in divDes.stripped_strings:
            divDesTemp += s
        componentAbstract["description"] = divDesTemp
        # Datasheet attachment
        datasheet_link = soup.find(id="ctl00_ContentMain_rptrCatalogDataSheet_ctl00_lnkCatalogDataSheet")
        # Skip when the part has no attachment
        if datasheet_link is not None:
            componentAbstract["attachName"] = datasheet_link.string
            componentAbstract["attachUrl"] = datasheet_link["href"]
            componentAbstract["attachTask"] = Constant.TODO
        # Attribute table
        table = soup.find("table", class_="specs")
        tableList = list()
        for index, tr in enumerate(table.children):
            # Skip the first two rows (manufacturer, product category)
            if index <= 1:
                continue
            trTemp = dict()
            try:
                # Load attribute name and attribute value
                for i, td in enumerate(tr.stripped_strings):
                    if i == 0:
                        trTemp["label"] = td
                    if i == 2:
                        trTemp["value"] = td
                # Record whether the attribute is a selectable option:
                # in selectable rows the sixth cell carries an <input>.
                for i, td in enumerate(tr):
                    if i == 5:
                        # Tag cells expose child lookup via .input (None
                        # when absent); text nodes raise AttributeError,
                        # which getattr turns into the default.
                        trTemp["isPrimary"] = getattr(td, "input", None) is not None
            except Exception:
                # Malformed row; keep whatever was collected so far
                pass
            if len(trTemp) != 0:
                tableList.append(trTemp)
        componentAbstract["properties"] = tableList
        return componentAbstract
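
# Shape of the dict craw() returns (keys come from the code above; the
# values shown are illustrative placeholders, not real scraped data):
#
#     {
#         "random": 0.42,
#         "url_mouser": "...",
#         "img_url_mouser": "...", "img_url_lrg_mouser": "...", "imgTask": Constant.TODO,
#         "kinds": ["...", "..."], "lastkind": "...",
#         "code": "...", "company": "...", "companyUrl": "...",
#         "description": "...",
#         "attachName": "...", "attachUrl": "...", "attachTask": Constant.TODO,
#         "properties": [{"label": "...", "value": "...", "isPrimary": False}]
#     }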

if __name__ == "__main__":
    path = "1.html"
    obj_spider = Parser()
    with codecs.open(path, "r", encoding="utf-8") as fin:
        html_str = fin.read()
    # craw() also needs the detail-page URL; a placeholder is passed for
    # this local test.
    componentAbstract = obj_spider.craw(html_str, "http://www.mouser.cn/")
    s = json.dumps(componentAbstract, ensure_ascii=False)
    print(s)
    fout = codecs.open('tencent.json', 'w', encoding='utf-8')
    fout.write(s)
    fout.close()
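
# A minimal sketch of running the parser against a live page instead of a
# local file (the URL is hypothetical; urllib.request assumes Python 3):
#
#     from urllib.request import urlopen
#     url = "http://www.mouser.cn/ProductDetail/..."
#     html_str = urlopen(url).read().decode("utf-8")
#     print(json.dumps(Parser().craw(html_str, url), ensure_ascii=False))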