123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- # coding=utf-8
- '''
- @author: ch
- '''
- from bs4 import BeautifulSoup
- import json
- import codecs
- from util_common import Constant
- import random
- import re
class Parser(object):
    """Extract component details from the HTML of a Mouser product page.

    ``craw`` is the entry point. It returns a plain ``dict`` describing the
    component (image URLs, category path, part number, manufacturer,
    description, datasheet attachment and the specification table).
    """

    # Site root that relative image paths are resolved against.
    _IMG_DOMAIN = "http://www.mouser.cn/"

    # Any number of leading "../" segments, then "images/<dir>", the
    # "/images/" size folder, and finally the file name.
    _IMG_SRC_RE = re.compile(r"(\.\.\/)*(images\/.+)(\/images\/)(.+)")

    def _get_img_url(self, imgSrc):
        """Return ``(small_url, large_url)`` for a product image ``src``.

        The large URL is the small one with its "/images/" size folder
        replaced by "/lrg/". Products shown without the "enlarge" icon have
        no large image, so a downloader should try the large URL first and
        fall back to the small one.

        Raises:
            AttributeError: if ``imgSrc`` does not contain the expected
                ``images/.../images/<file>`` layout (no regex match).
        """
        match = self._IMG_SRC_RE.search(imgSrc)
        base = self._IMG_DOMAIN + match.group(2)
        file_name = match.group(4)
        return base + match.group(3) + file_name, base + "/lrg/" + file_name

    def _parse_categories(self, breadcrumb):
        """Return ``(kinds, lastkind)`` read from the breadcrumb element.

        The non-blank children of ``breadcrumb.div`` alternate between an
        entry and a separator, so only the even positions are kept.
        ``kinds`` is that list without its first and last entries and
        ``lastkind`` is the second-to-last entry.

        Raises:
            IndexError: if fewer than two breadcrumb entries are present.
        """
        nodes = [
            node for node in breadcrumb.div.children
            if node is not None
            and node.string is not None
            and node.string.strip()
        ]
        names = [node.string for node in nodes[::2]]
        last_kind = names[-2]
        return names[1:-1], last_kind

    def _parse_properties(self, table):
        """Return the specification rows of ``table`` as a list of dicts.

        Each dict may hold "lable" (first text of the row), "value" (third
        text of the row) and "isPrimary" (whether the sixth child of the row
        contains an ``<input>`` element). Rows that yield nothing are
        dropped.
        """
        properties = []
        for row_index, row in enumerate(table.children):
            # The first two children (manufacturer, product category) are
            # not wanted.
            if row_index <= 1:
                continue

            entry = dict()
            try:
                # Attribute name and attribute value.
                for text_index, text in enumerate(row.stripped_strings):
                    if text_index == 0:
                        entry["lable"] = text
                    if text_index == 2:
                        entry["value"] = text

                # Whether the attribute is offered as a selectable option.
                # getattr() with a default also covers text nodes, which
                # raise AttributeError for unknown attributes.
                for cell_index, cell in enumerate(row):
                    if cell_index == 5:
                        entry["isPrimary"] = (
                            getattr(cell, "input", None) is not None
                        )
            except Exception:
                # Best effort: children that are not table rows (e.g. bare
                # text between rows) are skipped; whatever was collected
                # before the failure is kept.
                pass

            if entry:
                properties.append(entry)
        return properties

    def craw(self, html_str, url_mouser):
        """Parse a product page and return its details as a dict.

        Args:
            html_str: page markup handed to BeautifulSoup.
            url_mouser: URL of the page; stored unchanged in the result.

        Returns:
            dict with the keys "random", "url_mouser", "kinds", "lastkind",
            "company", "companyUrl", "description" and "properties";
            "code" when the part-number heading has text; the
            "img_url_mouser" / "img_url_lrg_mouser" / "imgTask" keys when
            the page has a main image; and the "attachName" / "attachUrl" /
            "attachTask" keys when it has a datasheet link.

        Raises:
            AttributeError, TypeError, IndexError: if a mandatory page
                element (breadcrumb, part number, manufacturer link,
                description, specs table) is missing or malformed.
        """
        componentAbstract = dict()
        componentAbstract["random"] = random.random()
        componentAbstract["url_mouser"] = url_mouser

        soup = BeautifulSoup(html_str, 'html.parser', from_encoding='utf-8')

        # Main product image; some products have none.
        main_image = soup.find(class_="default-img")
        if main_image is not None:
            # The position of "../" inside src is not fixed, so the URL is
            # rebuilt from the regex groups rather than by concatenation.
            img_url, lrg_img_url = self._get_img_url(main_image["src"])
            componentAbstract["img_url_mouser"] = img_url
            componentAbstract["img_url_lrg_mouser"] = lrg_img_url
            componentAbstract["imgTask"] = Constant.TODO

        # Category path.
        kinds, last_kind = self._parse_categories(soup.find(id="breadcrumb"))
        componentAbstract["kinds"] = kinds
        componentAbstract["lastkind"] = last_kind

        # Manufacturer part number: the last non-blank string of the heading.
        part_num_div = soup.find(id="divManufacturerPartNum")
        for text in part_num_div.h1.stripped_strings:
            componentAbstract["code"] = text

        # Manufacturer. Note: this is not the brand; the brand is listed
        # among the component properties.
        company_link = soup.find(id="ctl00_ContentMain_hlnk10")
        componentAbstract["company"] = company_link.string
        componentAbstract["companyUrl"] = company_link["href"]

        # Description: all text fragments joined without a separator.
        description_div = soup.find(id="divDes")
        componentAbstract["description"] = "".join(
            description_div.stripped_strings
        )

        # Datasheet attachment; some products have none.
        datasheet_link = soup.find(
            id="ctl00_ContentMain_rptrCatalogDataSheet_ctl00_lnkCatalogDataSheet"
        )
        if datasheet_link is not None:
            componentAbstract["attachName"] = datasheet_link.string
            componentAbstract["attachUrl"] = datasheet_link["href"]
            componentAbstract["attachTask"] = Constant.TODO

        # Specification table.
        specs_table = soup.find("table", class_="specs")
        componentAbstract["properties"] = self._parse_properties(specs_table)
        return componentAbstract
-
if __name__ == "__main__":
    # Manual smoke test: parse a locally saved product page and dump the
    # result as JSON to stdout and to tencent.json.
    path = "1.html"
    obj_spider = Parser()

    # Read the raw bytes and close the file promptly; the parser decodes
    # them itself.
    with open(path, "rb") as fin:
        html_str = fin.read()

    # craw() requires the page URL as its second argument. A local file has
    # none, so the file path is stored in its place.
    componentAbstract = obj_spider.craw(html_str, path)

    s = json.dumps(componentAbstract, ensure_ascii=False)
    print(s)

    # The context manager closes the output file even if the write fails.
    with codecs.open('tencent.json', 'w', encoding='utf-8') as fout:
        fout.write(s)
-
|