# coding=utf-8
"""Mouser product-page parser.

Extracts a "component abstract" dict (image URLs, category breadcrumb,
manufacturer part number, company, description, datasheet attachment and
the attribute table) from one Mouser (www.mouser.cn) product detail page.

@author: ch
"""
from bs4 import BeautifulSoup
import json
import codecs
from util_common import Constant
import random
import re


class Parser(object):

    def _get_img_url(self, imgSrc):
        """Build absolute image URLs from a relative ``src`` attribute.

        The ``../`` prefix appears at varying positions in the page, so
        the path is rebuilt with a regex instead of plain concatenation.

        Returns a ``(small_url, large_url)`` tuple.  Product rows without
        a "zoom" icon have no large image, so downloaders should try the
        large URL first and fall back to the small one.
        """
        domain = "http://www.mouser.cn/"
        result = re.search(r"(\.\.\/)*(images\/.+)(\/images\/)(.+)", imgSrc)
        small = domain + result.group(2) + result.group(3) + result.group(4)
        large = domain + result.group(2) + "/lrg/" + result.group(4)
        return small, large

    def craw(self, html_str, url_mouser=""):
        """Parse one product page and return its component-abstract dict.

        :param html_str: the page HTML (text) to parse.
        :param url_mouser: source URL, stored in the result.  Defaults to
            ``""`` so script-style callers may omit it (the original
            ``__main__`` block called ``craw`` with one argument, which
            raised ``TypeError``).
        :returns: dict with keys such as ``code``, ``company``, ``kinds``,
            ``description`` and ``properties``.
        """
        componentAbstract = dict()
        componentAbstract["random"] = random.random()
        componentAbstract["url_mouser"] = url_mouser
        soup = BeautifulSoup(html_str, 'html.parser', from_encoding='utf-8')

        # Product image — some products have none, so guard the lookup.
        img_mainimage = soup.find(class_="default-img")
        if img_mainimage is not None:
            # The ../ prefix position varies, hence the regex helper.
            imgurl, lrgImgurl = self._get_img_url(img_mainimage["src"])
            componentAbstract["img_url_mouser"] = imgurl
            componentAbstract["img_url_lrg_mouser"] = lrgImgurl
            componentAbstract["imgTask"] = Constant.TODO

        # Category breadcrumb: every other child is a separator, so keep
        # only the even-indexed non-blank entries.
        breadcrumb = soup.find(id="breadcrumb")
        crumbs = [node for node in breadcrumb.div.children
                  if node is not None and node.string is not None
                  and len(node.string.strip()) > 0]
        breadcrumbList = list()
        bL = list()
        for crumb_index, crumb in enumerate(crumbs):
            if crumb_index % 2 == 0:
                breadcrumbList.append(crumb.string)
                bL.append(crumb.string)
        breadcrumbList.pop(0)   # drop the leading "home" entry
        breadcrumbList.pop(-1)  # drop the trailing product entry
        componentAbstract["kinds"] = breadcrumbList
        componentAbstract["lastkind"] = bL.pop(-2)

        # Manufacturer part number.
        divManufacturerPartNum = soup.find(id="divManufacturerPartNum")
        for s in divManufacturerPartNum.h1.stripped_strings:
            componentAbstract["code"] = s

        # Manufacturer.
        # NOTE: this differs from the brand, which lives in the
        # attribute table below.
        ctl00_ContentMain_hlnk10 = soup.find(id="ctl00_ContentMain_hlnk10")
        componentAbstract["company"] = ctl00_ContentMain_hlnk10.string
        componentAbstract["companyUrl"] = ctl00_ContentMain_hlnk10["href"]

        # Description: join all stripped text fragments (the original
        # built this with quadratic ``+=`` concatenation).
        divDes = soup.find(id="divDes")
        componentAbstract["description"] = "".join(divDes.stripped_strings)

        # Datasheet attachment — optional, guard the lookup.
        attach = soup.find(
            id="ctl00_ContentMain_rptrCatalogDataSheet_ctl00_lnkCatalogDataSheet")
        if attach is not None:
            componentAbstract["attachName"] = attach.string
            componentAbstract["attachUrl"] = attach["href"]
            componentAbstract["attachTask"] = Constant.TODO

        # Attribute table.
        table = soup.find("table", class_="specs")
        tableList = list()
        for row_index, tr in enumerate(table.children):
            # Skip the first two rows (manufacturer, product category).
            if row_index <= 1:
                continue
            trTemp = dict()
            try:
                # Attribute name / value from the row's text fragments.
                for col_index, text in enumerate(tr.stripped_strings):
                    if col_index == 0:
                        # NOTE: "lable" (sic) is kept misspelled because
                        # downstream JSON consumers rely on this key.
                        trTemp["lable"] = text
                    if col_index == 2:
                        trTemp["value"] = text
                # A row is "primary" (user-selectable option) when its
                # sixth cell contains an <input> element.  The original
                # tested the misspelled ``td.imput``, which on a bs4 Tag
                # silently returns None and never raises, so isPrimary
                # was always True for tag cells — fixed here.
                for col_index, td in enumerate(tr):
                    if col_index == 5:
                        trTemp["isPrimary"] = (
                            getattr(td, "input", None) is not None)
            except AttributeError:
                # Non-tag children (whitespace NavigableStrings) have no
                # stripped_strings; skip them.
                pass
            if len(trTemp) != 0:
                tableList.append(trTemp)
        componentAbstract["properties"] = tableList
        return componentAbstract


if __name__ == "__main__":
    path = "1.html"
    obj_spider = Parser()
    # Read the page text up front.  The original passed the open file
    # object (never closed) and omitted the url argument, raising
    # TypeError before any parsing happened.
    with codecs.open(path, "r", encoding="utf-8") as fin:
        html_str = fin.read()
    componentAbstract = obj_spider.craw(html_str)
    s = json.dumps(componentAbstract, ensure_ascii=False)
    print(s)
    with codecs.open('tencent.json', 'w', encoding='utf-8') as fout:
        fout.write(s)