html_detail_parser.py.svn-base

# coding=utf-8
'''
@author: ch
'''
import codecs
import json
import random
import re

from bs4 import BeautifulSoup

from util_common import Constant
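
# util_common is not included with this file. A minimal stand-in for the
# piece used below (an assumption: Constant.TODO is just a "not yet
# crawled" status flag consumed by a downstream task queue) would be:
#
#     class Constant(object):
#         TODO = 0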

class Parser(object):

    def _get_img_url(self, imgSrc):
        # Returns (small image URL, large image URL). Items without a
        # "zoom" icon on the list page have no large image, so the image
        # crawler should try the large URL first and fall back to the
        # small one when it does not exist.
        domain = "http://www.mouser.cn/"
        result = re.search(r"(\.\./)*(images/.+)(/images/)(.+)", imgSrc)
        small = domain + result.group(2) + result.group(3) + result.group(4)
        large = domain + result.group(2) + "/lrg/" + result.group(4)
        return small, large
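
    # Worked example for _get_img_url (the src value is hypothetical; the
    # path layout is an assumption read off the regex above):
    #   src   =  "../../images/ti/images/lm358.jpg"
    #   small -> "http://www.mouser.cn/images/ti/images/lm358.jpg"
    #   large -> "http://www.mouser.cn/images/ti/lrg/lm358.jpg"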

    def craw(self, html_str, url_mouser):
        componentAbstract = dict()
        componentAbstract["random"] = random.random()
        componentAbstract["url_mouser"] = url_mouser
        soup = BeautifulSoup(html_str, 'html.parser')
        # Mouser product image
        img_mainimage = soup.find(class_="default-img")
        # Skip parts for which Mouser has no image
        if img_mainimage is not None:
            # The number of leading "../" segments in src varies, so the
            # URL cannot be built by plain concatenation; see _get_img_url.
            imgurl, lrgImgurl = self._get_img_url(img_mainimage["src"])
            componentAbstract["img_url_mouser"] = imgurl
            componentAbstract["img_url_lrg_mouser"] = lrgImgurl
            componentAbstract["imgTask"] = Constant.TODO
        # Category breadcrumb
        breadcrumb = soup.find(id="breadcrumb")
        breadcrumbList = list()
        bL = list()
        crumbs = [line for line in breadcrumb.div.children
                  if line is not None and line.string is not None
                  and len(line.string.strip()) > 0]
        for index, s in enumerate(crumbs):
            # Even-indexed entries are category names; odd-indexed are separators
            if index % 2 == 0:
                breadcrumbList.append(s.string)
                bL.append(s.string)
        # Drop the first and last crumbs, keeping only the category levels
        breadcrumbList.pop(0)
        breadcrumbList.pop(-1)
        componentAbstract["kinds"] = breadcrumbList
        componentAbstract["lastkind"] = bL.pop(-2)
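        # Hypothetical breadcrumb markup the loop above assumes (an
        # assumption, not verified against a live page): category nodes
        # alternating with separator nodes, e.g.
        #   <div id="breadcrumb"><div>
        #     <a>Home</a> <span>></span> <a>Amplifier ICs</a> <span>></span> <b>LM358</b>
        #   </div></div>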
        # Manufacturer part number
        divManufacturerPartNum = soup.find(id="divManufacturerPartNum")
        for s in divManufacturerPartNum.h1.stripped_strings:
            componentAbstract["code"] = s
        # Manufacturer. Note: this is not the same as the brand; the
        # brand lives in the part attributes table.
        ctl00_ContentMain_hlnk10 = soup.find(id="ctl00_ContentMain_hlnk10")
        componentAbstract["company"] = ctl00_ContentMain_hlnk10.string
        componentAbstract["companyUrl"] = ctl00_ContentMain_hlnk10["href"]
        # Description
        divDes = soup.find(id="divDes")
        divDesTemp = ""
        for s in divDes.stripped_strings:
            divDesTemp += s
        componentAbstract["description"] = divDesTemp
        # Datasheet attachment
        datasheet_link = soup.find(id="ctl00_ContentMain_rptrCatalogDataSheet_ctl00_lnkCatalogDataSheet")
        # Skip when the part has no attachment
        if datasheet_link is not None:
            componentAbstract["attachName"] = datasheet_link.string
            componentAbstract["attachUrl"] = datasheet_link["href"]
            componentAbstract["attachTask"] = Constant.TODO
        # Attribute table
        table = soup.find("table", class_="specs")
        tableList = list()
        for index, tr in enumerate(table.children):
            # Skip the first two rows (manufacturer, product category)
            if index <= 1:
                continue
            trTemp = dict()
            try:
                # Load attribute name and attribute value
                for i, td in enumerate(tr.stripped_strings):
                    if i == 0:
                        trTemp["label"] = td
                    if i == 2:
                        trTemp["value"] = td
                # Record whether the attribute is a selectable option:
                # in selectable rows the sixth cell carries an <input>.
                for i, td in enumerate(tr):
                    if i == 5:
                        # Tag cells expose child lookup via .input (None
                        # when absent); text nodes raise AttributeError,
                        # which getattr turns into the default.
                        trTemp["isPrimary"] = getattr(td, "input", None) is not None
            except Exception:
                # Malformed row; keep whatever was collected so far
                pass
            if len(trTemp) != 0:
                tableList.append(trTemp)
        componentAbstract["properties"] = tableList
        return componentAbstract
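
# Shape of the dict craw() returns (keys come from the code above; the
# values shown are illustrative placeholders, not real scraped data):
#
#     {
#         "random": 0.42,
#         "url_mouser": "...",
#         "img_url_mouser": "...", "img_url_lrg_mouser": "...", "imgTask": Constant.TODO,
#         "kinds": ["...", "..."], "lastkind": "...",
#         "code": "...", "company": "...", "companyUrl": "...",
#         "description": "...",
#         "attachName": "...", "attachUrl": "...", "attachTask": Constant.TODO,
#         "properties": [{"label": "...", "value": "...", "isPrimary": False}]
#     }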

if __name__ == "__main__":
    path = "1.html"
    obj_spider = Parser()
    with codecs.open(path, "r", encoding="utf-8") as fin:
        html_str = fin.read()
    # craw() also needs the detail-page URL; a placeholder is passed for
    # this local test.
    componentAbstract = obj_spider.craw(html_str, "http://www.mouser.cn/")
    s = json.dumps(componentAbstract, ensure_ascii=False)
    print(s)
    fout = codecs.open('tencent.json', 'w', encoding='utf-8')
    fout.write(s)
    fout.close()
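
# A minimal sketch of running the parser against a live page instead of a
# local file (the URL is hypothetical; urllib.request assumes Python 3):
#
#     from urllib.request import urlopen
#     url = "http://www.mouser.cn/ProductDetail/..."
#     html_str = urlopen(url).read().decode("utf-8")
#     print(json.dumps(Parser().craw(html_str, url), ensure_ascii=False))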