# coding=utf-8
'''
@author: ch
'''
import random
import re
import urllib.parse

from bs4 import BeautifulSoup

from util_common import Constant


class HtmlParser(object):

    # Build the category-path lists from the soup of a list page.
    def _get_kindlist_by_listPage(self, soup):
        # First collect the breadcrumb categories available on the current page.
        kindlist = soup.find_all('a', id=re.compile(r"lnkBreadcrumb\w*"))
        kindls = list()
        for index, kind in enumerate(kindlist):
            # Skip the first breadcrumb (the site root).
            if index > 0:
                nameCn = kind.string
                kindls.append(nameCn)
        # Then assemble the paths for the sub-categories listed on the page.
        kindls_rs = list()
        kindlist_children = soup.find_all('a', class_='SearchResultsSubLevelCategory')
        if len(kindlist_children) > 0:
            for c in kindlist_children:
                l = list()
                l.extend(kindls)
                l.append(c.string)
                kindls_rs.append(l)
        else:
            kindls_rs.append(kindls)
        return kindls_rs

    # Get the URL of the [next page] link.
    def _get_next_page_url(self, page_url, soup):
        # find() takes a tag name plus attributes, not a CSS selector.
        link = soup.find('a', id='ctl00_ContentMain_PagerTop_lnkNext')
        url = link['href']
        new_full_url = urllib.parse.urljoin(page_url, url)
        return new_full_url

    # Get the [product detail] links on the current page.
    def _get_detail_urls_from_listPage(self, soup):
        new_product_urls = set()

        def add_to_new_product_urls(lis):
            for l in lis:
                '''
                The split also yields two kinds of invalid fragments:
                "../../../../../"  (len() == 15)
                "/"                (len() == 1)
                '''
                # Drop those two invalid fragments here.
                if len(l) > 20:
                    # Re-attach the URL prefix.
                    url_head = "http://www.mouser.cn/ProductDetail"
                    detail_url = url_head + l
                    new_product_urls.add(detail_url)

        '''
        Walking parent/child nodes here is cumbersome, so search directly for
        anchors whose href contains the detail-page path, then let the set
        filter out duplicates.
        '''
        links = soup.find_all('a', href=re.compile(r"/ProductDetail/\w*"))
        for link in links:
            try:
                new_url = link['href'].split('ProductDetail')
                add_to_new_product_urls(new_url)
            except KeyError:
                print("error", link)
        return new_product_urls

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # find_all() so the result is iterable even when nothing matches.
        links = soup.find_all('a', id='ctl00_ContentMain_PagerTop_lnkNext')
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # Extract title and summary (these selectors target a Baidu Baike lemma page).
        res_data = {}
        res_data['url'] = page_url
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    # Parse a category list page (get the next-page URL and the product detail URLs).
    def kindlist_parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        next_page_url = self._get_next_page_url(page_url, soup)
        new_product_urls = self._get_detail_urls_from_listPage(soup)
        return next_page_url, new_product_urls

    # Given the soup of a category's first page, compute the crawl tasks for
    # every paginated page of that category.
    def _get_kindlist_by_firstpage(self, soup, url, index):
        l = list()
        number = 1
        d = dict()
        d["kindid"] = index
        d["number"] = number
        number += 1
        d["url"] = url
        d["random"] = random.random()
        d["status"] = Constant.TODO
        l.append(d)
        lastPage = soup.find('a', class_="first-last")
        try:
            # Some categories have only one page; lastPage is then None and
            # the int() conversion below raises.
            count = int(lastPage.string)
            # Page i (for i >= 2) is reached via an offset of 25 items per
            # page, so the last offset is 25 * (count - 1).
            for i in range(1, count):
                d = dict()
                d["kindid"] = index
                d["number"] = number
                number += 1
                d["url"] = url + "?No=" + str(25 * i)
                d["random"] = random.random()
                d["status"] = Constant.TODO
                l.append(d)
        except (AttributeError, ValueError):
            pass
        return l
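
# ---------------------------------------------------------------------------
# Usage sketch, not part of the original module: it assumes a Mouser category
# list page has already been downloaded; `requests` and the category URL
# below are placeholders for whatever downloader the crawler actually uses.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import requests  # assumption: any HTTP client returning the raw HTML works

    parser = HtmlParser()
    # Hypothetical category list-page URL, for illustration only.
    list_page_url = "http://www.mouser.cn/Semiconductors/_/N-abcde"
    html_cont = requests.get(list_page_url).content

    next_page_url, new_product_urls = parser.kindlist_parse(list_page_url, html_cont)
    print("next page:", next_page_url)
    for detail_url in new_product_urls:
        print("detail:", detail_url)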