# coding=utf-8
'''
@author: ch
'''
from bs4 import BeautifulSoup
import urllib.parse
import random
from util_common import Constant
import re


class HtmlParser(object):

    # Build the list of category paths from a list page's soup: the
    # breadcrumb trail, fanned out over any sub-categories on the page
    def _get_kindlist_by_listPage(self, soup):
        # First collect the breadcrumb categories on the current page
        kindlist = soup.find_all('a', id=re.compile(r"lnkBreadcrumb\w*"))
        kindls = list()
        for index, kind in enumerate(kindlist):
            # Skip the first breadcrumb (the home link)
            if index > 0:
                nameCn = kind.string
                kindls.append(nameCn)
        # Then build one path per sub-category link on the list page
        kindls_rs = list()
        kindlist_children = soup.find_all('a', class_='SearchResultsSubLevelCategory')
        if len(kindlist_children) > 0:
            for c in kindlist_children:
                l = list()
                l.extend(kindls)
                l.append(c.string)
                kindls_rs.append(l)
        else:
            kindls_rs.append(kindls)
        return kindls_rs
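
    # Example (hypothetical page): breadcrumbs "Home > Semiconductors" plus
    # sub-category links "Diodes" and "Transistors" yield
    # [['Semiconductors', 'Diodes'], ['Semiconductors', 'Transistors']];
    # with no sub-category links the result is [['Semiconductors']].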

    # Get the URL of the [next page] link
    def _get_next_page_url(self, page_url, soup):
        # find() matches by tag name and attributes, not CSS selectors, so
        # look the pager anchor up by its id
        link = soup.find('a', id='ctl00_ContentMain_PagerTop_lnkNext')
        if link is None:
            # No "next" link means this is the last page
            return None
        return urllib.parse.urljoin(page_url, link['href'])
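
    # urljoin resolves the usually root-relative href against the current
    # page, e.g. (hypothetical paths):
    #   urljoin('http://www.mouser.cn/Semiconductors/', '/Semiconductors/_/N-5g5s?No=25')
    #   -> 'http://www.mouser.cn/Semiconductors/_/N-5g5s?No=25'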

    # Get the links to the [product detail pages] listed on the current page
    def _get_detail_urls_from_listPage(self, soup):
        new_product_urls = set()

        def add_to_new_product_urls(lis):
            for l in lis:
                # Splitting on 'ProductDetail' also yields two kinds of junk
                # pieces: relative prefixes such as "../../../../../" (len 15)
                # and "/" (len 1); the length check below drops both
                if len(l) > 20:
                    # Re-attach the canonical URL head
                    url_head = "http://www.mouser.cn/ProductDetail"
                    detail_url = url_head + l
                    new_product_urls.add(detail_url)

        # Walking parent/child nodes here is cumbersome, so search directly
        # for anchors whose href contains a detail-page path and let the set
        # filter out duplicates
        links = soup.find_all('a', href=re.compile(r"/ProductDetail/\w*"))
        for link in links:
            try:
                new_url = link['href'].split('ProductDetail')
                add_to_new_product_urls(new_url)
            except KeyError:
                # Anchor without an href; log and skip it
                print("error", link)

        return new_product_urls
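
    # Example (hypothetical part number): the href
    # "/ProductDetail/Texas-Instruments/SN74LS00N/" splits into
    # ['/', '/Texas-Instruments/SN74LS00N/']; only the second piece passes
    # the length check, rebuilding
    # "http://www.mouser.cn/ProductDetail/Texas-Instruments/SN74LS00N/"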

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # find_all (not find), so the loop below iterates over anchor tags;
        # the pager anchor is matched by its id, not a CSS selector string
        links = soup.find_all('a', id='ctl00_ContentMain_PagerTop_lnkNext')
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    # Extract the page title and summary (these selectors appear to match
    # Baidu-Baike-style lemma markup rather than Mouser's)
    def _get_new_data(self, page_url, soup):
        res_data = {}

        res_data['url'] = page_url

        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        res_data['title'] = title_node.get_text()

        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()

        return res_data

    # Parse a category list page: returns the next-page URL and the set of
    # product detail-page links
    def kindlist_parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        next_page_url = self._get_next_page_url(page_url, soup)
        new_product_urls = self._get_detail_urls_from_listPage(soup)
        return next_page_url, new_product_urls
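
    # Example return value (hypothetical):
    #   ('http://www.mouser.cn/Semiconductors/_/N-5g5s?No=25',
    #    {'http://www.mouser.cn/ProductDetail/Texas-Instruments/SN74LS00N/'})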

    # Given the soup of a category's first page, build crawl-task entries for
    # every page of that category
    def _get_kindlist_by_firstpage(self, soup, url, index):
        l = list()
        number = 1

        # Entry for the first page itself
        d = dict()
        d["kindid"] = index
        d["number"] = number
        number += 1
        d["url"] = url
        d["random"] = random.random()
        d["status"] = Constant.TODO
        l.append(d)

        lastPage = soup.find('a', class_="first-last")
        try:
            # Some categories have only one page; then there is no last-page
            # anchor and the int() conversion raises
            count = int(lastPage.string)
            # Pages 2..count are addressed with an offset of 25 items per page
            for i in range(1, count):
                d = dict()
                d["kindid"] = index
                d["number"] = number
                number += 1
                d["url"] = url + "?No=" + str(25 * i)
                d["random"] = random.random()
                d["status"] = Constant.TODO
                l.append(d)
        except (AttributeError, ValueError):
            pass
        return l
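

# A minimal, self-contained usage sketch. The markup below is hypothetical
# but mirrors the selectors this parser expects; a real run would feed
# kindlist_parse() the downloaded HTML of a live list page. Note this file
# still needs the project's util_common module on the import path.
if __name__ == '__main__':
    sample_html = '''
    <a id="lnkBreadcrumb0" href="/">Home</a>
    <a id="lnkBreadcrumb1" href="/Semiconductors/">Semiconductors</a>
    <a class="SearchResultsSubLevelCategory" href="/Diodes/">Diodes</a>
    <a href="/ProductDetail/Texas-Instruments/SN74LS00N/">SN74LS00N</a>
    <a id="ctl00_ContentMain_PagerTop_lnkNext" href="/Semiconductors/_/N-5g5s?No=25">Next</a>
    '''
    parser = HtmlParser()
    soup = BeautifulSoup(sample_html, 'html.parser')
    print(parser._get_kindlist_by_listPage(soup))
    # -> [['Semiconductors', 'Diodes']]
    print(parser._get_detail_urls_from_listPage(soup))
    # -> {'http://www.mouser.cn/ProductDetail/Texas-Instruments/SN74LS00N/'}
    print(parser._get_next_page_url('http://www.mouser.cn/Semiconductors/', soup))
    # -> http://www.mouser.cn/Semiconductors/_/N-5g5s?No=25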