html_parser.py

# coding=utf-8
'''
@author: ch
'''
from bs4 import BeautifulSoup
import urllib.parse
import random
from util_common import Constant
import re


class HtmlParser(object):
    # Build the category path(s) from the soup of a list page
    def _get_kindlist_by_listPage(self, soup):
        # First collect the breadcrumb categories visible on the current page
        kindlist = soup.find_all('a', id=re.compile(r"lnkBreadcrumb\w*"))
        kindls = list()
        for index, kind in enumerate(kindlist):
            # Skip the first breadcrumb entry
            if index > 0:
                nameCn = kind.string
                kindls.append(nameCn)
        # Then assemble one path per sub-category shown on the list page
        kindls_rs = list()
        kindlist_children = soup.find_all('a', class_='SearchResultsSubLevelCategory')
        if len(kindlist_children) > 0:
            for c in kindlist_children:
                l = list()
                l.extend(kindls)
                l.append(c.string)
                kindls_rs.append(l)
        else:
            kindls_rs.append(kindls)
        return kindls_rs
    # Get the link to the [next page]
    def _get_next_page_url(self, page_url, soup):
        # find() matches tag names, not CSS selectors, so look the pager link up by id
        link = soup.find('a', id='ctl00_ContentMain_PagerTop_lnkNext')
        url = link['href']
        new_full_url = urllib.parse.urljoin(page_url, url)
        return new_full_url
    # Get the links to the [product detail pages] on the current page
    def _get_detail_urls_from_listPage(self, soup):
        new_product_urls = set()

        def add_to_new_product_urls(lis):
            for l in lis:
                '''
                Splitting on 'ProductDetail' leaves two kinds of invalid fragments:
                "../../../../../" ; len() == 15
                "/"               ; len() == 1
                '''
                # Drop those two extra fragments here
                if len(l) > 20:
                    # Re-attach the detail-page prefix
                    url_head = "http://www.mouser.cn/ProductDetail"
                    detail_url = url_head + l
                    new_product_urls.add(detail_url)
        '''
        Walking parent/child nodes here is cumbersome, so search directly for anchors
        that carry a detail-page link and let the set filter out duplicates.
        '''
        links = soup.find_all('a', href=re.compile(r"/ProductDetail/\w*"))
        for link in links:
            try:
                new_url = link['href'].split('ProductDetail')
                add_to_new_product_urls(new_url)
            except Exception:
                print("error", link)
        return new_product_urls
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # find() matches tag names, not CSS selectors; collect the pager links by id
        links = soup.find_all('a', id='ctl00_ContentMain_PagerTop_lnkNext')
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls
    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
    # Parse a category list page (get the next-page link and the product detail links)
    def kindlist_parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        next_page_url = self._get_next_page_url(page_url, soup)
        new_product_urls = self._get_detail_urls_from_listPage(soup)
        return next_page_url, new_product_urls
    # Given the soup of a category's first page, compute the paging records for every
    # page in that category
    def _get_kindlist_by_firstpage(self, soup, url, index):
        l = list()
        number = 1
        d = dict()
        d["kindid"] = index
        d["number"] = number
        number += 1
        d["url"] = url
        d["random"] = random.random()
        d["status"] = Constant.TODO
        l.append(d)
        lastPage = soup.find('a', class_="first-last")
        try:
            # Some categories only have a single page, in which case lastPage is None
            count = int(lastPage.string)
            for i in range(1, count + 1):
                d = dict()
                d["id"] = index
                d["number"] = number
                number += 1
                d["url"] = url + "?No=" + str(25 * i)
                d["random"] = random.random()
                d["status"] = Constant.TODO
                l.append(d)
        except Exception:
            pass
        return l
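

# A minimal usage sketch, not part of the original module: it shows how the parser's
# list-page entry points fit together. The category URL below and the use of
# urllib.request for fetching are assumptions for illustration only.
if __name__ == '__main__':
    import urllib.request

    list_page_url = "http://www.mouser.cn/Semiconductors/"  # hypothetical category list page
    html_cont = urllib.request.urlopen(list_page_url).read()

    parser = HtmlParser()
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    # Category path(s) derived from breadcrumbs plus any sub-categories on the page
    print(parser._get_kindlist_by_listPage(soup))
    # Next-page URL plus the set of product detail URLs found on the current page
    next_page_url, detail_urls = parser.kindlist_parse(list_page_url, html_cont)
    print(next_page_url, len(detail_urls))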