# listParserTest.py (799 B)
  1. # coding=utf-8
  2. '''
  3. Created on 2016年3月14日
  4. @author: ChenHao
  5. '''
  6. # 测试列表页提取详情
  7. import urllib.parse
  8. from bs4 import BeautifulSoup
  9. import re
  10. page_url = "http://www.mouser.cn/Embedded-Solutions/USB-Flash-Drives/_/N-d0rlr"
  11. new_product_urls = set()
  12. fin = open("1.html", "r")
  13. html_cont = fin.read()
  14. soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
  15. new_product_urls = set()
  16. temp_set = set()
  17. def add_to_temp (lis):
  18. for l in lis:
  19. temp_set.add(l)
  20. links = soup.find_all('a', href = re.compile(r"/ProductDetail/\w*"))
  21. for link in links:
  22. try:
  23. new_url = link['href'].split('ProductDetail')
  24. add_to_temp(new_url)
  25. except:
  26. print("error", link)
  27. for index, temp in enumerate(temp_set):
  28. print(index, temp)