| 123456789101112131415161718192021222324252627282930313233 |
- # coding=utf-8
- '''
- Created on 2016年3月14日
- @author: ChenHao
- '''
- # 测试列表页提取详情
- import urllib.parse
- from bs4 import BeautifulSoup
- import re
# URL of a product listing page.
# NOTE(review): not referenced anywhere in the visible script; presumably the
# page that "1.html" was saved from and/or the intended base for resolving
# relative links -- confirm.
page_url = "http://www.mouser.cn/Embedded-Solutions/USB-Flash-Drives/_/N-d0rlr"

# Read the locally saved page.  The encoding is given explicitly so decoding
# does not depend on the platform's locale default, and the context manager
# closes the file handle as soon as the content has been read.
with open("1.html", "r", encoding="utf-8") as fin:
    html_cont = fin.read()

# html_cont is already decoded text, so no from_encoding hint is passed:
# that argument only applies to byte input and is ignored (with a warning)
# when the markup is a str.
soup = BeautifulSoup(html_cont, 'html.parser')

# Not populated by the visible code; kept because it is a module-level name.
new_product_urls = set()
# Accumulates the href fragments collected through add_to_temp().
temp_set = set()
def add_to_temp(lis):
    """Add every element of *lis* to the module-level ``temp_set``.

    Args:
        lis: Any iterable of hashable items (the script passes the list of
            pieces produced by ``str.split``).

    Raises:
        TypeError: If an element of *lis* is not hashable.
    """
    # set.update performs the same element-by-element insertion as an
    # explicit add() loop; duplicates are silently absorbed by the set.
    temp_set.update(lis)
-
# Collect every anchor whose href contains a "/ProductDetail/" path segment.
links = soup.find_all('a', href=re.compile(r"/ProductDetail/\w*"))
for link in links:
    try:
        # Split the href around the 'ProductDetail' marker.  All resulting
        # pieces (the part before the marker and the part after it) are
        # collected, not just the product-specific suffix.
        new_url = link['href'].split('ProductDetail')
    except (KeyError, AttributeError):
        # Best effort: report the offending tag and continue with the rest.
        # Only lookup/shape errors are handled so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print("error", link)
    else:
        add_to_temp(new_url)
# Print each collected fragment with a running index.  Iterating a set of
# strings directly yields an arbitrary, run-dependent order, so the fragments
# are sorted first to make the output reproducible between runs.
for index, temp in enumerate(sorted(temp_set)):
    print(index, temp)
|