# coding=utf-8
'''
Created on 2016-03-14

@author: ChenHao
'''
# Test: extract product-detail links from a saved listing page.
import re
import urllib.parse

from bs4 import BeautifulSoup

# Listing-page URL. Not referenced by the code below; presumably the page
# that "1.html" was saved from -- TODO confirm.
page_url = "http://www.mouser.cn/Embedded-Solutions/USB-Flash-Drives/_/N-d0rlr"

# Declared for collected product URLs, but never filled by this script.
new_product_urls = set()

# Read raw bytes and let BeautifulSoup decode them: `from_encoding` is only
# honoured for byte input (it is ignored, with a warning, for `str` markup).
# The `with` block guarantees the file handle is closed.
with open("1.html", "rb") as fin:
    html_cont = fin.read()

soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')

# Accumulates every fragment produced by splitting the matched hrefs.
temp_set = set()


def add_to_temp(lis):
    """Add every item of the iterable *lis* to the module-level ``temp_set``."""
    temp_set.update(lis)


# All <a> tags whose href contains "/ProductDetail/".
links = soup.find_all('a', href=re.compile(r"/ProductDetail/\w*"))
for link in links:
    try:
        # NOTE(review): split() yields BOTH the part before and the part
        # after 'ProductDetail', and both end up in temp_set -- confirm
        # that keeping the prefix fragment is intended.
        new_url = link['href'].split('ProductDetail')
    except (KeyError, AttributeError):
        # Missing href, or an href value that is not a string.
        print("error", link)
    else:
        add_to_temp(new_url)

# Set iteration order is arbitrary, so the printed indices are not stable
# across runs.
for index, temp in enumerate(temp_set):
    print(index, temp)