usoft
/
manage


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233
							# coding=utf-8
'''
Created on March 14, 2016

@author: ChenHao
'''
# Test: extract product-detail links from a listing page
import urllib.parse
from bs4 import BeautifulSoup
import re

# Listing-page URL; it is not read anywhere in the visible part of the script.
page_url = "http://www.mouser.cn/Embedded-Solutions/USB-Flash-Drives/_/N-d0rlr"

# Not populated in the visible part of the script.
new_product_urls = set()

# Read the saved listing page. The context manager closes the file handle as
# soon as the read finishes, and the explicit encoding keeps decoding
# independent of the platform's default locale.
with open("1.html", "r", encoding="utf-8") as fin:
    html_cont = fin.read()

# html_cont is already decoded text, so no from_encoding hint is passed:
# BeautifulSoup ignores that argument (and warns) when given str markup.
soup = BeautifulSoup(html_cont, 'html.parser')
# Accumulator for every item handed to add_to_temp().
temp_set = set()


def add_to_temp(lis):
    """Add every item of the iterable *lis* to the module-level ``temp_set``."""
    temp_set.update(lis)
        
# Collect every anchor whose href contains a "/ProductDetail/" segment.
links = soup.find_all('a', href=re.compile(r"/ProductDetail/\w*"))
for link in links:
    try:
        # Splitting on the marker yields the pieces of the href around each
        # occurrence of it; every piece is stored in temp_set.
        new_url = link['href'].split('ProductDetail')
        add_to_temp(new_url)
    except Exception as exc:
        # Best-effort: report the offending tag and the reason, then keep
        # going. Unlike a bare except, this lets KeyboardInterrupt and
        # SystemExit propagate.
        print("error", link, repr(exc))

# Dump the collected fragments. A set has no defined order, so the index is
# only a running count.
for index, temp in enumerate(temp_set):
    print(index, temp)