# coding=utf-8
'''
@author: ch
'''
from http import cookiejar
import urllib.request


class HtmlDownloader(object):
    """Download HTML pages and binary files with browser-like request headers.

    Every request goes through a freshly built ``urllib`` opener that carries
    its own in-memory cookie jar and, optionally, an HTTP proxy.
    """

    # Socket timeout, in seconds, applied to every request.
    _TIMEOUT = 30

    # Headers sent when fetching HTML pages (default for makeMyOpener).
    _PAGE_HEADERS = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Referer': 'http://www.mouser.cn/Electronic-Components/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    }

    # Headers sent when fetching image files (used by download_file).
    _FILE_HEADERS = {
        'Connection': 'Keep-Alive',
        'Accept': 'image/*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'Referer': 'http://www.mouser.cn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    }

    def makeMyOpener(self, head=None, proxy=None):
        """Build a urllib opener with cookie support and custom headers.

        Args:
            head: dict mapping header names to values. When ``None`` the
                default page headers (``_PAGE_HEADERS``) are used.
            proxy: optional proxy address inserted into ``http://<proxy>/``
                (e.g. ``"127.0.0.1:8080"``). Only the ``http`` scheme is
                mapped to the proxy; other schemes are not routed through it.

        Returns:
            A ``urllib.request.OpenerDirector`` whose ``addheaders`` holds the
            requested headers as ``(name, value)`` tuples.
        """
        # A None sentinel replaces the former dict default so that no mutable
        # object is shared between calls.
        if head is None:
            head = self._PAGE_HEADERS

        # Each opener gets its own cookie jar; cookies are not shared between
        # openers built by separate calls.
        handlers = [urllib.request.HTTPCookieProcessor(cookiejar.CookieJar())]
        if proxy is not None:
            handlers.append(
                urllib.request.ProxyHandler({'http': 'http://%s/' % proxy}))

        opener = urllib.request.build_opener(*handlers)
        opener.addheaders = list(head.items())
        return opener

    def _fetch(self, url, opener):
        """Open *url* with *opener* and return the raw body, or None if the
        response status code is not 200."""
        # The with-block guarantees the response (and its underlying socket)
        # is closed even when the status check fails or read() raises.
        with opener.open(url, timeout=self._TIMEOUT) as response:
            if response.getcode() != 200:
                return None
            return response.read()

    def download(self, url, proxy=None):
        """Fetch *url* and return its body decoded as UTF-8 text.

        Args:
            url: address to fetch; ``None`` short-circuits to ``None``.
            proxy: optional proxy address, see ``makeMyOpener``.

        Returns:
            The decoded body as ``str``, or ``None`` when *url* is ``None`` or
            the response status code is not 200.

        Raises:
            Exceptions raised by urllib while opening or reading (for example
            ``urllib.error.URLError``) and ``UnicodeDecodeError`` for bodies
            that are not valid UTF-8 are propagated unchanged.
        """
        if url is None:
            return None
        body = self._fetch(url, self.makeMyOpener(proxy=proxy))
        if body is None:
            return None
        return body.decode("utf8")

    def download_file(self, url, proxy=None):
        """Fetch *url* with image-oriented headers and return the raw bytes.

        Args:
            url: address to fetch; ``None`` short-circuits to ``None``.
            proxy: optional proxy address, see ``makeMyOpener``.

        Returns:
            The body as ``bytes``, or ``None`` when *url* is ``None`` or the
            response status code is not 200.

        Raises:
            Exceptions raised by urllib while opening or reading are
            propagated unchanged.
        """
        if url is None:
            return None
        opener = self.makeMyOpener(head=self._FILE_HEADERS, proxy=proxy)
        return self._fetch(url, opener)