html_downloader.py 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. # coding=utf-8
  2. '''
  3. @author: ch
  4. '''
  5. from http import cookiejar
  6. import urllib.request
  class HtmlDownloader(object):
      """Download text pages and raw files through a cookie-aware urllib opener.

      Every request is sent through a fresh opener built by ``makeMyOpener``,
      so each call starts with its own empty cookie jar.
      """

      # Headers sent by ``makeMyOpener`` when the caller passes no ``head``.
      DEFAULT_HEADERS = {
          'Connection': 'Keep-Alive',
          'Accept': 'text/html, application/xhtml+xml, */*',
          'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
          'Referer': 'http://www.mouser.cn/Electronic-Components/',
          'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
      }

      # Headers sent by ``download_file``.
      FILE_HEADERS = {
          'Connection': 'Keep-Alive',
          'Accept': 'image/*',
          'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
          'Referer': 'http://www.mouser.cn/',
          'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
      }

      # Timeout, in seconds, passed to every ``opener.open`` call.
      REQUEST_TIMEOUT = 30

      def makeMyOpener(self, head=None, proxy=None):
          """Build an opener with a new cookie jar, optional proxy and headers.

          Args:
              head: dict mapping header names to values. ``None`` selects
                  ``DEFAULT_HEADERS``.
              proxy: value substituted into ``'http://%s/'`` to form the proxy
                  URL. Only the ``'http'`` scheme is mapped to it. ``None``
                  adds no explicit ``ProxyHandler``.

          Returns:
              The opener produced by ``urllib.request.build_opener``, with
              ``addheaders`` set to the (name, value) pairs of ``head``.
          """
          # A None default avoids sharing one mutable dict between all calls.
          if head is None:
              head = self.DEFAULT_HEADERS
          handlers = [urllib.request.HTTPCookieProcessor(cookiejar.CookieJar())]
          if proxy is not None:
              handlers.append(
                  urllib.request.ProxyHandler({'http': 'http://%s/' % proxy}))
          opener = urllib.request.build_opener(*handlers)
          opener.addheaders = list(head.items())
          return opener

      def _fetch(self, url, head, proxy):
          """Return the body bytes of ``url``, or None if the status is not 200."""
          opener = self.makeMyOpener(head=head, proxy=proxy)
          # The context manager closes the response even when read() raises,
          # so the underlying connection is not leaked.
          with opener.open(url, timeout=self.REQUEST_TIMEOUT) as response:
              if response.getcode() != 200:
                  return None
              return response.read()

      def download(self, url, proxy=None):
          """Fetch ``url`` with the default headers and return it as text.

          Args:
              url: address to open; ``None`` short-circuits to ``None``.
              proxy: forwarded to ``makeMyOpener``.

          Returns:
              The response body decoded as UTF-8, or ``None`` when ``url`` is
              ``None`` or the response code is not 200.

          Raises:
              UnicodeDecodeError: the body is not valid UTF-8.
              Any exception raised by ``opener.open`` propagates unchanged.
          """
          if url is None:
              return None
          body = self._fetch(url, None, proxy)
          if body is None:
              return None
          return body.decode("utf8")

      def download_file(self, url, proxy=None):
          """Fetch ``url`` with ``FILE_HEADERS`` and return the raw bytes.

          Args:
              url: address to open; ``None`` short-circuits to ``None``.
              proxy: forwarded to ``makeMyOpener``.

          Returns:
              The undecoded response body, or ``None`` when ``url`` is ``None``
              or the response code is not 200.

          Raises:
              Any exception raised by ``opener.open`` propagates unchanged.
          """
          if url is None:
              return None
          return self._fetch(url, self.FILE_HEADERS, proxy)