listPageDownloader.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. from bs4 import BeautifulSoup
  2. #coding=utf-8
'''
Created on 2016-06-29 (original header: "2016年6月29日")
@author: uas

Downloads list pages from panelook.cn via a proxy pool and parses out
product-list links.
'''
  7. from pymongo.mongo_client import MongoClient
  8. from util_common import Constant
  9. from proxy import ip_pool
  10. from util_common import html_downloader
  11. from urllib import request
  12. import urllib
  13. import random
  14. import time
  15. class listPageDownloader():
  16. def __init__(self):
  17. self.cli=MongoClient(Constant.MONGODB_URL)
  18. self.db=self.cli.spider
  19. self.pool=ip_pool.Pool()
  20. self.successed=0
  21. self.faliured=0
  22. self.total=0
  23. self.downloader=html_downloader.HtmlDownloader()
  24. self.mainpage="http://www.panelook.cn/"
  25. def _get_one_proxy(self):
  26. return self.pool.get()
  27. def _remove_one_proxy(self,proxy):
  28. self.pool.remove(proxy)
  29. def download(self):
  30. url="http://www.panelook.cn/"
  31. proxy=self._get_one_proxy()
  32. cont_file=self.downloader.download(url, proxy=proxy)
  33. return cont_file
  34. def simple_test(self):
  35. url="http://www.panelook.cn/"
  36. url_1='http://www.panelook.cn/adsearch_cn.php?op=EL-Display'
  37. url_2='http://www.mouser.cn/ProductDetail/Schneider-Electric/XB5RFA02/'
  38. base_url='http://www.mouser.cn/'
  39. current_url=urllib.parse.urljoin(base_url, 'ProductDetail/Schneider-Electric/XB5RFA02/')
  40. print(url_2)
  41. response=self.downloader.download(url_2, '101.201.235.141:8000')
  42. # response=urllib.request.urlopen(base_url,timeout=50)
  43. return response
  44. #
  45. def simple_parse(self,cont_file):
  46. soup = BeautifulSoup(cont_file,'html.parser')
  47. listpageurl=soup.findAll(class_="me")
  48. new_url_list=list()
  49. for r in listpageurl:
  50. new_url=urllib.parse.urljoin(self.mainpage,r.h4.a['href'])
  51. new_url_list.append((r.h4.a.string,new_url))
  52. print(new_url_list)
  53. return new_url_list
  54. # def simple_download(self,url_list):
  55. # for index,url in enumerate(url_list):
  56. # response=urllib.request.urlopen(url[1])
  57. # cont_file=response.read().decode('utf-8')
  58. # print(cont_file)
  59. # time.sleep(random.randint(1000,5000))
  60. # filename=url[0]
  61. # filename=filename+'.html'
  62. # print(filename)
  63. # with open(filename,'w')as file:
  64. # file.write(cont_file)
  65. #
  66. #
  67. if __name__=='__main__':
  68. downloader=listPageDownloader()
  69. cont_file=downloader.simple_test()
  70. print(cont_file)
  71. filename='1.html'
  72. with open(filename,'w') as html_file:
  73. html_file.write(cont_file);
  74. # new_url_list=downloader.simple_parse(cont_file)
  75. # downloader.simple_download(new_url_list)