123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596 |
- from bs4 import BeautifulSoup
- #coding=utf-8
'''
Created on 2016-06-29
@author: uas

List-page downloader for panelook.cn: fetches pages through a rotating
proxy pool and extracts product links for the spider pipeline.
'''
- from pymongo.mongo_client import MongoClient
- from util_common import Constant
- from proxy import ip_pool
- from util_common import html_downloader
- from urllib import request
- import urllib
- import random
- import time
class listPageDownloader():
    """Download and parse list pages from panelook.cn via a proxy pool.

    Results are intended for the ``spider`` MongoDB database; proxies are
    drawn from the project-local ``ip_pool``.

    NOTE(review): class name kept lowercase for backward compatibility with
    existing callers (PEP 8 would prefer ``ListPageDownloader``).
    """

    def __init__(self):
        # MongoDB client/database used by the spider pipeline.
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider
        # Rotating pool of proxy addresses.
        self.pool = ip_pool.Pool()
        # Download counters (fixed typo: 'faliured' -> 'failed'; these
        # counters are not read anywhere else in this file).
        self.successed = 0
        self.failed = 0
        self.total = 0
        self.downloader = html_downloader.HtmlDownloader()
        # Base URL used to resolve relative links found while parsing.
        self.mainpage = "http://www.panelook.cn/"

    def _get_one_proxy(self):
        """Return one proxy address taken from the pool."""
        return self.pool.get()

    def _remove_one_proxy(self, proxy):
        """Discard a dead or banned proxy from the pool."""
        self.pool.remove(proxy)

    def download(self):
        """Fetch the site main page through a pooled proxy.

        Returns whatever ``HtmlDownloader.download`` produces
        (presumably the page content — confirm against the helper).
        """
        url = "http://www.panelook.cn/"
        proxy = self._get_one_proxy()
        cont_file = self.downloader.download(url, proxy=proxy)
        return cont_file

    def simple_test(self):
        """Smoke test: fetch a single known product page.

        NOTE(review): the proxy address is hard-coded and may be stale;
        replace with ``self._get_one_proxy()`` for real runs.
        """
        url = 'http://www.mouser.cn/ProductDetail/Schneider-Electric/XB5RFA02/'
        print(url)
        response = self.downloader.download(url, '101.201.235.141:8000')
        return response

    def simple_parse(self, cont_file):
        """Extract ``(title, absolute_url)`` pairs from a list page.

        Looks for elements with CSS class ``me`` and resolves each
        ``h4 > a`` link against ``self.mainpage``.
        """
        soup = BeautifulSoup(cont_file, 'html.parser')
        new_url_list = []
        # find_all is the modern spelling of the deprecated findAll alias.
        for entry in soup.find_all(class_="me"):
            absolute_url = urllib.parse.urljoin(self.mainpage, entry.h4.a['href'])
            new_url_list.append((entry.h4.a.string, absolute_url))
        print(new_url_list)
        return new_url_list
-
if __name__ == '__main__':
    # Fetch one test page and persist it for offline inspection.
    downloader = listPageDownloader()
    cont_file = downloader.simple_test()
    print(cont_file)
    # NOTE(review): assumes simple_test() returns str — open in 'wb' if the
    # downloader yields bytes. Encoding left at platform default to match
    # the original behavior.
    filename = '1.html'
    with open(filename, 'w') as html_file:
        html_file.write(cont_file)
|