listPageDownloader.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. from bs4 import BeautifulSoup
  2. #coding=utf-8
'''
Created on 2016-06-29 (original header: "2016年6月29日")
@author: uas

Downloads list pages from panelook.cn via a proxy pool and parses out
product-list links.
'''
  7. from pymongo.mongo_client import MongoClient
  8. from util_common import Constant
  9. from proxy import ip_pool
  10. from util_common import html_downloader
  11. from urllib import request
  12. import urllib
  13. import random
  14. import time
  15. class listPageDownloader():
  16. def __init__(self):
  17. self.cli=MongoClient(Constant.MONGODB_URL)
  18. self.db=self.cli.spider
  19. self.pool=ip_pool.Pool()
  20. self.successed=0
  21. self.faliured=0
  22. self.total=0
  23. self.downloader=html_downloader.HtmlDownloader()
  24. self.mainpage="http://www.panelook.cn/"
  25. def _get_one_proxy(self):
  26. return self.pool.get()
  27. def _remove_one_proxy(self,proxy):
  28. self.pool.remove(proxy)
  29. def download(self):
  30. url="http://www.panelook.cn/"
  31. proxy=self._get_one_proxy()
  32. cont_file=self.downloader.download(url, proxy=proxy)
  33. return cont_file
  34. def simple_test(self):
  35. url="http://www.panelook.cn/"
  36. url_1='http://www.panelook.cn/adsearch_cn.php?op=EL-Display'
  37. url_2='http://www.mouser.cn/ProductDetail/Schneider-Electric/XB5RFA02/'
  38. base_url='http://www.mouser.cn/'
  39. current_url=urllib.parse.urljoin(base_url, 'ProductDetail/Schneider-Electric/XB5RFA02/')
  40. print(url_2)
  41. response=self.downloader.download(url_2, '101.201.235.141:8000')
  42. # response=urllib.request.urlopen(base_url,timeout=50)
  43. return response
  44. #
  45. def simple_parse(self,cont_file):
  46. soup = BeautifulSoup(cont_file,'html.parser')
  47. listpageurl=soup.findAll(class_="me")
  48. new_url_list=list()
  49. for r in listpageurl:
  50. new_url=urllib.parse.urljoin(self.mainpage,r.h4.a['href'])
  51. new_url_list.append((r.h4.a.string,new_url))
  52. print(new_url_list)
  53. return new_url_list
  54. # def simple_download(self,url_list):
  55. # for index,url in enumerate(url_list):
  56. # response=urllib.request.urlopen(url[1])
  57. # cont_file=response.read().decode('utf-8')
  58. # print(cont_file)
  59. # time.sleep(random.randint(1000,5000))
  60. # filename=url[0]
  61. # filename=filename+'.html'
  62. # print(filename)
  63. # with open(filename,'w')as file:
  64. # file.write(cont_file)
  65. #
  66. #
  67. if __name__=='__main__':
  68. downloader=listPageDownloader()
  69. cont_file=downloader.simple_test()
  70. print(cont_file)
  71. filename='1.html'
  72. with open(filename,'w') as html_file:
  73. html_file.write(cont_file);
  74. # new_url_list=downloader.simple_parse(cont_file)
  75. # downloader.simple_download(new_url_list)