| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
# coding=utf-8
'''
Created on 2016-03-07

Generate download tasks for detail pages, using one thread per list page.

@author: ChenHao
'''
- from pymongo.mongo_client import MongoClient
- from util_common import Constant, html_parser
- from bs4 import BeautifulSoup
- import random
- import threading
- from pymongo.errors import ConnectionFailure
class InitDetail():
    """Turn pending list-page records into detail-page download tasks.

    A record in ``kindlist_todo`` is pending when its ``status`` is
    ``Constant.DONE`` and its ``creatDetailTask`` is ``Constant.TODO``.
    Each pending record is handed to its own ``InitThread`` worker, which
    reports back through ``_on_success`` / ``_on_error`` so that this object
    can keep running totals.
    """

    def __init__(self, maxThread=200):
        """Connect to MongoDB and reset all counters.

        :param maxThread: maximum number of records (one worker thread each)
            handled by a single call to ``run``.
        """
        self.maxThread = maxThread
        self.parser = html_parser.HtmlParser()
        # Connect to the database.
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider

        # The callbacks below are invoked from many worker threads at once and
        # "+=" on an attribute is a read-modify-write, so guard the counters.
        self._lock = threading.Lock()
        self.startThread = 0
        self.finishThread = 0
        self.successed = 0
        self.failed = 0
        self.details = 0

    @staticmethod
    def _pending_filter():
        """Return the query selecting records that still need detail tasks."""
        return {"status": Constant.DONE, "creatDetailTask": Constant.TODO}

    def hasNext(self):
        """Return True while at least one pending record exists.

        If the lookup itself fails, True is returned (as before) so that the
        caller tries again instead of silently stopping.
        """
        try:
            # find_one stops at the first match and works on every PyMongo
            # release; Cursor.count() was removed in PyMongo 4.
            pending = self.db.kindlist_todo.find_one(self._pending_filter(), {"_id": 1})
            return pending is not None
        except Exception as e:
            # Unlike a bare "except:", this lets KeyboardInterrupt/SystemExit
            # through so the driver loop can actually be stopped.
            print('hasNext failed, assuming work remains', e)
            return True

    def _get_task(self, size=1):
        """Return a cursor over at most ``size`` pending records."""
        return self.db.kindlist_todo.find(self._pending_filter()).limit(size)

    def _on_success(self, url, kinds, details):
        """Worker callback: record one successfully processed list page.

        :param url: URL of the list page.
        :param kinds: category documents produced by the worker (not used here).
        :param details: detail-task documents produced by the worker; their
            count is added to ``self.details``.
        """
        with self._lock:
            self.finishThread += 1
            self.details += len(details)
            self.successed += 1
        print('success', url)

    def _on_error(self, url, e):
        """Worker callback: record one failed list page.

        :param url: URL of the list page.
        :param e: the exception describing the failure.
        """
        with self._lock:
            self.finishThread += 1
            self.failed += 1
        print('failure', url, e)

    def run(self):
        """Process one batch of up to ``maxThread`` pending records.

        One ``InitThread`` is started per record and the call returns only
        after every started worker has finished. Waiting matters: a record
        leaves the pending set only when its worker updates it, so returning
        early would let the ``hasNext()`` / ``run()`` loop start duplicate
        workers for the same records.
        """
        workers = []
        try:
            for task in self._get_task(self.maxThread):
                # A record without stored HTML is passed on as an empty page so
                # the worker's own validity check can flag it.
                html = task.get('str_html') or ''
                worker = InitThread(self.db, self.parser, task['url'], html,
                                    self._on_success, self._on_error)
                worker.start()
                with self._lock:
                    self.startThread += 1
                workers.append(worker)
        finally:
            # Join whatever was started, even if creating a later worker failed.
            for worker in workers:
                worker.join()

    def stat(self):
        """Return ``(successed, failed)`` page counts accumulated so far."""
        return self.successed, self.failed

    def close(self):
        """Close the MongoDB client connection."""
        self.cli.close()
class InitThread(threading.Thread):
    """Worker thread that converts one stored list page into detail tasks.

    The page HTML is parsed for category entries and detail-page URLs, the
    results are written to ``kind_from_listpage`` and ``detail_todo``, and the
    page's ``kindlist_todo`` record is flagged as done or failed.
    """

    # A page shorter than this many characters is treated as invalid.
    MIN_HTML_LENGTH = 150000
    # Text that must occur in a valid list page.
    REQUIRED_MARKER = '购买所选商品'

    def __init__(self, db, parser, url, html, success, error):
        """Store everything the worker needs.

        :param db: database handle exposing the ``kind_from_listpage``,
            ``detail_todo`` and ``kindlist_todo`` collections.
        :param parser: object providing ``_get_kindlist_by_listPage`` and
            ``_get_detail_urls_from_listPage``.
        :param url: URL of the list page; used to locate its record.
        :param html: HTML of the list page.
        :param success: optional callback ``success(url, kinds, details)``.
        :param error: optional callback ``error(url, exception)``.
        """
        threading.Thread.__init__(self)
        self.db = db
        self.parser = parser
        self.url = url
        self.html = html
        self.success = success
        self.error = error

    def _parse_kind(self, soup):
        """Return one ``{"kindls": entry}`` document per category entry."""
        return [{"kindls": kind}
                for kind in self.parser._get_kindlist_by_listPage(soup)]

    def _parse_detail(self, soup):
        """Return one pending detail-task document per detail URL on the page."""
        return [
            {
                "url": detail_url,
                "random": random.random(),
                "status": Constant.TODO,
                "analysisTask": Constant.TODO,
            }
            for detail_url in self.parser._get_detail_urls_from_listPage(soup)
        ]

    def _on_success(self, kinds, details):
        """Persist the parsed documents and flag the list page as done.

        Empty lists are skipped because ``insert_many`` rejects an empty
        document list; without the guard a page lacking categories would lose
        its detail tasks and be flagged as failed.
        """
        # Categories.
        if kinds:
            self.db.kind_from_listpage.insert_many(kinds)
        # Detail tasks.
        if details:
            self.db.detail_todo.insert_many(details)
        # Update the state of this list page.
        self.db.kindlist_todo.update_one({'url': self.url}, {'$set': {'creatDetailTask': Constant.DONE}})

    def _on_error(self, e):
        """Flag the list page's record as failed."""
        self.db.kindlist_todo.update_one({'url': self.url}, {'$set': {'creatDetailTask': Constant.ERROR}})

    def _is_valid_html(self):
        """Return True when the HTML is long enough and contains the marker."""
        if not self.html:
            # None or empty: nothing to measure or search.
            return False
        return (len(self.html) >= self.MIN_HTML_LENGTH
                and self.REQUIRED_MARKER in str(self.html))

    def _notify_error(self, e):
        """Invoke the error callback, if one was supplied."""
        if self.error is not None:
            self.error(self.url, e)

    def _fail(self, e):
        """Flag the record as failed, then always notify the error callback."""
        try:
            self._on_error(e)
        except Exception as mark_error:
            # The record stays pending; still report the original failure.
            print('failure', self.url, 'could not record error state', mark_error)
        self._notify_error(e)

    def run(self):
        """Validate, parse and store one list page, reporting the outcome."""
        if not self._is_valid_html():
            self._fail(Exception("无效HTML"))
            return
        try:
            soup = BeautifulSoup(self.html, 'html.parser', from_encoding='utf-8')
            kinds = self._parse_kind(soup)
            details = self._parse_detail(soup)
            self._on_success(kinds, details)
        except ConnectionFailure as e:
            # No attempt is made to flag the record on a connection failure,
            # so it stays pending; only the callback is notified.
            self._notify_error(e)
        except Exception as e:
            self._fail(e)
        else:
            # Called outside the try so that an exception raised by the
            # callback cannot flip an already stored page to the error state.
            if self.success is not None:
                self.success(self.url, kinds, details)
-
-
if __name__ == '__main__':
    # Script entry point: process pending list pages in batches of 100.
    task = InitDetail(maxThread=100)
    try:
        # Keep launching batches until no list page is left pending.
        while task.hasNext():
            task.run()
    finally:
        # Release the MongoDB connection even when a batch raised.
        task.close()
-
|