# coding=utf-8
'''
Created on 2016-03-07

Create download tasks for detail pages, using one worker thread per
stored list page.

@author: ChenHao
'''
from pymongo.mongo_client import MongoClient
from util_common import Constant, html_parser
from bs4 import BeautifulSoup
import random
import threading
from pymongo.errors import ConnectionFailure


class InitDetail():
    '''
    Dispatcher that turns stored list pages into detail-download tasks.

    Reads rows from the ``kindlist_todo`` collection whose ``status`` is
    ``Constant.DONE`` and whose ``creatDetailTask`` is ``Constant.TODO``,
    and hands each one to an ``InitThread`` worker.
    '''

    def __init__(self, maxThread=200):
        '''
        :param maxThread: maximum number of rows fetched (and therefore
            worker threads started) per call to ``run``.
        '''
        self.maxThread = maxThread
        self.parser = html_parser.HtmlParser()
        # Connect to the database.
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider
        # Counters; the finish/success/failure ones are updated from the
        # worker threads through the callbacks below.
        self.startThread = 0
        self.finishThread = 0
        self.successed = 0
        self.failed = 0
        self.details = 0
        # Guards the counters: "+=" on an attribute is not atomic and the
        # callbacks run concurrently on many worker threads.
        self._lock = threading.Lock()

    def _pending_filter(self):
        '''Return the query selecting list pages that still need tasks.'''
        return {"status": Constant.DONE, "creatDetailTask": Constant.TODO}

    def hasNext(self):
        '''
        Tell whether at least one list page is still waiting for tasks.

        Returns True when the query itself fails, so the caller tries
        again instead of silently stopping (same as the original code).
        '''
        try:
            # find_one with an _id-only projection replaces
            # Cursor.count(), which no longer exists in PyMongo 4, and
            # avoids transferring the stored HTML just to test existence.
            pending = self.db.kindlist_todo.find_one(
                self._pending_filter(), {"_id": 1})
            return pending is not None
        except Exception:
            # Narrowed from a bare "except:" so KeyboardInterrupt and
            # SystemExit can still stop the program.
            return True

    def _get_task(self, size=1):
        '''Return a cursor over at most ``size`` pending list pages.'''
        return self.db.kindlist_todo.find(self._pending_filter()).limit(size)

    def _on_success(self, url, kinds, details):
        '''Worker callback: record one successfully processed list page.'''
        with self._lock:
            self.finishThread += 1
            self.details += len(details)
            self.successed += 1
        print('success', url)

    def _on_error(self, url, e):
        '''Worker callback: record one failed list page.'''
        with self._lock:
            self.finishThread += 1
            self.failed += 1
        print('failure', url, e)

    def run(self):
        '''
        Process one batch of pending list pages and wait for it to finish.

        One ``InitThread`` is started per fetched row. The call blocks
        until every worker of the batch has ended; otherwise a
        ``while hasNext(): run()`` loop would re-dispatch rows that are
        still being processed and could close the client under them.
        '''
        workers = []
        for task in self._get_task(self.maxThread):
            worker = InitThread(self.db, self.parser, task['url'],
                                task.get('str_html'),
                                self._on_success, self._on_error)
            worker.start()
            self.startThread += 1
            workers.append(worker)
        for worker in workers:
            worker.join()

    def stat(self):
        '''Return the tuple ``(successed, failed)``.'''
        return self.successed, self.failed

    def close(self):
        '''Close the MongoDB client.'''
        self.cli.close()


class InitThread(threading.Thread):
    '''
    Worker that parses one list page and stores what it finds.

    Category entries go to ``kind_from_listpage``, detail tasks go to
    ``detail_todo``, and the ``creatDetailTask`` field of the list page's
    row in ``kindlist_todo`` is set to DONE or ERROR.
    '''

    def __init__(self, db, parser, url, html, success, error):
        '''
        :param db: database handle used for all reads and writes.
        :param parser: object providing ``_get_kindlist_by_listPage`` and
            ``_get_detail_urls_from_listPage``.
        :param url: URL of the list page; identifies its row.
        :param html: stored HTML of the list page (may be None).
        :param success: callable ``(url, kinds, details)`` or None.
        :param error: callable ``(url, exception)`` or None.
        '''
        threading.Thread.__init__(self)
        self.db = db
        self.parser = parser
        self.url = url
        self.html = html
        self.success = success
        self.error = error

    def _parse_kind(self, soup):
        '''Wrap every category returned by the parser in a document.'''
        return [{"kindls": kind}
                for kind in self.parser._get_kindlist_by_listPage(soup)]

    def _parse_detail(self, soup):
        '''Build one pending detail-task document per detail URL.'''
        detail_urls = self.parser._get_detail_urls_from_listPage(soup)
        return [
            {
                "url": detail_url,
                "random": random.random(),
                "status": Constant.TODO,
                "analysisTask": Constant.TODO,
            }
            for detail_url in detail_urls
        ]

    def _on_success(self, kinds, details):
        '''Store the parsed documents and mark the list page as DONE.'''
        # insert_many rejects an empty list; skipping the call keeps a
        # page without categories from discarding its detail tasks and
        # being marked ERROR.
        if kinds:
            self.db.kind_from_listpage.insert_many(kinds)
        if details:
            self.db.detail_todo.insert_many(details)
        # Mark this list page as processed.
        self.db.kindlist_todo.update_one(
            {'url': self.url},
            {'$set': {'creatDetailTask': Constant.DONE}})

    def _on_error(self, e):
        '''Mark the list page as ERROR; ``e`` itself is not stored.'''
        self.db.kindlist_todo.update_one(
            {'url': self.url},
            {'$set': {'creatDetailTask': Constant.ERROR}})

    def run(self):
        '''
        Validate, parse and store one list page.

        A page that is missing, shorter than 150000 characters or lacking
        the expected marker text is marked ERROR without being parsed.
        A ``ConnectionFailure`` is only reported through the error
        callback and leaves the row untouched, so it stays pending.
        '''
        html = self.html
        # NOTE(review): the 150000 threshold and the marker text look like
        # a heuristic for incomplete downloads; confirm with the author.
        if (html is None or len(html) < 150000
                or '购买所选商品' not in str(html)):
            e = Exception("无效HTML")
            self._on_error(e)
            if self.error is not None:
                self.error(self.url, e)
            return
        try:
            soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
            kinds = self._parse_kind(soup)
            details = self._parse_detail(soup)
            self._on_success(kinds, details)
            if self.success is not None:
                self.success(self.url, kinds, details)
        except ConnectionFailure as e:
            if self.error is not None:
                self.error(self.url, e)
        except Exception as e:
            self._on_error(e)
            if self.error is not None:
                self.error(self.url, e)


if __name__ == '__main__':
    task = InitDetail(maxThread=100)
    try:
        while task.hasNext():
            task.run()
    finally:
        # Close the client even when a batch raises.
        task.close()