# coding=utf-8
'''
2016-03-31 15:52:25
Parse downloaded detail pages and generate the basic parsed data.

Rows of ``spider.detail_todo`` whose ``status`` is ``Constant.DONE`` and whose
``analysisTask`` is ``Constant.TODO`` are parsed by worker threads; the parse
result is inserted into ``spider.component_original`` and the row's
``analysisTask`` is set to ``Constant.DONE`` or ``Constant.ERROR``.
'''
from pymongo.mongo_client import MongoClient
from util_common import Constant, html_detail_parser
import threading


class ParseDetail():
    '''Batch driver: fetches pending rows and parses each one in its own thread.'''

    def __init__(self, maxThread=200):
        '''
        :param maxThread: maximum number of rows fetched (and threads started)
                          per call to :meth:`run`.
        '''
        self.maxThread = maxThread
        self.parser = html_detail_parser.Parser()
        # Connect to the database
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider
        self.startThread = 0
        self.finishThread = 0
        self.successed = 0
        self.failed = 0
        # The callbacks below are invoked from worker threads, and "+= 1" is
        # not atomic, so counter updates are serialized with this lock.
        self._counter_lock = threading.Lock()

    def _todo_filter(self):
        '''Return the query selecting rows that still need to be parsed.'''
        return {"status": Constant.DONE, "analysisTask": Constant.TODO}

    def hasNext(self):
        '''
        Return True when at least one row is still waiting to be parsed.

        Uses ``find_one`` with an ``_id``-only projection instead of
        ``Cursor.count()``: ``count()`` no longer exists in PyMongo 4, and the
        projection avoids pulling the stored HTML just to test for existence.
        If the lookup itself fails, True is returned (best effort, as before)
        and the error is printed instead of being silently discarded.
        '''
        try:
            return self.db.detail_todo.find_one(self._todo_filter(), {'_id': 1}) is not None
        except Exception as e:
            print('hasNext check failed', e)
            return True

    def _get_task(self, size=1):
        '''Return a cursor over at most ``size`` pending rows.'''
        return self.db.detail_todo.find(self._todo_filter()).limit(size)

    def _on_success(self, url, objDetail):
        '''Worker callback: record one successfully parsed page.'''
        with self._counter_lock:
            self.finishThread += 1
            self.successed += 1
        print('success', url)

    def _on_error(self, url, e):
        '''Worker callback: record one page that could not be parsed.'''
        with self._counter_lock:
            self.finishThread += 1
            self.failed += 1
        print('failure', url, e)

    def run(self):
        '''
        Process one batch of at most ``maxThread`` pending rows.

        One :class:`InitThread` is started per row, and the call returns only
        after every worker has finished. Waiting is required because callers
        loop on :meth:`hasNext`: without it the same rows (whose status has not
        been updated yet) would be fetched again and parsed more than once.
        '''
        workers = []
        for task in self._get_task(self.maxThread):
            # .get(): a row without stored HTML is handed to the worker as
            # None, which marks it as an error instead of crashing the batch.
            worker = InitThread(self.db, self.parser, task['url'], task['_id'],
                                task.get('str_html'),
                                self._on_success, self._on_error)
            worker.start()
            self.startThread += 1
            workers.append(worker)
        for worker in workers:
            worker.join()

    def stat(self):
        '''Return the tuple ``(successed, failed)``.'''
        return self.successed, self.failed

    def close(self):
        '''Close the MongoDB client.'''
        self.cli.close()


class InitThread(threading.Thread):
    '''Worker thread that parses one stored detail page and records the outcome.'''

    # Pages shorter than this are treated as invalid downloads.
    MIN_HTML_LENGTH = 80000
    # Text that must appear in the page for it to be considered valid.
    REQUIRED_MARKER = '实时供货'

    def __init__(self, db, parser, url_mouser, _id, html, success, error):
        '''
        :param db: database handle providing ``component_original`` and ``detail_todo``.
        :param parser: object whose ``craw(html, url)`` returns the parsed detail.
        :param url_mouser: URL of the page being parsed.
        :param _id: ``_id`` of the ``detail_todo`` row to update.
        :param html: stored page HTML (may be None when the row has none).
        :param success: optional callback ``success(url, objDetail)``.
        :param error: optional callback ``error(url, exception)``.
        '''
        threading.Thread.__init__(self)
        self.db = db
        self.parser = parser
        self.url_mouser = url_mouser
        self._id = _id
        self.html = html
        self.success = success
        self.error = error

    def _parse_detail(self, str_html, url_mouser):
        '''Run the parser on the page and return its result.'''
        return self.parser.craw(str_html, url_mouser)

    def _on_success(self, objDetail):
        '''Store the parsed component and mark the row as analysed.'''
        # Save the component
        self.db.component_original.insert_one(objDetail)
        # Update the status of this detail_todo row
        self.db.detail_todo.update_one({'_id': self._id},
                                       {'$set': {'analysisTask': Constant.DONE}})

    def _on_error(self, e):
        '''Mark the row as failed so it is not picked up again.'''
        self.db.detail_todo.update_one({'_id': self._id},
                                       {'$set': {'analysisTask': Constant.ERROR}})

    def _is_valid_html(self):
        '''Return True when the HTML is present, long enough and has the marker.'''
        if not self.html:
            return False
        if len(self.html) < self.MIN_HTML_LENGTH:
            return False
        return self.REQUIRED_MARKER in str(self.html)

    def _report_failure(self, e):
        '''Mark the row as failed and notify the error callback, if any.'''
        self._on_error(e)
        if self.error is not None:
            # Fixed: the original invalid-HTML path passed self.url, an
            # attribute that does not exist on this class.
            self.error(self.url_mouser, e)

    def run(self):
        '''Validate the page, parse it, persist the result and fire the callbacks.'''
        if not self._is_valid_html():
            self._report_failure(Exception("无效HTML"))
            return
        try:
            objDetail = self._parse_detail(self.html, self.url_mouser)
            self._on_success(objDetail)
            if self.success is not None:
                self.success(self.url_mouser, objDetail)
        except Exception as e:
            self._report_failure(e)


if __name__ == '__main__':
    task = ParseDetail(maxThread=20)
    try:
        while task.hasNext():
            task.run()
    finally:
        # Release the MongoDB connection even if a batch raises.
        task.close()