| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104 |
- # coding=utf-8
'''
Created 2016-03-31 15:52:25.
Parse the downloaded detail pages and generate the basic parsed data.
'''
- from pymongo.mongo_client import MongoClient
- from util_common import Constant, html_detail_parser
- import threading
class ParseDetail():
    """Dispatch pending detail-page documents to parser threads, batch by batch.

    Pending documents are those in ``spider.detail_todo`` whose ``status`` is
    ``Constant.DONE`` and whose ``analysisTask`` is ``Constant.TODO``. Each one
    is handed to an ``InitThread``; the outcome is tallied in ``successed`` and
    ``failed``.
    """

    def __init__(self, maxThread=200):
        """Create the parser, connect to MongoDB and reset the counters.

        :param maxThread: number of tasks fetched, and therefore threads
            started, by a single ``run`` call.
        """
        self.maxThread = maxThread
        self.parser = html_detail_parser.Parser()
        # Connect to the database.
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider

        self.startThread = 0
        self.finishThread = 0
        self.successed = 0
        self.failed = 0
        # The result callbacks are invoked from worker threads; this lock
        # keeps their read-modify-write counter updates from interleaving.
        self._lock = threading.Lock()

    def _pending_query(self):
        """Return the filter selecting documents that still need analysis."""
        return {"status": Constant.DONE, "analysisTask": Constant.TODO}

    def hasNext(self):
        """Return True while at least one pending document exists.

        A failing lookup is reported and treated as "maybe more work", so the
        caller retries instead of stopping early.
        """
        try:
            # find_one stops at the first match and does not depend on
            # Cursor.count(), which newer PyMongo releases no longer provide.
            return self.db.detail_todo.find_one(self._pending_query()) is not None
        except Exception as e:
            print('hasNext failed', e)
            return True

    def _get_task(self, size=1):
        """Return a cursor over at most ``size`` pending documents."""
        return self.db.detail_todo.find(self._pending_query()).limit(size)

    def _on_success(self, url, objDetail):
        """Record one successfully parsed page (called from a worker thread)."""
        with self._lock:
            self.finishThread += 1
            self.successed += 1
        print('success', url)

    def _on_error(self, url, e):
        """Record one failed page (called from a worker thread)."""
        with self._lock:
            self.finishThread += 1
            self.failed += 1
        print('failure', url, e)

    def run(self):
        """Process one batch: start a thread per task and wait for all of them.

        Waiting matters: a task only leaves the pending set once its thread
        has updated ``analysisTask``, so returning early would let the next
        ``run`` call pick up the very same documents again.
        """
        workers = []
        for task in self._get_task(self.maxThread):
            worker = InitThread(self.db, self.parser, task['url'], task['_id'],
                                task['str_html'], self._on_success, self._on_error)
            worker.start()
            self.startThread += 1
            workers.append(worker)
        for worker in workers:
            worker.join()

    def stat(self):
        """Return the ``(successed, failed)`` counters as a tuple."""
        return self.successed, self.failed

    def close(self):
        """Close the MongoDB client."""
        self.cli.close()
class InitThread(threading.Thread):
    """Worker thread that parses one detail page and records the outcome.

    On success the parsed object is inserted into ``component_original`` and
    the ``detail_todo`` document is marked ``Constant.DONE``; on failure the
    document is marked ``Constant.ERROR``. The optional ``success`` / ``error``
    callbacks are then invoked with the page URL.
    """

    def __init__(self, db, parser, url_mouser, _id, html, success, error):
        """Store everything the thread needs to process a single task.

        :param db: database handle exposing ``component_original`` and ``detail_todo``.
        :param parser: object whose ``craw(html, url)`` returns the parsed detail.
        :param url_mouser: URL of the page being parsed.
        :param _id: ``_id`` of the ``detail_todo`` document to update.
        :param html: page HTML; may be ``None``, which is treated as invalid.
        :param success: callback ``(url, objDetail)`` or ``None``.
        :param error: callback ``(url, exception)`` or ``None``.
        """
        threading.Thread.__init__(self)
        self.db = db
        self.parser = parser
        self.url_mouser = url_mouser
        self._id = _id
        self.html = html
        self.success = success
        self.error = error

    def _parse_detail(self, str_html, url_mouser):
        """Run the parser on the HTML and return the parsed detail object."""
        return self.parser.craw(str_html, url_mouser)

    def _on_success(self, objDetail):
        """Persist the parsed detail and mark the task as done."""
        # Save the component.
        self.db.component_original.insert_one(objDetail)
        # Update the status of this detail_todo document.
        self.db.detail_todo.update_one({'_id': self._id}, {'$set': {'analysisTask': Constant.DONE}})

    def _on_error(self, e):
        """Mark the task as failed."""
        self.db.detail_todo.update_one({'_id': self._id}, {'$set': {'analysisTask': Constant.ERROR}})

    def _has_valid_html(self):
        """Return True when the HTML is long enough and contains the marker text."""
        html = self.html
        # A missing page is invalid; checking first keeps len() from raising
        # outside the try block, which would leave the task pending forever.
        if html is None:
            return False
        return len(html) >= 80000 and '实时供货' in str(html)

    def _fail(self, e):
        """Mark the task as failed and notify the error callback, if any."""
        self._on_error(e)
        if self.error is not None:
            self.error(self.url_mouser, e)

    def run(self):
        """Validate, parse and store the page, reporting the result via callbacks."""
        if not self._has_valid_html():
            self._fail(Exception("无效HTML"))
            return
        try:
            objDetail = self._parse_detail(self.html, self.url_mouser)
            self._on_success(objDetail)
            if self.success is not None:
                self.success(self.url_mouser, objDetail)
        except Exception as e:
            self._fail(e)
-
-
# Script entry point: keep processing batches until no pending task remains.
if __name__ == '__main__':
    task = ParseDetail(maxThread=20)
    try:
        while task.hasNext():
            task.run()
    finally:
        # Release the MongoDB connection even when a batch raises.
        task.close()
-
|