detail_todo_parser.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. # coding=utf-8
  2. '''
  3. 2016年3月31日15:52:25
  4. 解析下载好的详情页,生成基本解析数据
  5. '''
  6. from pymongo.mongo_client import MongoClient
  7. from util_common import Constant, html_detail_parser
  8. import threading
  9. class ParseDetail():
  10. def __init__(self, maxThread=200):
  11. self.maxThread = maxThread
  12. self.parser = html_detail_parser.Parser()
  13. # 连接数据库
  14. self.cli = MongoClient(Constant.MONGODB_URL)
  15. self.db = self.cli.spider
  16. self.startThread = 0
  17. self.finishThread = 0
  18. self.successed = 0
  19. self.failed = 0
  20. def hasNext(self):
  21. try:
  22. return self.db.detail_todo.find({"status": Constant.DONE , "analysisTask": Constant.TODO}).count() > 0
  23. except:
  24. return True
  25. def _get_task(self, size=1):
  26. return self.db.detail_todo.find({"status": Constant.DONE , "analysisTask": Constant.TODO}).limit(size)
  27. def _on_success(self, url, objDetail):
  28. self.finishThread += 1
  29. self.successed += 1
  30. print('success', url)
  31. def _on_error(self, url, e):
  32. self.finishThread += 1
  33. self.failed += 1
  34. print('failure', url, e)
  35. def run(self):
  36. tasks = self._get_task(self.maxThread)
  37. for task in tasks:
  38. thread = InitThread(self.db, self.parser, task['url'], task['_id'], task['str_html'], self._on_success, self._on_error)
  39. thread.start()
  40. def stat(self):
  41. return self.successed, self.failed
  42. def close(self):
  43. self.cli.close()
  44. class InitThread(threading.Thread):
  45. def __init__(self, db, parser, url_mouser, _id, html, success, error):
  46. threading.Thread.__init__(self)
  47. self.db = db
  48. self.parser = parser
  49. self.url_mouser = url_mouser
  50. self._id = _id
  51. self.html = html
  52. self.success = success
  53. self.error = error
  54. def _parse_detail(self, str_html, url_mouser):
  55. objDetail = self.parser.craw(str_html, url_mouser)
  56. return objDetail
  57. def _on_success(self, objDetail):
  58. # 保存器件
  59. self.db.component_original.insert_one(objDetail)
  60. # 将此detail_todo修改状态
  61. self.db.detail_todo.update_one({'_id': self._id}, {'$set': {'analysisTask': Constant.DONE}})
  62. def _on_error(self, e):
  63. self.db.detail_todo.update_one({'_id': self._id}, {'$set': {'analysisTask': Constant.ERROR}})
  64. def run(self):
  65. if len(self.html) < 80000 or str(self.html).find('实时供货') == -1:
  66. e = Exception("无效HTML")
  67. self._on_error(e)
  68. if self.error is not None:
  69. self.error(self.url, e)
  70. return
  71. try:
  72. objDetail = self._parse_detail(self.html, self.url_mouser)
  73. self._on_success(objDetail)
  74. if self.success is not None:
  75. self.success(self.url_mouser, objDetail)
  76. except Exception as e:
  77. self._on_error(e)
  78. if self.error is not None:
  79. self.error(self.url_mouser, e)
  80. if __name__ == '__main__':
  81. task = ParseDetail(maxThread=20)
  82. while task.hasNext():
  83. task.run()
  84. task.close()