# init_detail.py
# coding=utf-8
'''
Created on 2016-03-07
Generate download tasks for detail pages.
@author: ChenHao
'''
import logging
import random

from bs4 import BeautifulSoup
from pymongo.mongo_client import MongoClient

from util_common import Constant, html_parser
  11. praser = html_parser.HtmlParser()
  12. # 连接数据库
  13. cli = MongoClient(Constant.MONGODB_URL)
  14. db = cli.spider
  15. # 读取kindlist待生成detail任务的
  16. new_product_urls = set()
  17. rs = db.kindlist_todo.find({"status": Constant.DONE ,"creatDetailTask": Constant.TODO}, no_cursor_timeout=True)
  18. # listPage = rs;
  19. for listPage in rs:
  20. # 生成任务并保存
  21. url_listPage = listPage["url"]
  22. html_cont = listPage["str_html"]
  23. # 检查列表页的页面大小是不是正常,例如一个的大小为332004
  24. if len(html_cont) < 150000:
  25. db.kindlist_todo.update_one({'url':url_listPage}, {'$set': {'creatDetailTask': Constant.ERROR}})
  26. continue
  27. try:
  28. soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
  29. # 解析并存储解析出来的类目
  30. kindls = praser._get_kindlist_by_listPage(soup)
  31. kind_d = dict()
  32. kind_d["kindls"] = kindls
  33. db.kind_from_listpage.insert_one(kind_d)
  34. # 解析并存储detailTask
  35. detail_urls = praser._get_detail_urls_from_listPage(soup)
  36. # 组装detail任务
  37. task_list = list()
  38. for detail_url in detail_urls:
  39. d = dict()
  40. d["url"] = detail_url
  41. d["random"] = random.random()
  42. d["status"] = Constant.TODO
  43. d["analysisTask"] = Constant.TODO
  44. task_list.append(d)
  45. db.detail_todo.insert_many(task_list)
  46. # 将此listPage修改状态
  47. db.kindlist_todo.update_one({'url':url_listPage}, {'$set': {'creatDetailTask': Constant.DONE}})
  48. except:
  49. # 将此listPage修改为错误状态
  50. db.kindlist_todo.update_one({'url':url_listPage}, {'$set': {'creatDetailTask': Constant.ERROR}})
  51. cli.close()