# coding=utf-8
'''
Created on 2016-03-07

Generate download tasks for detail pages.

Reads every kindlist_todo document whose "status" is Constant.DONE and whose
"creatDetailTask" is Constant.TODO, parses the stored list-page HTML, stores
the parsed category list in kind_from_listpage, inserts one detail_todo task
per detail URL, and finally records the outcome (Constant.DONE or
Constant.ERROR) in the document's "creatDetailTask" field.

@author: ChenHao
'''
import logging
import random

from bs4 import BeautifulSoup
from pymongo.mongo_client import MongoClient

from util_common import Constant, html_parser

logger = logging.getLogger(__name__)

# List pages whose stored HTML is shorter than this are treated as abnormal
# and marked as ERROR without being parsed (for reference, one normal page
# measured 332004).
MIN_LIST_PAGE_SIZE = 150000

praser = html_parser.HtmlParser()

# Connect to the database.
cli = MongoClient(Constant.MONGODB_URL)
db = cli.spider


def _set_detail_task_state(url, state):
    """Write `state` into the creatDetailTask field of the list page `url`."""
    db.kindlist_todo.update_one(
        {'url': url}, {'$set': {'creatDetailTask': state}})


def _build_detail_tasks(detail_urls):
    """Return one new detail_todo task document per URL in `detail_urls`."""
    return [
        {
            "url": detail_url,
            "random": random.random(),
            "status": Constant.TODO,
            "analysisTask": Constant.TODO,
        }
        for detail_url in detail_urls
    ]


# Kept for backward compatibility; nothing in this script reads it.
new_product_urls = set()

try:
    # Load the list pages that still need their detail tasks generated.
    rs = db.kindlist_todo.find(
        {"status": Constant.DONE, "creatDetailTask": Constant.TODO},
        no_cursor_timeout=True)
    try:
        for listPage in rs:
            url_listPage = listPage["url"]
            html_cont = listPage["str_html"]

            # Skip pages whose size looks abnormal.
            if len(html_cont) < MIN_LIST_PAGE_SIZE:
                _set_detail_task_state(url_listPage, Constant.ERROR)
                continue

            try:
                soup = BeautifulSoup(
                    html_cont, 'html.parser', from_encoding='utf-8')

                # Parse and store the categories found on the list page.
                kindls = praser._get_kindlist_by_listPage(soup)
                db.kind_from_listpage.insert_one({"kindls": kindls})

                # Parse the detail URLs and store one task per URL.
                # NOTE(review): pymongo's insert_many appears to reject an
                # empty document list, so a page that yields no detail URLs
                # would end up in the ERROR state through the handler below
                # - confirm against the installed pymongo version.
                detail_urls = praser._get_detail_urls_from_listPage(soup)
                db.detail_todo.insert_many(_build_detail_tasks(detail_urls))

                # Mark this list page as processed.
                _set_detail_task_state(url_listPage, Constant.DONE)
            except Exception:
                # Catch Exception rather than everything so that
                # KeyboardInterrupt / SystemExit still stop the script, and
                # log the traceback instead of dropping it.
                logger.exception(
                    "Failed to create detail tasks for %s", url_listPage)
                _set_detail_task_state(url_listPage, Constant.ERROR)
    finally:
        # The cursor was opened with no_cursor_timeout=True, so it has to be
        # closed explicitly.
        rs.close()
finally:
    cli.close()