# coding=utf-8
"""
Created on 2016-03-07.

Generate download tasks for category list pages: read category URLs and
their last-page numbers from a CSV file, expand each category into one
task record per paginated list page, and bulk-insert the records into
the MongoDB collection ``spider.kindlist_todo``.

@author: ChenHao
"""
from pymongo.mongo_client import MongoClient

from util_common import html_parser, Constant
import random


def _get_kindlist_by_firstpage(url, index, lastPage):
    """Build the download-task records for every page of one category.

    Parameters:
        url: base URL of the category's first list page.
        index: numeric id of the category (its row index in the CSV).
        lastPage: last page number as read from the CSV (a string; may
            not be a valid integer for single-page categories).

    Returns:
        A list of dicts, one per page, each with the keys
        ``kindid`` / ``number`` / ``url`` / ``random`` / ``status``.
    """
    tasks = []
    number = 1

    # Record for the first page (no "?No=" offset in the URL).
    first = {
        "kindid": index,
        "number": number,
        "url": url,
        "random": random.random(),
        "status": Constant.TODO,
    }
    number += 1
    tasks.append(first)

    try:
        # Some categories have only one page; in that case lastPage is
        # not a valid integer and we keep just the first-page record.
        count = int(lastPage)
    except (ValueError, TypeError):
        return tasks

    # NOTE(review): offsets run 25..25*count, i.e. `count` extra pages
    # beyond the first — confirm whether range(1, count) was intended.
    for i in range(1, count + 1):
        task = {
            # Fixed: the original used "id" here but "kindid" for the
            # first page, leaving paginated records without a kindid.
            "kindid": index,
            "number": number,
            "url": url + "?No=" + str(25 * i),
            "random": random.random(),
            "status": Constant.TODO,
        }
        number += 1
        tasks.append(task)
    return tasks


def main():
    """Read the category CSV, expand it to page tasks, insert into MongoDB."""
    # Each CSV line: <first-page-url>,<last-page-number>
    with open("../spider_download/Other/KindUrlStartCapacity.csv") as fin:
        lines = fin.readlines()

    kindlist = []
    for index, line in enumerate(lines):
        parts = line.split(",")
        print(parts)
        url = parts[0]
        lastPage = parts[1]
        kindlist.extend(_get_kindlist_by_firstpage(url, index, lastPage))

    # Connect and bulk-insert all page tasks; close the client even if
    # insert_many raises.
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider
        db.kindlist_todo.insert_many(kindlist)
    finally:
        cli.close()


if __name__ == "__main__":
    main()