| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
- # coding=utf-8
- '''
- Created on 2016年3月7日
- 为列表页生成下载任务
- @author: ChenHao
- '''
- from pymongo.mongo_client import MongoClient
- from util_common import html_parser, Constant
- import random
# Load the category list that drives task generation. Each line is later
# split on "," into a first-page URL (field 0) and a page count (field 1).
# The context manager guarantees the handle is closed even if reading fails.
with open("../spider_download/Other/KindUrlStartCapacity.csv") as fin:
    lines = fin.readlines()
- # 为这些类目创建文件夹
- '''
- for index, line in enumerate(lines):
- dirpath = "../spider_download/listPage/" + str(index+1)
- os.mkdir(dirpath)
- '''
- # 输入html的soup对象,计算并得到整个类目下的所有分页的数据
def _get_kindlist_by_firstpage(url, index, lastPage):
    """Build the list-page download tasks for one category.

    The first task always points at ``url`` itself. When ``lastPage`` can be
    converted with ``int()`` to a value ``count``, ``count`` further tasks are
    appended whose URLs are ``url + "?No=" + str(25 * i)`` for ``i`` running
    from 1 to ``count`` inclusive.

    Args:
        url: URL of the category's first list page.
        index: Category identifier stored on every task.
        lastPage: Value convertible by ``int()`` giving how many follow-up
            pages to generate. If it cannot be converted, only the first-page
            task is returned.

    Returns:
        A list of task dicts. Each carries ``number`` (1-based position in
        the list), ``url``, ``random`` (a fresh ``random.random()`` value)
        and ``status`` (``Constant.TODO``), plus the category identifier.
    """
    # NOTE(review): the first-page task stores the category under "kindid"
    # while every follow-up task stores it under "id". This looks
    # unintentional, but the keys are kept as-is because consumers of the
    # stored documents are not visible here -- confirm and unify.
    tasks = [{
        "kindid": index,
        "number": 1,
        "url": url,
        "random": random.random(),
        "status": Constant.TODO,
    }]

    # Some categories have a single page only; in that case the page count
    # may be missing or non-numeric, and just the first-page task is emitted.
    try:
        count = int(lastPage)
    except (TypeError, ValueError):
        count = 0

    for page in range(1, count + 1):
        tasks.append({
            "id": index,
            # The first page is number 1, so follow-up page i is number i + 1.
            "number": page + 1,
            "url": url + "?No=" + str(25 * page),
            "random": random.random(),
            "status": Constant.TODO,
        })
    return tasks
# Generate the list-page download tasks for every row of the category file.
html_praser = html_parser.HtmlParser()
kindlist = list()
for index, line in enumerate(lines):
    l = line.split(",")
    print (l)
    # A blank or truncated row has no page-count field; skip it instead of
    # failing with IndexError. `index` still reflects the row's position in
    # the file, so identifiers of the remaining rows are unaffected.
    if len(l) < 2:
        continue
    url = l[0]
    lastPage = l[1]
    print (type(lastPage))
    kindlist.extend(_get_kindlist_by_firstpage(url, index, lastPage))

# Connect to the database and insert the generated tasks.
cli = MongoClient(Constant.MONGODB_URL)
try:
    db = cli.spider
    # insert_many raises when given an empty document list, so only call it
    # when at least one task was generated.
    if kindlist:
        db.kindlist_todo.insert_many(kindlist)
finally:
    # Release the connection even if the insert fails.
    cli.close()
|