| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172 |
- # coding=utf-8
- '''
- Created on 2016年3月9日
- 这是基于任务文件的方法
- @author: ChenHao
- '''
import codecs
import json
import os
from os.path import getsize
from time import sleep

from util_common import html_downloader
class ListPageMain():
    """Task-file driven list-page crawler.

    Reads a JSON array of tasks from a file, downloads one page per call
    to :meth:`craw`, and writes the remaining tasks back to the task
    file.  ``result`` stays truthy until the task list is exhausted, so
    a caller can loop ``while spider.result: spider.craw(path)``.
    """

    def __init__(self):
        # Downloader comes from the project-local util_common package.
        self.downloader = html_downloader.HtmlDownloader()
        # 1 = more work to do, 0 = task file exhausted.
        self.result = 1

    def craw(self, taskFilePath):
        """Process one task from *taskFilePath*.

        Pops the last task (a dict with "url", "id" and "number" keys —
        presumably produced by an upstream task generator; verify against
        the writer), downloads the page to
        ``../spider_download/listPage/<id>/<number>.html`` and rewrites
        the task file without the finished task.  A download smaller
        than 20 kB is treated as an invalid page (throttling/captcha
        heuristic); the task is kept in the file and retried after a
        one-hour sleep.
        """
        try:
            with codecs.open(taskFilePath, "r", encoding='utf-8') as fin:
                line = fin.read()
            if len(line) == 0:
                self.result = 0
                return
            tasks = json.loads(line)
            # Progress indicator: how many tasks remain.
            print(len(tasks))
            if len(tasks) == 0:
                # BUG FIX: the original fell through to tasks.pop() here,
                # raising IndexError (silently swallowed) and looping forever.
                self.result = 0
                return
            obj = tasks.pop()
            url = obj["url"]
            i = obj["id"]
            number = obj["number"]
            filePath = "../spider_download/listPage/" + str(i) + "/" + str(number) + ".html"
            print(url)
            cont = self.downloader.download(url)
            # Make sure the per-id output directory exists before writing.
            os.makedirs("../spider_download/listPage/" + str(i), exist_ok=True)
            with open(filePath, "w") as fout:
                fout.write(cont)

            # Validity check: pages under 20 kB are assumed to be error
            # pages, so back off for an hour and leave the task queued.
            if getsize(filePath) < 20000:
                print(filePath, getsize(filePath))
                sleep(3600)
            else:
                print(len(tasks))
                st = json.dumps(tasks, ensure_ascii=False)
                # BUG FIX: the original wrote via the already-closed read
                # handle (fin.write), so the task file was never updated
                # and the same page was re-downloaded forever.
                with codecs.open(taskFilePath, "w", encoding='utf-8') as fout:
                    fout.write(st)
        except Exception as e:
            # Best-effort: log and let the caller's loop retry rather
            # than crash, but no longer swallow errors silently.
            print("craw failed:", e)
-
-
-
if __name__ == "__main__":
    # Keep pulling one task per call until the crawler reports that the
    # task file is empty.
    task_file = "1.txt"
    spider = ListPageMain()
    while spider.result:
        spider.craw(task_file)
    print("恭喜你!任务完成")
-
-
-
-
-
-
-
-
-
|