# listPage_main_by_taskfile.py
# coding=utf-8
'''
Created on 2016年3月9日
这是基于任务文件的方法
@author: ChenHao
'''
import codecs
import json
import os
from os.path import getsize
from time import sleep

from util_common import html_downloader
  12. class ListPageMain():
  13. def __init__(self):
  14. self.downloader = html_downloader.HtmlDownloader()
  15. self.result = 1
  16. def craw(self, taskFilePath):
  17. try:
  18. fin = codecs.open(taskFilePath, "r", encoding='utf-8')
  19. line = fin.read()
  20. fin.close()
  21. if len(line) == 0:
  22. self.result = 0
  23. return
  24. l = json.loads(line)
  25. # 检查是否处理完毕
  26. print(len(l))
  27. if len(l) == 0:
  28. self.result = 0;
  29. obj = l.pop()
  30. url = obj["url"]
  31. i = obj["id"]
  32. number = obj["number"]
  33. filePath = "../spider_download/listPage/" + str(i) +"/" + str(number) + ".html"
  34. print(url)
  35. cont = self.downloader.download(url)
  36. fout = open(filePath, "w")
  37. fout.write(cont)
  38. fout.close()
  39. # 检查是不是有效下载
  40. if getsize(filePath) < 20000:
  41. print(filePath, getsize(filePath))
  42. # 等待1个 小时
  43. sleep(3600)
  44. else:
  45. print(len(l))
  46. st = json.dumps(l, ensure_ascii=False)
  47. fout = codecs.open(taskFilePath, "w", encoding='utf-8')
  48. line = fin.write(st)
  49. fout.close()
  50. except:
  51. pass
  52. if __name__ == "__main__":
  53. taskFilePath = "1.txt"
  54. obj_spider = ListPageMain()
  55. while(obj_spider.result):
  56. obj_spider.craw(taskFilePath)
  57. print("恭喜你!任务完成")