# listPage_main_by_taskfile.py
# coding=utf-8
'''
Created on 2016年3月9日
这是基于任务文件的方法
@author: ChenHao
'''
import codecs
import json
import os
from os.path import getsize
from time import sleep

from util_common import html_downloader
  12. class ListPageMain():
  13. def __init__(self):
  14. self.downloader = html_downloader.HtmlDownloader()
  15. self.result = 1
  16. def craw(self, taskFilePath):
  17. try:
  18. fin = codecs.open(taskFilePath, "r", encoding='utf-8')
  19. line = fin.read()
  20. fin.close()
  21. if len(line) == 0:
  22. self.result = 0
  23. return
  24. l = json.loads(line)
  25. # 检查是否处理完毕
  26. print(len(l))
  27. if len(l) == 0:
  28. self.result = 0;
  29. obj = l.pop()
  30. url = obj["url"]
  31. i = obj["id"]
  32. number = obj["number"]
  33. filePath = "../spider_download/listPage/" + str(i) +"/" + str(number) + ".html"
  34. print(url)
  35. cont = self.downloader.download(url)
  36. fout = open(filePath, "w")
  37. fout.write(cont)
  38. fout.close()
  39. # 检查是不是有效下载
  40. if getsize(filePath) < 20000:
  41. print(filePath, getsize(filePath))
  42. # 等待1个 小时
  43. sleep(3600)
  44. else:
  45. print(len(l))
  46. st = json.dumps(l, ensure_ascii=False)
  47. fout = codecs.open(taskFilePath, "w", encoding='utf-8')
  48. line = fin.write(st)
  49. fout.close()
  50. except:
  51. pass
  52. if __name__ == "__main__":
  53. taskFilePath = "1.txt"
  54. obj_spider = ListPageMain()
  55. while(obj_spider.result):
  56. obj_spider.craw(taskFilePath)
  57. print("恭喜你!任务完成")