init_kindlist_for_capacity.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. # coding=utf-8
  2. '''
  3. Created on 2016-03-07
  4. Generates the download tasks for the list pages
  5. @author: ChenHao
  6. '''
  7. from pymongo.mongo_client import MongoClient
  8. from util_common import html_parser, Constant
  9. import random
  10. # 输入html的soup对象,
  11. # 读取需要的类目,并将数据写入数据库
  12. fin = open("../spider_download/Other/KindUrlStartCapacity.csv")
  13. lines = fin.readlines()
  14. fin.close()
  15. # Create a folder for each of these categories (disabled: the snippet below sits inside a string literal)
  16. '''
  17. for index, line in enumerate(lines):
  18. dirpath = "../spider_download/listPage/" + str(index+1)
  19. os.mkdir(dirpath)
  20. '''
  21. # 输入html的soup对象,计算并得到整个类目下的所有分页的数据
  22. def _get_kindlist_by_firstpage(url, index, lastPage):
  23. l = list()
  24. number = 1
  25. d = dict()
  26. d["kindid"] = index
  27. d["number"] = number
  28. number += 1
  29. d["url"] = url
  30. d["random"] = random.random()
  31. d["status"] = Constant.TODO
  32. l.append(d)
  33. try:
  34. # 有的页面只有一页
  35. count = int(lastPage)
  36. for i in range(1, count + 1):
  37. d = dict()
  38. d["id"] = index
  39. d["number"] = number
  40. number += 1
  41. d["url"] = url + "?No=" + str(25 * i)
  42. d["random"] = random.random()
  43. d["status"] = Constant.TODO
  44. l.append(d)
  45. except:
  46. pass
  47. return l
  48. # 生成需要下载的listPage
  49. html_praser = html_parser.HtmlParser()
  50. kindlist = list()
  51. for index, line in enumerate(lines):
  52. l = line.split(",")
  53. print (l)
  54. # 下载对应的html并加入
  55. url = l[0]
  56. lastPage = l[1]
  57. print (type(lastPage))
  58. kindlist.extend(_get_kindlist_by_firstpage(url, index, lastPage))
  59. # 连接数据库并插入数据
  60. cli = MongoClient(Constant.MONGODB_URL)
  61. db = cli.spider
  62. db.kindlist_todo.insert_many(kindlist)
  63. cli.close()