get_kindtree_from_listpage.py 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. # coding=utf-8
  2. '''
  3. Created on 2016-03-14
  4. Derive the category tree structure by analysing list pages.
  5. @author: ChenHao
  6. '''
  7. from util_common import Constant
  8. from pymongo.mongo_client import MongoClient
  9. import re
  10. class GetKindTreeFromListPage():
  11. def __init__(self):
  12. pass
  13. def _get_kindlist_by_listPage(self, soup):
  14. kindlist = soup.find_all('a', id = re.compile(r"lnkBreadcrumb\w*"))
  15. kindls = list()
  16. for index, kind in enumerate(kindlist):
  17. # 第一个不要
  18. if index > 0:
  19. nameCn = kind.string
  20. kindls.append(nameCn)
  21. return kindls
  22. if __name__ == '__main__':
  23. getKindTreeFromListPage = GetKindTreeFromListPage()
  24. cli = MongoClient(Constant.MONGODB_URL)
  25. db = cli.spider
  26. rs = db.kindlist_todo.find_one({"status": Constant.DONE})
  27. html_cont = rs["str_html"]
  28. kindlist = getKindTreeFromListPage._get_kindlist_by_listPage(html_cont)
  29. print (kindlist)
  30. # for index, listpage in enumerate(rs):
  31. # try:
  32. # html_cont = listpage["str_html"]
  33. # kindlist = getKindTreeFromListPage._get_kindlist_by_listPage(html_cont)
  34. # print (index)
  35. # print (kindlist)
  36. # except:
  37. # print ("error")
  38. cli.close()