| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647 |
- # coding=utf-8
- '''
- Created on 2016-03-14
- Derives the category tree structure by analysing list pages.
- @author: ChenHao
- '''
- from util_common import Constant
- from pymongo.mongo_client import MongoClient
- import re
class GetKindTreeFromListPage:
    """Extract category names from the breadcrumb links of a list page."""

    def __init__(self):
        """Create the extractor; it keeps no instance state."""

    def _get_kindlist_by_listPage(self, soup):
        """Return the text of every breadcrumb anchor except the first one.

        Args:
            soup: Object exposing ``find_all(name, id=...)``. It is queried
                for ``a`` elements whose ``id`` matches the
                ``lnkBreadcrumb\\w*`` pattern.

        Returns:
            A list holding the ``.string`` attribute of each matched anchor,
            in the order ``find_all`` produced them, with the first match
            left out. Values are collected unchanged.
        """
        breadcrumb_id = re.compile(r"lnkBreadcrumb\w*")
        anchors = soup.find_all('a', id=breadcrumb_id)
        # The first breadcrumb entry is not wanted, so position 0 is dropped.
        return [
            anchor.string
            for position, anchor in enumerate(anchors)
            if position > 0
        ]
-
if __name__ == '__main__':
    # Manual smoke run: fetch one finished list page from MongoDB and print
    # the category names extracted from it.
    extractor = GetKindTreeFromListPage()

    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider
        rs = db.kindlist_todo.find_one({"status": Constant.DONE})

        if rs is None:
            # find_one yields None when nothing matches; indexing it would
            # raise TypeError, so report the situation instead.
            print("no kindlist_todo document with status DONE was found")
        else:
            html_cont = rs["str_html"]
            # NOTE(review): _get_kindlist_by_listPage calls ``.find_all`` on
            # its argument. ``str_html`` is presumably a raw HTML string; if
            # so it has to be parsed into a soup-like object first. Confirm
            # what the collection actually stores.
            kindlist = extractor._get_kindlist_by_listPage(html_cont)
            print(kindlist)

        # To process every finished page rather than a single one, iterate
        # over ``db.kindlist_todo.find({...})`` and repeat the steps above.
    finally:
        # Always release the connection, even when the lookup or the
        # extraction fails.
        cli.close()
-
|