# coding=utf-8
'''
Created on 2016-03-14

Derive the category (kind) tree structure by analysing list pages.

@author: ChenHao
'''
from html.parser import HTMLParser
import re

from pymongo.mongo_client import MongoClient

from util_common import Constant

# Breadcrumb links are the <a> elements whose id contains "lnkBreadcrumb".
_BREADCRUMB_ID = re.compile(r"lnkBreadcrumb\w*")


class _BreadcrumbTextParser(HTMLParser):
    """Stdlib fallback: collect the text of every breadcrumb <a> in raw HTML.

    Used only when the caller hands over an HTML string instead of a parsed
    document. The text of each matching anchor is the concatenation of all
    character data inside it (``None`` when the anchor holds no text).
    NOTE(review): for anchors with mixed text/tag children this may differ
    from the ``.string`` attribute of a parsed-document node -- confirm if
    such markup occurs on real list pages.
    """

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.names = []
        # Text chunks of the matching <a> currently being read; None when
        # the parser is not inside a matching anchor.
        self._chunks = None

    def handle_starttag(self, tag, attrs):
        if tag != 'a' or self._chunks is not None:
            return
        anchor_id = dict(attrs).get('id')
        # The pattern is searched anywhere inside the id value.
        if anchor_id and _BREADCRUMB_ID.search(anchor_id):
            self._chunks = []

    def handle_data(self, data):
        if self._chunks is not None:
            self._chunks.append(data)

    def handle_endtag(self, tag):
        if tag == 'a' and self._chunks is not None:
            self.names.append(''.join(self._chunks) or None)
            self._chunks = None


class GetKindTreeFromListPage():
    """Extract the breadcrumb category path from a list page."""

    def __init__(self):
        pass

    def _get_kindlist_by_listPage(self, soup):
        """Return the category names found in the page's breadcrumb links.

        Args:
            soup: either a parsed document exposing
                ``find_all(name, id=pattern)`` whose results have a
                ``.string`` attribute (presumably a BeautifulSoup object --
                the parser library is not imported in this file), or the
                raw HTML of the page as a ``str``.

        Returns:
            A list with the text of every breadcrumb anchor except the
            first one, in document order. Entries may be ``None`` when an
            anchor has no single text value. The list is empty when the
            page has fewer than two breadcrumb anchors.
        """
        if isinstance(soup, str):
            # Raw HTML (this is what the __main__ script reads from MongoDB):
            # parse it with the standard library instead of failing on the
            # missing find_all attribute.
            parser = _BreadcrumbTextParser()
            parser.feed(soup)
            parser.close()
            names = parser.names
        else:
            names = [anchor.string
                     for anchor in soup.find_all('a', id=_BREADCRUMB_ID)]
        # The first breadcrumb entry is not wanted.
        return names[1:]


def main():
    """Print the category path of one finished list page stored in MongoDB."""
    extractor = GetKindTreeFromListPage()
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        page = cli.spider.kindlist_todo.find_one({"status": Constant.DONE})
        if page is None:
            # find_one returns None when no document matches the filter.
            print("no list page with status DONE found")
            return
        print(extractor._get_kindlist_by_listPage(page["str_html"]))
    finally:
        # Release the connection even when the lookup or parsing fails.
        cli.close()


if __name__ == '__main__':
    main()