#coding=utf-8
'''
Created on 2016-07-28

@author: uas
'''
import re

import bs4
from bs4 import BeautifulSoup
from pymongo.mongo_client import MongoClient

from util_common import Constant


class parameterParser(object):
    '''Extract property/value rows from the parameter boxes of an HTML page.'''

    def __init__(self):
        '''Create the Mongo client and keep a handle to its ``spider`` database.'''
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider

    @staticmethod
    def _title_values(cell):
        '''Return the ``title`` attribute of every tag nested inside ``cell``.

        Text nodes and tags without a ``title`` are skipped, so the result
        contains only strings and is always safe to pass to ``str.join``.
        '''
        titles = []
        for node in cell.descendants:
            # Only tags carry attributes; text nodes have no ``get``.
            if not isinstance(node, bs4.element.Tag):
                continue
            title = node.get('title')
            if title is not None:
                titles.append(title)
        return titles

    def craw(self):
        '''Parse ``test.html`` and print every property found in it.

        Each ``<div class="tabwrapB">`` is treated as one property box: its
        heading (``div > h2``) is printed, then each tag child of its table
        is read as a ``<th>`` property name plus a ``<td>`` value.  The value
        is the last text string inside the ``<td>``; when the cell holds no
        text at all, the ``title`` attributes of its nested tags are joined
        with commas instead.

        Returns nothing; all results are written to stdout.
        '''
        # TODO: reading the HTML from ``panel_parameter_0728`` and inserting
        # the parsed rows into ``panel_propertyvalue_0728`` is disabled; the
        # page is loaded from a local dump instead.
        with open('test.html', 'rb') as html_file:
            cont_file = html_file.read().decode('utf-8')

        soup = BeautifulSoup(cont_file, 'html.parser')

        # Iterate over every property box.
        for element in soup.find_all('div', class_='tabwrapB'):
            detno = element.div.h2.string
            print(detno)

            # Iterate over the rows of the box's table.
            for index, row in enumerate(element.table.children):
                print(index)
                # Whitespace between rows shows up as text nodes; skip them.
                if not isinstance(row, bs4.element.Tag):
                    continue

                header, cell = row.th, row.td
                # A row missing either cell has no property/value pair.
                if header is None or cell is None:
                    continue

                prop = header.string
                print("iam here.", prop)

                # Keep the last text string found in the value cell.
                value = ''
                for text in cell.strings:
                    value = text
                    print("i am here", value)

                if value == '':
                    # No text at all: the value is rendered as image(s) whose
                    # ``title`` attributes are used as the value instead.
                    print('there is an image here')
                    value = ','.join(self._title_values(cell))


if __name__ == '__main__':
    parameterparser = parameterParser()
    parameterparser.craw()