1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- #coding=utf-8
- '''
- Created on 2016年7月28日
- @author: uas
- '''
- from pymongo.mongo_client import MongoClient
- from bs4 import BeautifulSoup
- import re
- from util_common import Constant
- import bs4
class parameterParser(object):
    """Extract property/value pairs from a saved component-parameter page.

    The parser opens a MongoDB connection on construction and, in
    :meth:`craw`, walks every ``div.tabwrapB`` box of the HTML stored in
    ``test.html``, printing the properties it finds.
    """

    def __init__(self):
        """Connect to ``Constant.MONGODB_URL`` and select the ``spider`` db."""
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider

    @staticmethod
    def _cell_value(cell):
        """Return the value held by one property cell.

        The value is the last string yielded by ``cell.strings``.  When the
        cell yields no non-empty final string, it is assumed to hold images
        and the value becomes the comma-joined ``title`` attributes of its
        descendants.

        Args:
            cell: An object exposing ``strings`` and ``descendants``
                iterables (in :meth:`craw` this is the row's ``td`` tag).

        Returns:
            The cell text, or the joined titles, or ``''`` when neither
            exists.
        """
        value = ''
        for text in cell.strings:
            if text is not None:
                value = text
                print("i am here", value)
        if value != '':
            return value

        print('there is an image here')
        titles = []
        for node in cell.descendants:
            try:
                title = node.get('title')
            except AttributeError:
                # Text nodes have no ``get``; only tags carry attributes.
                continue
            # A tag without a title yields None, which str.join rejects.
            if title is not None:
                titles.append(title)
        return ','.join(titles)

    def craw(self):
        """Parse ``test.html`` and print each property box and its rows.

        Rows that are not tags, or that lack a ``th`` or ``td`` cell, are
        skipped.  Nothing is returned and nothing is written to the
        database yet.
        """
        # NOTE: this cursor is never iterated; the page is read from
        # test.html below instead of from the stored 'str_html' field.
        result = self.db.panel_parameter_0728.find(
            {'downloadTask': 2},
            {'str_html': True, '_id': True, 'kind': True, 'cmp_id': True},
        ).limit(1)

        with open('test.html', 'rb') as fh:
            cont_file = fh.read().decode('utf-8')

        soup = BeautifulSoup(cont_file, 'html.parser')
        # Iterate over every property box.
        for element in soup.find_all('div', class_='tabwrapB'):
            detno = element.div.h2.string
            print(detno)

            # Iterate over the property rows inside this box.
            for index, r in enumerate(element.table.children):
                print(index)
                if not isinstance(r, bs4.element.Tag):
                    continue
                if r.th is None or r.td is None:
                    # Not a property/value row; nothing to extract.
                    continue

                d = dict()
                d['property'] = r.th.string
                print("iam here.", d['property'])
                value = self._cell_value(r.td)

                # TODO: the earlier draft stored d with 'value', 'detno' and
                # the source row's 'cmp_id' and 'kind' through
                # self.db.panel_propertyvalue_0728.insert_one(d).
-
-
if __name__ == '__main__':
    # Script entry point: build the parser and run a single crawl pass.
    parameterParser().craw()
-
|