12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
- '''
- Created on 2016-07-27
- @author: uas
- '''
- #coding=utf-8
- from pymongo.mongo_client import MongoClient
- from bs4 import BeautifulSoup
- from util_common import Constant
- import re
- import bs4
- import sys
- # print(sys.getdefaultencoding())
# Extract the "detailed parameters" link and the property table from every
# stored HTML snippet in spider.component_panel_0720 and write the parsed
# result back onto the same document.

# Captures the href of the "详细参数" (detailed parameters) anchor. Compiled
# once because the pattern is identical for every record.
DETAIL_LINK_PATTERN = re.compile(r'<a href="(.+?)">详细参数</a>')


def _first_text(cell):
    """Return the first non-blank text fragment of *cell*, stripped.

    Returns '' when every fragment is blank, and None when the cell yields
    no text fragments at all.
    """
    value = None
    for fragment in cell.strings:
        text = fragment.strip()
        if text:
            return text.replace('\xa0\r\n', '')
        value = text
    return value


def _parse_row(row):
    """Build a {'property': ..., 'value': ...} dict from one table row.

    Child positions count every child node of the row, text nodes included;
    positions 1 and 3 are presumably the name and value cells -- TODO confirm
    against the stored markup.
    """
    entry = {}
    for position, cell in enumerate(row.children):
        if position == 1:
            # .string is only set when the cell has a single child node; the
            # last character is dropped (presumably a trailing colon).
            entry['property'] = cell.string[:-1]
        elif position == 3:
            value = _first_text(cell)
            if value is not None:
                entry['value'] = value
    return entry


def _parse_properties(table):
    """Return the non-empty row dicts for each direct Tag child of *table*."""
    rows = []
    for node in table.children:
        # Skip the text nodes that sit between the rows.
        if not isinstance(node, bs4.element.Tag):
            continue
        entry = _parse_row(node)
        if entry:
            rows.append(entry)
    return rows


cli = MongoClient(Constant.MONGODB_URL)
db = cli.spider
rs = db.component_panel_0720.find(
    {},
    {'_id': True, 'analysisTask': True, 'str_html': True, 'kind': True},
)
for index, r in enumerate(rs):
    print(index)
    try:
        cont_file = r['str_html']
        links = DETAIL_LINK_PATTERN.findall(cont_file)
        if not links:
            print('record %d (_id=%r): no detail link found, skipped'
                  % (index, r.get('_id')), file=sys.stderr)
            continue
        soup = BeautifulSoup(cont_file, 'html.parser')
        table = soup.find('table', class_='gf_tab')
        if table is None:
            print('record %d (_id=%r): no gf_tab table found, skipped'
                  % (index, r.get('_id')), file=sys.stderr)
            continue
        tablelist = _parse_properties(table)
        print(tablelist)
        # update_one replaces the deprecated Collection.update; both modify
        # at most one matching document.
        db.component_panel_0720.update_one(
            {'_id': r['_id']},
            {'$set': {
                'analysisTask': Constant.DONE,
                'properties': tablelist,
                'detail_parameter_url': links[0],
            }},
        )
    except Exception as exc:
        # Best effort: one malformed record must not abort the whole run,
        # but the failure is reported instead of being silently dropped.
        print('record %d (_id=%r) failed: %r'
              % (index, r.get('_id'), exc), file=sys.stderr)
|