# detail_parse_for_panel.py (1.9 KB)
  1. '''
  2. Created on 2016年7月27日
  3. @author: uas
  4. '''
  5. #coding=utf-8
  6. from pymongo.mongo_client import MongoClient
  7. from bs4 import BeautifulSoup
  8. from util_common import Constant
  9. import re
  10. import bs4
  11. import sys
  12. # print(sys.getdefaultencoding())
  13. cli=MongoClient(Constant.MONGODB_URL)
  14. db=cli.spider
  15. rs=db.component_panel_0720.find({},{'_id':True,'analysisTask':True,'str_html':True,'kind':True})
  16. for index, r in enumerate(rs):
  17. try:
  18. print(index)
  19. detail=dict()
  20. cont_file=r['str_html']
  21. pattern=re.compile(r'<a href="(.+?)">详细参数</a>')
  22. result=re.findall(pattern, cont_file) ##直接匹配出group的元素
  23. detail['detail_page']=result[0]
  24. if len(detail['detail_page'])==0:
  25. print(1)
  26. continue
  27. soup=BeautifulSoup(cont_file,'html.parser')
  28. a=soup.find('table',class_='gf_tab')
  29. tablelist=list()
  30. for index,tr in enumerate(a.children):
  31. tr_temp=dict()
  32. if(type(tr)== bs4.element.Tag):
  33. for index,td in enumerate(tr.children):
  34. if index==1:
  35. tr_temp['property']=td.string[:-1]##对于节点中只有一个字节点 用string
  36. if index==3:
  37. for string in td.strings:
  38. if string.strip()=='':
  39. tr_temp['value']=string.strip()
  40. else:
  41. tr_temp['value']=string.strip().replace('\xa0\r\n','')
  42. break
  43. if len(tr_temp)!=0:
  44. tablelist.append(tr_temp)
  45. print(tablelist)
  46. detail['properties']=tablelist
  47. db.component_panel_0720.update({'_id':r['_id']},{'$set':{'analysisTask':Constant.DONE,'properties':detail['properties'],'detail_parameter_url':detail['detail_page']}})
  48. except:
  49. pass