detail_parameter_parseforpanel.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. #coding=utf-8
  2. '''
  3. Created on 2016年7月28日
  4. @author: uas
  5. '''
  6. from pymongo.mongo_client import MongoClient
  7. from bs4 import BeautifulSoup
  8. import re
  9. from util_common import Constant
  10. import bs4
  11. class parameterParser(object):
  12. def __init__(self):
  13. self.cli=MongoClient(Constant.MONGODB_URL)
  14. self.db=self.cli.spider
  15. def craw(self):
  16. result=self.db.panel_parameter_0728.find({'downloadTask':2},{'str_html':True,'_id':True,'kind':True,'cmp_id':True}).limit(1)
  17. # for r in result:
  18. # print(r['str_html'])
  19. # print(type(r['str_html']))
  20. # filename='test.html'
  21. # with open(filename,'wb')as file:
  22. # file.write(r['str_html'].encode('utf-8'))
  23. with open('test.html','rb')as file:
  24. cont_file=file.read()
  25. cont_file=cont_file.decode('utf-8')
  26. # for index,row in enumerate(result): ##开始遍历每一个器件
  27. #
  28. # cont_file=result['str_html']
  29. soup=BeautifulSoup(cont_file,'html.parser')
  30. a=soup.find_all('div',class_='tabwrapB')
  31. for element in a: #遍历每个属性框
  32. detno=element.div.h2.string
  33. print(detno)
  34. for index,r in enumerate(element.table.children): ##遍历属性框中的阵阵属性值
  35. print(index)
  36. if type(r)==bs4.element.Tag:
  37. d=dict()
  38. d['property']=r.th.string
  39. print("iam here.",d['property'])
  40. value=''
  41. for string in r.td.strings:
  42. if string is not None:
  43. value=string
  44. print("i am here",value)
  45. if value=='':
  46. print('there is an image here')
  47. value_list=list()
  48. for child in r.td.descendants:
  49. try:
  50. value_list.append(child.get('title'))
  51. except:
  52. pass
  53. value=','.join(value_list)
  54. # d['value']=value
  55. # d['detno']=detno
  56. # d['cmp_id']=row['cmp_id']
  57. # d['kind']=row['kind']
  58. #
  59. #
  60. # self.db.panel_propertyvalue_0728.insert_one(d)
  61. #
  62. #
  63. #
  64. if __name__=='__main__':
  65. parameterparser=parameterParser()
  66. parameterparser.craw()