FillUp_Missed_KindForPanel.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. #coding=utf-8
  2. from pymongo.mongo_client import MongoClient
  3. from bs4 import BeautifulSoup
  4. from util_common import Constant
  5. import random
  6. import re
  7. class ParserForPanel(object):
  8. def __init__(self):
  9. self.cli=MongoClient(Constant.MONGODB_URL)
  10. self.db=self.cli.spider
  11. def craw(self):
  12. result=self.db.listPage_panel_todo.find({},{'_id':True,'str_html':True})
  13. cmp_id=0
  14. cmp_list_all=list()
  15. for index,r in enumerate(result):
  16. print(index)
  17. soup=BeautifulSoup(r['str_html'],'html.parser')
  18. cmp_list=soup.findAll('div',class_='pos1')
  19. for cmp in cmp_list:
  20. try:
  21. base_url='http://www.panelook.cn/'
  22. current_url=base_url+str(cmp.a.attrs.get('href'))
  23. ##current_url=http://www.panelook.cn/LH240Q29-SH02_LG Display_2.4_CELL_overview_cn_4527.html
  24. ##r['url']=http://www.panelook.cn/adsearch_cn.php?op=TFT-CELL-FOG-COG&st=0&pl=&page=68
  25. pattern=re.compile(r'op=(.+?)&')
  26. kind=re.findall(pattern,r['url'])
  27. if len(kind)>0:
  28. kind=''.join(kind)
  29. else:
  30. pattern=re.compile(r'op=(.+?)')
  31. kind=re.findall(pattern,r['url'])
  32. kind=''.join(kind)
  33. print(kind)
  34. self.db.component_panel_0720.update({'url':current_url},{'$set':{'kind':kind}})
  35. except:
  36. pass
  37. if __name__=='__main__':
  38. parser=ParserForPanel()
  39. parser.craw()