| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758 |
- #coding=utf-8
- from pymongo.mongo_client import MongoClient
- from bs4 import BeautifulSoup
- from util_common import Constant
- import random
- import re
class ParserForPanel(object):
    """Tag stored panel components with the search category of their list page.

    Each document in ``spider.listPage_panel_todo`` holds the HTML of one
    search-result page (``str_html``) together with the address it was
    fetched from (``url``).  The ``op`` query parameter of that address is
    written as ``kind`` onto every ``component_panel_0720`` document whose
    ``url`` matches a component link found on the page.
    """

    # Prefix prepended to the (relative) component links found on a list page.
    _BASE_URL = 'http://www.panelook.cn/'

    # Captures the value of the ``op`` query parameter: everything up to the
    # next ``&`` or the end of the string, so it also works when ``op`` is
    # the last parameter of the URL.
    _KIND_PATTERN = re.compile(r'op=([^&]+)')

    def __init__(self):
        """Connect to ``Constant.MONGODB_URL`` and select the ``spider`` database."""
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider

    @staticmethod
    def _extract_kind(url):
        """Return the value of the ``op`` query parameter of *url*.

        Example: ``...adsearch_cn.php?op=TFT-CELL-FOG-COG&st=0&pl=&page=68``
        yields ``'TFT-CELL-FOG-COG'``.  An empty string is returned when the
        URL carries no non-empty ``op`` parameter.
        """
        match = ParserForPanel._KIND_PATTERN.search(url)
        return match.group(1) if match else ''

    def craw(self):
        """Write the ``kind`` field for every component linked from the list pages.

        Prints the index and the extracted kind of each page as progress
        output.  Pages without ``url`` or ``str_html``, and result entries
        without a link, are skipped.  A failing database update is reported
        and does not stop the run.
        """
        # Imported here so the block does not depend on a new file-level import.
        from pymongo.errors import PyMongoError

        # 'url' has to be part of the projection: the kind is parsed from it.
        pages = self.db.listPage_panel_todo.find(
            {}, {'_id': True, 'str_html': True, 'url': True})

        for index, page in enumerate(pages):
            print(index)
            page_url = page.get('url')
            html = page.get('str_html')
            if not page_url or not html:
                continue

            # The kind only depends on the page, so compute it once per page.
            kind = self._extract_kind(page_url)
            print(kind)

            soup = BeautifulSoup(html, 'html.parser')
            for panel_div in soup.find_all('div', class_='pos1'):
                anchor = panel_div.a
                href = anchor.attrs.get('href') if anchor is not None else None
                if not href:
                    # No usable link in this entry, so nothing to match on.
                    continue

                # e.g. http://www.panelook.cn/LH240Q29-SH02_LG Display_2.4_CELL_overview_cn_4527.html
                current_url = self._BASE_URL + str(href)
                try:
                    self.db.component_panel_0720.update_one(
                        {'url': current_url}, {'$set': {'kind': kind}})
                except PyMongoError as exc:
                    # Best effort: report the failure and go on with the rest.
                    print('failed to update %s: %s' % (current_url, exc))
-
# Script entry point: build the parser and process every stored list page.
if __name__ == '__main__':
    ParserForPanel().craw()
|