# coding=utf-8
'''
Created on July 20, 2016
@author: uas
'''
- from pymongo.mongo_client import MongoClient
- from bs4 import BeautifulSoup
- from util_common import Constant
- import random
class ParserForPanel(object):
    '''Back-fill the ``kind`` of stored components from saved list pages.

    Reads the HTML snapshots stored in the ``listPage_panel_todo``
    collection of the ``spider`` database, extracts the link found in each
    ``<div class="pos1">`` element, and sets ``kind`` on the matching
    document (matched by ``url``) in ``component_panel_0720``.
    '''

    # Prefix that the hrefs found on the list pages are appended to.
    BASE_URL = 'http://www.panelook.cn/'

    def __init__(self):
        '''Connect to MongoDB at ``Constant.MONGODB_URL`` and select the ``spider`` database.'''
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider

    def craw(self):
        '''Parse every stored list page and write each link's ``kind`` back.

        Progress is printed to stdout: the index of each list page while
        parsing, the total number of extracted links, then the index of
        each record while updating.

        Only existing documents are touched: ``update_one`` is called
        without ``upsert``, so a URL that is not already present in
        ``component_panel_0720`` is left alone.

        Raises:
            KeyError: if a list-page document has no ``str_html`` field.
        '''
        rows = self.db.listPage_panel_todo.find(
            {}, {'_id': True, 'str_html': True, 'kind': True})

        cmp_list_all = []
        for index, row in enumerate(rows):
            print(index)
            # Ids continue from the number of records collected so far, so
            # they stay consecutive across pages.
            cmp_list_all.extend(
                self._extract_components(row, start_id=len(cmp_list_all)))

        print("total", str(len(cmp_list_all)))
        for index, record in enumerate(cmp_list_all):
            # update_one replaces the legacy Collection.update (removed in
            # PyMongo 4); the legacy call defaulted to multi=False, so the
            # single-document semantics are the same.
            self.db.component_panel_0720.update_one(
                {'url': record['url']},
                {'$set': {'kind': record['kind']}})
            print(index)

    def _extract_components(self, row, start_id=0):
        '''Return one record per usable ``div.pos1`` link in a list-page row.

        Args:
            row: list-page document; ``str_html`` is required, ``kind`` is
                copied onto every record.
            start_id: number of records already produced; the records
                returned here get ``cmp_id`` values ``start_id + 1``,
                ``start_id + 2``, ...

        Returns:
            A list of dicts with the keys ``url``, ``cmp_id``, ``random``,
            ``analysisTask``, ``downloadTask`` and ``kind``. Empty when the
            row has no ``kind`` or the page contains no usable link.
        '''
        html = row['str_html']
        if 'kind' not in row:
            # Nothing to write back for this page; skip it explicitly
            # instead of failing once per link.
            return []
        kind = row['kind']

        records = []
        soup = BeautifulSoup(html, 'html.parser')
        for block in soup.find_all('div', class_='pos1'):
            anchor = block.a
            href = anchor.attrs.get('href') if anchor is not None else None
            if href is None:
                # No <a> or no href: there is no URL to match against.
                continue
            # NOTE: craw() only uses 'url' and 'kind'; the remaining fields
            # keep the record shape that an insert of the whole record
            # would store.
            records.append({
                'url': self.BASE_URL + href,
                'cmp_id': start_id + len(records) + 1,
                'random': random.random(),
                'analysisTask': 1,
                'downloadTask': 1,
                'kind': kind,
            })
        return records
-
-
# Script entry point: build the parser and run the back-fill once.
if __name__ == '__main__':
    ParserForPanel().craw()
|