# coding=utf-8
'''
Created on 2016-07-20

@author: uas
'''
import random

from pymongo.mongo_client import MongoClient
from bs4 import BeautifulSoup

from util_common import Constant


class ParserForPanel(object):
    '''Extract component page URLs from stored list pages.

    Reads list-page HTML from the ``listPage_panel_todo`` collection of the
    ``spider`` database and writes each page's ``kind`` onto the matching
    documents of ``component_panel_0720`` (matched by ``url``).
    '''

    # Prefix prepended to every href found on a list page.
    BASE_URL = 'http://www.panelook.cn/'

    def __init__(self):
        '''Connect to the MongoDB instance at ``Constant.MONGODB_URL``.'''
        self.cli = MongoClient(Constant.MONGODB_URL)
        self.db = self.cli.spider

    def craw(self):
        '''Parse every stored list page and update ``kind`` per component URL.

        For each document in ``listPage_panel_todo`` the HTML in ``str_html``
        is parsed, and for every ``<div class="pos1">`` the href of its first
        ``<a>`` tag is joined to ``BASE_URL``. Progress indices and the total
        number of collected entries are printed to stdout.

        Returns:
            None. The result is the set of updates applied to
            ``component_panel_0720``.
        '''
        rows = self.db.listPage_panel_todo.find(
            {}, {'_id': True, 'str_html': True, 'kind': True})

        cmp_id = 0
        cmp_list_all = []
        for index, row in enumerate(rows):
            print(index)
            if 'kind' not in row:
                # Without a kind there is nothing to write back for this page.
                continue
            soup = BeautifulSoup(row['str_html'], 'html.parser')
            for block in soup.find_all('div', class_='pos1'):
                anchor = block.a
                if anchor is None:
                    # No link inside this block.
                    continue
                href = anchor.attrs.get('href')
                if href is None:
                    # Skip rather than build a bogus ".../None" URL.
                    continue
                cmp_id += 1
                cmp_list_all.append({
                    'url': self.BASE_URL + href,
                    'cmp_id': cmp_id,
                    'random': random.random(),
                    'analysisTask': 1,
                    'downloadTask': 1,
                    'kind': row['kind'],
                })

        print("total", str(len(cmp_list_all)))

        # Only 'kind' is written back here; the other entry fields belong to
        # an earlier variant of this step that inserted each entry with
        # insert_one(entry) instead of updating existing documents.
        for index, entry in enumerate(cmp_list_all):
            self.db.component_panel_0720.update_one(
                {'url': entry['url']},
                {'$set': {'kind': entry['kind']}})
            print(index)


if __name__ == '__main__':
    parser = ParserForPanel()
    try:
        parser.craw()
    finally:
        # Release the MongoDB connection even if the crawl fails.
        parser.cli.close()