ParserForPanel.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. #coding=utf-8
  2. '''
  3. Created on 2016年7月20日
  4. @author: uas
  5. '''
  6. from pymongo.mongo_client import MongoClient
  7. from bs4 import BeautifulSoup
  8. from util_common import Constant
  9. import random
  10. class ParserForPanel(object):
  11. def __init__(self):
  12. self.cli=MongoClient(Constant.MONGODB_URL)
  13. self.db=self.cli.spider
  14. def craw(self):
  15. result=self.db.listPage_panel_todo.find({},{'_id':True,'str_html':True,'kind':True})
  16. cmp_id=0
  17. cmp_list_all=list()
  18. for index,row in enumerate(result):
  19. print(index)
  20. soup=BeautifulSoup(row['str_html'],'html.parser')
  21. cmp_list=soup.findAll('div',class_='pos1')
  22. for r in cmp_list:
  23. try:
  24. base_url='http://www.panelook.cn/'
  25. current_url=base_url+str(r.a.attrs.get('href'))
  26. d=dict()
  27. d['url']=current_url
  28. cmp_id+=1
  29. d['cmp_id']=cmp_id
  30. d['random']=random.random()
  31. d['analysisTask']=1
  32. d['downloadTask']=1
  33. d['kind']=row['kind']
  34. cmp_list_all.append(d)
  35. except:
  36. pass
  37. print("total",str(len(cmp_list_all)))
  38. for index,r in enumerate(cmp_list_all):
  39. # self.db.component_panel_0720.insert_one(r)
  40. self.db.component_panel_0720.update({'url':r['url']},{'$set':{'kind':r['kind']}})
  41. print(index)
  42. if __name__=='__main__':
  43. parser=ParserForPanel()
  44. parser.craw()