imgfile_missed_by_mongodb.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
# coding=utf-8
'''
Created on 2016-06-27
@author: uas
'''
  6. import os
  7. import random
  8. import threading
  9. import uuid
  10. import urllib.request
  11. import gridfs
  12. from pymongo.mongo_client import MongoClient
  13. from util_common import Constant
  14. from os.path import getsize
  15. class ImgFile_missed(object):
  16. def __init__(self,userName=None,maxThread=100):
  17. self.cli=MongoClient(Constant.MONGODB_URL)
  18. self.db=self.cli.spider
  19. self.fs=gridfs.GridFS(self.cli.pictures)
  20. self.user=userName
  21. self.activeThread=0
  22. self.maxThread=maxThread
  23. self.successsed=0
  24. self.failured=0
  25. self.total=0
  26. self.isLast=False
  27. def _find_uu_mouser_url(self,url):
  28. result=self.db.component_original.find_one({'img_url_mouser':url,"img_url_uu":{'$exists':True}})
  29. if result is not None:
  30. return result['img_url_uu']
  31. return None
  32. #
  33. def _get_imgName_and_url_uu(self):
  34. uuid_str=str(uuid.uuid1())
  35. imgName=uuid_str.replace('-','')+'.jpg'
  36. img_url_uu=Constant.IMG_URL_HEADER+imgName
  37. return imgName,img_url_uu
  38. def _get_Imginfo_in_gridfs(self,filename):
  39. fsdb=self.cli.pictures
  40. rs=fsdb.fs.files.find({'filename':filename})
  41. if rs is not None:
  42. return True
  43. return False
  44. def run(self):
  45. rs=self.db.blank_img.find({'img_url_mouser':{'$ne':''}})
  46. for index,r in enumerate(rs):
  47. print(index,'-->',r['cmp_id'])
  48. if index>1000:
  49. break
  50. # if self._find_uu_mouser_url(r['img_url_mouser']):
  51. # self.db.blank_img.update({'_id':r['_id']},{'$set':{'img_url_uu':self._find_uu_mouser_url(r['img_url_mouser'])}})
  52. # else:
  53. try:
  54. filename=str(random.randint(1,100))+'.jpg'
  55. urllib.request.urlretrieve(r['img_url_mouser'], filename)
  56. while (getsize(filename)<100):
  57. urllib.request.urlretrieve(r['img_url_mouser'], filename)
  58. print(getsize(filename))
  59. imgName,img_url_uu=self._get_imgName_and_url_uu()
  60. with open(filename,'rb')as file:
  61. data=file.read()
  62. self.fs.put(data,content_type="image/jpeg",filename=imgName)
  63. while not (self._get_Imginfo_in_gridfs(filename)):
  64. self.fs.put(data,content_type="image/jpeg",filename=imgName)
  65. self.db.blank_img.update({'_id':r['_id']},{'$set':{'img_url_uu':img_url_uu}})
  66. # u=urllib.request.urlopen(r['img_url_mouser'])
  67. # data=u.read()
  68. # print('1',data)
  69. # with open(filename,'wb') as file:
  70. # print(file.write(data))
  71. except Exception as e:
  72. print(e)
  73. #
  74. # finally:
  75. # if(os.path.exists(filename)):
  76. # os.remove(filename)
  77. if __name__=='__main__':
  78. img_missed=ImgFile_missed()
  79. img_missed.run()