ErrorHandle.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. # coding=utf-8
'''
Created on 2016-04-21
Error handling for image files.
@author: ChenHao
'''
  7. from util_common import Constant
  8. from pymongo.mongo_client import MongoClient
  9. import urllib.request
  10. class ImgFileErrorHandle(object):
  11. def __init__(self):
  12. pass
  13. def _clean_error_status(self, db):
  14. rs = db.error_img_url_uu.find()
  15. error_img_url_uu_list = list(i["url"] for i in rs)
  16. print (error_img_url_uu_list)
  17. db.component_original.update_many({"img_url_uu": {"$in": error_img_url_uu_list}}, {"$set": {"imgTask": Constant.TODO}})
  18. # 检查网络请求是否可用
  19. def _reset_status_for_error_img_by_accessible(self, db):
  20. rs = db.component_original.find({"imgTask": Constant.DONE}).distinct("img_url_uu")
  21. # 1、找出不正确的
  22. print ("合计", len(rs))
  23. error_cout = 0
  24. error_img_url_uu_set = set()
  25. for index, url in enumerate(rs):
  26. print (url)
  27. if index % 500 == 0:
  28. print (index, error_cout)
  29. response = urllib.request.urlopen(url)
  30. if response.getcode() != 200:
  31. error_cout += 1
  32. error_img_url_uu_set.add(url)
  33. # error_cout += 1
  34. # error_img_url_uu_set.add(url)
  35. # 为了保险,现在数据库里面存一份吧
  36. error_img_url_uu_list = list()
  37. for url in error_img_url_uu_set:
  38. d = dict()
  39. d["url"] = url
  40. error_img_url_uu_list.append(d)
  41. db.error_img_url_uu.insert_many(error_img_url_uu_list)
  42. # 将不正确的清除
  43. # db.component_original.update({"img_url_uu": {"$in": error_img_url_uu_set}}, {"$set": {"imgTask": Constant.TODO}})
  44. # 检查picture Collection 中是否有这些文件名
  45. def _reset_status_for_error_img_by_fileName(self, db, fsdb):
  46. files = fsdb.fs.files.find({}, {"filename":1, "_id":0})
  47. filenames = set()
  48. for file in files:
  49. filenames.add(file["filename"])
  50. print('all files', len(filenames))
  51. cmps_img_url_uu = db.component_original.find({"imgTask":Constant.DONE}).distinct("img_url_uu")
  52. err_cmps_img_url_uu = set()
  53. for img_url_uu in cmps_img_url_uu:
  54. cmp_file = img_url_uu[29:]
  55. if cmp_file not in filenames:
  56. err_cmps_img_url_uu.add(img_url_uu)
  57. print(len(err_cmps_img_url_uu))
  58. # 保存错误信息
  59. rs_error_img_url_uu = db.error_img_url_uu.find({})
  60. old_error_set = set(i["url"] for i in rs_error_img_url_uu)
  61. new_error_img_url_uu_list = list()
  62. for url in err_cmps_img_url_uu:
  63. if url not in old_error_set:
  64. d = dict()
  65. d["url"] = url
  66. new_error_img_url_uu_list.append(d)
  67. if len(new_error_img_url_uu_list) != 0:
  68. db.error_img_url_uu.insert_many(new_error_img_url_uu_list)
  69. if __name__ == '__main__':
  70. cli = MongoClient(Constant.MONGODB_URL)
  71. db = cli.spider
  72. fsdb = cli.pictures
  73. imgFileErrorHandle = ImgFileErrorHandle()
  74. # imgFileErrorHandle._reset_status_for_error_img(db)
  75. # imgFileErrorHandle._reset_status_for_error_img_by_fileName(db, fsdb)
  76. imgFileErrorHandle._clean_error_status(db)
  77. cli.close()