# coding=utf-8
'''
Created on 2016-04-21

Error handling for images.

@author: ChenHao
'''
import http.client
import urllib.request

from pymongo.mongo_client import MongoClient

from util_common import Constant


class ImgFileErrorHandle(object):
    '''Detect component images that are broken and reset their task status.

    The workflow is split into independent steps that are run by hand from
    the ``__main__`` section of this file:

    * ``_reset_status_for_error_img_by_accessible`` and
      ``_reset_status_for_error_img_by_fileName`` only *record* suspicious
      image URLs in the ``error_img_url_uu`` collection.
    * ``_clean_error_status`` sets ``imgTask`` back to ``Constant.TODO`` on
      every ``component_original`` document whose ``img_url_uu`` was recorded.
    '''

    # Number of leading characters dropped from ``img_url_uu`` before it is
    # compared with the stored file names.
    # NOTE(review): presumably the length of a fixed URL prefix (scheme + host
    # + path); confirm against the code that uploads the files.
    _URL_PREFIX_LENGTH = 29

    def __init__(self):
        # The handler keeps no state; every method receives its databases.
        pass

    def _clean_error_status(self, db):
        '''Set ``imgTask`` to ``Constant.TODO`` for every recorded error URL.

        :param db: database exposing the ``error_img_url_uu`` and
            ``component_original`` collections.
        '''
        error_urls = [doc["url"] for doc in db.error_img_url_uu.find()]
        print (error_urls)
        db.component_original.update_many(
            {"img_url_uu": {"$in": error_urls}},
            {"$set": {"imgTask": Constant.TODO}})

    @staticmethod
    def _is_url_accessible(url):
        '''Return True only when fetching *url* answers with HTTP 200.

        ``urlopen`` raises instead of returning a response for 4xx/5xx
        statuses and for network failures, so those are reported as
        inaccessible here rather than aborting the caller's scan.
        '''
        try:
            # The context manager closes the connection on every path.
            with urllib.request.urlopen(url) as response:
                return response.getcode() == 200
        except (OSError, ValueError, http.client.HTTPException):
            # OSError covers urllib.error.URLError / HTTPError and socket
            # errors; ValueError is raised for malformed URLs;
            # HTTPException covers malformed server replies.
            return False

    def _save_new_error_urls(self, db, urls):
        '''Insert the URLs from *urls* that are not recorded yet.

        :param db: database exposing the ``error_img_url_uu`` collection.
        :param urls: iterable of image URLs considered broken.
        '''
        known_urls = {doc["url"] for doc in db.error_img_url_uu.find({})}
        new_docs = [{"url": url} for url in urls if url not in known_urls]
        # insert_many rejects an empty document list, so skip it entirely.
        if new_docs:
            db.error_img_url_uu.insert_many(new_docs)

    # Check whether each image can be fetched over the network.
    def _reset_status_for_error_img_by_accessible(self, db):
        '''Record every finished image URL that cannot be downloaded.

        Reads the distinct ``img_url_uu`` values of ``component_original``
        documents whose ``imgTask`` is ``Constant.DONE``, requests each one,
        and stores the failing URLs in ``error_img_url_uu``.  The status of
        the components is not modified here; see ``_clean_error_status``.

        :param db: database exposing the ``component_original`` and
            ``error_img_url_uu`` collections.
        '''
        urls = db.component_original.find(
            {"imgTask": Constant.DONE}).distinct("img_url_uu")
        # 1. Find the URLs that are not accessible.
        print ("合计", len(urls))
        error_count = 0
        error_urls = set()
        for index, url in enumerate(urls):
            print (url)
            if index % 500 == 0:
                # Periodic progress report: position and errors so far.
                print (index, error_count)
            if not self._is_url_accessible(url):
                error_count += 1
                error_urls.add(url)
        # 2. To be safe, keep a copy of the failures in the database.
        self._save_new_error_urls(db, error_urls)

    # Check whether the pictures database contains the expected file names.
    def _reset_status_for_error_img_by_fileName(self, db, fsdb):
        '''Record every finished image URL that has no stored file.

        Collects the ``filename`` of every document in ``fsdb.fs.files``
        (presumably GridFS metadata -- TODO confirm), then flags each
        ``img_url_uu`` of a ``Constant.DONE`` component whose suffix after
        ``_URL_PREFIX_LENGTH`` characters is not among those names.  New
        failures are stored in ``error_img_url_uu``.

        :param db: database exposing the ``component_original`` and
            ``error_img_url_uu`` collections.
        :param fsdb: database exposing the ``fs.files`` collection.
        '''
        stored_files = fsdb.fs.files.find({}, {"filename": 1, "_id": 0})
        filenames = {doc["filename"] for doc in stored_files}
        print('all files', len(filenames))

        urls = db.component_original.find(
            {"imgTask": Constant.DONE}).distinct("img_url_uu")
        prefix_length = self._URL_PREFIX_LENGTH
        missing_urls = {
            url for url in urls if url[prefix_length:] not in filenames
        }
        print(len(missing_urls))

        # Save the error information.
        self._save_new_error_urls(db, missing_urls)


if __name__ == '__main__':
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider
        fsdb = cli.pictures
        imgFileErrorHandle = ImgFileErrorHandle()
        # The steps are toggled by hand: uncomment a detection step to
        # record broken URLs, then run the clean step to reset them.
        # imgFileErrorHandle._reset_status_for_error_img_by_accessible(db)
        # imgFileErrorHandle._reset_status_for_error_img_by_fileName(db, fsdb)
        imgFileErrorHandle._clean_error_status(db)
    finally:
        # Release the connection even when a step fails.
        cli.close()