"""
Created on 2016-04-21

Error handling for images.

@author: ChenHao
"""
- from util_common import Constant
- from pymongo.mongo_client import MongoClient
- import urllib.request
class ImgFileErrorHandle(object):
    """Maintenance helpers for component images whose download went wrong.

    The methods operate on MongoDB handles passed in by the caller:

    * ``db.component_original`` -- documents with an ``img_url_uu`` field and
      an ``imgTask`` status field (``Constant.TODO`` / ``Constant.DONE``).
    * ``db.error_img_url_uu`` -- one ``{"url": ...}`` document per image URL
      that has been identified as bad.
    * ``fsdb.fs.files`` -- file metadata documents with a ``filename`` field.
    """

    # Number of leading characters stripped from ``img_url_uu`` to obtain the
    # stored file name.
    # NOTE(review): presumably the length of a fixed URL prefix -- confirm.
    _URL_PREFIX_LENGTH = 29

    def __init__(self):
        pass

    def _clean_error_status(self, db):
        """Set ``imgTask`` back to ``Constant.TODO`` for every recorded bad URL.

        Reads all URLs from ``db.error_img_url_uu``, prints them, and updates
        every ``db.component_original`` document whose ``img_url_uu`` is in
        that list.
        """
        error_img_url_uu_list = [doc["url"] for doc in db.error_img_url_uu.find()]
        print(error_img_url_uu_list)
        db.component_original.update_many(
            {"img_url_uu": {"$in": error_img_url_uu_list}},
            {"$set": {"imgTask": Constant.TODO}},
        )

    @staticmethod
    def _is_url_accessible(url):
        """Return True only if fetching *url* answers with HTTP status 200.

        ``urlopen`` raises instead of returning a response for HTTP error
        statuses and for unreachable hosts, so those failures are caught and
        reported as "not accessible" rather than aborting the whole scan.
        """
        try:
            # The context manager closes the connection for every request.
            with urllib.request.urlopen(url) as response:
                return response.getcode() == 200
        except (OSError, ValueError):
            # OSError covers urllib.error.URLError / HTTPError and low-level
            # socket failures; ValueError is raised for malformed URLs.
            return False

    def _reset_status_for_error_img_by_accessible(self, db):
        """Record every finished image URL that can no longer be fetched.

        Each distinct ``img_url_uu`` of the ``db.component_original``
        documents whose ``imgTask`` is ``Constant.DONE`` is requested once.
        URLs that do not answer with HTTP 200 are inserted into
        ``db.error_img_url_uu`` as ``{"url": ...}`` documents. Progress is
        printed for every URL and summarised every 500 URLs.
        """
        done_urls = db.component_original.find(
            {"imgTask": Constant.DONE}
        ).distinct("img_url_uu")
        print("合计", len(done_urls))

        error_img_url_uu_set = set()
        for index, url in enumerate(done_urls):
            print(url)
            if index % 500 == 0:
                # The URLs are distinct, so the set size is the error count.
                print(index, len(error_img_url_uu_set))
            if not self._is_url_accessible(url):
                error_img_url_uu_set.add(url)

        # Skip the insert when nothing failed, matching the guard used in
        # _reset_status_for_error_img_by_fileName.
        if error_img_url_uu_set:
            db.error_img_url_uu.insert_many(
                [{"url": url} for url in error_img_url_uu_set]
            )

    def _reset_status_for_error_img_by_fileName(self, db, fsdb):
        """Record finished image URLs whose file is missing from ``fsdb``.

        The file name of each distinct ``img_url_uu`` (status
        ``Constant.DONE``) is taken as the URL with its first
        ``_URL_PREFIX_LENGTH`` characters removed. URLs whose file name is not
        among the ``filename`` values in ``fsdb.fs.files`` and that are not
        yet present in ``db.error_img_url_uu`` are inserted there.
        """
        stored_files = fsdb.fs.files.find({}, {"filename": 1, "_id": 0})
        filenames = {entry["filename"] for entry in stored_files}
        print('all files', len(filenames))

        done_urls = db.component_original.find(
            {"imgTask": Constant.DONE}
        ).distinct("img_url_uu")
        prefix_length = self._URL_PREFIX_LENGTH
        err_cmps_img_url_uu = {
            url for url in done_urls if url[prefix_length:] not in filenames
        }
        print(len(err_cmps_img_url_uu))

        # Only URLs that have not been recorded before are inserted.
        old_error_set = {doc["url"] for doc in db.error_img_url_uu.find({})}
        new_error_img_url_uu_list = [
            {"url": url} for url in err_cmps_img_url_uu - old_error_set
        ]
        if new_error_img_url_uu_list:
            db.error_img_url_uu.insert_many(new_error_img_url_uu_list)
-
-
-
if __name__ == '__main__':
    # Script entry point: re-queue every image that is recorded as bad.
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider
        # Handle to the file database; it is the second argument expected by
        # ImgFileErrorHandle._reset_status_for_error_img_by_fileName.
        fsdb = cli.pictures
        imgFileErrorHandle = ImgFileErrorHandle()

        imgFileErrorHandle._clean_error_status(db)
    finally:
        # Close the connection even when the maintenance step raises.
        cli.close()
-
-
|