# coding=utf-8
'''
Created on 2016年4月21日
对于图片的错误处理
@author: ChenHao
'''
import urllib.error
import urllib.request

from pymongo.mongo_client import MongoClient

from util_common import Constant

class ImgFileErrorHandle(object):
    """Find component images that were marked done but are broken.

    Two detection strategies are offered (HTTP accessibility and presence of
    the file name in ``fsdb.fs.files``).  Both only *record* the offending
    ``img_url_uu`` values in the ``error_img_url_uu`` collection;
    ``_clean_error_status`` then sets ``imgTask`` back to ``Constant.TODO``
    for every recorded URL.
    """

    # Number of leading characters of ``img_url_uu`` dropped to obtain the
    # stored file name.
    # NOTE(review): presumably the length of a fixed URL prefix -- confirm.
    _URL_PREFIX_LEN = 29

    def __init__(self):
        pass

    def _clean_error_status(self, db):
        """Set ``imgTask`` to ``Constant.TODO`` for every recorded error URL.

        :param db: database holding the ``error_img_url_uu`` and
            ``component_original`` collections.
        """
        error_urls = [doc["url"] for doc in db.error_img_url_uu.find()]
        print(error_urls)
        db.component_original.update_many(
            {"img_url_uu": {"$in": error_urls}},
            {"$set": {"imgTask": Constant.TODO}},
        )

    @staticmethod
    def _is_accessible(url):
        """Return True only if fetching *url* answers with HTTP status 200."""
        try:
            # ``with`` closes the connection as soon as the status is read.
            with urllib.request.urlopen(url) as response:
                return response.getcode() == 200
        except urllib.error.HTTPError as err:
            # urlopen raises for error statuses (404, 500, ...) instead of
            # returning them; the exception owns the response, so close it.
            err.close()
            return False
        except (urllib.error.URLError, ValueError):
            # Unreachable host / refused connection, or a string that is not
            # a usable URL at all: either way the image cannot be fetched.
            return False

    @staticmethod
    def _save_new_error_urls(db, urls):
        """Insert into ``error_img_url_uu`` the *urls* not yet stored there.

        Nothing is written when every URL is already recorded (or *urls* is
        empty), because ``insert_many`` rejects an empty document list.
        """
        known = {doc["url"] for doc in db.error_img_url_uu.find({})}
        new_docs = [{"url": url} for url in urls if url not in known]
        if new_docs:
            db.error_img_url_uu.insert_many(new_docs)

    def _reset_status_for_error_img_by_accessible(self, db):
        """Record every done image URL that cannot be fetched over HTTP.

        Each distinct ``img_url_uu`` of documents with
        ``imgTask == Constant.DONE`` is requested once; URLs that fail or do
        not answer with status 200 are stored in ``error_img_url_uu``.  The
        ``imgTask`` field itself is not modified here.

        :param db: database holding ``component_original`` and
            ``error_img_url_uu``.
        """
        urls = db.component_original.find({"imgTask": Constant.DONE}).distinct("img_url_uu")
        print("合计", len(urls))
        error_urls = set()
        for index, url in enumerate(urls):
            print(url)
            if index % 500 == 0:
                # Progress report: position and number of errors so far.
                print(index, len(error_urls))
            if not self._is_accessible(url):
                error_urls.add(url)

        # Persist the findings first, to be safe; the status reset is a
        # separate step.
        self._save_new_error_urls(db, error_urls)

    def _reset_status_for_error_img_by_fileName(self, db, fsdb):
        """Record every done image URL whose file is absent from ``fsdb``.

        The file name is taken as ``img_url_uu`` without its first
        ``_URL_PREFIX_LEN`` characters and looked up among the ``filename``
        values of ``fsdb.fs.files``.  URLs without a matching file are stored
        in ``error_img_url_uu``; ``imgTask`` is not modified here.

        :param db: database holding ``component_original`` and
            ``error_img_url_uu``.
        :param fsdb: database whose ``fs.files`` collection lists the stored
            picture files.
        """
        stored_names = {
            doc["filename"]
            for doc in fsdb.fs.files.find({}, {"filename": 1, "_id": 0})
        }
        print('all files', len(stored_names))
        done_urls = db.component_original.find({"imgTask": Constant.DONE}).distinct("img_url_uu")
        missing_urls = {
            url for url in done_urls
            if url[self._URL_PREFIX_LEN:] not in stored_names
        }
        print(len(missing_urls))

        # Save the error information.
        self._save_new_error_urls(db, missing_urls)
        
    
    
if __name__ == '__main__':
    # Maintenance entry point: connect to MongoDB, run the selected step,
    # and always release the connection, even if the step raises.
    cli = MongoClient(Constant.MONGODB_URL)
    try:
        db = cli.spider
        fsdb = cli.pictures

        imgFileErrorHandle = ImgFileErrorHandle()

        # Steps are toggled by hand. The two detection steps below record
        # broken image URLs; uncomment one to run it before the reset.
        # imgFileErrorHandle._reset_status_for_error_img_by_accessible(db)
        # imgFileErrorHandle._reset_status_for_error_img_by_fileName(db, fsdb)

        # Reset imgTask for every URL recorded by a detection step.
        imgFileErrorHandle._clean_error_status(db)
    finally:
        cli.close()