"""Find invalid picture links.

Lists the ``component_original`` documents (``spider`` database) whose
``imgTask`` equals ``Constant.DONE`` but whose ``img_url_uu`` does not map to
any ``filename`` in the ``fs.files`` collection of the ``pictures`` database.
Each unmatched filename is printed, followed by the number of affected
documents.
"""
from pymongo.mongo_client import MongoClient

from util_common import Constant

# Number of leading characters stripped from ``img_url_uu`` to obtain the
# stored filename.
# NOTE(review): presumably the length of a fixed URL prefix -- confirm against
# the real URL format before changing it.
URL_PREFIX_LEN = 29

# Using the client as a context manager closes its connections even when one
# of the queries below raises.
with MongoClient(Constant.MONGODB_URL) as cli:
    db = cli.spider
    fsdb = cli.pictures

    # Collect every stored filename into a set for O(1) membership tests.
    files = fsdb.fs.files.find({}, {"filename": 1, "_id": 0})
    filenames = {file_doc["filename"] for file_doc in files}
    print('all files', len(filenames))

    # NOTE(review): only the first 100 matching components are inspected --
    # confirm whether this limit is intentional or a debugging leftover.
    cmps = db.component_original.find(
        {"imgTask": Constant.DONE},
        {"img_url_uu": 1},
    ).limit(100)

    # ``_id`` values of the components whose picture file is missing.
    err_cmps = set()
    for component in cmps:
        # .get() instead of [] so a document without the field is treated
        # exactly like one holding null: str(None)[29:] is "", which is
        # reported as a missing file instead of aborting with KeyError.
        cmp_file = str(component.get("img_url_uu"))[URL_PREFIX_LEN:]
        if cmp_file not in filenames:
            print(cmp_file)
            err_cmps.add(component["_id"])

    print(len(err_cmps))

# Disabled snippet kept from the original script: counts the distinct
# ``img_url_mouser`` values of components whose image task is still TODO.
# cmps = db.component_original.find({"imgTask" : Constant.TODO, "img_url_mouser": {"$exists": True}}, {"img_url_mouser":1})
# all_urls = set()
# for cmp in cmps:
#     all_urls.add(cmp["img_url_mouser"])
# print(len(all_urls))