123456789101112131415161718192021222324252627 |
"""Find invalid image links.

Loads every ``filename`` from the ``fs.files`` collection of the ``pictures``
database, then scans documents in ``spider.component_original`` whose
``imgTask`` equals ``Constant.DONE`` and reports those whose image file name
is not among the stored files.
"""
from pymongo.mongo_client import MongoClient
from util_common import Constant

# Number of leading characters of ``img_url_uu`` dropped to obtain the stored
# file name.
# NOTE(review): presumably the length of a fixed URL prefix -- confirm against
# the code that writes ``img_url_uu``.
URL_PREFIX_LEN = 29

# Only the first SAMPLE_LIMIT matching components are checked.
SAMPLE_LIMIT = 100

# Using the client as a context manager closes its connections when the scan
# finishes (or fails), instead of leaving them open until interpreter exit.
with MongoClient(Constant.MONGODB_URL) as cli:
    db = cli.spider
    fsdb = cli.pictures

    # Project only the filename; nothing else from the file documents is used.
    files = fsdb.fs.files.find({}, {"filename": 1, "_id": 0})
    filenames = {entry["filename"] for entry in files}
    print('all files', len(filenames))

    cmps = db.component_original.find(
        {"imgTask": Constant.DONE}, {"img_url_uu": 1}
    ).limit(SAMPLE_LIMIT)

    # ``_id`` values of components whose image file is not stored.
    err_cmps = set()
    for component in cmps:
        cmp_file = str(component["img_url_uu"])[URL_PREFIX_LEN:]
        if cmp_file not in filenames:
            print(cmp_file)
            err_cmps.add(component["_id"])

# Total number of components with a missing image file.
print(len(err_cmps))
|