err_img.py 866 B

123456789101112131415161718192021222324252627
  1. from pymongo.mongo_client import MongoClient
  2. from util_common import Constant
  3. '''
  4. 找无效的图片链接
  5. '''
  # Report components whose image file cannot be found among the stored files.
  #
  # Steps:
  #   1. Read every "filename" from the `fs.files` collection of the
  #      `pictures` database.
  #   2. Read components from `spider.component_original` whose "imgTask"
  #      equals Constant.DONE.
  #   3. Print each derived file name that is absent from step 1, then print
  #      how many components were affected.
  cli = MongoClient(Constant.MONGODB_URL)
  db = cli.spider
  fsdb = cli.pictures

  # Only the "filename" field is projected; a set gives O(1) membership tests
  # in the scan below.
  files = fsdb.fs.files.find({}, {"filename": 1, "_id": 0})
  filenames = {entry["filename"] for entry in files}
  print('all files', len(filenames))

  # The query is capped at 100 documents by .limit(100).
  cmps = db.component_original.find(
      {"imgTask": Constant.DONE},
      {"img_url_uu": 1},
  ).limit(100)

  err_cmps = set()
  for component in cmps:
      # The first 29 characters of the URL are discarded; presumably this is a
      # fixed-length URL prefix in front of the stored file name.
      # TODO confirm the prefix length against real data.
      cmp_file = str(component["img_url_uu"])[29:]
      if cmp_file in filenames:
          continue
      print(cmp_file)
      err_cmps.add(component["_id"])

  print(len(err_cmps))
  22. # cmps = db.component_original.find({"imgTask" : Constant.TODO, "img_url_mouser": {"$exists": True}}, {"img_url_mouser":1})
  23. # all_urls = set()
  24. # for cmp in cmps:
  25. # all_urls.add(cmp["img_url_mouser"])
  26. # print(len(all_urls))