# pdffile_dup_del.py
  1. # coding=utf-8
  2. '''
  3. duplication deletion
  4. '''
  5. from pymongo.mongo_client import MongoClient
  6. from util_common import Constant
  7. import requests
  8. cli = MongoClient(Constant.MONGODB_URL)
  9. db = cli.spider
  10. attachs = db.component_original.find({"attachTask" : Constant.DONE}, {"_id": True, "attachUrl": True, "attachUrl_uu": True})
  11. fs_api_delete = "http://10.10.100.200:9999/file/delete?path=%s"
  12. attach_ids = set()
  13. for attach in attachs:
  14. if attach["_id"] not in attach_ids:
  15. try:
  16. oth_attachs = db.component_original.find({"attachTask" : Constant.DONE, "_id": {"$ne": attach["_id"]}, "attachUrl": attach["attachUrl"], "attachUrl_uu": {"$ne": attach["attachUrl_uu"]}}, {"_id": True, "attachUrl_uu": True})
  17. for oth_attach in oth_attachs:
  18. requests.get(fs_api_delete % oth_attach['attachUrl_uu'])
  19. db.component_original.update_one({'_id': oth_attach["_id"]}, {'$set': {'attachUrl_uu': attach['attachUrl_uu'], 'attach_download_user': 'duplication'}})
  20. attach_ids.add(oth_attach["_id"])
  21. print('delete', attach['attachUrl'], oth_attach['attachUrl_uu'])
  22. except Exception as e:
  23. print(attach['attachUrl_uu'], e)
  24. continue
  25. cli.close()