# pdffile_modify.py 2.2 KB
  # coding=utf-8
'''
Remove the last page from downloaded PDF attachments.

For every ``component_original`` document whose ``attachTask`` is
``Constant.DONE`` and whose ``attach_download_user`` is not ``'admin'``:

1. download the PDF through the internal file server,
2. rewrite it without its final page,
3. upload the rewritten file to ``Constant.FS_API_UPLOAD``,
4. point the document at the uploaded path and mark it with
   ``attach_download_user = 'admin'`` so it is not selected again,
5. ask the file server to delete the old file.

If the download fails, the document is reset to ``Constant.TODO`` with its
URL cleared. Any other failure is printed and the document is left untouched.

The work runs at module level, exactly as before; there is no entry-point
function.
'''
import os
import random
# ``import urllib`` alone does not load the ``request`` submodule; import it
# explicitly so ``urllib.request.urlretrieve`` does not depend on some other
# package having imported it first.
import urllib.request

import requests
from pymongo.mongo_client import MongoClient
from PyPDF2.pdf import PdfFileReader, PdfFileWriter

from util_common import Constant

cli = MongoClient(Constant.MONGODB_URL)
db = cli.spider
attachs = db.component_original.find(
    {"attachTask": Constant.DONE, "attach_download_user": {'$ne': 'admin'}},
    {"_id": True, "attachUrl_uu": True},
)
fs_api_delete = "http://10.10.100.200:9999/file/delete?path=%s"


def _write_without_last_page(src_path, dst_path):
    '''Write the PDF at ``src_path`` to ``dst_path`` without its final page.

    A source with zero or one page produces an output with no pages, which
    is what the original loop did as well.
    NOTE(review): confirm that replacing a one-page PDF with an empty one is
    really wanted.
    '''
    with open(src_path, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)
        pdf_output = PdfFileWriter()
        for page in range(pdf_input.getNumPages() - 1):
            pdf_output.addPage(pdf_input.getPage(page))
        # Keep the input open until the output is written, as the original
        # script did: the writer serialises pages taken from the reader.
        with open(dst_path, 'wb') as output_stream:
            pdf_output.write(output_stream)


try:
    for attach in attachs:
        # Scratch files in the current directory; removed in ``finally``.
        old_filename = str(random.random()) + ".pdf"
        new_filename = str(random.random()) + ".pdf"
        try:
            try:
                # dfs.ubtoc.com cannot be reached from here, so the host part
                # is swapped for the internal file server's download endpoint.
                urllib.request.urlretrieve(
                    attach['attachUrl_uu'].replace(
                        'dfs.ubtoc.com/',
                        '10.10.100.200:9999/file/download?path='),
                    old_filename)
            except Exception as e:
                # Bad attachment URL: hand the document back to the download
                # task. ``Exception`` rather than a bare ``except`` so that
                # Ctrl-C / SystemExit no longer wipes the record.
                print(attach.get('attachUrl_uu'), e)
                db.component_original.update_one(
                    {'_id': attach["_id"]},
                    {'$set': {'attachTask': Constant.TODO,
                              'attachUrl_uu': None,
                              'attach_download_user': None}})
                continue

            _write_without_last_page(old_filename, new_filename)

            # Upload first and delete the old file only once the new one is
            # safely stored and recorded; deleting up front would lose the
            # attachment whenever the upload failed.
            with open(new_filename, 'rb') as file:
                res = requests.post(Constant.FS_API_UPLOAD, files={'file': file})
            res.raise_for_status()
            res_j = res.json()
            db.component_original.update_one(
                {'_id': attach["_id"]},
                {'$set': {'attachUrl_uu': res_j['path'],
                          'attach_download_user': 'admin'}})
            requests.get(fs_api_delete % attach['attachUrl_uu'])
        except Exception as e:
            # Best effort: report and move on to the next attachment.
            print(attach.get('attachUrl_uu'), e)
        finally:
            # Both streams are closed by now, so removal is safe.
            if os.path.exists(old_filename):
                os.remove(old_filename)
            if os.path.exists(new_filename):
                os.remove(new_filename)
finally:
    # Close the client even if iterating the cursor raises.
    cli.close()