# pdf_splitter.py
import concurrent.futures
import io
import os
from typing import Any, Dict, List, Optional

import fitz
from PIL import Image

from utils.file.minio.minio_util import MinIOUtil
from src.utils.async_utils import ThreadPoolManager
  9. class PDFSplitter:
  10. """PDF扫描件按页拆分工具"""
  11. @staticmethod
  12. def _process_page(page_num: int, pdf_document: fitz.Document, pdf_filename: str, dataset_id: int, is_upload: bool) -> Dict[str, any]:
  13. """
  14. 处理单个PDF页面
  15. Args:
  16. page_num: 页码索引
  17. pdf_document: PDF文档对象
  18. pdf_filename: PDF文件名(不含扩展名)
  19. dataset_id: 数据集ID
  20. is_upload: 是否上传到MinIO
  21. Returns:
  22. Dict: 包含页面信息的字典
  23. """
  24. # 获取页面
  25. page = pdf_document[page_num]
  26. # 页码从1开始
  27. page_number = page_num + 1
  28. # 将页面转换为图像
  29. # 使用较高分辨率,DPI=300
  30. pix = page.get_pixmap(dpi=300)
  31. # 将fitz pixmap转换为PIL图像
  32. image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
  33. if is_upload:
  34. # 初始化MinioUtil
  35. minio_util = MinIOUtil()
  36. # 将图像转换为字节流,便于后续处理
  37. image_bytes = io.BytesIO()
  38. image.save(image_bytes, format='PNG')
  39. image_bytes.seek(0)
  40. # 生成图片文件名
  41. image_filename = f"{pdf_filename}_{page_number}.png"
  42. # 重置字节流指针
  43. image_bytes.seek(0)
  44. # 上传图片到MinIO,获取URL
  45. bucket_name = str(dataset_id) if dataset_id else "bookpage"
  46. image_url = minio_util.custom_upload_file(file=image_bytes, original_filename=image_filename, bucket_name=bucket_name)
  47. return {
  48. "page_number": page_number,
  49. "image": image,
  50. "image_bytes": image_bytes,
  51. "image_path": image_url
  52. }
  53. else:
  54. return {
  55. "page_number": page_number,
  56. "image": image,
  57. "image_bytes": None,
  58. "image_path": None
  59. }
  60. @staticmethod
  61. def split_pdf(pdf_path: str, dataset_id: int = None, is_upload: bool = True, pdf_content: io.BytesIO = None, original_filename: str = None) -> List[Dict[str, any]]:
  62. """
  63. 将PDF按页拆分,转换为图像并记录页码,同时保存图片到MinIO
  64. Args:
  65. pdf_path: PDF文件路径
  66. dataset_id: 数据集ID
  67. is_upload: 是否上传到MinIO
  68. pdf_content: PDF文件内容(字节流),如果提供则优先使用
  69. original_filename: 原始文件名,如果提供则优先使用
  70. Returns:
  71. List[Dict]: 包含每一页信息的列表,每个字典包含:
  72. - page_number: 页码
  73. - image: PIL图像对象
  74. - image_bytes: 图像字节流
  75. - image_path: MinIO中保存的图片URL
  76. """
  77. pdf_document = None
  78. try:
  79. # 打开PDF文件
  80. if pdf_content:
  81. # 使用内存中的PDF内容
  82. pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
  83. # 使用提供的原始文件名
  84. if original_filename:
  85. pdf_filename = os.path.splitext(os.path.basename(original_filename))[0]
  86. else:
  87. pdf_filename = "temp_pdf"
  88. else:
  89. # 使用文件路径
  90. pdf_document = fitz.open(pdf_path)
  91. # 获取PDF文件名(不含扩展名)
  92. pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
  93. # 使用全局线程池管理器进行并行处理
  94. result = []
  95. # 提交所有页面处理任务
  96. future_to_page = {
  97. ThreadPoolManager.submit(
  98. "parser",
  99. PDFSplitter._process_page,
  100. page_num,
  101. pdf_document,
  102. pdf_filename,
  103. dataset_id,
  104. is_upload
  105. ): page_num
  106. for page_num in range(len(pdf_document))
  107. }
  108. # 收集处理结果
  109. for future in concurrent.futures.as_completed(future_to_page):
  110. try:
  111. page_result = future.result()
  112. result.append(page_result)
  113. except Exception as e:
  114. raise Exception(f"处理页面失败: {str(e)}")
  115. # 将result根据page_number排序
  116. result.sort(key=lambda x: x["page_number"])
  117. return result
  118. except Exception as e:
  119. raise Exception(f"PDF拆分失败: {str(e)}")
  120. finally:
  121. ThreadPoolManager.shutdown_all()
  122. # 确保PDF文件总是被关闭
  123. if pdf_document is not None:
  124. try:
  125. pdf_document.close()
  126. except Exception:
  127. pass
  128. @staticmethod
  129. def save_page_image(image: Image.Image, output_path: str) -> None:
  130. """
  131. 保存单页图像
  132. Args:
  133. image: PIL图像对象
  134. output_path: 输出文件路径
  135. """
  136. image.save(output_path, format='PNG')