# pdf_splitter.py (2.9 KB)
import io
from typing import Any, Dict, List, Tuple

import fitz
from PIL import Image
  5. class PDFSplitter:
  6. """PDF扫描件按页拆分工具"""
  7. @staticmethod
  8. def split_pdf(pdf_path: str) -> List[Dict[str, any]]:
  9. """
  10. 将PDF按页拆分,转换为图像并记录页码,同时保存图片到MinIO
  11. Args:
  12. pdf_path: PDF文件路径
  13. Returns:
  14. List[Dict]: 包含每一页信息的列表,每个字典包含:
  15. - page_number: 页码
  16. - image: PIL图像对象
  17. - image_bytes: 图像字节流
  18. - image_path: MinIO中保存的图片URL
  19. """
  20. import os
  21. from utils.minio.minio_util import MinIOUtil
  22. try:
  23. # 初始化MinioUtil
  24. minio_util = MinIOUtil()
  25. # 打开PDF文件
  26. pdf_document = fitz.open(pdf_path)
  27. # 获取PDF文件名(不含扩展名)
  28. pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
  29. result = []
  30. for page_num in range(len(pdf_document)):
  31. # 获取页面
  32. page = pdf_document[page_num]
  33. # 页码从1开始
  34. page_number = page_num + 1
  35. # 将页面转换为图像
  36. # 使用较高分辨率,DPI=300
  37. pix = page.get_pixmap(dpi=300)
  38. # 将fitz pixmap转换为PIL图像
  39. image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
  40. # 将图像转换为字节流,便于后续处理
  41. image_bytes = io.BytesIO()
  42. image.save(image_bytes, format='PNG')
  43. image_bytes.seek(0)
  44. # 生成图片文件名
  45. image_filename = f"{pdf_filename}_{page_number}.png"
  46. # 重置字节流指针
  47. image_bytes.seek(0)
  48. # 上传图片到MinIO,获取URL
  49. image_url = minio_util.upload_file(image_bytes, image_filename)
  50. result.append({
  51. "page_number": page_number,
  52. "image": image,
  53. "image_bytes": image_bytes,
  54. "image_path": image_url
  55. })
  56. # 关闭PDF文件
  57. pdf_document.close()
  58. return result
  59. except Exception as e:
  60. raise Exception(f"PDF拆分失败: {str(e)}")
  61. @staticmethod
  62. def save_page_image(image: Image.Image, output_path: str) -> None:
  63. """
  64. 保存单页图像
  65. Args:
  66. image: PIL图像对象
  67. output_path: 输出文件路径
  68. """
  69. image.save(output_path, format='PNG')