# pdf_splitter.py (2.9 KB)
import io
import os
from typing import Any, Dict, List, Tuple

import fitz
from PIL import Image
  5. class PDFSplitter:
  6. """PDF扫描件按页拆分工具"""
  7. @staticmethod
  8. def split_pdf(pdf_path: str) -> List[Dict[str, any]]:
  9. """
  10. 将PDF按页拆分,转换为图像并记录页码,同时保存图片到指定目录
  11. Args:
  12. pdf_path: PDF文件路径
  13. Returns:
  14. List[Dict]: 包含每一页信息的列表,每个字典包含:
  15. - page_number: 页码
  16. - image: PIL图像对象
  17. - image_bytes: 图像字节流
  18. - image_path: 保存的图片路径
  19. """
  20. import os
  21. from pathlib import Path
  22. try:
  23. # 打开PDF文件
  24. pdf_document = fitz.open(pdf_path)
  25. # 获取PDF文件名(不含扩展名)
  26. pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
  27. # 输出目录
  28. output_dir = r"d:\project\work\ragflow_plugs\book\output\temp"
  29. # 确保输出目录存在
  30. os.makedirs(output_dir, exist_ok=True)
  31. result = []
  32. for page_num in range(len(pdf_document)):
  33. # 获取页面
  34. page = pdf_document[page_num]
  35. # 页码从1开始
  36. page_number = page_num + 1
  37. # 将页面转换为图像
  38. # 使用较高分辨率,DPI=300
  39. pix = page.get_pixmap(dpi=300)
  40. # 将fitz pixmap转换为PIL图像
  41. image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
  42. # 将图像转换为字节流,便于后续处理
  43. image_bytes = io.BytesIO()
  44. image.save(image_bytes, format='PNG')
  45. image_bytes.seek(0)
  46. # 保存图片到指定目录
  47. image_filename = f"{pdf_filename}_{page_number}.png"
  48. image_path = os.path.join(output_dir, image_filename)
  49. image.save(image_path, format='PNG')
  50. result.append({
  51. "page_number": page_number,
  52. "image": image,
  53. "image_bytes": image_bytes,
  54. "image_path": image_path
  55. })
  56. # 关闭PDF文件
  57. pdf_document.close()
  58. return result
  59. except Exception as e:
  60. raise Exception(f"PDF拆分失败: {str(e)}")
  61. @staticmethod
  62. def save_page_image(image: Image.Image, output_path: str) -> None:
  63. """
  64. 保存单页图像
  65. Args:
  66. image: PIL图像对象
  67. output_path: 输出文件路径
  68. """
  69. image.save(output_path, format='PNG')