| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
import io
import os
from typing import Any, Dict, List, Tuple

import fitz
from PIL import Image
class PDFSplitter:
    """Utility for splitting a scanned PDF into one image per page."""

    @staticmethod
    def split_pdf(pdf_path: str, dpi: int = 300) -> List[Dict[str, Any]]:
        """
        Render every page of a PDF to a PNG image and upload it to MinIO.

        All rendered pages are kept in memory and returned together, so the
        memory footprint grows with the page count and the chosen DPI.

        Args:
            pdf_path: Path of the PDF file to split.
            dpi: Rendering resolution passed to PyMuPDF. Defaults to 300,
                the value that was previously hard-coded.

        Returns:
            One dict per page, in page order, with the keys:
                - page_number: 1-based page number.
                - image: PIL image of the rendered page.
                - image_bytes: ``io.BytesIO`` holding the PNG-encoded page,
                  rewound to the start when the stream is still open.
                - image_path: Value returned by ``MinIOUtil.upload_file``
                  (expected to be the URL of the stored image).

        Raises:
            Exception: If opening, rendering, encoding or uploading fails.
                The message is prefixed with "PDF拆分失败" and the original
                error is attached as ``__cause__``.
        """
        # Imported at call time rather than at module level, as in the
        # original code.
        from utils.file.minio.minio_util import MinIOUtil

        try:
            minio_util = MinIOUtil()

            # File name without directory or extension; used as the prefix
            # of every uploaded page image name.
            pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

            result = []
            # The context manager closes the document even when rendering
            # or uploading a page raises.
            with fitz.open(pdf_path) as pdf_document:
                for page_number, page in enumerate(pdf_document, start=1):
                    # alpha=False yields 3 bytes per pixel, which is the
                    # layout the "RGB" decode below relies on.
                    pix = page.get_pixmap(dpi=dpi, alpha=False)
                    image = Image.frombytes(
                        "RGB", (pix.width, pix.height), pix.samples
                    )

                    # Encode to PNG in memory so the page can be uploaded
                    # without touching the local file system.
                    image_bytes = io.BytesIO()
                    image.save(image_bytes, format='PNG')
                    image_bytes.seek(0)

                    image_filename = f"{pdf_filename}_{page_number}.png"
                    image_url = minio_util.upload_file(
                        image_bytes, image_filename
                    )

                    # NOTE(review): the upload presumably consumes the
                    # stream; rewind it so callers can read the PNG data
                    # from the beginning. Skipped if the uploader closed it.
                    if not image_bytes.closed:
                        image_bytes.seek(0)

                    result.append({
                        "page_number": page_number,
                        "image": image,
                        "image_bytes": image_bytes,
                        "image_path": image_url,
                    })

            return result
        except Exception as e:
            # Same exception type and message as before; chaining keeps the
            # original traceback available for debugging.
            raise Exception(f"PDF拆分失败: {e}") from e

    @staticmethod
    def save_page_image(image: Image.Image, output_path: str) -> None:
        """
        Save a single page image to disk as PNG.

        Args:
            image: PIL image to write.
            output_path: Destination file path. The image is always encoded
                as PNG, regardless of the path's extension.
        """
        image.save(output_path, format='PNG')
|