import io
from typing import Any, Dict, List, Tuple

import fitz
from PIL import Image


class PDFSplitter:
    """Utility for splitting a scanned PDF into per-page images."""

    @staticmethod
    def split_pdf(pdf_path: str, dpi: int = 300) -> List[Dict[str, Any]]:
        """Render every page of a PDF to PNG and upload each image to MinIO.

        Args:
            pdf_path: Path of the PDF file to split.
            dpi: Rendering resolution passed to ``page.get_pixmap``.
                Defaults to 300, the value that was previously hard-coded.

        Returns:
            One dict per page, in page order, with the keys:
            - page_number: 1-based page number.
            - image: PIL image of the rendered page.
            - image_bytes: ``io.BytesIO`` holding the PNG encoding,
              rewound to position 0 (unless the uploader closed it).
            - image_path: value returned by ``MinIOUtil.upload_file``
              (presumably the object URL -- confirm against MinIOUtil).

        Raises:
            Exception: Any failure is re-raised as a plain ``Exception``
                whose message starts with "PDF拆分失败: "; the original
                error is chained as ``__cause__``.
        """
        import os

        from utils.minio.minio_util import MinIOUtil

        try:
            minio_util = MinIOUtil()

            # Base name without extension, used as the image name prefix.
            pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

            pdf_document = fitz.open(pdf_path)
            try:
                result = []
                for page_number, page in enumerate(pdf_document, start=1):
                    # alpha=False keeps the samples at 3 bytes per pixel,
                    # which is what the "RGB" mode below expects.
                    pix = page.get_pixmap(dpi=dpi, alpha=False)
                    image = Image.frombytes(
                        "RGB", (pix.width, pix.height), pix.samples
                    )

                    # Encode to PNG in memory for the upload and for callers.
                    image_bytes = io.BytesIO()
                    image.save(image_bytes, format='PNG')
                    image_bytes.seek(0)

                    image_filename = f"{pdf_filename}_{page_number}.png"
                    image_url = minio_util.upload_file(
                        image_bytes, image_filename
                    )

                    # The uploader may have consumed the stream; rewind it
                    # so the returned buffer is readable from the start.
                    if not image_bytes.closed:
                        image_bytes.seek(0)

                    result.append({
                        "page_number": page_number,
                        "image": image,
                        "image_bytes": image_bytes,
                        "image_path": image_url
                    })
                return result
            finally:
                # Close the document even when rendering or upload fails.
                pdf_document.close()
        except Exception as e:
            raise Exception(f"PDF拆分失败: {str(e)}") from e

    @staticmethod
    def save_page_image(image: Image.Image, output_path: str) -> None:
        """Save a single page image to disk in PNG format.

        Args:
            image: PIL image to save.
            output_path: Destination file path.
        """
        image.save(output_path, format='PNG')