import concurrent.futures
import io
import os
import threading
from typing import Any, Dict, List, Optional

import fitz
from PIL import Image

from utils.file.minio.minio_util import MinIOUtil
from src.utils.async_utils import ThreadPoolManager


class PDFSplitter:
    """Split a scanned PDF into one rendered image per page."""

    # Resolution used when the caller does not ask for a specific one.
    DEFAULT_DPI = 300

    # PyMuPDF documents that it does not support concurrent use from several
    # threads, yet every worker receives the same ``fitz.Document``.  All
    # access to the document/page/pixmap is therefore serialized through this
    # lock; PNG encoding and the upload still run in parallel.
    _render_lock = threading.Lock()

    @staticmethod
    def _process_page(page_num: int,
                      pdf_document: fitz.Document,
                      pdf_filename: str,
                      dataset_id: Optional[int],
                      is_upload: bool,
                      dpi: int = DEFAULT_DPI) -> Dict[str, Any]:
        """
        Render a single PDF page and optionally upload it as a PNG.

        Args:
            page_num: Zero-based index of the page inside ``pdf_document``.
            pdf_document: The opened PDF document (shared between workers).
            pdf_filename: PDF file name without extension; used as the prefix
                of the generated image file name.
            dataset_id: Dataset ID; its string form is used as the bucket
                name. A falsy value selects the ``"bookpage"`` bucket.
            is_upload: Whether to encode the page as PNG and upload it.
            dpi: Render resolution passed to ``Page.get_pixmap``.

        Returns:
            Dict with the keys ``page_number`` (1-based), ``image`` (PIL
            image), ``image_bytes`` (PNG byte stream, ``None`` when not
            uploading) and ``image_path`` (value returned by the uploader,
            ``None`` when not uploading).
        """
        # Page numbers exposed to callers start at 1.
        page_number = page_num + 1

        # Everything that touches MuPDF objects happens under the lock.
        # Image.frombytes copies the sample data, so the resulting PIL image
        # can be used freely after the lock is released.
        with PDFSplitter._render_lock:
            page = pdf_document[page_num]
            pix = page.get_pixmap(dpi=dpi)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        if not is_upload:
            return {
                "page_number": page_number,
                "image": image,
                "image_bytes": None,
                "image_path": None
            }

        minio_util = MinIOUtil()

        # Encode the page as PNG into an in-memory stream for the upload.
        image_bytes = io.BytesIO()
        image.save(image_bytes, format='PNG')
        image_bytes.seek(0)

        image_filename = f"{pdf_filename}_{page_number}.png"
        bucket_name = str(dataset_id) if dataset_id else "bookpage"
        image_url = minio_util.custom_upload_file(
            file=image_bytes,
            original_filename=image_filename,
            bucket_name=bucket_name
        )

        # Hand the stream back positioned at the start so callers can read it
        # regardless of how much the upload consumed; skip this if the
        # uploader closed the stream.
        if not image_bytes.closed:
            image_bytes.seek(0)

        return {
            "page_number": page_number,
            "image": image,
            "image_bytes": image_bytes,
            "image_path": image_url
        }

    @staticmethod
    def split_pdf(pdf_path: str,
                  dataset_id: Optional[int] = None,
                  is_upload: bool = True,
                  pdf_content: Optional[io.BytesIO] = None,
                  original_filename: Optional[str] = None,
                  dpi: int = DEFAULT_DPI) -> List[Dict[str, Any]]:
        """
        Split a PDF into pages, render each page to an image and optionally
        upload the images.

        Args:
            pdf_path: Path of the PDF file; used when ``pdf_content`` is falsy.
            dataset_id: Dataset ID forwarded to the page worker.
            is_upload: Whether each page image is uploaded.
            pdf_content: In-memory PDF content; takes precedence over
                ``pdf_path`` when provided.
            original_filename: Original file name; when ``pdf_content`` is
                used, its base name (without extension) prefixes the image
                names, falling back to ``"temp_pdf"``.
            dpi: Render resolution for every page.

        Returns:
            One dict per page, sorted by ``page_number``, each containing:
                - page_number: 1-based page number
                - image: PIL image object
                - image_bytes: PNG byte stream (``None`` when not uploading)
                - image_path: value returned by the uploader (``None`` when
                  not uploading)

        Raises:
            Exception: If opening the PDF or processing any page fails; the
                original error is attached as ``__cause__``.
        """
        pdf_document = None
        # Kept outside the try block so the cleanup code can always see which
        # tasks were submitted, even if submission itself fails part-way.
        future_to_page: Dict[concurrent.futures.Future, int] = {}
        try:
            if pdf_content:
                # Open from the in-memory content.
                pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
                if original_filename:
                    pdf_filename = os.path.splitext(os.path.basename(original_filename))[0]
                else:
                    pdf_filename = "temp_pdf"
            else:
                # Open from the file path and derive the name from it.
                pdf_document = fitz.open(pdf_path)
                pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

            # Submit one task per page to the shared "parser" pool.
            for page_num in range(len(pdf_document)):
                future = ThreadPoolManager.submit(
                    "parser",
                    PDFSplitter._process_page,
                    page_num,
                    pdf_document,
                    pdf_filename,
                    dataset_id,
                    is_upload,
                    dpi
                )
                future_to_page[future] = page_num

            # Collect results as they finish; the first failure aborts.
            result = []
            for future in concurrent.futures.as_completed(future_to_page):
                try:
                    result.append(future.result())
                except Exception as e:
                    raise Exception(f"处理页面失败: {str(e)}") from e

            # Completion order is arbitrary, so restore page order.
            result.sort(key=lambda x: x["page_number"])
            return result

        except Exception as e:
            raise Exception(f"PDF拆分失败: {str(e)}") from e
        finally:
            # After a failure some workers may still be queued or running.
            # Cancel what has not started and wait for what has, so that no
            # thread is still using the document when it is closed below.
            for future in future_to_page:
                future.cancel()
            still_running = [f for f in future_to_page if not f.done()]
            if still_running:
                concurrent.futures.wait(still_running)

            # NOTE(review): this shuts down every pool owned by
            # ThreadPoolManager after each call - confirm that later
            # submit() calls are expected to re-create the pools.
            ThreadPoolManager.shutdown_all()

            # Always release the PDF; closing is best-effort on purpose so a
            # close error never masks the real outcome.
            if pdf_document is not None:
                try:
                    pdf_document.close()
                except Exception:
                    pass

    @staticmethod
    def save_page_image(image: Image.Image, output_path: str) -> None:
        """
        Save a single page image as PNG.

        Args:
            image: PIL image object.
            output_path: Destination file path.
        """
        image.save(output_path, format='PNG')