| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157 |
import concurrent.futures
import io
import os
from typing import Any, Dict, List, Optional

import fitz
from PIL import Image

from src.utils.async_utils import ThreadPoolManager
from utils.file.minio.minio_util import MinIOUtil
class PDFSplitter:
    """Split a scanned PDF into per-page images.

    Each page is rasterised to an RGB PIL image. When uploading is enabled the
    image is additionally PNG-encoded and stored in MinIO.

    Threading model: PyMuPDF's documentation states that it does not support
    multi-threaded use, so every call that touches the ``fitz`` document is
    made from the calling thread. Only the PNG encoding and the MinIO upload,
    which no longer reference the document, are handed to the thread pool.
    """

    @staticmethod
    def _render_page(page: "fitz.Page", dpi: int = 300) -> "Image.Image":
        """Rasterise a single PDF page into an RGB PIL image.

        Args:
            page: The PyMuPDF page to render.
            dpi: Rendering resolution in dots per inch.

        Returns:
            A PIL image holding a copy of the rendered pixels.
        """
        # alpha=False is PyMuPDF's default; it is spelled out because the
        # "RGB" mode below requires exactly three bytes per pixel.
        pix = page.get_pixmap(dpi=dpi, alpha=False)
        return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    @staticmethod
    def _upload_image(
        image: "Image.Image",
        image_filename: str,
        dataset_id: Optional[int],
    ) -> Dict[str, Any]:
        """PNG-encode ``image`` and upload it to MinIO.

        Args:
            image: The rendered page image.
            image_filename: Object name to store the image under.
            dataset_id: Dataset id; its string form is used as the bucket
                name. A falsy value selects the ``"bookpage"`` bucket.

        Returns:
            A dict with ``image_bytes`` (the PNG stream) and ``image_path``
            (the value returned by ``MinIOUtil.custom_upload_file``).
        """
        image_bytes = io.BytesIO()
        image.save(image_bytes, format='PNG')
        image_bytes.seek(0)

        bucket_name = str(dataset_id) if dataset_id else "bookpage"
        image_url = MinIOUtil().custom_upload_file(
            file=image_bytes,
            original_filename=image_filename,
            bucket_name=bucket_name,
        )

        # The upload presumably advances the stream; rewind it so callers can
        # read the returned PNG from the start. Skipped if the uploader
        # closed the stream, since seeking a closed BytesIO raises.
        if not image_bytes.closed:
            image_bytes.seek(0)

        return {"image_bytes": image_bytes, "image_path": image_url}

    @staticmethod
    def _process_page(
        page_num: int,
        pdf_document: "fitz.Document",
        pdf_filename: str,
        dataset_id: Optional[int],
        is_upload: bool,
        dpi: int = 300,
    ) -> Dict[str, Any]:
        """Render one page and optionally upload it.

        This touches ``pdf_document`` and therefore must not be run
        concurrently from several threads on the same document.

        Args:
            page_num: Zero-based page index.
            pdf_document: The open PyMuPDF document.
            pdf_filename: PDF file name without extension, used as the image
                name prefix.
            dataset_id: Dataset id used to pick the MinIO bucket.
            is_upload: Whether to PNG-encode and upload the image.
            dpi: Rendering resolution in dots per inch.

        Returns:
            A dict with ``page_number`` (1-based), ``image``, ``image_bytes``
            and ``image_path``; the last two are ``None`` when not uploading.
        """
        page_number = page_num + 1
        image = PDFSplitter._render_page(pdf_document[page_num], dpi)

        page_info: Dict[str, Any] = {
            "page_number": page_number,
            "image": image,
            "image_bytes": None,
            "image_path": None,
        }
        if is_upload:
            image_filename = f"{pdf_filename}_{page_number}.png"
            page_info.update(
                PDFSplitter._upload_image(image, image_filename, dataset_id)
            )
        return page_info

    @staticmethod
    def split_pdf(
        pdf_path: str,
        dataset_id: Optional[int] = None,
        is_upload: bool = True,
        pdf_content: Optional[io.BytesIO] = None,
        original_filename: Optional[str] = None,
        dpi: int = 300,
    ) -> List[Dict[str, Any]]:
        """Split a PDF into page images, optionally uploading each to MinIO.

        Args:
            pdf_path: Path of the PDF file; used when ``pdf_content`` is not
                given.
            dataset_id: Dataset id used to pick the MinIO bucket.
            is_upload: Whether to upload the page images to MinIO.
            pdf_content: In-memory PDF content; takes precedence over
                ``pdf_path`` when provided.
            original_filename: Original file name used to derive the image
                name prefix when ``pdf_content`` is given; ``"temp_pdf"`` is
                used if it is missing.
            dpi: Rendering resolution in dots per inch.

        Returns:
            One dict per page, ordered by page number, each containing:
                - page_number: 1-based page number
                - image: PIL image of the page
                - image_bytes: PNG byte stream (``None`` if not uploaded)
                - image_path: upload result from MinIO (``None`` if not
                  uploaded)

        Raises:
            Exception: If opening, rendering or uploading fails. The original
                error is attached as ``__cause__``.
        """
        pdf_document = None
        try:
            if pdf_content:
                # Open from memory and derive the name from the caller's hint.
                pdf_document = fitz.open(stream=pdf_content, filetype="pdf")
                if original_filename:
                    pdf_filename = os.path.splitext(
                        os.path.basename(original_filename)
                    )[0]
                else:
                    pdf_filename = "temp_pdf"
            else:
                pdf_document = fitz.open(pdf_path)
                pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

            result: List[Dict[str, Any]] = []
            future_to_page: Dict[Any, Dict[str, Any]] = {}

            # Render serially in this thread; pages are appended in document
            # order, so no sort is needed afterwards. Uploads for already
            # rendered pages run in the pool while later pages are rendered.
            for page_number, page in enumerate(pdf_document, start=1):
                try:
                    image = PDFSplitter._render_page(page, dpi)
                    page_info = {
                        "page_number": page_number,
                        "image": image,
                        "image_bytes": None,
                        "image_path": None,
                    }
                    if is_upload:
                        future = ThreadPoolManager.submit(
                            "parser",
                            PDFSplitter._upload_image,
                            image,
                            f"{pdf_filename}_{page_number}.png",
                            dataset_id,
                        )
                        future_to_page[future] = page_info
                except Exception as e:
                    raise Exception(f"处理页面失败: {str(e)}") from e
                result.append(page_info)

            # Merge upload results into the matching page entries.
            for future in concurrent.futures.as_completed(future_to_page):
                try:
                    future_to_page[future].update(future.result())
                except Exception as e:
                    raise Exception(f"处理页面失败: {str(e)}") from e

            return result
        except Exception as e:
            raise Exception(f"PDF拆分失败: {str(e)}") from e
        finally:
            # NOTE(review): this shuts down every pool of the global manager,
            # not only "parser". Kept from the original; confirm that
            # ThreadPoolManager recreates pools on the next submit.
            ThreadPoolManager.shutdown_all()
            # Always release the document; closing is best-effort so that a
            # close failure cannot mask the real error.
            if pdf_document is not None:
                try:
                    pdf_document.close()
                except Exception:
                    pass

    @staticmethod
    def save_page_image(image: "Image.Image", output_path: str) -> None:
        """Save a single page image as PNG.

        Args:
            image: PIL image to save.
            output_path: Destination file path.
        """
        image.save(output_path, format='PNG')
|