| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- """PDF拆分功能验证脚本"""
- import sys
- import os
- from pathlib import Path
- sys.path.append(os.path.dirname(os.path.abspath(__file__)))
def check_pymupdf():
    """Report whether PyMuPDF (imported as ``fitz``) can be imported.

    Prints a status line and, on success, the detected version string.

    Returns:
        bool: True when ``import fitz`` succeeds; False when the import
        raises ImportError or any other exception.
    """
    print("检查PyMuPDF是否安装...")
    # Keep the try body limited to the import so that a problem while
    # reading the version cannot turn a working install into a failure.
    try:
        import fitz
    except ImportError:
        print("✗ 未安装PyMuPDF,请运行: pip install PyMuPDF")
        return False
    except Exception as e:
        # The import itself can fail for reasons other than absence;
        # report the error instead of crashing the verification script.
        print(f"✗ 检查PyMuPDF时出错: {e}")
        return False

    # NOTE(review): not every PyMuPDF release is guaranteed to define
    # ``__version__``; ``VersionBind`` is the binding-version constant
    # listed in the PyMuPDF docs, so it is used as a fallback -- confirm
    # against the versions this project supports.
    version = getattr(fitz, "__version__", None) or getattr(
        fitz, "VersionBind", "未知"
    )
    print("✓ PyMuPDF已安装")
    print(f" 版本: {version}")
    return True
def check_pdf_file(pdf_path):
    """Check that ``pdf_path`` points at an existing regular file.

    Prints the path being checked and either the file size in bytes or a
    "not found" message.

    Args:
        pdf_path: Path to the PDF, as a string or path-like object.

    Returns:
        bool: True if the path is an existing file, False otherwise
        (including when the path is a directory).
    """
    print(f"检查PDF文件: {pdf_path}")
    pdf = Path(pdf_path)
    # is_file() rather than exists(): a directory at this path would pass
    # exists() but cannot be split as a PDF.
    if not pdf.is_file():
        print(f"✗ PDF文件不存在: {pdf_path}")
        return False

    print(f"✓ PDF文件存在,大小: {pdf.stat().st_size} 字节")
    return True
def test_pdf_splitter(pdf_path=None):
    """Run the PDF split verification and print a step-by-step report.

    The function checks that PyMuPDF is importable and that the PDF
    exists, then imports ``PDFSplitter`` and splits the document,
    printing a summary of the first three pages.

    Args:
        pdf_path: Path of the PDF to split. When omitted, the sample
            book path this script was originally written against is used.

    Returns:
        bool: True if both pre-checks pass and the split completes
        without raising; False otherwise.
    """
    if pdf_path is None:
        pdf_path = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"
    separator = "=" * 50

    print(separator)
    print("PDF拆分功能验证")
    print(separator)

    # Run both pre-checks before bailing out so the report shows every
    # missing prerequisite in a single run.
    pymupdf_ok = check_pymupdf()
    print()

    pdf_ok = check_pdf_file(pdf_path)
    print()

    if not pymupdf_ok:
        print(separator)
        print("PyMuPDF安装指南:")
        print("1. 运行命令安装: pip install PyMuPDF")
        print("2. 安装完成后重试")
        print(separator)
        return False

    if not pdf_ok:
        print(separator)
        # Build the hint from pdf_path so it always names the path that
        # was actually checked instead of a second hard-coded copy.
        print(f"请确保PDF文件存在: {pdf_path}")
        print("或修改脚本中的pdf_path变量为实际的PDF文件路径")
        print(separator)
        return False

    print("开始测试PDF拆分功能...")
    try:
        # Imported lazily so a missing or broken project module is
        # reported as a split failure rather than an import-time crash.
        from services.pdf_parser.pdf_splitter import PDFSplitter

        splitter = PDFSplitter()
        print(f"正在拆分PDF: {pdf_path}")
        pages = splitter.split_pdf(pdf_path)

        print(f"✓ PDF拆分成功,共 {len(pages)} 页")
        # Only the first three pages are listed to keep the output short.
        # Presumably each page is a mapping with 'page_number' and an
        # 'image' object exposing .size -- defined by PDFSplitter, which
        # is not visible here.
        for page in pages[:3]:
            print(f" - 页码: {page['page_number']}, 图像大小: {page['image'].size}")

        if len(pages) > 3:
            print(f" ... 以及 {len(pages) - 3} 页")

        return True
    except Exception as e:
        # Top-level boundary of a diagnostic script: report any failure
        # with troubleshooting hints instead of a traceback.
        # NOTE(review): the hints mention poppler/pdf2image while the
        # pre-check targets PyMuPDF -- confirm which backend PDFSplitter
        # actually uses.
        print(f"✗ PDF拆分失败: {e}")
        print("可能的解决方案:")
        print("1. 确保poppler已正确安装并在PATH中")
        print("2. 检查PDF文件是否损坏")
        print("3. 检查pdf2image库版本是否兼容")
        return False
def main():
    """Run the PDF split verification, then print a closing banner."""
    test_pdf_splitter()

    rule = "=" * 50
    print(f"\n{rule}")
    print("验证完成")
    print(rule)
# Run the verification only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|