"""PDF拆分功能验证脚本""" import sys import os from pathlib import Path sys.path.append(os.path.dirname(os.path.abspath(__file__))) def check_pymupdf(): """检查PyMuPDF是否安装""" print("检查PyMuPDF是否安装...") try: import fitz print(f"✓ PyMuPDF已安装") print(f" 版本: {fitz.__version__}") return True except ImportError: print("✗ 未安装PyMuPDF,请运行: pip install PyMuPDF") return False except Exception as e: print(f"✗ 检查PyMuPDF时出错: {str(e)}") return False def check_pdf_file(pdf_path): """检查PDF文件是否存在""" print(f"检查PDF文件: {pdf_path}") pdf = Path(pdf_path) if pdf.exists(): print(f"✓ PDF文件存在,大小: {pdf.stat().st_size} 字节") return True else: print(f"✗ PDF文件不存在: {pdf_path}") return False def test_pdf_splitter(): """测试PDF拆分功能""" print("=" * 50) print("PDF拆分功能验证") print("=" * 50) # 检查PyMuPDF pymupdf_ok = check_pymupdf() print() # 检查示例PDF文件 pdf_path = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf" pdf_ok = check_pdf_file(pdf_path) print() if not pymupdf_ok: print("=" * 50) print("PyMuPDF安装指南:") print("1. 运行命令安装: pip install PyMuPDF") print("2. 安装完成后重试") print("=" * 50) return False if not pdf_ok: print("=" * 50) print(r"请确保PDF文件存在: D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf") print("或修改脚本中的pdf_path变量为实际的PDF文件路径") print("=" * 50) return False # 测试PDF拆分功能 print("开始测试PDF拆分功能...") try: from services.pdf_parser.pdf_splitter import PDFSplitter splitter = PDFSplitter() print(f"正在拆分PDF: {pdf_path}") pages = splitter.split_pdf(pdf_path) print(f"✓ PDF拆分成功,共 {len(pages)} 页") for page in pages[:3]: # 只显示前3页 print(f" - 页码: {page['page_number']}, 图像大小: {page['image'].size}") if len(pages) > 3: print(f" ... 以及 {len(pages) - 3} 页") return True except Exception as e: print(f"✗ PDF拆分失败: {str(e)}") print("可能的解决方案:") print("1. 确保poppler已正确安装并在PATH中") print("2. 检查PDF文件是否损坏") print("3. 检查pdf2image库版本是否兼容") return False def main(): """主函数""" test_pdf_splitter() print("\n" + "=" * 50) print("验证完成") print("=" * 50) if __name__ == "__main__": main()