# verify_pdf_splitter.py (2.9 KB)
  1. """PDF拆分功能验证脚本"""
  2. import sys
  3. import os
  4. from pathlib import Path
  5. sys.path.append(os.path.dirname(os.path.abspath(__file__)))
  6. def check_pymupdf():
  7. """检查PyMuPDF是否安装"""
  8. print("检查PyMuPDF是否安装...")
  9. try:
  10. import fitz
  11. print(f"✓ PyMuPDF已安装")
  12. print(f" 版本: {fitz.__version__}")
  13. return True
  14. except ImportError:
  15. print("✗ 未安装PyMuPDF,请运行: pip install PyMuPDF")
  16. return False
  17. except Exception as e:
  18. print(f"✗ 检查PyMuPDF时出错: {str(e)}")
  19. return False
  20. def check_pdf_file(pdf_path):
  21. """检查PDF文件是否存在"""
  22. print(f"检查PDF文件: {pdf_path}")
  23. pdf = Path(pdf_path)
  24. if pdf.exists():
  25. print(f"✓ PDF文件存在,大小: {pdf.stat().st_size} 字节")
  26. return True
  27. else:
  28. print(f"✗ PDF文件不存在: {pdf_path}")
  29. return False
  30. def test_pdf_splitter():
  31. """测试PDF拆分功能"""
  32. print("=" * 50)
  33. print("PDF拆分功能验证")
  34. print("=" * 50)
  35. # 检查PyMuPDF
  36. pymupdf_ok = check_pymupdf()
  37. print()
  38. # 检查示例PDF文件
  39. pdf_path = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"
  40. pdf_ok = check_pdf_file(pdf_path)
  41. print()
  42. if not pymupdf_ok:
  43. print("=" * 50)
  44. print("PyMuPDF安装指南:")
  45. print("1. 运行命令安装: pip install PyMuPDF")
  46. print("2. 安装完成后重试")
  47. print("=" * 50)
  48. return False
  49. if not pdf_ok:
  50. print("=" * 50)
  51. print(r"请确保PDF文件存在: D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf")
  52. print("或修改脚本中的pdf_path变量为实际的PDF文件路径")
  53. print("=" * 50)
  54. return False
  55. # 测试PDF拆分功能
  56. print("开始测试PDF拆分功能...")
  57. try:
  58. from services.pdf_parser.pdf_splitter import PDFSplitter
  59. splitter = PDFSplitter()
  60. print(f"正在拆分PDF: {pdf_path}")
  61. pages = splitter.split_pdf(pdf_path)
  62. print(f"✓ PDF拆分成功,共 {len(pages)} 页")
  63. for page in pages[:3]: # 只显示前3页
  64. print(f" - 页码: {page['page_number']}, 图像大小: {page['image'].size}")
  65. if len(pages) > 3:
  66. print(f" ... 以及 {len(pages) - 3} 页")
  67. return True
  68. except Exception as e:
  69. print(f"✗ PDF拆分失败: {str(e)}")
  70. print("可能的解决方案:")
  71. print("1. 确保poppler已正确安装并在PATH中")
  72. print("2. 检查PDF文件是否损坏")
  73. print("3. 检查pdf2image库版本是否兼容")
  74. return False
  75. def main():
  76. """主函数"""
  77. test_pdf_splitter()
  78. print("\n" + "=" * 50)
  79. print("验证完成")
  80. print("=" * 50)
  81. if __name__ == "__main__":
  82. main()