main.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. import sys
  2. import os
  3. # 添加项目根目录到Python路径
  4. sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
  5. from services.pdf_parser.workflow import PDFParsingWorkflow
  6. import json
  7. from typing import Dict, Any
  8. from conf.config import ModelConfig
  # LangSmith tracing configuration.
  #
  # SECURITY: the LangSmith API key must NOT be committed to source control.
  # It is read from the LANGSMITH_API_KEY environment variable instead; the key
  # that used to be hard-coded here should be treated as leaked and revoked.
  #
  # ``setdefault`` is used so that values already provided by the deployment
  # environment are respected rather than silently overwritten.
  os.environ.setdefault("LANGSMITH_ENDPOINT", "https://api.smith.langchain.com")
  os.environ.setdefault("LANGSMITH_PROJECT", "ragflow_plugs")
  # Only switch tracing on when a key is actually available.
  if os.environ.get("LANGSMITH_API_KEY"):
      os.environ.setdefault("LANGSMITH_TRACING", "true")
  class PDFParsingService:
      """Service that runs the PDF parsing workflow and exports its result.

      The actual page splitting and parsing is delegated to
      ``PDFParsingWorkflow``; this class reshapes the workflow result into a
      plain dictionary and can serialise it to JSON or Markdown.
      """

      # Defaults used by ``parse_pdf_to_markdown`` when the caller does not
      # override them. They are the values that used to be hard-coded there.
      DEFAULT_OUTPUT_DIR = r"d:\project\work\ragflow_plugs\book\output"
      DEFAULT_TEMP_IMAGE_DIR = r".\temp"

      def __init__(self, model_name: str = None):
          """Create the service and its underlying workflow.

          Args:
              model_name: Name of the model handed to ``PDFParsingWorkflow``.
                  When falsy (``None`` or empty) the value returned by
                  ``ModelConfig.get_model_name()`` is used instead.
          """
          default_model = ModelConfig.get_model_name()
          self.model_name = model_name or default_model
          self.workflow = PDFParsingWorkflow(model_name=self.model_name)

      def parse_pdf(self, pdf_path: str) -> Dict[str, Any]:
          """Run the workflow on one PDF and return a summary dictionary.

          Args:
              pdf_path: Path of the PDF file to parse.

          Returns:
              A dict with the keys:
                  - ``pdf_path``: path reported by the workflow result
                  - ``total_pages``: number of entries in ``split_pages``
                  - ``parsed_results``: per-page results from the workflow
                  - ``is_complete``: completion flag from the workflow

          Raises:
              KeyError: If the workflow result lacks one of the expected keys.
          """
          result = self.workflow.run(pdf_path)
          return {
              "pdf_path": result["pdf_path"],
              "total_pages": len(result["split_pages"]),
              "parsed_results": result["parsed_results"],
              "is_complete": result["is_complete"],
          }

      def parse_pdf_to_json(self, pdf_path: str, output_json_path: str = None) -> str:
          """Parse a PDF and serialise the result as JSON.

          Args:
              pdf_path: Path of the PDF file to parse.
              output_json_path: Where to write the JSON. When falsy, nothing
                  is written and the JSON text itself is returned.

          Returns:
              The JSON string when ``output_json_path`` is falsy, otherwise
              ``output_json_path`` after the file has been written.
          """
          json_str = json.dumps(self.parse_pdf(pdf_path), ensure_ascii=False, indent=2)
          if not output_json_path:
              return json_str

          # Create the parent directory so a fresh output location does not
          # fail with FileNotFoundError.
          parent_dir = os.path.dirname(output_json_path)
          if parent_dir:
              os.makedirs(parent_dir, exist_ok=True)
          with open(output_json_path, "w", encoding="utf-8") as f:
              f.write(json_str)
          return output_json_path

      def parse_pdf_to_markdown(
          self,
          pdf_path: str,
          output_dir: str = None,
          temp_image_dir: str = None,
      ) -> str:
          """Parse a PDF and write a Markdown report with one section per page.

          Each page section contains the page number and its parsed content.
          If a file named ``<pdf name>_<page number>.png`` exists in
          ``temp_image_dir``, the section also embeds a link to it.

          Args:
              pdf_path: Path of the PDF file to parse.
              output_dir: Directory that receives ``<pdf name>.md``. Defaults
                  to ``DEFAULT_OUTPUT_DIR``. Created if missing.
              temp_image_dir: Directory searched for per-page PNG images.
                  Defaults to ``DEFAULT_TEMP_IMAGE_DIR``.

          Returns:
              Path of the Markdown file that was written.
          """
          result = self.parse_pdf(pdf_path)

          if output_dir is None:
              output_dir = self.DEFAULT_OUTPUT_DIR
          if temp_image_dir is None:
              temp_image_dir = self.DEFAULT_TEMP_IMAGE_DIR

          # PDF file name without extension; computed once, it names the
          # Markdown file, the images directory and the per-page images.
          pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

          os.makedirs(output_dir, exist_ok=True)
          md_file_path = os.path.join(output_dir, f"{pdf_filename}.md")

          # Kept for parity with earlier behaviour: the directory is created
          # but nothing is written into it here; the Markdown links point at
          # the files in ``temp_image_dir``.
          os.makedirs(os.path.join(output_dir, f"{pdf_filename}_images"), exist_ok=True)

          # Collect fragments and join once instead of repeated string +=.
          parts = [
              f"# {pdf_filename} 解析结果\n\n",
              f"**总页数**: {result['total_pages']}\n",
              f"**模型**: {self.model_name}\n\n",
              "---\n\n",
          ]

          for page_result in result["parsed_results"]:
              page_number = page_result.get("page_number", 0)
              content = page_result.get("content", "")

              parts.append(f"## 第 {page_number} 页\n\n")
              parts.append(f"### 描述\n{content}\n\n")

              temp_image_path = os.path.join(
                  temp_image_dir, f"{pdf_filename}_{page_number}.png"
              )
              if os.path.exists(temp_image_path):
                  # Markdown link targets need forward slashes.
                  temp_image_url = temp_image_path.replace("\\", "/")
                  parts.append("### 图片\n")
                  parts.append(f"![第 {page_number} 页图片]({temp_image_url})\n\n")

              parts.append("---\n\n")

          with open(md_file_path, "w", encoding="utf-8") as f:
              f.write("".join(parts))
          return md_file_path
  def main(pdf_path: str = None):
      """Example entry point: parse one PDF and save the result as Markdown.

      Args:
          pdf_path: PDF file to parse. When omitted, the first command-line
              argument is used if one was given; otherwise the built-in
              sample path is used.

      Other ways to use the service::

          service = PDFParsingService(model_name="qwen3-vl")
          result = service.parse_pdf(pdf_path)
          output_path = service.parse_pdf_to_json(pdf_path, "output.json")
      """
      if pdf_path is None:
          default_pdf_path = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"
          pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path

      # Uses the default model from the configuration file.
      service = PDFParsingService()

      md_output_path = service.parse_pdf_to_markdown(pdf_path)
      print(f"解析结果已保存到: {md_output_path}")


  if __name__ == "__main__":
      main()