"""Service wrapper around the PDF parsing workflow.

Exposes ``PDFParsingService``, which runs ``PDFParsingWorkflow`` on a PDF
and returns the result as a dict, a JSON string/file, or a Markdown file.
"""

import json
import os
import sys
from typing import Any, Dict, Optional

# Add the project root (three directory levels above this file) to the Python
# path. This must happen before the project-local imports below.
sys.path.append(
    os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    )
)

from services.pdf_parser.workflow import PDFParsingWorkflow
from conf.config import ModelConfig

# LangSmith tracing configuration, applied unconditionally at import time.
# NOTE(review): the API key below is a credential committed to source control.
# It should be rotated and supplied through the process environment or a
# secret store instead; it is left in place here only to preserve the
# module's existing import-time behavior.
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"] = "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"
os.environ["LANGSMITH_PROJECT"] = "ragflow_plugs"

# Defaults used by ``parse_pdf_to_markdown`` when the caller passes no
# explicit directories. These are the values the method used to hard-code.
_DEFAULT_OUTPUT_DIR = r"d:\project\work\ragflow_plugs\book\output"
_DEFAULT_TEMP_IMAGE_DIR = r".\temp"


class PDFParsingService:
    """Service that splits and parses scanned PDF documents."""

    def __init__(self, model_name: Optional[str] = None):
        """Initialize the PDF parsing service.

        Args:
            model_name: Name of the QWEN VL model. When ``None`` (or empty),
                the value returned by ``ModelConfig.get_model_name()`` is
                used instead.
        """
        # Fall back to the configured default model name.
        default_model = ModelConfig.get_model_name()
        self.model_name = model_name or default_model
        self.workflow = PDFParsingWorkflow(model_name=self.model_name)

    def parse_pdf(self, pdf_path: str) -> Dict[str, Any]:
        """Run the workflow on a scanned PDF and return a summary dict.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            A dict with the keys:
                - ``pdf_path``: the workflow result's ``pdf_path``
                - ``total_pages``: length of the result's ``split_pages``
                - ``parsed_results``: the per-page parse results
                - ``is_complete``: the workflow's completion flag
        """
        result = self.workflow.run(pdf_path)

        return {
            "pdf_path": result["pdf_path"],
            "total_pages": len(result["split_pages"]),
            "parsed_results": result["parsed_results"],
            "is_complete": result["is_complete"],
        }

    def parse_pdf_to_json(
        self, pdf_path: str, output_json_path: Optional[str] = None
    ) -> str:
        """Parse a PDF and serialize the result as JSON.

        Args:
            pdf_path: Path to the PDF file.
            output_json_path: Where to write the JSON. When ``None`` (or
                empty), nothing is written and the JSON string is returned.

        Returns:
            ``output_json_path`` if a file was written, otherwise the JSON
            string itself.
        """
        result = self.parse_pdf(pdf_path)
        json_str = json.dumps(result, ensure_ascii=False, indent=2)

        if output_json_path:
            with open(output_json_path, "w", encoding="utf-8") as f:
                f.write(json_str)
            return output_json_path

        return json_str

    def parse_pdf_to_markdown(
        self,
        pdf_path: str,
        output_dir: Optional[str] = None,
        temp_image_dir: Optional[str] = None,
    ) -> str:
        """Parse a PDF and write a Markdown report of every page.

        For each page the report contains the page number, the parsed
        content, and (when the file exists) a link to the page image found
        in ``temp_image_dir`` under the name ``<pdf name>_<page number>.png``.

        Args:
            pdf_path: Path to the PDF file.
            output_dir: Directory that receives the Markdown file. Defaults
                to the previously hard-coded project output directory.
            temp_image_dir: Directory searched for the per-page PNG images.
                Defaults to ``.\\temp``.

        Returns:
            Path of the Markdown file that was written.
        """
        if output_dir is None:
            output_dir = _DEFAULT_OUTPUT_DIR
        if temp_image_dir is None:
            temp_image_dir = _DEFAULT_TEMP_IMAGE_DIR

        result = self.parse_pdf(pdf_path)

        # PDF file name without its extension; invariant for the whole report.
        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

        os.makedirs(output_dir, exist_ok=True)
        md_file_path = os.path.join(output_dir, f"{pdf_filename}.md")

        # Image directory next to the Markdown file. It is only created here;
        # nothing in this method writes into it.
        images_dir = os.path.join(output_dir, f"{pdf_filename}_images")
        os.makedirs(images_dir, exist_ok=True)

        # Collect the pieces and join once instead of repeated `+=`.
        parts = [
            f"# {pdf_filename} 解析结果\n\n",
            f"**总页数**: {result['total_pages']}\n",
            f"**模型**: {self.model_name}\n\n",
            "---\n\n",
        ]

        for page_result in result["parsed_results"]:
            page_number = page_result.get("page_number", 0)
            content = page_result.get("content", "")

            parts.append(f"## 第 {page_number} 页\n\n")
            parts.append(f"### 描述\n{content}\n\n")

            temp_image_path = os.path.join(
                temp_image_dir, f"{pdf_filename}_{page_number}.png"
            )
            if os.path.exists(temp_image_path):
                # Markdown links need forward slashes.
                temp_image_url = temp_image_path.replace("\\", "/")
                parts.append("### 图片\n")
                parts.append(
                    f"![第 {page_number} 页图片]({temp_image_url})\n\n"
                )

            parts.append("---\n\n")

        with open(md_file_path, "w", encoding="utf-8") as f:
            f.write("".join(parts))

        return md_file_path


def main():
    """Example usage: parse one PDF and save the result as Markdown."""
    # Create the service with the default model from the configuration.
    # A model can also be given explicitly:
    #     PDFParsingService(model_name="qwen3-vl")
    service = PDFParsingService()

    pdf_path = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"

    md_output_path = service.parse_pdf_to_markdown(pdf_path)
    print(f"解析结果已保存到: {md_output_path}")

    # Alternatives:
    #   service.parse_pdf(pdf_path) returns the result dict directly.
    #   service.parse_pdf_to_json(pdf_path, "output.json") writes a JSON file.


if __name__ == "__main__":
    main()