| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- import sys
- import os
- # 添加项目根目录到Python路径
- sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
- from services.pdf_parser.workflow import PDFParsingWorkflow
- import json
- from typing import Dict, Any
- from conf.config import ModelConfig
# LangSmith tracing configuration.
#
# ``setdefault`` is used instead of plain assignment so that values already
# present in the process environment (shell, .env loader, container config)
# take precedence over these in-source fallbacks.
#
# SECURITY(review): the API key below is a credential committed to source
# control. It is kept only as a fallback so existing behaviour is unchanged;
# it should be rotated and supplied through the LANGSMITH_API_KEY environment
# variable (or a secret store) instead.
os.environ.setdefault("LANGSMITH_TRACING", "true")
os.environ.setdefault("LANGSMITH_ENDPOINT", "https://api.smith.langchain.com")
os.environ.setdefault("LANGSMITH_API_KEY", "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")
os.environ.setdefault("LANGSMITH_PROJECT", "ragflow_plugs")
class PDFParsingService:
    """Service that splits a scanned PDF into pages and parses each page.

    The parsing itself is delegated to ``PDFParsingWorkflow``; this class
    reshapes the workflow result and renders it as a dict, JSON or Markdown.
    """

    # Directory that receives generated Markdown files when the caller does
    # not pass ``output_dir``.
    DEFAULT_OUTPUT_DIR = r"d:\project\work\ragflow_plugs\book\output"

    # Directory searched for page images named ``<pdf name>_<page>.png`` when
    # the caller does not pass ``temp_image_dir``. Relative to the current
    # working directory.
    DEFAULT_TEMP_IMAGE_DIR = r".\temp"

    def __init__(self, model_name: str = None):
        """Create the service and its underlying workflow.

        Args:
            model_name: Name of the QWEN VL model. When ``None`` (or empty),
                the value from ``ModelConfig.get_model_name()`` is used.
        """
        default_model = ModelConfig.get_model_name()
        self.model_name = model_name or default_model
        self.workflow = PDFParsingWorkflow(model_name=self.model_name)

    def parse_pdf(self, pdf_path: str) -> Dict[str, Any]:
        """Run the workflow on a scanned PDF and return a summary dict.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            A dict with the keys:
                - ``pdf_path``: path reported by the workflow
                - ``total_pages``: number of entries in the workflow's
                  ``split_pages``
                - ``parsed_results``: per-page parsing results
                - ``is_complete``: completion flag reported by the workflow
        """
        result = self.workflow.run(pdf_path)

        return {
            "pdf_path": result["pdf_path"],
            "total_pages": len(result["split_pages"]),
            "parsed_results": result["parsed_results"],
            "is_complete": result["is_complete"],
        }

    def parse_pdf_to_json(self, pdf_path: str, output_json_path: str = None) -> str:
        """Parse a PDF and serialise the result as JSON.

        Args:
            pdf_path: Path of the PDF file.
            output_json_path: File to write the JSON to. When ``None`` (or
                empty), nothing is written and the JSON text is returned.

        Returns:
            ``output_json_path`` when a file was written, otherwise the JSON
            string itself.
        """
        result = self.parse_pdf(pdf_path)
        json_str = json.dumps(result, ensure_ascii=False, indent=2)

        if output_json_path:
            with open(output_json_path, "w", encoding="utf-8") as f:
                f.write(json_str)
            return output_json_path

        return json_str

    def parse_pdf_to_markdown(
        self,
        pdf_path: str,
        output_dir: str = None,
        temp_image_dir: str = None,
    ) -> str:
        """Parse a PDF and write a Markdown report with one section per page.

        Each page section contains the page number, the parsed description
        and, when a matching page image exists, a Markdown image reference.

        Args:
            pdf_path: Path of the PDF file.
            output_dir: Directory for the Markdown file. Defaults to
                ``DEFAULT_OUTPUT_DIR``. Created if missing.
            temp_image_dir: Directory searched for page images named
                ``<pdf name>_<page number>.png``. Defaults to
                ``DEFAULT_TEMP_IMAGE_DIR``.

        Returns:
            Path of the Markdown file that was written.
        """
        result = self.parse_pdf(pdf_path)

        if output_dir is None:
            output_dir = self.DEFAULT_OUTPUT_DIR
        if temp_image_dir is None:
            temp_image_dir = self.DEFAULT_TEMP_IMAGE_DIR

        # PDF file name without extension; reused for the report name, the
        # images directory and the per-page image file names.
        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

        os.makedirs(output_dir, exist_ok=True)
        md_file_path = os.path.join(output_dir, f"{pdf_filename}.md")

        # The images directory is still created next to the report for
        # backward compatibility, although this method writes nothing into it.
        images_dir = os.path.join(output_dir, f"{pdf_filename}_images")
        os.makedirs(images_dir, exist_ok=True)

        # Collect fragments and join once instead of repeated ``+=``.
        parts = [
            f"# {pdf_filename} 解析结果\n\n",
            f"**总页数**: {result['total_pages']}\n",
            f"**模型**: {self.model_name}\n\n",
            "---\n\n",
        ]

        for page_result in result["parsed_results"]:
            page_number = page_result.get("page_number", 0)
            content = page_result.get("content", "")

            parts.append(f"## 第 {page_number} 页\n\n")
            parts.append(f"### 描述\n{content}\n\n")

            temp_image_path = os.path.join(
                temp_image_dir, f"{pdf_filename}_{page_number}.png"
            )
            if os.path.exists(temp_image_path):
                # Markdown links need forward slashes, also on Windows.
                temp_image_url = temp_image_path.replace("\\", "/")
                parts.append("### 图片\n")
                # Emit the actual image reference; without it the section
                # heading would be followed by nothing.
                parts.append(f"![第 {page_number} 页]({temp_image_url})\n\n")

            parts.append("---\n\n")

        with open(md_file_path, "w", encoding="utf-8") as f:
            f.write("".join(parts))

        return md_file_path
def main():
    """Example entry point: parse one sample PDF and save it as Markdown.

    The service is created without arguments, so it uses the model name from
    the configuration; pass ``model_name="qwen3-vl"`` (for example) to
    ``PDFParsingService`` to pick a model explicitly.

    Alternatives to the Markdown export used here:
        - ``service.parse_pdf(pdf_path)`` returns the result as a dict.
        - ``service.parse_pdf_to_json(pdf_path, "output.json")`` writes the
          result to a JSON file and returns its path.
    """
    sample_pdf = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"

    parser = PDFParsingService()
    saved_to = parser.parse_pdf_to_markdown(sample_pdf)
    print(f"解析结果已保存到: {saved_to}")


if __name__ == "__main__":
    main()
|