
Adjust the 19-dimension book parsing workflow

yingge · 3 months ago
parent commit 5c06557683

+ 26 - 1
src/api/db/services/prompt_service.py

@@ -4,7 +4,7 @@
 Provides CRUD operations for dimensions and prompt versions.
 On initialization, the service automatically checks for and creates the corresponding vector-database tables/indexes.
 """
-
+import json
 from typing import List, Dict, Any, Optional
 from src.utils.mysql import get_global_mysql_client
 from src.utils.vector_db import get_vector_db_client
@@ -239,6 +239,31 @@ class PromptService:
         result = self._db.fetch_one(sql, [dimension_id])
         return result['dataset_id'] if result else None
 
+    def get_decomposition_method(self) -> Dict[str, Dict[str, Any]]:
+        """Fetch the decomposition method settings for all dimensions, keyed by string dimension ID."""
+        sql = "SELECT id, decomposition_method, is_preced, preced_node FROM prompt_dimensions"
+        results = self._db.fetch_all(sql)
+        return {
+            str(row['id']): {
+                "decomposition_method": row['decomposition_method'],
+                "is_preced": row['is_preced'],
+                "preced_node": json.loads(row['preced_node']) if row['preced_node'] else []
+            }
+            for row in results
+        }
+    
+    def get_decomposition_method_by_dimension_id(self, dimension_id: int) -> Optional[int]:
+        """Fetch the decomposition method for the given dimension ID."""
+        sql = "SELECT decomposition_method FROM prompt_dimensions WHERE id = %s"
+        result = self._db.fetch_one(sql, [dimension_id])
+        return result['decomposition_method'] if result else None
+
+    def get_preced_node_name(self, dimension_id: int) -> List[int]:
+        """Fetch the preceding node IDs for a dimension; preced_node is stored as a JSON-encoded integer array."""
+        sql = "SELECT preced_node FROM prompt_dimensions WHERE id = %s"
+        result = self._db.fetch_one(sql, [dimension_id])
+        return json.loads(result['preced_node']) if result and result['preced_node'] else []
+
     def update_dimension(self, dimension_id: int, name: str = None, description: str = None) -> int:
         """更新维度信息"""
         updates = []
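
For orientation, a minimal usage sketch (not part of the commit) of how the workflow below consumes this mapping. Note the keys are string dimension IDs; the values shown are illustrative:

    service = get_prompt_service()
    methods = service.get_decomposition_method()
    # e.g. {"3": {"decomposition_method": 1, "is_preced": 0, "preced_node": []}}
    info = methods[str(dim_id)]            # dim_id: int, as used in dynamic_dimension_workflow.py
    if info["is_preced"] == 1:
        upstream = info["preced_node"]     # e.g. [1, 2] - dimensions whose summaries are read later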

+ 2 - 0
src/datasets/parser/nodes/__init__.py

@@ -25,6 +25,7 @@ from src.datasets.parser.nodes.prompt_retrieval_node import PromptRetrievalNode
 from src.datasets.parser.nodes.table_name_generation_node import TableNameGenerationNode
 from src.datasets.parser.nodes.dimension_result_node import DimensionResultNode
 from src.datasets.parser.nodes.picture_stitching_node import PictureStitchingNode
+from src.datasets.parser.nodes.summary_node import SummaryNode
 
 __all__ = [
     "PDFSplitNode",
@@ -44,4 +45,5 @@ __all__ = [
     "TableNameGenerationNode",
     "DimensionResultNode",
     "PictureStitchingNode",
+    "SummaryNode",
 ]

+ 2 - 45
src/datasets/parser/nodes/image_parse_node.py

@@ -57,38 +57,6 @@ class ImageParseNode(BaseNode):
     def name(self) -> str:
         return "image_parse"
     
-    def _default_prompt_template(self, is_book_image: bool = False) -> str:
-        """
-        Default image-parsing prompt template
-        
-        Args:
-            is_book_image: whether to use whole-book image mode
-        """
-        if is_book_image:
-            return """
-                角色定位:你是一位顶尖的儿童绘本分析师与视觉工程专家,擅长将插画视觉信息转化为高精度的结构化元数据。
-                任务描述:请深度解析提供的完整绘本图片(包含所有页面),提取基本要素和特征。
-                
-                输出格式:JSON
-                {{
-                    "content": "完整绘本内容描述",
-                    "elements": []
-                }}
-            """
-        else:
-            return """
-                角色定位:你是一位顶尖的儿童绘本分析师与视觉工程专家,擅长将插画视觉信息转化为高精度的结构化元数据。
-                任务描述:请深度解析提供的绘本页面,提取基本要素和特征。
-                当前提取页码为:{page_number}
-                
-                输出格式:JSON
-                {{
-                    "page_number": {page_number},
-                    "content": "页面内容描述",
-                    "elements": []
-                }}
-            """
-    
     def _parse_single_page(self, page: Dict[str, Any], prompt_template: str) -> Dict[str, Any]:
         """
         Parse a single page
@@ -103,20 +71,12 @@ class ImageParseNode(BaseNode):
         image = page.get("image")
         
         prompt = prompt_template.format(page_number=page_number)
-        
         logger.debug(f"开始解析第 {page_number} 页")
         
         try:
             parser = QWenVLParser(self.model_name)
             result = parser.parse_image(image, page_number, prompt)
             parsed_content = parse_json_response(result, expected_type=dict)
-            # Handle JSON wrapped in markdown tags
-            # parsed_content = parse_markdown_json(result)
-            # if parsed_content:
-            #     result = parsed_content
-            # else:
-            #     # If parsing fails, keep the original content
-            #     logger.warning(f"Failed to parse JSON content, keeping the original")
             
             logger.debug(f"第 {page_number} 页解析完成")
             return parsed_content
@@ -170,15 +130,13 @@ class ImageParseNode(BaseNode):
             Updated dict containing parsed_results
         """
         # Check which mode to use
+        prompt_template = getattr(state, 'dimension_prompt', None)
         if self.use_book_image:
             # Book mode: parse the full stitched image
             book_image = getattr(state, 'book_image', None)
             if book_image is None:
                 logger.warning("book_image is empty, nothing to parse")
-                return {"parsed_results": [], "processed_pages": 0}
-            
-            prompt_template = state.dimension_prompt or self._default_prompt_template(is_book_image=True)
-            
+                return {"parsed_results": [], "processed_pages": 0}
+
             logger.info("Parsing the full stitched image in book mode")
             result = self._parse_book_image(book_image, prompt_template)
             
@@ -195,7 +153,6 @@ class ImageParseNode(BaseNode):
         else:
             # Page mode: parse the individual page images
             pages = getattr(state, 'split_pages', None) or getattr(state, 'image_pages', [])
-            prompt_template = state.dimension_prompt or self._default_prompt_template(is_book_image=False)
             
             if not pages:
                 logger.warning("No pages to parse")

+ 1 - 0
src/datasets/parser/nodes/pdf_split_node.py

@@ -66,6 +66,7 @@ class PDFSplitNode(BaseNode):
         logger.info(f"PDF拆分完成,共 {len(split_pages)} 页")
         
         return {
+            "original_filename": original_filename,
             "split_pages": split_pages,
             "parsed_results": [],
             "processed_pages": 0,

+ 83 - 5
src/datasets/parser/nodes/prompt_retrieval_node.py

@@ -2,13 +2,13 @@
 Prompt retrieval node
 """
 
+from src.conf.settings import model_settings
 from typing import Dict, Any, Optional
-from unittest import result
 from src.datasets.parser.core.base import BaseNode
+from src.model.qwen_vl import QWenVLParser
 from src.api.db.services.prompt_service import get_prompt_service
 from src.common.logging_config import get_logger
 
-
 logger = get_logger(__name__)
 
 
@@ -19,7 +19,7 @@ class PromptRetrievalNode(BaseNode):
     Fetches the active prompt for the specified dimension from the database.
     """
     
-    def __init__(self, dimension_id: int):
+    def __init__(self, dimension_id: int, use_book_image: bool = False, model_name: Optional[str] = None):
         """
         Initialize the prompt retrieval node
         
@@ -27,6 +27,7 @@
             dimension_id: dimension ID
+            use_book_image: whether whole-book image mode is used
+            model_name: chat model name; defaults to model_settings.chat_model_name
         """
         self.dimension_id = dimension_id
+        self.use_book_image = use_book_image
+        self.model_name = model_name or model_settings.chat_model_name
         self._prompt_service = None
     
     @property
@@ -40,6 +41,83 @@ class PromptRetrievalNode(BaseNode):
         if self._prompt_service is None:
             self._prompt_service = get_prompt_service()
         return self._prompt_service
+
+    def _default_prompt_template(self, is_book_image: bool = False) -> str:
+        """
+        Default image-parsing prompt template
+
+        Args:
+            is_book_image: whether to use whole-book image mode
+        """
+        if is_book_image:
+            return """
+                角色定位:你是一位顶尖的儿童绘本分析师与视觉工程专家,擅长将插画视觉信息转化为高精度的结构化元数据。
+                任务描述:请深度解析提供的完整绘本图片(包含所有页面),提取基本要素和特征。
+                
+                输出格式:JSON
+                {{
+                    "content": "完整绘本内容描述",
+                    "elements": []
+                }}
+            """
+        else:
+            return """
+                角色定位:你是一位顶尖的儿童绘本分析师与视觉工程专家,擅长将插画视觉信息转化为高精度的结构化元数据。
+                任务描述:请深度解析提供的绘本页面,提取基本要素和特征。
+                当前提取页码为:{page_number}
+                
+                输出格式:JSON
+                {{
+                    "page_number": {page_number},
+                    "content": "页面内容描述",
+                    "elements": []
+                }}
+            """
+
+    # Prompt assembly
+    def _assemble_prompt(self, dimension_prompt: str, state: Any) -> str:
+        """
+        Assemble the final prompt.
+        
+        Args:
+            dimension_prompt: the dimension's prompt text (may be empty)
+            state: workflow state carrying decomposition_methods and original_filename
+            
+        Returns:
+            The assembled prompt string
+        """
+        decomposition_methods = getattr(state, 'decomposition_methods', None) or {}
+        original_filename = getattr(state, 'original_filename', None)
+        preceding_node = decomposition_methods.get(self.dimension_id, {}).get("preced_node", [])
+        
+        prompt_template = dimension_prompt or self._default_prompt_template(is_book_image=self.use_book_image)
+
+        # No preceding nodes: nothing to inject, return the template unchanged
+        if not preceding_node:
+            return prompt_template
+
+        # Read the markdown summaries written by the preceding nodes from the temp directory
+        # (files are named temp/{original_filename}_{node}.md, see summary_node.py)
+        content = []
+        for node in preceding_node:
+            with open(f"temp/{original_filename}_{node}.md", "r", encoding="utf-8") as f:
+                content.append(f.read())
+        chat_model = QWenVLParser(self.model_name)
+        # Consolidate and compress the collected content with an LLM
+        system_prompt = """
+        你是一个高级数据处理助手。请对输入的 内容 进行结构化压缩。
+
+        工作流:
+            - 分类: 扫描所有内容,根据主题将其归类。
+            - 合并: 将相同主题下的不同描述合并,合并时保留所有独特的细节(如特定的数值、专有名词)。
+            - 精简: 使用短语或精炼的句子改写冗长的描述。
+
+        约束条件:
+            - 使用 Markdown 无序列表格式输出。
+            - 每个要点必须是独立的知识点。
+            - 禁止生成模棱两可的总结,必须保留具体的技术参数或关键事实。  
+        """
+        # Merge the content list into one string and compress it with the model
+        compressed_content = chat_model.chat("\n".join(content), system_prompt)
+        # Use replace() instead of format() so other placeholders (e.g. {page_number}) survive for later formatting
+        return prompt_template.replace("{content}", compressed_content)
     
     def execute(self, state: Any) -> Dict[str, Any]:
         """
@@ -65,9 +143,9 @@ class PromptRetrievalNode(BaseNode):
             }
         
         logger.info(f"[Prompt-{self.dimension_id}] 提示词获取成功")
-        
+        prompt_template = self._assemble_prompt(result.get('content'), state)
         return {
-            "dimension_prompt": result.get('content'),
+            "dimension_prompt": prompt_template,
             "dataset_id": result.get('dataset_id'),
             "dimension_id": self.dimension_id
         }
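
A minimal sketch (not part of the commit) of the contract this node now assumes; the dimension IDs and file names are illustrative:

    # Hypothetical: dimension 7 declares preced_node = [3, 5], so the SummaryNode runs for
    # dimensions 3 and 5 must already have written temp/<original_filename>_3.md and _5.md.
    node = PromptRetrievalNode(dimension_id=7, use_book_image=True)
    updates = node.execute(state)
    # updates["dimension_prompt"] now carries the compressed upstream summaries,
    # substituted for the {content} placeholder (when the prompt contains one)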

+ 144 - 0
src/datasets/parser/nodes/summary_node.py

@@ -0,0 +1,144 @@
+"""
+总结节点
+
+对图像解析结果进行总结提要,并保存到文件。
+"""
+
+import os
+from typing import Dict, Any, List, Optional
+
+from src.datasets.parser.core.base import BaseNode, BaseState
+from src.datasets.parser.core.registry import register_node
+from src.conf.settings import model_settings
+from src.model.qwen_vl import QWenVLParser
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_node()
+class SummaryNode(BaseNode):
+    """
+    总结节点
+    
+    对图像解析结果进行总结提要,并将结果保存到文件。
+    """
+    
+    def __init__(self, dimension_id: int, model_name: Optional[str] = None):
+        """
+        初始化总结节点
+        
+        Args:
+            dimension_id: 维度ID,用于生成文件名
+            model_name: 模型名称
+        """
+        self.dimension_id = dimension_id
+        self.model_name = model_name or model_settings.chat_model_name
+        # 总结提示模板
+        self.summary_prompt = ChatPromptTemplate.from_template(
+            """
+            你是一位专业的内容总结专家,擅长从长篇文本中提取核心内容并生成简洁的总结。
+            
+            请对以下内容进行总结,要求:
+            1. 提炼核心观点和关键信息
+            2. 保持内容的完整性和准确性
+            3. 使用清晰、连贯的语言
+            4. 总结长度适中,不要过于冗长
+            
+            内容:
+            {content}
+            
+            总结:
+            """
+        )
+    
+    @property
+    def name(self) -> str:
+        return "summary_node"
+    
+    def _summarize_content(self, content: str) -> str:
+        """
+        使用模型对内容进行总结
+        
+        Args:
+            content: 待总结的内容
+            
+        Returns:
+            str: 总结结果
+        """
+        try:
+            # 构建提示
+            messages = self.summary_prompt.format_messages(content=content)
+            # 调用模型生成总结
+            chat_model = QWenVLParser(self.model_name)
+            response = chat_model.invoke(messages)
+            return response.content
+        except Exception as e:
+            logger.error(f"总结内容时出错: {str(e)}")
+            return f"总结失败: {str(e)}"
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        执行总结操作
+        
+        Args:
+            state: 包含解析结果的状态
+            
+        Returns:
+            包含总结结果的字典
+        """
+        # Get the parse results
+        parsed_results = getattr(state, 'parsed_results', [])
+        
+        if not parsed_results:
+            logger.warning("No parse results to summarize")
+            return {
+                "summary": "",
+                "saved_path": "",
+                "is_complete": True
+            }
+        
+        # Extract and merge the content fields
+        content_parts = []
+        for result in parsed_results:
+            if isinstance(result, dict):
+                content = result.get('content', '')
+                if content:
+                    content_parts.append(content)
+        
+        if not content_parts:
+            logger.warning("Parse results contain no content to summarize")
+            return {
+                "summary": "",
+                "saved_path": "",
+                "is_complete": True
+            }
+        
+        # Merge the content
+        combined_content = "\n".join(content_parts)
+        logger.info(f"Summarizing content, length: {len(combined_content)} characters")
+        
+        # Generate the summary
+        summary = self._summarize_content(combined_content)
+        logger.info("Content summarization complete")
+        
+        # Ensure the temp directory exists
+        temp_dir = "temp"
+        os.makedirs(temp_dir, exist_ok=True)
+        
+        # Save the summary to a file (read back later by PromptRetrievalNode._assemble_prompt)
+        file_path = os.path.join(temp_dir, f"{state.original_filename}_{self.dimension_id}.md")
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(summary)
+            logger.info(f"Summary saved to: {file_path}")
+        except Exception as e:
+            logger.error(f"Error saving summary file: {str(e)}")
+            file_path = ""
+        
+        return {
+            "summary": summary,
+            "saved_path": file_path,
+            "is_complete": True
+        }
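
A minimal usage sketch (not part of the commit), assuming the DynamicDimensionState defined in parser_states.py; the file name and contents are illustrative:

    state = DynamicDimensionState(
        original_filename="my_book",
        parsed_results=[{"page_number": 1, "content": "..."},
                        {"page_number": 2, "content": "..."}],
    )
    out = SummaryNode(dimension_id=3).execute(state)   # writes temp/my_book_3.md
    # out == {"summary": "...", "saved_path": "temp/my_book_3.md", "is_complete": True}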

+ 6 - 2
src/datasets/parser/nodes/vectorize_node.py

@@ -75,7 +75,8 @@ class VectorizeNode(BaseNode):
         document_id: str,
         dataset_id: str,
         file_name: str,
-        file_page_count: int
+        file_page_count: int,
+        original_filename: Optional[str] = None
     ) -> Optional[Dict[str, Any]]:
         """
         向量化单个文档
@@ -123,6 +124,7 @@ class VectorizeNode(BaseNode):
                 "chunk_id": chunk_id,
                 "metadata": {
                     "file_page_count": file_page_count,
+                    "book_name": original_filename
                 }
             }
             
@@ -148,6 +150,7 @@ class VectorizeNode(BaseNode):
         dataset_id = getattr(state, 'dataset_id', '')
         pdf_path = getattr(state, 'pdf_path', '')
         table_name = getattr(state, 'table_name', '')
+        original_filename = getattr(state, 'original_filename', None)
         
         if not parsed_results:
             logger.warning("没有待向量化的解析结果")
@@ -170,7 +173,8 @@ class VectorizeNode(BaseNode):
                 document_id,
                 dataset_id,
                 file_name,
-                file_page_count
+                file_page_count,
+                original_filename
             ): i
             for i, parsed_result in enumerate(parsed_results)
         }
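
For reference, a sketch (not part of the commit) of the chunk payload shape after this change; the values are illustrative:

    chunk = {
        "chunk_id": chunk_id,
        "metadata": {
            "file_page_count": 32,
            "book_name": "my_book",   # original_filename propagated from PDFSplitNode; may be None
        },
    }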

+ 3 - 2
src/datasets/parser/states/parser_states.py

@@ -131,14 +131,15 @@ class DynamicDimensionState(BaseState):
     document_id: str = Field(default="", description="Document ID")
     ragflow_id: str = Field(default="", description="RagFlow user ID")
     rag_flow_api_key: str = Field(default="", description="RAGFlow API key")
-    
+    decomposition_methods: Dict[int, Dict[str, Any]] = Field(default_factory=dict, description="Per-dimension decomposition method settings")
+    use_book_image: bool = Field(default=False, description="Whether to parse in book mode")
     # Intermediate state
     table_name: str = Field(default="", description="Vector table name (generated by TableNameGenerationNode)")
     split_pages: List[Dict[str, Any]] = Field(default_factory=list, description="List of split pages")
     parsed_results: List[Dict[str, Any]] = Field(default_factory=list, description="List of parse results")
     parsed_contents: List[ParsedContent] = Field(default_factory=list, description="List of parsed content items")
     book_image: Image.Image = Field(default=None, description="Book image")
-
+    original_filename: str = Field(default="", description="Original file name")
     # Output - results per dimension
     dimension_results: Dict[int, Dict[str, Any]] = Field(default_factory=dict, description="Parse results per dimension")
     total_vectorized_pages: int = Field(default=0, description="Total number of vectorized pages")

+ 30 - 9
src/datasets/parser/workflow_nodes/dimension_book_split_node.py

@@ -13,7 +13,8 @@ from src.datasets.parser.nodes import (
     DimensionResultNode,
     RAGFlowDocumentUploadNode,
     RAGFlowChunkNode,
-    PictureStitchingNode
+    PictureStitchingNode,
+    SummaryNode
 )
 from src.datasets.parser.states.parser_states import DynamicDimensionState
 from src.common.logging_config import get_logger
@@ -66,35 +67,42 @@ class DimensionBookSplitNode(BaseNode):
     def _build_sub_workflow(self, state):
         """
         Build the sub-workflow
-        
+
         Args:
             state: workflow state
-            
+
         Returns:
             The compiled LangGraph workflow
         """
         logger.info(f"[DimensionBookSplit-{self.dimension_id}] Building sub-workflow")
         rag_flow_api_key = getattr(state, 'rag_flow_api_key', '')
+        decomposition_methods = getattr(state, 'decomposition_methods', None) or {}
+        is_preced = decomposition_methods.get(self.dimension_id, {}).get("is_preced") or 0
 
         # Create the workflow builder
         builder = WorkflowBuilder(DynamicDimensionState)
-        
+
         # Create nodes
-        prompt_node = PromptRetrievalNode(self.dimension_id)
+        prompt_node = PromptRetrievalNode(dimension_id=self.dimension_id, use_book_image=True)
         document_upload_node = RAGFlowDocumentUploadNode(api_key=rag_flow_api_key)
         table_name_node = TableNameGenerationNode(self.dimension_id)
         stitching_node = PictureStitchingNode()
         parse_node = ImageParseNode(
+            dimension_id=self.dimension_id,
             model_name=self.model_name,
             max_workers=self.max_workers,
-            use_book_image=True  # flag: use book_image mode
+            use_book_image=True,  # flag: use book_image mode
         )
         chunk_node = RAGFlowChunkNode(api_key=rag_flow_api_key)
         vectorize_node = VectorizeNode()
         result_node = DimensionResultNode(self.dimension_id)
         
+        # Conditionally create the summary node
+        if is_preced == 1:
+            summary_node = SummaryNode(dimension_id=self.dimension_id)
+
         # Add nodes
-        builder.add_nodes(
+        nodes = [
             prompt_node,
             document_upload_node,
             table_name_node,
@@ -103,15 +111,28 @@ class DimensionBookSplitNode(BaseNode):
             chunk_node,
             vectorize_node,
             result_node
-        )
+        ]
+        
+        # Conditionally insert the summary node right after the parse node
+        if is_preced == 1:
+            nodes.insert(nodes.index(parse_node) + 1, summary_node)
         
+        builder.add_nodes(*nodes)
+
         # Wire up the edges
         builder.set_entry(prompt_node.name)
         builder.add_edge(prompt_node.name, document_upload_node.name)
         builder.add_edge(document_upload_node.name, table_name_node.name)
         builder.add_edge(table_name_node.name, stitching_node.name)
         builder.add_edge(stitching_node.name, parse_node.name)
-        builder.add_edge(parse_node.name, chunk_node.name)
+        
+        # Route through the summary node when enabled
+        if is_preced == 1:
+            builder.add_edge(parse_node.name, summary_node.name)
+            builder.add_edge(summary_node.name, chunk_node.name)
+        else:
+            builder.add_edge(parse_node.name, chunk_node.name)
+        
         builder.add_edge(chunk_node.name, vectorize_node.name)
         builder.add_edge(vectorize_node.name, result_node.name)
         builder.set_finish(result_node.name)

+ 30 - 10
src/datasets/parser/workflow_nodes/dimension_page_split_node.py

@@ -12,7 +12,8 @@ from src.datasets.parser.nodes import (
     TableNameGenerationNode, 
     DimensionResultNode,
     RAGFlowDocumentUploadNode,
-    RAGFlowChunkNode
+    RAGFlowChunkNode,
+    SummaryNode
 )
 from src.datasets.parser.states.parser_states import DynamicDimensionState
 from src.common.logging_config import get_logger
@@ -60,34 +61,40 @@ class DimensionPageSplitNode(BaseNode):
     def _build_sub_workflow(self, state):
         """
         Build the sub-workflow
-        
+
         Args:
             state: workflow state
-            
+
         Returns:
             The compiled LangGraph workflow
         """
         logger.info(f"[DimensionPageSplit-{self.dimension_id}] Building sub-workflow")
         rag_flow_api_key = getattr(state, 'rag_flow_api_key', '')
-
+        decomposition_methods = getattr(state, 'decomposition_methods', None) or {}
+        is_preced = decomposition_methods.get(self.dimension_id, {}).get("is_preced") or 0
 
         # Create the workflow builder
         builder = WorkflowBuilder(DynamicDimensionState)
-        
+
         # Create nodes
-        prompt_node = PromptRetrievalNode(self.dimension_id)
+        prompt_node = PromptRetrievalNode(dimension_id=self.dimension_id)
         document_upload_node = RAGFlowDocumentUploadNode(api_key=rag_flow_api_key)
         table_name_node = TableNameGenerationNode(self.dimension_id)
         parse_node = ImageParseNode(
+            dimension_id=self.dimension_id,
             model_name=self.model_name,
-            max_workers=self.max_workers
+            max_workers=self.max_workers,
         )
         chunk_node = RAGFlowChunkNode(api_key=rag_flow_api_key)
         vectorize_node = VectorizeNode()
         result_node = DimensionResultNode(self.dimension_id)
         
+        # Conditionally create the summary node
+        if is_preced == 1:
+            summary_node = SummaryNode(dimension_id=self.dimension_id)
+
         # Add nodes
-        builder.add_nodes(
+        nodes = [
             prompt_node,
             document_upload_node,
             table_name_node,
@@ -95,14 +102,27 @@ class DimensionPageSplitNode(BaseNode):
             chunk_node,
             vectorize_node,
             result_node
-        )
+        ]
+        
+        # Conditionally insert the summary node right after the parse node
+        if is_preced == 1:
+            nodes.insert(nodes.index(parse_node) + 1, summary_node)
         
+        builder.add_nodes(*nodes)
+
         # Wire up the edges
         builder.set_entry(prompt_node.name)
         builder.add_edge(prompt_node.name, document_upload_node.name)
         builder.add_edge(document_upload_node.name, table_name_node.name)
         builder.add_edge(table_name_node.name, parse_node.name)
-        builder.add_edge(parse_node.name, chunk_node.name)
+        
+        # Route through the summary node when enabled
+        if is_preced == 1:
+            builder.add_edge(parse_node.name, summary_node.name)
+            builder.add_edge(summary_node.name, chunk_node.name)
+        else:
+            builder.add_edge(parse_node.name, chunk_node.name)
+        
         builder.add_edge(chunk_node.name, vectorize_node.name)
         builder.add_edge(vectorize_node.name, result_node.name)
         builder.set_finish(result_node.name)
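
For orientation, the resulting sub-workflow edge order in both variants (summary_node is inserted only when is_preced == 1; node names other than image_parse and summary_node are assumed from the class names):

    # book mode: prompt -> document_upload -> table_name -> picture_stitching
    #            -> image_parse -> [summary_node] -> chunk -> vectorize -> dimension_result
    # page mode: prompt -> document_upload -> table_name
    #            -> image_parse -> [summary_node] -> chunk -> vectorize -> dimension_result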

+ 20 - 8
src/datasets/parser/workflows/dynamic_dimension_workflow.py

@@ -18,7 +18,8 @@ from src.datasets.parser.nodes import (
     PDFSplitNode,
     CompleteNode
 )
-from src.datasets.parser.workflow_nodes import DimensionPageSplitNode
+from src.datasets.parser.workflow_nodes import DimensionPageSplitNode, DimensionBookSplitNode
+from src.api.db.services.prompt_service import get_prompt_service
 from src.utils.ragflow.ragflow_user_service import get_ragflow_user_service
 from src.common.logging_config import get_logger
 
@@ -55,6 +56,8 @@ class DynamicDimensionWorkflow:
         self.model_name = model_name
         self.max_workers = max_workers
         self.langfuse_handler = CallbackHandler()
+        # Preload the dimension decomposition methods (fetched once at initialization)
+        self._decomposition_methods = get_prompt_service().get_decomposition_method()
     
     def _build_workflow_for_dimensions(self, dimension_ids: List[int]):
         """
@@ -72,7 +75,8 @@ class DynamicDimensionWorkflow:
         split_node = PDFSplitNode()
         complete_node = CompleteNode(message_template="Dynamic multi-dimension parsing complete")
         
-        # Query the decomposition method for each dimension knowledge base
+        # Use the preloaded decomposition methods
+        decomposition_methods = self._decomposition_methods
 
         # Build with WorkflowBuilder
         builder = WorkflowBuilder(DynamicDimensionState)
@@ -90,13 +94,20 @@ class DynamicDimensionWorkflow:
         prev_node = "pdf_split"
         
         for dim_id in dimension_ids:
-            
+            decomposition_method = decomposition_methods[str(dim_id)]["decomposition_method"]
             # Create a split node for each dimension (book mode or page mode)
-            skill_node = DimensionPageSplitNode(
-                dimension_id=dim_id,
-                model_name=self.model_name,
-                max_workers=self.max_workers
-            )
+            if decomposition_method == 0:
+                skill_node = DimensionBookSplitNode(
+                    dimension_id=dim_id,
+                    model_name=self.model_name,
+                    max_workers=self.max_workers
+                )
+            elif decomposition_method == 1:
+                skill_node = DimensionPageSplitNode(
+                    dimension_id=dim_id,
+                    model_name=self.model_name,
+                    max_workers=self.max_workers
+                )
+            else:
+                raise ValueError(f"Unknown decomposition_method {decomposition_method!r} for dimension {dim_id}")
             builder.add_node(skill_node)
             builder.add_edge(prev_node, skill_node.name)
             prev_node = skill_node.name
@@ -161,6 +172,7 @@ class DynamicDimensionWorkflow:
             dimension_ids=dimension_ids,
             ragflow_id=ragflow_id,
             rag_flow_api_key=ragflow_api_key,
+            decomposition_methods=self._decomposition_methods,
             # dataset_name=dataset_name or pdf_path.split("/")[-1].split("\\")[-1].replace(".pdf", ""),
             # ragflow_api_url=ragflow_api_url,
             # rag_flow_api_key=rag_flow_api_key,
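
A sketch (not part of the commit) of the dispatch implemented by the loop above; the values come from prompt_dimensions.decomposition_method:

    # decomposition_method == 0 -> DimensionBookSplitNode (stitch pages, parse the book as one image)
    # decomposition_method == 1 -> DimensionPageSplitNode (parse page by page)
    # Dimensions run sequentially (pdf_split -> dim_a -> dim_b -> ... -> complete), so a dimension
    # whose preced_node lists other dimensions must appear after them in dimension_ids.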

+ 1 - 1
src/utils/vector_db/elasticsearch_adapter.py

@@ -282,7 +282,7 @@ class ElasticsearchAdapter(VectorDBClient):
         topn = query.get("topn", 10)
         
         # The match query must not be None or ES raises VALUE_NULL; with no text, use vector-only search (match_all)
-        if matching_text:
+        if matching_text and match_field == "content":
             # Handle Chinese text
             text_clause = self._build_chinese_query(match_field, matching_text)
         else:
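
An illustrative note (not part of the commit) on when each branch now fires; the values are examples:

    # {"matching_text": "小红帽", "match_field": "content"} -> Chinese text clause + vector score
    # {"matching_text": "小红帽", "match_field": "title"}   -> vector-only fallback (match_all)
    # {"matching_text": None,     "match_field": "content"} -> vector-only fallback (match_all)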