
Multi-dimension pipeline split

yingge 3 months ago
Parent
Commit
1146474adc
46 changed files with 4,902 additions and 103 deletions
  1. main.py (+45 −17)
  2. sql/prompt_schema.sql (+26 −0)
  3. src/api/db/services/prompt_service.py (+375 −0)
  4. src/api/db/services/vector_search_service.py (+158 −0)
  5. src/api/sdk/dataset_manage.py (+379 −4)
  6. src/api/sdk/prompt_manage.py (+201 −0)
  7. src/api/sdk/search_infinity.py (+15 −12)
  8. src/common/logging_config.py (+84 −0)
  9. src/common/result.py (+19 −3)
  10. src/conf/settings.py (+1 −1)
  11. src/datasets/parser/core/__init__.py (+16 −0)
  12. src/datasets/parser/core/base.py (+125 −0)
  13. src/datasets/parser/core/registry.py (+118 −0)
  14. src/datasets/parser/core/workflow_builder.py (+188 −0)
  15. src/datasets/parser/nodes/__init__.py (+35 −0)
  16. src/datasets/parser/nodes/complete_node.py (+63 −0)
  17. src/datasets/parser/nodes/export_csv_node.py (+43 −0)
  18. src/datasets/parser/nodes/image_parse_node.py (+146 −0)
  19. src/datasets/parser/nodes/pdf_ocr_node.py (+113 −0)
  20. src/datasets/parser/nodes/pdf_split_node.py (+63 −0)
  21. src/datasets/parser/nodes/qa_generate_node.py (+168 −0)
  22. src/datasets/parser/nodes/ragflow_nodes.py (+282 −0)
  23. src/datasets/parser/nodes/text_split_node.py (+89 −0)
  24. src/datasets/parser/nodes/vectorize_node.py (+151 −0)
  25. src/datasets/parser/pdf_parser/pdf_splitter.py (+42 −28)
  26. src/datasets/parser/question_answer_parser/question_answer_parser.py (+2 −2)
  27. src/datasets/parser/states/__init__.py (+15 −0)
  28. src/datasets/parser/states/parser_states.py (+106 −0)
  29. src/datasets/parser/workflows/__init__.py (+15 −0)
  30. src/datasets/parser/workflows/dynamic_dimension_workflow.py (+339 −0)
  31. src/datasets/parser/workflows/image_workflow.py (+157 −0)
  32. src/datasets/parser/workflows/pdf_workflow.py (+129 −0)
  33. src/datasets/parser/workflows/qa_workflow.py (+135 −0)
  34. src/job/chunk_update_job.py (+10 −6)
  35. src/model/multimodal_embedding.py (+7 −9)
  36. src/model/qwen_vl.py (+3 −6)
  37. src/utils/es/client_manager.py (+7 −3)
  38. src/utils/file/image_util.py (+13 −9)
  39. src/utils/file/minio/minio_util.py (+7 −3)
  40. src/utils/task_queue.py (+208 −0)
  41. src/utils/vector_db/__init__.py (+14 −0)
  42. src/utils/vector_db/base.py (+134 −0)
  43. src/utils/vector_db/elasticsearch_adapter.py (+347 −0)
  44. src/utils/vector_db/factory.py (+118 −0)
  45. src/utils/vector_db/infinity_adapter.py (+146 −0)
  46. test_qa_workflow.py (+45 −0)

+ 45 - 17
main.py

@@ -2,13 +2,17 @@
 import uvicorn
 from fastapi import FastAPI
 from contextlib import asynccontextmanager
+from src.common.logging_config import get_logger
+
+# Get a module-level logger
+logger = get_logger(__name__)
 
-# Import all sub-applications
 from src.api.sdk.search_infinity import app as search_app
 from src.api.sdk.tag_manage import app as tag_app
 from src.api.sdk.dataset_manage import app as dataset_app
 from src.api.sdk.api_manage import app as api_manage_app
 from src.api.sdk.dify_dataset_manage import app as dify_dataset_manage_app
+from src.api.sdk.prompt_manage import app as prompt_manage_app
 
 # Import the authentication middleware
 from src.utils.auth import verify_api_key
@@ -17,43 +21,58 @@ from src.utils.auth import verify_api_key
 @asynccontextmanager
 async def main_lifespan(app: FastAPI):
     """Main application lifespan management"""
-    from src.utils.infinity import get_client, close_client
-    # 1. Initialize the global Infinity client (at service startup)
-    get_client(database="book_image_db", min_connections=5, max_connections=10)
-    print("✅ Infinity client initialized")
+    from src.conf.settings import vector_db_settings
+    from src.utils.vector_db import get_vector_db_client, close_vector_db_client
+    
+    # 1. Initialize the vector database client (Infinity or Elasticsearch, chosen by config)
+    db_type = vector_db_settings.vector_db_type
+    if db_type == "infinity":
+        get_vector_db_client(database="book_image_db")
+        logger.info("✅ Infinity vector database client initialized")
+    elif db_type == "es":
+        get_vector_db_client()
+        logger.info("✅ Elasticsearch vector database client initialized")
+    else:
+        logger.warning(f"⚠️ Unknown vector database type: {db_type}")
     
     # 2. Initialize the global MySQL client
     from src.utils.mysql import init_global_mysql_client, close_global_mysql_client
     init_global_mysql_client()
-    print("✅ MySQL client initialized")
+    logger.info("✅ MySQL client initialized")
     
     # 3. Initialize the global MinIO client and verify the bucket
     from src.utils.file.minio.minio_util import init_minio_client, close_minio_client
     init_minio_client(check_bucket=True)
-    print("✅ MinIO client initialized and bucket verified")
+    logger.info("✅ MinIO client initialized and bucket verified")
+
+    # 4. Initialize the vector DB tables/indices for prompt dimensions
+    from src.api.db.services.prompt_service import get_prompt_service
+    prompt_service = get_prompt_service()
+    prompt_service.init_vector_db_tables()
+    logger.info("✅ Prompt-dimension vector DB tables/indices initialized")
 
-    # 4. Start the chunk-update scheduled job
+    # 5. Start the chunk-update scheduled job
     from src.job.chunk_update_job import start_scheduler, shutdown_scheduler
     start_scheduler()
-    print("✅ Chunk update scheduler started")
+    logger.info("✅ Chunk update scheduler started")
     
     yield
 
     # 1. Shut down the chunk-update scheduled job
     shutdown_scheduler()
-    print("✅ Chunk update scheduler shutdown")
+    logger.info("✅ Chunk update scheduler shutdown")
 
     # 2. Close the global MinIO client
     close_minio_client()
-    print("✅ MinIO client closed")
+    logger.info("✅ MinIO client closed")
 
     # 3. Close the global MySQL client
     close_global_mysql_client()
-    print("✅ MySQL client closed")
+    logger.info("✅ MySQL client closed")
 
-    # 4. Close the global Infinity client (at service shutdown)
-    close_client()
-    print("✅ Infinity client closed")
+    # 4. Close the vector database client
+    close_vector_db_client()
+    logger.info(f"✅ Vector database client closed (type: {db_type})")
     
 
     
@@ -80,6 +99,8 @@ main_app.mount("/dataset", dataset_app, name="dataset_api")
 main_app.mount("/api", api_manage_app, name="api_manage")
 # 5. Dify dataset management API - path: /dify_dataset/*
 main_app.mount("/dify_dataset", dify_dataset_manage_app, name="dify_dataset_manage")
+# 6. Prompt management API - path: /prompt/*
+main_app.mount("/prompt", prompt_manage_app, name="prompt_manage")
 
 from src.common.result import Result
 
@@ -103,11 +124,18 @@ async def root():
 @main_app.get("/health")
 async def health_check():
     """主应用健康检查"""
-    data = {"status": "healthy", "service": "Infinity API Gateway"}
+    from src.utils.task_queue import get_task_queue
+    queue_info = get_task_queue().get_queue_info()
+    data = {
+        "status": "healthy",
+        "service": "Infinity API Gateway",
+        "version": "2.0.0",
+        "queue": queue_info
+    }
     return Result.success(data=data, message="Service healthy")
 
 if __name__ == "__main__":
-    print("=== 启动 GRAPH_RAG API Gateway ===")
+    logger.info("=== 启动 GRAPH_RAG API Gateway ===")
     """启动主应用"""
     uvicorn.run(
         "main:main_app",  # 应用路径: 模块名:应用实例名

+ 26 - 0
sql/prompt_schema.sql

@@ -0,0 +1,26 @@
+-- Prompt dimension table
+CREATE TABLE `prompt_dimensions` (
+  `id` int NOT NULL AUTO_INCREMENT,
+  `name` varchar(255) NOT NULL COMMENT 'Dimension name, e.g. summary generation, QA generation',
+  `description` text COMMENT 'Dimension description',
+  `created_at` datetime DEFAULT CURRENT_TIMESTAMP,
+  `updated_at` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `idx_name` (`name`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='Prompt dimension table';
+
+-- Prompt version table
+CREATE TABLE `prompt_versions` (
+  `id` int NOT NULL AUTO_INCREMENT,
+  `dimension_id` int NOT NULL COMMENT 'ID of the linked dimension',
+  `version_number` int NOT NULL COMMENT 'Version number, monotonically increasing integer',
+  `content` longtext NOT NULL COMMENT 'Prompt content; original formatting (newlines, spaces, etc.) is preserved',
+  `is_active` tinyint(1) DEFAULT '0' COMMENT 'Whether this is the currently active version',
+  `remark` varchar(500) DEFAULT NULL COMMENT 'Version remark',
+  `created_by` varchar(255) DEFAULT NULL COMMENT 'Creator',
+  `created_at` datetime DEFAULT CURRENT_TIMESTAMP,
+  PRIMARY KEY (`id`),
+  KEY `idx_dimension` (`dimension_id`),
+  KEY `idx_dimension_active` (`dimension_id`, `is_active`),
+  CONSTRAINT `fk_prompt_versions_dimension` FOREIGN KEY (`dimension_id`) REFERENCES `prompt_dimensions` (`id`) ON DELETE CASCADE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='Prompt version table';

+ 375 - 0
src/api/db/services/prompt_service.py

@@ -0,0 +1,375 @@
+"""
+提示词管理服务
+
+提供维度和提示词版本的 CRUD 操作。
+服务初始化时会自动检查并创建对应的向量数据库表/索引。
+"""
+
+from typing import List, Dict, Any, Optional
+from src.utils.mysql import get_global_mysql_client
+from src.utils.vector_db import get_vector_db_client
+from src.conf.settings import vector_db_settings
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+# Infinity table schema (used for the book_{dimension_id} tables)
+INFINITY_TABLE_COLUMNS = [
+    {"name": "id", "type": "varchar", "default": ""},
+    {"name": "file_name", "type": "varchar", "default": ""},
+    {"name": "page_number", "type": "int", "default": 0},
+    {"name": "content", "type": "varchar", "default": ""},
+    {"name": "doc_content", "type": "varchar", "default": ""},
+    {"name": "image_path", "type": "varchar", "default": ""},
+    {"name": "dataset_id", "type": "varchar", "default": ""},
+    {"name": "document_id", "type": "varchar", "default": ""},
+    {"name": "ability_tags", "type": "varchar", "default": ""},
+    {"name": "content_tag", "type": "varchar", "default": ""},
+    {"name": "dense_vector_1024", "type": "vector,1024,float"},
+]
+
+# Elasticsearch index mapping definition
+ES_INDEX_MAPPINGS = {
+    "properties": {
+        "id": {"type": "keyword"},
+        "file_name": {"type": "keyword"},
+        "page_number": {"type": "integer"},
+        "content": {"type": "text", "analyzer": "standard"},
+        "doc_content": {"type": "text", "analyzer": "standard"},
+        "image_path": {"type": "keyword"},
+        "dataset_id": {"type": "keyword"},
+        "document_id": {"type": "keyword"},
+        "ability_tags": {"type": "keyword"},
+        "content_tag": {"type": "keyword"},
+        "dense_vector_1024": {
+            "type": "dense_vector",
+            "dims": 1024,
+            "index": True,
+            "similarity": "cosine"
+        }
+    }
+}
+
+
+class PromptService:
+    """Prompt management service"""
+    
+    def __init__(self):
+        """Initialize the prompt service"""
+        self._db = get_global_mysql_client()
+        self._vector_client = None
+    
+    def _get_vector_client(self):
+        """Get the vector database client (created lazily)"""
+        if self._vector_client is None:
+            self._vector_client = get_vector_db_client()
+        return self._vector_client
+    
+    def _get_table_name(self, dimension_id: int) -> str:
+        """Get the table/index name for a dimension"""
+        return f"book_{dimension_id}"
+    
+    def init_vector_db_tables(self):
+        """
+        Initialize the vector database tables/indices
+        
+        Checks all existing dimensions and creates the corresponding
+        table/index if it does not exist yet. Supports Infinity and Elasticsearch.
+        
+        Called from the lifespan management in main.py.
+        """
+        db_type = vector_db_settings.vector_db_type
+        
+        try:
+            # Fetch all dimensions
+            dimensions = self.get_all_dimensions()
+            
+            if db_type == "infinity":
+                self._init_infinity_tables(dimensions)
+            elif db_type == "es":
+                self._init_es_indices(dimensions)
+            else:
+                logger.warning(f"Unknown vector database type: {db_type}, skipping initialization")
+        except Exception as e:
+            logger.error(f"Failed to initialize vector database tables/indices: {str(e)}")
+    
+    def _init_infinity_tables(self, dimensions: List[Dict]):
+        """Initialize Infinity tables"""
+        try:
+            client = self._get_vector_client()
+            existing_tables = client.list_tables()
+            logger.info(f"Existing Infinity tables: {existing_tables}")
+            
+            for dim in dimensions:
+                table_name = self._get_table_name(dim['id'])
+                if table_name not in existing_tables:
+                    self._create_infinity_table(table_name)
+                    logger.info(f"✅ Created Infinity table: {table_name} (dimension: {dim['name']})")
+        except Exception as e:
+            logger.error(f"Failed to initialize Infinity tables: {str(e)}")
+    
+    def _init_es_indices(self, dimensions: List[Dict]):
+        """Initialize Elasticsearch indices"""
+        try:
+            client = self._get_vector_client()
+            
+            for dim in dimensions:
+                index_name = self._get_table_name(dim['id'])
+                if not client.index_exists(index_name):
+                    self._create_es_index(index_name)
+                    logger.info(f"✅ Created ES index: {index_name} (dimension: {dim['name']})")
+                else:
+                    logger.debug(f"ES index already exists: {index_name}")
+        except Exception as e:
+            logger.error(f"Failed to initialize ES indices: {str(e)}")
+    
+    def _create_infinity_table(self, table_name: str):
+        """创建 Infinity 表"""
+        client = self._get_vector_client()
+        client.create_table(
+            table_name=table_name,
+            columns_definition=INFINITY_TABLE_COLUMNS
+        )
+    
+    def _create_es_index(self, index_name: str):
+        """创建 Elasticsearch 索引"""
+        client = self._get_vector_client()
+        client.create_index(
+            index_name=index_name,
+            mappings=ES_INDEX_MAPPINGS
+        )
+    
+    def _create_vector_db_table(self, dimension_id: int, dimension_name: str):
+        """
+        Create the vector database table/index for a dimension
+        
+        Automatically picks Infinity or Elasticsearch based on configuration.
+        """
+        db_type = vector_db_settings.vector_db_type
+        table_name = self._get_table_name(dimension_id)
+        
+        try:
+            if db_type == "infinity":
+                self._create_infinity_table(table_name)
+                logger.info(f"✅ Created Infinity table: {table_name} (dimension: {dimension_name})")
+            elif db_type == "es":
+                self._create_es_index(table_name)
+                logger.info(f"✅ Created ES index: {table_name} (dimension: {dimension_name})")
+        except Exception as e:
+            logger.error(f"Failed to create vector database table/index: {str(e)}")
+    
+    # ==================== Dimension management ====================
+    
+    def add_dimension(self, name: str, description: str = None) -> Dict[str, Any]:
+        """
+        Add a dimension
+        
+        After the dimension is created, the corresponding vector database
+        table/index (book_{dimension_id}) is created automatically.
+        
+        Args:
+            name: Dimension name
+            description: Dimension description
+            
+        Returns:
+            The newly created dimension
+        """
+        sql = """
+            INSERT INTO prompt_dimensions (name, description)
+            VALUES (%s, %s)
+        """
+        self._db.execute(sql, [name, description])
+        
+        # Fetch the newly created dimension
+        dimension = self.get_dimension_by_name(name)
+        
+        # Create the corresponding vector database table/index
+        self._create_vector_db_table(dimension['id'], name)
+        
+        return dimension
+    
+    def get_dimension_by_name(self, name: str) -> Optional[Dict[str, Any]]:
+        """Get a dimension by name"""
+        sql = "SELECT * FROM prompt_dimensions WHERE name = %s"
+        return self._db.fetch_one(sql, [name])
+    
+    def get_dimension_by_id(self, dimension_id: int) -> Optional[Dict[str, Any]]:
+        """Get a dimension by ID"""
+        sql = "SELECT * FROM prompt_dimensions WHERE id = %s"
+        return self._db.fetch_one(sql, [dimension_id])
+    
+    def get_all_dimensions(self) -> List[Dict[str, Any]]:
+        """Get all dimensions"""
+        sql = "SELECT * FROM prompt_dimensions ORDER BY created_at DESC"
+        return self._db.fetch_all(sql)
+    
+    def update_dimension(self, dimension_id: int, name: str = None, description: str = None) -> int:
+        """更新维度信息"""
+        updates = []
+        params = []
+        
+        if name is not None:
+            updates.append("name = %s")
+            params.append(name)
+        if description is not None:
+            updates.append("description = %s")
+            params.append(description)
+        
+        if not updates:
+            return 0
+        
+        params.append(dimension_id)
+        sql = f"UPDATE prompt_dimensions SET {', '.join(updates)} WHERE id = %s"
+        return self._db.execute(sql, params)
+    
+    def delete_dimension(self, dimension_id: int) -> int:
+        """删除维度(级联删除所有版本)"""
+        sql = "DELETE FROM prompt_dimensions WHERE id = %s"
+        return self._db.execute(sql, [dimension_id])
+    
+    # ==================== Version management ====================
+    
+    def add_version(
+        self,
+        dimension_id: int,
+        content: str,
+        remark: str = None,
+        created_by: str = None,
+        set_active: bool = True
+    ) -> Dict[str, Any]:
+        """
+        添加新版本
+        
+        Args:
+            dimension_id: 维度ID
+            content: 提示词内容(原始格式保留)
+            remark: 版本备注
+            created_by: 创建人
+            set_active: 是否设为激活版本
+            
+        Returns:
+            新建的版本信息
+        """
+        # 获取当前最大版本号
+        max_version_sql = """
+            SELECT COALESCE(MAX(version_number), 0) as max_version 
+            FROM prompt_versions WHERE dimension_id = %s
+        """
+        result = self._db.fetch_one(max_version_sql, [dimension_id])
+        new_version = result['max_version'] + 1
+        
+        # 如果设为激活版本,先取消当前激活版本
+        if set_active:
+            self._db.execute(
+                "UPDATE prompt_versions SET is_active = 0 WHERE dimension_id = %s",
+                [dimension_id]
+            )
+        
+        # 插入新版本
+        insert_sql = """
+            INSERT INTO prompt_versions (dimension_id, version_number, content, is_active, remark, created_by)
+            VALUES (%s, %s, %s, %s, %s, %s)
+        """
+        self._db.execute(insert_sql, [
+            dimension_id, new_version, content, 
+            1 if set_active else 0, remark, created_by
+        ])
+        
+        return self.get_version(dimension_id, new_version)
+    
+    def get_version(self, dimension_id: int, version_number: int) -> Optional[Dict[str, Any]]:
+        """获取指定版本"""
+        sql = """
+            SELECT * FROM prompt_versions 
+            WHERE dimension_id = %s AND version_number = %s
+        """
+        return self._db.fetch_one(sql, [dimension_id, version_number])
+    
+    def get_versions_by_dimension(self, dimension_id: int) -> List[Dict[str, Any]]:
+        """获取维度的所有版本"""
+        sql = """
+            SELECT * FROM prompt_versions 
+            WHERE dimension_id = %s 
+            ORDER BY version_number DESC
+        """
+        return self._db.fetch_all(sql, [dimension_id])
+    
+    def get_active_version(self, dimension_id: int) -> Optional[Dict[str, Any]]:
+        """获取当前激活版本"""
+        sql = """
+            SELECT * FROM prompt_versions 
+            WHERE dimension_id = %s AND is_active = 1
+        """
+        return self._db.fetch_one(sql, [dimension_id])
+    
+    def get_active_prompt(self, dimension_name: str) -> Optional[str]:
+        """
+        根据维度名称获取当前激活的提示词内容
+        
+        这是最常用的方法,用于在工作流中获取提示词。
+        
+        Args:
+            dimension_name: 维度名称
+            
+        Returns:
+            提示词内容,若不存在则返回 None
+        """
+        sql = """
+            SELECT pv.content 
+            FROM prompt_versions pv
+            JOIN prompt_dimensions pd ON pv.dimension_id = pd.id
+            WHERE pd.name = %s AND pv.is_active = 1
+        """
+        result = self._db.fetch_one(sql, [dimension_name])
+        return result['content'] if result else None
+    
+    def get_active_prompt_by_id(self, dimension_id: int) -> Optional[str]:
+        """
+        根据维度ID获取当前激活的提示词内容
+        
+        Args:
+            dimension_id: 维度ID
+            
+        Returns:
+            提示词内容,若不存在则返回 None
+        """
+        sql = """
+            SELECT content FROM prompt_versions 
+            WHERE dimension_id = %s AND is_active = 1
+        """
+        result = self._db.fetch_one(sql, [dimension_id])
+        return result['content'] if result else None
+    
+    def set_active_version(self, dimension_id: int, version_number: int) -> int:
+        """设置激活版本"""
+        # 先取消当前激活版本
+        self._db.execute(
+            "UPDATE prompt_versions SET is_active = 0 WHERE dimension_id = %s",
+            [dimension_id]
+        )
+        # 设置新的激活版本
+        sql = """
+            UPDATE prompt_versions SET is_active = 1 
+            WHERE dimension_id = %s AND version_number = %s
+        """
+        return self._db.execute(sql, [dimension_id, version_number])
+    
+    def delete_version(self, dimension_id: int, version_number: int) -> int:
+        """删除版本"""
+        sql = """
+            DELETE FROM prompt_versions 
+            WHERE dimension_id = %s AND version_number = %s
+        """
+        return self._db.execute(sql, [dimension_id, version_number])
+
+
+# Global service instance
+_prompt_service: Optional[PromptService] = None
+
+
+def get_prompt_service() -> PromptService:
+    """获取提示词服务实例"""
+    global _prompt_service
+    if _prompt_service is None:
+        _prompt_service = PromptService()
+    return _prompt_service
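
Usage sketch for the service above (assumes the global MySQL client has been initialized, e.g. by the FastAPI lifespan; the dimension name and prompt text are illustrative):

from src.api.db.services.prompt_service import get_prompt_service

service = get_prompt_service()
dim = service.add_dimension("summary_generation", "Prompts for page summaries")
service.add_version(dim["id"], "Summarize the page in three sentences.", set_active=True)
print(service.get_active_prompt("summary_generation"))  # prints the active prompt content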

+ 158 - 0
src/api/db/services/vector_search_service.py

@@ -0,0 +1,158 @@
+"""
+向量数据库搜索服务
+
+提供统一的搜索接口,自动根据配置选择底层数据库实现。
+"""
+
+from typing import Dict, Any, List, Optional
+from src.conf.settings import vector_db_settings
+from src.utils.vector_db import get_vector_db_client, VectorDBClient
+from src.utils.file.image_util import image_util
+from src.model.multimodal_embedding import get_embedding_model
+from src.utils.infinity.result_util import convert_to_json
+from src.api.db.models import SearchRequest
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+class VectorSearchService:
+    """
+    Vector database search service
+    
+    Automatically selects the database implementation based on the VECTOR_DB_TYPE setting.
+    Provides an interface compatible with the former InfinitySearchService.
+    """
+
+    default_output_fields = [
+        "file_name",
+        "page_number",
+        "content",
+        "image_path",
+        "dataset_id",
+        "document_id"
+    ]
+
+    def __init__(
+        self,
+        client: VectorDBClient = None,
+        database: str = None,
+        vector_field: str = None,
+        match_field: str = None,
+        match_type: str = None,
+        table_name: str = None,
+        output_fields: List[str] = None
+    ):
+        """
+        初始化搜索服务
+        
+        Args:
+            client: 向量数据库客户端(可选,不传则自动创建)
+            database: 数据库名称
+            vector_field: 向量字段名
+            match_field: 匹配字段名
+            match_type: 匹配类型
+            table_name: 表名
+            output_fields: 输出字段列表
+        """
+        self._client = client or get_vector_db_client(database=database)
+        self.output_fields = output_fields or self.default_output_fields
+        self.vector_field = vector_field or "dense_vector_1024"
+        self.match_field = match_field or "content"
+        self.match_type = match_type or "cosine"
+        self.table_name = table_name or vector_db_settings.infinity_table_name
+
+    @property
+    def client(self) -> VectorDBClient:
+        """获取底层客户端"""
+        return self._client
+
+    def search(self, request: SearchRequest) -> List[Dict[str, Any]]:
+        """
+        Run a full-text search
+    
+        Args:
+            request: Search request
+        
+        Returns:
+            List of search results
+        """
+        try:
+            search_query = self._convert_search_request_to_search_query(request)
+            result = self._client.search(self.table_name, self.output_fields, search_query)
+            result_dict = result.to_result()
+            return convert_to_json(result_dict)
+        except Exception as e:
+            logger.error(f"Search failed: {str(e)}")
+            raise Exception(f"Search failed: {str(e)}")
+
+    def vector_search(self, request: SearchRequest) -> List[Dict[str, Any]]:
+        """
+        Run a vector search
+    
+        Args:
+            request: Search request
+        
+        Returns:
+            List of search results
+        """
+        try:
+            search_query = self._convert_search_request_to_search_query(request)
+            result = self._client.vector_search(self.table_name, self.output_fields, search_query)
+            result_dict = result.to_result()
+            return convert_to_json(result_dict)
+        except Exception as e:
+            logger.error(f"Vector search failed: {str(e)}")
+            raise Exception(f"Vector search failed: {str(e)}")
+
+    def hybrid_search(self, request: SearchRequest) -> List[Dict[str, Any]]:
+        """
+        Run a hybrid search
+    
+        Args:
+            request: Search request
+        
+        Returns:
+            List of search results
+        """
+        try:
+            search_query = self._convert_search_request_to_search_query(request)
+            result = self._client.hybrid_search(self.table_name, self.output_fields, search_query)
+            result_dict = result.to_result()
+            return convert_to_json(result_dict)
+        except Exception as e:
+            logger.error(f"Hybrid search failed: {str(e)}")
+            raise Exception(f"Hybrid search failed: {str(e)}")
+
+    def _convert_search_request_to_search_query(self, request: SearchRequest) -> dict:
+        """
+        将 SearchRequest 转换为搜索查询参数
+        """
+        try:
+            query_vector = None
+            if request.score_threshold is not None:
+                if request.image_url is not None:
+                    image = image_util._url_to_image(request.image_url)
+                    query_vector = get_embedding_model().get_multimodal_embedding(request.matching_text, image)
+                else:
+                    query_vector = get_embedding_model().get_text_embedding(request.matching_text)
+            
+            search_query = {
+                "match_field": self.match_field,
+                "matching_text": request.matching_text,
+                "vector_field": self.vector_field,
+                "query_vector": query_vector,
+                "topn": request.topn,
+                "knn_params": {
+                    "ef": str(request.topn * 10),
+                    "threshold": str(request.score_threshold) if request.score_threshold else "0"
+                }
+            }
+            return search_query
+        except Exception as e:
+            logger.error(f"参数转换失败: {str(e)}")
+            raise Exception(f"参数转换失败: {str(e)}")
+
+
+# Backward-compatible alias
+SearchService = VectorSearchService
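
A hedged usage sketch for the unified service (the SearchRequest fields shown are inferred from _convert_search_request_to_search_query; the exact model definition lives in src/api/db/models and is an assumption here):

from src.api.db.models import SearchRequest
from src.api.db.services.vector_search_service import VectorSearchService

service = VectorSearchService()  # backend chosen via VECTOR_DB_TYPE
request = SearchRequest(matching_text="transformer attention", topn=5, score_threshold=0.3)
for hit in service.hybrid_search(request):
    print(hit.get("file_name"), hit.get("page_number"))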

+ 379 - 4
src/api/sdk/dataset_manage.py

@@ -3,25 +3,67 @@
 
 This file provides the dataset management API endpoints, supporting:
 - PDF file upload and parsing
+- QA pair generation
+- Image parsing
 - Dataset creation
 """
 
+import tempfile
+import os
 from fastapi import FastAPI, UploadFile, File, Form
+from pydantic import BaseModel, Field
+from typing import Optional
+
 from src.api.dataset.services.dataset_manage_service import DatasetManageService
 from src.common.result import Result
+from src.utils.task_queue import get_task_queue
+from src.common.logging_config import get_logger
 
+logger = get_logger(__name__)
 
 # Create the FastAPI application
 app = FastAPI(
     title="Dataset Management API",
-    description="Dataset management service providing PDF parsing and dataset creation",
-    version="1.0.0"
+    description="Dataset management service providing PDF parsing, QA generation and image parsing",
+    version="2.0.0"
 )
 
 # Create the dataset management service instance
 dataset_service = DatasetManageService()
 
 
+# ==================== Request models ====================
+
+class PDFParseRequest(BaseModel):
+    """PDF parse request model"""
+    dataset_name: str = Field(..., description="Dataset name")
+    page_dataset_id: str = Field(..., description="Page dataset ID")
+
+
+class QAParseRequest(BaseModel):
+    """QA parse request model"""
+    dataset_id: str = Field(..., description="RAGFlow dataset ID")
+    qa_count_per_chunk: int = Field(default=50, ge=1, le=200, description="Number of QA pairs to generate per chunk")
+    chunk_size: int = Field(default=1000, ge=100, le=5000, description="Text chunk size")
+    chunk_overlap: int = Field(default=200, ge=0, le=1000, description="Chunk overlap size")
+
+
+class ImageParseRequest(BaseModel):
+    """Image parse request model"""
+    book_name: str = Field(..., description="Book name")
+    dataset_id: str = Field(..., description="Dataset ID")
+
+
+class DynamicParseRequest(BaseModel):
+    """Dynamic multi-dimension parse request model"""
+    dimension_ids: list = Field(..., description="List of dimension IDs")
+    book_name: str = Field(..., description="Book name")
+    dataset_id: str = Field(..., description="Dataset ID")
+    document_id: str = Field(default="", description="Document ID")
+
+
+# ==================== Original endpoints ====================
+
 @app.post("/parse-pdf")
 async def parse_pdf(
     file: UploadFile = File(...),
@@ -29,7 +71,7 @@ async def parse_pdf(
     
 ):
     """
-    Parse a PDF file
+    Parse a PDF file (original endpoint)
     
     - **file**: PDF file attachment
     - **series_name**: series name
@@ -51,4 +93,337 @@ async def parse_pdf(
         
         return Result.success(data=result, message="PDF 解析成功")
     except Exception as e:
-        return Result.error(code=500, message=f"Failed to parse PDF file: {str(e)}")
+        return Result.error(code=500, message=f"Failed to parse PDF file: {str(e)}")
+
+
+# ==================== V2 workflow endpoints ====================
+
+@app.post("/v2/pdf-parse")
+async def pdf_parse_v2(
+    file: UploadFile = File(..., description="PDF文件"),
+    dataset_name: str = Form(..., description="数据集名称"),
+    page_dataset_id: str = Form(..., description="页面数据集ID")
+):
+    """
+    PDF解析接口 (V2工作流)
+    
+    使用组件化工作流解析PDF,包含:
+    - PDF拆分为图片
+    - VL模型OCR解析
+    - 向量化入库Infinity
+    - 同步到RAGFlow
+    
+    - **file**: PDF 文件附件
+    - **dataset_name**: 数据集名称
+    - **page_dataset_id**: 页面数据集ID
+    """
+    try:
+        # 验证文件格式
+        if not file.filename.endswith((".pdf", ".PDF")):
+            return Result.error(code=400, message="只支持 PDF 格式的文件")
+        
+        # 保存文件到临时目录
+        file_content = await file.read()
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+            tmp_file.write(file_content)
+            pdf_path = tmp_file.name
+        
+        logger.info(f"开始PDF解析V2: {file.filename}")
+        
+        # 定义工作流执行函数
+        def run_pdf_workflow():
+            try:
+                from src.datasets.parser.workflows import PDFParsingWorkflowV2
+                workflow = PDFParsingWorkflowV2()
+                result = workflow.run(
+                    pdf_path=pdf_path,
+                    page_dataset_id=page_dataset_id,
+                    dataset_name=dataset_name
+                )
+                return result
+            finally:
+                # 清理临时文件
+                if os.path.exists(pdf_path):
+                    os.unlink(pdf_path)
+        
+        # 提交到任务队列
+        task_queue = get_task_queue()
+        task_id = await task_queue.submit(
+            name=f"PDF解析-{file.filename}",
+            func=run_pdf_workflow
+        )
+        
+        queue_info = task_queue.get_queue_info()
+        return Result.success(data={
+            "task_id": task_id,
+            "message": "任务已提交到队列",
+            "queue_info": queue_info
+        }, message="任务已提交")
+    except Exception as e:
+        logger.error(f"PDF解析V2提交失败: {str(e)}")
+        return Result.error(code=500, message=f"任务提交失败: {str(e)}")
+
+
+@app.post("/v2/qa-parse")
+async def qa_parse_v2(
+    file: UploadFile = File(..., description="PDF文件"),
+    dataset_id: str = Form(..., description="RAGFlow数据集ID"),
+    qa_count_per_chunk: int = Form(default=50, description="每块生成的QA数量"),
+    chunk_size: int = Form(default=1000, description="文本分块大小"),
+    chunk_overlap: int = Form(default=200, description="分块重叠大小")
+):
+    """
+    QA问答对解析接口 (V2工作流)
+    
+    从PDF生成问答对,包含:
+    - PDF OCR解析
+    - 文本分块
+    - 并行QA对生成
+    - 导出CSV并上传RAGFlow
+    
+    - **file**: PDF 文件附件
+    - **dataset_id**: RAGFlow数据集ID
+    - **qa_count_per_chunk**: 每块生成的QA数量(默认50)
+    - **chunk_size**: 文本分块大小(默认1000)
+    - **chunk_overlap**: 分块重叠大小(默认200)
+    """
+    try:
+        # 验证文件格式
+        if not file.filename.endswith((".pdf", ".PDF")):
+            return Result.error(code=400, message="只支持 PDF 格式的文件")
+        
+        # 保存文件到临时目录
+        file_content = await file.read()
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
+            tmp_file.write(file_content)
+            pdf_path = tmp_file.name
+        
+        logger.info(f"开始QA解析V2: {file.filename}")
+        
+        # 定义工作流执行函数
+        def run_qa_workflow():
+            try:
+                from src.datasets.parser.workflows import QAParsingWorkflowV2
+                workflow = QAParsingWorkflowV2()
+                result = workflow.run(
+                    pdf_path=pdf_path,
+                    dataset_id=dataset_id,
+                    qa_count_per_chunk=qa_count_per_chunk,
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap
+                )
+                return result
+            finally:
+                # 清理临时文件
+                if os.path.exists(pdf_path):
+                    os.unlink(pdf_path)
+        
+        # 提交到任务队列
+        task_queue = get_task_queue()
+        task_id = await task_queue.submit(
+            name=f"QA解析-{file.filename}",
+            func=run_qa_workflow
+        )
+        
+        queue_info = task_queue.get_queue_info()
+        return Result.success(data={
+            "task_id": task_id,
+            "message": "任务已提交到队列",
+            "queue_info": queue_info
+        }, message="任务已提交")
+    except Exception as e:
+        logger.error(f"QA解析V2提交失败: {str(e)}")
+        return Result.error(code=500, message=f"任务提交失败: {str(e)}")
+
+
+@app.post("/v2/image-parse")
+async def image_parse_v2(
+    file: UploadFile = File(..., description="图片压缩包(ZIP)"),
+    book_name: str = Form(..., description="书名"),
+    dataset_id: str = Form(..., description="数据集ID")
+):
+    """
+    图片解析接口 (V2工作流)
+    
+    解析图片压缩包,包含:
+    - 解压并上传到MinIO
+    - VL模型图片解析
+    - 向量化入库Infinity
+    
+    - **file**: 图片压缩包(ZIP格式)
+    - **book_name**: 书名
+    - **dataset_id**: 数据集ID
+    """
+    try:
+        # 验证文件格式
+        if not file.filename.endswith((".zip", ".ZIP")):
+            return Result.error(code=400, message="只支持 ZIP 格式的压缩包")
+        
+        # 保存文件到临时目录
+        file_content = await file.read()
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
+            tmp_file.write(file_content)
+            zip_path = tmp_file.name
+        
+        logger.info(f"开始图片解析V2: {file.filename}")
+        
+        # 定义工作流执行函数
+        def run_image_workflow():
+            try:
+                from src.datasets.parser.workflows import ImageParsingWorkflowV2
+                workflow = ImageParsingWorkflowV2()
+                result = workflow.run(
+                    zip_file_path=zip_path,
+                    book_name=book_name,
+                    dataset_id=dataset_id
+                )
+                return result
+            finally:
+                # 清理临时文件
+                if os.path.exists(zip_path):
+                    os.unlink(zip_path)
+        
+        # 提交到任务队列
+        task_queue = get_task_queue()
+        task_id = await task_queue.submit(
+            name=f"图片解析-{file.filename}",
+            func=run_image_workflow
+        )
+        
+        queue_info = task_queue.get_queue_info()
+        return Result.success(data={
+            "task_id": task_id,
+            "message": "任务已提交到队列",
+            "queue_info": queue_info
+        }, message="任务已提交")
+    except Exception as e:
+        logger.error(f"图片解析V2提交失败: {str(e)}")
+        return Result.error(code=500, message=f"任务提交失败: {str(e)}")
+
+
+# ==================== Task queue endpoints ====================
+
+@app.get("/v2/task/{task_id}")
+async def get_task_status(task_id: str):
+    """
+    Get task status
+    
+    - **task_id**: task ID
+    """
+    task_queue = get_task_queue()
+    status = task_queue.get_task_status(task_id)
+    if not status:
+        return Result.error(code=404, message=f"Task not found: {task_id}")
+    return Result.success(data=status, message="Task status fetched")
+
+
+@app.get("/v2/queue")
+async def get_queue_status():
+    """
+    Get queue status information
+    """
+    task_queue = get_task_queue()
+    queue_info = task_queue.get_queue_info()
+    return Result.success(data=queue_info, message="Queue status fetched")
+
+
+@app.post("/v2/queue/clear")
+async def clear_completed_tasks():
+    """
+    Clear records of completed tasks
+    """
+    task_queue = get_task_queue()
+    task_queue.clear_completed()
+    return Result.success(data={}, message="Completed tasks cleared")
+
+
+# ==================== Dynamic multi-dimension parsing endpoint ====================
+
+@app.post("/v2/dynamic-parse")
+async def dynamic_parse_v2(
+    file: UploadFile = File(..., description="Image archive (ZIP)"),
+    dimension_ids: str = Form(..., description="Dimension ID list, comma-separated, e.g.: 1,2,3"),
+    book_name: str = Form(..., description="Book name"),
+    dataset_id: str = Form(..., description="Dataset ID"),
+    document_id: str = Form(default="", description="Document ID")
+):
+    """
+    Dynamic multi-dimension parsing endpoint (V2 workflow)
+    
+    For each dimension ID in the list, parses the images with that dimension's
+    prompt and stores the results. The table/index for each dimension is named
+    book_{dimension_id}.
+    
+    - **file**: image archive (ZIP format)
+    - **dimension_ids**: comma-separated dimension IDs
+    - **book_name**: book name
+    - **dataset_id**: dataset ID
+    - **document_id**: document ID (optional)
+    """
+    try:
+        # Validate the file format (the payload is a ZIP of page images)
+        if not file.filename.endswith((".zip", ".ZIP")):
+            return Result.error(code=400, message="Only ZIP archives are supported")
+        
+        # Parse the dimension ID list
+        try:
+            dim_ids = [int(x.strip()) for x in dimension_ids.split(",") if x.strip()]
+        except ValueError:
+            return Result.error(code=400, message="Invalid dimension IDs; expected comma-separated integers")
+        
+        if not dim_ids:
+            return Result.error(code=400, message="Dimension ID list must not be empty")
+        
+        # Save the file to a temporary location
+        file_content = await file.read()
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as tmp_file:
+            tmp_file.write(file_content)
+            zip_path = tmp_file.name
+        
+        logger.info(f"Starting dynamic multi-dimension parse: {file.filename}, dimensions: {dim_ids}")
+        
+        # Define the workflow runner
+        def run_dynamic_workflow():
+            try:
+                from src.datasets.parser.workflows.dynamic_dimension_workflow import DynamicDimensionWorkflow
+                from src.utils.file.image_util import image_util
+                
+                # Unzip the images and build the page list
+                image_urls = image_util.process_image_zip(zip_path, book_name)
+                image_pages = [
+                    {"page_number": i + 1, "image_url": url}
+                    for i, url in enumerate(image_urls)
+                ]
+                
+                # Run the workflow
+                workflow = DynamicDimensionWorkflow()
+                result = workflow.run(
+                    dimension_ids=dim_ids,
+                    image_pages=image_pages,
+                    split_pages=image_pages,
+                    document_id=document_id or f"{book_name}_{dataset_id}",
+                    dataset_id=dataset_id,
+                    pdf_path=""
+                )
+                return result
+            finally:
+                # Clean up the temporary file
+                if os.path.exists(zip_path):
+                    os.unlink(zip_path)
+        
+        # Submit to the task queue
+        task_queue = get_task_queue()
+        task_id = await task_queue.submit(
+            name=f"dynamic parse-{file.filename}-{len(dim_ids)} dimensions",
+            func=run_dynamic_workflow
+        )
+        
+        queue_info = task_queue.get_queue_info()
+        return Result.success(data={
+            "task_id": task_id,
+            "dimension_ids": dim_ids,
+            "message": "Task submitted to the queue",
+            "queue_info": queue_info
+        }, message="Task submitted")
+    except Exception as e:
+        logger.error(f"Dynamic parse submission failed: {str(e)}")
+        return Result.error(code=500, message=f"Task submission failed: {str(e)}")

+ 201 - 0
src/api/sdk/prompt_manage.py

@@ -0,0 +1,201 @@
+"""
+提示词管理 API
+
+提供维度和提示词版本管理的 RESTful 接口。
+"""
+
+from fastapi import FastAPI
+from pydantic import BaseModel, Field
+from typing import Optional, List
+
+from src.api.db.services.prompt_service import get_prompt_service
+from src.common.result import Result
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+# 创建 FastAPI 应用
+app = FastAPI(
+    title="提示词管理 API",
+    description="维度和提示词版本管理服务",
+    version="1.0.0"
+)
+
+
+# ==================== Request models ====================
+
+class DimensionCreateRequest(BaseModel):
+    """Create-dimension request"""
+    name: str = Field(..., description="Dimension name")
+    description: Optional[str] = Field(None, description="Dimension description")
+
+
+class DimensionUpdateRequest(BaseModel):
+    """Update-dimension request"""
+    name: Optional[str] = Field(None, description="Dimension name")
+    description: Optional[str] = Field(None, description="Dimension description")
+
+
+class VersionCreateRequest(BaseModel):
+    """Create-version request"""
+    dimension_id: int = Field(..., description="Dimension ID")
+    content: str = Field(..., description="Prompt content")
+    remark: Optional[str] = Field(None, description="Version remark")
+    created_by: Optional[str] = Field(None, description="Creator")
+    set_active: bool = Field(True, description="Whether to set as the active version")
+
+
+class SetActiveRequest(BaseModel):
+    """Set-active-version request"""
+    dimension_id: int = Field(..., description="Dimension ID")
+    version_number: int = Field(..., description="Version number")
+
+
+# ==================== Dimension endpoints ====================
+
+@app.post("/dimension")
+async def create_dimension(request: DimensionCreateRequest):
+    """Create a dimension"""
+    try:
+        service = get_prompt_service()
+        result = service.add_dimension(request.name, request.description)
+        return Result.success(data=result, message="Dimension created")
+    except Exception as e:
+        logger.error(f"Failed to create dimension: {str(e)}")
+        return Result.error(code=500, message=f"Failed to create dimension: {str(e)}")
+
+
+@app.get("/dimension")
+async def get_dimensions():
+    """Get all dimensions"""
+    try:
+        service = get_prompt_service()
+        result = service.get_all_dimensions()
+        return Result.success(data=result, message="Fetched successfully")
+    except Exception as e:
+        logger.error(f"Failed to list dimensions: {str(e)}")
+        return Result.error(code=500, message=f"Fetch failed: {str(e)}")
+
+
+@app.get("/dimension/{dimension_id}")
+async def get_dimension(dimension_id: int):
+    """Get a single dimension"""
+    try:
+        service = get_prompt_service()
+        result = service.get_dimension_by_id(dimension_id)
+        if not result:
+            return Result.error(code=404, message="Dimension not found")
+        return Result.success(data=result, message="Fetched successfully")
+    except Exception as e:
+        logger.error(f"Failed to get dimension: {str(e)}")
+        return Result.error(code=500, message=f"Fetch failed: {str(e)}")
+
+
+@app.put("/dimension/{dimension_id}")
+async def update_dimension(dimension_id: int, request: DimensionUpdateRequest):
+    """Update a dimension"""
+    try:
+        service = get_prompt_service()
+        service.update_dimension(dimension_id, request.name, request.description)
+        result = service.get_dimension_by_id(dimension_id)
+        return Result.success(data=result, message="Updated successfully")
+    except Exception as e:
+        logger.error(f"Failed to update dimension: {str(e)}")
+        return Result.error(code=500, message=f"Update failed: {str(e)}")
+
+
+@app.delete("/dimension/{dimension_id}")
+async def delete_dimension(dimension_id: int):
+    """Delete a dimension (cascade-deletes all of its versions)"""
+    try:
+        service = get_prompt_service()
+        service.delete_dimension(dimension_id)
+        return Result.success(data={}, message="Deleted successfully")
+    except Exception as e:
+        logger.error(f"Failed to delete dimension: {str(e)}")
+        return Result.error(code=500, message=f"Delete failed: {str(e)}")
+
+
+# ==================== Version endpoints ====================
+
+@app.post("/version")
+async def create_version(request: VersionCreateRequest):
+    """Create a new version"""
+    try:
+        service = get_prompt_service()
+        result = service.add_version(
+            dimension_id=request.dimension_id,
+            content=request.content,
+            remark=request.remark,
+            created_by=request.created_by,
+            set_active=request.set_active
+        )
+        return Result.success(data=result, message="Version created")
+    except Exception as e:
+        logger.error(f"Failed to create version: {str(e)}")
+        return Result.error(code=500, message=f"Failed to create version: {str(e)}")
+
+
+@app.get("/versions/{dimension_id}")
+async def get_versions(dimension_id: int):
+    """Get all versions of a dimension"""
+    try:
+        service = get_prompt_service()
+        result = service.get_versions_by_dimension(dimension_id)
+        return Result.success(data=result, message="Fetched successfully")
+    except Exception as e:
+        logger.error(f"Failed to list versions: {str(e)}")
+        return Result.error(code=500, message=f"Fetch failed: {str(e)}")
+
+
+@app.get("/active/{dimension_id}")
+async def get_active_version(dimension_id: int):
+    """Get the currently active version"""
+    try:
+        service = get_prompt_service()
+        result = service.get_active_version(dimension_id)
+        if not result:
+            return Result.error(code=404, message="No active version")
+        return Result.success(data=result, message="Fetched successfully")
+    except Exception as e:
+        logger.error(f"Failed to get active version: {str(e)}")
+        return Result.error(code=500, message=f"Fetch failed: {str(e)}")
+
+
+@app.get("/active/name/{dimension_name}")
+async def get_active_prompt_by_name(dimension_name: str):
+    """Get the active prompt content by dimension name"""
+    try:
+        service = get_prompt_service()
+        content = service.get_active_prompt(dimension_name)
+        if content is None:
+            return Result.error(code=404, message=f"Dimension '{dimension_name}' not found or has no active version")
+        return Result.success(data={"content": content}, message="Fetched successfully")
+    except Exception as e:
+        logger.error(f"Failed to get prompt: {str(e)}")
+        return Result.error(code=500, message=f"Fetch failed: {str(e)}")
+
+
+@app.post("/active")
+async def set_active_version(request: SetActiveRequest):
+    """Set the active version"""
+    try:
+        service = get_prompt_service()
+        service.set_active_version(request.dimension_id, request.version_number)
+        result = service.get_active_version(request.dimension_id)
+        return Result.success(data=result, message="Set successfully")
+    except Exception as e:
+        logger.error(f"Failed to set active version: {str(e)}")
+        return Result.error(code=500, message=f"Set failed: {str(e)}")
+
+
+@app.delete("/version/{dimension_id}/{version_number}")
+async def delete_version(dimension_id: int, version_number: int):
+    """Delete a version"""
+    try:
+        service = get_prompt_service()
+        service.delete_version(dimension_id, version_number)
+        return Result.success(data={}, message="Deleted successfully")
+    except Exception as e:
+        logger.error(f"Failed to delete version: {str(e)}")
+        return Result.error(code=500, message=f"Delete failed: {str(e)}")

+ 15 - 12
src/api/sdk/search_infinity.py

@@ -1,18 +1,19 @@
 # Infinity search API service
 
 from fastapi import FastAPI, HTTPException
-from src.api.db.services.infinity_search_service import InfinitySearchService
-from src.utils.infinity import get_client
+from src.api.db.services.vector_search_service import VectorSearchService
+from src.utils.vector_db import get_vector_db_client
 from src.common.result import Result
 from src.utils.async_utils import run_in_threadpool
 from src.api.db.models import SearchRequest
+from src.conf.settings import vector_db_settings
 
 # Create the FastAPI application
 
 app = FastAPI(
-    title="Infinity Search API",
-    description="Search API service backed by the Infinity vector database",
-    version="1.0.0"
+    title="Vector Search API",
+    description="Vector database search API service (supports Infinity / Elasticsearch)",
+    version="2.0.0"
 )
 
 # 1. Plain search endpoint
@@ -27,7 +28,7 @@ async def search(request: SearchRequest):
     - **database_name**: database name (optional; defaults to the database configured on the client)
     """
     try:
-        search_service = InfinitySearchService(infinity_client=get_client())
+        search_service = VectorSearchService(client=get_vector_db_client())
         result = await run_in_threadpool(search_service.search, request)
         return Result.success(data=result, message="搜索成功")
     except Exception as e:
@@ -45,7 +46,7 @@ async def vector_search(request: SearchRequest):
     - **database_name**: database name (optional; defaults to the database configured on the client)
     """
     try:
-        search_service = InfinitySearchService(infinity_client=get_client())
+        search_service = VectorSearchService(client=get_vector_db_client())
         result = await run_in_threadpool(search_service.vector_search, request)
         return Result.success(data=result, message="向量搜索成功")
     except Exception as e:
@@ -63,7 +64,7 @@ async def hybrid_search(request: SearchRequest):
     - **database_name**: database name (optional; defaults to the database configured on the client)
     """
     try:
-        search_service = InfinitySearchService(infinity_client=get_client())
+        search_service = VectorSearchService(client=get_vector_db_client())
         result = await run_in_threadpool(search_service.hybrid_search, request)
         return Result.success(data=result, message="混合搜索成功")
     except Exception as e:
@@ -81,10 +82,12 @@ async def question_search(request: SearchRequest):
     """
     try:
         output_fields = ["content"]
-        search_service = InfinitySearchService(infinity_client=get_client(database="ragflow_db"), 
-                                               table_name="ragflow_92162247e93e11f084830242ac1d0002_1c4f7a82f66c11f09c750242c0a8d002", 
-                                               vector_field="q_1024_vec",
-                                               output_fields=output_fields)
+        search_service = VectorSearchService(
+            client=get_vector_db_client(database="ragflow_db"), 
+            table_name="ragflow_92162247e93e11f084830242ac1d0002_1c4f7a82f66c11f09c750242c0a8d002", 
+            vector_field="q_1024_vec",
+            output_fields=output_fields
+        )
         result = await run_in_threadpool(search_service.hybrid_search, request)
         return Result.success(data=result, message="问答对检索成功")
     except Exception as e:

+ 84 - 0
src/common/logging_config.py

@@ -0,0 +1,84 @@
+"""
+集中式日志配置模块
+
+提供统一的日志配置和日志器获取接口。
+"""
+
+import logging
+import sys
+from pathlib import Path
+from typing import Optional
+
+
+# 日志格式配置
+LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
+
+# 默认日志级别
+DEFAULT_LOG_LEVEL = logging.INFO
+
+# 日志文件配置(可选)
+LOG_DIR = Path("logs")
+LOG_FILE = LOG_DIR / "app.log"
+
+
+def setup_logging(
+    level: int = DEFAULT_LOG_LEVEL,
+    log_to_file: bool = False,
+    log_file: Optional[Path] = None
+) -> None:
+    """
+    配置全局日志设置
+    
+    Args:
+        level: 日志级别
+        log_to_file: 是否输出到文件
+        log_file: 日志文件路径(如果log_to_file为True)
+    """
+    # 创建根日志器
+    root_logger = logging.getLogger()
+    root_logger.setLevel(level)
+    
+    # 清除现有的处理器
+    root_logger.handlers.clear()
+    
+    # 创建格式化器
+    formatter = logging.Formatter(LOG_FORMAT, DATE_FORMAT)
+    
+    # 添加控制台处理器
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setLevel(level)
+    console_handler.setFormatter(formatter)
+    root_logger.addHandler(console_handler)
+    
+    # 添加文件处理器(如果需要)
+    if log_to_file:
+        file_path = log_file or LOG_FILE
+        # 确保日志目录存在
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+        
+        file_handler = logging.FileHandler(file_path, encoding="utf-8")
+        file_handler.setLevel(level)
+        file_handler.setFormatter(formatter)
+        root_logger.addHandler(file_handler)
+
+
+def get_logger(name: str) -> logging.Logger:
+    """
+    获取命名日志器
+    
+    Args:
+        name: 日志器名称(通常使用 __name__)
+        
+    Returns:
+        配置好的日志器实例
+        
+    Example:
+        >>> logger = get_logger(__name__)
+        >>> logger.info("Application started")
+    """
+    return logging.getLogger(name)
+
+
+# 在模块导入时自动配置日志
+setup_logging()
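
Since importing the module already calls setup_logging() with console output only, opting into file logging means calling it again explicitly. A sketch (the log path is illustrative):

from pathlib import Path
from src.common.logging_config import setup_logging, get_logger

setup_logging(log_to_file=True, log_file=Path("logs/gateway.log"))
logger = get_logger(__name__)
logger.info("console + file logging enabled")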

+ 19 - 3
src/common/result.py

@@ -1,10 +1,26 @@
 """
 Unified API response template
 """
+from datetime import datetime
 from typing import Optional, Any, Dict, List
 from fastapi.responses import JSONResponse
 
 
+def _serialize_data(data: Any) -> Any:
+    """
+    Recursively serialize data, converting datetime objects to ISO-format strings
+    """
+    if data is None:
+        return None
+    if isinstance(data, dict):
+        return {k: _serialize_data(v) for k, v in data.items()}
+    elif isinstance(data, list):
+        return [_serialize_data(item) for item in data]
+    elif isinstance(data, datetime):
+        return data.isoformat()
+    return data
+
+
 class Result:
     """
     Unified API response class
@@ -21,7 +37,7 @@ class Result:
                 "code": 200,
                 "success": True,
                 "message": message,
-                "data": data,
+                "data": _serialize_data(data),
                 "total": total,
                 "page": page
             }
@@ -38,7 +54,7 @@ class Result:
                 "code": code,
                 "success": False,
                 "message": message,
-                "data": data,
+                "data": _serialize_data(data),
                 "total": 0,
                 "page": 1
             }
@@ -55,7 +71,7 @@ class Result:
                 "code": 200,
                 "success": True,
                 "message": message,
-                "data": data,
+                "data": _serialize_data(data),
                 "total": total,
                 "page": page
             }
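
The reason for _serialize_data: rows fetched from MySQL carry datetime objects, which JSONResponse cannot encode directly. A sketch of the effect:

from datetime import datetime
from src.common.result import Result

row = {"id": 1, "created_at": datetime(2024, 1, 1, 12, 0)}
Result.success(data=row)  # "created_at" is emitted as "2024-01-01T12:00:00"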

+ 1 - 1
src/conf/settings.py

@@ -19,7 +19,7 @@ class AppSettings(BaseSettings):
 class ModelSettings(BaseSettings):
     """模型配置类"""
     model_provider: str = Field(default="openai", alias="MODEL_PROVIDER")
-    model_name: str = Field(default="Qwen/Qwen3-VL-8B-Instruct", alias="MODEL_NAME")
+    vl_model_name: str = Field(default="Qwen/Qwen3-VL-8B-Instruct", alias="VL_MODEL_NAME")
     chat_model_name: str = Field(default="deepseek-ai/DeepSeek-V3.2", alias="CHAT_MODEL_NAME")
     embedding_model_name: str = Field(default="Qwen/Qwen3-Embedding-0.6B", alias="EMBEDDING_MODEL_NAME")
     base_url: str = Field(default="https://api.openai.com/v1", alias="BASE_URL")
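
Note the alias rename: deployments that set MODEL_NAME must now set VL_MODEL_NAME instead. A quick check (a sketch; assumes pydantic-settings reads the variable from the environment as the Field alias suggests):

from src.conf.settings import ModelSettings

print(ModelSettings().vl_model_name)  # falls back to "Qwen/Qwen3-VL-8B-Instruct" if VL_MODEL_NAME is unset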

+ 16 - 0
src/datasets/parser/core/__init__.py

@@ -0,0 +1,16 @@
+"""
+Parser核心框架模块
+
+提供节点抽象基类、状态基类和工作流构建器。
+"""
+
+from src.datasets.parser.core.base import BaseNode, BaseState
+from src.datasets.parser.core.registry import NodeRegistry
+from src.datasets.parser.core.workflow_builder import WorkflowBuilder
+
+__all__ = [
+    "BaseNode",
+    "BaseState",
+    "NodeRegistry",
+    "WorkflowBuilder",
+]

+ 125 - 0
src/datasets/parser/core/base.py

@@ -0,0 +1,125 @@
+"""
+节点抽象基类和基础状态定义
+
+提供所有节点组件的抽象基类和通用状态类。
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, Optional, List
+from pydantic import BaseModel, Field, ConfigDict
+
+
+class BaseState(BaseModel):
+    """
+    基础状态类
+    
+    所有工作流状态类都应继承此类。
+    提供通用的状态字段和配置。
+    
+    Attributes:
+        is_complete: 是否处理完成
+        error_message: 错误信息
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    
+    is_complete: bool = Field(default=False, description="是否处理完成")
+    error_message: Optional[str] = Field(default=None, description="错误信息")
+
+
+class BaseNode(ABC):
+    """
+    节点抽象基类
+    
+    所有节点组件都应继承此类并实现抽象方法。
+    支持作为callable使用,可直接传递给LangGraph。
+    
+    Example:
+        >>> class MyNode(BaseNode):
+        ...     @property
+        ...     def name(self) -> str:
+        ...         return "my_node"
+        ...     
+        ...     def execute(self, state: BaseState) -> Dict[str, Any]:
+        ...         return {"processed": True}
+        >>> 
+        >>> node = MyNode()
+        >>> graph.add_node(node.name, node)
+    """
+    
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """
+        节点名称
+        
+        用于在工作流中标识节点,必须唯一。
+        
+        Returns:
+            节点名称字符串
+        """
+        pass
+    
+    @property
+    def description(self) -> str:
+        """
+        节点描述
+        
+        可选,用于文档和调试。
+        
+        Returns:
+            节点描述字符串
+        """
+        return self.__class__.__doc__ or ""
+    
+    @abstractmethod
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        执行节点逻辑
+        
+        Args:
+            state: 当前工作流状态
+            
+        Returns:
+            状态更新字典,仅包含需要更新的字段
+        """
+        pass
+    
+    def __call__(self, state) -> Dict[str, Any]:
+        """
+        使节点可作为callable使用
+        
+        Args:
+            state: 当前工作流状态
+            
+        Returns:
+            状态更新字典
+        """
+        return self.execute(state)
+    
+    def __repr__(self) -> str:
+        return f"<{self.__class__.__name__}(name='{self.name}')>"
+
+
+class ConditionalNode(BaseNode):
+    """
+    条件节点抽象基类
+    
+    用于需要进行条件判断并返回路由结果的节点。
+    """
+    
+    @abstractmethod
+    def check_condition(self, state: BaseState) -> str:
+        """
+        检查条件并返回路由结果
+        
+        Args:
+            state: 当前工作流状态
+            
+        Returns:
+            路由目标名称字符串
+        """
+        pass
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """条件节点不执行逻辑,仅用于路由判断"""
+        return {}
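
For routing, a ConditionalNode subclass pairs check_condition with LangGraph's conditional edges. A sketch (node and route names are illustrative):

class ErrorRouter(ConditionalNode):
    """Routes to error handling when the state carries an error."""

    @property
    def name(self) -> str:
        return "error_router"

    def check_condition(self, state: BaseState) -> str:
        # The returned key is matched against the routing map in the builder
        return "handle_error" if state.error_message else "continue"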

+ 118 - 0
src/datasets/parser/core/registry.py

@@ -0,0 +1,118 @@
+"""
+节点组件注册中心
+
+提供节点组件的注册、获取和管理功能。
+"""
+
+from typing import Dict, Type, Optional, List
+from src.datasets.parser.core.base import BaseNode
+
+
+class NodeRegistry:
+    """
+    节点组件注册中心
+    
+    单例模式,管理所有已注册的节点组件。
+    支持按名称或类型获取节点实例。
+    
+    Example:
+        >>> registry = NodeRegistry()
+        >>> registry.register(PDFSplitNode)
+        >>> node = registry.get("pdf_split")
+    """
+    
+    _instance: Optional["NodeRegistry"] = None
+    _nodes: Dict[str, Type[BaseNode]] = {}
+    
+    def __new__(cls) -> "NodeRegistry":
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._nodes = {}
+        return cls._instance
+    
+    def register(self, node_class: Type[BaseNode], name: Optional[str] = None) -> None:
+        """
+        注册节点组件类
+        
+        Args:
+            node_class: 节点类(必须继承自BaseNode)
+            name: 注册名称(可选,默认使用节点的name属性)
+        """
+        # 创建临时实例获取节点名称
+        temp_instance = node_class()
+        node_name = name or temp_instance.name
+        self._nodes[node_name] = node_class
+    
+    def get(self, name: str, **kwargs) -> BaseNode:
+        """
+        获取节点实例
+        
+        Args:
+            name: 节点名称
+            **kwargs: 传递给节点构造函数的参数
+            
+        Returns:
+            节点实例
+            
+        Raises:
+            KeyError: 如果节点未注册
+        """
+        if name not in self._nodes:
+            raise KeyError(f"Node '{name}' not registered. Available nodes: {list(self._nodes.keys())}")
+        return self._nodes[name](**kwargs)
+    
+    def get_class(self, name: str) -> Type[BaseNode]:
+        """
+        获取节点类
+        
+        Args:
+            name: 节点名称
+            
+        Returns:
+            节点类
+        """
+        if name not in self._nodes:
+            raise KeyError(f"Node '{name}' not registered")
+        return self._nodes[name]
+    
+    def list_nodes(self) -> List[str]:
+        """
+        列出所有已注册的节点名称
+        
+        Returns:
+            节点名称列表
+        """
+        return list(self._nodes.keys())
+    
+    def clear(self) -> None:
+        """清空所有注册的节点"""
+        self._nodes.clear()
+    
+    def __contains__(self, name: str) -> bool:
+        return name in self._nodes
+
+
+def register_node(name: Optional[str] = None):
+    """
+    节点注册装饰器
+    
+    使用此装饰器可以自动将节点类注册到全局注册中心。
+    
+    Args:
+        name: 注册名称(可选)
+        
+    Example:
+        >>> @register_node()
+        ... class MyNode(BaseNode):
+        ...     @property
+        ...     def name(self):
+        ...         return "my_node"
+    """
+    def decorator(cls: Type[BaseNode]) -> Type[BaseNode]:
+        NodeRegistry().register(cls, name)
+        return cls
+    return decorator
+
+
+# 全局注册中心实例
+node_registry = NodeRegistry()
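
Usage sketch (not part of this commit; EchoNode is hypothetical). Note that register() instantiates the class once to read its name, so registered classes need argument-free construction, i.e. defaults for every __init__ parameter:

from typing import Any, Dict
from src.datasets.parser.core.base import BaseNode, BaseState
from src.datasets.parser.core.registry import node_registry, register_node

@register_node()  # registered under the node's name property: "echo"
class EchoNode(BaseNode):
    def __init__(self, prefix: str = ">> "):  # defaults required: register() calls EchoNode()
        self.prefix = prefix

    @property
    def name(self) -> str:
        return "echo"

    def execute(self, state: BaseState) -> Dict[str, Any]:
        return {"error_message": None}

print("echo" in node_registry)                     # True
node = node_registry.get("echo", prefix="echo: ")  # kwargs reach the constructor
print(node)                                        # <EchoNode(name='echo')>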

+ 188 - 0
src/datasets/parser/core/workflow_builder.py

@@ -0,0 +1,188 @@
+"""
+工作流构建器
+
+提供链式API简化LangGraph工作流的组装。
+"""
+
+from typing import Dict, Any, List, Tuple, Callable, Optional, Type, Union
+from langgraph.graph import StateGraph, START, END
+from pydantic import BaseModel
+
+from src.datasets.parser.core.base import BaseNode, BaseState, ConditionalNode
+
+
+class WorkflowBuilder:
+    """
+    工作流构建器
+    
+    提供链式API简化LangGraph工作流的组装过程。
+    
+    Example:
+        >>> workflow = WorkflowBuilder(MyState) \\
+        ...     .add_node(SplitNode()) \\
+        ...     .add_node(ParseNode()) \\
+        ...     .add_node(StoreNode()) \\
+        ...     .add_edge(START, "split") \\
+        ...     .add_edge("split", "parse") \\
+        ...     .add_edge("parse", "store") \\
+        ...     .add_edge("store", END) \\
+        ...     .build()
+    """
+    
+    def __init__(self, state_class: Type[BaseState]):
+        """
+        初始化工作流构建器
+        
+        Args:
+            state_class: 状态类(必须继承自BaseState)
+        """
+        self.state_class = state_class
+        self.graph = StateGraph(state_class)
+        self._nodes: Dict[str, BaseNode] = {}
+        self._edges: List[Tuple[str, str]] = []
+        self._conditional_edges: List[Tuple[str, Callable, Dict[str, str]]] = []
+    
+    def add_node(self, node: BaseNode) -> "WorkflowBuilder":
+        """
+        添加节点
+        
+        Args:
+            node: 节点实例
+            
+        Returns:
+            self,支持链式调用
+        """
+        self._nodes[node.name] = node
+        self.graph.add_node(node.name, node)
+        return self
+    
+    def add_nodes(self, *nodes: BaseNode) -> "WorkflowBuilder":
+        """
+        批量添加节点
+        
+        Args:
+            *nodes: 节点实例列表
+            
+        Returns:
+            self,支持链式调用
+        """
+        for node in nodes:
+            self.add_node(node)
+        return self
+    
+    def add_edge(self, source: Union[str, type], target: Union[str, type]) -> "WorkflowBuilder":
+        """
+        添加边
+        
+        Args:
+            source: 源节点名称或START
+            target: 目标节点名称或END
+            
+        Returns:
+            self,支持链式调用
+        """
+        # 处理START和END常量
+        src = START if source is START else source
+        tgt = END if target is END else target
+        
+        self._edges.append((src, tgt))
+        self.graph.add_edge(src, tgt)
+        return self
+    
+    def add_conditional_edge(
+        self,
+        source: str,
+        condition: Union[Callable, ConditionalNode],
+        routes: Dict[str, str]
+    ) -> "WorkflowBuilder":
+        """
+        添加条件边
+        
+        Args:
+            source: 源节点名称
+            condition: 条件函数或ConditionalNode
+            routes: 路由映射,{条件结果: 目标节点名称}
+            
+        Returns:
+            self,支持链式调用
+        """
+        if isinstance(condition, ConditionalNode):
+            condition_func = condition.check_condition
+        else:
+            condition_func = condition
+        
+        self._conditional_edges.append((source, condition_func, routes))
+        self.graph.add_conditional_edges(source, condition_func, routes)
+        return self
+    
+    def add_sequence(self, *node_names: str) -> "WorkflowBuilder":
+        """
+        添加顺序执行的节点序列
+        
+        自动为相邻节点创建边。
+        
+        Args:
+            *node_names: 节点名称序列
+            
+        Returns:
+            self,支持链式调用
+        """
+        for i in range(len(node_names) - 1):
+            self.add_edge(node_names[i], node_names[i + 1])
+        return self
+    
+    def set_entry(self, node_name: str) -> "WorkflowBuilder":
+        """
+        设置入口节点
+        
+        Args:
+            node_name: 入口节点名称
+            
+        Returns:
+            self,支持链式调用
+        """
+        self.add_edge(START, node_name)
+        return self
+    
+    def set_finish(self, node_name: str) -> "WorkflowBuilder":
+        """
+        设置结束节点
+        
+        Args:
+            node_name: 结束节点名称
+            
+        Returns:
+            self,支持链式调用
+        """
+        self.add_edge(node_name, END)
+        return self
+    
+    def build(self):
+        """
+        编译并返回工作流
+        
+        Returns:
+            编译后的LangGraph工作流
+        """
+        return self.graph.compile()
+    
+    def get_node(self, name: str) -> Optional[BaseNode]:
+        """
+        获取已添加的节点
+        
+        Args:
+            name: 节点名称
+            
+        Returns:
+            节点实例,如果不存在返回None
+        """
+        return self._nodes.get(name)
+    
+    def list_nodes(self) -> List[str]:
+        """
+        列出所有已添加的节点名称
+        
+        Returns:
+            节点名称列表
+        """
+        return list(self._nodes.keys())
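
A runnable sketch of the chainable API (not part of this commit; GreetState and both nodes are hypothetical), assuming langgraph is installed:

from typing import Any, Dict
from pydantic import Field
from src.datasets.parser.core.base import BaseNode, BaseState
from src.datasets.parser.core.workflow_builder import WorkflowBuilder

class GreetState(BaseState):
    who: str = Field(default="world")
    greeting: str = Field(default="")

class GreetNode(BaseNode):
    @property
    def name(self) -> str:
        return "greet"

    def execute(self, state: GreetState) -> Dict[str, Any]:
        return {"greeting": f"hello, {state.who}"}

class ShoutNode(BaseNode):
    @property
    def name(self) -> str:
        return "shout"

    def execute(self, state: GreetState) -> Dict[str, Any]:
        return {"greeting": state.greeting.upper(), "is_complete": True}

app = (
    WorkflowBuilder(GreetState)
    .add_nodes(GreetNode(), ShoutNode())
    .set_entry("greet")             # START → greet
    .add_sequence("greet", "shout") # greet → shout
    .set_finish("shout")            # shout → END
    .build()
)
print(app.invoke(GreetState())["greeting"])  # HELLO, WORLD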

+ 35 - 0
src/datasets/parser/nodes/__init__.py

@@ -0,0 +1,35 @@
+"""
+节点组件模块
+
+提供可复用的节点组件。
+"""
+
+from src.datasets.parser.nodes.pdf_split_node import PDFSplitNode
+from src.datasets.parser.nodes.image_parse_node import ImageParseNode
+from src.datasets.parser.nodes.text_split_node import TextSplitNode
+from src.datasets.parser.nodes.vectorize_node import VectorizeNode
+from src.datasets.parser.nodes.ragflow_nodes import (
+    RAGFlowDatasetNode,
+    RAGFlowDocumentUploadNode,
+    RAGFlowDocumentParseNode,
+    RAGFlowChunkNode,
+)
+from src.datasets.parser.nodes.qa_generate_node import QAGenerateNode
+from src.datasets.parser.nodes.complete_node import CompleteNode
+from src.datasets.parser.nodes.pdf_ocr_node import PDFOCRNode
+from src.datasets.parser.nodes.export_csv_node import ExportCSVNode
+
+__all__ = [
+    "PDFSplitNode",
+    "ImageParseNode",
+    "TextSplitNode",
+    "VectorizeNode",
+    "RAGFlowDatasetNode",
+    "RAGFlowDocumentUploadNode",
+    "RAGFlowDocumentParseNode",
+    "RAGFlowChunkNode",
+    "QAGenerateNode",
+    "CompleteNode",
+    "PDFOCRNode",
+    "ExportCSVNode",
+]

+ 63 - 0
src/datasets/parser/nodes/complete_node.py

@@ -0,0 +1,63 @@
+"""
+完成节点
+
+标记工作流完成。
+"""
+
+from typing import Dict, Any
+from src.datasets.parser.core.base import BaseNode, BaseState
+from src.datasets.parser.core.registry import register_node
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_node()
+class CompleteNode(BaseNode):
+    """
+    完成节点
+    
+    标记工作流处理完成,输出统计信息。
+    """
+    
+    def __init__(self, message_template: str = "工作流完成"):
+        """
+        初始化完成节点
+        
+        Args:
+            message_template: 完成消息模板
+        """
+        self.message_template = message_template
+    
+    @property
+    def name(self) -> str:
+        return "complete"
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        标记完成
+        
+        Args:
+            state: 当前状态
+            
+        Returns:
+            包含is_complete=True的更新字典
+        """
+        # 收集统计信息
+        stats = []
+        
+        if hasattr(state, 'processed_pages'):
+            stats.append(f"解析 {state.processed_pages} 页")
+        if hasattr(state, 'vectorized_pages'):
+            stats.append(f"向量化 {state.vectorized_pages} 页")
+        if hasattr(state, 'qa_count'):
+            stats.append(f"生成 {state.qa_count} 个QA对")
+        if hasattr(state, 'processed_images'):
+            stats.append(f"处理 {state.processed_images} 张图片")
+        
+        stats_str = ",".join(stats) if stats else ""
+        message = f"{self.message_template}:{stats_str}" if stats_str else self.message_template
+        
+        logger.info(message)
+        
+        return {"is_complete": True}

+ 43 - 0
src/datasets/parser/nodes/export_csv_node.py

@@ -0,0 +1,43 @@
+import tempfile
+import csv
+from typing import Dict, Any
+from src.datasets.parser.core.base import BaseNode
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+class ExportCSVNode(BaseNode):
+    """
+    CSV export node.
+    
+    Exports QA pairs to a CSV file.
+    """
+    
+    @property
+    def name(self) -> str:
+        return "export_csv"
+    
+    def execute(self, state) -> Dict[str, Any]:
+        qa_pairs = getattr(state, 'qa_pairs', [])
+        
+        if not qa_pairs:
+            logger.warning("No QA pairs to export")
+            return {"csv_path": None}
+        
+        logger.info(f"Exporting {len(qa_pairs)} QA pairs to CSV")
+        
+        # Create a temporary CSV file
+        with tempfile.NamedTemporaryFile(
+            mode='w',
+            suffix='.csv',
+            delete=False,
+            encoding='utf-8',
+            newline=''
+        ) as f:
+            writer = csv.DictWriter(f, fieldnames=['question', 'answer'])
+            writer.writeheader()
+            writer.writerows(qa_pairs)
+            csv_path = f.name
+        
+        logger.info(f"CSV export finished: {csv_path}")
+        return {"csv_path": csv_path}

+ 146 - 0
src/datasets/parser/nodes/image_parse_node.py

@@ -0,0 +1,146 @@
+"""
+图像解析节点
+
+使用VL模型解析图像内容,支持并行处理。
+"""
+
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, Any, List, Optional
+from src.datasets.parser.core.base import BaseNode, BaseState
+from src.datasets.parser.core.registry import register_node
+from src.model.qwen_vl import QWenVLParser
+from src.conf.settings import model_settings
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_node()
+class ImageParseNode(BaseNode):
+    """
+    图像解析节点
+    
+    使用VL模型(如Qwen-VL)解析图像内容,支持并行处理多张图片。
+    
+    需要的状态字段:
+        - split_pages: 待解析的页面列表(包含image字段)
+        
+    更新的状态字段:
+        - parsed_results: 解析结果列表
+        - processed_pages: 已处理的页面数量
+    """
+    
+    def __init__(
+        self,
+        model_name: Optional[str] = None,
+        max_workers: int = 5,
+        prompt_template: Optional[str] = None
+    ):
+        """
+        初始化图像解析节点
+        
+        Args:
+            model_name: VL模型名称
+            max_workers: 并行处理的最大工作线程数
+            prompt_template: 自定义提示词模板
+        """
+        self.model_name = model_name or model_settings.vl_model_name
+        self.max_workers = max_workers
+        self.prompt_template = prompt_template or self._default_prompt_template()
+    
+    @property
+    def name(self) -> str:
+        return "image_parse"
+    
+    def _default_prompt_template(self) -> str:
+        """默认的图像解析提示词模板"""
+        return """
+            角色定位:你是一位顶尖的儿童绘本分析师与视觉工程专家,擅长将插画视觉信息转化为高精度的结构化元数据。
+            任务描述:请深度解析提供的绘本页面,提取基本要素和特征。
+            当前提取页码为:{page_number}
+            
+            输出格式:JSON
+            {{
+                "page_number": {page_number},
+                "content": "页面内容描述",
+                "elements": []
+            }}
+        """
+    
+    def _parse_single_page(self, page: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        解析单个页面
+        
+        Args:
+            page: 页面信息,包含page_number和image字段
+            
+        Returns:
+            解析结果字典
+        """
+        page_number = page.get("page_number", 0)
+        image = page.get("image")
+        
+        prompt = self.prompt_template.format(page_number=page_number)
+        
+        logger.debug(f"开始解析第 {page_number} 页")
+        
+        try:
+            parser = QWenVLParser(self.model_name)
+            result = parser.parse_image(image, page_number, prompt)
+            logger.debug(f"第 {page_number} 页解析完成")
+            return result
+        except Exception as e:
+            logger.error(f"解析第 {page_number} 页时出错: {str(e)}")
+            return {
+                "page_number": page_number,
+                "content": "",
+                "error": str(e)
+            }
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        执行图像解析
+        
+        Args:
+            state: 包含split_pages或image_pages的状态
+            
+        Returns:
+            包含parsed_results的更新字典
+        """
+        # 支持多种状态字段名称
+        pages = getattr(state, 'split_pages', None) or getattr(state, 'image_pages', [])
+        
+        if not pages:
+            logger.warning("没有待解析的页面")
+            return {"parsed_results": [], "processed_pages": 0}
+        
+        logger.info(f"开始并行解析 {len(pages)} 页")
+        
+        parsed_results = []
+        
+        # 使用ThreadPoolExecutor实现并行处理
+        with ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix="parse_page_") as executor:
+            future_to_page = {
+                executor.submit(self._parse_single_page, page): page
+                for page in pages
+            }
+            
+            for future in concurrent.futures.as_completed(future_to_page):
+                try:
+                    result = future.result()
+                    parsed_results.append(result)
+                except Exception as e:
+                    page = future_to_page[future]
+                    logger.error(f"解析第 {page.get('page_number', '?')} 页时出错: {str(e)}")
+        
+        # 按页码排序结果
+        parsed_results.sort(key=lambda x: x.get("page_number", 0))
+        
+        logger.info(f"所有页面解析完成,共解析 {len(parsed_results)} 页")
+        
+        return {
+            "parsed_results": parsed_results,
+            "processed_pages": len(parsed_results),
+            "is_complete": True
+        }
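
The fan-out/sort pattern in execute() reduces to this self-contained sketch (pure stdlib; parse_page stands in for QWenVLParser.parse_image):

import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

def parse_page(page: dict) -> dict:
    # Stand-in for the VL call; returns per-page output.
    return {"page_number": page["page_number"], "content": f"text of page {page['page_number']}"}

pages = [{"page_number": n} for n in (3, 1, 2)]
results = []
with ThreadPoolExecutor(max_workers=2) as executor:
    futures = {executor.submit(parse_page, p): p for p in pages}
    for future in concurrent.futures.as_completed(futures):
        results.append(future.result())  # completion order is arbitrary...

results.sort(key=lambda r: r["page_number"])  # ...so sort afterwards, as ImageParseNode does
print([r["page_number"] for r in results])  # [1, 2, 3]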

+ 113 - 0
src/datasets/parser/nodes/pdf_ocr_node.py

@@ -0,0 +1,113 @@
+"""
+PDF OCR解析节点
+"""
+
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, Any, List
+from src.datasets.parser.core.base import BaseNode
+from src.datasets.parser.pdf_parser.pdf_splitter import PDFSplitter
+from src.model.qwen_vl import QWenVLParser
+from src.common.logging_config import get_logger
+from src.conf.settings import model_settings
+
+logger = get_logger(__name__)
+
+
+class PDFOCRNode(BaseNode):
+    """
+    PDF OCR解析节点
+    
+    使用VL模型提取PDF文本内容。
+    """
+    
+    def __init__(self, model_name: str = None, max_workers: int = 4):
+        self.model_name = model_name or model_settings.vl_model_name
+        self.max_workers = max_workers
+    
+    @property
+    def name(self) -> str:
+        return "pdf_ocr"
+    
+    def _parse_single_page(self, page: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        解析单个页面
+        
+        Args:
+            page: 页面信息,包含page_number和image字段
+            
+        Returns:
+            解析结果字典,包含page_number和content字段
+        """
+        page_number = page.get("page_number", 0)
+        image = page.get("image")
+        
+        prompt = "请提取这张图片中的所有文字内容,只输出文字,不要添加任何解释。"
+        
+        logger.debug(f"开始解析第 {page_number} 页")
+        
+        try:
+            parser = QWenVLParser(self.model_name)
+            result = parser.parse_image(image, page_number, prompt)
+            text = result.get("content", "")
+            logger.info(f"页面 {page_number} 提取 {len(text)} 字符")
+            return {
+                "page_number": page_number,
+                "content": text
+            }
+        except Exception as e:
+            logger.error(f"解析第 {page_number} 页时出错: {str(e)}")
+            return {
+                "page_number": page_number,
+                "content": ""
+            }
+    
+    def execute(self, state) -> Dict[str, Any]:
+        pdf_path = state.pdf_path
+        logger.info(f"开始OCR解析PDF: {pdf_path}")
+        
+        # 拆分PDF为图片
+        splitter = PDFSplitter()
+        pages = splitter.split_pdf(pdf_path=pdf_path, is_upload=False)
+        
+        if not pages:
+            logger.warning("PDF拆分后没有页面")
+            return {"full_text": ""}
+        
+        logger.info(f"开始并行OCR解析 {len(pages)} 页,最大线程数: {self.max_workers}")
+        
+        parsed_results = []
+        
+        # 使用ThreadPoolExecutor实现并行处理
+        with ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix="ocr_page_") as executor:
+            future_to_page = {
+                executor.submit(self._parse_single_page, page): page
+                for page in pages
+            }
+            
+            for future in concurrent.futures.as_completed(future_to_page):
+                try:
+                    result = future.result()
+                    parsed_results.append(result)
+                except Exception as e:
+                    page = future_to_page[future]
+                    page_number = page.get("page_number", 0)
+                    logger.error(f"解析第 {page_number} 页时出错: {str(e)}")
+                    parsed_results.append({
+                        "page_number": page_number,
+                        "content": ""
+                    })
+        
+        # 按页码排序结果,确保顺序性
+        parsed_results.sort(key=lambda x: x.get("page_number", 0))
+        
+        # 按顺序拼接文本
+        full_text_parts = [result.get("content", "") for result in parsed_results]
+        full_text = "\n\n".join(full_text_parts)
+        
+        logger.info(f"PDF OCR完成,提取 {len(full_text)} 字符")
+        
+        return {"full_text": full_text}
+
+
+

+ 63 - 0
src/datasets/parser/nodes/pdf_split_node.py

@@ -0,0 +1,63 @@
+"""
+PDF拆分节点
+
+将PDF文件拆分为独立的页面图片。
+"""
+
+from typing import Dict, Any
+from src.datasets.parser.core.base import BaseNode, BaseState
+from src.datasets.parser.core.registry import register_node
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_node()
+class PDFSplitNode(BaseNode):
+    """
+    PDF拆分节点
+    
+    将PDF文件拆分为独立的页面图片,支持后续的OCR解析。
+    
+    需要的状态字段:
+        - pdf_path: PDF文件路径
+        
+    更新的状态字段:
+        - split_pages: 拆分后的页面列表
+        - processed_pages: 已处理的页面数量
+    """
+    
+    @property
+    def name(self) -> str:
+        return "pdf_split"
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        执行PDF拆分
+        
+        Args:
+            state: 包含pdf_path的状态
+            
+        Returns:
+            包含split_pages的更新字典
+        """
+        from src.datasets.parser.pdf_parser.pdf_splitter import PDFSplitter
+        
+        pdf_path = getattr(state, 'pdf_path', None)
+        if not pdf_path:
+            raise ValueError("State must contain 'pdf_path' field")
+        
+        logger.info(f"开始拆分PDF: {pdf_path}")
+        
+        # 拆分PDF
+        splitter = PDFSplitter()
+        split_pages = splitter.split_pdf(pdf_path)
+        
+        logger.info(f"PDF拆分完成,共 {len(split_pages)} 页")
+        
+        return {
+            "split_pages": split_pages,
+            "parsed_results": [],
+            "processed_pages": 0,
+            "is_complete": False
+        }

+ 168 - 0
src/datasets/parser/nodes/qa_generate_node.py

@@ -0,0 +1,168 @@
+"""
+QA对生成节点
+
+从文本块生成问答对。
+"""
+
+import json
+import concurrent.futures
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, Any, List, Optional
+from src.datasets.parser.core.base import BaseNode, BaseState
+from src.datasets.parser.core.registry import register_node
+from src.model.qwen_vl import QWenVLParser
+from src.conf.settings import model_settings
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_node()
+class QAGenerateNode(BaseNode):
+    """
+    QA对生成节点
+    
+    从文本块并行生成问答对。
+    
+    需要的状态字段:
+        - text_chunks: 文本块列表
+        - qa_count_per_chunk: 每块生成的QA数量(可选)
+        
+    更新的状态字段:
+        - qa_pairs: QA对列表
+        - qa_count: 生成的QA总数
+    """
+    
+    def __init__(
+        self,
+        model_name: Optional[str] = None,
+        default_qa_count: int = 50,
+        max_workers: int = 5
+    ):
+        """
+        初始化QA生成节点
+        
+        Args:
+            model_name: 模型名称
+            default_qa_count: 每块默认生成的QA数量
+            max_workers: 并行处理的最大工作线程数
+        """
+        self.model_name = model_name or model_settings.chat_model_name
+        self.default_qa_count = default_qa_count
+        self.max_workers = max_workers
+    
+    @property
+    def name(self) -> str:
+        return "qa_generate"
+    
+    def _generate_qa_for_chunk(self, chunk: str, count: int, chunk_index: int) -> List[Dict[str, str]]:
+        """
+        为单个分块生成QA对
+        
+        Args:
+            chunk: 文本块内容
+            count: 生成的QA数量
+            chunk_index: 块索引
+            
+        Returns:
+            QA对列表
+        """
+        prompt = f"""
+            请根据以下文本内容,生成{count}个高质量的问答对。
+            
+            要求:
+            1. 问题应该多样化,涵盖文本的不同方面
+            2. 答案应该准确、完整,直接来源于文本
+            3. 问题应该自然,像真实用户会问的问题
+            4. 避免过于简单或过于复杂的问题
+            
+            文本内容:
+            {chunk}
+            
+            请以JSON数组格式输出,每个元素包含"question"和"answer"字段:
+            [
+                {{"question": "问题1", "answer": "答案1"}},
+                {{"question": "问题2", "answer": "答案2"}}
+            ]
+        """
+        
+        logger.debug(f"开始为第 {chunk_index + 1} 块生成QA对")
+        
+        try:
+            parser = QWenVLParser(self.model_name)
+            result = parser.chat(prompt)
+            
+            # 解析JSON响应
+            qa_pairs = self._parse_qa_response(result)
+            logger.debug(f"第 {chunk_index + 1} 块生成 {len(qa_pairs)} 个QA对")
+            return qa_pairs
+        except Exception as e:
+            logger.error(f"第 {chunk_index + 1} 块QA生成失败: {str(e)}")
+            return []
+    
+    def _parse_qa_response(self, response: str) -> List[Dict[str, str]]:
+        """
+        解析QA响应
+        
+        Args:
+            response: 模型响应文本
+            
+        Returns:
+            QA对列表
+        """
+        try:
+            # 尝试直接解析JSON
+            return json.loads(response)
+        except json.JSONDecodeError:
+            # 尝试提取JSON部分
+            import re
+            json_match = re.search(r'\[[\s\S]*\]', response)
+            if json_match:
+                try:
+                    return json.loads(json_match.group())
+                except json.JSONDecodeError:
+                    pass
+            logger.warning("无法解析QA响应为JSON")
+            return []
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        执行QA对生成
+        
+        Args:
+            state: 包含text_chunks的状态
+            
+        Returns:
+            包含qa_pairs的更新字典
+        """
+        text_chunks = getattr(state, 'text_chunks', [])
+        qa_count_per_chunk = getattr(state, 'qa_count_per_chunk', self.default_qa_count)
+        
+        if not text_chunks:
+            logger.warning("没有待处理的文本块")
+            return {"qa_pairs": [], "qa_count": 0}
+        
+        logger.info(f"开始并行生成QA对,共 {len(text_chunks)} 块,每块 {qa_count_per_chunk} 个")
+        
+        all_qa_pairs = []
+        
+        # 使用ThreadPoolExecutor实现并行处理
+        with ThreadPoolExecutor(max_workers=self.max_workers, thread_name_prefix="qa_gen_") as executor:
+            futures = [
+                executor.submit(self._generate_qa_for_chunk, chunk, qa_count_per_chunk, i)
+                for i, chunk in enumerate(text_chunks)
+            ]
+            
+            for future in concurrent.futures.as_completed(futures):
+                try:
+                    qa_pairs = future.result()
+                    all_qa_pairs.extend(qa_pairs)
+                except Exception as e:
+                    logger.error(f"QA生成任务失败: {str(e)}")
+        
+        logger.info(f"QA对生成完成,共 {len(all_qa_pairs)} 个")
+        
+        return {
+            "qa_pairs": all_qa_pairs,
+            "qa_count": len(all_qa_pairs)
+        }
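
The JSON-extraction fallback in _parse_qa_response can be exercised standalone (a sketch mirroring the node's logic; the raw response below is fabricated for illustration):

import json
import re

def parse_qa_response(response: str) -> list:
    """Strict parse first, regex fallback second, as in QAGenerateNode."""
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        match = re.search(r'\[[\s\S]*\]', response)  # grab the outermost [...] span
        if match:
            try:
                return json.loads(match.group())
            except json.JSONDecodeError:
                pass
        return []

# Models often wrap JSON in prose or code fences; the fallback still recovers it.
raw = 'Sure, here you go:\n```json\n[{"question": "Q1", "answer": "A1"}]\n```'
print(parse_qa_response(raw))  # [{'question': 'Q1', 'answer': 'A1'}]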

+ 282 - 0
src/datasets/parser/nodes/ragflow_nodes.py

@@ -0,0 +1,282 @@
+"""
+RAGFlow相关节点
+
+提供与RAGFlow交互的节点组件,包括数据集、文档和Chunk管理。
+"""
+
+from typing import Dict, Any, Optional
+from src.datasets.parser.core.base import BaseNode, ConditionalNode, BaseState
+from src.datasets.parser.core.registry import register_node
+from src.utils.ragflow.ragflow_service import RAGFlowService
+from src.conf.rag_parser_config import RagParserDefaults
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_node()
+class RAGFlowDatasetNode(BaseNode):
+    """
+    RAGFlow数据集管理节点
+    
+    获取或创建RAGFlow数据集。
+    
+    需要的状态字段:
+        - dataset_name: 数据集名称
+        
+    更新的状态字段:
+        - dataset_id: 数据集ID
+    """
+    
+    def __init__(self, create_if_not_exists: bool = True):
+        """
+        初始化数据集节点
+        
+        Args:
+            create_if_not_exists: 如果数据集不存在是否自动创建
+        """
+        self.create_if_not_exists = create_if_not_exists
+        self.ragflow_service = RAGFlowService()
+    
+    @property
+    def name(self) -> str:
+        return "ragflow_dataset"
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        获取或创建数据集
+        
+        Args:
+            state: 包含dataset_name的状态
+            
+        Returns:
+            包含dataset_id的更新字典
+        """
+        dataset_name = getattr(state, 'dataset_name', '')
+        
+        if not dataset_name:
+            raise ValueError("State must contain 'dataset_name' field")
+        
+        # 尝试获取现有数据集
+        try:
+            dataset = self.ragflow_service.get_dataset(name=dataset_name)
+            if dataset:
+                dataset_id = dataset["id"]
+                logger.info(f"数据集 {dataset_name} 已存在,ID: {dataset_id}")
+                return {"dataset_id": dataset_id}
+        except Exception as e:
+            logger.debug(f"获取数据集失败: {e}")
+        
+        # 创建新数据集
+        if self.create_if_not_exists:
+            logger.info(f"开始创建数据集: {dataset_name}")
+            dataset = self.ragflow_service.create_dataset(
+                name=dataset_name,
+                description="",
+                permission=RagParserDefaults.DATASET_PERMISSION,
+                chunk_method=RagParserDefaults.DATASET_CHUNK_METHOD,
+                parser_config=RagParserDefaults.DATASET_CONFIG_DICT
+            )
+            dataset_id = dataset["id"]
+            logger.info(f"数据集创建成功,ID: {dataset_id}")
+            return {"dataset_id": dataset_id}
+        
+        raise ValueError(f"数据集 {dataset_name} 不存在且create_if_not_exists=False")
+
+
+class DatasetExistsCondition(ConditionalNode):
+    """
+    数据集存在性检查条件节点
+    """
+    
+    @property
+    def name(self) -> str:
+        return "check_dataset_exists"
+    
+    def check_condition(self, state: BaseState) -> str:
+        """检查数据集是否存在"""
+        dataset_id = getattr(state, 'dataset_id', '')
+        if dataset_id:
+            return "exists"
+        return "not_exists"
+
+
+@register_node()
+class RAGFlowDocumentUploadNode(BaseNode):
+    """
+    RAGFlow文档上传节点
+    
+    上传文档到RAGFlow数据集。
+    
+    需要的状态字段:
+        - dataset_id: 数据集ID
+        - pdf_path: 文件路径
+        
+    更新的状态字段:
+        - document_id: 文档ID
+    """
+    
+    def __init__(self, target_field: str = "document_id"):
+        """
+        初始化文档上传节点
+        
+        Args:
+            target_field: 存储文档ID的目标字段名
+        """
+        self.target_field = target_field
+        self.ragflow_service = RAGFlowService()
+    
+    @property
+    def name(self) -> str:
+        return "ragflow_document_upload"
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        上传文档
+        
+        Args:
+            state: 包含dataset_id和文件路径的状态
+            
+        Returns:
+            包含document_id的更新字典
+        """
+        dataset_id = getattr(state, 'dataset_id', '')
+        file_path = getattr(state, 'pdf_path', '') or getattr(state, 'csv_path', '')
+        
+        if not dataset_id:
+            raise ValueError("State must contain 'dataset_id' field")
+        if not file_path:
+            raise ValueError("State must contain 'pdf_path' or 'csv_path' field")
+        
+        logger.info(f"开始上传文档到数据集 {dataset_id}: {file_path}")
+        
+        document_info_list = self.ragflow_service.upload_document(
+            dataset_id=dataset_id,
+            file_path=file_path
+        )
+        
+        if document_info_list and len(document_info_list) > 0:
+            document_id = document_info_list[0]["id"]
+            logger.info(f"文档上传成功,ID: {document_id}")
+            return {self.target_field: document_id}
+        
+        raise Exception("文档上传失败: 未返回有效的文档信息")
+
+
+@register_node()
+class RAGFlowDocumentParseNode(BaseNode):
+    """
+    RAGFlow文档解析节点
+    
+    触发RAGFlow对文档的解析。
+    
+    需要的状态字段:
+        - dataset_id: 数据集ID
+        - document_id: 文档ID
+    """
+    
+    def __init__(self):
+        self.ragflow_service = RAGFlowService()
+    
+    @property
+    def name(self) -> str:
+        return "ragflow_document_parse"
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        解析文档
+        
+        Args:
+            state: 包含dataset_id和document_id的状态
+            
+        Returns:
+            空字典(解析为异步操作)
+        """
+        dataset_id = getattr(state, 'dataset_id', '')
+        document_id = getattr(state, 'document_id', '')
+        
+        if not dataset_id or not document_id:
+            raise ValueError("State must contain 'dataset_id' and 'document_id' fields")
+        
+        logger.info(f"开始解析文档 {document_id}")
+        
+        parse_success = self.ragflow_service.parse_document(
+            dataset_id=dataset_id,
+            document_ids=[document_id]
+        )
+        
+        if parse_success:
+            logger.info(f"文档解析成功")
+            return {"parsed_results": []}
+        
+        raise Exception("文档解析失败")
+
+
+@register_node()
+class RAGFlowChunkNode(BaseNode):
+    """
+    RAGFlow Chunk创建节点
+    
+    创建文档Chunk。
+    
+    需要的状态字段:
+        - page_dataset_id: 页面数据集ID
+        - page_document_id: 页面文档ID
+        - parsed_results: 解析结果列表
+    """
+    
+    def __init__(self):
+        self.ragflow_service = RAGFlowService()
+    
+    @property
+    def name(self) -> str:
+        return "ragflow_chunk"
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        创建Chunks
+        
+        Args:
+            state: 包含相关ID和解析结果的状态
+            
+        Returns:
+            空字典
+        """
+        from src.utils.ragflow.chunk_record import get_chunk_record_service
+        from src.conf.settings import vector_db_settings
+        import os
+        
+        page_dataset_id = getattr(state, 'page_dataset_id', '')
+        page_document_id = getattr(state, 'page_document_id', '')
+        parsed_results = getattr(state, 'parsed_results', [])
+        split_pages = getattr(state, 'split_pages', [])
+        
+        logger.info(f"开始创建Chunks,共 {len(parsed_results)} 页")
+        
+        for i, parsed_result in enumerate(parsed_results):
+            page_number = parsed_result.get("page_number", i + 1)
+            text = parsed_result.get("content", "")
+            image_path = split_pages[i].get("image_path", "") if i < len(split_pages) else ""
+            
+            img_id = f"{vector_db_settings.infinity_page_dataset_id}-{os.path.basename(image_path).split('.')[0]}.png" if image_path else ""
+            
+            chunk = self.ragflow_service.create_chunk(
+                dataset_id=page_dataset_id,
+                document_id=page_document_id,
+                content=text
+            )
+            chunk_id = chunk["chunk"]["id"]
+            logger.debug(f"创建第 {page_number} 页Chunk,ID: {chunk_id}")
+            
+            # 记录到定时任务表
+            if img_id:
+                get_chunk_record_service().record_chunk_add(
+                    database_name=vector_db_settings.infinity_ragflow_database,
+                    table_name=vector_db_settings.infinity_page_table_name,
+                    chunk_id=chunk_id,
+                    cond=f"id = '{chunk_id}'",
+                    data={"img_id": img_id}
+                )
+        
+        logger.info(f"Chunks创建完成")
+        return {}
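
DatasetExistsCondition is defined here but not wired into any of the workflows in this commit. A sketch (not part of this commit, and not a production flow) of how it could route via WorkflowBuilder.add_conditional_edge, using the node names registered above:

from langgraph.graph import END

from src.datasets.parser.core.workflow_builder import WorkflowBuilder
from src.datasets.parser.nodes.ragflow_nodes import (
    DatasetExistsCondition,
    RAGFlowDatasetNode,
    RAGFlowDocumentUploadNode,
)
from src.datasets.parser.states.parser_states import PDFParsingState

builder = (
    WorkflowBuilder(PDFParsingState)
    .add_nodes(RAGFlowDatasetNode(create_if_not_exists=True), RAGFlowDocumentUploadNode())
    .set_entry("ragflow_dataset")
    # check_condition() returns "exists" or "not_exists"; map each result to a target.
    .add_conditional_edge(
        "ragflow_dataset",
        DatasetExistsCondition(),
        {"exists": "ragflow_document_upload", "not_exists": END},
    )
    .set_finish("ragflow_document_upload")
)
app = builder.build()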

+ 89 - 0
src/datasets/parser/nodes/text_split_node.py

@@ -0,0 +1,89 @@
+"""
+文本分块节点
+
+将长文本分割为适合处理的小块。
+"""
+
+from typing import Dict, Any, List, Optional
+from src.datasets.parser.core.base import BaseNode, BaseState
+from src.datasets.parser.core.registry import register_node
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_node()
+class TextSplitNode(BaseNode):
+    """
+    文本分块节点
+    
+    使用LangChain的RecursiveCharacterTextSplitter将长文本分割为小块。
+    
+    需要的状态字段:
+        - full_text: 待分块的完整文本
+        
+    更新的状态字段:
+        - text_chunks: 分块后的文本列表
+    """
+    
+    def __init__(
+        self,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        separators: Optional[List[str]] = None
+    ):
+        """
+        初始化文本分块节点
+        
+        Args:
+            chunk_size: 每块的最大字符数
+            chunk_overlap: 相邻块之间的重叠字符数
+            separators: 自定义分隔符列表
+        """
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.separators = separators or ["\n\n", "\n", "。", ".", " ", ""]
+    
+    @property
+    def name(self) -> str:
+        return "text_split"
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        执行文本分块
+        
+        Args:
+            state: 包含full_text或通过chunk_size/chunk_overlap配置的状态
+            
+        Returns:
+            包含text_chunks的更新字典
+        """
+        full_text = getattr(state, 'full_text', '')
+        
+        # 支持从状态中获取动态配置
+        chunk_size = getattr(state, 'chunk_size', self.chunk_size)
+        chunk_overlap = getattr(state, 'chunk_overlap', self.chunk_overlap)
+        
+        if not full_text:
+            logger.warning("没有待分块的文本")
+            return {"text_chunks": []}
+        
+        logger.info(f"开始文本分块,文本长度: {len(full_text)},块大小: {chunk_size},重叠: {chunk_overlap}")
+        
+        # 创建文本分割器
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            separators=self.separators,
+            length_function=len
+        )
+        
+        # 分割文本
+        text_chunks = text_splitter.split_text(full_text)
+        
+        logger.info(f"文本分块完成,共 {len(text_chunks)} 块")
+        
+        return {
+            "text_chunks": text_chunks
+        }
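
A quick sketch of what the splitter produces (not part of this commit; assumes langchain-text-splitters is installed, with the sizes scaled down so the effect is visible):

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Same configuration the node builds, at toy scale.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    separators=["\n\n", "\n", "。", ".", " ", ""],
    length_function=len,
)

text = "First paragraph about PDFs.\n\nSecond paragraph about OCR and chunking for retrieval."
for chunk in splitter.split_text(text):
    print(repr(chunk))  # every chunk is at most 50 chars; adjacent chunks may overlap by up to 10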

+ 151 - 0
src/datasets/parser/nodes/vectorize_node.py

@@ -0,0 +1,151 @@
+"""
+向量化节点
+
+将解析结果向量化并存入Infinity数据库。
+"""
+
+import os
+from typing import Dict, Any, List, Optional
+from src.datasets.parser.core.base import BaseNode, BaseState
+from src.datasets.parser.core.registry import register_node
+from src.model.multimodal_embedding import Embedding
+from src.conf.settings import model_settings, vector_db_settings
+from src.utils.infinity import get_client
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_node()
+class VectorizeNode(BaseNode):
+    """
+    向量化入库节点
+    
+    将解析结果向量化并存入Infinity向量数据库。
+    
+    需要的状态字段:
+        - parsed_results: 解析结果列表
+        - split_pages: 包含图片信息的页面列表(可选)
+        - document_id: 文档ID
+        - dataset_id: 数据集ID
+        - pdf_path: PDF文件路径(可选)
+        
+    更新的状态字段:
+        - vectorized_results: 向量化结果列表
+        - vectorized_pages: 已向量化的页面数量
+    """
+    
+    def __init__(
+        self,
+        table_name: Optional[str] = None,
+        database_name: Optional[str] = None,
+        embedding_model_name: Optional[str] = None
+    ):
+        """
+        初始化向量化节点
+        
+        Args:
+            table_name: Infinity表名
+            database_name: Infinity数据库名
+            embedding_model_name: 嵌入模型名称
+        """
+        self.table_name = table_name or vector_db_settings.infinity_table_name
+        self.database_name = database_name or vector_db_settings.infinity_database
+        self.embedding_model_name = embedding_model_name or model_settings.multimodal_embedding_model_name
+        self._embedding_model = None
+    
+    @property
+    def name(self) -> str:
+        return "vectorize_store"
+    
+    @property
+    def embedding_model(self) -> Embedding:
+        """懒加载嵌入模型"""
+        if self._embedding_model is None:
+            self._embedding_model = Embedding(
+                model_name=self.embedding_model_name,
+                api_key=model_settings.dashscope_api_key
+            )
+        return self._embedding_model
+    
+    def execute(self, state: BaseState) -> Dict[str, Any]:
+        """
+        执行向量化入库
+        
+        Args:
+            state: 包含parsed_results的状态
+            
+        Returns:
+            包含vectorized_results的更新字典
+        """
+        parsed_results = getattr(state, 'parsed_results', [])
+        split_pages = getattr(state, 'split_pages', [])
+        document_id = getattr(state, 'document_id', '')
+        dataset_id = getattr(state, 'dataset_id', '')
+        pdf_path = getattr(state, 'pdf_path', '')
+        
+        if not parsed_results:
+            logger.warning("没有待向量化的解析结果")
+            return {"vectorized_results": [], "vectorized_pages": 0}
+        
+        logger.info(f"开始向量化入库,共 {len(parsed_results)} 页")
+        
+        # 准备要入库的文档列表
+        documents_to_store = []
+        
+        # 获取文件名和总页数
+        file_name = os.path.basename(pdf_path) if pdf_path else ''
+        file_page_count = len(split_pages)
+        
+        # 遍历所有解析结果,生成向量化文档
+        for i, parsed_result in enumerate(parsed_results):
+            try:
+                page_number = parsed_result.get("page_number", i + 1)
+                text = parsed_result.get("content", "")
+                image = split_pages[i].get("image") if i < len(split_pages) else None
+                image_path = split_pages[i].get("image_path", "") if i < len(split_pages) else ""
+                
+                # 获取多模态嵌入向量
+                logger.debug(f"正在生成第 {page_number} 页的多模态嵌入...")
+                
+                if image:
+                    embedding = self.embedding_model.get_multimodal_embedding(text, image)
+                else:
+                    embedding = self.embedding_model.get_text_embedding(text)
+                
+                # 生成1024维稠密向量
+                dense_vector_1024 = embedding[:1024] if len(embedding) >= 1024 else embedding
+                
+                # 创建文档
+                document = {
+                    "id": f"{document_id}_{page_number}",
+                    "file_name": file_name,
+                    "file_page_count": file_page_count,
+                    "page_number": page_number,
+                    "content": text,
+                    "image_path": image_path,
+                    "dense_vector_1024": dense_vector_1024,
+                    "dataset_id": dataset_id,
+                    "document_id": document_id
+                }
+                
+                documents_to_store.append(document)
+                logger.debug(f"第 {page_number} 页向量化完成")
+            except Exception as e:
+                logger.error(f"第 {i+1} 页向量化失败: {str(e)}")
+        
+        # 批量入库
+        if documents_to_store:
+            logger.info(f"开始入库,共 {len(documents_to_store)} 个文档")
+            result = get_client().insert(
+                table_name=self.table_name,
+                documents=documents_to_store,
+                database_name=self.database_name
+            )
+            logger.info(f"入库完成")
+        
+        return {
+            "vectorized_results": documents_to_store,
+            "vectorized_pages": len(documents_to_store),
+            "is_complete": True
+        }

+ 42 - 28
src/datasets/parser/pdf_parser/pdf_splitter.py

@@ -1,14 +1,16 @@
 import fitz
 from PIL import Image
 import io
+import os
 from typing import List, Dict, Tuple
 from src.conf.settings import vector_db_settings
+from src.utils.file.minio.minio_util import MinIOUtil
 
 class PDFSplitter:
     """Utility for splitting a scanned PDF into per-page images"""
     
     @staticmethod
-    def split_pdf(pdf_path: str) -> List[Dict[str, any]]:
+    def split_pdf(pdf_path: str, is_upload: bool = True) -> List[Dict[str, any]]:
         """
         Split the PDF page by page, convert each page to an image, record its page number, and save the images to MinIO
         
@@ -22,13 +24,10 @@ class PDFSplitter:
                 - image_bytes: image byte stream
                 - image_path: URL of the image saved in MinIO
         """
-        import os
-        from utils.file.minio.minio_util import MinIOUtil
+
        
-        try:
-            # Initialize MinioUtil
-            minio_util = MinIOUtil()
-            
+        pdf_document = None
+        try:            
             # Open the PDF file
            pdf_document = fitz.open(pdf_path)
            
@@ -49,34 +48,49 @@
                # Convert the fitz pixmap to a PIL image
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                
-                # Convert the image to a byte stream for downstream processing
-                image_bytes = io.BytesIO()
-                image.save(image_bytes, format='PNG')
-                image_bytes.seek(0)
-                
-                # Build the image file name
-                image_filename = f"{pdf_filename}_{page_number}.png"
-                
-                # Reset the byte-stream pointer
-                image_bytes.seek(0)
-                
-                # Upload the image to MinIO and get its URL
-                image_url = minio_util.custom_upload_file(file=image_bytes, original_filename=image_filename, bucket_name=vector_db_settings.infinity_page_dataset_id)
+                if is_upload:
+                    # Initialize MinioUtil
+                    minio_util = MinIOUtil()
+                    # Convert the image to a byte stream for downstream processing
+                    image_bytes = io.BytesIO()
+                    image.save(image_bytes, format='PNG')
+                    image_bytes.seek(0)
+                    
+                    # Build the image file name
+                    image_filename = f"{pdf_filename}_{page_number}.png"
+                    
+                    # Reset the byte-stream pointer
+                    image_bytes.seek(0)
+                    
+                    # Upload the image to MinIO and get its URL
+                    image_url = minio_util.custom_upload_file(file=image_bytes, original_filename=image_filename, bucket_name=vector_db_settings.infinity_page_dataset_id)
                
-                result.append({
-                    "page_number": page_number,
-                    "image": image,
-                    "image_bytes": image_bytes,
-                    "image_path": image_url
-                })
+                    result.append({
+                        "page_number": page_number,
+                        "image": image,
+                        "image_bytes": image_bytes,
+                        "image_path": image_url
+                    })
+                else:
+                    result.append({
+                        "page_number": page_number,
+                        "image": image,
+                        "image_bytes": None,
+                        "image_path": None
+                    })
            
-            # Close the PDF file
-            pdf_document.close()
             # Sort result by page_number
             result.sort(key=lambda x: x["page_number"])
             return result
         except Exception as e:
             raise Exception(f"PDF split failed: {str(e)}")
+        finally:
+            # Make sure the PDF document is always closed
+            if pdf_document is not None:
+                try:
+                    pdf_document.close()
+                except Exception:
+                    pass
     
     @staticmethod
     def save_page_image(image: Image.Image, output_path: str) -> None:
+ 2 - 2
src/datasets/parser/question_answer_parser/question_answer_parser.py

@@ -105,7 +105,7 @@ class QuestionAnswerParserWorkflow:
         try:
             # Use PDFSplitter to split the PDF into images
             splitter = PDFSplitter()
-            pages = splitter.split_pdf(state.pdf_path)
+            pages = splitter.split_pdf(pdf_path=state.pdf_path, is_upload=False)
             
             # Use the QWenVL model to extract text from every page
             extracted_texts = []
@@ -117,7 +117,7 @@ class QuestionAnswerParserWorkflow:
                 
                 # Prompt for OCR text extraction
                 prompt = """Extract all the text content in the image, preserving the original paragraph structure.
-Output only the extracted text, without any extra explanation or formatting."""
+                            Output only the extracted text, without any extra explanation or formatting."""
                 
                 result = parser.parse_image(image, page_number, prompt)
                 text = result.get("content", "")

+ 15 - 0
src/datasets/parser/states/__init__.py

@@ -0,0 +1,15 @@
+"""
+Parser state definitions module.
+"""
+
+from src.datasets.parser.states.parser_states import (
+    PDFParsingState,
+    QAParsingState,
+    ImageParsingState,
+)
+
+__all__ = [
+    "PDFParsingState",
+    "QAParsingState",
+    "ImageParsingState",
+]

+ 106 - 0
src/datasets/parser/states/parser_states.py

@@ -0,0 +1,106 @@
+"""
+解析器工作流状态定义
+
+定义各类解析工作流使用的状态类。
+"""
+
+from typing import List, Dict, Any, Optional
+from pydantic import Field, ConfigDict
+from src.datasets.parser.core.base import BaseState
+
+
+class PDFParsingState(BaseState):
+    """
+    PDF解析工作流状态
+    
+    用于PDF扫描件的拆分、OCR解析和向量化入库。
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    
+    # 输入参数
+    pdf_path: str = Field(..., description="PDF文件路径")
+    dataset_name: str = Field(..., description="数据集名称")
+    page_dataset_id: str = Field(..., description="页面数据集ID")
+    
+    # RAGFlow相关
+    dataset_id: str = Field(default="", description="RAGFLOW数据集ID")
+    document_id: str = Field(default="", description="上传后的文档ID")
+    page_document_id: str = Field(default="", description="上传后的页面文档ID")
+    
+    # 处理过程
+    split_pages: List[Dict[str, Any]] = Field(default_factory=list, description="拆分后的页面列表")
+    current_page: Dict[str, Any] = Field(default_factory=dict, description="当前处理的页面")
+    parsed_results: List[Dict[str, Any]] = Field(default_factory=list, description="解析结果列表")
+    vectorized_results: List[Dict[str, Any]] = Field(default_factory=list, description="向量化结果列表")
+    
+    # 统计信息
+    processed_pages: int = Field(default=0, description="已处理的页面数量")
+    vectorized_pages: int = Field(default=0, description="已向量化的页面数量")
+
+
+class QAParsingState(BaseState):
+    """
+    QA解析工作流状态
+    
+    用于PDF到QA问答对的解析流程。
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    
+    # 输入参数
+    pdf_path: str = Field(..., description="PDF文件路径")
+    dataset_id: str = Field(..., description="RAGFlow数据集ID")
+    qa_count_per_chunk: int = Field(default=50, description="每块生成的QA数量")
+    chunk_size: int = Field(default=1000, description="文本分块大小")
+    chunk_overlap: int = Field(default=200, description="分块重叠大小")
+    
+    # 处理过程
+    full_text: str = Field(default="", description="PDF提取的完整文本")
+    text_chunks: List[str] = Field(default_factory=list, description="分块后的文本列表")
+    qa_pairs: List[Dict[str, Any]] = Field(default_factory=list, description="生成的QA对列表")
+    csv_path: Optional[str] = Field(default=None, description="导出的CSV文件路径")
+    document_id: Optional[str] = Field(default=None, description="上传的文档ID")
+    
+    # 统计信息
+    qa_count: int = Field(default=0, description="生成的QA对数量")
+
+
+class ImageParsingState(BaseState):
+    """
+    图片解析工作流状态
+    
+    用于图片压缩包的解析和向量化入库。
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    
+    # 输入参数
+    zip_file_path: str = Field(..., description="图片压缩包路径")
+    book_name: str = Field(..., description="书名")
+    dataset_id: str = Field(..., description="数据集ID")
+    ragflow_api_url: str = Field(default="", description="RAGFLOW API URL")
+    rag_flow_api_key: str = Field(default="", description="RAGFLOW API密钥")
+    
+    # 处理过程
+    image_urls: List[str] = Field(default_factory=list, description="上传后的图片URL列表")
+    image_pages: List[Dict[str, Any]] = Field(default_factory=list, description="图片页面信息列表")
+    parsed_results: List[Dict[str, Any]] = Field(default_factory=list, description="解析结果列表")
+    vectorized_results: List[Dict[str, Any]] = Field(default_factory=list, description="向量化结果列表")
+    
+    # 统计信息
+    processed_images: int = Field(default=0, description="已处理的图片数量")
+    vectorized_images: int = Field(default=0, description="已向量化的图片数量")
+
+
+class TextChunk(BaseState):
+    """
+    文本分块状态(可组合使用)
+    """
+    text_chunks: List[str] = Field(default_factory=list, description="分块后的文本列表")
+    chunk_count: int = Field(default=0, description="分块数量")
+
+
+class VectorizationMixin(BaseState):
+    """
+    向量化相关状态(可组合使用)
+    """
+    vectorized_results: List[Dict[str, Any]] = Field(default_factory=list, description="向量化结果列表")
+    vectorized_count: int = Field(default=0, description="已向量化数量")
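
A short instantiation sketch (not part of this commit): fields declared with Field(...) are required, everything else falls back to its default:

from src.datasets.parser.states.parser_states import QAParsingState

state = QAParsingState(pdf_path="/tmp/book.pdf", dataset_id="ds-123")  # required fields only
print(state.qa_count_per_chunk, state.chunk_size, state.is_complete)   # 50 1000 False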

+ 15 - 0
src/datasets/parser/workflows/__init__.py

@@ -0,0 +1,15 @@
+"""
+Workflow module.
+
+Provides workflows assembled from the node components.
+"""
+
+from src.datasets.parser.workflows.pdf_workflow import PDFParsingWorkflowV2
+from src.datasets.parser.workflows.qa_workflow import QAParsingWorkflowV2
+from src.datasets.parser.workflows.image_workflow import ImageParsingWorkflowV2
+
+__all__ = [
+    "PDFParsingWorkflowV2",
+    "QAParsingWorkflowV2",
+    "ImageParsingWorkflowV2",
+]

+ 339 - 0
src/datasets/parser/workflows/dynamic_dimension_workflow.py

@@ -0,0 +1,339 @@
+"""
+动态多维度解析工作流 (LangGraph 动态构建方案)
+
+根据维度ID列表,运行时动态构建 LangGraph 节点,每个维度作为独立节点。
+支持动态选择维度和执行顺序。
+
+工作流结构 (动态生成):
+    START → ragflow_dataset → ragflow_document_upload → ragflow_document_parse
+          → pdf_split → skill_dim_1 → skill_dim_3 → skill_dim_5 → complete → END
+"""
+
+from typing import Dict, Any, List, Optional
+from langgraph.graph import START, END
+from langfuse.langchain import CallbackHandler
+
+from src.datasets.parser.core.workflow_builder import WorkflowBuilder
+from src.datasets.parser.core.base import BaseNode, BaseState
+from src.datasets.parser.states.parser_states import PDFParsingState
+from src.datasets.parser.nodes import (
+    PDFSplitNode,
+    ImageParseNode,
+    VectorizeNode,
+    CompleteNode,
+    RAGFlowDatasetNode,
+    RAGFlowDocumentUploadNode,
+    RAGFlowDocumentParseNode,
+)
+from src.api.db.services.prompt_service import get_prompt_service
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+class DynamicDimensionState(BaseState):
+    """动态维度解析状态"""
+    # 输入参数
+    pdf_path: str = ""
+    dimension_ids: List[int] = []
+    dataset_id: str = ""
+    dataset_name: str = ""
+    document_id: str = ""
+    
+    # RAGFlow 相关
+    ragflow_api_url: str = ""
+    rag_flow_api_key: str = ""
+    
+    # 中间状态
+    split_pages: List[Dict[str, Any]] = []
+    parsed_results: List[Dict[str, Any]] = []
+    
+    # 输出 - 每个维度的结果
+    dimension_results: Dict[int, Dict[str, Any]] = {}
+    total_vectorized_pages: int = 0
+    is_complete: bool = False
+
+
+class DimensionSkillNode(BaseNode):
+    """
+    Dimension skill node
+
+    Processing node for a single dimension. It:
+    1. Fetches the dimension's prompt
+    2. Parses the images with that prompt
+    3. Vectorizes the results into book_{dimension_id}
+
+    Each dimension is an independent LangGraph node and can be traced by Langfuse.
+    """
+
+    def __init__(
+        self,
+        dimension_id: int,
+        model_name: str = "Qwen/Qwen3-VL-8B-Instruct",
+        max_workers: int = 5
+    ):
+        self.dimension_id = dimension_id
+        self.model_name = model_name
+        self.max_workers = max_workers
+        self._prompt_service = None
+        self._prompt = None  # Cached prompt
+
+    @property
+    def name(self) -> str:
+        """Node name, in the form skill_dim_{id}"""
+        return f"skill_dim_{self.dimension_id}"
+
+    @property
+    def prompt_service(self):
+        """Lazily load the prompt service"""
+        if self._prompt_service is None:
+            self._prompt_service = get_prompt_service()
+        return self._prompt_service
+
+    def _get_table_name(self) -> str:
+        """Return the table name for this dimension"""
+        return f"book_{self.dimension_id}"
+
+    def _get_prompt(self) -> Optional[str]:
+        """Fetch and cache the dimension's prompt"""
+        if self._prompt is None:
+            self._prompt = self.prompt_service.get_active_prompt_by_id(self.dimension_id)
+        return self._prompt
+
+    def execute(self, state: DynamicDimensionState) -> Dict[str, Any]:
+        """
+        Run the dimension skill:
+
+        1. Fetch the prompt
+        2. Parse the images
+        3. Vectorize and store the results
+        """
+        logger.info(f"[Skill-{self.dimension_id}] Starting dimension skill")
+
+        # 1. Fetch the prompt
+        prompt = self._get_prompt()
+        if not prompt:
+            logger.warning(f"[Skill-{self.dimension_id}] No active prompt, skipping")
+            # Record the skip in the per-dimension results
+            dim_results = dict(getattr(state, 'dimension_results', {}) or {})
+            dim_results[self.dimension_id] = {
+                "dimension_id": self.dimension_id,
+                "skipped": True,
+                "reason": "no_active_prompt"
+            }
+            return {"dimension_results": dim_results}
+
+        table_name = self._get_table_name()
+        logger.info(f"[Skill-{self.dimension_id}] Table name: {table_name}")
+
+        # 2. Create and run the image-parse node
+        parse_node = ImageParseNode(
+            model_name=self.model_name,
+            max_workers=self.max_workers,
+            prompt_template=prompt
+        )
+        parse_result = parse_node.execute(state)
+        parsed_results = parse_result.get("parsed_results", [])
+
+        logger.info(f"[Skill-{self.dimension_id}] Parsing done, {len(parsed_results)} pages")
+
+        # 3. Build a temporary state for vectorization
+        temp_state = DynamicDimensionState(
+            split_pages=getattr(state, 'split_pages', []),
+            parsed_results=parsed_results,
+            document_id=getattr(state, 'document_id', ''),
+            dataset_id=getattr(state, 'dataset_id', ''),
+            pdf_path=getattr(state, 'pdf_path', '')
+        )
+
+        # 4. Run vectorization
+        vectorize_node = VectorizeNode(table_name=table_name)
+        vectorize_result = vectorize_node.execute(temp_state)
+        vectorized_pages = vectorize_result.get('vectorized_pages', 0)
+
+        logger.info(f"[Skill-{self.dimension_id}] Stored {vectorized_pages} pages into {table_name}")
+
+        # 5. Update the state
+        dim_results = dict(getattr(state, 'dimension_results', {}) or {})
+        dim_results[self.dimension_id] = {
+            "dimension_id": self.dimension_id,
+            "table_name": table_name,
+            "parsed_pages": parse_result.get("processed_pages", 0),
+            "vectorized_pages": vectorized_pages,
+            "success": True
+        }
+
+        total_vectorized = getattr(state, 'total_vectorized_pages', 0) + vectorized_pages
+
+        return {
+            "dimension_results": dim_results,
+            "total_vectorized_pages": total_vectorized
+        }
+
+
+class DynamicDimensionWorkflow:
+    """
+    Dynamic multi-dimension parsing workflow (dynamically built LangGraph)
+
+    Each call to run() builds the workflow graph from the dimension_ids argument.
+    Every dimension is an independent LangGraph node, which supports:
+    - Selecting dimensions dynamically
+    - Custom execution order
+    - Independent Langfuse tracing per dimension
+
+    Example:
+        >>> workflow = DynamicDimensionWorkflow()
+        >>> result = workflow.run(
+        ...     pdf_path="/path/to/file.pdf",
+        ...     dimension_ids=[3, 1, 5],  # executed in this order
+        ...     dataset_id="xxx"
+        ... )
+    """
+
+    def __init__(self, model_name: str = "Qwen/Qwen3-VL-8B-Instruct", max_workers: int = 5):
+        """
+        Initialize the workflow
+
+        Args:
+            model_name: VL model name
+            max_workers: number of parallel image-parsing threads within each dimension
+        """
+        self.model_name = model_name
+        self.max_workers = max_workers
+        self.langfuse_handler = CallbackHandler()
+
+    def _build_workflow_for_dimensions(self, dimension_ids: List[int]):
+        """
+        Build the LangGraph workflow dynamically from a list of dimension IDs
+
+        Args:
+            dimension_ids: dimension IDs, which determine the nodes and their order
+
+        Returns:
+            The compiled LangGraph workflow
+        """
+        logger.info(f"Building workflow dynamically, dimensions: {dimension_ids}")
+
+        # Create the fixed nodes
+        dataset_node = RAGFlowDatasetNode(create_if_not_exists=True)
+        upload_node = RAGFlowDocumentUploadNode(target_field="document_id")
+        parse_doc_node = RAGFlowDocumentParseNode()
+        split_node = PDFSplitNode()
+        complete_node = CompleteNode(message_template="Dynamic multi-dimension parsing finished")
+
+        # Build with WorkflowBuilder
+        builder = WorkflowBuilder(DynamicDimensionState)
+
+        # Add the fixed nodes
+        builder.add_nodes(
+            dataset_node,
+            upload_node,
+            parse_doc_node,
+            split_node,
+            complete_node
+        )
+
+        # Fixed edges: START → ragflow_dataset → upload → parse → pdf_split
+        builder.set_entry("ragflow_dataset")
+        builder.add_edge("ragflow_dataset", "ragflow_document_upload")
+        builder.add_edge("ragflow_document_upload", "ragflow_document_parse")
+        builder.add_edge("ragflow_document_parse", "pdf_split")
+
+        # Append one skill node per dimension, in order
+        prev_node = "pdf_split"
+
+        for dim_id in dimension_ids:
+            # Create a skill node for this dimension
+            skill_node = DimensionSkillNode(
+                dimension_id=dim_id,
+                model_name=self.model_name,
+                max_workers=self.max_workers
+            )
+            builder.add_node(skill_node)
+            builder.add_edge(prev_node, skill_node.name)
+            prev_node = skill_node.name
+            logger.debug(f"Added node: {skill_node.name}")
+
+        # Connect to the completion node
+        builder.add_edge(prev_node, "complete")
+        builder.set_finish("complete")
+
+        logger.info(f"Workflow built, node chain: ragflow... → pdf_split → {' → '.join([f'skill_dim_{d}' for d in dimension_ids])} → complete")
+
+        return builder.build()
+
+    def run(
+        self,
+        pdf_path: str,
+        dimension_ids: List[int],
+        dataset_id: str = "",
+        dataset_name: str = "",
+        document_id: str = "",
+        ragflow_api_url: str = "",
+        rag_flow_api_key: str = ""
+    ) -> Dict[str, Any]:
+        """
+        Run the dynamic multi-dimension parsing workflow
+
+        Args:
+            pdf_path: path to the PDF file
+            dimension_ids: dimension IDs (which dimensions run, and in what order)
+            dataset_id: dataset ID
+            dataset_name: dataset name
+            document_id: document ID (optional)
+            ragflow_api_url: RAGFlow API URL
+            rag_flow_api_key: RAGFlow API key
+
+        Returns:
+            A dict with the results of every dimension
+        """
+        if not dimension_ids:
+            logger.warning("No dimension IDs provided")
+            return {"success": False, "error": "no_dimension_ids"}
+
+        if not pdf_path:
+            logger.warning("No PDF path provided")
+            return {"success": False, "error": "no_pdf_path"}
+
+        logger.info(f"Starting dynamic multi-dimension parsing: {pdf_path}")
+        logger.info(f"Dimension execution order: {dimension_ids}")
+
+        # 1. Build the workflow from the dimension list
+        workflow = self._build_workflow_for_dimensions(dimension_ids)
+
+        # 2. Create the initial state
+        initial_state = DynamicDimensionState(
+            pdf_path=pdf_path,
+            dimension_ids=dimension_ids,
+            dataset_id=dataset_id,
+            dataset_name=dataset_name or pdf_path.split("/")[-1].split("\\")[-1].replace(".pdf", ""),
+            document_id=document_id,
+            ragflow_api_url=ragflow_api_url,
+            rag_flow_api_key=rag_flow_api_key,
+            dimension_results={},
+            total_vectorized_pages=0
+        )
+
+        # 3. Run the workflow
+        result = workflow.invoke(
+            initial_state,
+            config={"callbacks": [self.langfuse_handler]}
+        )
+
+        # 4. Normalize the result
+        if isinstance(result, dict):
+            final_result = result
+        else:
+            final_result = result.dict() if hasattr(result, 'dict') else dict(result)
+
+        # 5. Attach statistics
+        dim_results = final_result.get('dimension_results', {})
+        success_count = sum(1 for r in dim_results.values() if r.get("success"))
+
+        final_result["success"] = True
+        final_result["dimensions_processed"] = len(dimension_ids)
+        final_result["dimensions_success"] = success_count
+
+        logger.info(f"Workflow finished, succeeded: {success_count}/{len(dimension_ids)}")
+
+        return final_result
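For reviewers, a minimal driver for this workflow might look like the sketch below; the PDF path, dimension IDs, dataset ID and RAGFlow settings are placeholders, not values taken from this commit.

    from src.datasets.parser.workflows.dynamic_dimension_workflow import DynamicDimensionWorkflow

    workflow = DynamicDimensionWorkflow(max_workers=5)

    # Hypothetical inputs, for illustration only
    result = workflow.run(
        pdf_path="/data/books/sample.pdf",
        dimension_ids=[1, 2],
        dataset_id="demo-dataset",
        ragflow_api_url="http://localhost:9380",
        rag_flow_api_key="ragflow-demo-key",
    )
    print(f"{result['dimensions_success']}/{result['dimensions_processed']} dimensions succeeded")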

+ 157 - 0
src/datasets/parser/workflows/image_workflow.py

@@ -0,0 +1,157 @@
+"""
+图片解析工作流V2
+
+使用组件化节点构建的图片解析工作流。
+"""
+
+from typing import Dict, Any, List
+from langgraph.graph import START, END
+from langfuse.langchain import CallbackHandler
+
+from src.datasets.parser.core.workflow_builder import WorkflowBuilder
+from src.datasets.parser.core.base import BaseNode
+from src.datasets.parser.states.parser_states import ImageParsingState
+from src.datasets.parser.nodes import (
+    ImageParseNode,
+    VectorizeNode,
+    CompleteNode,
+)
+from src.utils.file.image_util import image_util
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+class ImageUploadNode(BaseNode):
+    """
+    图片上传节点
+    
+    解压图片压缩包并上传到MinIO。
+    """
+    
+    @property
+    def name(self) -> str:
+        return "image_upload"
+    
+    def execute(self, state) -> Dict[str, Any]:
+        zip_file_path = state.zip_file_path
+        book_name = state.book_name
+        
+        logger.info(f"开始处理图片压缩包: {zip_file_path}")
+        
+        # 处理压缩包并上传到MinIO
+        image_urls = image_util.process_image_zip(zip_file_path, book_name)
+        
+        # 构建图片页面列表
+        image_pages = [
+            {"page_number": i + 1, "image_url": url}
+            for i, url in enumerate(image_urls)
+        ]
+        
+        logger.info(f"图片上传完成,共 {len(image_urls)} 张")
+        
+        return {
+            "image_urls": image_urls,
+            "image_pages": image_pages,
+            "processed_images": len(image_urls)
+        }
+
+
+class ImageParsingWorkflowV2:
+    """
+    图片解析工作流V2
+    
+    使用组件化节点构建,支持:
+    - 图片压缩包解压上传
+    - VL模型图片解析
+    - 向量化入库Infinity
+    
+    Example:
+        >>> workflow = ImageParsingWorkflowV2()
+        >>> result = workflow.run(
+        ...     zip_file_path="/path/to/images.zip",
+        ...     book_name="my_book",
+        ...     dataset_id="xxx"
+        ... )
+    """
+    
+    def __init__(self, model_name: str = "Qwen/Qwen3-VL-8B-Instruct"):
+        """
+        初始化工作流
+        
+        Args:
+            model_name: VL模型名称
+        """
+        self.model_name = model_name
+        self.langfuse_handler = CallbackHandler()
+        self.workflow = self._build_workflow()
+    
+    def _build_workflow(self):
+        """构建工作流"""
+        # 创建节点实例
+        upload_node = ImageUploadNode()
+        parse_node = ImageParseNode(model_name=self.model_name)
+        vectorize_node = VectorizeNode()
+        complete_node = CompleteNode(message_template="图片解析工作流完成")
+        
+        # 使用WorkflowBuilder构建
+        builder = WorkflowBuilder(ImageParsingState)
+        
+        builder.add_nodes(
+            upload_node,
+            parse_node,
+            vectorize_node,
+            complete_node
+        )
+        
+        # 定义顺序执行的边
+        builder.set_entry("image_upload")
+        builder.add_sequence(
+            "image_upload",
+            "image_parse",
+            "vectorize_store",
+            "complete"
+        )
+        builder.set_finish("complete")
+        
+        return builder.build()
+    
+    def run(
+        self,
+        zip_file_path: str,
+        book_name: str,
+        dataset_id: str,
+        ragflow_api_url: str = "",
+        rag_flow_api_key: str = ""
+    ) -> Dict[str, Any]:
+        """
+        运行图片解析工作流
+        
+        Args:
+            zip_file_path: 图片压缩包路径
+            book_name: 书名
+            dataset_id: 数据集ID
+            ragflow_api_url: RAGFlow API URL
+            rag_flow_api_key: RAGFlow API密钥
+            
+        Returns:
+            包含最终状态的字典
+        """
+        logger.info(f"开始运行图片解析工作流: {zip_file_path}")
+        
+        initial_state = ImageParsingState(
+            zip_file_path=zip_file_path,
+            book_name=book_name,
+            dataset_id=dataset_id,
+            ragflow_api_url=ragflow_api_url,
+            rag_flow_api_key=rag_flow_api_key
+        )
+        
+        result = self.workflow.invoke(
+            initial_state,
+            config={"callbacks": [self.langfuse_handler]}
+        )
+        
+        if isinstance(result, dict):
+            return result
+        return result.dict() if hasattr(result, 'dict') else dict(result)
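Because run() blocks for the whole parse, it pairs naturally with the TaskQueue added later in this commit; a sketch under the assumption that the archive path, book name and dataset ID are placeholders:

    import asyncio
    from src.utils.task_queue import get_task_queue
    from src.datasets.parser.workflows.image_workflow import ImageParsingWorkflowV2

    async def main():
        queue = get_task_queue()
        workflow = ImageParsingWorkflowV2()
        # submit() runs workflow.run in a thread-pool executor, capped at max_concurrent
        task_id = await queue.submit(
            "image_parse",
            workflow.run,
            zip_file_path="/tmp/images.zip",  # placeholder
            book_name="demo_book",            # placeholder
            dataset_id="demo-dataset",        # placeholder
        )
        # Poll until the background task leaves the pending/running states
        while (status := queue.get_task_status(task_id))["status"] in ("pending", "running"):
            await asyncio.sleep(1)
        print(status)

    asyncio.run(main())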

+ 129 - 0
src/datasets/parser/workflows/pdf_workflow.py

@@ -0,0 +1,129 @@
+"""
+PDF解析工作流V2
+
+使用组件化节点构建的PDF解析工作流。
+"""
+
+from typing import Dict, Any
+from langgraph.graph import START, END
+from langfuse.langchain import CallbackHandler
+
+from src.datasets.parser.core.workflow_builder import WorkflowBuilder
+from src.datasets.parser.states.parser_states import PDFParsingState
+from src.datasets.parser.nodes import (
+    PDFSplitNode,
+    ImageParseNode,
+    VectorizeNode,
+    RAGFlowDatasetNode,
+    RAGFlowDocumentUploadNode,
+    RAGFlowDocumentParseNode,
+    RAGFlowChunkNode,
+    CompleteNode,
+)
+from src.datasets.parser.nodes.ragflow_nodes import DatasetExistsCondition
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+class PDFParsingWorkflowV2:
+    """
+    PDF扫描件解析工作流V2
+    
+    使用组件化节点构建,支持:
+    - PDF拆分为图片
+    - VL模型OCR解析
+    - 向量化入库Infinity
+    - 同步到RAGFlow
+    
+    Example:
+        >>> workflow = PDFParsingWorkflowV2()
+        >>> result = workflow.run(
+        ...     pdf_path="/path/to/file.pdf",
+        ...     page_dataset_id="xxx",
+        ...     dataset_name="my_dataset"
+        ... )
+    """
+    
+    def __init__(self, model_name: str = "Qwen/Qwen3-VL-8B-Instruct"):
+        """
+        初始化工作流
+        
+        Args:
+            model_name: VL模型名称
+        """
+        self.model_name = model_name
+        self.langfuse_handler = CallbackHandler()
+        self.workflow = self._build_workflow()
+    
+    def _build_workflow(self):
+        """构建工作流"""
+        # 创建节点实例
+        dataset_node = RAGFlowDatasetNode(create_if_not_exists=True)
+        dataset_condition = DatasetExistsCondition()
+        upload_node = RAGFlowDocumentUploadNode(target_field="document_id")
+        page_upload_node = RAGFlowDocumentUploadNode(target_field="page_document_id")
+        parse_doc_node = RAGFlowDocumentParseNode()
+        split_node = PDFSplitNode()
+        image_parse_node = ImageParseNode(model_name=self.model_name)
+        vectorize_node = VectorizeNode()
+        chunk_node = RAGFlowChunkNode()
+        complete_node = CompleteNode(message_template="PDF解析工作流完成")
+        
+        # 使用WorkflowBuilder构建
+        builder = WorkflowBuilder(PDFParsingState)
+        
+        # 添加所有节点
+        builder.add_nodes(
+            dataset_node,
+            upload_node,
+            page_upload_node,
+            parse_doc_node,
+            split_node,
+            image_parse_node,
+            vectorize_node,
+            chunk_node,
+            complete_node
+        )
+        
+        # 定义边
+        builder.set_entry("ragflow_dataset")
+        builder.add_edge("ragflow_dataset", "ragflow_document_upload")
+        builder.add_edge("ragflow_document_upload", "ragflow_document_parse")
+        builder.add_edge("ragflow_document_parse", "pdf_split")
+        builder.add_edge("pdf_split", "image_parse")
+        builder.add_edge("image_parse", "vectorize_store")
+        builder.add_edge("vectorize_store", "ragflow_chunk")
+        builder.add_edge("ragflow_chunk", "complete")
+        builder.set_finish("complete")
+        
+        return builder.build()
+    
+    def run(self, pdf_path: str, page_dataset_id: str, dataset_name: str) -> Dict[str, Any]:
+        """
+        运行PDF解析工作流
+        
+        Args:
+            pdf_path: PDF文件路径
+            page_dataset_id: 页面数据集ID
+            dataset_name: 数据集名称
+            
+        Returns:
+            包含最终状态的字典
+        """
+        logger.info(f"开始运行PDF解析工作流: {pdf_path}")
+        
+        initial_state = PDFParsingState(
+            pdf_path=pdf_path,
+            page_dataset_id=page_dataset_id,
+            dataset_name=dataset_name
+        )
+        
+        result = self.workflow.invoke(
+            initial_state,
+            config={"callbacks": [self.langfuse_handler]}
+        )
+        
+        if isinstance(result, dict):
+            return result
+        return result.dict() if hasattr(result, 'dict') else dict(result)
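The string edges above refer to each node's name property, so a custom node only has to provide name and execute(). A minimal sketch of that contract (the node below is illustrative, not part of the commit):

    from typing import Dict, Any
    from src.datasets.parser.core.base import BaseNode

    class NoOpNode(BaseNode):
        """Illustrative pass-through node."""

        @property
        def name(self) -> str:
            return "noop"  # the identifier used in builder.add_edge(...)

        def execute(self, state) -> Dict[str, Any]:
            # Return only the state fields this node updates
            return {}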

+ 135 - 0
src/datasets/parser/workflows/qa_workflow.py

@@ -0,0 +1,135 @@
+"""
+QA解析工作流V2
+
+使用组件化节点构建的QA问答对解析工作流。
+"""
+
+from typing import Dict, Any
+from langgraph.graph import StateGraph, START, END
+from langfuse.langchain import CallbackHandler
+
+from src.datasets.parser.core.workflow_builder import WorkflowBuilder
+from src.datasets.parser.states.parser_states import QAParsingState
+from src.datasets.parser.nodes import (
+    TextSplitNode,
+    QAGenerateNode,
+    RAGFlowDocumentUploadNode,
+    RAGFlowDocumentParseNode,
+    CompleteNode,
+    PDFOCRNode,
+    ExportCSVNode,
+)
+from src.conf.settings import model_settings
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+class QAParsingWorkflowV2:
+    """
+    QA问答对解析工作流V2
+    
+    使用组件化节点构建,支持:
+    - PDF OCR解析
+    - 文本分块
+    - 并行QA对生成
+    - 导出CSV并上传RAGFlow
+    
+    Example:
+        >>> workflow = QAParsingWorkflowV2()
+        >>> result = workflow.run(
+        ...     pdf_path="/path/to/file.pdf",
+        ...     dataset_id="xxx",
+        ...     qa_count_per_chunk=50
+        ... )
+    """
+    
+    def __init__(self, model_name: str = None):
+        """
+        初始化工作流
+        
+        Args:
+            model_name: VL模型名称
+        """
+        self.model_name = model_name or model_settings.vl_model_name
+        self.langfuse_handler = CallbackHandler()
+        self.workflow = self._build_workflow()
+    
+    def _build_workflow(self):
+        """构建工作流"""
+        # 创建节点实例
+        ocr_node = PDFOCRNode(model_name=self.model_name)
+        split_node = TextSplitNode()
+        qa_node = QAGenerateNode(model_name=self.model_name)
+        export_node = ExportCSVNode()
+        upload_node = RAGFlowDocumentUploadNode(target_field="document_id")
+        parse_node = RAGFlowDocumentParseNode()
+        complete_node = CompleteNode(message_template="QA解析工作流完成")
+        
+        # 使用WorkflowBuilder构建
+        builder = WorkflowBuilder(QAParsingState)
+        
+        builder.add_nodes(
+            ocr_node,
+            split_node,
+            qa_node,
+            export_node,
+            upload_node,
+            parse_node,
+            complete_node
+        )
+        
+        # 定义顺序执行的边
+        builder.set_entry("pdf_ocr")
+        builder.add_sequence(
+            "pdf_ocr",
+            "text_split",
+            "qa_generate",
+            "export_csv",
+            "ragflow_document_upload",
+            "ragflow_document_parse",
+            "complete"
+        )
+        builder.set_finish("complete")
+        
+        return builder.build()
+    
+    def run(
+        self,
+        pdf_path: str,
+        dataset_id: str,
+        qa_count_per_chunk: int = 50,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200
+    ) -> Dict[str, Any]:
+        """
+        运行QA解析工作流
+        
+        Args:
+            pdf_path: PDF文件路径
+            dataset_id: RAGFlow数据集ID
+            qa_count_per_chunk: 每块生成的QA数量
+            chunk_size: 文本分块大小
+            chunk_overlap: 分块重叠大小
+            
+        Returns:
+            包含最终状态的字典
+        """
+        logger.info(f"开始运行QA解析工作流: {pdf_path}")
+        
+        initial_state = QAParsingState(
+            pdf_path=pdf_path,
+            dataset_id=dataset_id,
+            qa_count_per_chunk=qa_count_per_chunk,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap
+        )
+        
+        result = self.workflow.invoke(
+            initial_state,
+            config={"callbacks": [self.langfuse_handler]}
+        )
+        
+        if isinstance(result, dict):
+            return result
+        return result.dict() if hasattr(result, 'dict') else dict(result)

+ 10 - 6
src/job/chunk_update_job.py

@@ -13,6 +13,10 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.interval import IntervalTrigger
 from src.utils.mysql import get_global_mysql_client
 from src.utils.infinity import get_client
+from src.common.logging_config import get_logger
+
+# Get a logger
+logger = get_logger(__name__)
 
 # Initialize the scheduler
 _scheduler = None
@@ -55,7 +59,7 @@ class ChunkUpdateJob:
                         ["已执行", datetime.now(), task_id]
                     )
                     
-                    print(f"Task {task_id} executed successfully")
+                    logger.info(f"Task {task_id} executed successfully")
                 except Exception as e:
                     # 更新任务状态为"执行失败"
                     self.mysql_client.execute(
@@ -63,9 +67,9 @@ class ChunkUpdateJob:
                         ["执行失败", str(e), datetime.now(), task_id]
                     )
                     
-                    print(f"Task {task_id} execution failed: {e}")
+                    logger.error(f"Task {task_id} execution failed: {e}")
         except Exception as e:
-            print(f"Failed to process due tasks: {e}")
+            logger.error(f"Failed to process due tasks: {e}")
     
     def _execute_task(self, database_name: str, table_name: str, chunk_id: str, 
                       cond: str, data: dict) -> None:
@@ -89,7 +93,7 @@ class ChunkUpdateJob:
                     data=json.loads(data),
                     database_name=database_name
                 )
-            print(f"Updated chunk {chunk_id} in {database_name}.{table_name}")
+            logger.info(f"Updated chunk {chunk_id} in {database_name}.{table_name}")
         except Exception as e:
             raise Exception(f"Failed to update chunk {chunk_id}: {e}")
 
@@ -116,7 +120,7 @@ def start_scheduler():
         
         # Start the scheduler
         _scheduler.start()
-        print("✅ Chunk update scheduler started")
+        # logger.info("✅ Chunk update scheduler started")
 
 
 def shutdown_scheduler():
@@ -126,4 +130,4 @@ def shutdown_scheduler():
     if _scheduler is not None:
         _scheduler.shutdown()
         _scheduler = None
-        print("✅ Chunk update scheduler shutdown")
+        # logger.info("✅ Chunk update scheduler shutdown")

+ 7 - 9
src/model/multimodal_embedding.py

@@ -11,7 +11,7 @@ from src.utils.file.image_util import image_util
 class Embedding:
     """Embedding模型工具"""
     
-    def __init__(self, model_name: str = None, multi_model_name: str = None, api_key: str = None, dashscope_api_key: str = None):
+    def __init__(self, model_name: str = None, api_key: str = None):
         """
         Initialize the embedding model
         
@@ -21,11 +21,11 @@ class Embedding:
         """
         # Load the model configuration
         self.model_provider = model_settings.model_provider
-        self.model_name = model_name or model_settings.model_name
-        self.multi_model_name = multi_model_name or model_settings.multimodal_embedding_model_name
+        self.embedding_model_name = model_name or model_settings.embedding_model_name
+        self.multi_embedding_model_name = model_settings.multimodal_embedding_model_name  # model_name refers to the text-embedding model and must not override the multimodal one
         self.base_url = model_settings.base_url
         self.api_key = api_key or model_settings.api_key
-        self.dashscope_api_key = dashscope_api_key or model_settings.dashscope_api_key
+        self.dashscope_api_key = api_key or model_settings.dashscope_api_key
     
     @observe(name="text_embedding", as_type="embedding")
     def get_text_embedding(self, text: str) -> List[float]:
@@ -101,7 +101,7 @@ class Embedding:
                 }
             ]
             response = MultiModalEmbedding.call(
-                model=self.multi_model_name,
+                model=self.multi_embedding_model_name,
                 api_key=self.dashscope_api_key,
                 input=item
             )
@@ -139,7 +139,7 @@ class Embedding:
                 item.append({'text': text})
         
             response = MultiModalEmbedding.call(
-                model=self.multi_model_name,
+                model=self.multi_embedding_model_name,
                 api_key=self.dashscope_api_key,
                 input=item
             )
@@ -161,7 +161,5 @@ def get_embedding_model() -> Embedding:
     """
     return Embedding(
         model_name=model_settings.embedding_model_name,
-        multi_model_name=model_settings.multimodal_embedding_model_name,
-        api_key=model_settings.api_key,
-        dashscope_api_key=model_settings.dashscope_api_key
+        api_key=model_settings.api_key
     )
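A quick smoke test of the slimmed-down constructor; the vector length depends on the configured embedding model, so treat the printed value as illustrative:

    from src.model.multimodal_embedding import get_embedding_model

    embedding = get_embedding_model()

    # Text embedding via the configured text model
    vec = embedding.get_text_embedding("hello world")
    print(len(vec))  # dimensionality of the configured embedding model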

+ 3 - 6
src/model/qwen_vl.py

@@ -4,7 +4,6 @@ import base64
 import io
 from langchain.chat_models import init_chat_model
 from src.conf.settings import model_settings
-from langfuse.langchain import CallbackHandler
 
 class QWenVLParser:
     """QWEN VL模型图像解析工具"""
@@ -18,11 +17,9 @@ class QWenVLParser:
         """
         # Load the model configuration
         self.model_provider = model_settings.model_provider
-        self.model_name = model_name or model_settings.model_name
+        self.model_name = model_name or model_settings.vl_model_name
         self.base_url = model_settings.base_url
-        self.api_key = model_settings.api_key
-        self.langfuse_handler = CallbackHandler()
-        # Initialize the model via langchain's init_chat_model
+        self.api_key = model_settings.api_key
+        # Initialize the model via langchain's init_chat_model
         self.model = init_chat_model(
             model_provider=self.model_provider,
             model=self.model_name,
@@ -83,7 +80,7 @@ class QWenVLParser:
             ]
             
             # Call the OpenAI-compatible API through the langchain model
-            response = self.model.invoke(input=messages, config={"callbacks": [self.langfuse_handler]})
+            response = self.model.invoke(input=messages)
             
             # Extract the parsed content
             content = response.content

+ 7 - 3
src/utils/es/client_manager.py

@@ -5,6 +5,10 @@ from elasticsearch import Elasticsearch
 from elasticsearch.exceptions import ConnectionError as ESConnectionError
 from src.utils.decorators.singleton import singleton
 from src.conf.settings import es_settings
+from src.common.logging_config import get_logger
+
+# Get a logger
+logger = get_logger(__name__)
 
 
 @singleton
@@ -47,12 +51,12 @@ class ESClientManager:
                 request_timeout=es_settings.es_timeout,
                 verify_certs=es_settings.es_verify_certs
             )
-            print("✅ Elasticsearch客户端初始化成功")
+            logger.info("✅ Elasticsearch客户端初始化成功")
         except ESConnectionError as e:
-            print(f"❌ Elasticsearch连接失败: {e}")
+            logger.error(f"❌ Elasticsearch连接失败: {e}")
             raise
         except Exception as e:
-            print(f"❌ Elasticsearch初始化失败: {e}")
+            logger.error(f"❌ Elasticsearch初始化失败: {e}")
             raise
     
     def ping(self) -> bool:

+ 13 - 9
src/utils/file/image_util.py

@@ -10,6 +10,10 @@ from io import BytesIO
 from PIL import Image
 from src.utils.file.minio.minio_util import MinIOUtil
 from src.utils.file.file_utils import generate_unique_filename
+from src.common.logging_config import get_logger
+
+# Get a logger
+logger = get_logger(__name__)
 
 
 class ImageUtil:
@@ -56,7 +60,7 @@ class ImageUtil:
         Returns:
             List[str]: MinIO URLs sorted by page number
         """
-        print(f"Processing image archive: {zip_file_path}")
+        logger.info(f"Processing image archive: {zip_file_path}")
         
         # List of image-info tuples, format: (page_number, url)
         image_info_list = []
@@ -74,7 +78,7 @@ class ImageUtil:
                     and not f.startswith('__MACOSX')  # skip hidden entries created by macOS
                 ]
                 
-                print(f"Found {len(image_files)} images")
+                logger.info(f"Found {len(image_files)} images")
                 
                 # Iterate over all image files
                 for image_file in image_files:
@@ -86,7 +90,7 @@ class ImageUtil:
                         _, ext = os.path.splitext(image_file)
                         new_filename = f"{book_name}_P{page_num}{ext}"
                         
-                        print(f"Processing image: {image_file} -> {new_filename}, page: {page_num}")
+                        logger.debug(f"Processing image: {image_file} -> {new_filename}, page: {page_num}")
                         
                         # Read the image file contents
                         with zip_ref.open(image_file) as f:
@@ -105,7 +109,7 @@ class ImageUtil:
                         image_info_list.append((page_num, image_url))
                         
                     except Exception as e:
-                        print(f"Failed to process image {image_file}: {str(e)}")
+                        logger.error(f"Failed to process image {image_file}: {str(e)}")
                         continue
             
             # Sort by page number
@@ -114,11 +118,11 @@
             # Extract the URL list
             image_urls = [url for _, url in image_info_list]
             
-            print(f"Image archive processed, {len(image_urls)} images in total")
+            logger.info(f"Image archive processed, {len(image_urls)} images in total")
             return image_urls
             
         except Exception as e:
-            print(f"Failed to process image archive: {str(e)}")
+            logger.error(f"Failed to process image archive: {str(e)}")
             raise
     
     def _compress_image(self, image_stream: BytesIO, original_filename: str, max_size_kb: int = 5000) -> BytesIO:
@@ -257,7 +261,7 @@ class ImageUtil:
         
         # Read the bytes and return them
         compressed_bytes = compressed_stream.getvalue()
-        print(f"Image converted to bytes, {len(compressed_bytes)} bytes")
+        logger.debug(f"Image converted to bytes, {len(compressed_bytes)} bytes")
         
         return compressed_bytes
     
@@ -272,7 +276,7 @@ class ImageUtil:
         Returns:
             bytes: the compressed image bytes
         """
-        print(f"Compressing image, original size {len(image_bytes) / 1024:.2f}KB")
+        logger.debug(f"Compressing image, original size {len(image_bytes) / 1024:.2f}KB")
         
         # Wrap the bytes in a BytesIO object
         image_stream = BytesIO(image_bytes)
@@ -280,7 +284,7 @@ class ImageUtil:
         # Reuse the existing compression method
         compressed_bytes = self._compress_image_to_bytes(image_stream, max_size_kb)
         
-        print(f"图片压缩完成,压缩后大小为 {len(compressed_bytes) / 1024:.2f}KB")
+        logger.debug(f"图片压缩完成,压缩后大小为 {len(compressed_bytes) / 1024:.2f}KB")
         
         return compressed_bytes
     

+ 7 - 3
src/utils/file/minio/minio_util.py

@@ -3,6 +3,10 @@ from typing import BinaryIO
 from datetime import timedelta
 from src.conf.settings import minio_settings
 from src.utils.file.file_utils import generate_unique_filename
+from src.common.logging_config import get_logger
+
+# Get a logger
+logger = get_logger(__name__)
 
 # Global MinIO client instance
 _global_minio_client = None
@@ -26,9 +30,9 @@ class MinIOUtil:
         try:
             if not self.client.bucket_exists(self.bucket_name):
                 self.client.make_bucket(self.bucket_name)
-                print(f"Bucket '{self.bucket_name}' created successfully.")
+                logger.info(f"Bucket '{self.bucket_name}' created successfully.")
             else:
-                print(f"Bucket '{self.bucket_name}' already exists.")
+                logger.info(f"Bucket '{self.bucket_name}' already exists.")
         except Exception as e:
             raise RuntimeError(f"Failed to create bucket: {e}")
 
@@ -145,7 +149,7 @@ class MinIOUtil:
             )
             return True
         except Exception as e:
-            print(f"Delete failed: {e}")
+            logger.error(f"Delete failed: {e}")
             return False
 
     def _get_content_type(self, filename: str) -> str:

+ 208 - 0
src/utils/task_queue.py

@@ -0,0 +1,208 @@
+"""
+任务队列管理器
+
+提供带并发限制的异步任务队列,用于管理工作流执行。
+"""
+
+import asyncio
+import uuid
+from typing import Callable, Any, Dict, Optional
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+class TaskStatus(str, Enum):
+    """任务状态"""
+    PENDING = "pending"      # 等待中
+    RUNNING = "running"      # 运行中
+    COMPLETED = "completed"  # 已完成
+    FAILED = "failed"        # 失败
+
+
+@dataclass
+class Task:
+    """任务信息"""
+    id: str
+    name: str
+    status: TaskStatus = TaskStatus.PENDING
+    created_at: datetime = field(default_factory=datetime.now)
+    started_at: Optional[datetime] = None
+    completed_at: Optional[datetime] = None
+    result: Any = None
+    error: Optional[str] = None
+    position: int = 0  # 队列位置
+
+
+class TaskQueue:
+    """
+    带并发限制的异步任务队列
+    
+    Example:
+        >>> queue = TaskQueue(max_concurrent=3)
+        >>> task_id = await queue.submit("pdf_parse", workflow.run, pdf_path="xxx")
+        >>> status = queue.get_task_status(task_id)
+    """
+    
+    _instance: Optional["TaskQueue"] = None
+    
+    def __new__(cls, max_concurrent: int = 3) -> "TaskQueue":
+        """单例模式"""
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+            cls._instance._initialized = False
+        return cls._instance
+    
+    def __init__(self, max_concurrent: int = 3):
+        if self._initialized:
+            return
+        
+        self.max_concurrent = max_concurrent
+        self._semaphore = asyncio.Semaphore(max_concurrent)
+        self._tasks: Dict[str, Task] = {}
+        self._queue: asyncio.Queue = asyncio.Queue()
+        self._running_count = 0
+        self._initialized = True
+        logger.info(f"任务队列初始化完成,最大并发数: {max_concurrent}")
+    
+    async def submit(
+        self,
+        name: str,
+        func: Callable[..., Any],
+        **kwargs
+    ) -> str:
+        """
+        提交任务到队列
+        
+        Args:
+            name: 任务名称
+            func: 要执行的函数
+            **kwargs: 函数参数
+            
+        Returns:
+            任务ID
+        """
+        task_id = str(uuid.uuid4())[:8]
+        task = Task(
+            id=task_id,
+            name=name,
+            position=self._queue.qsize() + self._running_count + 1
+        )
+        self._tasks[task_id] = task
+        
+        logger.info(f"任务已提交: {task_id} ({name}),队列位置: {task.position}")
+        
+        # 启动任务执行
+        asyncio.create_task(self._execute_task(task, func, kwargs))
+        
+        return task_id
+    
+    async def _execute_task(self, task: Task, func: Callable, kwargs: dict):
+        """执行任务"""
+        # 等待获取信号量(限制并发)
+        async with self._semaphore:
+            self._running_count += 1
+            task.status = TaskStatus.RUNNING
+            task.started_at = datetime.now()
+            task.position = 0  # 正在执行,位置为0
+            
+            # 更新其他任务的队列位置
+            self._update_positions()
+            
+            logger.info(f"任务开始执行: {task.id} ({task.name}),当前运行: {self._running_count}/{self.max_concurrent}")
+            
+            try:
+                # 在线程池中执行同步函数
+                loop = asyncio.get_event_loop()
+                result = await loop.run_in_executor(None, lambda: func(**kwargs))
+                
+                task.status = TaskStatus.COMPLETED
+                task.result = result
+                logger.info(f"任务执行成功: {task.id} ({task.name})")
+            except Exception as e:
+                task.status = TaskStatus.FAILED
+                task.error = str(e)
+                logger.error(f"任务执行失败: {task.id} ({task.name}) - {str(e)}")
+            finally:
+                task.completed_at = datetime.now()
+                self._running_count -= 1
+                self._update_positions()
+    
+    def _update_positions(self):
+        """更新等待中任务的队列位置"""
+        pending_tasks = [
+            t for t in self._tasks.values() 
+            if t.status == TaskStatus.PENDING
+        ]
+        pending_tasks.sort(key=lambda x: x.created_at)
+        for i, task in enumerate(pending_tasks):
+            task.position = i + 1
+    
+    def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
+        """
+        获取任务状态
+        
+        Args:
+            task_id: 任务ID
+            
+        Returns:
+            任务状态信息
+        """
+        task = self._tasks.get(task_id)
+        if not task:
+            return None
+        
+        return {
+            "id": task.id,
+            "name": task.name,
+            "status": task.status.value,
+            "position": task.position,
+            "created_at": task.created_at.isoformat(),
+            "started_at": task.started_at.isoformat() if task.started_at else None,
+            "completed_at": task.completed_at.isoformat() if task.completed_at else None,
+            "result": task.result if task.status == TaskStatus.COMPLETED else None,
+            "error": task.error if task.status == TaskStatus.FAILED else None
+        }
+    
+    def get_queue_info(self) -> Dict[str, Any]:
+        """
+        获取队列信息
+        
+        Returns:
+            队列状态信息
+        """
+        pending = sum(1 for t in self._tasks.values() if t.status == TaskStatus.PENDING)
+        running = sum(1 for t in self._tasks.values() if t.status == TaskStatus.RUNNING)
+        completed = sum(1 for t in self._tasks.values() if t.status == TaskStatus.COMPLETED)
+        failed = sum(1 for t in self._tasks.values() if t.status == TaskStatus.FAILED)
+        
+        return {
+            "max_concurrent": self.max_concurrent,
+            "pending": pending,
+            "running": running,
+            "completed": completed,
+            "failed": failed,
+            "total": len(self._tasks)
+        }
+    
+    def clear_completed(self):
+        """清理已完成的任务记录"""
+        to_remove = [
+            tid for tid, task in self._tasks.items()
+            if task.status in (TaskStatus.COMPLETED, TaskStatus.FAILED)
+        ]
+        for tid in to_remove:
+            del self._tasks[tid]
+        logger.info(f"清理了 {len(to_remove)} 个已完成的任务")
+
+
+# 全局任务队列实例
+task_queue = TaskQueue(max_concurrent=3)
+
+
+def get_task_queue() -> TaskQueue:
+    """获取全局任务队列实例"""
+    return task_queue
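The queue is process-local state, so exposing it over HTTP is straightforward; a sketch of status endpoints (the routes are hypothetical, not part of this commit):

    from fastapi import FastAPI, HTTPException
    from src.utils.task_queue import get_task_queue

    app = FastAPI()

    @app.get("/tasks/{task_id}")  # hypothetical route
    def task_status(task_id: str):
        status = get_task_queue().get_task_status(task_id)
        if status is None:
            raise HTTPException(status_code=404, detail="unknown task id")
        return status

    @app.get("/tasks")  # hypothetical route
    def queue_info():
        return get_task_queue().get_queue_info()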

+ 14 - 0
src/utils/vector_db/__init__.py

@@ -0,0 +1,14 @@
+"""
+向量数据库抽象层
+
+提供统一的向量数据库接口,支持 Infinity 和 Elasticsearch。
+"""
+
+from .base import VectorDBClient
+from .factory import get_vector_db_client, close_vector_db_client
+
+__all__ = [
+    'VectorDBClient',
+    'get_vector_db_client',
+    'close_vector_db_client',
+]
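Both adapters accept the same Infinity-style query dict, so callers can stay backend-agnostic; a sketch of a vector search through the factory (table, field names and the vector are placeholders):

    from src.utils.vector_db import get_vector_db_client

    client = get_vector_db_client()  # backend chosen by the VECTOR_DB_TYPE setting
    result = client.vector_search(
        table_name="book_1",                 # placeholder table
        output_fields=["content"],
        query={
            "vector_field": "dense_vector",  # field holding the embeddings
            "query_vector": [0.1] * 768,     # placeholder embedding
            "topn": 5,
            "knn_params": {"threshold": 0.2},
        },
    )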

+ 134 - 0
src/utils/vector_db/base.py

@@ -0,0 +1,134 @@
+"""
+向量数据库客户端抽象基类
+
+定义向量数据库的统一接口,所有具体实现都需要继承此基类。
+"""
+
+from abc import ABC, abstractmethod
+from typing import Dict, Any, List, Optional
+
+
+class VectorDBClient(ABC):
+    """
+    向量数据库客户端抽象基类
+    
+    所有向量数据库实现(Infinity、Elasticsearch等)都需要实现此接口。
+    """
+    
+    @abstractmethod
+    def search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """
+        全文搜索
+        
+        Args:
+            table_name: 表名/索引名
+            output_fields: 返回字段列表
+            query: 查询参数
+            database_name: 数据库名称(可选)
+            
+        Returns:
+            搜索结果
+        """
+        pass
+    
+    @abstractmethod
+    def vector_search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """
+        向量搜索
+        
+        Args:
+            table_name: 表名/索引名
+            output_fields: 返回字段列表
+            query: 查询参数,包含 query_vector 等
+            database_name: 数据库名称(可选)
+            
+        Returns:
+            搜索结果
+        """
+        pass
+    
+    @abstractmethod
+    def hybrid_search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """
+        混合搜索(向量 + 全文)
+        
+        Args:
+            table_name: 表名/索引名
+            output_fields: 返回字段列表
+            query: 查询参数
+            database_name: 数据库名称(可选)
+            
+        Returns:
+            搜索结果
+        """
+        pass
+    
+    @abstractmethod
+    def insert(
+        self,
+        table_name: str,
+        documents: List[Dict[str, Any]],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """
+        插入文档
+        
+        Args:
+            table_name: 表名/索引名
+            documents: 文档列表
+            database_name: 数据库名称(可选)
+            
+        Returns:
+            插入结果
+        """
+        pass
+    
+    @abstractmethod
+    def update(
+        self,
+        table_name: str,
+        cond: str,
+        data: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """
+        更新文档
+        
+        Args:
+            table_name: 表名/索引名
+            cond: 更新条件
+            data: 更新数据
+            database_name: 数据库名称(可选)
+            
+        Returns:
+            更新结果
+        """
+        pass
+    
+    @abstractmethod
+    def get_status(self) -> Dict[str, Any]:
+        """获取客户端状态"""
+        pass
+    
+    @abstractmethod
+    def close(self):
+        """关闭客户端连接"""
+        pass
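For unit tests it can help to have a trivial in-memory implementation of this interface; a sketch, not part of the commit:

    from typing import Any, Dict, List
    from src.utils.vector_db.base import VectorDBClient

    class InMemoryVectorDB(VectorDBClient):
        """Minimal stand-in that satisfies the abstract interface."""

        def __init__(self):
            self.tables: Dict[str, List[Dict[str, Any]]] = {}

        def search(self, table_name, output_fields, query, database_name=None):
            text = query.get("matching_text", "")
            return [d for d in self.tables.get(table_name, []) if text in str(d)]

        def vector_search(self, table_name, output_fields, query, database_name=None):
            return self.tables.get(table_name, [])[: query.get("topn", 10)]

        def hybrid_search(self, table_name, output_fields, query, database_name=None):
            return self.vector_search(table_name, output_fields, query, database_name)

        def insert(self, table_name, documents, database_name=None):
            self.tables.setdefault(table_name, []).extend(documents)

        def update(self, table_name, cond, data, database_name=None):
            return None  # condition parsing is omitted in this sketch

        def get_status(self):
            return {"type": "in-memory", "tables": len(self.tables)}

        def close(self):
            self.tables.clear()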

+ 347 - 0
src/utils/vector_db/elasticsearch_adapter.py

@@ -0,0 +1,347 @@
+"""
+Elasticsearch 向量数据库适配器
+
+实现 VectorDBClient 接口,提供与 Infinity 兼容的 ES 操作。
+"""
+
+import threading
+from typing import Dict, Any, List, Optional
+from .base import VectorDBClient
+from src.conf.settings import es_settings, vector_db_settings
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+class ElasticsearchAdapter(VectorDBClient):
+    """
+    Elasticsearch 向量数据库适配器
+    
+    实现统一的 VectorDBClient 接口,支持向量搜索和全文搜索。
+    """
+    
+    def __init__(
+        self,
+        database: Optional[str] = None,
+        index_name: Optional[str] = None
+    ):
+        """
+        初始化 Elasticsearch 适配器
+        
+        Args:
+            database: 数据库名称(在ES中对应index前缀)
+            index_name: 索引名称
+        """
+        self._database = database or "default"
+        self._index_name = index_name or es_settings.es_index_name
+        self._client = None
+        self._init_client()
+    
+    def _init_client(self):
+        """初始化 ES 客户端"""
+        try:
+            from elasticsearch import Elasticsearch
+            
+            # 构建连接参数
+            es_config = {
+                "hosts": es_settings.es_nodes,
+                "max_retries": es_settings.es_max_retries,
+                "retry_on_timeout": es_settings.es_retry_on_timeout,
+                "request_timeout": es_settings.es_timeout,
+                "verify_certs": es_settings.es_verify_certs,
+            }
+            
+            # 添加认证信息(如果配置了)
+            if es_settings.es_username and es_settings.es_password:
+                es_config["basic_auth"] = (es_settings.es_username, es_settings.es_password)
+            
+            self._client = Elasticsearch(**es_config)
+            logger.info(f"Elasticsearch 客户端初始化成功: {es_settings.es_nodes}")
+        except ImportError:
+            logger.error("elasticsearch 库未安装,请运行: pip install elasticsearch")
+            raise
+        except Exception as e:
+            logger.error(f"Elasticsearch 客户端初始化失败: {str(e)}")
+            raise
+    
+    @property
+    def client(self):
+        """获取底层 ES 客户端"""
+        return self._client
+    
+    def _get_index_name(self, table_name: str, database_name: Optional[str] = None) -> str:
+        """
+        获取完整的索引名称
+        
+        Args:
+            table_name: 表名(映射为索引名)
+            database_name: 数据库名(映射为索引前缀)
+        """
+        prefix = database_name or self._database
+        if prefix and prefix != "default":
+            return f"{prefix}_{table_name}"
+        return table_name
+    
+    def search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """
+        全文搜索
+        
+        将 Infinity 风格的查询转换为 ES 查询。
+        """
+        index_name = self._get_index_name(table_name, database_name)
+        
+        # 构建 ES 查询
+        es_query = {
+            "query": {
+                "match": {
+                    query.get("match_field", "content"): query.get("matching_text", "")
+                }
+            },
+            "size": query.get("topn", 10),
+            "_source": output_fields
+        }
+        
+        result = self._client.search(index=index_name, body=es_query)
+        return self._convert_result(result, output_fields)
+    
+    def vector_search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """
+        向量搜索
+        
+        使用 ES 的 knn 查询进行向量检索。
+        """
+        index_name = self._get_index_name(table_name, database_name)
+        vector_field = query.get("vector_field", "dense_vector")
+        query_vector = query.get("query_vector", [])
+        topn = query.get("topn", 10)
+        
+        # 获取阈值
+        knn_params = query.get("knn_params", {})
+        threshold = float(knn_params.get("threshold", 0.0))
+        
+        # 构建 ES knn 查询
+        es_query = {
+            "knn": {
+                "field": vector_field,
+                "query_vector": query_vector,
+                "k": topn,
+                "num_candidates": topn * 10
+            },
+            "_source": output_fields
+        }
+        
+        # 如果有阈值,添加 min_score
+        if threshold > 0:
+            es_query["min_score"] = threshold
+        
+        result = self._client.search(index=index_name, body=es_query)
+        return self._convert_result(result, output_fields)
+    
+    def hybrid_search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """
+        混合搜索(向量 + 全文)
+        
+        使用 ES 的 bool 查询结合 knn 和 match。
+        """
+        index_name = self._get_index_name(table_name, database_name)
+        vector_field = query.get("vector_field", "dense_vector")
+        query_vector = query.get("query_vector", [])
+        match_field = query.get("match_field", "content")
+        matching_text = query.get("matching_text", "")
+        topn = query.get("topn", 10)
+        
+        # 构建混合查询
+        es_query = {
+            "query": {
+                "bool": {
+                    "should": [
+                        {
+                            "match": {
+                                match_field: {
+                                    "query": matching_text,
+                                    "boost": 1.0
+                                }
+                            }
+                        }
+                    ]
+                }
+            },
+            "knn": {
+                "field": vector_field,
+                "query_vector": query_vector,
+                "k": topn,
+                "num_candidates": topn * 10,
+                "boost": 1.0
+            },
+            "size": topn,
+            "_source": output_fields
+        }
+        
+        result = self._client.search(index=index_name, body=es_query)
+        return self._convert_result(result, output_fields)
+    
+    def insert(
+        self,
+        table_name: str,
+        documents: List[Dict[str, Any]],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """插入文档"""
+        index_name = self._get_index_name(table_name, database_name)
+        
+        # 批量插入
+        operations = []
+        for doc in documents:
+            operations.append({"index": {"_index": index_name}})
+            operations.append(doc)
+        
+        if operations:
+            result = self._client.bulk(body=operations, refresh=True)
+            return result
+        return None
+    
+    def update(
+        self,
+        table_name: str,
+        cond: str,
+        data: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """
+        更新文档
+        
+        注意:ES 的更新方式与 Infinity 不同,这里使用 update_by_query。
+        """
+        index_name = self._get_index_name(table_name, database_name)
+        
+        # 将条件字符串解析为 ES 查询
+        # 简单实现:假设 cond 是 "field = 'value'" 格式
+        es_query = {
+            "query": {
+                "query_string": {
+                    "query": cond
+                }
+            },
+            "script": {
+                "source": "; ".join([f"ctx._source.{k} = params.{k}" for k in data.keys()]),
+                "params": data
+            }
+        }
+        
+        result = self._client.update_by_query(index=index_name, body=es_query)
+        return result
+    
+    def get_status(self) -> Dict[str, Any]:
+        """获取客户端状态"""
+        try:
+            health = self._client.cluster.health()
+            return {
+                "type": "elasticsearch",
+                "status": health.get("status", "unknown"),
+                "cluster_name": health.get("cluster_name", ""),
+                "nodes": len(es_settings.es_nodes)
+            }
+        except Exception as e:
+            return {
+                "type": "elasticsearch",
+                "status": "error",
+                "error": str(e)
+            }
+    
+    def close(self):
+        """关闭客户端连接"""
+        if self._client:
+            self._client.close()
+            logger.info("Elasticsearch 客户端已关闭")
+    
+    def _convert_result(self, es_result: Dict, output_fields: List[str]) -> Any:
+        """
+        转换 ES 结果为统一格式
+        
+        返回类似 Infinity 的结果结构
+        """
+        hits = es_result.get("hits", {}).get("hits", [])
+        
+        class MockResult:
+            """模拟 Infinity 结果对象"""
+            def __init__(self, data):
+                self._data = data
+            
+            def to_result(self):
+                return self._data
+        
+        # 转换为统一格式
+        results = []
+        for hit in hits:
+            item = hit.get("_source", {})
+            item["_score"] = hit.get("_score", 0)
+            item["_id"] = hit.get("_id", "")
+            results.append(item)
+        
+        return MockResult(results)
+    
+    # ========== ES 特有方法(扩展) ==========
+    
+    def create_index(
+        self,
+        index_name: str,
+        mappings: Dict[str, Any],
+        settings: Optional[Dict[str, Any]] = None
+    ):
+        """创建索引"""
+        body = {"mappings": mappings}
+        if settings:
+            body["settings"] = settings
+        return self._client.indices.create(index=index_name, body=body, ignore=400)
+    
+    def delete_index(self, index_name: str):
+        """删除索引"""
+        return self._client.indices.delete(index=index_name, ignore=[400, 404])
+    
+    def index_exists(self, index_name: str) -> bool:
+        """检查索引是否存在"""
+        return self._client.indices.exists(index=index_name)
+
+
+# ========== 全局客户端管理 ==========
+
+_es_client_cache: Dict[str, ElasticsearchAdapter] = {}
+_es_client_lock = threading.Lock()
+
+
+def get_es_client(database: str = "default") -> ElasticsearchAdapter:
+    """获取 ES 客户端(单例缓存)"""
+    with _es_client_lock:
+        if database not in _es_client_cache:
+            _es_client_cache[database] = ElasticsearchAdapter(database=database)
+        return _es_client_cache[database]
+
+
+def close_es_client(database: Optional[str] = None):
+    """关闭 ES 客户端"""
+    with _es_client_lock:
+        if database is None:
+            for client in _es_client_cache.values():
+                client.close()
+            _es_client_cache.clear()
+        elif database in _es_client_cache:
+            _es_client_cache[database].close()
+            del _es_client_cache[database]
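Given the query_string-based update above, a call looks like this sketch; the database, table, field and values are placeholders:

    from src.utils.vector_db.elasticsearch_adapter import get_es_client

    client = get_es_client(database="book_image_db")  # placeholder database name
    client.update(
        table_name="book_1",      # resolves to index "book_image_db_book_1"
        cond="chunk_id:abc123",   # Lucene query_string syntax
        data={"content": "corrected text"},
    )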

+ 118 - 0
src/utils/vector_db/factory.py

@@ -0,0 +1,118 @@
+"""
+向量数据库客户端工厂
+
+根据配置自动选择并返回对应的向量数据库客户端。
+"""
+
+import threading
+from typing import Dict, Optional
+from .base import VectorDBClient
+from src.conf.settings import vector_db_settings
+from src.common.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+# 客户端缓存
+_client_cache: Dict[str, VectorDBClient] = {}
+_client_lock = threading.Lock()
+
+
+def get_vector_db_client(
+    database: Optional[str] = None,
+    db_type: Optional[str] = None
+) -> VectorDBClient:
+    """
+    获取向量数据库客户端
+    
+    根据配置 VECTOR_DB_TYPE 自动选择对应的数据库实现。
+    
+    Args:
+        database: 数据库名称
+        db_type: 数据库类型(可选,覆盖配置文件设置)
+        
+    Returns:
+        VectorDBClient 实例
+        
+    Example:
+        # 使用配置文件中的数据库类型
+        client = get_vector_db_client()
+        
+        # 指定数据库
+        client = get_vector_db_client(database="ragflow_db")
+        
+        # 强制使用特定类型
+        client = get_vector_db_client(db_type="infinity")
+    """
+    # 确定数据库类型
+    vector_db_type = db_type or vector_db_settings.vector_db_type
+    
+    # 默认数据库名
+    if database is None:
+        if vector_db_type == "infinity":
+            database = vector_db_settings.infinity_database
+        else:
+            database = "default"
+    
+    # 缓存键
+    cache_key = f"{vector_db_type}:{database}"
+    
+    with _client_lock:
+        if cache_key not in _client_cache:
+            _client_cache[cache_key] = _create_client(vector_db_type, database)
+            logger.info(f"创建向量数据库客户端: type={vector_db_type}, database={database}")
+        
+        return _client_cache[cache_key]
+
+
+def _create_client(db_type: str, database: str) -> VectorDBClient:
+    """
+    创建向量数据库客户端
+    
+    Args:
+        db_type: 数据库类型 (infinity/es)
+        database: 数据库名称
+    """
+    if db_type == "infinity":
+        from .infinity_adapter import InfinityAdapter
+        return InfinityAdapter(database=database)
+    
+    elif db_type == "es":
+        from .elasticsearch_adapter import ElasticsearchAdapter
+        return ElasticsearchAdapter(database=database)
+    
+    else:
+        raise ValueError(f"不支持的向量数据库类型: {db_type},可选值: infinity, es")
+
+
+def close_vector_db_client(database: Optional[str] = None):
+    """
+    关闭向量数据库客户端
+    
+    Args:
+        database: 要关闭的数据库客户端名称。
+                  如果为 None,则关闭所有缓存的客户端。
+    """
+    with _client_lock:
+        if database is None:
+            # 关闭所有客户端
+            for client in _client_cache.values():
+                try:
+                    client.close()
+                except Exception as e:
+                    logger.error(f"关闭客户端失败: {e}")
+            _client_cache.clear()
+            logger.info("所有向量数据库客户端已关闭")
+        else:
+            # 关闭指定数据库的客户端
+            keys_to_remove = [k for k in _client_cache if k.endswith(f":{database}")]
+            for key in keys_to_remove:
+                try:
+                    _client_cache[key].close()
+                except Exception as e:
+                    logger.error(f"关闭客户端 {key} 失败: {e}")
+                del _client_cache[key]
+
+
+def get_current_db_type() -> str:
+    """获取当前配置的数据库类型"""
+    return vector_db_settings.vector_db_type

+ 146 - 0
src/utils/vector_db/infinity_adapter.py

@@ -0,0 +1,146 @@
+"""
+Infinity 向量数据库适配器
+
+封装现有的 InfinityClient,实现 VectorDBClient 接口。
+"""
+
+from typing import Dict, Any, List, Optional
+from .base import VectorDBClient
+from src.utils.infinity import InfinityClient, get_client as get_infinity_client, close_client as close_infinity_client
+from src.conf.settings import vector_db_settings
+
+
+class InfinityAdapter(VectorDBClient):
+    """
+    Infinity 向量数据库适配器
+    
+    将现有的 InfinityClient 封装为统一的 VectorDBClient 接口。
+    """
+    
+    def __init__(
+        self,
+        database: Optional[str] = None,
+        host: Optional[str] = None,
+        port: Optional[int] = None,
+        min_connections: int = 5,
+        max_connections: int = 10
+    ):
+        """
+        初始化 Infinity 适配器
+        
+        Args:
+            database: 数据库名称
+            host: Infinity 服务地址
+            port: Infinity 服务端口
+            min_connections: 最小连接数
+            max_connections: 最大连接数
+        """
+        self._database = database or vector_db_settings.infinity_database
+        self._host = host or vector_db_settings.infinity_host
+        self._port = port or vector_db_settings.infinity_sdk_port
+        self._min_connections = min_connections
+        self._max_connections = max_connections
+        
+        # 获取底层 InfinityClient
+        self._client: InfinityClient = get_infinity_client(
+            host=self._host,
+            port=self._port,
+            database=self._database,
+            min_connections=self._min_connections,
+            max_connections=self._max_connections
+        )
+    
+    @property
+    def client(self) -> InfinityClient:
+        """获取底层 InfinityClient"""
+        return self._client
+    
+    def search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """全文搜索"""
+        return self._client.search(table_name, output_fields, query, database_name)
+    
+    def vector_search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """向量搜索"""
+        return self._client.vector_search(table_name, output_fields, query, database_name)
+    
+    def hybrid_search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """混合搜索"""
+        return self._client.hybrid_search(table_name, output_fields, query, database_name)
+    
+    def insert(
+        self,
+        table_name: str,
+        documents: List[Dict[str, Any]],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """插入文档"""
+        return self._client.insert(table_name, documents, database_name)
+    
+    def update(
+        self,
+        table_name: str,
+        cond: str,
+        data: Dict[str, Any],
+        database_name: Optional[str] = None
+    ) -> Any:
+        """更新文档"""
+        return self._client.update(table_name, cond, data, database_name)
+    
+    def get_status(self) -> Dict[str, Any]:
+        """获取客户端状态"""
+        return self._client.get_status()
+    
+    def close(self):
+        """关闭客户端连接"""
+        close_infinity_client(self._database)
+    
+    # ========== Infinity 特有方法(扩展) ==========
+    
+    def list_databases(self) -> List[str]:
+        """获取所有数据库"""
+        return self._client.list_databases()
+    
+    def list_tables(self, database_name: Optional[str] = None) -> List[str]:
+        """获取所有表"""
+        return self._client.list_tables(database_name)
+    
+    def create_table(
+        self,
+        table_name: str,
+        columns_definition: List[Dict[str, Any]],
+        database_name: Optional[str] = None
+    ):
+        """创建表"""
+        return self._client.create_table(table_name, columns_definition, database_name=database_name)
+    
+    def drop_table(self, table_name: str, database_name: Optional[str] = None):
+        """删除表"""
+        return self._client.drop_table(table_name, database_name)
+    
+    def create_index(
+        self,
+        table_name: str,
+        index_name: str,
+        index_info: Dict[str, Any],
+        database_name: Optional[str] = None
+    ):
+        """创建索引"""
+        return self._client.create_index(table_name, index_name, index_info, database_name)

+ 45 - 0
test_qa_workflow.py

@@ -0,0 +1,45 @@
+"""
+测试QA工作流
+"""
+
+import tempfile
+import os
+from src.datasets.parser.workflows import QAParsingWorkflowV2
+
+# 创建一个临时PDF文件用于测试
+def create_test_pdf():
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as f:
+        # 写入简单的PDF内容
+        f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n0000000052 00000 n \n0000000101 00000 n \ntrailer\n<< /Size 4 /Root 1 0 R >>\n%%EOF")
+        return f.name
+
+def test_qa_workflow():
+    print("开始测试QA工作流...")
+    
+    # 创建测试PDF文件
+    pdf_path = create_test_pdf()
+    print(f"创建测试PDF文件: {pdf_path}")
+    
+    try:
+        # 初始化工作流
+        workflow = QAParsingWorkflowV2()
+        
+        # 运行工作流
+        result = workflow.run(
+            pdf_path=pdf_path,
+            dataset_id="test-dataset-123",
+            qa_count_per_chunk=5,
+            chunk_size=500,
+            chunk_overlap=100
+        )
+        
+        print(f"工作流执行结果: {result}")
+        
+    finally:
+        # 清理测试文件
+        if os.path.exists(pdf_path):
+            os.unlink(pdf_path)
+            print(f"清理测试PDF文件: {pdf_path}")
+
+if __name__ == "__main__":
+    test_qa_workflow()