Browse Source

Initialize the project; first commit

yingge 3 months ago
commit
5ef1502c14
94 files changed, 9036 insertions(+), 0 deletions(-)
  1. .gitignore (+175 -0)
  2. doc/api_keys.sql (+13 -0)
  3. doc/init.sql (+48 -0)
  4. docker/Dockerfile (+37 -0)
  5. docker/docker-compose.yml (+20 -0)
  6. docker/start.bat (+25 -0)
  7. docker/start.sh (+34 -0)
  8. main.py (+122 -0)
  9. requirements.txt (+180 -0)
  10. src/__init__.py (+0 -0)
  11. src/api/__init__.py (+0 -0)
  12. src/api/dataset/__init__.py (+0 -0)
  13. src/api/dataset/models/__init__.py (+0 -0)
  14. src/api/dataset/models/dify_models.py (+56 -0)
  15. src/api/dataset/services/__init__.py (+0 -0)
  16. src/api/dataset/services/dataset_manage_service.py (+56 -0)
  17. src/api/dataset/services/dify_knowledge_service.py (+79 -0)
  18. src/api/db/__init__.py (+0 -0)
  19. src/api/db/services/__init__.py (+3 -0)
  20. src/api/db/services/infinity_search_service.py (+97 -0)
  21. src/api/db/services/tag_service.py (+214 -0)
  22. src/api/mcp/__init__.py (+0 -0)
  23. src/api/mcp/hybrid_search_mcp.py (+98 -0)
  24. src/api/sdk/__init__.py (+0 -0)
  25. src/api/sdk/api_manage.py (+139 -0)
  26. src/api/sdk/dataset_manage.py (+54 -0)
  27. src/api/sdk/dify_dataset_manage.py (+28 -0)
  28. src/api/sdk/search_infinity.py (+78 -0)
  29. src/api/sdk/tag_manage.py (+93 -0)
  30. src/common/__init__.py (+0 -0)
  31. src/common/models/__init__.py (+0 -0)
  32. src/common/models/pagination.py (+10 -0)
  33. src/common/result.py (+62 -0)
  34. src/conf/__init__.py (+0 -0)
  35. src/conf/age_level.json (+10 -0)
  36. src/conf/infinity_mapping.json (+56 -0)
  37. src/conf/rag_parser_config.py (+48 -0)
  38. src/conf/settings.py (+151 -0)
  39. src/job/__init__.py (+0 -0)
  40. src/job/chunk_update_job.py (+129 -0)
  41. src/model/__init__.py (+0 -0)
  42. src/model/jina_rerank.py (+133 -0)
  43. src/model/multimodal_embedding.py (+167 -0)
  44. src/model/openai_chat_model.py (+109 -0)
  45. src/model/qwen_vl.py (+162 -0)
  46. src/model/tracked_multi_embedding.py (+21 -0)
  47. src/model/tracked_openai_embeddings.py (+12 -0)
  48. src/parser/__init__.py (+0 -0)
  49. src/parser/image_parser/__init__.py (+0 -0)
  50. src/parser/image_parser/image_parser_workflow.py (+304 -0)
  51. src/parser/pdf_parser/__init__.py (+0 -0)
  52. src/parser/pdf_parser/pdf_parser_workflow.py (+501 -0)
  53. src/parser/pdf_parser/pdf_splitter.py (+90 -0)
  54. src/parser/pdf_parser/test_service.py (+58 -0)
  55. src/utils/__init__.py (+11 -0)
  56. src/utils/asymmetric_encryption.py (+151 -0)
  57. src/utils/async_utils.py (+33 -0)
  58. src/utils/auth.py (+57 -0)
  59. src/utils/decorators/__init__.py (+0 -0)
  60. src/utils/decorators/langfuse_trace_embedding.py (+45 -0)
  61. src/utils/decorators/singleton.py (+13 -0)
  62. src/utils/es/__init__.py (+18 -0)
  63. src/utils/es/bulk_helper.py (+318 -0)
  64. src/utils/es/client_manager.py (+80 -0)
  65. src/utils/es/core/__init__.py (+3 -0)
  66. src/utils/es/core/index_manager.py (+224 -0)
  67. src/utils/es/services/__init__.py (+3 -0)
  68. src/utils/es/services/search_service.py (+326 -0)
  69. src/utils/excel_util.py (+221 -0)
  70. src/utils/file/__init__.py (+0 -0)
  71. src/utils/file/file_utils.py (+8 -0)
  72. src/utils/file/image_util.py (+319 -0)
  73. src/utils/file/minio/__init__.py (+0 -0)
  74. src/utils/file/minio/minio_util.py (+189 -0)
  75. src/utils/http_client.py (+383 -0)
  76. src/utils/infinity/README.md (+527 -0)
  77. src/utils/infinity/__init__.py (+13 -0)
  78. src/utils/infinity/client.py (+308 -0)
  79. src/utils/infinity/pool.py (+285 -0)
  80. src/utils/infinity/result_util.py (+73 -0)
  81. src/utils/infinity/test_infinity.py (+89 -0)
  82. src/utils/mysql/__init__.py (+52 -0)
  83. src/utils/mysql/mysql_conn.py (+244 -0)
  84. src/utils/mysql/mysql_pool.py (+138 -0)
  85. src/utils/ragflow/__init__.py (+0 -0)
  86. src/utils/ragflow/agent_service.py (+139 -0)
  87. src/utils/ragflow/chat_service.py (+146 -0)
  88. src/utils/ragflow/chunk_record.py (+74 -0)
  89. src/utils/ragflow/chunk_service.py (+78 -0)
  90. src/utils/ragflow/dataset_service.py (+181 -0)
  91. src/utils/ragflow/document_service.py (+127 -0)
  92. src/utils/ragflow/file_service.py (+141 -0)
  93. src/utils/ragflow/openai_service.py (+45 -0)
  94. src/utils/ragflow/ragflow_service.py (+302 -0)

+ 175 - 0
.gitignore

@@ -0,0 +1,175 @@
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.env.home
+.env.example
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+.idea
+
+.trae
+src/agent
+src/prompt
+src/workflow
+src/private_key.pem
+src/public_key.pem
+test/
+requirements.txt.bak

+ 13 - 0
doc/api_keys.sql

@@ -0,0 +1,13 @@
+-- Create the API keys table
+CREATE TABLE IF NOT EXISTS api_keys (
+    id INT AUTO_INCREMENT PRIMARY KEY,
+    api_key VARCHAR(255) NOT NULL UNIQUE,
+    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+    expired_at TIMESTAMP NULL,
+    is_active BOOLEAN DEFAULT TRUE,
+    INDEX idx_api_key (api_key),
+    INDEX idx_is_active (is_active)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- Insert a sample API key (for testing only)
+INSERT IGNORE INTO api_keys (api_key, is_active) VALUES ('sk-test1234567890', TRUE);
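As a usage sketch: a key presented by a client can be checked against this table with a single lookup. A minimal example, assuming PyMySQL (already pinned in requirements.txt); the helper name and connection handling are illustrative, not part of this commit:

import pymysql

def is_valid_api_key(conn: pymysql.connections.Connection, api_key: str) -> bool:
    # A key is valid if it exists, is active, and has not expired.
    with conn.cursor() as cur:
        cur.execute(
            """
            SELECT 1 FROM api_keys
            WHERE api_key = %s
              AND is_active = TRUE
              AND (expired_at IS NULL OR expired_at > NOW())
            LIMIT 1
            """,
            (api_key,),
        )
        return cur.fetchone() is not None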

+ 48 - 0
doc/init.sql

@@ -0,0 +1,48 @@
+-- Create the RagFlow chunk record table
+CREATE TABLE IF NOT EXISTS ragflow_chunk_record (
+    id BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT 'Primary key ID',
+    database_name VARCHAR(64) NOT NULL COMMENT 'Database ID',
+    table_name VARCHAR(255) NOT NULL COMMENT 'Table ID',
+    chunk_id VARCHAR(64) NOT NULL COMMENT 'Chunk ID',
+    cond VARCHAR(100) COMMENT 'Condition parameter',
+    update_data JSON COMMENT 'Update payload',
+    scheduled_time DATETIME NOT NULL COMMENT 'Scheduled execution time (insert time + 20 seconds)',
+    status VARCHAR(20) NOT NULL DEFAULT '0' COMMENT 'Execution status: 0 pending / 1 succeeded / 2 failed',
+    error_message VARCHAR(255) COMMENT 'Failure message',
+    executed_time DATETIME COMMENT 'Execution time',
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT 'Creation time',
+    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'Update time',
+    INDEX idx_database_name (database_name),
+    INDEX idx_table_name (table_name),
+    INDEX idx_chunk_id (chunk_id),
+    INDEX idx_scheduled_time (scheduled_time),
+    INDEX idx_status (status)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='RagFlow chunk record table';
+
+-- Create the dimension-to-knowledge-base mapping table
+CREATE TABLE IF NOT EXISTS dimensional_knowledge (
+    id INT AUTO_INCREMENT PRIMARY KEY,
+    dimensional_id VARCHAR(64) NOT NULL UNIQUE COMMENT 'Dimension ID',
+    dimensional_name VARCHAR(255) NOT NULL COMMENT 'Dimension name',
+    knowledge_id VARCHAR(64) NOT NULL UNIQUE COMMENT 'Knowledge base ID',
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT 'Creation time',
+    updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'Update time',
+    is_active BOOLEAN DEFAULT TRUE COMMENT 'Active flag',
+    INDEX idx_dimensional_id (dimensional_id),
+    INDEX idx_knowledge_id (knowledge_id),
+    INDEX idx_is_active (is_active)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- Create the API keys table
+CREATE TABLE IF NOT EXISTS api_keys (
+    id INT AUTO_INCREMENT PRIMARY KEY COMMENT 'Primary key ID',
+    api_key VARCHAR(255) NOT NULL UNIQUE COMMENT 'API key',
+    created_at DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT 'Creation time',
+    expired_at DATETIME NULL COMMENT 'Expiration time',
+    is_active BOOLEAN DEFAULT TRUE COMMENT 'Active flag',
+    INDEX idx_api_key (api_key),
+    INDEX idx_is_active (is_active)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- Insert a sample API key (for testing only)
+INSERT IGNORE INTO api_keys (api_key, is_active) VALUES ('sk-test1234567890', TRUE);
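The scheduled_time column drives the deferred-update pattern noted in the schema comments (execute at insert time + 20 seconds). A minimal sketch of how a polling job could claim due rows, assuming PyMySQL; this is illustrative and not the chunk_update_job.py shipped in this commit:

import json
import pymysql
import pymysql.cursors

def fetch_due_rows(conn, limit=50):
    # Pending rows (status '0') whose scheduled time has passed.
    with conn.cursor(pymysql.cursors.DictCursor) as cur:
        cur.execute(
            """
            SELECT id, database_name, table_name, chunk_id, cond, update_data
            FROM ragflow_chunk_record
            WHERE status = '0' AND scheduled_time <= NOW()
            ORDER BY scheduled_time
            LIMIT %s
            """,
            (limit,),
        )
        rows = cur.fetchall()
    for row in rows:
        # update_data is stored as JSON text
        row["update_data"] = json.loads(row["update_data"]) if row["update_data"] else {}
    return rows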

+ 37 - 0
docker/Dockerfile

@@ -0,0 +1,37 @@
+# Use the official Python 3.12 slim image as the base
+FROM python:3.12-slim
+
+# Set the working directory
+WORKDIR /app
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    TZ=Asia/Shanghai
+
+# Install system dependencies
+# build-essential: build toolchain
+# curl: network utility
+# libgl1-mesa-glx: required by OpenCV and similar libraries
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    curl \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy the dependency list
+COPY requirements.txt .
+
+# Strip the Windows-only dependency (pywin32) and install the rest
+RUN sed -i '/pywin32/d' requirements.txt && \
+    pip install --no-cache-dir -r requirements.txt
+
+# Copy the project code
+COPY . .
+
+# Expose the service port
+EXPOSE 18001
+
+# Start command
+CMD ["python", "main.py"]

+ 20 - 0
docker/docker-compose.yml

@@ -0,0 +1,20 @@
+version: '3.8'
+
+services:
+  rag-server:
+    container_name: book-rag-server
+    build:
+      context: ..
+      dockerfile: docker/Dockerfile
+    restart: always
+    ports:
+      - "18001:18001"
+    volumes:
+      - ./.env:/app/.env
+      # In development, uncomment the following line to mount the source tree
+      # - ../src:/app/src
+    environment:
+      - TZ=Asia/Shanghai
+    # If dependent services run on the host (localhost), uncomment the lines below and use host.docker.internal as the hostname
+    # extra_hosts:
+    #   - "host.docker.internal:host-gateway"

+ 25 - 0
docker/start.bat

@@ -0,0 +1,25 @@
+@echo off
+cd /d "%~dp0"
+
+echo Starting Book RAG Server...
+
+REM Check for docker-compose or docker compose
+where docker-compose >nul 2>nul
+if %errorlevel% equ 0 (
+    set CMD=docker-compose
+) else (
+    set CMD=docker compose
+)
+
+echo Using command: %CMD%
+%CMD% up -d --build
+
+if %errorlevel% equ 0 (
+    echo.
+    echo [SUCCESS] Service started successfully!
+    echo You can view logs with: %CMD% logs -f
+) else (
+    echo.
+    echo [ERROR] Failed to start service.
+    pause
+)

+ 34 - 0
docker/start.sh

@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Change to the directory containing this script
+cd "$(dirname "$0")"
+
+# Check for the docker command
+if ! command -v docker &> /dev/null; then
+    echo "Error: docker could not be found."
+    exit 1
+fi
+
+# Check for Compose (supports both docker-compose and docker compose)
+if command -v docker-compose &> /dev/null; then
+    DOCKER_COMPOSE_CMD="docker-compose"
+elif docker compose version &> /dev/null; then
+    DOCKER_COMPOSE_CMD="docker compose"
+else
+    echo "Error: docker-compose could not be found."
+    exit 1
+fi
+
+echo "Starting Book RAG Server with $DOCKER_COMPOSE_CMD..."
+
+# Build and start the services
+$DOCKER_COMPOSE_CMD up -d --build
+
+# Check the startup status
+if [ $? -eq 0 ]; then
+    echo "✅ Service started successfully!"
+    echo "You can verify the logs with: $DOCKER_COMPOSE_CMD logs -f"
+else
+    echo "❌ Failed to start service."
+    exit 1
+fi

+ 122 - 0
main.py

@@ -0,0 +1,122 @@
+# Main application entry point; aggregates multiple FastAPI sub-apps
+import uvicorn
+from fastapi import FastAPI
+from contextlib import asynccontextmanager
+
+# Import all sub-applications
+from src.api.sdk.search_infinity import app as search_app
+from src.api.sdk.tag_manage import app as tag_app
+from src.api.sdk.dataset_manage import app as dataset_app
+from src.api.sdk.api_manage import app as api_manage_app
+from src.api.sdk.dify_dataset_manage import app as dify_dataset_manage_app
+
+# Import the authentication middleware
+from src.utils.auth import verify_api_key
+
+# Lifespan management for the main application
+@asynccontextmanager
+async def main_lifespan(app: FastAPI):
+    """Main application lifespan management."""
+    from src.utils.infinity import get_client, close_client
+    # 1. Initialize the global Infinity client (at service startup)
+    get_client(database="book_image_db", min_connections=5, max_connections=10)
+    print("✅ Infinity客户端已初始化")
+
+    # 2. Initialize the global MySQL client
+    from src.utils.mysql import init_global_mysql_client, close_global_mysql_client
+    init_global_mysql_client()
+    print("✅ MySQL客户端已初始化")
+
+    # 3. Initialize the global MinIO client and verify the bucket
+    from src.utils.file.minio.minio_util import init_minio_client, close_minio_client
+    init_minio_client(check_bucket=True)
+    print("✅ MinIO客户端已初始化并校验存储桶")
+
+    # 4. Start the chunk-update scheduler
+    from src.job.chunk_update_job import start_scheduler, shutdown_scheduler
+    start_scheduler()
+    print("✅ Chunk update scheduler started")
+
+    yield
+
+    # 1. Shut down the chunk-update scheduler
+    shutdown_scheduler()
+    print("✅ Chunk update scheduler shutdown")
+
+    # 2. Close the global MinIO client
+    close_minio_client()
+    print("✅ MinIO客户端已关闭")
+
+    # 3. Close the global MySQL client
+    close_global_mysql_client()
+    print("✅ MySQL客户端已关闭")
+
+    # 4. Close the global Infinity client (at service shutdown)
+    close_client()
+    print("✅ Infinity客户端已关闭")
+
+
+# Create the main application
+main_app = FastAPI(
+    title="Infinity API Gateway",
+    description="整合多个 FastAPI 应用的 API 网关",
+    version="1.0.0",
+    lifespan=main_lifespan
+)
+
+# Register the authentication middleware
+main_app.middleware("http")(verify_api_key)
+# Mount the sub-applications
+# 1. Search API - path: /search/*
+main_app.mount("/search", search_app, name="search_api")
+# 2. Tag management API - path: /tag/*
+main_app.mount("/tag", tag_app, name="tag_api")
+# 3. Dataset management API - path: /dataset/*
+main_app.mount("/dataset", dataset_app, name="dataset_api")
+# 4. API key management - path: /api/*
+main_app.mount("/api", api_manage_app, name="api_manage")
+# 5. Dify dataset management API - path: /dify_dataset/*
+main_app.mount("/dify_dataset", dify_dataset_manage_app, name="dify_dataset_manage")
+
+from src.common.result import Result
+
+# Root path of the main application
+@main_app.get("/")
+async def root():
+    """API gateway root path."""
+    data = {
+        "message": "Welcome to GRAPH_RAG API Gateway",
+        "available_apps": {
+            "search_api": "访问路径: /search, 文档: /search/docs",
+            "tag_api": "访问路径: /tag, 文档: /tag/docs",
+            "dataset_api": "访问路径: /dataset, 文档: /dataset/docs",
+            "api_manage": "访问路径: /api, 文档: /api/docs",
+            "dify_dataset_manage": "访问路径: /dify_dataset, 文档: /dify_dataset/docs"
+        }
+    }
+    return Result.success(data=data, message="欢迎访问 GRAPH_RAG API Gateway")
+
+# Health check endpoint
+@main_app.get("/health")
+async def health_check():
+    """Main application health check."""
+    data = {"status": "healthy", "service": "Infinity API Gateway"}
+    return Result.success(data=data, message="服务健康")
+
+if __name__ == "__main__":
+    # Start the main application
+    print("=== 启动 GRAPH_RAG API Gateway ===")
+    uvicorn.run(
+        "main:main_app",  # application path: module_name:app_instance
+        host="0.0.0.0",   # listen on all interfaces
+        port=18001,       # service port
+        reload=False,     # auto-reload disabled; enable only during development
+        workers=1,        # increase for production as needed
+        log_level="info", # log level
+        limit_concurrency=100,  # concurrent connection limit
+        timeout_keep_alive=30,  # keep-alive timeout
+        timeout_graceful_shutdown=10  # graceful shutdown timeout
+    )
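Once the gateway is up, a quick smoke test can exercise the mounted apps. A minimal sketch using httpx (already in requirements.txt); the Authorization header scheme is an assumption, since verify_api_key in src/utils/auth.py is not shown in this diff:

import httpx

BASE_URL = "http://localhost:18001"
API_KEY = "sk-test1234567890"  # sample key seeded by doc/init.sql

# Header name/scheme is an assumption; check src/utils/auth.py for the real one.
headers = {"Authorization": f"Bearer {API_KEY}"}

with httpx.Client(base_url=BASE_URL, headers=headers) as client:
    print(client.get("/health").json())            # gateway health check
    print(client.get("/search/docs").status_code)  # docs of the mounted search app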

+ 180 - 0
requirements.txt

@@ -0,0 +1,180 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.3
+aiosignal==1.4.0
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.12.0
+APScheduler==3.11.2
+argon2-cffi==25.1.0
+argon2-cffi-bindings==25.1.0
+attrs==25.4.0
+Authlib==1.6.6
+backoff==2.2.1
+beartype==0.22.9
+cachetools==6.2.4
+certifi==2026.1.4
+cffi==2.0.0
+charset-normalizer==3.4.4
+click==8.3.1
+cloudpickle==3.1.2
+colorama==0.4.6
+cryptography==46.0.3
+cyclopts==4.4.4
+dashscope==1.25.5
+dataclasses-json==0.6.7
+datrie==0.8.3
+DBUtils==3.1.2
+diskcache==5.6.3
+distro==1.9.0
+dnspython==2.8.0
+docstring_parser==0.17.0
+docutils==0.22.4
+elastic-transport==8.17.1
+elasticsearch==8.11.1
+email-validator==2.3.0
+et_xmlfile==2.0.0
+exceptiongroup==1.3.1
+fakeredis==2.33.0
+fastapi==0.128.0
+fastmcp==2.14.2
+filelock==3.20.2
+frozenlist==1.8.0
+fsspec==2025.12.0
+googleapis-common-protos==1.72.0
+greenlet==3.3.0
+h11==0.16.0
+hanziconv==0.3.2
+hf-xet==1.2.0
+httpcore==1.0.9
+httpx==0.28.1
+httpx-sse==0.4.3
+huggingface_hub==1.2.3
+idna==3.11
+importlib_metadata==8.7.1
+infinity-sdk==0.6.15
+infinity_emb==0.0.77
+jaraco.classes==3.4.0
+jaraco.context==6.0.2
+jaraco.functools==4.4.0
+jiter==0.12.0
+joblib==1.5.3
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.26.0
+jsonschema-path==0.3.4
+jsonschema-specifications==2025.9.1
+keyring==25.7.0
+langchain==1.2.0
+langchain-classic==1.0.1
+langchain-community==0.4.1
+langchain-core==1.2.6
+langchain-mcp-adapters==0.2.1
+langchain-openai==1.1.6
+langchain-text-splitters==1.1.0
+langfuse==3.12.0
+langgraph==1.0.5
+langgraph-checkpoint==3.0.1
+langgraph-prebuilt==1.0.5
+langgraph-sdk==0.3.1
+langsmith==0.6.0
+lupa==2.6
+markdown-it-py==4.0.0
+marshmallow==3.26.2
+mcp==1.25.0
+mdurl==0.1.2
+minio==7.2.20
+more-itertools==10.8.0
+multidict==6.7.0
+mypy_extensions==1.1.0
+nltk==3.9.2
+numpy==1.26.4
+ollama==0.6.1
+openai==2.14.0
+openapi-pydantic==0.5.1
+openpyxl==3.1.5
+opentelemetry-api==1.39.1
+opentelemetry-exporter-otlp-proto-common==1.39.1
+opentelemetry-exporter-otlp-proto-http==1.39.1
+opentelemetry-exporter-prometheus==0.60b1
+opentelemetry-instrumentation==0.60b1
+opentelemetry-proto==1.39.1
+opentelemetry-sdk==1.39.1
+opentelemetry-semantic-conventions==0.60b1
+orjson==3.11.5
+ormsgpack==1.12.1
+packaging==25.0
+pandas==2.3.3
+pathable==0.4.4
+pathvalidate==3.3.1
+pdf2image==1.17.0
+pillow==12.1.0
+platformdirs==4.5.1
+polars-lts-cpu==1.33.1
+prometheus_client==0.23.1
+propcache==0.4.1
+protobuf==6.33.4
+py-key-value-aio==0.3.0
+py-key-value-shared==0.3.0
+pyarrow==22.0.0
+pycparser==2.23
+pycryptodome==3.23.0
+pydantic==2.12.5
+pydantic-settings==2.12.0
+pydantic_core==2.41.5
+pydocket==0.16.3
+Pygments==2.19.2
+PyJWT==2.10.1
+PyMuPDF==1.26.7
+PyMySQL==1.1.2
+pyperclip==1.11.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.2.1
+python-json-logger==4.0.0
+python-multipart==0.0.21
+pytz==2025.2
+pywin32==311
+pywin32-ctypes==0.2.3
+PyYAML==6.0.3
+ragflow-sdk==0.23.1
+readerwriterlock==1.0.9
+redis==7.1.0
+referencing==0.36.2
+regex==2025.11.3
+requests==2.32.5
+requests-toolbelt==1.0.0
+rich==14.2.0
+rich-rst==1.3.2
+rpds-py==0.30.0
+setuptools==80.9.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+SQLAlchemy==2.0.45
+sqlglot==28.5.0
+sqlglotrs==0.10.0
+sse-starlette==3.1.2
+starlette==0.50.0
+tenacity==9.1.2
+thrift==0.22.0
+tiktoken==0.12.0
+tqdm==4.67.1
+typer==0.21.1
+typer-slim==0.21.0
+typing-inspect==0.9.0
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+tzdata==2025.3
+tzlocal==5.3.1
+urllib3==2.6.2
+uuid_utils==0.12.0
+uvicorn==0.40.0
+websocket-client==1.9.0
+websockets==15.0.1
+wheel==0.45.1
+wrapt==1.17.3
+xlrd==2.0.2
+xxhash==3.6.0
+yarl==1.22.0
+zipp==3.23.0
+zstandard==0.25.0

+ 0 - 0
src/__init__.py


+ 0 - 0
src/api/__init__.py


+ 0 - 0
src/api/dataset/__init__.py


+ 0 - 0
src/api/dataset/models/__init__.py


+ 56 - 0
src/api/dataset/models/dify_models.py

@@ -0,0 +1,56 @@
+from pydantic import BaseModel, Field
+from typing import List, Optional, Any, Literal
+
+class Condition(BaseModel):
+    """
+    筛选条件对象
+    """
+    name: List[str] = Field(
+        ..., 
+        description="需要筛选的 metadata 名称路径", 
+        example=["category", "tag"]
+    )
+    comparison_operator: str = Field(
+        ..., 
+        description="比较操作符,如 contains, equal, empty 等", 
+        example="contains"
+    )
+    value: Optional[str] = Field(
+        None, 
+        description="对比值。当操作符为 empty, not empty, null, not null 时可省略", 
+        example="AI"
+    )
+
+class MetadataCondition(BaseModel):
+    """元数据筛选条件(可选)"""
+    logical_operator: Literal["and", "or"] = Field(
+        default="and", 
+        description="组合条件的逻辑运算符"
+    )
+    conditions: List[Condition] = Field(
+        default_factory=list, 
+        description="具体的筛选条件列表"
+    )
+
+class RetrievalSetting(BaseModel):
+    """检索设置"""
+    top_k: int = Field(default=5, description="返回最相关的结果数量")
+    score_threshold: float = Field(default=0.0, description="分数阈值筛选")
+
+class RetrievalRequest(BaseModel):
+    """Dify 发送的检索请求体"""
+    knowledge_id: str = Field(..., description="知识库ID")
+    query: str = Field(..., description="查询字符串")
+    retrieval_setting: RetrievalSetting
+    metadata_condition: Optional[MetadataCondition] = None
+
+class Record(BaseModel):
+    """单条检索结果记录"""
+    content: str = Field(..., description="文本内容")
+    score: float = Field(..., description="相关性分数 (0.0 - 1.0)")
+    title: str = Field(..., description="文档标题")
+    metadata: Optional[dict] = Field(default=None, description="其他元数据")
+
+class RetrievalResponse(BaseModel):
+    """返回给 Dify 的响应体"""
+    records: List[Record]
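To make the request contract concrete, a small example of building the payload these models validate (all values are illustrative):

from src.api.dataset.models.dify_models import (
    Condition, MetadataCondition, RetrievalRequest, RetrievalSetting,
)

request = RetrievalRequest(
    knowledge_id="kb_123",
    query="picture books about friendship",
    retrieval_setting=RetrievalSetting(top_k=5, score_threshold=0.2),
    metadata_condition=MetadataCondition(
        logical_operator="and",
        conditions=[
            Condition(name=["category"], comparison_operator="contains", value="AI"),
        ],
    ),
)
print(request.model_dump_json(indent=2))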

+ 0 - 0
src/api/dataset/services/__init__.py


+ 56 - 0
src/api/dataset/services/dataset_manage_service.py

@@ -0,0 +1,56 @@
+"""
+数据集管理服务
+
+该文件提供数据集管理功能,支持:
+- PDF文件解析
+- 数据集创建和管理
+- 调用PDF解析工作流
+"""
+
+import os
+import tempfile
+from typing import Dict, Any, Optional
+from src.parser.pdf_parser.pdf_parser_workflow import PDFParsingWorkflow
+from src.conf.settings import vector_db_settings
+
+
+class DatasetManageService:
+    """数据集管理服务类"""
+    
+    def __init__(self):
+        """初始化数据集管理服务"""
+        self.pdf_workflow = PDFParsingWorkflow()
+    
+    def parse_pdf(self, series_name: str, pdf_file: bytes, pdf_filename: str) -> Dict[str, Any]:
+        """
+        解析PDF文件
+        
+        Args:
+            series_name: 系列名
+            pdf_file: PDF文件字节数据
+            pdf_filename: PDF文件名
+            
+        Returns:
+            Dict[str, Any]: 解析结果
+        """
+        try:
+            # Write to a temp file under the original file name
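+            # NOTE: a fixed name in the shared temp dir can collide if two requests
+            # upload a file with the same name concurrently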
+            temp_dir = tempfile.gettempdir()
+            temp_file_path = os.path.join(temp_dir, pdf_filename)
+            with open(temp_file_path, 'wb') as temp_file:
+                temp_file.write(pdf_file)
+            
+            try:
+                # Run the PDF parsing workflow
+                result = self.pdf_workflow.run(
+                    pdf_path=temp_file_path,
+                    page_dataset_id=vector_db_settings.infinity_page_dataset_id,
+                    dataset_name=series_name
+                )
+                
+                return result
+            finally:
+                # Remove the temp file
+                os.unlink(temp_file_path)
+        except Exception as e:
+            raise Exception(f"解析PDF文件失败: {str(e)}")

+ 79 - 0
src/api/dataset/services/dify_knowledge_service.py

@@ -0,0 +1,79 @@
+import json
+from src.conf.settings import vector_db_settings
+from src.utils.infinity import InfinityClient
+from src.utils.file.image_util import image_util
+from src.model.multimodal_embedding import get_embedding_model
+from src.utils.infinity.result_util import convert_to_langchain_docs
+from src.api.dataset.models.dify_models import RetrievalRequest
+
+class DifyKnowledgeService:
+    def __init__(self, infinity_client: InfinityClient, vector_field: str = None, match_field: str = None, match_type: str = None, table_name: str = None):
+        self.infinity_client = infinity_client
+        # Output fields
+        self.output_fields = [
+            "file_name",
+            "page_number",
+            "content",
+            "image_path",
+            "dataset_id",
+            "document_id",
+            "_similarity"
+        ]
+        self.vector_field = vector_field or "dense_vector_1024"
+        self.match_field = match_field or "content"
+        self.match_type = match_type or "cosine"
+        self.table_name = table_name or vector_db_settings.infinity_table_name
+
+    def dify_database_search(self, request: RetrievalRequest):
+        """
+        Run a Dify knowledge-base search.
+
+        Args:
+            request: Retrieval request parameters
+
+        Returns:
+            Search results, converted to LangChain documents for serialization
+        """
+        try:
+            if request.knowledge_id:
+                # table_name = f"{vector_db_settings.infinity_dataset_prefix}{request.knowledge_id}"
+                table_name = self.table_name
+            else:
+                raise Exception("knowledge_id不能为空")
+            # Parse the retrieval query; it may be a JSON object carrying multimodal inputs
+            input_image = None
+            input_text = None
+            try:
+                query = json.loads(request.query)
+                # Pick up match_image / matching_text if the query carries them
+                if "match_image" in query or "matching_text" in query:
+                    input_image = query.get("match_image")
+                    input_text = query.get("matching_text")
+            except json.JSONDecodeError:
+                # Plain-text query
+                input_text = request.query
+
+            retrieval_setting = request.retrieval_setting
+
+            # 1. Convert the image URL to a PIL Image (skip when no image was provided)
+            image = image_util._url_to_image(input_image) if input_image else None
+            # Multimodal query vector
+            query_vector = get_embedding_model().get_multimodal_embedding(text=input_text, image=image)
+            # Build the search query
+            search_query = {
+                "vector_field": self.vector_field,
+                "query_vector": query_vector,
+                "topn": retrieval_setting.top_k,
+                "knn_params": {
+                    "ef": str(retrieval_setting.top_k * 10),
+                    "threshold": str(retrieval_setting.score_threshold)
+                }
+            }
+            # Execute the search
+            result = self.infinity_client.vector_search(table_name, self.output_fields, search_query)
+            # Convert the result to basic types, handling any complex values
+            result_dict = result.to_result()
+            # Recursively convert into LangChain documents
+            return convert_to_langchain_docs(result_dict)
+        except Exception as e:
+            raise Exception(f"搜索失败: {str(e)}")
+
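For reference, when an image is part of the query, this service expects request.query to be a JSON object rather than plain text. A client-side sketch (values are illustrative):

import json

# Encode both inputs in the query string; a non-JSON query is treated as text-only.
query = json.dumps({
    "matching_text": "a red fox in a picture book",
    "match_image": "https://example.com/page-3.png",  # illustrative URL
})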

+ 0 - 0
src/api/db/__init__.py


+ 3 - 0
src/api/db/services/__init__.py

@@ -0,0 +1,3 @@
+# from api.db.services.infinity_search_service import InfinitySearchService
+
+# search_service = InfinitySearchService()

+ 97 - 0
src/api/db/services/infinity_search_service.py

@@ -0,0 +1,97 @@
+from typing import Dict, Any, List
+from src.conf.settings import vector_db_settings
+from src.utils.infinity import InfinityClient
+from src.utils.file.image_util import image_util
+from src.model.multimodal_embedding import get_embedding_model
+from src.utils.infinity.result_util import convert_to_basic_types
+
+class InfinitySearchService:
+    def __init__(self, infinity_client: InfinityClient, vector_field: str = None, match_field: str = None, match_type: str = None, table_name: str = None):
+        self.infinity_client = infinity_client
+        # Output fields
+        self.output_fields = [
+            "file_name",
+            "page_number",
+            "content",
+            "image_path",
+            "dataset_id",
+            "document_id"
+        ]
+        self.vector_field = vector_field or "dense_vector_1024"
+        self.match_field = match_field or "content"
+        self.match_type = match_type or "cosine"
+        self.table_name = table_name or vector_db_settings.infinity_table_name
+
+    def search(self, search_query: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Run a full-text search against Infinity.
+
+        Args:
+            search_query: Search query parameters
+
+        Returns:
+            Search results, converted to basic types for serialization
+        """
+        try:
+            # Execute the search
+            result = self.infinity_client.search(self.table_name, self.output_fields, search_query)
+            # Convert the result to basic types, handling any complex values
+            result_dict = result.to_result()
+            return convert_to_basic_types(result_dict)
+        except Exception as e:
+            raise Exception(f"搜索失败: {str(e)}")
+
+    def vector_search(self, search_query: Dict[str, Any]):
+        """
+        Run a vector search against Infinity.
+
+        Args:
+            search_query: Vector search query parameters
+
+        Returns:
+            Vector search results, converted to basic types for serialization
+        """
+        try:
+            # 1. Resolve image_url into a PIL.Image
+            image = image_util._url_to_image(search_query["image_url"])
+            # 2. Embed the text and image into one multimodal vector
+            query_vector = get_embedding_model().get_multimodal_embedding(search_query["matching_text"], image)
+
+            search_query["vector_field"] = self.vector_field
+            search_query["query_vector"] = query_vector
+            # Execute the vector search
+            result = self.infinity_client.vector_search(self.table_name, self.output_fields, search_query)
+            # Convert the result to basic types, handling any complex values
+            result_dict = result.to_result()
+            return convert_to_basic_types(result_dict)
+        except Exception as e:
+            raise Exception(f"向量检索失败: {str(e)}")
+
+    def hybrid_search(self, search_query: Dict[str, Any]):
+        """
+        Run a hybrid (text + vector) search against Infinity.
+
+        Args:
+            search_query: Hybrid search query parameters
+
+        Returns:
+            Hybrid search results, converted to basic types for serialization
+        """
+        try:
+            # 1. Resolve image_url into a PIL.Image
+            image = image_util._url_to_image(search_query["image_url"])
+            # 2. Embed the text and image into one multimodal vector
+            query_vector = get_embedding_model().get_multimodal_embedding(search_query["matching_text"], image)
+            search_query["vector_field"] = self.vector_field
+            search_query["query_vector"] = query_vector
+            search_query["match_field"] = self.match_field
+            # Execute the hybrid search
+            result = self.infinity_client.hybrid_search(self.table_name, self.output_fields, search_query)
+            # Convert the result to basic types, handling any complex values
+            result_dict = result.to_result()
+            return convert_to_basic_types(result_dict)
+        except Exception as e:
+            raise Exception(f"混合检索失败: {str(e)}")

+ 214 - 0
src/api/db/services/tag_service.py

@@ -0,0 +1,214 @@
+from typing import List, Dict, Any, Optional
+from abc import ABC, abstractmethod
+from src.utils.ragflow.ragflow_service import RAGFlowService
+from src.utils.infinity import InfinityClient
+from src.conf.settings import tag_search_settings
+
+
+class TagService(ABC):
+    """Tag management service interface."""
+    
+    @abstractmethod
+    def create_tag(self, tag_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Create a tag.
+        
+        Args:
+            tag_data: Tag data, including the tag name, description, etc.
+            
+        Returns:
+            Dict[str, Any]: The created tag
+        """
+        pass
+    
+    @abstractmethod
+    def upload_tags(self, tags_data: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Bulk-upload tags.
+        
+        Args:
+            tags_data: List of tag data; each item includes the tag name, description, etc.
+            
+        Returns:
+            Dict[str, Any]: Upload result with success and failure counts
+        """
+        pass
+    
+    @abstractmethod
+    def delete_tag(self, tag_id: str) -> Dict[str, Any]:
+        """
+        Delete a tag.
+        
+        Args:
+            tag_id: Tag ID
+            
+        Returns:
+            Dict[str, Any]: Deletion result
+        """
+        pass
+
+
+class TagServiceImpl(TagService):
+    """Tag management service implementation."""
+    
+    def __init__(self, infinity_client: InfinityClient):
+        """
+        Initialize the tag service.
+        
+        Args:
+            infinity_client: Infinity database client instance
+        """
+        self.tag_dataset_id = tag_search_settings.tag_dataset_id
+        self.tag_document_id = tag_search_settings.tag_document_id
+        self.ragflow_service = RAGFlowService()
+        self.infinity_client = infinity_client
+    
+    def create_tag(self, tag_data: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Create a tag.
+        
+        Args:
+            tag_data: Tag data, including the tag name, description, etc.
+            
+        Returns:
+            Dict[str, Any]: The created tag
+        """
+        tag_name = tag_data["name"]
+        tag_desc = tag_data["description"]
+        age_range = tag_data["age_range"]
+        # Step 1: write the chunk data into RAGFlow
+        chunk = self.ragflow_service.create_chunk(dataset_id=self.tag_dataset_id,
+                                                  document_id=self.tag_document_id,
+                                                  content=tag_desc,
+                                                  important_keywords=[age_range])
+        chunk_id = chunk["chunk"]["id"]
+        # Step 2: call Infinity's update to attach the tag to the chunk row
+        res = self.infinity_client.update(f"id = '{chunk_id}'", {"tag_kwd": tag_name})
+        if res["code"] != 0:
+            raise Exception(f"更新标签到infinity失败: {res}")
+        
+        # Return the created tag info
+        return {
+            "name": tag_name,
+            "description": tag_desc,
+            "age_range": age_range,
+            "chunk_id": chunk_id,
+            "ragflow_chunk": chunk,
+            "infinity_update_result": res
+        }
+    
+    def upload_tags(self, tags_data: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Bulk-upload tags.
+        
+        Args:
+            tags_data: List of tag data; each item carries the level-1/2/3 tags and the tag description
+            
+        Returns:
+            Dict[str, Any]: Upload result with success and failure counts
+        """
+        success_count = 0
+        fail_count = 0
+        failed_tags = []
+
+        tag_data_list = self.get_tag_data(tags_data)
+        # Persist the tag data
+        for tag_data in tag_data_list:
+            tag_name = tag_data["name"]
+            tag_desc = tag_data["description"]
+            age_range = tag_data["age_range"]
+            # Step 1: write the chunk data into RAGFlow
+            chunk = self.ragflow_service.create_chunk(dataset_id=self.tag_dataset_id,
+                                                  document_id=self.tag_document_id,
+                                                  content=tag_desc,
+                                                  important_keywords=[age_range])
+            chunk_id = chunk["chunk"]["id"]        
+            print(f"分块数据写入成功, chunk_id: {chunk_id}")
+            # Step 2: call Infinity's update to attach the tag to the chunk row
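+            # NOTE: the table and database names below are hardcoded to one specific RAGFlow dataset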
+            res = self.infinity_client.update(table_name="ragflow_92162247e93e11f084830242ac1d0002_18caf531f04d11f095670242c0a85002", 
+                                              cond=f"id = '{chunk_id}'", 
+                                              data={"tag_kwd": tag_name},
+                                              database_name="default_db")
+            print(f"更新标签 {tag_name} 到 infinity 结果: {res}")
+            if res.error_code == 0:
+                print(f"标签 {tag_name} 更新到 infinity 成功")
+                success_count += 1
+            else:
+                fail_count += 1
+                failed_tags.append({"age_range": age_range, "tag_name": tag_name, "tag_desc": tag_desc, "error": res})
+        return {
+            "success": True,
+            "total": len(tags_data),
+            "success_count": success_count,
+            "fail_count": fail_count,
+            "failed_tags": failed_tags
+        }
+    
+    def delete_tag(self, tag_id: str) -> Dict[str, Any]:
+        """
+        删除标签
+        
+        Args:
+            tag_id: 标签ID
+            
+        Returns:
+            Dict[str, Any]: 删除结果
+        """
+        pass
+
+    def get_tag_data(self, tags_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        import json
+        import os
+        
+        # Load the age-level configuration file
+        age_level_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))), "conf", "age_level.json")
+        with open(age_level_path, "r", encoding="utf-8") as f:
+            age_level_map = json.load(f)
+        
+        # Collected, normalized tag data
+        formatted_tags = []
+        
+        # Walk the tag list, validate each entry, and emit it in the required format
+        for tag_data in tags_data:
+            # Extract the tag fields (the Excel headers are Chinese)
+            age_level_code = tag_data.get("年龄分级", "")  # age-level code
+            one_tag = tag_data.get("一级标签", "")  # level-1 tag
+            two_tag = tag_data.get("二级标签", "")  # level-2 tag
+            three_tag = tag_data.get("三级标签", "")  # level-3 tag
+            tag_desc = tag_data.get("标签描述", "")  # tag description
+
+            # Build the tag name: level1_level2_level3
+            tag_name = f"{one_tag}_{two_tag}_{three_tag}"
+
+            # Look up the age range
+            age_range = age_level_map.get(age_level_code, "")
+
+            # Assemble the final tag record
+            formatted_tag_data = {
+                "name": tag_name,
+                "description": tag_desc,
+                "age_range": age_range
+            }
+            
+            # Append the normalized record
+            formatted_tags.append(formatted_tag_data)
+        
+        # Return the normalized tag records
+        return formatted_tags
+
+class TagServiceFactory:
+    """Tag service factory."""
+    
+    @staticmethod
+    def create_tag_service(db_client) -> TagService:
+        """
+        Create a tag service instance.
+        
+        Args:
+            db_client: Database client instance
+            
+        Returns:
+            TagService: Tag service instance
+        """
+        return TagServiceImpl(db_client)

+ 0 - 0
src/api/mcp/__init__.py


+ 98 - 0
src/api/mcp/hybrid_search_mcp.py

@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""
+Hybrid-retrieval MCP service.
+Built on the fastmcp framework; provides vectorized ingestion of parsed images and hybrid retrieval.
+"""
+import requests
+from io import BytesIO
+from typing import List, Dict, Any
+from fastmcp import FastMCP
+from PIL import Image
+from src.utils.infinity_util import InfinityVectorDB
+from src.model.multimodal_embedding import Embedding
+from src.conf.settings import model_settings, ragflow_settings, vector_db_settings
+
+
+# Initialize the fastmcp application
+mcp = FastMCP("Multi_Vector_Search")
+
+# Initialize the vector database
+vector_db = InfinityVectorDB()
+
+# Initialize the multimodal embedding model
+embedding_model = Embedding(
+    model_name=model_settings.multimodal_embedding_model_name,
+    api_key=model_settings.dashscope_api_key
+)
+
+@mcp.tool(name="hybrid_search")
+def hybrid_search(request: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Hybrid retrieval API.
+    Combines a text query and a vector query.
+    """
+    try:
+        # Parse the request parameters
+        text_query = request["text_query"]
+        image_url = request["image"]
+        topn = request.get("topn", 2)
+
+        print(f"开始混合检索,数据库: {vector_db_settings.infinity_database}, 知识库id: {ragflow_settings.dataset_id}, 文本查询: {text_query}, 返回数量: {topn}")
+
+        # Build the index name
+        index_name = f"pdf_documents_{ragflow_settings.dataset_id}"
+        print(f"开始生成多模态嵌入,文本长度: {len(text_query)}")
+
+        # Resolve image_url into a PIL.Image (text-only queries pass image=None)
+        image = None
+        if isinstance(image_url, str):
+            # Download the image
+            response = requests.get(image_url)
+            response.raise_for_status()  # check the HTTP status code
+            # Wrap the response body in a byte stream
+            image_bytes = BytesIO(response.content)
+            # Create the Image object
+            image = Image.open(image_bytes)
+
+        # Generate the multimodal embedding vector
+        embedding = embedding_model.get_multimodal_embedding(text_query, image)
+
+        print(f"多模态嵌入生成完成,向量长度: {len(embedding)}")
+
+        # Run the hybrid retrieval
+        result = vector_db.hybrid_search(
+            index_name=index_name,
+            match_method="dense",
+            vector_field="dense_vector_1024",
+            query_vector=embedding,
+            element_type="float",
+            metric_type="cosine",
+            topn=topn,
+            text_query=text_query,
+            text_field="content"
+        )
+
+        print(f"混合检索完成,总命中数: {result.get('total', 0)}")
+
+        # Success response
+        return {
+            "success": True,
+            "message": "混合检索成功",
+            "output": result.get("output", []),
+            "total": result.get("total", topn)
+        }
+    except Exception as e:
+        print(f"混合检索失败: {str(e)}")
+        return {
+            "success": False,
+            "message": str(e)
+        }
+
+
+if __name__ == "__main__":
+    mcp.run(transport="sse", host="0.0.0.0", port=18000)
+    # Alternative: serve the HTTP app with uvicorn
+    # import uvicorn
+    # uvicorn.run(mcp.http_app, host="0.0.0.0", port=18000)

+ 0 - 0
src/api/sdk/__init__.py


+ 139 - 0
src/api/sdk/api_manage.py

@@ -0,0 +1,139 @@
+"""
+API key management endpoints
+"""
+from fastapi import FastAPI, HTTPException, Depends
+from typing import List, Dict, Any, Optional
+from src.utils.mysql import get_global_mysql_client
+from datetime import datetime, timedelta
+import secrets
+import string
+from src.common.result import Result
+
+app = FastAPI(
+    title="API Key Management",
+    description="API for managing API keys",
+    version="1.0.0"
+)
+
+
+def generate_api_key() -> str:
+    """
+    Generate a new API key in format sk-<random-string>
+    """
+    # Generate random string
+    alphabet = string.ascii_letters + string.digits
+    random_string = ''.join(secrets.choice(alphabet) for _ in range(32))
+    return f"sk-{random_string}"
+
+
+@app.post("/generate")
+async def generate_api_key_endpoint(expiry_days: Optional[int] = None):
+    """
+    Generate a new API key
+    """
+    try:
+        api_key = generate_api_key()
+        mysql_client = get_global_mysql_client()
+        
+        # Calculate expiration date if provided
+        expired_at = None
+        if expiry_days:
+            expired_at = datetime.now() + timedelta(days=expiry_days)
+        
+        # Insert API key into database
+        query = """
+        INSERT INTO api_keys (api_key, expired_at, is_active)
+        VALUES (%s, %s, TRUE)
+        """
+        mysql_client.execute(query, [api_key, expired_at])
+        
+        data = {
+            "api_key": api_key,
+            "created_at": datetime.now().isoformat(),
+            "expired_at": expired_at.isoformat() if expired_at else None
+        }
+        
+        return Result.success(data=data, message="API key 生成成功")
+        
+    except Exception as e:
+        return Result.error(code=500, message=f"生成 API key 失败: {str(e)}")
+
+
+@app.get("/list")
+async def list_api_keys():
+    """
+    List all API keys
+    """
+    try:
+        mysql_client = get_global_mysql_client()
+        query = """
+        SELECT id, api_key, created_at, expired_at, is_active
+        FROM api_keys
+        ORDER BY created_at DESC
+        """
+        result = mysql_client.fetch_all(query)
+        
+        return Result.success(data=result, message="获取 API keys 成功")
+        
+    except Exception as e:
+        return Result.error(code=500, message=f"获取 API keys 失败: {str(e)}")
+
+
+@app.put("/toggle/{api_key_id}")
+async def toggle_api_key(api_key_id: int):
+    """
+    Toggle API key activation status
+    """
+    try:
+        mysql_client = get_global_mysql_client()
+        
+        # Check if API key exists
+        query = "SELECT is_active FROM api_keys WHERE id = %s"
+        result = mysql_client.fetch_one(query, [api_key_id])
+        
+        if not result:
+            return Result.error(code=404, message="API key 不存在")
+        
+        # Toggle status
+        new_status = not result["is_active"]
+        update_query = "UPDATE api_keys SET is_active = %s WHERE id = %s"
+        mysql_client.execute(update_query, [new_status, api_key_id])
+        
+        data = {
+            "api_key_id": api_key_id,
+            "is_active": new_status
+        }
+        
+        return Result.success(data=data, message="API key 状态更新成功")
+        
+    except Exception as e:
+        return Result.error(code=500, message=f"更新 API key 状态失败: {str(e)}")
+
+
+@app.delete("/delete/{api_key_id}")
+async def delete_api_key(api_key_id: int):
+    """
+    Delete API key
+    """
+    try:
+        mysql_client = get_global_mysql_client()
+        
+        # Check if API key exists
+        query = "SELECT id FROM api_keys WHERE id = %s"
+        result = mysql_client.fetch_one(query, [api_key_id])
+        
+        if not result:
+            return Result.error(code=404, message="API key 不存在")
+        
+        # Delete API key
+        delete_query = "DELETE FROM api_keys WHERE id = %s"
+        mysql_client.execute(delete_query, [api_key_id])
+        
+        data = {
+            "api_key_id": api_key_id
+        }
+        
+        return Result.success(data=data, message="API key 删除成功")
+        
+    except Exception as e:
+        return Result.error(code=500, message=f"删除 API key 失败: {str(e)}")

+ 54 - 0
src/api/sdk/dataset_manage.py

@@ -0,0 +1,54 @@
+"""
+数据集管理 API
+
+该文件提供数据集管理的 API 接口,支持:
+- PDF 文件上传和解析
+- 数据集创建
+"""
+
+from fastapi import FastAPI, UploadFile, File, Form
+from src.api.dataset.services.dataset_manage_service import DatasetManageService
+from src.common.result import Result
+
+
+# Create the FastAPI application
+app = FastAPI(
+    title="数据集管理 API",
+    description="数据集管理服务,提供 PDF 解析和数据集创建功能",
+    version="1.0.0"
+)
+
+# Create the dataset management service instance
+dataset_service = DatasetManageService()
+
+
+@app.post("/parse-pdf")
+async def parse_pdf(
+    file: UploadFile = File(...),
+    series_name: str = Form(...)
+    
+):
+    """
+    解析 PDF 文件接口
+    
+    - **file**: PDF 文件附件
+    - **series_name**: 系列名
+    """
+    try:
+        # 验证文件格式
+        if not file.filename.endswith((".pdf", ".PDF")):
+            return Result.error(code=400, message="只支持 PDF 格式的文件")
+        
+        # 读取文件内容
+        file_content = await file.read()
+        
+        # 调用解析 PDF 方法
+        result = dataset_service.parse_pdf(
+            series_name=series_name,
+            pdf_file=file_content,
+            pdf_filename=file.filename
+        )
+        
+        return Result.success(data=result, message="PDF 解析成功")
+    except Exception as e:
+        return Result.error(code=500, message=f"解析 PDF 文件失败: {str(e)}")

+ 28 - 0
src/api/sdk/dify_dataset_manage.py

@@ -0,0 +1,28 @@
+import uvicorn
+from fastapi import FastAPI
+from src.api.dataset.models.dify_models import RetrievalRequest, RetrievalResponse, Record
+from src.api.dataset.services.dify_knowledge_service import DifyKnowledgeService
+from src.utils.infinity import get_client
+
+app = FastAPI(    
+    title="Dify External Knowledge API",
+    description="基于Infinity向量数据库的搜索API服务",
+    version="1.0.0"
+    )
+
+
+@app.post("/retrieval", response_model=RetrievalResponse)
+async def retrieval(request: RetrievalRequest):
+    # Build the knowledge service on the shared Infinity client
+    dify_knowledge_service = DifyKnowledgeService(infinity_client=get_client())
+    result = dify_knowledge_service.dify_database_search(request)
+    records = [Record(
+        content=item.page_content,
+        score=item.metadata["SIMILARITY"],
+        title=item.metadata["file_name"],
+        metadata=item.metadata
+    ) for item in result]
+    return RetrievalResponse(records=records)
+
+if __name__ == "__main__":
+    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)

+ 78 - 0
src/api/sdk/search_infinity.py

@@ -0,0 +1,78 @@
+# Infinity search API service
+
+from fastapi import FastAPI, HTTPException
+from typing import List, Dict, Any, Optional
+from src.api.db.services.infinity_search_service import InfinitySearchService
+from src.utils.infinity import get_client
+from src.common.result import Result
+from src.utils.async_utils import run_in_threadpool
+
+
+# Create the FastAPI application
+
+app = FastAPI(
+    title="Infinity Search API",
+    description="基于Infinity向量数据库的搜索API服务",
+    version="1.0.0"
+)
+
+# Request models
+from pydantic import BaseModel
+
+class SearchRequest(BaseModel):
+    """搜索请求模型"""
+    search_query: Dict[str, Any]
+
+# 1. Full-text search endpoint
+@app.post("/text")
+async def search(request: SearchRequest):
+    """
+    普通搜索接口
+    
+    - **table_name**: 表名
+    - **output_fields**: 要返回的字段列表
+    - **query**: 查询条件,包含field、query和topn字段
+    - **database_name**: 数据库名称(可选,默认使用客户端配置的数据库)
+    """
+    try:
+        search_service = InfinitySearchService(infinity_client=get_client())
+        result = await run_in_threadpool(search_service.search, request.search_query)
+        return Result.success(data=result, message="搜索成功")
+    except Exception as e:
+        return Result.error(code=500, message=f"搜索失败: {str(e)}")
+
+# 2. Vector search endpoint
+@app.post("/vector")
+async def vector_search(request: SearchRequest):
+    """
+    向量搜索接口
+    
+    - **table_name**: 表名
+    - **output_fields**: 要返回的字段列表
+    - **query**: 查询条件,包含vector_field、query_vector和topn字段
+    - **database_name**: 数据库名称(可选,默认使用客户端配置的数据库)
+    """
+    try:
+        search_service = InfinitySearchService(infinity_client=get_client())
+        result = await run_in_threadpool(search_service.vector_search, request.search_query)
+        return Result.success(data=result, message="向量搜索成功")
+    except Exception as e:
+        return Result.error(code=500, message=f"向量搜索失败: {str(e)}")
+
+# 3. Hybrid search endpoint
+@app.post("/hybrid")
+async def hybrid_search(request: SearchRequest):
+    """
+    混合搜索接口
+    
+    - **table_name**: 表名
+    - **output_fields**: 要返回的字段列表
+    - **query**: 查询条件,包含vector_field、query_vector、field、query、topn和fusion_weight字段
+    - **database_name**: 数据库名称(可选,默认使用客户端配置的数据库)
+    """
+    try:
+        search_service = InfinitySearchService(infinity_client=get_client())
+        result = await run_in_threadpool(search_service.hybrid_search, request.search_query)
+        return Result.success(data=result, message="混合搜索成功")
+    except Exception as e:
+        return Result.error(code=500, message=f"混合搜索失败: {str(e)}")

+ 93 - 0
src/api/sdk/tag_manage.py

@@ -0,0 +1,93 @@
+# Tag management API service
+import os
+import tempfile
+from pydantic import BaseModel
+from fastapi import FastAPI, HTTPException, UploadFile, File
+from typing import List, Dict, Any, Optional
+from src.api.db.services.tag_service import TagServiceFactory
+from src.utils.infinity import get_client
+from src.utils.excel_util import excel_util
+from src.common.result import Result
+
+
+# Create the FastAPI application
+app = FastAPI(
+    title="标签管理 API",
+    description="标签管理服务,提供标签上传、查询、删除等功能",
+    version="1.0.0"
+)
+
+
+# Request models
+class TagCreateRequest(BaseModel):
+    """Tag creation request model."""
+    name: str  # tag name, formatted level1_level2_level3
+    description: str  # tag description
+    age_range: str  # age range, e.g. 0-2, 2-3
+
+
+# 1. Tag upload endpoint
+@app.post("/upload")
+async def upload_tags(file: UploadFile = File(...)):
+    """
+    批量上传标签接口
+    
+    - **file**: Excel格式的标签文件,包含年龄分级、一级标签、二级标签、三级标签、标签描述字段
+    
+    Excel文件格式要求:
+    - 支持 .xlsx 和 .xls 格式
+    - 第一行为表头,包含:年龄分级、一级标签、二级标签、三级标签、标签描述
+    - 年龄分级字段值应为 L1-L8 之间的一个(如 L1、L2 等)
+    """
+    try:
+        # Validate the file extension
+        file_ext = os.path.splitext(file.filename)[1].lower()
+        if file_ext not in [".xlsx", ".xls"]:
+            return Result.error(code=400, message=f"不支持的文件格式 {file_ext},只支持 .xlsx 和 .xls 格式")
+        
+        # Write the upload to a temporary file
+        with tempfile.NamedTemporaryFile(suffix=file_ext, delete=False) as temp_file:
+            temp_file.write(await file.read())
+            temp_file_path = temp_file.name
+        
+        try:
+            # Parse the Excel file into tag rows
+            tags_data = excel_util.parse_excel(file_path=temp_file_path)
+            
+            # Build the tag service via the factory
+            tag_service = TagServiceFactory.create_tag_service(get_client())
+            
+            # Bulk-upload the tags
+            result = tag_service.upload_tags(tags_data)
+            
+            return Result.success(data=result, message="标签上传成功")
+        finally:
+            # Remove the temporary file
+            os.unlink(temp_file_path)
+    except Exception as e:
+        return Result.error(code=500, message=f"标签上传失败: {str(e)}")
+
+
+# 2. Tag creation endpoint
+@app.post("/create")
+async def create_tag(request: TagCreateRequest):
+    """
+    新增标签接口
+    
+    - **name**: 标签名称,格式为:一级标签_二级标签_三级标签
+    - **description**: 标签描述,详细说明标签的含义和用途
+    - **age_range**: 年龄段,如:0-2、2-3、3-4等
+    """
+    try:
+        # Build the tag service via the factory
+        tag_service = TagServiceFactory.create_tag_service(get_client())
+        
+        # Create the tag
+        tag_data = request.model_dump()
+        result = tag_service.create_tag(tag_data)
+        
+        return Result.success(data=result, message="标签创建成功")
+    except KeyError as e:
+        return Result.error(code=400, message=f"缺少必填字段: {str(e)}")
+    except Exception as e:
+        return Result.error(code=500, message=f"创建标签失败: {str(e)}")

+ 0 - 0
src/common/__init__.py


+ 0 - 0
src/common/models/__init__.py


+ 10 - 0
src/common/models/pagination.py

@@ -0,0 +1,10 @@
+from pydantic import BaseModel, Field
+
+class Pagination(BaseModel):
+    """通用分页与过滤模型"""
+    page: int = Field(default=1, ge=1, description="当前页码")
+    page_size: int = Field(default=30, ge=1, le=100, description="每页条数")
+    
+    def to_dict(self):
+        """过滤掉 None 值,转化为 API 要求的字典"""
+        return {k: v for k, v in self.model_dump().items() if v is not None}
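Usage note: since page and page_size carry non-None defaults, the None filter in to_dict() matters mainly for subclasses that add optional filter fields. A small sketch (the subclass is hypothetical, for illustration only):

from typing import Optional

from src.common.models.pagination import Pagination

class TagQuery(Pagination):
    # hypothetical optional filter field
    keyword: Optional[str] = None

print(TagQuery(page=2).to_dict())  # {'page': 2, 'page_size': 30} (keyword is dropped)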

+ 62 - 0
src/common/result.py

@@ -0,0 +1,62 @@
+"""
+Unified API response template
+"""
+from typing import Optional, Any, Dict, List
+from fastapi.responses import JSONResponse
+
+
+class Result:
+    """
+    Unified API response class
+    """
+    
+    @staticmethod
+    def success(data: Any = None, message: str = "操作成功", total: int = 0, page: int = 1) -> JSONResponse:
+        """
+        Success response
+        """
+        return JSONResponse(
+            status_code=200,
+            content={
+                "code": 200,
+                "success": True,
+                "message": message,
+                "data": data,
+                "total": total,
+                "page": page
+            }
+        )
+    
+    @staticmethod
+    def error(code: int = 400, message: str = "操作失败", data: Any = None) -> JSONResponse:
+        """
+        Error response
+        """
+        return JSONResponse(
+            status_code=code,
+            content={
+                "code": code,
+                "success": False,
+                "message": message,
+                "data": data,
+                "total": 0,
+                "page": 1
+            }
+        )
+    
+    @staticmethod
+    def pagination(data: List[Any], total: int, page: int, message: str = "操作成功") -> JSONResponse:
+        """
+        Pagination response
+        """
+        return JSONResponse(
+            status_code=200,
+            content={
+                "code": 200,
+                "success": True,
+                "message": message,
+                "data": data,
+                "total": total,
+                "page": page
+            }
+        )

+ 0 - 0
src/conf/__init__.py


+ 10 - 0
src/conf/age_level.json

@@ -0,0 +1,10 @@
+{
+    "L1": [0, 1 ,2],
+    "L2": [2, 3],
+    "L3": [3, 4],
+    "L4": [4, 5],
+    "L5": [5, 6],
+    "L6": [6, 7, 8, 9, 10],
+    "L7": [10, 11, 12, 13, 14],
+    "L8": [14, 15, 16, 17, 18, 19, 20]
+}

+ 56 - 0
src/conf/infinity_mapping.json

@@ -0,0 +1,56 @@
+[
+    {
+        "name": "id",
+        "type": "varchar",
+        "default": "",
+        "comment": "文档ID"
+    },
+    {
+        "name": "file_name",
+        "type": "varchar",
+        "default": "",
+        "comment": "文件名"
+    },
+    {
+        "name": "file_page_count",
+        "type": "int",
+        "default": 0,
+        "comment": "文件总页数"
+    },
+    {
+        "name": "page_number",
+        "type": "int",
+        "default": 0,
+        "comment": "页码"
+    },
+    {
+        "name": "content",
+        "type": "varchar",
+        "default": "",
+        "comment": "文本内容"
+    },
+    {
+        "name": "image_id",
+        "type": "varchar",
+        "default": "",
+        "comment": "图片id"
+    },
+    {
+        "name": "dense_vector_1024",
+        "type": "vector,1024,float",
+        "default": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        "comment": "1024维向量"
+    },
+    {
+        "name": "dataset_id",
+        "type": "varchar",
+        "default": "",
+        "comment": "数据集ID"
+    },
+    {
+        "name": "document_id",
+        "type": "varchar",
+        "default": "",
+        "comment": "RAGFlow文档ID"
+    }
+]
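
One plausible way to consume this mapping when creating the Infinity table; only the JSON reshaping below is concrete, while the actual table-creation call depends on the project's Infinity wrapper:

```python
import json
from pathlib import Path

mapping = json.loads(Path("src/conf/infinity_mapping.json").read_text(encoding="utf-8"))

# Reshape the field list into {name: {"type": ..., "default": ...}} column specs
columns = {
    field["name"]: {"type": field["type"], "default": field["default"]}
    for field in mapping
}
# e.g. columns["dense_vector_1024"]["type"] == "vector,1024,float"
```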

+ 48 - 0
src/conf/rag_parser_config.py

@@ -0,0 +1,48 @@
+class RagParserDefaults:
+
+    DATASET_PERMISSION="team"
+
+    DATASET_CHUNK_METHOD="naive"
+
+    DATASET_CONFIG_DICT = {
+            "chunk_token_num": 256,
+            "delimiter": "\n!?;。;!?",
+            "html4excel": False,
+            "layout_recognize": "DeepDOC",
+            "auto_keywords": 5,
+            "tag_kb_ids": [],
+            "topn_tags": 3,
+            "task_page_size": 4,
+            "raptor": {
+                "max_cluster": 64,
+                "max_token": 256,
+                "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
+                "random_seed": 0,
+                "threshold": 0.1,
+                "use_raptor": True
+            },
+            "graphrag": {
+                "resolution": True,
+                "use_graphrag": True,
+                "method": "general",
+                "entity_types": [
+                    "person",
+                    "geo",
+                    "event",
+                    "Book",
+                    "Author",
+                    "Illustrator",
+                    "Series",
+                    "Theme",
+                    "Genre",
+                    "Character",
+                    "Setting",
+                    "AgeGroup",
+                    "Competency",
+                    "ArtStyle",
+                    "Award",
+                    "Publisher",
+                    "Role"
+                ]
+            }
+        }

+ 151 - 0
src/conf/settings.py

@@ -0,0 +1,151 @@
+"""配置管理模块,使用 pydantic-settings 从.env文件加载环境变量"""
+
+from pydantic_settings import BaseSettings, SettingsConfigDict
+from pydantic import Field
+from typing import List
+
+
+class AppSettings(BaseSettings):
+    """应用配置类"""
+    log_level: str = Field(default="INFO", alias="LOG_LEVEL")
+    
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore"
+    )
+
+class ModelSettings(BaseSettings):
+    """模型配置类"""
+    model_provider: str = Field(default="openai", alias="MODEL_PROVIDER")
+    model_name: str = Field(default="Qwen/Qwen3-VL-8B-Instruct", alias="MODEL_NAME")
+    chat_model_name: str = Field(default="deepseek-ai/DeepSeek-V3.2", alias="CHAT_MODEL_NAME")
+    embedding_model_name: str = Field(default="Qwen/Qwen3-Embedding-0.6B", alias="EMBEDDING_MODEL_NAME")
+    base_url: str = Field(default="https://api.openai.com/v1", alias="BASE_URL")
+    api_key: str = Field(default="", alias="API_KEY")
+    rank_model_name: str = Field(default="Qwen/Qwen3-Reranker-0.6B", alias="RANK_MODEL_NAME")
+    multimodal_embedding_model_name: str = Field(default="qwen2.5-vl-embedding", alias="MULTIMODAL_EMBEDDING_MODEL_NAME")
+    dashscope_api_key: str = Field(default="", alias="DASHSCOPE_API_KEY")
+    
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore"
+    )
+
+class RagflowSettings(BaseSettings):
+    """RAGFLOW配置类"""
+    ragflow_api_url: str = Field(default="http://192.168.16.134/", alias="RAGFLOW_API_URL")
+    ragflow_api_key: str = Field(default="", alias="RAGFLOW_API_KEY")
+    dataset_id: str = Field(default="", alias="DATASET_ID")
+    ragflow_user_name: str = Field(default="", alias="RAGFLOW_USER_NAME")
+    ragflow_passwd: str = Field(default="", alias="RAGFLOW_PASSWD")
+    
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore"
+    )
+
+class MinioSettings(BaseSettings):
+    """MinIO配置类"""
+    minio_endpoint: str = Field(default="http://localhost:9000", alias="MINIO_ENDPOINT")
+    minio_access_key: str = Field(default="minioadmin", alias="MINIO_ACCESS_KEY")
+    minio_secret_key: str = Field(default="minioadmin", alias="MINIO_SECRET_KEY")
+    minio_bucket_name: str = Field(default="ragflow", alias="MINIO_BUCKET_NAME")
+    minio_secure: bool = Field(default=False, alias="MINIO_SECURE")
+    
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore"
+    )
+
+class VectorDBSettings(BaseSettings):
+    """向量数据库配置类"""
+    vector_db_type: str = Field(default="es", alias="VECTOR_DB_TYPE")
+    infinity_host: str = Field(default="192.168.16.134", alias="INFINITY_HOST")
+    infinity_port: int = Field(default=23820, alias="INFINITY_PORT")
+    infinity_sdk_port: int = Field(default=23817, alias="INFINITY_SDK_PORT")
+    infinity_user: str = Field(default="admin", alias="INFINITY_USER")
+    infinity_password: str = Field(default="admin", alias="INFINITY_PASSWORD")
+    infinity_database: str = Field(default="test", alias="INFINITY_DATABASE")
+    infinity_table_name: str = Field(default="test", alias="INFINITY_TABLE_NAME")
+    infinity_page_dataset_id: str = Field(default="", alias="INFINITY_PAGE_DATASET_ID")
+    infinity_page_table_name: str = Field(default="", alias="INFINITY_PAGE_TABLE_NAME")
+    infinity_ragflow_database: str = Field(default="default_db", alias="INFINITY_RAGFLOW_DATABASE")
+    infinity_dataset_prefix: str = Field(default="ragbook_", alias="INFINITY_DATASET_PREFIX")
+    
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore"
+    )
+
+class MysqlSettings(BaseSettings):
+    """MySQL配置类"""
+    mysql_host: str = Field(default="localhost", alias="MYSQL_HOST")
+    mysql_port: int = Field(default=3306, alias="MYSQL_PORT")
+    mysql_user: str = Field(default="root", alias="MYSQL_USER")
+    mysql_password: str = Field(default="", alias="MYSQL_PASSWORD")
+    mysql_database: str = Field(default="", alias="MYSQL_DATABASE")
+    mysql_charset: str = Field(default="utf8mb4", alias="MYSQL_CHARSET")
+    mysql_pool_size: int = Field(default=10, alias="MYSQL_POOL_SIZE")
+
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore"
+    )
+
+
+class TagSearchSettings(BaseSettings):
+    """标签搜索配置类"""
+    tag_db_name: str = Field(default="tag_db", alias="TAG_DB_NAME")
+    tag_table_name: str = Field(default="tag_table", alias="TAG_TABLE_NAME")
+    tag_document_id: str = Field(default="", alias="TAG_DOCUMENT_ID")
+    tag_dataset_id: str = Field(default="", alias="TAG_DATASET_ID")
+    
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore"
+    )
+
+
+class EsSettings(BaseSettings):
+    """Elasticsearch配置类"""
+    es_nodes: List[str] = Field(default=["http://localhost:9200"], alias="ES_NODES")
+    es_username: str = Field(default="", alias="ES_USERNAME")
+    es_password: str = Field(default="", alias="ES_PASSWORD")
+    es_index_name: str = Field(default="rag_documents", alias="ES_INDEX_NAME")
+    es_connections_per_node: int = Field(default=20, alias="ES_CONNECTIONS_PER_NODE")
+    es_max_retries: int = Field(default=3, alias="ES_MAX_RETRIES")
+    es_retry_on_timeout: bool = Field(default=True, alias="ES_RETRY_ON_TIMEOUT")
+    es_timeout: int = Field(default=30, alias="ES_TIMEOUT")
+    es_verify_certs: bool = Field(default=False, alias="ES_VERIFY_CERTS")
+    
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        case_sensitive=False,
+        extra="ignore"
+    )
+
+
+# 创建配置实例
+model_settings = ModelSettings()
+ragflow_settings = RagflowSettings()
+app_settings = AppSettings()
+minio_settings = MinioSettings()
+vector_db_settings = VectorDBSettings()
+mysql_settings = MysqlSettings()
+tag_search_settings = TagSearchSettings()
+es_settings = EsSettings()
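
A smoke-test sketch for this module; note that pydantic-settings decodes `List[str]` fields such as `ES_NODES` from a JSON-encoded string in `.env` (e.g. `ES_NODES=["http://es1:9200","http://es2:9200"]`):

```python
# Values come from .env, or from the defaults above when a variable is unset
from src.conf.settings import es_settings, vector_db_settings

print(es_settings.es_nodes)               # e.g. ["http://es1:9200", "http://es2:9200"]
print(vector_db_settings.vector_db_type)  # "es" unless VECTOR_DB_TYPE overrides it
```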

+ 0 - 0
src/job/__init__.py


+ 129 - 0
src/job/chunk_update_job.py

@@ -0,0 +1,129 @@
+"""
+Chunk 更新定时任务
+
+该模块负责处理 ragflow_chunk_record 表中的定时任务,包括:
+- 定期查询到期的任务
+- 执行任务逻辑
+- 更新任务状态
+"""
+import json
+from datetime import datetime
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers.interval import IntervalTrigger
+from src.utils.mysql import get_global_mysql_client
+from src.utils.infinity import get_client
+
+# 初始化调度器
+_scheduler = None
+
+class ChunkUpdateJob:
+    """Chunk 更新定时任务服务"""
+    
+    def __init__(self):
+        """初始化定时任务服务"""
+        self.mysql_client = get_global_mysql_client()
+        self.infinity_client = get_client()
+    
+    def process_due_tasks(self):
+        """处理到期的任务"""
+        try:
+            # 查询状态为"未执行"且计划时间小于等于当前时间的任务
+            current_time = datetime.now()
+            tasks = self.mysql_client.fetch_all(
+                "SELECT id, database_name, table_name, chunk_id, cond, update_data FROM ragflow_chunk_record "
+                "WHERE status = %s AND scheduled_time <= %s",
+                ["未执行", current_time]
+            )
+            
+            # 处理每个任务
+            for task in tasks:
+                task_id = task["id"]
+                database_name = task["database_name"]
+                table_name = task["table_name"]
+                chunk_id = task["chunk_id"]
+                cond = task["cond"]
+                data = task["update_data"]
+                
+                try:
+                    # 执行任务逻辑
+                    self._execute_task(database_name, table_name, chunk_id, cond, data)
+                    
+                    # 更新任务状态为"已执行"
+                    self.mysql_client.execute(
+                        "UPDATE ragflow_chunk_record SET status = %s, executed_time = %s WHERE id = %s",
+                        ["已执行", datetime.now(), task_id]
+                    )
+                    
+                    print(f"Task {task_id} executed successfully")
+                except Exception as e:
+                    # 更新任务状态为"执行失败"
+                    self.mysql_client.execute(
+                        "UPDATE ragflow_chunk_record SET status = %s, error_message = %s, executed_time = %s WHERE id = %s",
+                        ["执行失败", str(e), datetime.now(), task_id]
+                    )
+                    
+                    print(f"Task {task_id} execution failed: {e}")
+        except Exception as e:
+            print(f"Failed to process due tasks: {e}")
+    
+    def _execute_task(self, database_name: str, table_name: str, chunk_id: str, 
+                      cond: str, data: dict) -> None:
+        """
+        执行具体的任务逻辑
+        
+        Args:
+            database_name: 数据库名称
+            table_name: 表名称
+            chunk_id: Chunk ID
+            cond: 条件字符串
+            data: 数据字典
+        """
+        try:
+            # 使用 Infinity 客户端执行更新操作
+            # 这里需要根据实际的 Infinity API 进行调整
+            if cond and data:
+                self.infinity_client.update(
+                    table_name=table_name,
+                    cond=cond,
+                    data=json.loads(data),
+                    database_name=database_name
+                )
+            print(f"Updated chunk {chunk_id} in {database_name}.{table_name}")
+        except Exception as e:
+            raise Exception(f"Failed to update chunk {chunk_id}: {e}")
+
+
+def start_scheduler():
+    """启动定时任务调度器"""
+    global _scheduler
+    
+    if _scheduler is None:
+        # 创建调度器
+        _scheduler = BackgroundScheduler()
+        
+        # 创建任务实例
+        chunk_update_job = ChunkUpdateJob()
+        
+        # Schedule the job to run every 30 seconds
+        _scheduler.add_job(
+            func=chunk_update_job.process_due_tasks,
+            trigger=IntervalTrigger(seconds=30),
+            id="chunk_update_job",
+            name="Process due chunk update tasks",
+            replace_existing=True
+        )
+        
+        # 启动调度器
+        _scheduler.start()
+        print("✅ Chunk update scheduler started")
+
+
+def shutdown_scheduler():
+    """关闭定时任务调度器"""
+    global _scheduler
+    
+    if _scheduler is not None:
+        _scheduler.shutdown()
+        _scheduler = None
+        print("✅ Chunk update scheduler shutdown")

+ 0 - 0
src/model/__init__.py


+ 133 - 0
src/model/jina_rerank.py

@@ -0,0 +1,133 @@
+from __future__ import annotations
+
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Sequence, Union
+
+import requests
+from langchain_core.callbacks import Callbacks
+from langchain_core.documents import BaseDocumentCompressor, Document
+from langchain_core.utils import get_from_dict_or_env
+from pydantic import ConfigDict, model_validator
+
+JINA_API_URL: str = "https://api.jina.ai/v1/rerank"
+
+
+class JinaRerank(BaseDocumentCompressor):
+    """Document compressor that uses `Jina Rerank API` with support for custom base_url."""
+
+    session: Any = None
+    """Requests session to communicate with API."""
+    top_n: Optional[int] = 3
+    """Number of documents to return."""
+    model: str = "jina-reranker-v1-base-en"
+    """Model to use for reranking."""
+    jina_api_key: Optional[str] = None
+    """Jina API key. Must be specified directly or via environment variable 
+        JINA_API_KEY."""
+    user_agent: str = "langchain"
+    """Identifier for the application making the request."""
+    base_url: str = JINA_API_URL
+    """Base URL for the Jina API. Defaults to JINA_API_URL."""
+
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True,
+        extra="forbid",
+    )
+
+    @model_validator(mode="before")
+    @classmethod
+    def validate_environment(cls, values: Dict) -> Any:
+        """Validate that api key exists in environment."""
+        jina_api_key = get_from_dict_or_env(values, "jina_api_key", "JINA_API_KEY")
+        user_agent = values.get("user_agent", "langchain")
+        session = requests.Session()
+        session.headers.update(
+            {
+                "Authorization": f"Bearer {jina_api_key}",
+                "Accept-Encoding": "identity",
+                "Content-type": "application/json",
+                "user-agent": user_agent,
+            }
+        )
+        values["session"] = session
+        return values
+
+    def rerank(
+        self,
+        documents: Sequence[Union[str, Document, dict]],
+        query: str,
+        *,
+        model: Optional[str] = None,
+        top_n: Optional[int] = -1,
+        max_chunks_per_doc: Optional[int] = None,
+    ) -> List[Dict[str, Any]]:
+        """Returns an ordered list of documents ordered by their relevance to the provided query.
+
+        Args:
+            query: The query to use for reranking.
+            documents: A sequence of documents to rerank.
+            model: The model to use for re-ranking. Default to self.model.
+            top_n : The number of results to return. If None returns all results.
+                Defaults to self.top_n.
+            max_chunks_per_doc : The maximum number of chunks derived from a document.
+        """  # noqa: E501
+        if len(documents) == 0:  # to avoid empty api call
+            return []
+        docs = [
+            doc.page_content if isinstance(doc, Document) else doc for doc in documents
+        ]
+        model = model or self.model
+        top_n = top_n if (top_n is None or top_n > 0) else self.top_n
+        data = {
+            "query": query,
+            "documents": docs,
+            "model": model,
+            "top_n": top_n,
+        }
+
+        # Build the full API path; avoid appending "/rerank" twice when
+        # base_url already points at the rerank endpoint (the default does)
+        api_url = self.base_url.rstrip('/')
+        if not api_url.endswith('/rerank'):
+            api_url = api_url + '/rerank'
+        resp = self.session.post(
+            api_url,
+            json=data,
+        ).json()
+
+        if "results" not in resp:
+            raise RuntimeError(resp["detail"])
+
+        results = resp["results"]
+        result_dicts = []
+        for res in results:
+            result_dicts.append(
+                {
+                    "index": res["index"],
+                    "relevance_score": res["relevance_score"],
+                }
+            )
+        return result_dicts
+
+    def compress_documents(
+        self,
+        documents: Sequence[Document],
+        query: str,
+        top_n: Optional[int] = None,
+        callbacks: Optional[Callbacks] = None,
+    ) -> Sequence[Document]:
+        """
+        Compress documents using Jina's Rerank API.
+
+        Args:
+            documents: A sequence of documents to compress.
+            query: The query to use for compressing the documents.
+            callbacks: Callbacks to run during the compression process.
+
+        Returns:
+            A sequence of compressed documents.
+        """
+        compressed = []
+        for res in self.rerank(documents=documents, query=query, top_n=top_n):
+            doc = documents[res["index"]]
+            doc_copy = Document(doc.page_content, metadata=deepcopy(doc.metadata))
+            doc_copy.metadata["relevance_score"] = res["relevance_score"]
+            compressed.append(doc_copy)
+        return compressed
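
A usage sketch against a self-hosted reranker endpoint (URL, key, and model name are placeholders):

```python
from langchain_core.documents import Document
from src.model.jina_rerank import JinaRerank

reranker = JinaRerank(
    jina_api_key="sk-...",                # or set JINA_API_KEY in the environment
    base_url="http://localhost:8080/v1",  # "/rerank" is appended automatically
    model="Qwen/Qwen3-Reranker-0.6B",
)
docs = [Document(page_content=t) for t in ["a cat sleeping", "a dog running", "a cat eating"]]
ranked = reranker.compress_documents(docs, query="cat", top_n=2)
# each returned Document carries metadata["relevance_score"]
```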

+ 167 - 0
src/model/multimodal_embedding.py

@@ -0,0 +1,167 @@
+from typing import List
+from PIL import Image
+import base64
+import io
+from langchain_openai import OpenAIEmbeddings
+from dashscope import MultiModalEmbedding
+from src.conf.settings import model_settings
+from langfuse import observe
+from src.utils.file.image_util import image_util
+
+class Embedding:
+    """Embedding模型工具"""
+    
+    def __init__(self, model_name: str = None, multi_model_name: str = None, api_key: str = None, dashscope_api_key: str = None):
+        """
+        Initialize the embedding models.
+        
+        Args:
+            model_name: text-embedding model name; None falls back to settings
+            multi_model_name: multimodal-embedding model name; None falls back to settings
+            api_key: API key; None falls back to settings
+            dashscope_api_key: DashScope API key; None falls back to settings
+        """
+        # 获取模型配置
+        self.model_provider = model_settings.model_provider
+        self.model_name = model_name or model_settings.model_name
+        self.multi_model_name = multi_model_name or model_settings.multimodal_embedding_model_name
+        self.base_url = model_settings.base_url
+        self.api_key = api_key or model_settings.api_key
+        self.dashscope_api_key = dashscope_api_key or model_settings.dashscope_api_key
+    
+    @observe(name="text_embedding", as_type="embedding")
+    def get_text_embedding(self, text: str) -> List[float]:
+        """
+        获取文本的embedding
+        
+        Args:
+            text: 要获取embedding的文本
+            
+        Returns:
+            List[float]: 文本的embedding向量
+        """
+        try:
+            # 使用langchain_openai初始化OpenAI Embeddings模型
+            embeddings = OpenAIEmbeddings(
+                model=self.model_name,
+                base_url=self.base_url,
+                api_key=self.api_key
+            )
+            embedding = embeddings.embed_query(text)
+            return embedding
+        except Exception as e:
+            raise Exception(f"文本embedding生成失败: {str(e)}")
+    
+    @observe(name="texts_embedding", as_type="embedding")
+    def get_texts_embedding(self, texts: List[str]) -> List[List[float]]:
+        """
+        获取多个文本的embedding
+        
+        Args:
+            texts: 要获取embedding的文本列表
+            
+        Returns:
+            List[List[float]]: 文本列表的embedding向量列表
+        """
+        try:
+            # 使用langchain_openai初始化OpenAI Embeddings模型
+            embeddings = OpenAIEmbeddings(
+                model=self.model_name,
+                base_url=self.base_url,
+                api_key=self.api_key
+            )
+            embeddings = embeddings.embed_documents(texts)
+            return embeddings
+        except Exception as e:
+            raise Exception(f"多个文本embedding生成失败: {str(e)}")
+    
+    @observe(name="image_embedding", as_type="embedding")
+    def get_image_embedding(self, image: Image.Image) -> List[float]:
+        """
+        获取图像的embedding
+        
+        Args:
+            image: PIL图像对象
+            
+        Returns:
+            List[float]: 图像的embedding向量
+        """
+        try:
+            # 将图像转换为base64
+            buffer = io.BytesIO()
+            image.save(buffer, format="PNG")
+            buffer.seek(0)
+            
+            # 压缩图片字节流
+            compressed_bytes = image_util._compress_image_to_bytes(buffer)
+            
+            image_base64 = base64.b64encode(compressed_bytes).decode("utf-8")
+            # 构建输入项
+            item = [
+                {
+                    "image": f"data:image/png;base64,{image_base64}"
+                }
+            ]
+            response = MultiModalEmbedding.call(
+                model=self.multi_model_name,
+                api_key=self.dashscope_api_key,
+                input=item
+            )
+            if response.status_code == 200:
+                return response.output["embeddings"][0]["embedding"]
+            # Surface API failures instead of silently returning None
+            raise Exception(f"Error: {response.message}")
+        except Exception as e:
+            raise Exception(f"图像embedding生成失败: {str(e)}")
+
+    @observe(name="multimodal_embedding", as_type="embedding")
+    def get_multimodal_embedding(self, text: str, image: Image.Image) -> List[float]:
+        """
+        获取多模态(文本+图像)的embedding
+        
+        Args:
+            text: 文本内容
+            image: PIL图像对象
+            
+        Returns:
+            List[float]: 多模态的embedding向量
+        """
+        try:
+            item = []
+            if image is not None:
+                buffer = io.BytesIO()
+                image.save(buffer, format="PNG")
+                buffer.seek(0)
+            
+                # 压缩图片字节流
+                compressed_bytes = image_util._compress_image_to_bytes(buffer)
+            
+                image_base64 = base64.b64encode(compressed_bytes).decode("utf-8")
+                item.append({'image': f"data:image/png;base64,{image_base64}"})
+            # include text only when it is neither None nor an empty string
+            if text is not None and text.strip() != "":
+                item.append({'text': text})
+        
+            response = MultiModalEmbedding.call(
+                model=self.multi_model_name,
+                api_key=self.dashscope_api_key,
+                input=item
+            )
+            
+            if response.status_code == 200:
+                return response.output["embeddings"][0]["embedding"]
+            else:
+                raise Exception(f"Error: {response.message}")   
+        except Exception as e:
+            raise Exception(f"多模态embedding生成失败: {str(e)}")
+
+# 全局单例
+def get_embedding_model() -> Embedding:
+    """
+    获取全局单例的Embedding模型
+    
+    Returns:
+        Embedding: 全局单例的Embedding模型
+    """
+    return Embedding(
+        model_name=model_settings.embedding_model_name,
+        multi_model_name=model_settings.multimodal_embedding_model_name,
+        api_key=model_settings.api_key,
+        dashscope_api_key=model_settings.dashscope_api_key
+    )
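
A minimal usage sketch (the image path is a placeholder):

```python
from PIL import Image
from src.model.multimodal_embedding import get_embedding_model

embedder = get_embedding_model()
text_vec = embedder.get_text_embedding("a puppy running through grass")
page_vec = embedder.get_multimodal_embedding(
    text="a puppy running through grass",
    image=Image.open("page_001.png"),  # placeholder path
)
print(len(text_vec), len(page_vec))
```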

+ 109 - 0
src/model/openai_chat_model.py

@@ -0,0 +1,109 @@
+from typing import Optional
+from langchain.chat_models import init_chat_model
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
+from src.conf.settings import model_settings
+
+class OpenAIChatModel:
+    """
+    OpenAI-style chat model wrapper.
+    
+    Built on langchain's init_chat_model; the default model comes from
+    settings.chat_model_name, and callers may specify any other supported model.
+    """
+    
+    def __init__(
+        self,
+        model_provider: str = model_settings.model_provider,
+        model_name: str = model_settings.chat_model_name,
+        api_key: Optional[str] = model_settings.api_key,
+        base_url: Optional[str] = model_settings.base_url,
+        temperature: float = 0.7,
+        max_tokens: Optional[int] = None,
+        **kwargs
+    ):
+        """
+        Initialize the OpenAI-style chat model.
+        
+        Args:
+            model_provider: provider name, defaults to settings.model_provider
+            model_name: model name, defaults to settings.chat_model_name
+            api_key: API key
+            base_url: API base URL
+            temperature: sampling temperature, range 0-2, defaults to 0.7
+            max_tokens: maximum number of generated tokens
+            **kwargs: extra parameters forwarded to init_chat_model
+        """
+        # 使用 langchain 的 init_chat_model 初始化模型
+        self.chat_model = init_chat_model(
+            model_provider=model_provider,
+            model=model_name,
+            api_key=api_key,
+            base_url=base_url,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            **kwargs
+        )
+    
+    def get_chat_model(self):
+        """
+        获取聊天模型实例
+        
+        Returns:
+            聊天模型实例(由 langchain.init_chat_model 返回的类型)
+        """
+        return self.chat_model
+    
+    def generate_response(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        **kwargs
+    ) -> str:
+        """
+        生成响应
+        
+        Args:
+            prompt: 用户提示
+            system_prompt: 系统提示
+            **kwargs: 其他参数
+            
+        Returns:
+            str: 生成的响应
+        """
+        # 构建消息列表
+        messages = []
+        
+        # 添加系统提示(如果有)
+        if system_prompt:
+            messages.append(SystemMessage(content=system_prompt))
+        
+        # 添加用户提示
+        messages.append(HumanMessage(content=prompt))
+        
+        # 生成响应
+        response = self.chat_model.invoke(messages, **kwargs)
+        
+        # 解析响应
+        if isinstance(response, AIMessage):
+            return response.content
+        else:
+            # 对于其他类型的响应,尝试获取内容
+            return str(response)
+    
+    def chat(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        **kwargs
+    ) -> str:
+        """
+        聊天接口(别名,向后兼容)
+        
+        Args:
+            prompt: 用户提示
+            system_prompt: 系统提示
+            **kwargs: 其他参数
+            
+        Returns:
+            str: 生成的响应
+        """
+        return self.generate_response(prompt, system_prompt, **kwargs)
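
A usage sketch (the prompts are illustrative):

```python
from src.model.openai_chat_model import OpenAIChatModel

chat = OpenAIChatModel(temperature=0.3)
answer = chat.generate_response(
    prompt="Summarize this picture book in one sentence.",
    system_prompt="You are a children's picture-book assistant.",
)
print(answer)
```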

+ 162 - 0
src/model/qwen_vl.py

@@ -0,0 +1,162 @@
+from typing import Dict, Any
+from PIL import Image
+import base64
+import io
+from langchain.chat_models import init_chat_model
+from src.conf.settings import model_settings
+from langfuse.langchain import CallbackHandler
+
+class QWenVLParser:
+    """QWEN VL模型图像解析工具"""
+    
+    def __init__(self, model_name: str = None):
+        """
+        初始化QWEN VL模型解析器
+        
+        Args:
+            model_name: 模型名称,若为None则使用配置文件中的值
+        """
+        # 获取模型配置
+        self.model_provider = model_settings.model_provider
+        self.model_name = model_name or model_settings.model_name
+        self.base_url = model_settings.base_url
+        self.api_key = model_settings.api_key
+        self.langfuse_handler = CallbackHandler()
+        # 使用langchain的init_chat_model初始化模型
+        self.model = init_chat_model(
+            model_provider=self.model_provider,
+            model=self.model_name,
+            base_url=self.base_url,
+            api_key=self.api_key
+        )
+    
+    def image_to_base64(self, image: Image.Image) -> str:
+        """
+        将PIL图像转换为base64编码字符串
+        
+        Args:
+            image: PIL图像对象
+            
+        Returns:
+            str: base64编码的图像字符串
+        """
+        buffer = io.BytesIO()
+        image.save(buffer, format="PNG")
+        return base64.b64encode(buffer.getvalue()).decode("utf-8")
+    
+    def parse_image(self, image: Image.Image, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
+        """
+        使用OpenAI模型解析图像内容
+        
+        Args:
+            image: PIL图像对象
+            page_number: 页码
+            prompt: 提示词
+            
+        Returns:
+            Dict: 包含解析结果的字典,包含:
+                - page_number: 页码
+                - content: 解析内容
+                - model: 使用的模型名称
+        """
+        try:
+            # 将图像转换为base64
+            image_base64 = self.image_to_base64(image)
+            
+            # 构建消息,符合OpenAI API格式
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{image_base64}"
+                            }
+                        }
+                    ]
+                }
+            ]
+            
+            # 使用langchain模型调用OpenAI API
+            response = self.model.invoke(input=messages, config={"callbacks": [self.langfuse_handler]})
+            
+            # 提取解析结果
+            content = response.content
+            
+            return {
+                "page_number": page_number,
+                "content": content,
+                "model": self.model_name
+            }
+        except Exception as e:
+            raise Exception(f"图像解析失败(页码:{page_number}): {str(e)}")
+    
+    def parse_image_path(self, image_path: str, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
+        """
+        使用OpenAI模型解析图像内容
+        
+        Args:
+            image_path: 图像路径
+            page_number: 页码
+            prompt: 提示词
+            
+        Returns:
+            Dict: 包含解析结果的字典,包含:
+                - page_number: 页码
+                - content: 解析内容
+                - model: 使用的模型名称
+        """
+        try: 
+            # 构建消息,符合OpenAI API格式
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"{image_path}"
+                            }
+                        }
+                    ]
+                }
+            ]
+            
+            # 使用langchain模型调用OpenAI API
+            response = self.model.invoke(messages)
+            
+            # 提取解析结果
+            content = response.content
+            
+            return {
+                "page_number": page_number,
+                "content": content,
+                "model": self.model_name
+            }
+        except Exception as e:
+            raise Exception(f"图像解析失败(页码:{page_number}): {str(e)}")
+    
+    def parse_image_bytes(self, image_bytes: io.BytesIO, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
+        """
+        使用OpenAI模型解析图像字节流
+        
+        Args:
+            image_bytes: 图像字节流
+            page_number: 页码
+            prompt: 提示词
+            
+        Returns:
+            Dict: 包含解析结果的字典
+        """
+        # 将字节流转换为PIL图像
+        image = Image.open(image_bytes)
+        return self.parse_image(image, page_number, prompt)
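
A usage sketch for parsing a single scanned page (the file path is a placeholder):

```python
from PIL import Image
from src.model.qwen_vl import QWenVLParser

parser = QWenVLParser()  # falls back to MODEL_NAME from settings
result = parser.parse_image(
    image=Image.open("scan_page_3.png"),  # placeholder path
    page_number=3,
)
print(result["model"], result["content"][:80])
```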

+ 21 - 0
src/model/tracked_multi_embedding.py

@@ -0,0 +1,21 @@
+from src.model.multimodal_embedding import Embedding
+from PIL import Image
+from src.utils.decorators.langfuse_trace_embedding import langfuse_trace_embedding
+
+class TrackedMultiEmbeddings(Embedding):
+    
+    @langfuse_trace_embedding(name="multi-embed-query")
+    def embed_query(self, text: str):
+        return super().get_text_embedding(text)
+
+    @langfuse_trace_embedding(name="multi-embed-documents")
+    def embed_documents(self, texts: list[str]):
+        return super().get_texts_embedding(texts)
+
+    @langfuse_trace_embedding(name="multi-embed-image")
+    def embed_image(self, image: Image.Image):
+        return super().get_image_embedding(image)
+
+    @langfuse_trace_embedding(name="multi-embed-multimodal")
+    def embed_multimodal(self, text: str, image: Image.Image):
+        return super().get_multimodal_embedding(text, image)

+ 12 - 0
src/model/tracked_openai_embeddings.py

@@ -0,0 +1,12 @@
+from langchain_openai import OpenAIEmbeddings
+from src.utils.decorators.langfuse_trace_embedding import langfuse_trace_embedding
+
+class TrackedOpenAIEmbeddings(OpenAIEmbeddings):
+    
+    @langfuse_trace_embedding(name="openai-embed-query")
+    def embed_query(self, text: str):
+        return super().embed_query(text)
+
+    @langfuse_trace_embedding(name="openai-embed-documents")
+    def embed_documents(self, texts: list[str]):
+        return super().embed_documents(texts)

+ 0 - 0
src/parser/__init__.py


+ 0 - 0
src/parser/image_parser/__init__.py


+ 304 - 0
src/parser/image_parser/image_parser_workflow.py

@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+"""
+图片解析工作流
+"""
+import concurrent.futures
+import io
+from concurrent.futures import ThreadPoolExecutor
+from PIL import Image
+import requests
+from langgraph.graph import StateGraph, START, END
+from typing import List, Dict, Any, Annotated
+from pydantic import BaseModel, Field, ConfigDict
+from src.model.qwen_vl import QWenVLParser
+from src.utils.ragflow.ragflow_service import RAGFlowService
+from src.model.multimodal_embedding import Embedding
+from src.utils.file.image_util import image_util
+from src.conf.settings import model_settings
+from src.utils.infinity import get_client
+
+# 定义工作流状态类
+class ImageParsingState(BaseModel):
+    """图片解析工作流状态"""
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    zip_file_path: str = Field(..., description="图片压缩包路径")
+    book_name: str = Field(..., description="书名")
+    dataset_id: str = Field(..., description="数据集ID")
+    ragflow_service: RAGFlowService = Field(default_factory=RAGFlowService, description="RAGFLOW服务")
+    embedding_model: Embedding = Field(default_factory=Embedding, description="多模态嵌入模型实例")
+    document_id: str = Field(default="", description="文档ID")
+    split_images: List[Dict[str, Any]] = Field(default_factory=list, description="拆分后的图片列表,包含图片URL和页码信息")
+    parsed_results: List[Dict[str, Any]] = Field(default_factory=list, description="解析结果列表")
+    vectorized_results: List[Dict[str, Any]] = Field(default_factory=list, description="向量化结果列表")
+    processed_images: int = Field(default=0, description="已处理的图片数量")
+    vectorized_images: int = Field(default=0, description="已向量化的图片数量")
+    is_complete: bool = Field(default=False, description="是否处理完成")
+
+# 创建工作流构建器
+class ImageParsingWorkflow:
+    """图片解析工作流"""
+    
+    def __init__(self, model_name: str = "Qwen/Qwen3-VL-8B-Instruct"):
+        """
+        初始化图片解析工作流
+        
+        Args:
+            model_name: QWEN VL模型名称
+        """
+        self.model_name = model_name
+        self.workflow = self._build_workflow()
+    
+    def _build_workflow(self):
+        """构建langgraph工作流,实现基于条件路由的并行处理"""
+        # 创建状态图
+        graph = StateGraph(ImageParsingState)
+        
+        # 添加节点
+        graph.add_node("upload_images", self._upload_images_node)
+        graph.add_node("parse_image", self._parse_image_node)
+        graph.add_node("vectorize_store", self._vectorize_store_node)
+        graph.add_node("complete", self._complete_node)
+        
+        # 定义边
+        graph.add_edge(START, "upload_images")
+        graph.add_edge("upload_images", "parse_image")
+        
+        # 添加条件边:判断是否继续解析
+        graph.add_conditional_edges(
+            "parse_image",
+            self._should_continue_parsing,
+            {
+                "continue": "parse_image",
+                "complete": "vectorize_store"
+            }
+        )
+        
+        graph.add_edge("vectorize_store", "complete")
+        graph.add_edge("complete", END)
+        
+        # 编译工作流
+        return graph.compile()
+    
+    def _upload_images_node(self, state: ImageParsingState) -> Dict[str, Any]:
+        """上传图片节点,调用image_util处理图片压缩包"""
+        print(f"开始处理图片压缩包: {state.zip_file_path}")
+        
+        try:
+            # 调用image_util处理图片压缩包,获取图片URL列表
+            image_urls = image_util.process_image_zip(
+                state.zip_file_path,
+                state.book_name
+            )
+            
+            print(f"图片压缩包处理完成,共 {len(image_urls)} 张图片")
+            
+            # 构建split_images列表,格式与PDF解析工作流保持一致
+            split_images = []
+            for i, url in enumerate(image_urls):
+                split_images.append({
+                    "page_number": i + 1,
+                    "image_url": url,
+                    "image": None  # 稍后在解析时加载
+                })
+            
+            return {
+                "split_images": split_images,
+                "processed_images": 0,
+                "is_complete": False
+            }
+        except Exception as e:
+            print(f"处理图片压缩包时出错: {str(e)}")
+            raise
+    
+    def _parse_single_page(self, image_info: Dict[str, Any], model_name: str) -> Dict[str, Any]:
+        """解析单个图片(用于并行处理)"""
+        prompt = """
+            你是一个画本类童书的创作者,创作的内容适合0-12岁的儿童
+            任务:你需要根据现有童书插画与内容,提取出插画中的各种要素、行为、情感,并针对每个要素进行独立描述
+            注意:描述内容要积极正向,符合社会主义核心价值观
+            输出要求:
+            1.以json的格式输出,结构为:
+            {
+                "page_number": 页码,
+                "content": 页面原文本内容,
+                "elements": [
+                    {
+                        "element": "元素描述",
+                        "description": "详细描述"
+                    },
+                    ...
+                ]
+            }
+            2.每个要素的描述要简洁明了,不超过50个中文字符
+            3.每个元素的描述要与插画中的元素相关,不能脱离插画而独立存在
+            4.每个元素的描述要符合社会价值观,不能包含任何负面或不道德的内容
+            """
+        
+        page_number = image_info["page_number"]
+        image_url = image_info["image_url"]
+        
+        print(f"开始解析第 {page_number} 页,图片URL: {image_url}")
+        
+        try:
+            # Fetch the image once and decode it from the response body
+            response = requests.get(image_url, timeout=30)
+            response.raise_for_status()
+            image = Image.open(io.BytesIO(response.content))
+            
+            # 使用QWEN VL模型解析图像
+            parser = QWenVLParser(model_name)
+            result = parser.parse_image(image, page_number, prompt)
+            
+            print(f"第 {page_number} 页解析完成")
+            return result
+        except Exception as e:
+            print(f"解析第 {page_number} 页时出错: {str(e)}")
+            raise
+    
+    def _parse_image_node(self, state: ImageParsingState) -> Dict[str, Any]:
+        """解析图像节点,使用并行处理"""
+        if not state.split_images:
+            return state.model_dump()
+        
+        print(f"开始并行解析 {len(state.split_images)} 张图片")
+        
+        parsed_results = []
+        
+        # 使用ThreadPoolExecutor实现并行处理
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            # 提交所有图片解析任务
+            future_to_image = {
+                executor.submit(self._parse_single_page, image_info, self.model_name): image_info
+                for image_info in state.split_images
+            }
+            
+            # 收集解析结果
+            for future in concurrent.futures.as_completed(future_to_image):
+                try:
+                    result = future.result()
+                    parsed_results.append(result)
+                except Exception as e:
+                    image_info = future_to_image[future]
+                    print(f"解析第 {image_info['page_number']} 页时出错: {str(e)}")
+        
+        # 按页码排序结果
+        parsed_results.sort(key=lambda x: x["page_number"])
+        
+        print(f"所有图片解析完成,共解析 {len(parsed_results)} 张图片")
+        
+        return {
+            "split_images": state.split_images,  # 保留split_images,以便后续访问图片
+            "parsed_results": parsed_results,
+            "processed_images": len(parsed_results),
+            "is_complete": True
+        }
+    
+    def _should_continue_parsing(self, state: ImageParsingState) -> str:
+        """判断是否继续解析"""
+        # 由于我们使用了并行处理,parse_image_node会一次性处理所有图片
+        # 所以这里总是返回"complete"
+        return "complete"
+    
+    def _vectorize_store_node(self, state: ImageParsingState) -> Dict[str, Any]:
+        """向量化入库节点"""
+        print(f"开始向量化入库,共 {len(state.parsed_results)} 张图片")
+        
+        # Create the index if it does not exist; the state carries no vector_db
+        # field, so use the shared Infinity client instead
+        index_name = f"image_documents_{state.dataset_id}"
+        infinity_client = get_client()
+        infinity_client.create_index(index_name)
+        
+        # 准备要入库的文档列表
+        documents_to_store = []
+        
+        # 获取文件名和总页数
+        file_name = f"{state.book_name}.zip"
+        file_page_count = len(state.split_images)
+        
+        # 遍历所有解析结果,生成向量化文档
+        for i, parsed_result in enumerate(state.parsed_results):
+            try:
+                page_number = parsed_result.get("page_number")
+                text = parsed_result.get("content", "")
+                image_url = state.split_images[i].get("image_url")
+                
+                # 从URL加载图片
+                image = None
+                try:
+                    response = requests.get(image_url, timeout=30)
+                    response.raise_for_status()
+                    image = Image.open(io.BytesIO(response.content))
+                except Exception as e:
+                    print(f"加载图片 {image_url} 失败: {str(e)}")
+                
+                # 获取多模态嵌入向量
+                print(f"正在生成第 {page_number} 页的多模态嵌入...")
+                embedding = state.embedding_model.get_multimodal_embedding(text, image)
+                
+                # 生成1024维稠密向量
+                dense_vector_1024 = embedding[:1024]  # 取前1024维
+                
+                # 创建文档
+                document = {
+                    "id": f"{state.document_id}_{page_number}" if state.document_id else f"image_{state.dataset_id}_{page_number}",
+                    "file_name": file_name,
+                    "file_page_count": file_page_count,
+                    "page_number": page_number,
+                    "content": text,
+                    "image_path": image_url,  # 这里可以根据实际情况生成图片ID
+                    "dense_vector_1024": dense_vector_1024,
+                    "dataset_id": state.dataset_id,
+                    "document_id": state.document_id
+                }
+                
+                documents_to_store.append(document)
+                print(f"第 {page_number} 页向量化完成")
+            except Exception as e:
+                print(f"第 {i+1} 页向量化失败: {str(e)}")
+        
+        # 批量入库
+        if documents_to_store:
+            print(f"开始入库,共 {len(documents_to_store)} 个文档")
+            infinity_client = get_client()
+            result = infinity_client.insert(index_name, documents_to_store)
+            print(f"入库结果: {result}")
+        
+        return {
+            "vectorized_results": documents_to_store,
+            "vectorized_images": len(documents_to_store),
+            "is_complete": True
+        }
+    
+    def _complete_node(self, state: ImageParsingState) -> Dict[str, Any]:
+        """完成节点"""
+        print(f"图片解析工作流完成,共解析 {len(state.parsed_results)} 张图片,向量化 {state.vectorized_images} 张图片")
+        return {
+            "is_complete": True
+        }
+    
+    def run(self, zip_file_path: str, book_name: str, dataset_id: str, ragflow_api_url: str, rag_flow_api_key: str) -> Dict[str, Any]:
+        """
+        运行图片解析工作流
+        
+        Args:
+            zip_file_path: 图片压缩包路径
+            book_name: 书名
+            dataset_id: 数据集ID
+            ragflow_api_url: RAGFLOW API URL
+            rag_flow_api_key: RAGFLOW API密钥
+            
+        Returns:
+            Dict: 包含最终状态的字典
+        """
+        initial_state = ImageParsingState(
+            zip_file_path=zip_file_path,
+            book_name=book_name,
+            dataset_id=dataset_id,
+            embedding_model=Embedding(model_name=model_settings.multimodal_embedding_model_name, api_key=model_settings.dashscope_api_key),
+            ragflow_service=RAGFlowService(base_url=ragflow_api_url, api_key=rag_flow_api_key)
+        )
+        result = self.workflow.invoke(initial_state)
+        
+        # langgraph normally returns the final state as a plain dict; fall back
+        # to model_dump() for safety if a state object comes back instead
+        if isinstance(result, dict):
+            return result
+        else:
+            return result.model_dump()
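
An end-to-end usage sketch (all arguments are placeholders):

```python
from src.parser.image_parser.image_parser_workflow import ImageParsingWorkflow

workflow = ImageParsingWorkflow()
final_state = workflow.run(
    zip_file_path="/data/books/sample_book.zip",  # placeholder
    book_name="sample_book",
    dataset_id="ds_123",
    ragflow_api_url="http://192.168.16.134/",
    rag_flow_api_key="ragflow-...",
)
print(final_state["vectorized_images"], "pages vectorized")
```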

+ 0 - 0
src/parser/pdf_parser/__init__.py


+ 501 - 0
src/parser/pdf_parser/pdf_parser_workflow.py

@@ -0,0 +1,501 @@
+import os
+import concurrent.futures
+import time
+from concurrent.futures import ThreadPoolExecutor
+from langgraph.graph import StateGraph, START, END
+from typing import List, Dict, Any
+from pydantic import BaseModel, Field, ConfigDict
+from src.parser.pdf_parser.pdf_splitter import PDFSplitter
+from src.model.qwen_vl import QWenVLParser
+from src.utils.ragflow.ragflow_service import RAGFlowService
+from src.utils.ragflow.chunk_record import get_chunk_record_service
+from src.model.multimodal_embedding import Embedding
+from src.conf.settings import model_settings, vector_db_settings, minio_settings
+from src.utils.infinity import get_client
+from langfuse.langchain import CallbackHandler
+from src.conf.rag_parser_config import RagParserDefaults
+
+# 定义工作流状态类
+class PDFParsingState(BaseModel):
+    """PDF解析工作流状态"""
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    pdf_path: str = Field(..., description="PDF文件路径")
+    dataset_name: str = Field(..., description="数据集名称")
+    dataset_id: str = Field(default="", description="RAGFLOW数据集ID")
+    document_id: str = Field(default="", description="上传后的文档ID")
+    page_dataset_id: str = Field(..., description="页面数据集ID")
+    page_document_id: str = Field(default="", description="上传后的页面文档ID")
+    split_pages: List[Dict[str, Any]] = Field(default_factory=list, description="拆分后的页面列表")
+    current_page: Dict[str, Any] = Field(default_factory=dict, description="当前处理的页面")
+    parsed_results: List[Dict[str, Any]] = Field(default_factory=list, description="解析结果列表")
+    vectorized_results: List[Dict[str, Any]] = Field(default_factory=list, description="向量化结果列表")
+    processed_pages: int = Field(default=0, description="已处理的页面数量")
+    vectorized_pages: int = Field(default=0, description="已向量化的页面数量")
+    is_complete: bool = Field(default=False, description="是否处理完成")
+
+# 创建工作流构建器
+class PDFParsingWorkflow:
+    """PDF扫描件拆分解析工作流"""
+    
+    def __init__(self, model_name: str = "Qwen/Qwen3-VL-8B-Instruct"):
+        """
+        初始化PDF解析工作流
+        
+        Args:
+            model_name: QWEN VL模型名称
+        """
+        self.model_name = model_name
+        self.workflow = self._build_workflow()
+        self.ragflow_service = RAGFlowService()
+        self.langfuse_handler = CallbackHandler()
+        self.embedding_model = Embedding(model_name=model_settings.multimodal_embedding_model_name, api_key=model_settings.dashscope_api_key)
+        
+    
+    def _build_workflow(self):
+        """构建langgraph工作流,实现基于条件路由的并行处理"""
+        # 创建状态图
+        graph = StateGraph(PDFParsingState)
+        
+        # 添加查询知识库是否存在节点
+        graph.add_node("get_ragflow_dataset", self.get_ragflow_dataset)
+
+        # 添加创建知识库节点
+        graph.add_node("create_ragflow_dataset", self.create_ragflow_dataset)
+
+        # 添加上传文档节点
+        graph.add_node("upload_document", self._upload_document_node)
+
+        # 添加上传图书页面文档节点
+        graph.add_node("upload_page_document", self._upload_page_document_node)
+        
+        # 添加解析文档节点
+        graph.add_node("parse_document", self._parse_document_node)
+        
+        # 添加拆分PDF节点
+        graph.add_node("split_pdf", self._split_pdf_node)
+        
+        # 添加解析图像节点
+        graph.add_node("parse_image", self._parse_image_node)
+
+        # 添加解析图书页面图像节点
+        graph.add_node("create_ragflow_chunk", self.create_ragflow_chunk)
+        
+        # 添加向量化入库节点
+        graph.add_node("vectorize_store", self._vectorize_store_node)
+        
+        # 添加完成节点
+        graph.add_node("complete", self._complete_node)
+        
+        # 定义边
+        # 查询知识库是否存在
+        graph.add_edge(START, "get_ragflow_dataset")
+        # 添加条件边,判断知识库是否存在
+        graph.add_conditional_edges(
+            "get_ragflow_dataset",
+            self._check_dataset_exists,
+            {
+                "exists": "upload_document",
+                "not_exists": "create_ragflow_dataset"
+            }
+        )
+        # 添加解析文档边
+        graph.add_edge("create_ragflow_dataset", "upload_document")
+        graph.add_edge("upload_document", "parse_document")
+        graph.add_edge("upload_document", "upload_page_document")
+        graph.add_edge("parse_document", "split_pdf")
+        # 定义图片解析边
+        graph.add_edge("split_pdf", "parse_image")
+        
+        # 添加条件边:判断是否继续解析
+        graph.add_conditional_edges(
+            "parse_image",
+            self._should_continue_parsing,
+            {
+                "continue": "parse_image",
+                "complete": "vectorize_store",
+            }
+        )
+        
+        # 添加从vectorize_store到create_ragflow_chunk的边
+        graph.add_edge("vectorize_store", "create_ragflow_chunk")
+        
+        graph.add_edge("create_ragflow_chunk", "complete")
+        
+        graph.add_edge("complete", END)
+        
+        # 编译工作流
+        return graph.compile()
+    
+    def get_ragflow_dataset(self, state: PDFParsingState) -> Dict[str, Any]:
+        """获取RAGFLOW数据集ID"""
+        try:
+            dataset = self.ragflow_service.get_dataset(name=state.dataset_name)
+            dataset_id = dataset["id"] if dataset else ""
+            print(f"数据集 {state.dataset_name} 的ID为: {dataset_id}")
+            return {
+                "dataset_id": dataset_id
+            }
+        except Exception as e:
+            raise Exception(f"获取数据集ID时出错: {str(e)}")
+
+    def _check_dataset_exists(self, state: PDFParsingState) -> str:
+        """检查RAGFLOW数据集是否存在"""
+        # 判断state.dataset_id是否为空,为空则返回"not_exists",否则返回"exists"
+        if state.dataset_id == "":
+            return "not_exists"
+        else:
+            return "exists"
+        
+    def create_ragflow_dataset(self, state: PDFParsingState) -> Dict[str, Any]:
+        """创建RAGFLOW数据集"""      
+        print(f"开始创建数据集: {state.dataset_name}")
+        
+        try: 
+            # 创建数据集
+            dataset = self.ragflow_service.create_dataset(name=state.dataset_name, description="",
+                                             permission=RagParserDefaults.DATASET_PERMISSION,
+                                             chunk_method=RagParserDefaults.DATASET_CHUNK_METHOD,
+                                             parser_config=RagParserDefaults.DATASET_CONFIG_DICT)
+            dataset_id = dataset["id"]                     
+            print(f"数据集创建成功,数据集ID: {dataset_id}")
+            return {
+                "dataset_id": dataset_id
+            }
+        except Exception as e:
+            print(f"创建数据集时出错: {str(e)}")
+            raise
+
+    def _upload_document_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """RAGFLOW上传文档节点"""
+        print(f"开始上传文档到数据集 {state.dataset_id}: {state.pdf_path}")
+        
+        try:
+            # 上传文档
+            document_info_list = self.ragflow_service.upload_document(
+                dataset_id=state.dataset_id,
+                file_path=state.pdf_path
+            )
+            
+            # 检查响应
+            if document_info_list and len(document_info_list) > 0:
+                document_id = document_info_list[0]["id"]
+                print(f"文档上传成功,文档ID: {document_id}")
+                return {
+                    "document_id": document_id,
+                }
+            else:
+                print("文档上传失败: 未返回有效的文档信息")
+                raise Exception("文档上传失败: 未返回有效的文档信息")
+        except Exception as e:
+            print(f"上传文档时出错: {str(e)}")
+            raise
+
+    def _upload_page_document_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """RAGFLOW上传页面文档节点"""
+        print(f"开始上传页面文档到数据集 {state.dataset_id}: {state.pdf_path}")
+        
+        try:
+            # 上传文档
+            document_info_list = self.ragflow_service.upload_document(
+                dataset_id=state.page_dataset_id,
+                file_path=state.pdf_path
+            ) 
+            
+            # 检查响应
+            if document_info_list and len(document_info_list) > 0:
+                page_document_id = document_info_list[0]["id"]
+                print(f"文档上传成功,文档ID: {page_document_id}")
+                return {
+                    "page_document_id": page_document_id,
+                }
+            else:
+                print("文档上传失败: 未返回有效的文档信息")
+                raise Exception("文档上传失败: 未返回有效的文档信息")
+        except Exception as e:
+            print(f"上传文档时出错: {str(e)}")
+            raise
+
+    def _parse_document_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """RAGFLOW文档解析节点"""
+        print(f"开始解析文档 {state.dataset_id}: {state.document_id}")
+        
+        try:        
+            # 解析文档
+            parse_success = self.ragflow_service.parse_document(
+                dataset_id=state.dataset_id,
+                document_ids=[state.document_id]
+            )
+            
+            # 检查响应parse_success为bool
+            if parse_success:
+                print(f"文档解析成功,文档ID: {state.document_id}")
+                # 返回空列表,因为parsed_results字段期望是列表类型
+                return {
+                    "parsed_results": []
+                }
+            else:
+                print("文档解析失败: 未返回有效的解析结果")
+                raise Exception("文档解析失败: 未返回有效的解析结果")
+        except Exception as e:
+            print(f"解析文档时出错: {str(e)}")
+            raise
+    
+    def _split_pdf_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """拆分PDF节点"""
+        print(f"开始拆分PDF: {state.pdf_path}")
+        
+        # 拆分PDF
+        splitter = PDFSplitter()
+        split_pages = splitter.split_pdf(state.pdf_path)
+        
+        print(f"PDF拆分完成,共 {len(split_pages)} 页")
+        
+        return {
+            "split_pages": split_pages,
+            "parsed_results": [],
+            "processed_pages": 0,
+            "is_complete": False
+        }
+    
+    def _parse_single_page(self, page: Dict[str, Any], model_name: str) -> Dict[str, Any]:
+        """解析单个页面(用于并行处理)"""
+        page_number = page["page_number"]
+        image = page["image"]
+        prompt = f"""
+            角色定位:你是一位顶尖的儿童绘本分析师与视觉工程专家,擅长将插画视觉信息转化为高精度的结构化元数据。
+            任务描述:请深度解析提供的绘本页面,不仅提取基本要素,还要进行“像素级”的特征拆解。重点关注角色的微表情、服饰纹理、环境光效、构图视角及整体艺术风格。
+            当前提取页码为:{page_number}
+            提取维度:
+            艺术风格 (Style):包括笔触(如水彩、蜡笔)、线条粗细、整体色调偏好。
+            角色特征 (Character):五官细节、肢体动作的动态感、衣物材质、标志性配饰。
+            空间构图 (Composition):透视关系(仰拍/俯拍)、视觉焦点、前景/中景/背景的层次。
+            物品与环境 (Object & Environment):物体的精确形状、材质光泽、环境中的自然元素(风吹草动的方向等)。
+            内容标签 (content_tags):请从以下三个维度进行打标:
+            主题维度(如:自然探索、家庭学校、科学科普、传统文化)
+            具体对象(如:昆虫、交通工具、五官、家具)
+            情感氛围(如:惊喜、友爱、好奇、安静)
+            能力标签 (ability_tags):请严格参照以下教育能力模型,根据图中元素体现的教育价值进行选择:
+            [语言表达、逻辑思维、数理逻辑、空间想象、艺术创造、身体协调、自我认知、社会交往、自然观察、情绪管理]。
+            输出约束:
+            保持描述积极向上,符合0-12岁儿童阅读的安全标准。
+            描述精度:单条描述需包含具体视觉属性(颜色、形状、质感),字数控制在50字以内。
+            格式要求:严谨按照指定的JSON结构输出。  
+            json格式:
+            {{
+                "page_meta": {{
+                "page_number": {page_number},
+                "content_text": "页面原文本内容",
+                "overall_style": {{
+                    "art_medium": "艺术媒介(如:手绘水彩、矢量平涂、3D渲染)",
+                    "color_palette": ["主色调1", "主色调2"],
+                    "lighting": "光影描述(如:柔和侧光、清晨自然光)",
+                    "composition": "构图(如:三分法、对角线构图、大远景)"
+                    }}
+                }},
+                "elements": [
+                    {{
+                        "element_name": "元素名称(如:小兔子)",
+                        "character_name": "角色名称(如果有,没有的话,角色名称为空字符串)",
+                        "category": "分类(角色/场景/道具)",
+                        "spatial_layer": "所在层级(前景/中景/背景)",
+                        "visual_attributes": {{
+                            "appearance": "外貌细节描述(发型、五官、材质感)",
+                            "action_emotion": "行为动作与情感流露",
+                            "color_detail": "像素级颜色描述(如:淡茱萸粉、薄荷绿)",
+                            "ability_tag": "如果为角色,其表现出的正面能力/特质"
+                        }},
+                        "content_tags": {{
+                            "theme": ["自然", "社交", "生活常识"], 
+                            "object": ["动物", "服装", "植物"],
+                            "emotion": ["快乐", "勇敢"]
+                        }},
+                        "ability_tags": ["语言表达", "逻辑思维", "自我认知"],
+                        "description": "综合性简洁描述(50字内)"
+                    }}
+                ]
+            }}
+            """
+        
+        print(f"开始解析第 {page_number} 页")
+        
+        # 使用QWEN VL模型解析图像
+        parser = QWenVLParser(model_name)
+        result = parser.parse_image(image, page_number, prompt)
+        
+        print(f"第 {page_number} 页解析完成")
+        return result
+
+    def _parse_image_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """解析图像节点,使用并行处理"""
+        if not state.split_pages:
+            return state.dict()
+        
+        print(f"开始并行解析 {len(state.split_pages)} 页")
+        
+        parsed_results = []
+        
+        # 使用ThreadPoolExecutor实现并行处理
+        with ThreadPoolExecutor(max_workers=5, thread_name_prefix="parse_page_") as executor:
+            # 提交所有页面解析任务
+            future_to_page = {
+                executor.submit(self._parse_single_page, page, self.model_name): page
+                for page in state.split_pages
+            }
+            
+            # 收集解析结果
+            for future in concurrent.futures.as_completed(future_to_page):
+                try:
+                    result = future.result()
+                    parsed_results.append(result)
+                except Exception as e:
+                    page = future_to_page[future]
+                    print(f"解析第 {page['page_number']} 页时出错: {str(e)}")
+        
+        # 按页码排序结果
+        parsed_results.sort(key=lambda x: x["page_number"])
+        
+        print(f"所有页面解析完成,共解析 {len(parsed_results)} 页")
+        
+        return {
+            "split_pages": state.split_pages,  # 保留split_pages,以便后续访问图片
+            "parsed_results": parsed_results,
+            "processed_pages": len(parsed_results),
+            "is_complete": True
+        }
+    
+
+    
+    def _should_continue_parsing(self, state: PDFParsingState) -> str:
+        """判断是否继续解析"""
+        # 由于我们使用了并行处理,parse_image_node会一次性处理所有页面
+        # 所以这里总是返回"complete"
+        return "complete"
+    
+    def create_ragflow_chunk(self, state: PDFParsingState):
+        """单页上传节点"""
+        print(f"开始单页上传,共 {len(state.parsed_results)} 页")
+        
+        # 遍历所有解析结果,逐页创建 Chunk
+        # 并把图片 ID 写入定时任务表,供后续补写 img_id 字段
+        for i, parsed_result in enumerate(state.parsed_results):
+            page_number = parsed_result.get("page_number")
+            text = parsed_result.get("content", "")
+            image_path = state.split_pages[i].get("image_path")
+            # 截取 URL 中的图片文件名,拼出 img_id
+            img_id = f"{vector_db_settings.infinity_page_dataset_id}-{os.path.basename(image_path).split('.')[0]}.png"
+            
+            # 上传单页到RagFlow Chunk
+            chunk = self.ragflow_service.create_chunk(dataset_id=state.page_dataset_id, 
+                                              document_id=state.page_document_id, 
+                                              content=text)
+            chunk_id = chunk["chunk"]["id"]
+            print(f"上传第 {page_number} 页,Chunk ID: {chunk_id}")
+
+            # 记录到定时任务表
+            get_chunk_record_service().record_chunk_add(
+                database_name=vector_db_settings.infinity_ragflow_database,
+                table_name=vector_db_settings.infinity_page_table_name,
+                chunk_id=chunk_id,
+                cond=f"id = '{chunk_id}'",
+                data={"img_id": img_id}
+            )
+
+    def _vectorize_store_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """向量化入库节点"""
+        print(f"开始向量化入库,共 {len(state.parsed_results)} 页")
+        
+        # 目标表(索引)默认已由初始化脚本创建
+        index_name = vector_db_settings.infinity_table_name
+        # 如需自动建表,可在此调用 get_client().create_index(index_name)
+        
+        # 准备要入库的文档列表
+        documents_to_store = []
+        
+        # 获取文件名和总页数
+        file_name = os.path.basename(state.pdf_path)
+        file_page_count = len(state.split_pages)
+        
+        # 遍历所有解析结果,生成向量化文档
+        for i, parsed_result in enumerate(state.parsed_results):
+            try:
+                page_number = parsed_result.get("page_number")
+                text = parsed_result.get("content", "")
+                image = state.split_pages[i].get("image")
+                
+                image_path = state.split_pages[i].get("image_path")
+
+                # 获取多模态嵌入向量
+                print(f"正在生成第 {page_number} 页的多模态嵌入...")
+                embedding = self.embedding_model.get_multimodal_embedding(text, image)
+                
+                # 截断为 1024 维稠密向量;若模型输出不足 1024 维,需补零或更换嵌入模型
+                dense_vector_1024 = embedding[:1024]  # 取前 1024 维
+                
+                # 创建文档
+                document = {
+                    "id": f"{state.document_id}_{page_number}",
+                    "file_name": file_name,
+                    "file_page_count": file_page_count,
+                    "page_number": page_number,
+                    "content": text,
+                    "image_path": image_path,
+                    "dense_vector_1024": dense_vector_1024,
+                    "dataset_id": state.dataset_id,
+                    "document_id": state.document_id
+                }
+                
+                documents_to_store.append(document)
+                print(f"第 {page_number} 页向量化完成")
+            except Exception as e:
+                print(f"第 {i+1} 页向量化失败: {str(e)}")
+        
+        # 批量入库
+        if documents_to_store:
+            print(f"开始入库,共 {len(documents_to_store)} 个文档")
+            result = get_client().insert(
+                table_name=vector_db_settings.infinity_table_name,
+                documents=documents_to_store,
+                database_name=vector_db_settings.infinity_database
+            )
+            print(f"入库结果: {result}")
+        
+        return {
+            "vectorized_results": documents_to_store,
+            "vectorized_pages": len(documents_to_store),
+            "is_complete": True
+        }
+    
+    def _complete_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """完成节点"""
+        print(f"PDF解析工作流完成,共解析 {len(state.parsed_results)} 页,向量化 {state.vectorized_pages} 页")
+        # TODO: 校验 RAGFLOW 解析状态后再置完成标记
+
+        return {
+            "is_complete": True
+        }
+    
+    def run(self, pdf_path: str, page_dataset_id: str, dataset_name: str) -> Dict[str, Any]:
+        """
+        运行PDF解析工作流
+        
+        Args:
+            pdf_path: PDF文件路径
+            page_dataset_id: 页面数据集ID(RAGFLOW 中按页存储图文的数据集)
+            dataset_name: 数据集名称
+            
+        Returns:
+            Dict: 包含最终状态的字典
+        """
+        initial_state = PDFParsingState(
+            pdf_path=pdf_path,
+            page_dataset_id=page_dataset_id,
+            dataset_name=dataset_name
+        )
+        result = self.workflow.invoke(initial_state, config={"callbacks": [self.langfuse_handler]})
+        
+        # 检查结果类型,如果是字典直接返回,否则调用dict()方法
+        if isinstance(result, dict):
+            return result
+        else:
+            return result.dict()
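A minimal driver sketch for the workflow above. The enclosing class name is not shown in this hunk, so `PDFParsingWorkflow` below is a placeholder; substitute the class actually defined in pdf_parser_workflow.py, and treat the paths and IDs as hypothetical:

    # Placeholder class name; use the real workflow class from this module.
    from src.parser.pdf_parser.pdf_parser_workflow import PDFParsingWorkflow

    workflow = PDFParsingWorkflow()
    final_state = workflow.run(
        pdf_path="./samples/book.pdf",       # hypothetical local PDF
        page_dataset_id="ds_pages_001",      # hypothetical RAGFLOW dataset ID
        dataset_name="picture_books",
    )
    print(final_state["is_complete"], final_state.get("vectorized_pages"))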

+ 90 - 0
src/parser/pdf_parser/pdf_splitter.py

@@ -0,0 +1,90 @@
+import fitz
+from PIL import Image
+import io
+from typing import List, Dict, Tuple
+from src.conf.settings import vector_db_settings
+
+class PDFSplitter:
+    """PDF扫描件按页拆分工具"""
+    
+    @staticmethod
+    def split_pdf(pdf_path: str) -> List[Dict[str, any]]:
+        """
+        将PDF按页拆分,转换为图像并记录页码,同时保存图片到MinIO
+        
+        Args:
+            pdf_path: PDF文件路径
+            
+        Returns:
+            List[Dict]: 包含每一页信息的列表,每个字典包含:
+                - page_number: 页码
+                - image: PIL图像对象
+                - image_bytes: 图像字节流
+                - image_path: MinIO中保存的图片URL
+        """
+        import os
+        from src.utils.file.minio.minio_util import MinIOUtil
+        
+        try:
+            # 初始化MinioUtil
+            minio_util = MinIOUtil()
+            
+            # 打开PDF文件
+            pdf_document = fitz.open(pdf_path)
+            
+            # 获取PDF文件名(不含扩展名)
+            pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
+            
+            result = []
+            for page_num in range(len(pdf_document)):
+                # 获取页面
+                page = pdf_document[page_num]
+                # 页码从1开始
+                page_number = page_num + 1
+                
+                # 将页面转换为图像
+                # 使用较高分辨率,DPI=300
+                pix = page.get_pixmap(dpi=300)
+                
+                # 将fitz pixmap转换为PIL图像
+                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                
+                # 将图像转换为字节流,便于后续处理
+                image_bytes = io.BytesIO()
+                image.save(image_bytes, format='PNG')
+                image_bytes.seek(0)
+                
+                # 生成图片文件名
+                image_filename = f"{pdf_filename}_{page_number}.png"
+                
+                # 防御性地再次重置指针,确保上传从文件头读取
+                image_bytes.seek(0)
+                
+                # 上传图片到MinIO,获取URL
+                image_url = minio_util.custom_upload_file(file=image_bytes, original_filename=image_filename, bucket_name=vector_db_settings.infinity_page_dataset_id)
+                
+                result.append({
+                    "page_number": page_number,
+                    "image": image,
+                    "image_bytes": image_bytes,
+                    "image_path": image_url
+                })
+            
+            # 关闭PDF文件
+            pdf_document.close()
+            # 将result根据page_number排序
+            result.sort(key=lambda x: x["page_number"])
+            return result
+        except Exception as e:
+            raise Exception(f"PDF拆分失败: {str(e)}")
+    
+    @staticmethod
+    def save_page_image(image: Image.Image, output_path: str) -> None:
+        """
+        保存单页图像
+        
+        Args:
+            image: PIL图像对象
+            output_path: 输出文件路径
+        """
+        image.save(output_path, format='PNG')
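A short usage sketch for PDFSplitter. The PDF path is hypothetical, and a reachable MinIO plus the bucket named by vector_db_settings.infinity_page_dataset_id are assumed:

    from src.parser.pdf_parser.pdf_splitter import PDFSplitter

    pages = PDFSplitter.split_pdf("./samples/book.pdf")
    for page in pages:
        print(page["page_number"], page["image_path"])  # per-page MinIO URL
    # Persist the first page locally for inspection
    PDFSplitter.save_page_image(pages[0]["image"], "./page_1.png")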

+ 58 - 0
src/parser/pdf_parser/test_service.py

@@ -0,0 +1,58 @@
+from .pdf_parser_workflow import PDFParsingService  # 入口类名以 pdf_parser_workflow 中的实际定义为准
+
+def test_service_initialization():
+    """测试服务初始化"""
+    print("测试服务初始化...")
+    try:
+        service = PDFParsingService()
+        print("✓ 服务初始化成功")
+        return True
+    except Exception as e:
+        print(f"✗ 服务初始化失败: {str(e)}")
+        return False
+
+def test_module_imports():
+    """测试模块导入"""
+    print("测试模块导入...")
+    
+    modules_to_test = [
+        ("pdf_splitter", ".pdf_splitter"),
+        ("pdf_parser_workflow", ".pdf_parser_workflow")
+        # 注:qwen_vl 封装位于 src/model/qwen_vl.py,
+        # 工作流与入口均定义在本包的 pdf_parser_workflow 中
+    ]
+    
+    all_imported = True
+    for module_name, module_path in modules_to_test:
+        try:
+            __import__(module_path, fromlist=["*"], level=1)
+            print(f"✓ {module_name} 模块导入成功")
+        except Exception as e:
+            print(f"✗ {module_name} 模块导入失败: {str(e)}")
+            all_imported = False
+    
+    return all_imported
+
+def main():
+    """测试主函数"""
+    print("开始测试PDF解析服务...")
+    print("=" * 50)
+    
+    # 测试模块导入
+    module_test = test_module_imports()
+    print()
+    
+    # 测试服务初始化
+    init_test = test_service_initialization()
+    print()
+    
+    print("=" * 50)
+    if module_test and init_test:
+        print("✓ 所有测试通过!服务可以正常使用")
+        print("\n使用示例:")
+        print("python -m services.pdf_parser.main --pdf_path <pdf文件路径> --output <输出json路径>")
+    else:
+        print("✗ 测试失败!请检查错误信息")
+
+if __name__ == "__main__":
+    main()

+ 11 - 0
src/utils/__init__.py

@@ -0,0 +1,11 @@
+# 从分离的模块中导入所有组件,确保向后兼容性
+from .infinity.pool import InfinityConnectionPool
+from .infinity.client import InfinityClient, get_client, close_client
+
+# 重新导出所有组件,使它们可以从包级别访问
+__all__ = [
+    'InfinityConnectionPool',
+    'InfinityClient',
+    'get_client',
+    'close_client'
+]
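How the re-exported Infinity helpers are typically consumed; the insert keyword arguments follow the call made in _vectorize_store_node above, so treat the table and database names as inferred placeholders rather than documented API:

    from src.utils import get_client, close_client

    client = get_client()
    client.insert(
        table_name="pages",                              # hypothetical table
        documents=[{"id": "doc_1_1", "content": "hello"}],
        database_name="rag",                             # hypothetical database
    )
    close_client()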

+ 151 - 0
src/utils/asymmetric_encryption.py

@@ -0,0 +1,151 @@
+from cryptography.hazmat.primitives import serialization, hashes
+from cryptography.hazmat.primitives.asymmetric import rsa, padding
+from cryptography.hazmat.backends import default_backend
+from typing import Tuple, Optional
+import base64
+
+
+class AsymmetricEncryption:
+    """非对称加密工具类,使用RSA算法"""
+
+    @staticmethod
+    def generate_key_pair(key_size: int = 2048) -> Tuple[bytes, bytes]:
+        """
+        生成RSA密钥对
+        
+        Args:
+            key_size: 密钥大小,默认为2048位
+        
+        Returns:
+            Tuple[bytes, bytes]: (私钥PEM格式,公钥PEM格式)
+        """
+        # 生成私钥
+        private_key = rsa.generate_private_key(
+            public_exponent=65537,
+            key_size=key_size,
+            backend=default_backend()
+        )
+        
+        # 生成公钥
+        public_key = private_key.public_key()
+        
+        # 将私钥序列化为PEM格式
+        private_pem = private_key.private_bytes(
+            encoding=serialization.Encoding.PEM,
+            format=serialization.PrivateFormat.TraditionalOpenSSL,
+            encryption_algorithm=serialization.NoEncryption()
+        )
+        
+        # 将公钥序列化为PEM格式
+        public_pem = public_key.public_bytes(
+            encoding=serialization.Encoding.PEM,
+            format=serialization.PublicFormat.SubjectPublicKeyInfo
+        )
+        
+        return private_pem, public_pem
+    
+    @staticmethod
+    def encrypt(message: str, public_key_pem: bytes) -> str:
+        """
+        使用公钥加密数据
+        
+        Args:
+            message: 要加密的明文
+            public_key_pem: 公钥PEM格式
+        
+        Returns:
+            str: 加密后的base64编码字符串
+        """
+        # 加载公钥
+        public_key = serialization.load_pem_public_key(
+            public_key_pem,
+            backend=default_backend()
+        )
+        
+        # 加密数据
+        encrypted = public_key.encrypt(
+            message.encode('utf-8'),
+            padding.OAEP(
+                mgf=padding.MGF1(algorithm=hashes.SHA256()),
+                algorithm=hashes.SHA256(),
+                label=None
+            )
+        )
+        
+        # 返回base64编码的加密数据
+        return base64.b64encode(encrypted).decode('utf-8')
+    
+    @staticmethod
+    def decrypt(encrypted_message: str, private_key_pem: bytes) -> str:
+        """
+        使用私钥解密数据
+        
+        Args:
+            encrypted_message: 加密后的base64编码字符串
+            private_key_pem: 私钥PEM格式
+        
+        Returns:
+            str: 解密后的明文
+        """
+        # 加载私钥
+        private_key = serialization.load_pem_private_key(
+            private_key_pem,
+            password=None,
+            backend=default_backend()
+        )
+        
+        # 解码base64加密数据
+        encrypted = base64.b64decode(encrypted_message)
+        
+        # 解密数据
+        decrypted = private_key.decrypt(
+            encrypted,
+            padding.OAEP(
+                mgf=padding.MGF1(algorithm=hashes.SHA256()),
+                algorithm=hashes.SHA256(),
+                label=None
+            )
+        )
+        
+        # 返回解密后的明文
+        return decrypted.decode('utf-8')
+    
+    @staticmethod
+    def save_key_to_file(key_pem: bytes, file_path: str) -> None:
+        """
+        将密钥保存到文件
+        
+        Args:
+            key_pem: 密钥PEM格式
+            file_path: 文件路径
+        """
+        with open(file_path, 'wb') as f:
+            f.write(key_pem)
+    
+    @staticmethod
+    def load_key_from_file(file_path: str) -> bytes:
+        """
+        从文件加载密钥
+        
+        Args:
+            file_path: 文件路径
+        
+        Returns:
+            bytes: 密钥PEM格式
+        """
+        with open(file_path, 'rb') as f:
+            return f.read()
+    
+    @staticmethod
+    def generate_key_pair_and_save(private_key_path: str, public_key_path: str, key_size: int = 2048) -> None:
+        """
+        生成密钥对并保存到文件
+        
+        Args:
+            private_key_path: 私钥文件路径
+            public_key_path: 公钥文件路径
+            key_size: 密钥大小,默认为2048位
+        """
+        private_pem, public_pem = AsymmetricEncryption.generate_key_pair(key_size)
+        AsymmetricEncryption.save_key_to_file(private_pem, private_key_path)
+        AsymmetricEncryption.save_key_to_file(public_pem, public_key_path)
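A round-trip sketch for AsymmetricEncryption. Note that RSA-OAEP with SHA-256 over a 2048-bit key can encrypt at most 190 bytes of plaintext, so this class suits keys and tokens rather than bulk data:

    from src.utils.asymmetric_encryption import AsymmetricEncryption

    private_pem, public_pem = AsymmetricEncryption.generate_key_pair()
    ciphertext = AsymmetricEncryption.encrypt("secret-api-key", public_pem)
    assert AsymmetricEncryption.decrypt(ciphertext, private_pem) == "secret-api-key"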

+ 33 - 0
src/utils/async_utils.py

@@ -0,0 +1,33 @@
+"""
+Async utility functions for running blocking operations
+"""
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Callable
+
+
+# Create a thread pool executor
+executor = ThreadPoolExecutor(max_workers=10)
+
+
+async def run_in_threadpool(func: Callable[..., Any], *args, **kwargs) -> Any:
+    """
+    Run a blocking function in a thread pool
+    
+    Args:
+        func: Blocking function to run
+        *args: Arguments to pass to the function
+        **kwargs: Keyword arguments to pass to the function
+        
+    Returns:
+        Result of the function
+    """
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(executor, lambda: func(*args, **kwargs))  # run_in_executor 不支持 kwargs,用闭包透传
+
+
+def close_executor():
+    """
+    Close the thread pool executor
+    """
+    executor.shutdown(wait=True)
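A usage sketch for run_in_threadpool; with the closure-based wrapper above, keyword arguments pass through as well:

    import asyncio
    import time

    from src.utils.async_utils import run_in_threadpool, close_executor

    def slow_add(a: int, b: int) -> int:
        time.sleep(0.1)  # simulate blocking I/O
        return a + b

    async def main() -> None:
        result = await run_in_threadpool(slow_add, 1, b=2)
        print(result)  # 3

    asyncio.run(main())
    close_executor()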

+ 57 - 0
src/utils/auth.py

@@ -0,0 +1,57 @@
+"""
+Authentication middleware for API key verification
+"""
+from fastapi import HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.middleware.gzip import GZipMiddleware
+from src.common.result import Result
+from src.utils.mysql import get_global_mysql_client
+from datetime import datetime
+
+
+async def verify_api_key(request: Request, call_next):
+    """
+    Verify API key from Authorization header
+    """
+    # Skip authentication for health check and root endpoints
+    if request.url.path in ["/", "/health"]:
+        response = await call_next(request)
+        return response
+    
+    # Get Authorization header
+    auth_header = request.headers.get("Authorization")
+    if not auth_header:
+        return Result.error(code=401, message="参数缺少api_key")
+    
+    # Check if header has Bearer prefix
+    if not auth_header.startswith("Bearer "):
+        return Result.error(code=401, message="Authorization header must be in format: Bearer {API_KEY}")
+    
+    # Extract API key
+    api_key = auth_header.split(" ")[1]
+    
+    # Validate API key
+    try:
+        mysql_client = get_global_mysql_client()
+        query = """
+        SELECT id, is_active, expired_at 
+        FROM api_keys 
+        WHERE api_key = %s
+        """
+        result = mysql_client.fetch_one(query, [api_key])
+        
+        if not result:
+            return Result.error(code=401, message="api_key错误,请仔细检查您的api_key")
+        
+        if not result["is_active"]:
+            return Result.error(code=401, message="api_key已被禁用")
+        
+        if result["expired_at"] and result["expired_at"] < datetime.now():
+            return Result.error(code=401, message="api_key已过期")
+        
+    except Exception as e:
+        return Result.error(code=500, message=f"认证错误: {str(e)}")
+    
+    # Continue processing request
+    response = await call_next(request)
+    return response
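How this middleware would typically be registered. This assumes Result.error returns a FastAPI-compatible response object; if it returns a plain dict, it would need wrapping in JSONResponse inside verify_api_key:

    from fastapi import FastAPI
    from src.utils.auth import verify_api_key

    app = FastAPI()
    app.middleware("http")(verify_api_key)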

+ 0 - 0
src/utils/decorators/__init__.py


+ 45 - 0
src/utils/decorators/langfuse_trace_embedding.py

@@ -0,0 +1,45 @@
+import functools
+import time
+from langfuse import Langfuse
+
+# 初始化 Langfuse 客户端
+langfuse = Langfuse()
+
+def langfuse_trace_embedding(name="embedding-operation"):
+    """
+    用于追踪 Embedding 操作的装饰器
+    """
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # 1. 获取输入参数(方法调用时 args[0] 是 self,文本位于 args[1])
+            # 对于 embed_query(text) 或 embed_documents(texts)
+            input_data = args[1] if len(args) > 1 else kwargs.get("text") or kwargs.get("texts")
+            
+            # 2. 在 Langfuse 中开启一个 Span
+            # 如果当前上下文已有 Trace,它会自动关联;如果没有,它会创建新的 Trace
+            span = langfuse.span(
+                name=name,
+                input=input_data,
+                metadata={"function": func.__name__}
+            )
+            
+            start_time = time.time()
+            try:
+                # 3. 执行真正的 Embedding 操作
+                result = func(*args, **kwargs)
+                
+                # 4. 记录执行结果和耗时
+                span.end(
+                    output={
+                        "vector_count": len(result) if isinstance(result, list) else 1,
+                        "dimension": len(result[0]) if isinstance(result, list) and len(result) > 0 else len(result) if isinstance(result, list) else 0
+                    }
+                )
+                return result
+            except Exception as e:
+                # 捕捉异常并记录到 Langfuse
+                span.end(level="ERROR", status_message=str(e))
+                raise e
+        return wrapper
+    return decorator
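A sketch of decorating an embedding method. Langfuse reads its credentials from the LANGFUSE_* environment variables; the stub vectors below are placeholders:

    from src.utils.decorators.langfuse_trace_embedding import langfuse_trace_embedding

    class StubEmbeddings:
        @langfuse_trace_embedding(name="embed-documents")
        def embed_documents(self, texts):
            return [[0.0] * 1024 for _ in texts]  # placeholder vectors

    StubEmbeddings().embed_documents(["第一页", "第二页"])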

+ 13 - 0
src/utils/decorators/singleton.py

@@ -0,0 +1,13 @@
+# 单例装饰器
+class singleton:
+    """
+    单例装饰器,确保类只有一个实例
+    """
+    def __init__(self, cls):
+        self.cls = cls
+        self._instance = None
+    
+    def __call__(self, *args, **kwargs):
+        if self._instance is None:
+            self._instance = self.cls(*args, **kwargs)
+        return self._instance
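Usage sketch for the singleton decorator. One caveat worth remembering: the decorated name is bound to the wrapper object, so isinstance checks against the original class no longer work:

    from src.utils.decorators.singleton import singleton

    @singleton
    class Config:
        def __init__(self):
            self.loaded = True

    assert Config() is Config()  # same instance on every call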

+ 18 - 0
src/utils/es/__init__.py

@@ -0,0 +1,18 @@
+"""
+Elasticsearch utility module
+"""
+from src.utils.es.client_manager import es_client_manager, ESClientManager
+from src.utils.es.core.index_manager import index_manager, IndexManager
+from src.utils.es.services.search_service import search_service, SearchService
+from src.utils.es.bulk_helper import bulk_helper, BulkHelper
+
+__all__ = [
+    "es_client_manager",
+    "ESClientManager",
+    "index_manager",
+    "IndexManager",
+    "search_service",
+    "SearchService",
+    "bulk_helper",
+    "BulkHelper"
+]

+ 318 - 0
src/utils/es/bulk_helper.py

@@ -0,0 +1,318 @@
+"""
+Elasticsearch bulk helper
+Encapsulates efficient bulk import logic using helpers.bulk
+"""
+import logging
+from typing import Dict, Any, Iterator, Optional
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk, parallel_bulk
+from src.conf.settings import es_settings
+from src.utils.es.client_manager import es_client_manager
+from src.utils.es.core.index_manager import index_manager
+
+logger = logging.getLogger(__name__)
+
+
+class BulkHelper:
+    """
+    Elasticsearch bulk helper
+    """
+    
+    def __init__(self):
+        """
+        Initialize bulk helper
+        """
+        self.client = es_client_manager.get_client()
+        self.index_name = es_settings.es_index_name
+        # Ensure index is initialized
+        index_manager.initialize_index()
+    
+    def generate_actions(self, documents: Iterator[Dict[str, Any]], doc_id_field: str = "id") -> Iterator[Dict[str, Any]]:
+        """
+        Generate bulk actions from documents
+        
+        Args:
+            documents: Iterator of documents
+            doc_id_field: Field to use as document ID
+        
+        Returns:
+            Iterator[Dict[str, Any]]: Bulk actions
+        """
+        for doc in documents:
+            action = {
+                "_op_type": "index",
+                "_index": self.index_name,
+                "_source": doc
+            }
+            if doc_id_field in doc:
+                action["_id"] = doc[doc_id_field]
+            yield action
+    
+    def bulk_index(self, documents: Iterator[Dict[str, Any]], chunk_size: int = 1000, 
+                   doc_id_field: str = "id") -> Dict[str, Any]:
+        """
+        Bulk index documents
+        
+        Args:
+            documents: Iterator of documents
+            chunk_size: Bulk chunk size
+            doc_id_field: Field to use as document ID
+        
+        Returns:
+            Dict[str, Any]: Bulk operation result
+        """
+        try:
+            actions = self.generate_actions(documents, doc_id_field)
+            
+            success, failed = bulk(
+                self.client,
+                actions,
+                chunk_size=chunk_size,
+                raise_on_error=False,
+                raise_on_exception=False
+            )
+            
+            result = {
+                "success": success,
+                "failed": len(failed),
+                "errors": failed
+            }
+            
+            if failed:
+                logger.warning(f"Bulk index completed with {len(failed)} failures")
+            else:
+                logger.info(f"Bulk index completed successfully: {success} documents")
+            
+            return result
+        except Exception as e:
+            logger.error(f"Error in bulk index: {e}")
+            return {
+                "success": 0,
+                "failed": 0,
+                "errors": [str(e)]
+            }
+    
+    def parallel_bulk_index(self, documents: Iterator[Dict[str, Any]], chunk_size: int = 1000,
+                           thread_count: int = 4, queue_size: int = 4, 
+                           doc_id_field: str = "id") -> Dict[str, Any]:
+        """
+        Parallel bulk index documents
+        
+        Args:
+            documents: Iterator of documents
+            chunk_size: Bulk chunk size
+            thread_count: Number of threads
+            queue_size: Queue size
+            doc_id_field: Field to use as document ID
+        
+        Returns:
+            Dict[str, Any]: Bulk operation result
+        """
+        try:
+            actions = self.generate_actions(documents, doc_id_field)
+            
+            success = 0
+            failed = []
+            
+            for ok, item in parallel_bulk(
+                self.client,
+                actions,
+                chunk_size=chunk_size,
+                thread_count=thread_count,
+                queue_size=queue_size,
+                raise_on_error=False,
+                raise_on_exception=False
+            ):
+                if ok:
+                    success += 1
+                else:
+                    failed.append(item)
+            
+            result = {
+                "success": success,
+                "failed": len(failed),
+                "errors": failed
+            }
+            
+            if failed:
+                logger.warning(f"Parallel bulk index completed with {len(failed)} failures")
+            else:
+                logger.info(f"Parallel bulk index completed successfully: {success} documents")
+            
+            return result
+        except Exception as e:
+            logger.error(f"Error in parallel bulk index: {e}")
+            return {
+                "success": 0,
+                "failed": 0,
+                "errors": [str(e)]
+            }
+    
+    def bulk_update(self, documents: Iterator[Dict[str, Any]], doc_id_field: str = "id") -> Dict[str, Any]:
+        """
+        Bulk update documents
+        
+        Args:
+            documents: Iterator of documents
+            doc_id_field: Field to use as document ID
+        
+        Returns:
+            Dict[str, Any]: Bulk operation result
+        """
+        try:
+            actions = []
+            for doc in documents:
+                if doc_id_field in doc:
+                    action = {
+                        "_op_type": "update",
+                        "_index": self.index_name,
+                        "_id": doc[doc_id_field],
+                        "doc": doc
+                    }
+                    actions.append(action)
+            
+            success, failed = bulk(
+                self.client,
+                actions,
+                chunk_size=1000,
+                raise_on_error=False,
+                raise_on_exception=False
+            )
+            
+            result = {
+                "success": success,
+                "failed": len(failed),
+                "errors": failed
+            }
+            
+            if failed:
+                logger.warning(f"Bulk update completed with {len(failed)} failures")
+            else:
+                logger.info(f"Bulk update completed successfully: {success} documents")
+            
+            return result
+        except Exception as e:
+            logger.error(f"Error in bulk update: {e}")
+            return {
+                "success": 0,
+                "failed": 0,
+                "errors": [str(e)]
+            }
+    
+    def bulk_delete(self, document_ids: Iterator[str]) -> Dict[str, Any]:
+        """
+        Bulk delete documents
+        
+        Args:
+            document_ids: Iterator of document IDs
+        
+        Returns:
+            Dict[str, Any]: Bulk operation result
+        """
+        try:
+            actions = [
+                {
+                    "_op_type": "delete",
+                    "_index": self.index_name,
+                    "_id": doc_id
+                }
+                for doc_id in document_ids
+            ]
+            
+            success, failed = bulk(
+                self.client,
+                actions,
+                chunk_size=1000,
+                raise_on_error=False,
+                raise_on_exception=False
+            )
+            
+            result = {
+                "success": success,
+                "failed": len(failed),
+                "errors": failed
+            }
+            
+            if failed:
+                logger.warning(f"Bulk delete completed with {len(failed)} failures")
+            else:
+                logger.info(f"Bulk delete completed successfully: {success} documents")
+            
+            return result
+        except Exception as e:
+            logger.error(f"Error in bulk delete: {e}")
+            return {
+                "success": 0,
+                "failed": 0,
+                "errors": [str(e)]
+            }
+    
+    def bulk_upsert(self, documents: Iterator[Dict[str, Any]], doc_id_field: str = "id") -> Dict[str, Any]:
+        """
+        Bulk upsert documents (update if exists, index if not)
+        
+        Args:
+            documents: Iterator of documents
+            doc_id_field: Field to use as document ID
+        
+        Returns:
+            Dict[str, Any]: Bulk operation result
+        """
+        try:
+            actions = []
+            for doc in documents:
+                if doc_id_field in doc:
+                    action = {
+                        "_op_type": "update",
+                        "_index": self.index_name,
+                        "_id": doc[doc_id_field],
+                        "doc": doc,
+                        "doc_as_upsert": True
+                    }
+                    actions.append(action)
+            
+            success, failed = bulk(
+                self.client,
+                actions,
+                chunk_size=1000,
+                raise_on_error=False,
+                raise_on_exception=False
+            )
+            
+            result = {
+                "success": success,
+                "failed": len(failed),
+                "errors": failed
+            }
+            
+            if failed:
+                logger.warning(f"Bulk upsert completed with {len(failed)} failures")
+            else:
+                logger.info(f"Bulk upsert completed successfully: {success} documents")
+            
+            return result
+        except Exception as e:
+            logger.error(f"Error in bulk upsert: {e}")
+            return {
+                "success": 0,
+                "failed": 0,
+                "errors": [str(e)]
+            }
+    
+    def refresh_index(self) -> bool:
+        """
+        Refresh index to make changes visible
+        
+        Returns:
+            bool: Refresh status
+        """
+        try:
+            self.client.indices.refresh(index=self.index_name)
+            return True
+        except Exception as e:
+            logger.error(f"Error refreshing index: {e}")
+            return False
+
+
+# Create singleton instance
+bulk_helper = BulkHelper()
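A bulk-indexing sketch; a reachable Elasticsearch is assumed, since BulkHelper connects and initializes the index at import time:

    from src.utils.es.bulk_helper import bulk_helper

    docs = ({"id": f"doc-{i}", "content": f"page {i}"} for i in range(100))
    result = bulk_helper.bulk_index(docs, chunk_size=500)
    print(result["success"], result["failed"])
    bulk_helper.refresh_index()  # make the new documents searchable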

+ 80 - 0
src/utils/es/client_manager.py

@@ -0,0 +1,80 @@
+"""
+Elasticsearch client manager
+"""
+from elasticsearch import Elasticsearch
+from elasticsearch.exceptions import ConnectionError as ESConnectionError
+from src.utils.decorators.singleton import singleton
+from src.conf.settings import es_settings
+
+
+@singleton
+class ESClientManager:
+    """
+    Elasticsearch客户端管理器(单例模式)
+    """
+    
+    def __init__(self):
+        """
+        初始化ES客户端管理器
+        """
+        self.client = None
+    
+    def get_client(self) -> Elasticsearch:
+        """
+        获取Elasticsearch客户端实例
+        
+        Returns:
+            Elasticsearch客户端实例
+        """
+        if self.client is None:
+            self._initialize_client()
+        return self.client
+    
+    def _initialize_client(self):
+        """
+        初始化Elasticsearch客户端
+        """
+        try:
+            self.client = Elasticsearch(
+                es_settings.es_nodes,
+                basic_auth=(
+                    es_settings.es_username,
+                    es_settings.es_password
+                ) if es_settings.es_username and es_settings.es_password else None,
+                connections_per_node=es_settings.es_connections_per_node,
+                max_retries=es_settings.es_max_retries,
+                retry_on_timeout=es_settings.es_retry_on_timeout,
+                request_timeout=es_settings.es_timeout,
+                verify_certs=es_settings.es_verify_certs
+            )
+            print("✅ Elasticsearch客户端初始化成功")
+        except ESConnectionError as e:
+            print(f"❌ Elasticsearch连接失败: {e}")
+            raise
+        except Exception as e:
+            print(f"❌ Elasticsearch初始化失败: {e}")
+            raise
+    
+    def ping(self) -> bool:
+        """
+        健康检查
+        
+        Returns:
+            bool: 连接状态
+        """
+        try:
+            return self.get_client().ping()
+        except Exception:
+            return False
+    
+    def close(self):
+        """
+        关闭Elasticsearch客户端
+        """
+        if self.client:
+            self.client.close()
+            self.client = None
+
+
+# 创建单例实例
+es_client_manager = ESClientManager()
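A quick connectivity check against the manager (cluster version lookup via the standard info() API):

    from src.utils.es.client_manager import es_client_manager

    if es_client_manager.ping():
        info = es_client_manager.get_client().info()
        print(info["version"]["number"])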

+ 3 - 0
src/utils/es/core/__init__.py

@@ -0,0 +1,3 @@
+"""
+Elasticsearch core module
+"""

+ 224 - 0
src/utils/es/core/index_manager.py

@@ -0,0 +1,224 @@
+"""
+Elasticsearch index manager
+Responsible for index initialization, mapping definition, and maintenance
+"""
+import logging
+from typing import Dict, Any
+from elasticsearch import Elasticsearch
+from elasticsearch.exceptions import RequestError
+from src.conf.settings import es_settings
+from src.utils.es.client_manager import es_client_manager
+
+logger = logging.getLogger(__name__)
+
+
+class IndexManager:
+    """
+    Elasticsearch index manager
+    """
+    
+    def __init__(self):
+        """
+        Initialize index manager
+        """
+        self.client = es_client_manager.get_client()
+        self.index_name = es_settings.es_index_name
+    
+    def get_mapping(self) -> Dict[str, Any]:
+        """
+        Get index mapping definition
+        
+        Returns:
+            Dict[str, Any]: Mapping definition
+        """
+        return {
+            "settings": {
+                "number_of_shards": 3,
+                "number_of_replicas": 0,
+                "analysis": {
+                    "analyzer": {
+                        "ik_smart": {
+                            "type": "custom",
+                            "tokenizer": "ik_smart"
+                        }
+                    }
+                }
+            },
+            "mappings": {
+                "properties": {
+                    "id": {
+                        "type": "keyword"
+                    },
+                    "file_name": {
+                        "type": "keyword"
+                    },
+                    "file_page_count": {
+                        "type": "integer"
+                    },
+                    "page_number": {
+                        "type": "integer"
+                    },
+                    "content": {
+                        "type": "text",
+                        "analyzer": "ik_smart"
+                    },
+                    "image_path": {
+                        "type": "keyword"
+                    },
+                    "dense_vector_1024": {
+                        "type": "dense_vector",
+                        "dims": 1024,
+                        "index": True,
+                        "similarity": "cosine",
+                        "index_options": {
+                            "type": "hnsw",
+                            "m": 24,
+                            "ef_construction": 128
+                        }
+                    },
+                    "dataset_id": {
+                        "type": "keyword"
+                    },
+                    "document_id": {
+                        "type": "keyword"
+                    }
+                }
+            }
+        }
+    
+    def index_exists(self) -> bool:
+        """
+        Check if index exists
+        
+        Returns:
+            bool: Index existence status
+        """
+        try:
+            return self.client.indices.exists(index=self.index_name)
+        except Exception as e:
+            logger.error(f"Error checking index existence: {e}")
+            return False
+    
+    def create_index(self) -> bool:
+        """
+        Create index with mapping
+        
+        Returns:
+            bool: Creation status
+        """
+        try:
+            if not self.index_exists():
+                mapping = self.get_mapping()
+                self.client.indices.create(
+                    index=self.index_name,
+                    body=mapping
+                )
+                logger.info(f"Created index {self.index_name} with mapping")
+                return True
+            else:
+                logger.info(f"Index {self.index_name} already exists")
+                return False
+        except RequestError as e:
+            logger.error(f"Error creating index: {e}")
+            return False
+        except Exception as e:
+            logger.error(f"Unexpected error creating index: {e}")
+            return False
+    
+    def update_mapping(self, mapping: Dict[str, Any]) -> bool:
+        """
+        Update index mapping
+        
+        Args:
+            mapping: New mapping definition
+        
+        Returns:
+            bool: Update status
+        """
+        try:
+            if self.index_exists():
+                self.client.indices.put_mapping(
+                    index=self.index_name,
+                    body=mapping
+                )
+                logger.info(f"Updated mapping for index {self.index_name}")
+                return True
+            else:
+                logger.warning(f"Cannot update mapping: index {self.index_name} does not exist")
+                return False
+        except Exception as e:
+            logger.error(f"Error updating mapping: {e}")
+            return False
+    
+    def delete_index(self) -> bool:
+        """
+        Delete index
+        
+        Returns:
+            bool: Deletion status
+        """
+        try:
+            if self.index_exists():
+                self.client.indices.delete(index=self.index_name)
+                logger.info(f"Deleted index {self.index_name}")
+                return True
+            else:
+                logger.info(f"Index {self.index_name} does not exist, skipping deletion")
+                return False
+        except Exception as e:
+            logger.error(f"Error deleting index: {e}")
+            return False
+    
+    def initialize_index(self) -> bool:
+        """
+        Initialize index (create if not exists)
+        
+        Returns:
+            bool: Initialization status
+        """
+        try:
+            if not self.index_exists():
+                return self.create_index()
+            else:
+                logger.info(f"Index {self.index_name} already initialized")
+                return True
+        except Exception as e:
+            logger.error(f"Error initializing index: {e}")
+            return False
+    
+    def get_index_stats(self) -> Dict[str, Any]:
+        """
+        Get index statistics
+        
+        Returns:
+            Dict[str, Any]: Index statistics
+        """
+        try:
+            if self.index_exists():
+                return self.client.indices.stats(index=self.index_name)
+            else:
+                return {}
+        except Exception as e:
+            logger.error(f"Error getting index stats: {e}")
+            return {}
+    
+    def refresh_index(self) -> bool:
+        """
+        Refresh index to make recent changes visible
+        
+        Returns:
+            bool: Refresh status
+        """
+        try:
+            if self.index_exists():
+                self.client.indices.refresh(index=self.index_name)
+                return True
+            else:
+                return False
+        except Exception as e:
+            logger.error(f"Error refreshing index: {e}")
+            return False
+
+
+# Create singleton instance
+index_manager = IndexManager()
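Typical lifecycle calls against the manager; the stats path follows the standard indices.stats response shape:

    from src.utils.es.core.index_manager import index_manager

    index_manager.initialize_index()  # create the index if it is missing
    stats = index_manager.get_index_stats()
    if stats:
        print(stats["_all"]["primaries"]["docs"]["count"])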

+ 3 - 0
src/utils/es/services/__init__.py

@@ -0,0 +1,3 @@
+"""
+Elasticsearch services module
+"""

+ 326 - 0
src/utils/es/services/search_service.py

@@ -0,0 +1,326 @@
+"""
+Elasticsearch search service
+Encapsulates business search logic including full-text, vector, and hybrid search
+"""
+import logging
+from typing import Dict, Any, List, Optional
+from elasticsearch import Elasticsearch
+from src.conf.settings import es_settings
+from src.utils.es.client_manager import es_client_manager
+from src.utils.es.core.index_manager import index_manager
+
+logger = logging.getLogger(__name__)
+
+
+class SearchService:
+    """
+    Elasticsearch search service
+    """
+    
+    def __init__(self):
+        """
+        Initialize search service
+        """
+        self.client = es_client_manager.get_client()
+        self.index_name = es_settings.es_index_name
+        # Ensure index is initialized
+        index_manager.initialize_index()
+    
+    def full_text_search(self, query: str, fields: List[str] = None, size: int = 10) -> Dict[str, Any]:
+        """
+        Full-text search
+        
+        Args:
+            query: Search query
+            fields: Fields to search in
+            size: Number of results to return
+        
+        Returns:
+            Dict[str, Any]: Search results
+        """
+        try:
+            if fields is None:
+                fields = ["content"]
+            
+            search_body = {
+                "size": size,
+                "query": {
+                    "multi_match": {
+                        "query": query,
+                        "fields": fields,
+                        "type": "best_fields"
+                    }
+                }
+            }
+            
+            response = self.client.search(
+                index=self.index_name,
+                body=search_body
+            )
+            
+            return {
+                "total": response["hits"]["total"]["value"],
+                "hits": [
+                    {
+                        "id": hit["_id"],
+                        "score": hit["_score"],
+                        "source": hit["_source"]
+                    }
+                    for hit in response["hits"]["hits"]
+                ]
+            }
+        except Exception as e:
+            logger.error(f"Error in full-text search: {e}")
+            return {
+                "total": 0,
+                "hits": []
+            }
+    
+    def vector_search(self, vector: List[float], size: int = 10) -> Dict[str, Any]:
+        """
+        Vector search
+        
+        Args:
+            vector: 1024-dimensional vector
+            size: Number of results to return
+        
+        Returns:
+            Dict[str, Any]: Search results
+        """
+        try:
+            # Validate vector dimensions
+            if len(vector) != 1024:
+                raise ValueError("Vector must be 1024-dimensional")
+            
+            search_body = {
+                "size": size,
+                "query": {
+                    "knn": {
+                        "dense_vector_1024": {
+                            "vector": vector,
+                            "k": size,
+                            "num_candidates": 100
+                        }
+                    }
+                }
+            }
+            
+            response = self.client.search(
+                index=self.index_name,
+                body=search_body
+            )
+            
+            return {
+                "total": len(response["hits"]["hits"]),
+                "hits": [
+                    {
+                        "id": hit["_id"],
+                        "score": hit["_score"],
+                        "source": hit["_source"]
+                    }
+                    for hit in response["hits"]["hits"]
+                ]
+            }
+        except Exception as e:
+            logger.error(f"Error in vector search: {e}")
+            return {
+                "total": 0,
+                "hits": []
+            }
+    
+    def hybrid_search(self, query: str, vector: List[float], size: int = 10, fields: List[str] = None) -> Dict[str, Any]:
+        """
+        Hybrid search combining full-text and vector search
+        
+        Args:
+            query: Search query
+            vector: 1024-dimensional vector
+            size: Number of results to return
+            fields: Fields to search in
+        
+        Returns:
+            Dict[str, Any]: Search results
+        """
+        try:
+            # Validate vector dimensions
+            if len(vector) != 1024:
+                raise ValueError("Vector must be 1024-dimensional")
+            
+            if fields is None:
+                fields = ["content"]
+            
+            search_body = {
+                "size": size,
+                "query": {
+                    "bool": {
+                        "should": [
+                            {
+                                "multi_match": {
+                                    "query": query,
+                                    "fields": fields,
+                                    "type": "best_fields",
+                                    "boost": 1.0
+                                }
+                            },
+                            {
+                                "knn": {
+                                    "dense_vector_1024": {
+                                        "vector": vector,
+                                        "k": size,
+                                        "num_candidates": 100,
+                                        "boost": 1.0
+                                    }
+                                }
+                            }
+                        ]
+                    }
+                }
+            }
+            
+            response = self.client.search(
+                index=self.index_name,
+                body=search_body
+            )
+            
+            return {
+                "total": response["hits"]["total"]["value"],
+                "hits": [
+                    {
+                        "id": hit["_id"],
+                        "score": hit["_score"],
+                        "source": hit["_source"]
+                    }
+                    for hit in response["hits"]["hits"]
+                ]
+            }
+        except Exception as e:
+            logger.error(f"Error in hybrid search: {e}")
+            return {
+                "total": 0,
+                "hits": []
+            }
+    
+    def search_with_filters(self, query: Optional[str] = None, vector: Optional[List[float]] = None, 
+                          filters: Dict[str, Any] = None, size: int = 10) -> Dict[str, Any]:
+        """
+        Search with filters
+        
+        Args:
+            query: Search query
+            vector: 1024-dimensional vector
+            filters: Additional filters
+            size: Number of results to return
+        
+        Returns:
+            Dict[str, Any]: Search results
+        """
+        try:
+            search_body = {
+                "size": size,
+                "query": {
+                    "bool": {
+                        "must": []
+                    }
+                }
+            }
+            
+            # Add filters
+            if filters:
+                for field, value in filters.items():
+                    search_body["query"]["bool"]["must"].append({
+                        "term": {
+                            field: value
+                        }
+                    })
+            
+            # Add search query
+            if query:
+                search_body["query"]["bool"]["must"].append({
+                    "multi_match": {
+                        "query": query,
+                        "fields": ["content"],
+                        "type": "best_fields"
+                    }
+                })
+            
+            # Add vector search
+            if vector:
+                if len(vector) != 1024:
+                    raise ValueError("Vector must be 1024-dimensional")
+                
+                search_body["query"]["bool"]["must"].append({
+                    "knn": {
+                        "dense_vector_1024": {
+                            "vector": vector,
+                            "k": size,
+                            "num_candidates": 100
+                        }
+                    }
+                })
+            
+            response = self.client.search(
+                index=self.index_name,
+                body=search_body
+            )
+            
+            return {
+                "total": response["hits"]["total"]["value"],
+                "hits": [
+                    {
+                        "id": hit["_id"],
+                        "score": hit["_score"],
+                        "source": hit["_source"]
+                    }
+                    for hit in response["hits"]["hits"]
+                ]
+            }
+        except Exception as e:
+            logger.error(f"Error in search with filters: {e}")
+            return {
+                "total": 0,
+                "hits": []
+            }
+    
+    def get_document(self, document_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get document by ID
+        
+        Args:
+            document_id: Document ID
+        
+        Returns:
+            Optional[Dict[str, Any]]: Document source
+        """
+        try:
+            response = self.client.get(
+                index=self.index_name,
+                id=document_id
+            )
+            return response["_source"]
+        except Exception as e:
+            logger.error(f"Error getting document: {e}")
+            return None
+    
+    def count_documents(self, query: Optional[Dict[str, Any]] = None) -> int:
+        """
+        Count documents matching query
+        
+        Args:
+            query: Count query
+        
+        Returns:
+            int: Document count
+        """
+        try:
+            response = self.client.count(
+                index=self.index_name,
+                body={"query": query} if query else None
+            )
+            return response["count"]
+        except Exception as e:
+            logger.error(f"Error counting documents: {e}")
+            return 0
+
+
+# Create singleton instance
+search_service = SearchService()
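A hybrid-search sketch; the zero vector stands in for a real 1024-dimensional embedding:

    from src.utils.es.services.search_service import search_service

    vec = [0.0] * 1024  # placeholder embedding
    res = search_service.hybrid_search(query="小兔子", vector=vec, size=5)
    for hit in res["hits"]:
        print(hit["score"], hit["source"]["file_name"], hit["source"]["page_number"])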

+ 221 - 0
src/utils/excel_util.py

@@ -0,0 +1,221 @@
+"""
+Excel 工具类
+
+该文件提供 Excel 解析功能,支持:
+- 解析 .xlsx 和 .xls 格式的 Excel 文件
+- 将 Excel 数据转换为 List[Dict[str, Any]] 格式
+- 支持指定工作表
+- 支持自定义表头行
+"""
+
+from typing import List, Dict, Any, Optional
+import os
+
+
+try:
+    from openpyxl import load_workbook
+    from openpyxl.worksheet.worksheet import Worksheet
+except ImportError:
+    raise ImportError("openpyxl 库未安装,请使用 pip install openpyxl 安装")
+
+try:
+    import xlrd
+    from xlrd.sheet import Sheet
+except ImportError:
+    raise ImportError("xlrd 库未安装,请使用 pip install xlrd 安装")
+
+
+class ExcelUtil:
+    """
+    Excel 工具类,用于解析 Excel 文件
+    """
+    
+    @staticmethod
+    def _parse_xlsx(file_path: str, sheet_name: Optional[str] = None, header_row: int = 0) -> List[Dict[str, Any]]:
+        """
+        解析 .xlsx 格式的 Excel 文件
+        
+        Args:
+            file_path: Excel 文件路径
+            sheet_name: 工作表名称,None 表示使用第一个工作表
+            header_row: 表头所在行索引,默认第 0 行
+            
+        Returns:
+            List[Dict[str, Any]]: 解析后的 Excel 数据
+        """
+        # 加载工作簿
+        workbook = load_workbook(filename=file_path, read_only=True)
+        
+        # 获取工作表
+        if sheet_name:
+            sheet: Worksheet = workbook[sheet_name]
+        else:
+            sheet: Worksheet = workbook.active
+        
+        # 获取所有行数据
+        rows = list(sheet.iter_rows(values_only=True))
+        
+        # 检查数据是否为空
+        if not rows:
+            return []
+        
+        # 检查表头行索引是否合法
+        if header_row >= len(rows):
+            raise ValueError(f"表头行索引 {header_row} 超出数据总行数 {len(rows)}")
+        
+        # 获取表头
+        headers = rows[header_row]
+        
+        # 检查表头是否为空
+        if not headers:
+            return []
+        
+        # 解析数据行
+        data: List[Dict[str, Any]] = []
+        for row in rows[header_row + 1:]:
+            if not row or all(cell is None for cell in row):
+                continue
+            
+            # 创建数据字典
+            row_data: Dict[str, Any] = {}
+            for i, header in enumerate(headers):
+                if i < len(row):
+                    row_data[header] = row[i]
+                else:
+                    row_data[header] = None
+            
+            data.append(row_data)
+        
+        workbook.close()
+        return data
+    
+    @staticmethod
+    def _parse_xls(file_path: str, sheet_name: Optional[str] = None, header_row: int = 0) -> List[Dict[str, Any]]:
+        """
+        Parse an Excel file in .xls format
+        
+        Args:
+            file_path: Path to the Excel file
+            sheet_name: Worksheet name; None means use the first worksheet
+            header_row: Index of the header row, defaults to row 0
+            
+        Returns:
+            List[Dict[str, Any]]: Parsed Excel data
+        """
+        # Open the workbook
+        workbook = xlrd.open_workbook(file_path)
+        
+        # Get the worksheet
+        if sheet_name:
+            sheet: Sheet = workbook.sheet_by_name(sheet_name)
+        else:
+            sheet: Sheet = workbook.sheet_by_index(0)
+        
+        # Read all rows
+        rows = []
+        for i in range(sheet.nrows):
+            rows.append(sheet.row_values(i))
+        
+        # Return early if the sheet is empty
+        if not rows:
+            return []
+        
+        # Validate the header row index
+        if header_row >= len(rows):
+            raise ValueError(f"Header row index {header_row} exceeds the total row count {len(rows)}")
+        
+        # Extract the header row
+        headers = rows[header_row]
+        
+        # Return early if the header is empty
+        if not headers:
+            return []
+        
+        # Parse the data rows (xlrd uses '' rather than None for empty cells)
+        data: List[Dict[str, Any]] = []
+        for row in rows[header_row + 1:]:
+            if not row or all(cell == '' or cell is None for cell in row):
+                continue
+            
+            # Build a dict for this row, padding missing cells with None
+            row_data: Dict[str, Any] = {}
+            for i, header in enumerate(headers):
+                if i < len(row):
+                    row_data[header] = row[i]
+                else:
+                    row_data[header] = None
+            
+            data.append(row_data)
+        
+        return data
+    
+    @staticmethod
+    def parse_excel(file_path: str, sheet_name: Optional[str] = None, header_row: int = 0) -> List[Dict[str, Any]]:
+        """
+        Parse an Excel file; both .xlsx and .xls formats are supported
+        
+        Args:
+            file_path: Path to the Excel file
+            sheet_name: Worksheet name; None means use the first worksheet
+            header_row: Index of the header row, defaults to row 0
+            
+        Returns:
+            List[Dict[str, Any]]: Parsed Excel data
+        
+        Raises:
+            ValueError: Unsupported file format
+            FileNotFoundError: File does not exist
+        """
+        # Make sure the file exists
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"File {file_path} does not exist")
+        
+        # Get the file extension
+        file_ext = os.path.splitext(file_path)[1].lower()
+        
+        # Dispatch on the file format
+        if file_ext == '.xlsx':
+            return ExcelUtil._parse_xlsx(file_path, sheet_name, header_row)
+        elif file_ext == '.xls':
+            return ExcelUtil._parse_xls(file_path, sheet_name, header_row)
+        else:
+            raise ValueError(f"Unsupported file format {file_ext}; only .xlsx and .xls are supported")
+    
+    @staticmethod
+    def get_sheet_names(file_path: str) -> List[str]:
+        """
+        List all worksheet names in an Excel file
+        
+        Args:
+            file_path: Path to the Excel file
+            
+        Returns:
+            List[str]: Worksheet names
+        
+        Raises:
+            ValueError: Unsupported file format
+            FileNotFoundError: File does not exist
+        """
+        # Make sure the file exists
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"File {file_path} does not exist")
+        
+        # Get the file extension
+        file_ext = os.path.splitext(file_path)[1].lower()
+        
+        # Dispatch on the file format
+        if file_ext == '.xlsx':
+            workbook = load_workbook(filename=file_path, read_only=True)
+            sheet_names = workbook.sheetnames
+            workbook.close()
+            return sheet_names
+        elif file_ext == '.xls':
+            workbook = xlrd.open_workbook(file_path)
+            sheet_names = workbook.sheet_names()
+            return sheet_names
+        else:
+            raise ValueError(f"Unsupported file format {file_ext}; only .xlsx and .xls are supported")
+
+
+# Module-level singleton instance
+excel_util = ExcelUtil()
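+
+# Usage sketch (illustrative only; the filename, sheet name and column names
+# below are assumptions, not part of this module):
+#
+#     rows = ExcelUtil.parse_excel("books.xlsx", sheet_name="catalog", header_row=0)
+#     for row in rows:
+#         print(row["title"], row["isbn"])
+#
+#     print(ExcelUtil.get_sheet_names("books.xlsx"))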

+ 0 - 0
src/utils/file/__init__.py


+ 8 - 0
src/utils/file/file_utils.py

@@ -0,0 +1,8 @@
+import uuid
+import os
+
+def generate_unique_filename(filename: str) -> str:
+    """生成唯一文件名,避免重复"""
+    ext = os.path.splitext(filename)[1]
+    unique_id = str(uuid.uuid4()).replace("-", "")
+    return f"{unique_id}{ext}"

+ 319 - 0
src/utils/file/image_util.py

@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""
+Image processing utilities.
+"""
+import os
+import zipfile
+import re
+from typing import List
+from io import BytesIO
+from PIL import Image
+from src.utils.file.minio.minio_util import MinIOUtil
+from src.utils.file.file_utils import generate_unique_filename
+
+
+class ImageUtil:
+    """Image processing utility class"""
+    
+    def __init__(self):
+        """Initialize the image utility with a MinIO client"""
+        self.minio_util = MinIOUtil()
+
+    # Convert an image URL into a PIL Image object
+    def _url_to_image(self, image_url: str) -> Image.Image:
+        """
+        Convert an image URL into a PIL Image object
+        
+        Args:
+            image_url: Image URL
+            
+        Returns:
+            Image.Image: The downloaded image
+        """
+        import requests
+
+        # Download image_url and open it as a PIL Image
+        if isinstance(image_url, str):
+            # Download the image (with a timeout so a dead host cannot hang us)
+            response = requests.get(image_url, timeout=30)
+            response.raise_for_status()  # raise on HTTP error status codes
+    
+            # Wrap the response body in a byte stream
+            image_bytes = BytesIO(response.content)
+    
+            # Create the Image object
+            image = Image.open(image_bytes)
+            return image
+        # Anything other than a URL string is a programming error
+        raise TypeError(f"image_url must be a str, got {type(image_url).__name__}")
+    
+    def process_image_zip(self, zip_file_path: str, book_name: str) -> List[str]:
+        """
+        Process an image zip archive: extract the images, upload them to MinIO,
+        and return the URLs sorted by page number
+        
+        Args:
+            zip_file_path: Path to the image zip archive
+            book_name: Book title, used to build the image filenames
+            
+        Returns:
+            List[str]: MinIO URLs sorted by page number
+        """
+        print(f"Processing image archive: {zip_file_path}")
+        
+        # Collected image info as (page number, url) tuples
+        image_info_list = []
+        
+        try:
+            # Open the archive
+            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+                # List every entry in the archive
+                all_files = zip_ref.namelist()
+                
+                # Keep only image files
+                image_files = [
+                    f for f in all_files 
+                    if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif')) 
+                    and not f.startswith('__MACOSX')  # skip macOS metadata entries
+                ]
+                
+                print(f"Found {len(image_files)} images")
+                
+                # Process each image file
+                for image_file in image_files:
+                    try:
+                        # Extract the page number from the filename
+                        page_num = self._extract_page_number(image_file)
+                        
+                        # Build the target filename as <book>_P<page>, e.g. 莉莉兰的小虫虫_P1.png
+                        _, ext = os.path.splitext(image_file)
+                        new_filename = f"{book_name}_P{page_num}{ext}"
+                        
+                        print(f"Processing image: {image_file} -> {new_filename}, page: {page_num}")
+                        
+                        # Read the image bytes from the archive
+                        with zip_ref.open(image_file) as f:
+                            image_content = f.read()
+                            
+                        # Wrap the bytes in a BytesIO stream
+                        image_stream = BytesIO(image_content)
+                        
+                        # Compress the image
+                        compressed_stream = self._compress_image(image_stream, new_filename)
+                        
+                        # Upload to MinIO
+                        image_url = self.minio_util.upload_file(compressed_stream, new_filename)
+                        
+                        # Record the result
+                        image_info_list.append((page_num, image_url))
+                        
+                    except Exception as e:
+                        print(f"Failed to process image {image_file}: {str(e)}")
+                        continue
+            
+            # Sort by page number
+            image_info_list.sort(key=lambda x: x[0])
+            
+            # Extract the URL list
+            image_urls = [url for _, url in image_info_list]
+            
+            print(f"Archive processed; {len(image_urls)} images handled")
+            return image_urls
+            
+        except Exception as e:
+            print(f"Failed to process image archive: {str(e)}")
+            raise
+    
+    def _compress_image(self, image_stream: BytesIO, original_filename: str, max_size_kb: int = 5000) -> BytesIO:
+        """
+        Compress an image so the result stays below max_size_kb, combining
+        dimension reduction with quality reduction
+        
+        Args:
+            image_stream: Image stream
+            original_filename: Original filename (for logging)
+            max_size_kb: Maximum file size in KB
+            
+        Returns:
+            BytesIO: Compressed image stream
+        """
+        # Measure the current size
+        image_stream.seek(0, 2)  # move to the end of the stream
+        current_size = image_stream.tell() / 1024  # current size in KB
+        original_stream_data = image_stream.getvalue()  # keep the original bytes
+        image_stream.seek(0)  # rewind to the start
+        
+        # Use max_size_kb as the hard target size
+        target_size = max_size_kb
+        
+        # Nothing to do if the image is already small enough
+        if current_size <= target_size:
+            return image_stream
+        
+        # Open the image
+        img = Image.open(image_stream)
+        original_width, original_height = img.size
+        
+        # Remember the original format (the quality option mainly affects JPEG output)
+        original_format = img.format or 'JPEG'  # default to JPEG
+        
+        # Use the high-quality LANCZOS resampling filter
+        resample_method = Image.Resampling.LANCZOS
+        
+        # Track the best (smallest) result seen so far
+        best_result = None
+        best_size = float('inf')
+        
+        # Helper: compress at a given size and quality, returning (size_kb, bytes)
+        def get_compressed_data(width, height, quality_val):
+            """
+            Compress the image at the given dimensions and quality
+            """
+            # Resize the image
+            resized_img = img.resize((width, height), resample_method)
+            
+            # Save the resized image
+            compressed_stream = BytesIO()
+            resized_img.save(compressed_stream, format=original_format, quality=quality_val)
+            compressed_stream.seek(0, 2)
+            compressed_size = compressed_stream.tell() / 1024
+            compressed_stream.seek(0)
+            
+            return compressed_size, compressed_stream.getvalue()
+        
+        # Main strategy: progressively shrink dimensions and lower quality until
+        # the target is met, with dimension reduction doing most of the work
+        sizes_to_try = []
+        
+        # Build the candidate sizes (start at the original, shrink step by step)
+        current_try_width, current_try_height = original_width, original_height
+        for i in range(15):  # try at most 15 sizes
+            sizes_to_try.append((current_try_width, current_try_height))
+            # shrink by 10% each step
+            current_try_width = int(current_try_width * 0.9)
+            current_try_height = int(current_try_height * 0.9)
+            # never go below 30% of the original dimensions
+            if current_try_width < original_width * 0.3 or current_try_height < original_height * 0.3:
+                break
+        
+        # Quality levels, from high to low
+        quality_levels = [90, 85, 80, 75, 70, 65, 60]
+        
+        # Try every size/quality combination, keeping the best result
+        for width, height in sizes_to_try:
+            for quality in quality_levels:
+                # Compress with the current parameters
+                compressed_size, compressed_data = get_compressed_data(width, height, quality)
+                
+                # Update the best result
+                if compressed_size < best_size:
+                    best_result = compressed_data
+                    best_size = compressed_size
+                    
+                # Return immediately once the target size is met
+                if compressed_size <= target_size:
+                    final_stream = BytesIO(compressed_data)
+                    final_stream.seek(0)
+                    return final_stream
+        
+        # If no result was produced at all, fall back to the original image
+        if best_result is None:
+            # Return the original image
+            return BytesIO(original_stream_data)
+        
+        # Final check: if the best result still exceeds the target, try the most
+        # aggressive settings
+        final_stream = BytesIO(best_result)
+        final_stream.seek(0, 2)
+        final_size = final_stream.tell() / 1024
+        final_stream.seek(0)
+        
+        if final_size > target_size:
+            # Most aggressive compression parameters
+            aggressive_width = int(original_width * 0.5)
+            aggressive_height = int(original_height * 0.5)
+            aggressive_quality = 50
+            
+            aggressive_size, aggressive_data = get_compressed_data(aggressive_width, aggressive_height, aggressive_quality)
+            
+            if aggressive_size < final_size:
+                final_stream = BytesIO(aggressive_data)
+                final_stream.seek(0)
+        
+        return final_stream
+    
+    def _compress_image_to_bytes(self, image_stream: BytesIO, max_size_kb: int = 5000) -> bytes:
+        """
+        Compress an image larger than max_size_kb down below that limit and
+        return the resulting bytes
+        
+        Args:
+            image_stream: Image stream
+            max_size_kb: Maximum file size in KB
+            
+        Returns:
+            bytes: Compressed image bytes
+        """
+        # Placeholder filename used only for logging
+        temp_filename = "temp_image"
+        
+        # Reuse the stream-based compression method
+        compressed_stream = self._compress_image(image_stream, temp_filename, max_size_kb)
+        
+        # Read out the bytes
+        compressed_bytes = compressed_stream.getvalue()
+        print(f"Image converted to bytes, size {len(compressed_bytes)} bytes")
+        
+        return compressed_bytes
+    
+    def compress_image_bytes(self, image_bytes: bytes, max_size_kb: int = 5000) -> bytes:
+        """
+        Compress an image larger than max_size_kb down below that limit and
+        return the resulting bytes
+        
+        Args:
+            image_bytes: Image bytes
+            max_size_kb: Maximum file size in KB
+            
+        Returns:
+            bytes: Compressed image bytes
+        """
+        print(f"Compressing image, original size {len(image_bytes) / 1024:.2f}KB")
+        
+        # Wrap the bytes in a BytesIO object
+        image_stream = BytesIO(image_bytes)
+        
+        # Delegate to the stream-based method
+        compressed_bytes = self._compress_image_to_bytes(image_stream, max_size_kb)
+        
+        print(f"Image compressed, new size {len(compressed_bytes) / 1024:.2f}KB")
+        
+        return compressed_bytes
+    
+    def _extract_page_number(self, filename: str) -> int:
+        """
+        Extract the page number from a filename
+        
+        Args:
+            filename: Filename
+            
+        Returns:
+            int: Page number
+        """
+        # Strip the directory part
+        basename = os.path.basename(filename)
+        
+        # Match common page-number patterns such as P1, Page1, 001, 1
+        patterns = [
+            r'P(\d+)',  # P1, P123
+            r'Page(\d+)',  # Page1, Page123
+            r'(\d+)\.(?:png|jpg|jpeg|gif)$',  # 1.png, 123.jpg
+            r'(\d+)_',  # 1_, 123_
+        ]
+        
+        for pattern in patterns:
+            match = re.search(pattern, basename, re.IGNORECASE)
+            if match:
+                return int(match.group(1))
+        
+        # Default to 0 when no page number can be found
+        return 0
+
+
+# Module-level singleton instance
+image_util = ImageUtil()
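+
+# Usage sketch (illustrative; the zip path and book name are placeholders, and
+# uploads only succeed with a reachable MinIO configured in minio_settings):
+#
+#     urls = image_util.process_image_zip("/tmp/book_pages.zip", "my_book")
+#     print(urls)  # MinIO URLs in page order
+#
+#     with open("/tmp/page.png", "rb") as f:
+#         small = image_util.compress_image_bytes(f.read(), max_size_kb=1024)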

+ 0 - 0
src/utils/file/minio/__init__.py


+ 189 - 0
src/utils/file/minio/minio_util.py

@@ -0,0 +1,189 @@
+from minio import Minio
+from typing import BinaryIO
+from datetime import timedelta
+from src.conf.settings import minio_settings
+from src.utils.file.file_utils import generate_unique_filename
+
+# Global MinIO client instance
+_global_minio_client = None
+
+class MinIOUtil:
+    def __init__(self, bucket_name: str = minio_settings.minio_bucket_name, check_bucket=False):
+        self.client = Minio(
+            endpoint=minio_settings.minio_endpoint,
+            access_key=minio_settings.minio_access_key,
+            secret_key=minio_settings.minio_secret_key,
+            secure=False
+        )
+        self.bucket_name = bucket_name
+        
+        # Only verify the bucket when explicitly requested
+        if check_bucket:
+            self._ensure_bucket_exists()
+
+    def _ensure_bucket_exists(self):
+        """Ensure the bucket exists, creating it if necessary"""
+        try:
+            if not self.client.bucket_exists(self.bucket_name):
+                self.client.make_bucket(self.bucket_name)
+                print(f"Bucket '{self.bucket_name}' created successfully.")
+            else:
+                print(f"Bucket '{self.bucket_name}' already exists.")
+        except Exception as e:
+            raise RuntimeError(f"Failed to create bucket: {e}")
+
+    def close(self):
+        """Close the MinIO client connection"""
+        # The MinIO client needs no explicit close; this method exists for a uniform interface
+        pass
+
+    def upload_file(self, file: BinaryIO, original_filename: str) -> str:
+        """Upload a file and return its URL"""
+        try:
+            # Generate a unique filename to avoid collisions
+            unique_filename = generate_unique_filename(original_filename)
+            content_type = self._get_content_type(original_filename)
+            
+            # Determine the payload length
+            if hasattr(file, 'getbuffer'):
+                # BytesIO objects expose their buffer size
+                length = file.getbuffer().nbytes
+            elif hasattr(file, 'tell') and hasattr(file, 'seek'):
+                # Seekable file objects: measure by seeking to the end
+                current_pos = file.tell()
+                file.seek(0, 2)  # move to the end
+                length = file.tell()
+                file.seek(current_pos)  # restore the original position
+            else:
+                # Otherwise pass -1 and let MinIO handle it
+                length = -1
+            
+            # Upload the file (large files are uploaded in chunks)
+            self.client.put_object(
+                bucket_name=self.bucket_name,
+                object_name=unique_filename,
+                data=file,
+                length=length,
+                content_type=content_type
+            )
+            
+            # Generate a presigned URL (expiry can be adjusted, or private access used instead)
+            url = self.client.get_presigned_url(
+                method="GET",
+                bucket_name=self.bucket_name,
+                object_name=unique_filename,
+                expires=timedelta(seconds=3600)  # valid for 1 hour (can be longer or shorter)
+            )
+            
+            return url
+        except Exception as e:
+            raise RuntimeError(f"File upload failed: {e}")
+
+    def custom_upload_file(self, file: BinaryIO, original_filename: str, bucket_name: str = None) -> str:
+        """Upload a file to a specific bucket and return its URL"""
+        try:
+            if bucket_name is None:
+                bucket_name = self.bucket_name
+            # Generate a unique filename to avoid collisions
+            unique_filename = generate_unique_filename(original_filename)
+            content_type = self._get_content_type(original_filename)
+            
+            # Determine the payload length
+            if hasattr(file, 'getbuffer'):
+                # BytesIO objects expose their buffer size
+                length = file.getbuffer().nbytes
+            elif hasattr(file, 'tell') and hasattr(file, 'seek'):
+                # Seekable file objects: measure by seeking to the end
+                current_pos = file.tell()
+                file.seek(0, 2)  # move to the end
+                length = file.tell()
+                file.seek(current_pos)  # restore the original position
+            else:
+                # Otherwise pass -1 and let MinIO handle it
+                length = -1
+            
+            # Upload the file (large files are uploaded in chunks)
+            self.client.put_object(
+                bucket_name=bucket_name,
+                object_name=unique_filename,
+                data=file,
+                length=length,
+                content_type=content_type
+            )
+            
+            # Return a direct object URL; for private buckets a presigned URL
+            # (as in upload_file above) could be generated instead
+            url = f"{minio_settings.minio_endpoint}/{bucket_name}/{unique_filename}"
+            
+            return url
+        except Exception as e:
+            raise RuntimeError(f"File upload failed: {e}")
+
+    def download_file(self, object_name: str) -> BinaryIO:
+        """Download a file and return the response stream"""
+        try:
+            response = self.client.get_object(
+                bucket_name=self.bucket_name,
+                object_name=object_name
+            )
+            return response
+        except Exception as e:
+            raise RuntimeError(f"File download failed: {e}")
+
+    def delete_file(self, object_name: str) -> bool:
+        """Delete a file"""
+        try:
+            self.client.remove_object(
+                bucket_name=self.bucket_name,
+                object_name=object_name
+            )
+            return True
+        except Exception as e:
+            print(f"Delete failed: {e}")
+            return False
+
+    def _get_content_type(self, filename: str) -> str:
+        """Infer the MIME type from the file extension"""
+        ext = filename.split('.')[-1].lower()
+        mime_map = {
+            'jpg': 'image/jpeg', 'jpeg': 'image/jpeg',
+            'png': 'image/png', 'gif': 'image/gif',
+            'pdf': 'application/pdf',
+            'txt': 'text/plain',
+            'mp4': 'video/mp4',
+            'mp3': 'audio/mpeg'
+        }
+        return mime_map.get(ext, 'application/octet-stream')
+
+
+def get_minio_client() -> MinIOUtil:
+    """Get the global MinIO client instance"""
+    global _global_minio_client
+    if _global_minio_client is None:
+        raise RuntimeError("MinIO client has not been initialized. Call init_minio_client() first.")
+    return _global_minio_client
+
+
+def init_minio_client(check_bucket=False) -> None:
+    """Initialize the global MinIO client
+    
+    Args:
+        check_bucket: Whether to verify the bucket during initialization
+    """
+    global _global_minio_client
+    if _global_minio_client is None:
+        _global_minio_client = MinIOUtil(check_bucket=check_bucket)
+
+
+def close_minio_client() -> None:
+    """Close the global MinIO client"""
+    global _global_minio_client
+    if _global_minio_client is not None:
+        _global_minio_client.close()
+        _global_minio_client = None
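+
+# Lifecycle sketch (illustrative; the payload and filename are placeholders):
+# initialize the global client once at startup, reuse it everywhere, and close
+# it on shutdown.
+#
+#     from io import BytesIO
+#     init_minio_client(check_bucket=True)
+#     url = get_minio_client().upload_file(BytesIO(b"hello"), "hello.txt")
+#     close_minio_client()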

+ 383 - 0
src/utils/http_client.py

@@ -0,0 +1,383 @@
+import requests
+import logging
+import os
+import json
+import base64
+from typing import Dict, Any, Optional
+from urllib3.util.retry import Retry
+from requests.adapters import HTTPAdapter
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class HTTPClient:
+    """HTTP request utility class for issuing all kinds of HTTP requests"""
+    
+    def __init__(self, base_url: str, api_key: str = None, auth_type: str = 'bearer'):
+        """
+        Initialize the HTTP client
+        
+        Args:
+            base_url: Base API URL
+            api_key: API key; for 'basic' auth pass "username:password"
+            auth_type: Authentication type, 'bearer' or 'basic'
+        """
+        self.base_url = base_url.rstrip('/')
+        self.api_key = api_key
+        self.session = requests.Session()
+        
+        # Request timeout (seconds)
+        self.timeout = 30
+        
+        # Configure the retry policy
+        retry_strategy = Retry(
+            total=3,  # maximum number of retries
+            backoff_factor=1,  # retry delays: 1s, 2s, 4s
+            status_forcelist=[502, 503, 504],  # status codes to retry (500 excluded to avoid pointless retries)
+            allowed_methods=["GET", "POST", "PUT", "DELETE"]  # methods eligible for retry
+        )
+        
+        # Create an HTTP adapter with the retry policy
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        
+        # Apply the adapter to all requests
+        self.session.mount("http://", adapter)
+        self.session.mount("https://", adapter)
+        
+        # Set the default request headers
+        if self.api_key:
+            if auth_type == 'bearer':
+                self.session.headers.update({
+                    'Authorization': f'Bearer {self.api_key}'
+                })
+            elif auth_type == 'basic':
+                # Basic auth requires base64("username:password"), so encode
+                # the raw credentials before building the header
+                encoded = base64.b64encode(self.api_key.encode('utf-8')).decode('ascii')
+                self.session.headers.update({
+                    'Authorization': f'Basic {encoded}'
+                })
+        
+        self.session.headers.update({
+            'Content-Type': 'application/json'
+        })
+    
+    def post(self, endpoint: str, data: Optional[Dict] = None, 
+             json_data: Optional[Dict] = None, files: Optional[Dict] = None,
+             headers: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Send a POST request
+        
+        Args:
+            endpoint: API endpoint path (starting with /)
+            data: Form data
+            json_data: JSON payload
+            files: File payload
+            headers: Custom request headers
+            
+        Returns:
+            Dict: Response JSON data
+        
+        Raises:
+            requests.exceptions.RequestException: Raised when the request fails
+        """
+        url = f"{self.base_url}{endpoint}"
+        
+        # Log the request
+        request_info = {
+            "method": "POST",
+            "url": url,
+            "data": data,
+            "json": json_data,
+            "headers": headers,
+            "files": files is not None  # log only whether files are present, not their content
+        }
+        logger.info(f"Sending request: {request_info}")
+        
+        try:
+            # When uploading files, drop the default Content-Type: application/json
+            # header so requests can generate the correct multipart/form-data header
+            if files:
+                # Create a temporary session without the default Content-Type header
+                temp_session = requests.Session()
+                
+                # Copy the authentication header
+                if self.api_key:
+                    auth_header = self.session.headers.get('Authorization')
+                    if auth_header:
+                        temp_session.headers.update({'Authorization': auth_header})
+                
+                # Send the request through the temporary session
+                response = temp_session.post(
+                    url=url,
+                    data=data,
+                    json=json_data,
+                    files=files,
+                    headers=headers,
+                    timeout=self.timeout  # apply the timeout
+                )
+            else:
+                # Normal request through the default session
+                response = self.session.post(
+                    url=url,
+                    data=data,
+                    json=json_data,
+                    headers=headers,
+                    timeout=self.timeout  # apply the timeout
+                )
+            
+            # Log the response
+            response_info = {
+                "status_code": response.status_code,
+                "url": url,
+                "headers": dict(response.headers),
+                "content_length": len(response.content)
+            }
+            logger.info(f"Received response: {response_info}")
+            
+            response.raise_for_status()  # raise on HTTP errors
+            return response.json()
+        except Exception as e:
+            # Log the failure
+            logger.error(f"Request failed: {str(e)}")
+            raise
+    
+    def get(self, endpoint: str, params: Optional[Dict] = None,
+            headers: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Send a GET request
+        
+        Args:
+            endpoint: API endpoint path (starting with /)
+            params: Query parameters
+            headers: Custom request headers
+            
+        Returns:
+            Dict: Response JSON data
+        
+        Raises:
+            requests.exceptions.RequestException: Raised when the request fails
+        """
+        url = f"{self.base_url}{endpoint}"
+        
+        # Log the request
+        request_info = {
+            "method": "GET",
+            "url": url,
+            "params": params,
+            "headers": headers
+        }
+        logger.info(f"Sending request: {request_info}")
+        
+        try:
+            response = self.session.get(
+                url=url,
+                params=params,
+                headers=headers,
+                timeout=self.timeout  # apply the timeout
+            )
+            
+            # Log the response
+            response_info = {
+                "status_code": response.status_code,
+                "url": url,
+                "headers": dict(response.headers),
+                "content_length": len(response.content)
+            }
+            logger.info(f"Received response: {response_info}")
+            
+            response.raise_for_status()  # raise on HTTP errors
+            return response.json()
+        except Exception as e:
+            # Log the failure
+            logger.error(f"Request failed: {str(e)}")
+            raise
+    
+    def get_json(self, endpoint: str, json_data: Optional[Dict] = None,
+                headers: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Send a GET request carrying a JSON body (some APIs expect this)
+        
+        Args:
+            endpoint: API endpoint path (starting with /)
+            json_data: JSON payload
+            headers: Custom request headers
+            
+        Returns:
+            Dict: Response JSON data
+        
+        Raises:
+            requests.exceptions.RequestException: Raised when the request fails
+        """
+        url = f"{self.base_url}{endpoint}"
+        
+        # Log the request
+        request_info = {
+            "method": "GET",
+            "url": url,
+            "json": json_data,
+            "headers": headers
+        }
+        logger.info(f"Sending request: {request_info}")
+        
+        try:
+            response = self.session.get(
+                url=url,
+                json=json_data,
+                headers=headers,
+                timeout=self.timeout  # apply the timeout
+            )
+            
+            # Log the response
+            response_info = {
+                "status_code": response.status_code,
+                "url": url,
+                "headers": dict(response.headers),
+                "content_length": len(response.content)
+            }
+            logger.info(f"Received response: {response_info}")
+            
+            response.raise_for_status()  # raise on HTTP errors
+            # Decode response.content as JSON
+            return json.loads(response.content.decode('utf-8'))
+        except Exception as e:
+            # Log the failure
+            logger.error(f"Request failed: {str(e)}")
+            raise
+    
+    def put(self, endpoint: str, data: Optional[Dict] = None, 
+            json_data: Optional[Dict] = None, headers: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Send a PUT request
+        
+        Args:
+            endpoint: API endpoint path (starting with /)
+            data: Form data
+            json_data: JSON payload
+            headers: Custom request headers
+            
+        Returns:
+            Dict: Response JSON data
+        
+        Raises:
+            requests.exceptions.RequestException: Raised when the request fails
+        """
+        url = f"{self.base_url}{endpoint}"
+        
+        # Log the request
+        request_info = {
+            "method": "PUT",
+            "url": url,
+            "data": data,
+            "json": json_data,
+            "headers": headers
+        }
+        logger.info(f"Sending request: {request_info}")
+        
+        try:
+            response = self.session.put(
+                url=url,
+                data=data,
+                json=json_data,
+                headers=headers,
+                timeout=self.timeout  # apply the timeout
+            )
+            
+            # Log the response
+            response_info = {
+                "status_code": response.status_code,
+                "url": url,
+                "headers": dict(response.headers),
+                "content_length": len(response.content)
+            }
+            logger.info(f"Received response: {response_info}")
+            
+            response.raise_for_status()  # raise on HTTP errors
+            return response.json()
+        except Exception as e:
+            # Log the failure
+            logger.error(f"Request failed: {str(e)}")
+            raise
+    
+    def delete(self, endpoint: str, data: Optional[Dict] = None, 
+               json_data: Optional[Dict] = None, headers: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Send a DELETE request
+        
+        Args:
+            endpoint: API endpoint path (starting with /)
+            data: Form data
+            json_data: JSON payload
+            headers: Custom request headers
+            
+        Returns:
+            Dict: Response JSON data
+        
+        Raises:
+            requests.exceptions.RequestException: Raised when the request fails
+        """
+        url = f"{self.base_url}{endpoint}"
+        
+        # Log the request
+        request_info = {
+            "method": "DELETE",
+            "url": url,
+            "data": data,
+            "json": json_data,
+            "headers": headers
+        }
+        logger.info(f"Sending request: {request_info}")
+        
+        try:
+            response = self.session.delete(
+                url=url,
+                data=data,
+                json=json_data,
+                headers=headers,
+                timeout=self.timeout  # apply the timeout
+            )
+            
+            # Log the response
+            response_info = {
+                "status_code": response.status_code,
+                "url": url,
+                "headers": dict(response.headers),
+                "content_length": len(response.content)
+            }
+            logger.info(f"Received response: {response_info}")
+            
+            response.raise_for_status()  # raise on HTTP errors
+            return response.json()
+        except Exception as e:
+            # Log the failure
+            logger.error(f"Request failed: {str(e)}")
+            raise
+    
+    def upload_file(self, endpoint: str, file_path: str, file_field_name: str = 'file',
+                   data: Optional[Dict] = None, headers: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Upload a file
+        
+        Args:
+            endpoint: API endpoint path (starting with /)
+            file_path: Local file path
+            file_field_name: Form field name for the file
+            data: Additional form data
+            headers: Custom request headers
+            
+        Returns:
+            Dict: Response JSON data
+        
+        Raises:
+            requests.exceptions.RequestException: Raised when the request fails
+        """
+        # Open the file and build the files dict
+        with open(file_path, 'rb') as f:
+            files = {
+                file_field_name: (os.path.basename(file_path), f)
+            }
+            
+            # Send the POST request
+            return self.post(endpoint, data=data, files=files, headers=headers)
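+
+# Usage sketch (illustrative; the base URL, endpoints and key are placeholders):
+#
+#     client = HTTPClient("https://api.example.com", api_key="sk-xxx")
+#     created = client.post("/v1/items", json_data={"name": "demo"})
+#     items = client.get("/v1/items", params={"page": 1})
+#     uploaded = client.upload_file("/v1/files", "./report.pdf")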

+ 527 - 0
src/utils/infinity/README.md

@@ -0,0 +1,527 @@
+# Infinity Python API Client (with Connection Pooling)
+
+This is a Python API client for the Infinity database with a built-in connection pool, designed to keep availability high under heavy concurrency.
+
+## Design Highlights
+
+1. **Concurrent connection management**: supports concurrent access from multiple threads
+2. **Automatic connection recycling and reuse**: avoids repeatedly creating and closing connections
+3. **Connection timeouts and heartbeat checks**: automatically detects and removes dead connections
+4. **Dynamic pool sizing**: adjusts the number of connections to the load
+5. **Thread safety**: every operation is thread-safe
+6. **Simple API**: offers a clean, easy-to-use interface for database operations
+
+## Directory Layout
+
+```
+└── utils/infinity/
+    ├── __init__.py    # Main entry point; imports and re-exports all components
+    ├── client.py      # Client implementation with the database operations
+    ├── pool.py        # Connection pool implementation with connection management and heartbeats
+    ├── README.md      # Documentation
+    └── test_infinity.py  # Test script
+```
+
+## Installing Dependencies
+
+```bash
+# Install the Infinity Python SDK
+pip install infinity-sdk
+```
+
+## Quick Start
+
+### 1. Basic Usage
+
+```python
+from utils.infinity import InfinityClient
+
+# Create a client instance
+client = InfinityClient(
+    host="192.168.16.134",
+    port="23817",
+    database="default_db",
+    min_connections=2,
+    max_connections=10
+)
+
+# List all databases
+databases = client.get_databases()
+print(f"Databases: {databases}")
+
+# List all tables in a given database
+tables = client.get_tables(database_name="image_db")
+print(f"Tables: {tables}")
+
+# Close the client
+client.close()
+```
+
+### 2. Using the Global Client (Singleton)
+
+```python
+from utils.infinity import get_client, close_client
+
+# Get the global client instance
+client = get_client()
+
+# Use the client
+databases = client.get_databases()
+print(f"Databases: {databases}")
+
+# Close the global client
+close_client()
+```
+
+### 3. Using a Connection Context
+
+The connection context manager is the safest and most efficient way to work with connections: it guarantees the connection is returned to the pool when you are done, preventing connection leaks.
+
+#### Full Example
+
+```python
+from utils.infinity import InfinityClient
+
+def main():
+    # Create a client instance
+    client = InfinityClient(database="test_db")
+    
+    try:
+        # Acquire a connection through the context manager
+        with client.get_connection() as conn:
+            print("=== Connection context example ===")
+            
+            # 1. List all databases
+            databases = conn.get_databases()
+            print(f"1. All databases: {databases}")
+            
+            # 2. List all tables in the current database
+            tables = conn.get_tables()
+            print(f"2. Tables in current database: {tables}")
+            
+            # 3. Create a new table (if it does not exist)
+            table_name = "example_table"
+            print(f"3. Creating table: {table_name}")
+            
+            # Define the table schema
+            fields = [
+                {"name": "id", "type": "INT", "is_primary_key": True},
+                {"name": "name", "type": "VARCHAR(100)"},
+                {"name": "value", "type": "FLOAT"}
+            ]
+            
+            # Drop the table first if it already exists
+            if table_name in tables:
+                conn.drop_table(table_name)
+                print(f"   - Table {table_name} already existed and was dropped")
+            
+            # Create the new table
+            conn.create_table(table_name, fields)
+            print(f"   - Table {table_name} created")
+            
+            # 4. Insert data
+            print(f"4. Inserting data into {table_name}")
+            
+            # Rows to insert
+            documents = [
+                {"id": 1, "name": "sample 1", "value": 10.5},
+                {"id": 2, "name": "sample 2", "value": 20.3},
+                {"id": 3, "name": "sample 3", "value": 15.7}
+            ]
+            
+            # Perform the insert
+            conn.insert(table_name, documents)
+            print(f"   - Inserted {len(documents)} rows")
+            
+            # 5. Run a full-text search
+            print(f"5. Querying data in {table_name}")
+            
+            # Build the query (key names follow InfinityClient.search)
+            search_query = {
+                "match_field": "name",
+                "matching_text": "sample",
+                "topn": 2
+            }
+            
+            # Execute the query
+            result = conn.search(table_name, ["id", "name", "value"], search_query)
+            print(f"   - Query result: {result}")
+            
+            # 6. Run a vector search (illustrative; requires a vector column)
+            print(f"6. Running a vector search (example)")
+            
+            try:
+                # Note: this only demonstrates the API usage; in practice the table needs a vector column
+                vector_query = {
+                    "vector_field": "vector",  # assumes a vector column exists
+                    "query_vector": [0.1, 0.2, 0.3],
+                    "topn": 2
+                }
+                vector_result = conn.vector_search(table_name, ["id", "name", "value"], vector_query)
+                print(f"   - Vector search result: {vector_result}")
+            except Exception as e:
+                print(f"   - Vector search example failed (expected, since the table has no vector column): {e}")
+            
+            # 7. List the tables again
+            updated_tables = conn.get_tables()
+            print(f"7. Updated table list: {updated_tables}")
+            
+            print("\n✅ All operations completed")
+        
+        # 8. The connection has been returned to the pool automatically
+        print("\n✅ Connection returned to the pool automatically")
+        
+    except Exception as e:
+        print(f"\n❌ Operation failed: {e}")
+    finally:
+        # Close the client
+        client.close()
+        print("✅ Client closed")
+
+if __name__ == "__main__":
+    main()
+```
+
+#### Multi-Operation Context Example
+
+```python
+from utils.infinity import InfinityClient
+
+# Create a client instance
+client = InfinityClient(database="test_db")
+
+# Example: run several related operations in a single connection context
+try:
+    with client.get_connection() as conn:
+        # Step 1: create a table
+        conn.create_table("temp_table", [
+            {"name": "id", "type": "INT", "is_primary_key": True},
+            {"name": "data", "type": "VARCHAR(255)"}
+        ])
+        
+        # Step 2: insert data
+        conn.insert("temp_table", [{"id": 1, "data": "test1"}, {"id": 2, "data": "test2"}])
+        
+        # Step 3: query the data (key names follow InfinityClient.search)
+        search_query = {
+            "match_field": "data",
+            "matching_text": "test",
+            "topn": 2
+        }
+        result = conn.search("temp_table", ["id", "data"], search_query)
+        print(f"Query result: {result}")
+        
+        # Step 4: drop the table
+        conn.drop_table("temp_table")
+        
+        print("✅ All related operations completed within one connection context")
+except Exception as e:
+    print(f"❌ Operation failed: {e}")
+finally:
+    client.close()
+```
+
+#### Exception-Handling Context Example
+
+```python
+from utils.infinity import InfinityClient
+
+# Create a client instance
+client = InfinityClient(database="test_db")
+
+try:
+    with client.get_connection() as conn:
+        # Run an operation that may fail
+        conn.create_table("error_table", [
+            {"name": "id", "type": "INT", "is_primary_key": True},
+            {"name": "invalid_field", "type": "INVALID_TYPE"}  # invalid column type
+        ])
+        
+    # Note: if the operation above fails, execution never reaches this line
+    print("✅ Operation succeeded")
+except Exception as e:
+    # Catch and handle the exception
+    print(f"❌ Operation failed, exception caught: {e}")
+finally:
+    # The client is closed whether or not the operation succeeded
+    client.close()
+    print("✅ Client closed")
+```
+
+### 4. Search Example
+
+```python
+from utils.infinity import InfinityClient
+
+client = InfinityClient(database="image_db")
+
+# Run a full-text search (key names follow InfinityClient.search)
+result = client.search(
+    table_name="pdf_documents_table",
+    output_fields=["id", "title", "content"],
+    query={
+        "match_field": "content",
+        "matching_text": "children picture book",
+        "topn": 5
+    }
+)
+
+print(f"Search result: {result}")
+
+client.close()
+```
+
+### 5. Hybrid Search Example
+
+```python
+from utils.infinity import InfinityClient
+
+client = InfinityClient(database="image_db")
+
+# Run a hybrid (dense + full-text) search; results are fused with RRF
+result = client.hybrid_search(
+    table_name="pdf_documents_table",
+    output_fields=["id", "title", "content", "score"],
+    query={
+        "vector_field": "dense_vector_1024",
+        "query_vector": [0.1, 0.2, 0.3, ...],  # the actual query vector
+        "match_field": "content",
+        "matching_text": "children picture book",
+        "topn": 5,
+        "knn_params": {"ef": "64"}  # index-dependent search parameters
+    }
+)
+
+print(f"Hybrid search result: {result}")
+
+client.close()
+```
+
+### 6. Vector Search Example
+
+```python
+from utils.infinity import InfinityClient
+
+client = InfinityClient(database="image_db")
+
+# Run a vector search
+result = client.vector_search(
+    table_name="pdf_documents_table",
+    output_fields=["id", "title", "content", "score"],
+    query={
+        "vector_field": "dense_vector_1024",
+        "query_vector": [0.1, 0.2, 0.3, ...],  # the actual query vector
+        "topn": 5,
+        "knn_params": {"ef": "64"}  # index-dependent search parameters
+    }
+)
+
+print(f"Vector search result: {result}")
+
+client.close()
+```
+
+## Core Features
+
+### Database Operations
+
+- `get_databases()`: list all databases
+- `create_database(database_name)`: create a database
+- `drop_database(database_name)`: drop a database
+- `use_database(database_name)`: switch the default database
+
+### Table Operations
+
+- `get_tables(database_name=None)`: list all tables
+- `create_table(table_name, fields, database_name=None)`: create a table
+- `drop_table(table_name, database_name=None)`: drop a table
+
+### Document Operations
+
+- `insert(table_name, documents, database_name=None)`: insert documents
+
+### Search Operations
+
+- `search(table_name, output_fields, query, database_name=None)`: full-text search
+  - `table_name`: table name
+  - `output_fields`: list of fields to return
+  - `query`: query dict with `match_field`, `matching_text` and `topn` keys
+  - `database_name`: database name (optional; defaults to the client's configured database)
+
+- `hybrid_search(table_name, output_fields, query, database_name=None)`: hybrid search
+  - `table_name`: table name
+  - `output_fields`: list of fields to return
+  - `query`: query dict with `vector_field`, `query_vector`, `match_field`, `matching_text`, `topn` and `knn_params` keys
+  - `database_name`: database name (optional; defaults to the client's configured database)
+
+- `vector_search(table_name, output_fields, query, database_name=None)`: vector search
+  - `table_name`: table name
+  - `output_fields`: list of fields to return
+  - `query`: query dict with `vector_field`, `query_vector`, `topn` and `knn_params` keys
+  - `database_name`: database name (optional; defaults to the client's configured database)
+
+### Connection Pool Management
+
+- `get_status()`: get the connection pool status
+- `close()`: close all connections
+
+## Configuration Parameters
+
+### InfinityClient Configuration
+
+| Parameter | Type | Default | Description |
+|------|------|--------|------|
+| host | str | "192.168.16.134" | Infinity server address |
+| port | str | "23817" | Infinity server port |
+| database | str | "default_db" | Database name |
+| min_connections | int | 2 | Minimum number of connections |
+| max_connections | int | 10 | Maximum number of connections |
+
+### InfinityConnectionPool Configuration
+
+| Parameter | Type | Default | Description |
+|------|------|--------|------|
+| host | str | "192.168.16.134" | Infinity server address |
+| port | str | "23817" | Infinity server port |
+| database | str | "default_db" | Database name |
+| min_connections | int | 2 | Minimum number of connections |
+| max_connections | int | 10 | Maximum number of connections |
+| connection_timeout | int | 30 | Connection timeout (seconds) |
+| idle_timeout | int | 300 | Idle connection timeout (seconds) |
+| heartbeat_interval | int | 60 | Heartbeat check interval (seconds) |
+
+## Connection Pool Status
+
+Use the `get_status()` method to inspect the connection pool:
+
+```python
+from utils.infinity import InfinityClient
+
+client = InfinityClient()
+status = client.get_status()
+print(f"Connection pool status: {status}")
+```
+
+Sample output:
+
+```
+Connection pool status: {
+    "total_connections": 5,
+    "available_connections": 3,
+    "in_use_connections": 2,
+    "min_connections": 2,
+    "max_connections": 10
+}
+```
+
+## Recommendations for High Concurrency
+
+1. **Tune the pool size**: adjust `min_connections` and `max_connections` to the actual concurrency
+2. **Use the global client**: share the global client instance across threads
+3. **Set sensible timeouts**: tune the timeout parameters to your network and database performance
+4. **Use the connection context**: use `with client.get_connection()` so connections are always released
+5. **Check the pool status regularly**: monitor the pool and adjust the configuration in time
+
+## Exception Handling
+
+```python
+from utils.infinity import InfinityClient
+
+try:
+    client = InfinityClient()
+    databases = client.get_databases()
+    print(f"Databases: {databases}")
+except Exception as e:
+    print(f"Error: {e}")
+finally:
+    client.close()
+```
+
+## Best Practices
+
+1. **Initialize the client at application startup**: avoid repeatedly creating and destroying clients
+2. **Close the client at application shutdown**: release all connection resources
+3. **Manage connections with the connection context**: ensures connections are released properly
+4. **Monitor the pool status**: detect and resolve connection problems early
+5. **Size the pool to the load**: avoid having too many or too few connections
+
+## Integrating with Existing Code
+
+The client integrates seamlessly with existing code; replace direct uses of the Infinity SDK:
+
+### Before
+
+```python
+import infinity
+from infinity.common import NetworkAddress
+
+# Create a connection directly
+conn = infinity.connect(NetworkAddress("192.168.16.134", "23817"))
+conn.use_database("image_db")
+# Use the connection
+result = conn.search("my_table", query)
+```
+
+### After
+
+```python
+from utils.infinity import InfinityClient
+
+# Use the pooled client
+client = InfinityClient(database="image_db")
+result = client.search("my_table", ["id", "content"], query)
+```
+
+## Performance Notes
+
+1. **Lower connection-creation overhead**: the pool manages connections automatically, avoiding frequent creation and teardown
+2. **Connection reuse**: connections are reused many times, improving performance
+3. **Background connection management**: the pool manages connections on a background thread without blocking the main thread
+4. **Heartbeat checks**: dead connections are cleaned up automatically, keeping the pool healthy
+
+## Testing
+
+The following script can be used to measure the pool under concurrency:
+
+```python
+from utils.infinity import InfinityClient
+import threading
+import time
+
+def test_query(client, thread_id):
+    """Measure query latency"""
+    start_time = time.time()
+    databases = client.get_databases()
+    end_time = time.time()
+    print(f"Thread {thread_id}: Query took {end_time - start_time:.4f} seconds")
+
+# Create the client
+client = InfinityClient(max_connections=20)
+
+# Run concurrent queries
+threads = []
+start_time = time.time()
+
+for i in range(50):
+    thread = threading.Thread(target=test_query, args=(client, i))
+    threads.append(thread)
+    thread.start()
+
+# Wait for all threads to finish
+for thread in threads:
+    thread.join()
+
+end_time = time.time()
+print(f"Total time for 50 concurrent queries: {end_time - start_time:.4f} seconds")
+print(f"Connection pool status: {client.get_status()}")
+
+# Close the client
+client.close()
+```
+
+## Version History
+
+- v1.0.0: Initial release with basic connection pooling and database operation APIs
+
+## License
+
+MIT License

+ 13 - 0
src/utils/infinity/__init__.py

@@ -0,0 +1,13 @@
+# Infinity Python API client with connection pooling
+
+# Import every component from the split modules to preserve backward compatibility
+from .pool import InfinityConnectionPool
+from .client import InfinityClient, get_client, close_client
+
+# Re-export everything so it is accessible at the package level
+__all__ = [
+    'InfinityConnectionPool',
+    'InfinityClient',
+    'get_client',
+    'close_client'
+]

+ 308 - 0
src/utils/infinity/client.py

@@ -0,0 +1,308 @@
+# Infinity database client implementation
+from infinity.common import ConflictType
+from typing import Dict, Any, List, Optional
+import threading
+from contextlib import contextmanager
+from src.conf.settings import vector_db_settings
+from .pool import InfinityConnectionPool
+
+class InfinityClient:
+    """
+    Infinity database client backed by a connection pool
+    
+    Provides the common database operations, including:
+    - Database operations
+    - Table operations
+    - Document operations
+    - Search operations
+    """
+    
+    def __init__(
+        self,
+        host: str = vector_db_settings.infinity_host,
+        port: str = vector_db_settings.infinity_sdk_port,
+        database: str = vector_db_settings.infinity_database,
+        min_connections: int = 2,
+        max_connections: int = 10
+    ):
+        """
+        Initialize the Infinity client
+        
+        Args:
+            host: Infinity server address
+            port: Infinity server port
+            database: Database name
+            min_connections: Minimum number of connections
+            max_connections: Maximum number of connections
+        """
+        self.pool = InfinityConnectionPool(
+            host=host,
+            port=port,
+            database=database,
+            min_connections=min_connections,
+            max_connections=max_connections
+        )
+    
+    def _get_database(self, conn: Any, database_name: Optional[str] = None):
+        """
+        Get a database object
+        
+        Args:
+            conn: Infinity connection object
+            database_name: Database name (optional; defaults to the client's configured database)
+            
+        Returns:
+            Database object
+        """
+        db_name = database_name or self.pool.database
+        return conn.get_database(db_name)
+    
+    def _get_table(self, conn: Any, table_name: str, database_name: Optional[str] = None):
+        """
+        Get a table object
+        
+        Args:
+            conn: Infinity connection object
+            table_name: Table name
+            database_name: Database name (optional; defaults to the client's configured database)
+            
+        Returns:
+            Table object
+        """
+        db = self._get_database(conn, database_name)
+        return db.get_table(table_name)
+    
+    @contextmanager
+    def get_connection(self, timeout: Optional[int] = None):
+        """
+        Acquire a connection context
+        
+        Args:
+            timeout: Timeout for acquiring a connection (seconds)
+        """
+        with self.pool.get_connection(timeout) as conn:
+            yield conn
+    
+    def list_databases(self) -> List[str]:
+        """List all databases"""
+        with self.pool.get_connection() as conn:
+            return conn.list_databases()
+    
+    def get_databases(self) -> List[str]:
+        """List all databases (alias for backward compatibility)"""
+        return self.list_databases()
+    
+    def create_database(self, database_name: str, conflict_type: str = ConflictType.Error, comment: Optional[str] = None):
+        """Create a database"""
+        with self.pool.get_connection() as conn:
+            return conn.create_database(database_name, conflict_type, comment)
+    
+    def drop_database(self, database_name: str, conflict_type = ConflictType.Error):
+        """Drop a database"""
+        with self.pool.get_connection() as conn:
+            return conn.drop_database(database_name, conflict_type)
+    
+    def use_database(self, database_name: str):
+        """Switch the client's default database
+        
+        Note: this only affects new connections created by this client
+        instance; existing connections are unaffected
+        """
+        # Update the client's default database
+        self.pool.database = database_name
+    
+    def list_tables(self, database_name: Optional[str] = None) -> List[str]:
+        """List all tables"""
+        with self.pool.get_connection() as conn:
+            # Per the official API, fetch the Database object first and call
+            # its table-listing method on it
+            db = self._get_database(conn, database_name)
+            return db.list_tables()
+    
+    def get_tables(self, database_name: Optional[str] = None) -> List[str]:
+        """List all tables (alias for backward compatibility)"""
+        return self.list_tables(database_name)
+    
+    def get_table(self, table_name: str, database_name: Optional[str] = None):
+        """Get a table
+        
+        Caution: the returned Table object is bound to a connection that is
+        released back to the pool when this method returns
+        """
+        with self.pool.get_connection() as conn:
+            # Per the official API, fetch the Database object first and call
+            # get_table on it
+            return self._get_table(conn, table_name, database_name)
+
+    def create_table(
+        self,
+        table_name: str,
+        columns_definition: List[Dict[str, Any]],
+        conflict_type: str = ConflictType.Error,
+        database_name: Optional[str] = None
+    ):
+        """Create a table"""
+        with self.pool.get_connection() as conn:
+            # Per the official API, fetch the Database object first and call
+            # create_table on it
+            db = self._get_database(conn, database_name)
+            return db.create_table(table_name, columns_definition, conflict_type)
+
+    def drop_table(self, table_name: str, database_name: Optional[str] = None, conflict_type = ConflictType.Error):
+        """Drop a table"""
+        with self.pool.get_connection() as conn:
+            # Per the official API, fetch the Database object first and call
+            # drop_table on it
+            db = self._get_database(conn, database_name)
+            return db.drop_table(table_name, conflict_type)
+
+    def create_index(self, table_name: str, index_name: str, index_info: Dict[str, Any], database_name: Optional[str] = None, conflict_type = ConflictType.Error, comment = None):
+        """Create an index"""
+        with self.pool.get_connection() as conn:
+            # Use the helper to fetch the Table object
+            table = self._get_table(conn, table_name, database_name)
+            return table.create_index(index_name, index_info, conflict_type, comment)
+    
+    def optimize(self, table_name: str, database_name: Optional[str] = None):
+        """Optimize a table"""
+        with self.pool.get_connection() as conn:
+            # Use the helper to fetch the Table object
+            table = self._get_table(conn, table_name, database_name)
+            return table.optimize()
+
+    def drop_index(self, table_name: str, index_name: str, database_name: Optional[str] = None, conflict_type = ConflictType.Error):
+        """Drop an index"""
+        with self.pool.get_connection() as conn:
+            # Use the helper to fetch the Table object
+            table = self._get_table(conn, table_name, database_name)
+            return table.drop_index(index_name, conflict_type)
+
+    def insert(
+        self,
+        table_name: str,
+        documents: List[Dict[str, Any]],
+        database_name: Optional[str] = None
+    ):
+        """Insert documents"""
+        with self.pool.get_connection() as conn:
+            table = self._get_table(conn, table_name, database_name)
+            return table.insert(documents)
+    
+    def update(
+        self, 
+        table_name: str, 
+        cond: str, 
+        data: Dict[str, Any], 
+        database_name: Optional[str] = None
+    ):
+        """Update documents"""
+        with self.pool.get_connection() as conn:
+            # Fetch the Table object
+            table = self._get_table(conn, table_name, database_name)
+            # Apply the update
+            return table.update(cond, data)
+
+    def search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ):
+        """Full-text search; query needs `match_field`, `matching_text` and `topn`"""
+        with self.pool.get_connection() as conn:
+            # Fetch the Table object
+            table = self._get_table(conn, table_name, database_name)
+            # Build and return the result set
+            return table.output(output_fields).match_text(query["match_field"], query["matching_text"], query["topn"])
+    
+    def hybrid_search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ):
+        """Hybrid search (dense + full-text, fused with RRF)"""
+        with self.pool.get_connection() as conn:
+            # Fetch the Table object
+            table = self._get_table(conn, table_name, database_name)
+            # Build and return the result set
+            return table.output(output_fields) \
+                .match_dense(
+                    vector_column_name=query["vector_field"], 
+                    embedding_data=query["query_vector"], 
+                    embedding_data_type="float", 
+                    distance_type="cosine", 
+                    topn=query["topn"],
+                    knn_params=query["knn_params"]
+                ) \
+                .match_text(query["match_field"], query["matching_text"], 2) \
+                .fusion("rrf", query["topn"])
+    
+    def vector_search(
+        self,
+        table_name: str,
+        output_fields: List[str],
+        query: Dict[str, Any],
+        database_name: Optional[str] = None
+    ):
+        """Vector search; query needs `vector_field`, `query_vector`, `topn` and `knn_params`"""
+        with self.pool.get_connection() as conn:
+            # Fetch the Table object
+            table = self._get_table(conn, table_name, database_name)
+            # Build and return the result set
+            return table.output(output_fields) \
+                .match_dense(
+                    vector_column_name=query["vector_field"], 
+                    embedding_data=query["query_vector"], 
+                    embedding_data_type="float", 
+                    distance_type="cosine", 
+                    topn=query["topn"], 
+                    knn_params=query["knn_params"])
+    
+    def get_status(self) -> Dict[str, Any]:
+        """Get client/pool status"""
+        return self.pool.get_status()
+    
+    def close(self):
+        """Close the client and release all connections"""
+        self.pool.close()
+
+# Global client instance
+_global_client: Optional[InfinityClient] = None
+_client_lock = threading.Lock()
+
+def get_client(
+    host: str = vector_db_settings.infinity_host,
+    port: str = vector_db_settings.infinity_sdk_port,
+    database: str = vector_db_settings.infinity_database,
+    min_connections: int = 5,
+    max_connections: int = 10
+) -> InfinityClient:
+    """
+    Get the global client instance (singleton)
+    
+    Args:
+        host: Infinity server address
+        port: Infinity server port
+        database: Database name
+        min_connections: Minimum number of connections
+        max_connections: Maximum number of connections
+        
+    Returns:
+        The global Infinity client instance
+    """
+    global _global_client
+    
+    with _client_lock:
+        if _global_client is None:
+            _global_client = InfinityClient(
+                host=host,
+                port=port,
+                database=database,
+                min_connections=min_connections,
+                max_connections=max_connections
+            )
+    
+    return _global_client
+
+def close_client():
+    """Close the global client"""
+    global _global_client
+    
+    with _client_lock:
+        if _global_client:
+            _global_client.close()
+            _global_client = None
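+
+# Query-shape sketch for the search helpers above (table, fields and values are
+# illustrative; knn_params depends on how the target index was built):
+#
+#     client = get_client()
+#     result = client.hybrid_search(
+#         table_name="pdf_documents_table",
+#         output_fields=["id", "content"],
+#         query={
+#             "vector_field": "dense_vector_1024",
+#             "query_vector": [0.1] * 1024,
+#             "match_field": "content",
+#             "matching_text": "children picture book",
+#             "topn": 5,
+#             "knn_params": {"ef": "64"},
+#         },
+#     )
+#     close_client()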

+ 285 - 0
src/utils/infinity/pool.py

@@ -0,0 +1,285 @@
+# Infinity数据库连接池实现
+
+from typing import Dict, Any, List, Optional
+import threading
+import time
+from contextlib import contextmanager
+from src.conf.settings import vector_db_settings
+
+class InfinityConnectionPool:
+    """
+    Infinity数据库连接池
+    
+    设计特点:
+    1. 支持并发连接管理
+    2. 连接自动回收和复用
+    3. 连接超时和心跳检测
+    4. 动态调整连接数量
+    5. 线程安全
+    """
+    
+    def __init__(
+        self,
+        host: str = vector_db_settings.infinity_host,
+        port: str = vector_db_settings.infinity_sdk_port,
+        database: str = vector_db_settings.infinity_database,
+        min_connections: int = 2,
+        max_connections: int = 10,
+        connection_timeout: int = 30,
+        idle_timeout: int = 300,
+        heartbeat_interval: int = 60
+    ):
+        """
+        初始化连接池
+        
+        Args:
+            host: Infinity服务地址
+            port: Infinity服务端口
+            database: 数据库名称
+            min_connections: 最小连接数
+            max_connections: 最大连接数
+            connection_timeout: 连接超时时间(秒)
+            idle_timeout: 空闲连接超时时间(秒)
+            heartbeat_interval: 心跳检测间隔(秒)
+        """
+        self.host = host
+        self.port = port
+        self.database = database
+        self.min_connections = min_connections
+        self.max_connections = max_connections
+        self.connection_timeout = connection_timeout
+        self.idle_timeout = idle_timeout
+        self.heartbeat_interval = heartbeat_interval
+        
+        # 连接池状态
+        self.connections = []  # 可用连接列表
+        self.in_use = {}  # 正在使用的连接 {connection: thread_id}
+        self.connection_count = 0  # 当前连接总数
+        
+        # 线程安全锁
+        self.lock = threading.Lock()
+        self.condition = threading.Condition(self.lock)
+        # Set by close(); lets the heartbeat thread stop instead of
+        # recreating connections for a pool that is already closed
+        self._closed = False
+        
+        # 初始化最小连接数
+        self._init_connections()
+        
+        # 启动心跳检测线程
+        self.heartbeat_thread = threading.Thread(target=self._heartbeat_check, daemon=True)
+        self.heartbeat_thread.start()
+    
+    def _init_connections(self):
+        """初始化最小连接数"""
+        for _ in range(self.min_connections):
+            # 初始化时需要获取锁,因为_create_connection现在没有内部锁
+            with self.lock:
+                self._create_connection()
+    
+    def _create_connection(self) -> Any:
+        """创建新连接"""
+        try:
+            import infinity
+            
+            # 连接到Infinity服务
+            connection = infinity.connect(
+                infinity.NetworkAddress(self.host, self.port)
+            )
+                        
+            # 注意:根据官方API,RemoteThriftInfinityConnection对象没有use_database方法
+            # 数据库操作应该通过create_database、drop_database等方法直接指定数据库名称
+            # 或者通过获取Database对象后再进行操作
+            
+            # 保存数据库名称,供后续操作使用
+            connection.__dict__['_database'] = self.database
+            
+            # 记录连接创建时间
+            connection.__dict__['_created_at'] = time.time()
+            connection.__dict__['_last_used'] = time.time()
+            connection.__dict__['_is_valid'] = True
+            
+            # 注意:这里不需要再获取锁,因为调用此方法时已经在acquire方法中持有了锁
+            self.connections.append(connection)
+            self.connection_count += 1
+            
+            return connection
+        except Exception as e:
+            raise Exception(f"Failed to create Infinity connection: {e}")
+    
+    def _is_valid_connection(self, connection: Any) -> bool:
+        """检查连接是否有效"""
+        try:
+            # 通过执行简单查询检查连接是否有效
+            # 注意:这里不应该在持有锁的情况下执行网络操作
+            # 但由于此方法是在锁内被调用的,我们需要尽量减少操作时间
+            connection.get_database(self.database)
+            return True
+        except Exception:
+            return False
+    
+    def _heartbeat_check(self):
+        """Heartbeat loop: periodically validate connections and clean up stale ones"""
+        while not self._closed:
+            time.sleep(self.heartbeat_interval)
+            if self._closed:
+                break
+            self._cleanup_connections()
+    
+    def _cleanup_connections(self):
+        """清理无效或过期连接"""
+        with self.lock:
+            current_time = time.time()
+            valid_connections = []
+            
+            for connection in self.connections:
+                # Drop connections that are expired or no longer valid
+                if (current_time - connection.__dict__['_last_used'] > self.idle_timeout
+                        or not self._is_valid_connection(connection)):
+                    # Decrement the count even if disconnect() raises; otherwise
+                    # connection_count leaks upward and the pool can no longer
+                    # create replacement connections
+                    try:
+                        connection.disconnect()
+                    except Exception:
+                        pass
+                    self.connection_count -= 1
+                else:
+                    valid_connections.append(connection)
+            
+            self.connections = valid_connections
+            
+            # 确保连接数不低于最小值
+            while self.connection_count < self.min_connections:
+                self._create_connection()
+    
+    @contextmanager
+    def get_connection(self, timeout: Optional[int] = None) -> Any:
+        """
+        获取一个连接,使用上下文管理器
+        
+        Args:
+            timeout: 获取连接的超时时间(秒)
+            
+        Yields:
+            Infinity连接对象
+        """
+        connection = None
+        try:
+            connection = self.acquire(timeout)
+            yield connection
+        finally:
+            if connection:
+                self.release(connection)
+    
+    def acquire(self, timeout: Optional[int] = None) -> Any:
+        """
+        获取一个连接
+        
+        Args:
+            timeout: 获取连接的超时时间(秒)
+            
+        Returns:
+            Infinity连接对象
+            
+        Raises:
+            TimeoutError: 获取连接超时
+        """
+        end_time = time.time() + (timeout or self.connection_timeout)
+        
+        with self.lock:
+            while True:
+                # 检查是否有可用连接
+                if self.connections:
+                    # 获取一个连接
+                    connection = self.connections.pop()
+                    
+                    # 检查连接是否有效
+                    if self._is_valid_connection(connection):
+                        connection.__dict__['_last_used'] = time.time()
+                        self.in_use[connection] = threading.get_ident()
+                        return connection
+                    else:
+                        # Invalid connection: close it, fix the count, and retry
+                        # the loop immediately instead of blocking on the condition
+                        try:
+                            connection.disconnect()
+                        except Exception:
+                            pass
+                        self.connection_count -= 1
+                        continue
+                
+                # 检查是否可以创建新连接
+                elif self.connection_count < self.max_connections:
+                    # 创建新连接
+                    connection = self._create_connection()
+                    connection.__dict__['_last_used'] = time.time()
+                    self.in_use[connection] = threading.get_ident()
+                    return connection
+                
+                # 等待连接释放
+                remaining = end_time - time.time()
+                if remaining <= 0:
+                    raise TimeoutError("Timeout waiting for Infinity connection")
+                
+                # 等待连接释放或超时
+                self.condition.wait(remaining)
+    
+    def release(self, connection: Any):
+        """
+        释放连接
+        
+        Args:
+            connection: 要释放的连接
+        """
+        with self.lock:
+            if connection in self.in_use:
+                del self.in_use[connection]
+                
+                # 检查连接是否有效
+                if self._is_valid_connection(connection):
+                    connection.__dict__['_last_used'] = time.time()
+                    self.connections.append(connection)
+                    # 通知等待的线程
+                    self.condition.notify()
+                else:
+                    # Invalid connection: close it, decrement the count even if
+                    # disconnect() raises, and wake a waiter so it can create a
+                    # replacement connection
+                    try:
+                        connection.disconnect()
+                    except Exception:
+                        pass
+                    self.connection_count -= 1
+                    self.condition.notify()
+    
+    def close(self):
+        """Close all connections and stop the heartbeat thread"""
+        with self.lock:
+            # Stop the heartbeat loop so it does not recreate connections
+            # after the pool has been closed
+            self._closed = True
+            # 关闭可用连接
+            for connection in self.connections:
+                try:
+                    connection.disconnect()
+                except Exception:
+                    pass
+            
+            # 关闭正在使用的连接
+            for connection in list(self.in_use.keys()):
+                try:
+                    connection.disconnect()
+                except Exception:
+                    pass
+            
+            self.connections = []
+            self.in_use = {}
+            self.connection_count = 0
+    
+    def get_status(self) -> Dict[str, Any]:
+        """
+        获取连接池状态
+        
+        Returns:
+            连接池状态信息
+        """
+        with self.lock:
+            return {
+                "total_connections": self.connection_count,
+                "available_connections": len(self.connections),
+                "in_use_connections": len(self.in_use),
+                "min_connections": self.min_connections,
+                "max_connections": self.max_connections
+            }
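
A sketch of using the pool directly, assuming an Infinity server is reachable at the configured host/port; get_connection() is the intended entry point and always returns the connection to the pool:

    from src.utils.infinity.pool import InfinityConnectionPool

    pool = InfinityConnectionPool(min_connections=1, max_connections=4)
    try:
        with pool.get_connection(timeout=10) as conn:
            # The database name stashed by _create_connection is available here
            db = conn.get_database(conn.__dict__['_database'])
        print(pool.get_status())
    finally:
        pool.close()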

+ 73 - 0
src/utils/infinity/result_util.py

@@ -0,0 +1,73 @@
+from typing import Dict, Any, List
+from langchain_core.documents import Document
+
+
+def convert_to_basic_types(obj: Any) -> Any:
+    """
+    递归将对象转换为基本类型,以便Pydantic能够序列化
+    
+    特殊处理:当字典中的子项包含相同长度的数组时,将其转换为数组对象结构
+    例如:{"a": [1,2], "b": [3,4]} -> [{"a":1, "b":3}, {"a":2, "b":4}]
+    
+    Args:
+        obj: 要转换的对象
+    
+    Returns:
+        转换后的基本类型对象
+    """
+    if obj is None:
+        return None
+    elif isinstance(obj, (str, int, float, bool)):
+        return obj
+    elif isinstance(obj, dict):
+        # 先递归转换所有值
+        converted = {k: convert_to_basic_types(v) for k, v in obj.items()}
+        
+        # 检查是否需要转换为数组对象结构
+        # 条件:所有值都是列表,且长度一致,且长度大于0
+        values = list(converted.values())
+        if all(isinstance(v, list) for v in values):
+            lengths = [len(v) for v in values]
+            if len(set(lengths)) == 1 and lengths[0] > 0:
+                # 转换为数组对象结构
+                result = []
+                keys = list(converted.keys())
+                for i in range(lengths[0]):
+                    item = {}
+                    for key in keys:
+                        # 处理数组中可能存在的None值
+                        if i < len(converted[key]):
+                            item[key] = converted[key][i]
+                        else:
+                            item[key] = None
+                    result.append(item)
+                return result
+        
+        return converted
+    elif isinstance(obj, (list, tuple)):
+        return [convert_to_basic_types(item) for item in obj]
+    else:
+        # For any other type, try dict() first and fall back to str()
+        try:
+            return dict(obj)
+        except Exception:
+            return str(obj)
+
+def convert_to_langchain_docs(obj: Any) -> List[Document]:
+    """
+    将Infinity搜索结果转换为LangChain的Document格式
+    
+    Args:
+        obj: 要转换的对象
+    
+    Returns:
+        转换后的Document列表
+    """
+    res = convert_to_basic_types(obj=obj)
+    # 将数据转换为 LangChain 的 Document 格式
+    candidate_docs = [
+        Document(page_content=item["content"], 
+            metadata={k: v for k, v in item.items() if k != "content"}) 
+        for item in res[0]
+    ]
+    return candidate_docs
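
A worked example of the columnar-to-row conversion above; the tuple wrapping mirrors the res[0] indexing in convert_to_langchain_docs():

    from src.utils.infinity.result_util import (
        convert_to_basic_types, convert_to_langchain_docs
    )

    columnar = {"id": [1, 2], "content": ["foo", "bar"]}

    # Equal-length list values are pivoted into a list of row dicts
    print(convert_to_basic_types(columnar))
    # -> [{'id': 1, 'content': 'foo'}, {'id': 2, 'content': 'bar'}]

    # convert_to_langchain_docs() reads res[0], so the raw result must be a
    # sequence whose first element converts to that row list
    docs = convert_to_langchain_docs((columnar,))
    print(docs[0].page_content, docs[0].metadata)   # foo {'id': 1}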

+ 89 - 0
src/utils/infinity/test_infinity.py

@@ -0,0 +1,89 @@
+# 测试Infinity客户端拆分后的代码结构
+
+import sys
+import os
+
+# 添加项目根目录到Python路径
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
+
+# 测试1:导入验证
+print("=== 测试1:导入验证 ===")
+try:
+    from src.utils.infinity import (
+        InfinityConnectionPool,
+        InfinityClient,
+        get_client,
+        close_client
+    )
+    print("✅ 所有组件导入成功")
+except Exception as e:
+    print(f"❌ 导入失败: {e}")
+    sys.exit(1)
+
+# 测试2:类继承和结构验证
+print("\n=== 测试2:类结构验证 ===")
+try:
+    # Actually check the imported names instead of only printing success
+    assert isinstance(InfinityConnectionPool, type)
+    print("✅ InfinityConnectionPool类存在")
+    assert isinstance(InfinityClient, type)
+    print("✅ InfinityClient类存在")
+    assert callable(get_client)
+    print("✅ get_client函数存在")
+    assert callable(close_client)
+    print("✅ close_client函数存在")
+except Exception as e:
+    print(f"❌ 类结构验证失败: {e}")
+    sys.exit(1)
+
+# 测试3:全局客户端函数验证
+print("\n=== 测试3:全局客户端函数验证 ===")
+try:
+    # 只验证函数存在和基本调用,不实际连接
+    import inspect
+    
+    # 检查函数签名
+    get_client_sig = inspect.signature(get_client)
+    close_client_sig = inspect.signature(close_client)
+    
+    print(f"✅ get_client函数签名正确: {get_client_sig}")
+    print(f"✅ close_client函数签名正确: {close_client_sig}")
+    
+    # 验证全局客户端函数可以被调用(但不实际连接)
+    print("✅ 全局客户端函数可以被调用")
+except Exception as e:
+    print(f"❌ 全局客户端函数验证失败: {e}")
+    sys.exit(1)
+
+# 测试4:客户端类验证
+print("\n=== 测试4:客户端类验证 ===")
+try:
+    # 验证类的方法存在,不实际实例化连接
+    import inspect
+    
+    # 验证客户端类的方法
+    required_methods = [
+        'get_databases',
+        'create_database',
+        'drop_database',
+        'get_tables',
+        'create_table',
+        'drop_table',
+        'insert',
+        'search',
+        'hybrid_search',
+        'vector_search',
+        'get_status',
+        'close'
+    ]
+    
+    for method in required_methods:
+        if hasattr(InfinityClient, method):
+            print(f"✅ 客户端方法 '{method}' 存在")
+        else:
+            print(f"❌ 客户端方法 '{method}' 不存在")
+            raise Exception(f"Missing method: {method}")
+    
+    # Instantiating InfinityClient would open real connections, so only the
+    # interface is verified here
+    print("✅ InfinityClient接口验证通过(未实际实例化)")
+except Exception as e:
+    print(f"❌ 客户端类验证失败: {e}")
+    sys.exit(1)
+
+print("\n🎉 所有测试通过!Infinity客户端拆分成功!")

+ 52 - 0
src/utils/mysql/__init__.py

@@ -0,0 +1,52 @@
+from .mysql_conn import MySQLConnection
+
+
+# 全局 MySQL 客户端实例
+_global_mysql_client = None
+
+def init_global_mysql_client(host: str = None, port: int = None, 
+                           user: str = None, password: str = None, 
+                           database: str = None, charset: str = None,
+                           pool_size: int = None, **kwargs) -> None:
+    """
+    初始化全局 MySQL 客户端
+    
+    Args:
+        host: MySQL 主机地址
+        port: MySQL 端口号
+        user: MySQL 用户名
+        password: MySQL 密码
+        database: 数据库名称
+        charset: 字符集
+        pool_size: 连接池大小
+        **kwargs: 其他 MySQL 连接参数
+    """
+    global _global_mysql_client
+    if _global_mysql_client is None:
+        _global_mysql_client = MySQLConnection(
+            host=host, port=port, user=user, password=password, 
+            database=database, charset=charset, pool_size=pool_size, **kwargs
+        )
+
+
+def get_global_mysql_client() -> MySQLConnection:
+    """
+    获取全局 MySQL 客户端实例
+    
+    Returns:
+        MySQL SQL 执行器实例
+    """
+    global _global_mysql_client
+    if _global_mysql_client is None:
+        raise RuntimeError("Global MySQL client has not been initialized. Call init_global_mysql_client() first.")
+    return _global_mysql_client
+
+
+def close_global_mysql_client() -> None:
+    """
+    关闭全局 MySQL 客户端
+    """
+    global _global_mysql_client
+    if _global_mysql_client is not None:
+        _global_mysql_client.close()
+        _global_mysql_client = None
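
Typical lifecycle of the global client, e.g. in an application entry point; with no arguments, all parameters fall back to mysql_settings:

    from src.utils.mysql import (
        init_global_mysql_client, get_global_mysql_client, close_global_mysql_client
    )

    init_global_mysql_client()                 # config comes from mysql_settings
    db = get_global_mysql_client()
    print(db.fetch_all("SELECT 1 AS ok"))      # [{'ok': 1}]
    close_global_mysql_client()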

+ 244 - 0
src/utils/mysql/mysql_conn.py

@@ -0,0 +1,244 @@
+"""
+MySQL SQL 执行器
+
+该文件提供 MySQL 数据库 SQL 执行功能,支持:
+- 单例模式
+- 基本 CRUD 操作
+- 事务支持
+- 连接错误处理
+- 全局客户端管理
+"""
+import pymysql
+from pymysql.cursors import DictCursor
+from typing import Any, List, Dict, Optional, Union
+from contextlib import contextmanager
+from .mysql_pool import get_mysql_pool, MySQLPool
+from src.utils.decorators.singleton import singleton
+
+@singleton
+class MySQLConnection:
+    """
+    MySQL SQL 执行器
+    支持:
+    - 单例模式
+    - 基本 CRUD 操作
+    - 事务支持
+    - 连接错误处理
+    """
+    
+    def __init__(self, mysql_pool: Optional[MySQLPool] = None,
+                 host: str = None, port: int = None, 
+                 user: str = None, password: str = None, 
+                 database: str = None, charset: str = None,
+                 pool_size: int = None, **kwargs):
+        """
+        初始化 MySQL SQL 执行器
+        
+        Args:
+            mysql_pool: 可选的 MySQL 连接池实例,如果提供则使用该实例,否则创建新实例
+            host: MySQL 主机地址
+            port: MySQL 端口号
+            user: MySQL 用户名
+            password: MySQL 密码
+            database: 数据库名称
+            charset: 字符集
+            pool_size: 连接池大小
+            **kwargs: 其他 MySQL 连接参数
+        """
+        # 如果提供了连接池实例,则使用该实例,否则创建新实例
+        if mysql_pool:
+            self._pool = mysql_pool
+        else:
+            self._pool = get_mysql_pool(host, port, user, password, database, charset, pool_size, **kwargs)
+    
+    def _get_connection(self) -> pymysql.connections.Connection:
+        """
+        从连接池获取连接
+        
+        Returns:
+            MySQL 连接对象
+        """
+        return self._pool.get_connection()
+    
+    @contextmanager
+    def get_cursor(self, cursorclass=DictCursor):
+        """
+        获取游标上下文管理器
+        
+        Args:
+            cursorclass: 游标类型,默认为DictCursor
+        
+        Yields:
+            MySQL 游标对象
+        """
+        conn = self._get_connection()
+        cursor = conn.cursor(cursorclass)
+        
+        try:
+            yield cursor
+            conn.commit()
+        except Exception as e:
+            conn.rollback()
+            raise e
+        finally:
+            cursor.close()
+            conn.close()
+    
+    def execute(self, sql: str, params: Union[List, Dict] = None) -> int:
+        """
+        执行 SQL 语句(用于 INSERT、UPDATE、DELETE)
+        
+        Args:
+            sql: SQL 语句
+            params: SQL 参数
+        
+        Returns:
+            受影响的行数
+        """
+        with self.get_cursor() as cursor:
+            cursor.execute(sql, params)
+            return cursor.rowcount
+    
+    def fetch_one(self, sql: str, params: Union[List, Dict] = None) -> Optional[Dict[str, Any]]:
+        """
+        执行 SQL 查询,返回单行结果
+        
+        Args:
+            sql: SQL 查询语句
+            params: SQL 参数
+        
+        Returns:
+            查询结果字典,无结果返回 None
+        """
+        with self.get_cursor() as cursor:
+            cursor.execute(sql, params)
+            return cursor.fetchone()
+    
+    def fetch_all(self, sql: str, params: Union[List, Dict] = None) -> List[Dict[str, Any]]:
+        """
+        执行 SQL 查询,返回所有结果
+        
+        Args:
+            sql: SQL 查询语句
+            params: SQL 参数
+        
+        Returns:
+            查询结果列表
+        """
+        with self.get_cursor() as cursor:
+            cursor.execute(sql, params)
+            return cursor.fetchall()
+    
+    def fetch_many(self, sql: str, size: int, params: Union[List, Dict] = None) -> List[Dict[str, Any]]:
+        """
+        执行 SQL 查询,返回指定数量的结果
+        
+        Args:
+            sql: SQL 查询语句
+            size: 返回结果数量
+            params: SQL 参数
+        
+        Returns:
+            查询结果列表
+        """
+        with self.get_cursor() as cursor:
+            cursor.execute(sql, params)
+            return cursor.fetchmany(size)
+    
+    def bulk_insert(self, sql: str, params_list: List[Union[List, Dict]]) -> int:
+        """
+        批量插入数据
+        
+        Args:
+            sql: SQL 插入语句
+            params_list: 参数列表
+        
+        Returns:
+            受影响的行数
+        """
+        with self.get_cursor() as cursor:
+            cursor.executemany(sql, params_list)
+            return cursor.rowcount
+    
+    def begin_transaction(self):
+        """
+        开始事务
+        
+        Returns:
+            连接对象和游标对象
+        """
+        conn = self._get_connection()
+        conn.begin()
+        cursor = conn.cursor()
+        return conn, cursor
+    
+    def commit_transaction(self, conn: pymysql.connections.Connection, cursor: pymysql.cursors.Cursor):
+        """
+        提交事务
+        
+        Args:
+            conn: 连接对象
+            cursor: 游标对象
+        """
+        try:
+            conn.commit()
+        finally:
+            cursor.close()
+            conn.close()
+    
+    def rollback_transaction(self, conn: pymysql.connections.Connection, cursor: pymysql.cursors.Cursor):
+        """
+        回滚事务
+        
+        Args:
+            conn: 连接对象
+            cursor: 游标对象
+        """
+        try:
+            conn.rollback()
+        finally:
+            cursor.close()
+            conn.close()
+    
+    def close(self):
+        """
+        关闭 SQL 执行器
+        """
+        # 关闭连接池
+        self._pool.close()
+
+# 简化的接口函数,便于快速使用
+def get_mysql_conn(host: str = None, port: int = None, 
+                  user: str = None, password: str = None, 
+                  database: str = None, charset: str = None,
+                  pool_size: int = None, **kwargs) -> MySQLConnection:
+    """
+    Get a MySQL SQL executor instance
+    
+    Note: MySQLConnection is decorated with @singleton, so the connection
+    parameters are honored only on the first call; later calls return the
+    same instance and ignore their arguments.
+    
+    Args:
+        host: MySQL host address
+        port: MySQL port
+        user: MySQL user
+        password: MySQL password
+        database: database name
+        charset: character set
+        pool_size: connection pool size
+        **kwargs: extra MySQL connection parameters
+    
+    Returns:
+        MySQL SQL executor instance
+    """
+    return MySQLConnection(host=host, port=port, user=user, password=password, 
+                          database=database, charset=charset, pool_size=pool_size, **kwargs)
+
+def get_mysql_conn_with_pool(mysql_pool: MySQLPool) -> MySQLConnection:
+    """
+    使用指定的连接池获取 MySQL SQL 执行器实例
+    
+    Args:
+        mysql_pool: MySQL 连接池实例
+    
+    Returns:
+        MySQL SQL 执行器实例
+    """
+    return MySQLConnection(mysql_pool=mysql_pool)
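
A sketch of the manual transaction API above; the accounts table is hypothetical. commit_transaction()/rollback_transaction() both release the cursor and connection, so exactly one of them must run:

    from src.utils.mysql.mysql_conn import get_mysql_conn

    db = get_mysql_conn()
    conn, cursor = db.begin_transaction()
    try:
        cursor.execute("UPDATE accounts SET balance = balance - %s WHERE id = %s", (100, 1))
        cursor.execute("UPDATE accounts SET balance = balance + %s WHERE id = %s", (100, 2))
        db.commit_transaction(conn, cursor)
    except Exception:
        db.rollback_transaction(conn, cursor)
        raise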

+ 138 - 0
src/utils/mysql/mysql_pool.py

@@ -0,0 +1,138 @@
+"""
+MySQL 连接池配置
+
+该文件提供 MySQL 数据库连接池配置功能,支持:
+- 单例模式
+- DBUtils 连接池管理
+- 连接错误处理
+- 从.env文件读取配置
+"""
+import pymysql
+from pymysql.cursors import DictCursor
+from dbutils.pooled_db import PooledDB
+from src.conf.settings import mysql_settings
+
+# Reuse the project-wide singleton decorator (src/utils/decorators/singleton.py)
+from src.utils.decorators.singleton import singleton
+
+@singleton
+class MySQLPool:
+    """
+    MySQL 连接池管理器
+    支持:
+    - 单例模式
+    - DBUtils 连接池管理
+    - 连接错误处理
+    """
+    
+    def __init__(self, host: str = None, port: int = None, 
+                 user: str = None, password: str = None, 
+                 database: str = None, charset: str = None,
+                 pool_size: int = None, **kwargs):
+        """
+        初始化 MySQL 连接池
+        
+        Args:
+            host: MySQL 主机地址
+            port: MySQL 端口号
+            user: MySQL 用户名
+            password: MySQL 密码
+            database: 数据库名称
+            charset: 字符集
+            pool_size: 连接池大小
+            **kwargs: 其他 MySQL 连接参数
+        """
+        # 从环境变量读取配置,优先级:传入参数 > 环境变量 > 默认值
+        self.host = host or mysql_settings.mysql_host
+        self.port = int(port or mysql_settings.mysql_port)
+        self.user = user or mysql_settings.mysql_user
+        self.password = password or mysql_settings.mysql_password
+        self.database = database or mysql_settings.mysql_database
+        self.charset = charset or mysql_settings.mysql_charset
+        self.pool_size = int(pool_size or mysql_settings.mysql_pool_size)
+        self.kwargs = kwargs
+        
+        # 初始化 DBUtils 连接池
+        self._pool = PooledDB(
+            creator=pymysql,
+            maxconnections=self.pool_size,
+            mincached=3,
+            maxcached=8,
+            maxshared=5,
+            blocking=False,
+            maxusage=None,
+            setsession=[],
+            ping=0,
+            host=self.host,
+            port=self.port,
+            user=self.user,
+            password=self.password,
+            database=self.database,
+            charset=self.charset,
+            cursorclass=DictCursor,
+            **kwargs
+        )
+    
+    def get_connection(self) -> pymysql.connections.Connection:
+        """
+        从连接池获取连接
+        
+        Returns:
+            MySQL 连接对象
+        """
+        return self._pool.connection()
+    
+    def close(self):
+        """
+        关闭连接池
+        """
+        # DBUtils 连接池会自动管理连接,无需手动关闭
+        pass
+
+# 简化的接口函数,便于快速使用
+def get_mysql_pool(host: str = None, port: int = None, 
+                  user: str = None, password: str = None, 
+                  database: str = None, charset: str = None,
+                  pool_size: int = None, **kwargs) -> MySQLPool:
+    """
+    获取 MySQL 连接池实例
+    
+    Args:
+        host: MySQL 主机地址
+        port: MySQL 端口号
+        user: MySQL 用户名
+        password: MySQL 密码
+        database: 数据库名称
+        charset: 字符集
+        pool_size: 连接池大小
+        **kwargs: 其他 MySQL 连接参数
+    
+    Returns:
+        MySQL 连接池实例
+    """
+    # Defaults mirror MySQLPool.__init__ so both entry points behave identically
+    return MySQLPool(
+        host=host or mysql_settings.mysql_host,
+        port=int(port or mysql_settings.mysql_port),
+        user=user or mysql_settings.mysql_user,
+        password=password or mysql_settings.mysql_password,
+        database=database or mysql_settings.mysql_database,
+        charset=charset or mysql_settings.mysql_charset,
+        pool_size=int(pool_size or mysql_settings.mysql_pool_size),
+        **kwargs
+    )
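
Because of the @singleton decorator, only the arguments of the first call take effect, which is worth knowing when different modules request different pool sizes:

    from src.utils.mysql.mysql_pool import get_mysql_pool

    pool = get_mysql_pool(pool_size=20)
    same_pool = get_mysql_pool(pool_size=5)    # arguments ignored: same instance
    assert pool is same_pool

    conn = pool.get_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT 1")
    finally:
        conn.close()                           # hands the connection back to PooledDB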

+ 0 - 0
src/utils/ragflow/__init__.py


+ 139 - 0
src/utils/ragflow/agent_service.py

@@ -0,0 +1,139 @@
+from typing import Dict, Any, List, Optional
+
+class AgentService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def create_agent(self, name: str, llm: Dict[str, Any], description: str = None) -> Dict[str, Any]:
+        endpoint = "/api/v1/agents"
+        
+        data = {"name": name, "llm": llm}
+        if description is not None:
+            data["description"] = description
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建代理失败: {response.get('message', '未知错误')}")
+    
+    def update_agent(self, agent_id: str, name: str = None, llm: Dict[str, Any] = None,
+                    description: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/agents/{agent_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if llm is not None:
+            data["llm"] = llm
+        if description is not None:
+            data["description"] = description
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新代理失败: {response.get('message', '未知错误')}")
+    
+    def delete_agent(self, agent_id: str) -> bool:
+        endpoint = f"/api/v1/agents/{agent_id}"
+        
+        response = self.http_client.post(endpoint, json_data={})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除代理失败: {response.get('message', '未知错误')}")
+    
+    def list_agents(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                   desc: bool = True, name: str = None, agent_id: str = None) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/agents"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if name is not None:
+            params["name"] = name
+        if agent_id is not None:
+            params["id"] = agent_id
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出代理失败: {response.get('message', '未知错误')}")
+    
+    def create_agent_session(self, agent_id: str, name: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/agents/{agent_id}/sessions"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建代理会话失败: {response.get('message', '未知错误')}")
+    
+    def list_agent_sessions(self, agent_id: str, page: int = 1, size: int = 20,
+                           orderby: str = "create_time", desc: bool = True,
+                           session_id: str = None, user_id: str = None,
+                           dsl: str = None) -> List[Dict[str, Any]]:
+        endpoint = f"/api/v1/agents/{agent_id}/sessions"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if session_id is not None:
+            params["id"] = session_id
+        if user_id is not None:
+            params["user_id"] = user_id
+        if dsl is not None:
+            params["dsl"] = dsl
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出代理会话失败: {response.get('message', '未知错误')}")
+    
+    def delete_agent_session(self, agent_id: str, session_id: str) -> bool:
+        endpoint = f"/api/v1/agents/{agent_id}/sessions"
+        
+        response = self.http_client.post(endpoint, json_data={"session_ids": [session_id]})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除代理会话失败: {response.get('message', '未知错误')}")
+    
+    def agent_completion(self, agent_id: str, query: str, stream: bool = False,
+                        session_id: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/agents/{agent_id}/completions"
+        
+        data = {"query": query, "stream": stream}
+        if session_id is not None:
+            data["session_id"] = session_id
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"代理完成失败: {response.get('message', '未知错误')}")
+    
+    def get_related_questions(self, dataset_id: str, question: str, top: int = 10) -> List[str]:
+        endpoint = "/api/v1/sessions/related_questions"
+        
+        response = self.http_client.post(endpoint, json_data={
+            "dataset_id": dataset_id,
+            "question": question,
+            "top": top
+        })
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取相关问题失败: {response.get('message', '未知错误')}")

+ 146 - 0
src/utils/ragflow/chat_service.py

@@ -0,0 +1,146 @@
+from typing import Dict, Any, List, Optional
+
+class ChatService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def create_chat(self, name: str, dataset_ids: List[str], llm: Dict[str, Any],
+                   prompt: str = None) -> Dict[str, Any]:
+        endpoint = "/api/v1/chats"
+        
+        data = {
+            "name": name,
+            "dataset_ids": dataset_ids,
+            "llm": llm
+        }
+        if prompt is not None:
+            data["prompt"] = prompt
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建聊天失败: {response.get('message', '未知错误')}")
+    
+    def update_chat(self, chat_id: str, name: str = None, dataset_ids: List[str] = None,
+                   llm: Dict[str, Any] = None, prompt: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats/{chat_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if dataset_ids is not None:
+            data["dataset_ids"] = dataset_ids
+        if llm is not None:
+            data["llm"] = llm
+        if prompt is not None:
+            data["prompt"] = prompt
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新聊天失败: {response.get('message', '未知错误')}")
+    
+    def delete_chats(self, chat_ids: List[str]) -> bool:
+        endpoint = "/api/v1/chats"
+        
+        response = self.http_client.post(endpoint, json_data={"chat_ids": chat_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除聊天失败: {response.get('message', '未知错误')}")
+    
+    def list_chats(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                  desc: bool = True, name: str = None, chat_id: str = None) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/chats"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if name is not None:
+            params["name"] = name
+        if chat_id is not None:
+            params["id"] = chat_id
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出聊天失败: {response.get('message', '未知错误')}")
+    
+    def create_chat_session(self, chat_id: str, name: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats/{chat_id}/sessions"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建会话失败: {response.get('message', '未知错误')}")
+    
+    def update_chat_session(self, chat_id: str, session_id: str, 
+                           name: str = None, message: List[Dict] = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats/{chat_id}/sessions/{session_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if message is not None:
+            data["message"] = message
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新会话失败: {response.get('message', '未知错误')}")
+    
+    def list_chat_sessions(self, chat_id: str, page: int = 1, size: int = 20,
+                          orderby: str = "create_time", desc: bool = True,
+                          session_id: str = None, session_name: str = None) -> List[Dict[str, Any]]:
+        endpoint = f"/api/v1/chats/{chat_id}/sessions"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if session_id is not None:
+            params["id"] = session_id
+        if session_name is not None:
+            params["name"] = session_name
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出会话失败: {response.get('message', '未知错误')}")
+    
+    def delete_chat_session(self, chat_id: str, session_id: str) -> bool:
+        endpoint = f"/api/v1/chats/{chat_id}/sessions"
+        
+        response = self.http_client.post(endpoint, json_data={"session_ids": [session_id]})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除会话失败: {response.get('message', '未知错误')}")
+    
+    def chat_completion(self, chat_id: str, query: str, stream: bool = False,
+                       session_id: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats/{chat_id}/completions"
+        
+        data = {"query": query, "stream": stream}
+        if session_id is not None:
+            data["session_id"] = session_id
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"聊天完成失败: {response.get('message', '未知错误')}")

+ 74 - 0
src/utils/ragflow/chunk_record.py

@@ -0,0 +1,74 @@
+"""
+RagFlow Chunk 上传记录管理模块
+
+该模块负责处理 RagFlow Chunk 上传记录的数据库操作,包括:
+- 记录 Chunk 上传任务到定时任务表
+- 提供统一的接口供外部调用
+"""
+import json
+from typing import Dict, Any, Optional
+from src.utils.mysql import get_global_mysql_client
+from datetime import datetime, timedelta
+
+class ChunkRecordService:
+    """Chunk 上传记录服务"""
+    
+    def __init__(self):
+        """初始化 Chunk 记录服务"""
+        self.mysql_client = get_global_mysql_client()
+    
+    def record_chunk_add(self, database_name: str, table_name: str, chunk_id: str,
+                         cond: Optional[str] = None, data: Optional[Dict[str, Any]] = None) -> None:
+        """
+        Record a chunk upload into the MySQL scheduled-task table
+        
+        Args:
+            database_name: database name
+            table_name: table name
+            chunk_id: chunk ID returned by a successful upload
+            cond: condition string supplied by the caller
+            data: update payload supplied by the caller
+        """
+        # Schedule the task to run 60 seconds from now
+        scheduled_time = datetime.now() + timedelta(seconds=60)
+
+        # At least one of cond / data must be provided
+        if not cond and not data:
+            raise ValueError("at least one of cond / data must be provided")
+        
+        # Prepare the row to insert (status is always "未执行")
+        insert_data = {
+            "database_name": database_name,
+            "table_name": table_name,
+            "chunk_id": chunk_id,
+            "cond": cond,
+            "update_data": json.dumps(data or {}),
+            "scheduled_time": scheduled_time,
+            "status": "未执行"
+        }
+        
+        try:
+            # 插入记录到 MySQL 定时任务表
+            self.mysql_client.execute(
+                "INSERT INTO ragflow_chunk_record (database_name, table_name, chunk_id, cond, update_data, scheduled_time, status) "
+                "VALUES (%(database_name)s, %(table_name)s, %(chunk_id)s, %(cond)s, %(update_data)s, %(scheduled_time)s, %(status)s)",
+                insert_data
+            )
+        except Exception as e:
+            raise Exception(f"Failed to record chunk upload: {e}")
+
+# 创建全局实例
+_chunk_record_service = None
+
+def get_chunk_record_service() -> ChunkRecordService:
+    """
+    获取 Chunk 记录服务实例(单例模式)
+    
+    Returns:
+        ChunkRecordService 实例
+    """
+    global _chunk_record_service
+    if _chunk_record_service is None:
+        _chunk_record_service = ChunkRecordService()
+    return _chunk_record_service

+ 78 - 0
src/utils/ragflow/chunk_service.py

@@ -0,0 +1,78 @@
+from typing import Dict, Any, List, Optional
+
+class ChunkService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def create_chunk(self, dataset_id: str, document_id: str, content: str, 
+                    important_keywords: Optional[List[str]] = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks"
+        
+        data = {"content": content}
+        if important_keywords is not None:
+            data["important_keywords"] = important_keywords
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建切片失败: {response.get('message', '未知错误')}")
+    
+    def update_chunk(self, dataset_id: str, chunk_id: str, content: str = None,
+                    important_keywords: List[str] = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/chunks/{chunk_id}"
+        
+        data = {}
+        if content is not None:
+            data["content"] = content
+        if important_keywords is not None:
+            data["important_keywords"] = important_keywords
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新切片失败: {response.get('message', '未知错误')}")
+    
+    def delete_chunk(self, dataset_id: str, chunk_id: str) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/chunks/{chunk_id}"
+        
+        response = self.http_client.post(endpoint, json_data={})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除切片失败: {response.get('message', '未知错误')}")
+    
+    def delete_chunks(self, dataset_id: str, document_id: str, chunk_ids: List[str]) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks"
+        
+        response = self.http_client.post(endpoint, json_data={"chunk_ids": chunk_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"批量删除切片失败: {response.get('message', '未知错误')}")
+    
+    def retrieval(self, dataset_ids: List[str], query: str, top_k: int = 5,
+                 similarity_threshold: float = 0.1, vector_similarity_weight: float = 0.3,
+                 refine: bool = False) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/retrieval"
+        
+        data = {
+            "dataset_ids": dataset_ids,
+            "query": query,
+            "top_k": top_k,
+            "similarity_threshold": similarity_threshold,
+            "vector_similarity_weight": vector_similarity_weight,
+            "refine": refine
+        }
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"检索失败: {response.get('message', '未知错误')}")

+ 181 - 0
src/utils/ragflow/dataset_service.py

@@ -0,0 +1,181 @@
+from typing import Dict, Any, List, Optional
+
+class DatasetService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def create_dataset(self, name: str, description: str = None, 
+                      embedding_model: str = None, permission: str = None,
+                      chunk_method: str = None, parser_config: dict = None) -> Dict[str, Any]:
+        endpoint = "/api/v1/datasets"
+        
+        data = {"name": name}
+        if description is not None:
+            data["description"] = description
+        if embedding_model is not None:
+            data["embedding_model"] = embedding_model
+        if permission is not None:
+            data["permission"] = permission
+        if chunk_method is not None:
+            data["chunk_method"] = chunk_method
+        if parser_config is not None:
+            data["parser_config"] = parser_config
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建数据集失败: {response.get('message', '未知错误')}")
+    
+    def delete_datasets(self, dataset_ids: List[str]) -> bool:
+        endpoint = "/api/v1/datasets"
+        
+        response = self.http_client.delete(endpoint, json_data={"ids": dataset_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除数据集失败: {response.get('message', '未知错误')}")
+    
+    def update_dataset(self, dataset_id: str, name: str = None, 
+                      description: str = None, embedding_model: str = None,
+                      permission: str = None, chunk_method: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if description is not None:
+            data["description"] = description
+        if embedding_model is not None:
+            data["embedding_model"] = embedding_model
+        if permission is not None:
+            data["permission"] = permission
+        if chunk_method is not None:
+            data["chunk_method"] = chunk_method
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新数据集失败: {response.get('message', '未知错误')}")
+    
+    def list_datasets(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                     desc: bool = True, name: str = None, dataset_id: str = None) -> Optional[List[Dict[str, Any]]]:
+        endpoint = "/api/v1/datasets"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if name is not None:
+            params["name"] = name
+        if dataset_id is not None:
+            params["id"] = dataset_id
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        elif response.get("code") == 108:
+            # Code 108 is treated as "no matching dataset": return None rather than raise
+            return None
+        else:
+            raise Exception(f"列出数据集失败: {response.get('message', '未知错误')}")
+    
+    def get_dataset(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取数据集失败: {response.get('message', '未知错误')}")
+    
+    def get_knowledge_graph(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/knowledge_graph"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"获取知识图谱失败: {response.get('message', '未知错误')}")
+    
+    def delete_knowledge_graph(self, dataset_id: str) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/knowledge_graph"
+        
+        response = self.http_client.post(endpoint, json_data={})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除知识图谱失败: {response.get('message', '未知错误')}")
+    
+    def trace_graphrag(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/trace_graphrag"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"获取GraphRAG追踪失败: {response.get('message', '未知错误')}")
+    
+    def trace_raptor(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/trace_raptor"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"获取RAPTOR追踪失败: {response.get('message', '未知错误')}")
+    
+    def get_metadata_summary(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/metadata/summary"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"获取元数据摘要失败: {response.get('message', '未知错误')}")
+    
+    def update_metadata(self, dataset_id: str, metadata: Dict = None,
+                       document_ids: List[str] = None, metadata_condition: Dict = None) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/metadata/update"
+        
+        data = {}
+        if metadata is not None:
+            data["metadata"] = metadata
+        if document_ids is not None:
+            data["document_ids"] = document_ids
+        if metadata_condition is not None:
+            data["metadata_condition"] = metadata_condition
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"更新元数据失败: {response.get('message', '未知错误')}")
+    
+    def run_graphrag(self, dataset_id: str, mode: str = "light") -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/run_graphrag"
+        
+        response = self.http_client.post(endpoint, json_data={"mode": mode})
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"运行GraphRAG失败: {response.get('message', '未知错误')}")
+    
+    def run_raptor(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/run_raptor"
+        
+        response = self.http_client.post(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"运行RAPTOR失败: {response.get('message', '未知错误')}")

+ 127 - 0
src/utils/ragflow/document_service.py

@@ -0,0 +1,127 @@
+from typing import Dict, Any, List, Optional
+
+class DocumentService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def upload_document(self, dataset_id: str, file_path: str) -> List[Dict[str, Any]]:
+        import os
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents"
+        
+        with open(file_path, 'rb') as f:
+            # 使用os.path.basename获取文件名,兼容Windows和Linux
+            files = {'file': (os.path.basename(file_path), f)}
+            # 不设置Content-Type头,让requests库自动生成正确的multipart/form-data头
+            response = self.http_client.post(endpoint, files=files)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"上传文档失败: {response.get('message', '未知错误')}")
+    
+    def update_document(self, dataset_id: str, document_id: str, 
+                       name: str = None, meta_fields: Dict = None, 
+                       chunk_method: str = None, parser_config: Dict = None,
+                       enabled: int = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if meta_fields is not None:
+            data["meta_fields"] = meta_fields
+        if chunk_method is not None:
+            data["chunk_method"] = chunk_method
+        if parser_config is not None:
+            data["parser_config"] = parser_config
+        if enabled is not None:
+            data["enabled"] = enabled
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新文档失败: {response.get('message', '未知错误')}")
+    
+    def delete_document(self, dataset_id: str, document_id: str) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}"
+        
+        response = self.http_client.post(endpoint, json_data={})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除文档失败: {response.get('message', '未知错误')}")
+    
+    def delete_documents(self, dataset_id: str, document_ids: List[str]) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents"
+        
+        response = self.http_client.post(endpoint, json_data={"document_ids": document_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"批量删除文档失败: {response.get('message', '未知错误')}")
+    
+    def get_document(self, dataset_id: str, document_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取文档失败: {response.get('message', '未知错误')}")
+    
+    def list_documents(self, dataset_id: str, page: int = 1, size: int = 20,
+                      keywords: str = None, document_id: str = None, document_name: str = None,
+                      suffix: str = None, run: str = None) -> List[Dict[str, Any]]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents"
+        
+        params = {"page": page, "page_size": size}
+        if keywords is not None:
+            params["keywords"] = keywords
+        if document_id is not None:
+            params["id"] = document_id
+        if document_name is not None:
+            params["name"] = document_name
+        if suffix is not None:
+            params["suffix"] = suffix
+        if run is not None:
+            params["run"] = run
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出文档失败: {response.get('message', '未知错误')}")
+    
+    def get_document_chunks(self, dataset_id: str, document_id: str,
+                           keywords: str = None, page: int = 1, size: int = 20,
+                           chunk_id: str = None) -> List[Dict[str, Any]]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks"
+        
+        params = {"page": page, "page_size": size}
+        if keywords is not None:
+            params["keywords"] = keywords
+        if chunk_id is not None:
+            params["id"] = chunk_id
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取文档切片失败: {response.get('message', '未知错误')}")
+    
+    def parse_document(self, dataset_id: str, document_ids: List[str]) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/chunks"
+        
+        response = self.http_client.post(endpoint, json_data={"document_ids": document_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"解析文档失败: {response.get('message', '未知错误')}")

+ 141 - 0
src/utils/ragflow/file_service.py

@@ -0,0 +1,141 @@
+from typing import Dict, Any, List, Optional
+
+class FileService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def list_files(self, parent_id: str = None, keywords: str = None,
+                  page: int = 1, size: int = 20, orderby: str = "create_time",
+                  desc: bool = True) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/file/list"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if parent_id is not None:
+            params["parent_id"] = parent_id
+        if keywords is not None:
+            params["keywords"] = keywords
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出文件失败: {response.get('message', '未知错误')}")
+    
+    def get_root_folder(self) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/root_folder"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取根目录失败: {response.get('message', '未知错误')}")
+    
+    def get_parent_folder(self, file_id: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/parent_folder"
+        
+        response = self.http_client.get(endpoint, params={"file_id": file_id})
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取父目录失败: {response.get('message', '未知错误')}")
+    
+    def get_all_parent_folders(self, file_id: str) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/file/all_parent_folder"
+        
+        response = self.http_client.get(endpoint, params={"file_id": file_id})
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取所有父目录失败: {response.get('message', '未知错误')}")
+    
+    def get_file(self, file_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/file/get/{file_id}"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取文件失败: {response.get('message', '未知错误')}")
+    
+    def upload_file(self, file_path: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/upload"
+        
+        with open(file_path, 'rb') as f:
+            files = {'file': (file_path.split('/')[-1], f)}
+            # Let the HTTP layer generate the multipart Content-Type (with its
+            # boundary); a hardcoded 'multipart/form-data' header breaks the upload.
+            response = self.http_client.post(endpoint, files=files)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"上传文件失败: {response.get('message', '未知错误')}")
+    
+    def create_file(self, file_id: str, tenant_id: str = None) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/create"
+        
+        data = {"file_id": file_id}
+        if tenant_id is not None:
+            data["tenant_id"] = tenant_id
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"创建文件失败: {response.get('message', '未知错误')}")
+    
+    def delete_file(self, file_id: str) -> bool:
+        endpoint = "/api/v1/file/rm"
+        
+        response = self.http_client.post(endpoint, json_data={"file_id": file_id})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除文件失败: {response.get('message', '未知错误')}")
+    
+    def rename_file(self, file_id: str, new_name: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/rename"
+        
+        data = {
+            "file_id": file_id,
+            "new_name": new_name
+        }
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"Failed to rename file: {response.get('message', 'unknown error')}")
+    
+    def move_file(self, file_id: str, parent_id: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/mv"
+        
+        data = {
+            "file_id": file_id,
+            "parent_id": parent_id
+        }
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"Failed to move file: {response.get('message', 'unknown error')}")
+    
+    def convert_file(self, file_id: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/convert"
+        
+        response = self.http_client.post(endpoint, json={"file_id": file_id})
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"转换文件失败: {response.get('message', '未知错误')}")

+ 45 - 0
src/utils/ragflow/openai_service.py

@@ -0,0 +1,45 @@
+from typing import Dict, Any, List, Optional
+
+class OpenAICompatibleService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def chat_completion(self, chat_id: str, messages: List[Dict[str, Any]], 
+                       stream: bool = False, model: str = "model",
+                       extra_body: Dict = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats_openai/{chat_id}/chat/completions"
+        
+        data = {
+            "model": model,
+            "messages": messages,
+            "stream": stream
+        }
+        if extra_body is not None:
+            data["extra_body"] = extra_body
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", response)
+        else:
+            raise Exception(f"聊天完成失败: {response.get('message', '未知错误')}")
+    
+    def agent_completion(self, agent_id: str, messages: List[Dict[str, Any]], 
+                        stream: bool = False, model: str = "model",
+                        session_id: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/agents_openai/{agent_id}/chat/completions"
+        
+        data = {
+            "model": model,
+            "messages": messages,
+            "stream": stream
+        }
+        if session_id is not None:
+            data["session_id"] = session_id
+        
+        response = self.http_client.post(endpoint, json_data=data)
+        
+        # Same convention as chat_completion: only errors carry the
+        # 'code'/'message' envelope; successes are plain completion objects.
+        if isinstance(response, dict) and response.get("code") not in (None, 0):
+            raise Exception(f"Agent completion failed: {response.get('message', 'unknown error')}")
+        return response.get("data", response)

+ 302 - 0
src/utils/ragflow/ragflow_service.py

@@ -0,0 +1,302 @@
+import sys
+import os
+from typing import Dict, Any, List, Optional
+from dataclasses import dataclass
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from src.utils.http_client import HTTPClient
+from src.conf.settings import ragflow_settings
+from src.utils.ragflow.dataset_service import DatasetService
+from src.utils.ragflow.document_service import DocumentService
+from src.utils.ragflow.chunk_service import ChunkService
+from src.utils.ragflow.chat_service import ChatService
+from src.utils.ragflow.agent_service import AgentService
+from src.utils.ragflow.file_service import FileService
+from src.utils.ragflow.openai_service import OpenAICompatibleService
+
+@dataclass
+class DocumentInfo:
+    id: str
+    name: str
+    type: str
+    size: int
+    location: str
+    dataset_id: str
+    chunk_method: str
+    chunk_count: Optional[int] = None
+    token_count: Optional[int] = None
+    run: str = "UNSTART"
+    status: str = "1"
+
+@dataclass
+class ChunkInfo:
+    id: str
+    document_id: str
+    content: str
+    document_name: str
+    dataset_id: str
+    similarity: float = 0.0
+    vector_similarity: float = 0.0
+    term_similarity: float = 0.0
+
+@dataclass
+class DatasetInfo:
+    id: str
+    name: str
+    description: Optional[str] = None
+    embedding_model: Optional[str] = None
+    permission: Optional[str] = None
+    chunk_method: Optional[str] = None
+    chunk_count: int = 0
+    document_count: int = 0
+    token_count: int = 0
+    status: str = "1"
+
+@dataclass
+class ChatInfo:
+    id: str
+    name: str
+    dataset_ids: List[str]
+    llm: Dict[str, Any]
+    prompt: str
+
+@dataclass
+class AgentInfo:
+    id: str
+    name: str
+    llm: Dict[str, Any]
+    description: Optional[str] = None
+
+@dataclass
+class FileInfo:
+    id: str
+    parent_id: str
+    name: str
+    type: str
+    size: int
+
+class RAGFlowService:
+    def __init__(self, base_url: str = None, api_key: str = None):
+        base_url = base_url or ragflow_settings.ragflow_api_url
+        api_key = api_key or ragflow_settings.ragflow_api_key
+        self.http_client = HTTPClient(base_url=base_url, api_key=api_key)
+        
+        self.dataset_service = DatasetService(self.http_client)
+        self.document_service = DocumentService(self.http_client)
+        self.chunk_service = ChunkService(self.http_client)
+        self.chat_service = ChatService(self.http_client)
+        self.agent_service = AgentService(self.http_client)
+        self.file_service = FileService(self.http_client)
+        self.openai_service = OpenAICompatibleService(self.http_client)
+    
+    def create_dataset(self, name: str, description: str = None, 
+                      embedding_model: str = None, permission: str = None,
+                      chunk_method: str = None, parser_config: dict = None) -> DatasetInfo:
+        return self.dataset_service.create_dataset(name, description, embedding_model, permission, chunk_method, parser_config)
+    
+    def delete_datasets(self, dataset_ids: List[str]) -> bool:
+        return self.dataset_service.delete_datasets(dataset_ids)
+    
+    def update_dataset(self, dataset_id: str, name: str = None, 
+                      description: str = None, embedding_model: str = None,
+                      permission: str = None, chunk_method: str = None) -> DatasetInfo:
+        return self.dataset_service.update_dataset(dataset_id, name, description, embedding_model, permission, chunk_method)
+    
+    def list_datasets(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                     desc: bool = True, name: str = None, dataset_id: str = None) -> List[DatasetInfo]:
+        return self.dataset_service.list_datasets(page, size, orderby, desc, name, dataset_id)
+    
+    def get_dataset(self, name: Optional[str] = None, dataset_id: Optional[str] = None) -> Optional[DatasetInfo]:
+        _list = self.list_datasets(name=name, dataset_id=dataset_id)
+        if not _list:
+            return None
+        # The name/dataset_id filters make the first hit the intended match.
+        return _list[0]
+    
+    def get_knowledge_graph(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.get_knowledge_graph(dataset_id)
+    
+    def delete_knowledge_graph(self, dataset_id: str) -> bool:
+        return self.dataset_service.delete_knowledge_graph(dataset_id)
+    
+    def trace_graphrag(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.trace_graphrag(dataset_id)
+    
+    def trace_raptor(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.trace_raptor(dataset_id)
+    
+    def get_metadata_summary(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.get_metadata_summary(dataset_id)
+    
+    def update_metadata(self, dataset_id: str, metadata: Dict = None,
+                       document_ids: List[str] = None, metadata_condition: Dict = None) -> bool:
+        return self.dataset_service.update_metadata(dataset_id, metadata, document_ids, metadata_condition)
+    
+    def run_graphrag(self, dataset_id: str, mode: str = "light") -> Dict[str, Any]:
+        return self.dataset_service.run_graphrag(dataset_id, mode)
+    
+    def run_raptor(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.run_raptor(dataset_id)
+    
+    def upload_document(self, dataset_id: str, file_path: str) -> List[DocumentInfo]:
+        return self.document_service.upload_document(dataset_id, file_path)
+    
+    def update_document(self, dataset_id: str, document_id: str, 
+                       name: str = None, meta_fields: Dict = None, 
+                       chunk_method: str = None, parser_config: Dict = None,
+                       enabled: int = None) -> DocumentInfo:
+        return self.document_service.update_document(dataset_id, document_id, name, meta_fields, chunk_method, parser_config, enabled)
+    
+    def delete_document(self, dataset_id: str, document_id: str) -> bool:
+        return self.document_service.delete_document(dataset_id, document_id)
+    
+    def delete_documents(self, dataset_id: str, document_ids: List[str]) -> bool:
+        return self.document_service.delete_documents(dataset_id, document_ids)
+    
+    def get_document(self, dataset_id: str, document_id: str) -> DocumentInfo:
+        return self.document_service.get_document(dataset_id, document_id)
+    
+    def list_documents(self, dataset_id: str, page: int = 1, size: int = 20,
+                      keywords: str = None, document_id: str = None, document_name: str = None,
+                      suffix: str = None, run: str = None) -> List[DocumentInfo]:
+        return self.document_service.list_documents(dataset_id, page, size, keywords, document_id, document_name, suffix, run)
+    
+    def get_document_chunks(self, dataset_id: str, document_id: str,
+                           keywords: str = None, page: int = 1, size: int = 20,
+                           chunk_id: str = None) -> List[ChunkInfo]:
+        return self.document_service.get_document_chunks(dataset_id, document_id, keywords, page, size, chunk_id)
+    
+    def parse_document(self, dataset_id: str, document_ids: List[str]) -> bool:
+        return self.document_service.parse_document(dataset_id, document_ids)
+    
+    def create_chunk(self, dataset_id: str, document_id: str, content: str, 
+                    important_keywords: List[str] = None) -> ChunkInfo:
+        return self.chunk_service.create_chunk(dataset_id, document_id, content, important_keywords)
+    
+    def update_chunk(self, dataset_id: str, chunk_id: str, content: str = None,
+                    important_keywords: List[str] = None) -> ChunkInfo:
+        return self.chunk_service.update_chunk(dataset_id, chunk_id, content, important_keywords)
+    
+    def delete_chunk(self, dataset_id: str, chunk_id: str) -> bool:
+        return self.chunk_service.delete_chunk(dataset_id, chunk_id)
+    
+    def delete_chunks(self, dataset_id: str, document_id: str, chunk_ids: List[str]) -> bool:
+        return self.chunk_service.delete_chunks(dataset_id, document_id, chunk_ids)
+    
+    def retrieval(self, dataset_ids: List[str], query: str, top_k: int = 5,
+                 similarity_threshold: float = 0.1, vector_similarity_weight: float = 0.3,
+                 refine: bool = False) -> List[ChunkInfo]:
+        return self.chunk_service.retrieval(dataset_ids, query, top_k, similarity_threshold, vector_similarity_weight, refine)
+    
+    def create_chat(self, name: str, dataset_ids: List[str], llm: Dict[str, Any],
+                   prompt: str = None) -> ChatInfo:
+        return self.chat_service.create_chat(name, dataset_ids, llm, prompt)
+    
+    def update_chat(self, chat_id: str, name: str = None, dataset_ids: List[str] = None,
+                   llm: Dict[str, Any] = None, prompt: str = None) -> ChatInfo:
+        return self.chat_service.update_chat(chat_id, name, dataset_ids, llm, prompt)
+    
+    def delete_chats(self, chat_ids: List[str]) -> bool:
+        return self.chat_service.delete_chats(chat_ids)
+    
+    def list_chats(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                  desc: bool = True, name: str = None, chat_id: str = None) -> List[ChatInfo]:
+        return self.chat_service.list_chats(page, size, orderby, desc, name, chat_id)
+    
+    def create_chat_session(self, chat_id: str, name: str = None) -> Dict[str, Any]:
+        return self.chat_service.create_chat_session(chat_id, name)
+    
+    def update_chat_session(self, chat_id: str, session_id: str, 
+                           name: str = None, message: List[Dict] = None) -> Dict[str, Any]:
+        return self.chat_service.update_chat_session(chat_id, session_id, name, message)
+    
+    def list_chat_sessions(self, chat_id: str, page: int = 1, size: int = 20,
+                          orderby: str = "create_time", desc: bool = True,
+                          session_id: str = None, session_name: str = None) -> List[Dict[str, Any]]:
+        return self.chat_service.list_chat_sessions(chat_id, page, size, orderby, desc, session_id, session_name)
+    
+    def delete_chat_session(self, chat_id: str, session_id: str) -> bool:
+        return self.chat_service.delete_chat_session(chat_id, session_id)
+    
+    def chat_completion(self, chat_id: str, query: str, stream: bool = False,
+                       session_id: str = None) -> Dict[str, Any]:
+        return self.chat_service.chat_completion(chat_id, query, stream, session_id)
+    
+    def create_agent(self, name: str, llm: Dict[str, Any], description: str = None) -> AgentInfo:
+        return self.agent_service.create_agent(name, llm, description)
+    
+    def update_agent(self, agent_id: str, name: str = None, llm: Dict[str, Any] = None,
+                    description: str = None) -> AgentInfo:
+        return self.agent_service.update_agent(agent_id, name, llm, description)
+    
+    def delete_agent(self, agent_id: str) -> bool:
+        return self.agent_service.delete_agent(agent_id)
+    
+    def list_agents(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                   desc: bool = True, name: str = None, agent_id: str = None) -> List[AgentInfo]:
+        return self.agent_service.list_agents(page, size, orderby, desc, name, agent_id)
+    
+    def create_agent_session(self, agent_id: str, name: str = None) -> Dict[str, Any]:
+        return self.agent_service.create_agent_session(agent_id, name)
+    
+    def list_agent_sessions(self, agent_id: str, page: int = 1, size: int = 20,
+                           orderby: str = "create_time", desc: bool = True,
+                           session_id: str = None, user_id: str = None,
+                           dsl: str = None) -> List[Dict[str, Any]]:
+        return self.agent_service.list_agent_sessions(agent_id, page, size, orderby, desc, session_id, user_id, dsl)
+    
+    def delete_agent_session(self, agent_id: str, session_id: str) -> bool:
+        return self.agent_service.delete_agent_session(agent_id, session_id)
+    
+    def agent_completion(self, agent_id: str, query: str, stream: bool = False,
+                        session_id: str = None) -> Dict[str, Any]:
+        return self.agent_service.agent_completion(agent_id, query, stream, session_id)
+    
+    def get_related_questions(self, dataset_id: str, question: str, top: int = 10) -> List[str]:
+        return self.agent_service.get_related_questions(dataset_id, question, top)
+    
+    def list_files(self, parent_id: str = None, keywords: str = None,
+                  page: int = 1, size: int = 20, orderby: str = "create_time",
+                  desc: bool = True) -> List[FileInfo]:
+        return self.file_service.list_files(parent_id, keywords, page, size, orderby, desc)
+    
+    def get_root_folder(self) -> Dict[str, Any]:
+        return self.file_service.get_root_folder()
+    
+    def get_parent_folder(self, file_id: str) -> Dict[str, Any]:
+        return self.file_service.get_parent_folder(file_id)
+    
+    def get_all_parent_folders(self, file_id: str) -> List[Dict[str, Any]]:
+        return self.file_service.get_all_parent_folders(file_id)
+    
+    def get_file(self, file_id: str) -> Dict[str, Any]:
+        return self.file_service.get_file(file_id)
+    
+    def upload_file(self, file_path: str) -> Dict[str, Any]:
+        return self.file_service.upload_file(file_path)
+    
+    def create_file(self, file_id: str, tenant_id: str = None) -> Dict[str, Any]:
+        return self.file_service.create_file(file_id, tenant_id)
+    
+    def delete_file(self, file_id: str) -> bool:
+        return self.file_service.delete_file(file_id)
+    
+    def rename_file(self, file_id: str, new_name: str) -> Dict[str, Any]:
+        return self.file_service.rename_file(file_id, new_name)
+    
+    def move_file(self, file_id: str, parent_id: str) -> Dict[str, Any]:
+        return self.file_service.move_file(file_id, parent_id)
+    
+    def convert_file(self, file_id: str) -> Dict[str, Any]:
+        return self.file_service.convert_file(file_id)
+    
+    def openai_chat_completion(self, chat_id: str, messages: List[Dict[str, Any]], 
+                              stream: bool = False, model: str = "model",
+                              extra_body: Dict = None) -> Dict[str, Any]:
+        return self.openai_service.chat_completion(chat_id, messages, stream, model, extra_body)
+    
+    def openai_agent_completion(self, agent_id: str, messages: List[Dict[str, Any]], 
+                               stream: bool = False, model: str = "model",
+                               session_id: str = None) -> Dict[str, Any]:
+        return self.openai_service.agent_completion(agent_id, messages, stream, model, session_id)
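An end-to-end sketch of the facade, assuming ragflow_settings is configured and that the delegated services hydrate the dataclasses declared above (the visible DocumentService.get_document_chunks actually returns raw dicts, so attribute access on chunks is illustrative):

from src.utils.ragflow.ragflow_service import RAGFlowService

rag = RAGFlowService()  # falls back to ragflow_settings for URL and key

ds = rag.create_dataset(name="demo", description="smoke test")
rag.upload_document(ds.id, "/tmp/handbook.pdf")        # placeholder path
doc_ids = [d.id for d in rag.list_documents(ds.id)]
rag.parse_document(ds.id, doc_ids)

hits = rag.retrieval([ds.id], query="vacation policy", top_k=5)
for hit in hits:
    print(hit.content[:80] if hasattr(hit, "content") else hit)

The facade keeps HTTPClient construction in one place, so switching deployments only means passing a different base_url/api_key pair to RAGFlowService.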