yingge 3 kuukautta sitten
vanhempi
commit
3a375c66d2
85 muutettua tiedostoa jossa 385 lisäystä ja 5092 poistoa
  1. 34 3
      .env.example
  2. 4 0
      .gitignore
  3. BIN
      __pycache__/main.cpython-312.pyc
  4. BIN
      api/__pycache__/__init__.cpython-312.pyc
  5. BIN
      api/__pycache__/hybrid_search_http.cpython-312.pyc
  6. BIN
      api/__pycache__/search_infinity.cpython-312.pyc
  7. 3 0
      api/db/services/__init__.py
  8. BIN
      api/db/services/__pycache__/__init__.cpython-312.pyc
  9. BIN
      api/db/services/__pycache__/infinity_search_service.cpython-312.pyc
  10. 150 0
      api/db/services/infinity_search_service.py
  11. 0 136
      api/hybrid_search_http.py
  12. 0 55
      api/hybrid_search_http_example.py
  13. 0 35
      api/hybrid_search_http_example.sh
  14. 76 0
      api/search_infinity.py
  15. 0 252
      doc/README.md
  16. 0 446
      doc/design.md
  17. 64 0
      main.py
  18. BIN
      model/__pycache__/multimodal_embedding.cpython-312.pyc
  19. 15 2
      model/multimodal_embedding.py
  20. 0 43
      test/check_infinity_sdk.py
  21. 0 172
      test/main.py
  22. 0 58
      test/test_compress_image_bytes.py
  23. 0 121
      test/test_es_conn.py
  24. 0 74
      test/test_fastapi_hybrid_search.py
  25. 0 82
      test/test_file_upload.py
  26. 0 85
      test/test_full_service.py
  27. 0 61
      test/test_http_hybrid_search.py
  28. 0 58
      test/test_image_compression.py
  29. 0 60
      test/test_image_compression_bytes.py
  30. 0 100
      test/test_image_compression_fix.py
  31. 0 132
      test/test_image_compression_real.py
  32. 0 180
      test/test_infinity_encapsulation.py
  33. 0 56
      test/test_infinity_http.py
  34. 0 60
      test/test_infinity_sdk.py
  35. 0 6
      test/test_mcp.py
  36. 0 72
      test/test_mcp_hybrid_search.py
  37. 0 47
      test/test_mcp_simple.py
  38. 0 59
      test/test_multimodal_embedding.py
  39. 0 126
      test/test_mysql_conn.py
  40. 0 34
      test/test_simple.py
  41. 0 39
      test/test_upload_document.py
  42. 0 104
      test/test_vector_db.py
  43. 0 71
      test/test_workflow.py
  44. 0 99
      test/verify_pdf_splitter.py
  45. 0 38
      test/vl_embedding_test.py
  46. BIN
      utils/__pycache__/__init__.cpython-312.pyc
  47. 0 17
      utils/es/__init__.py
  48. BIN
      utils/es/__pycache__/__init__.cpython-312.pyc
  49. BIN
      utils/es/__pycache__/base.cpython-312.pyc
  50. BIN
      utils/es/__pycache__/constants.cpython-312.pyc
  51. BIN
      utils/es/__pycache__/document.cpython-312.pyc
  52. BIN
      utils/es/__pycache__/index.cpython-312.pyc
  53. BIN
      utils/es/__pycache__/search.cpython-312.pyc
  54. BIN
      utils/es/__pycache__/templates.cpython-312.pyc
  55. 0 68
      utils/es/base.py
  56. 0 25
      utils/es/constants.py
  57. 0 192
      utils/es/document.py
  58. 0 131
      utils/es/index.py
  59. 0 202
      utils/es/search.py
  60. 0 203
      utils/es/templates.py
  61. 0 138
      utils/es_conn.py
  62. BIN
      utils/file/__pycache__/file_utils.cpython-312.pyc
  63. BIN
      utils/file/__pycache__/image_util.cpython-312.pyc
  64. 0 0
      utils/file/file_utils.py
  65. 29 4
      utils/file/image_util.py
  66. 0 0
      utils/file/minio/__init__.py
  67. 0 0
      utils/file/minio/__pycache__/__init__.cpython-312.pyc
  68. 0 0
      utils/file/minio/__pycache__/file_utils.cpython-312.pyc
  69. 0 0
      utils/file/minio/__pycache__/image_util.cpython-312.pyc
  70. BIN
      utils/file/minio/__pycache__/minio_util.cpython-312.pyc
  71. 1 1
      utils/file/minio/minio_util.py
  72. 0 30
      utils/infinity/__init__.py
  73. BIN
      utils/infinity/__pycache__/__init__.cpython-312.pyc
  74. BIN
      utils/infinity/__pycache__/client.cpython-312.pyc
  75. 3 3
      utils/infinity/client.py
  76. 0 382
      utils/infinity_util/__init__.py
  77. BIN
      utils/infinity_util/__pycache__/__init__.cpython-312.pyc
  78. BIN
      utils/infinity_util/__pycache__/base.cpython-312.pyc
  79. BIN
      utils/infinity_util/__pycache__/document.cpython-312.pyc
  80. BIN
      utils/infinity_util/__pycache__/index.cpython-312.pyc
  81. BIN
      utils/infinity_util/__pycache__/search.cpython-312.pyc
  82. 0 168
      utils/vector_db.py
  83. 0 555
      vector_search_result.md
  84. 3 3
      workflow/image_parsing_workflow.py
  85. 3 4
      workflow/workflow.py

+ 34 - 3
.env.example

@@ -1,8 +1,39 @@
 # 模型配置
 MODEL_PROVIDER=openai
-MODEL_NAME=qwen3-vl
-BASE_URL=https://api.openai.com/v1
-API_KEY=your-api-key-here
+MODEL_NAME=Qwen/Qwen3-VL-8B-Instruct
+BASE_URL=https://api.siliconflow.cn/v1
+API_KEY=your-api-key-here
+DASHSCOPE=your-dashscope-api-key-here
+# embedding模型配置
+EMBEDDING_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
+MULTIMODAL_EMBEDDING_MODEL_NAME=qwen2.5-vl-embedding
+# RAGFLOW配置
+RAGFLOW_API_URL=http://192.168.16.134/
+RAGFLOW_API_KEY=your-ragflow-api-key-here
+DATASET_ID=a0f1aa03ed2c11f08b8f0242c0a85002
 
 # 应用配置
 LOG_LEVEL=INFO
+
+# 向量数据库配置
+# 可选值: es, infinity
+VECTOR_DB_TYPE=infinity
+
+# Infinity向量数据库配置
+INFINITY_HOST=192.168.16.134
+INFINITY_PORT=23820
+INFINITY_SDK_PORT=23817
+INFINITY_DATABASE=book_image_db
+INFINITY_USER=admin
+INFINITY_PASSWORD=admin
+INFINITY_TABLE_NAME=book_page_image
+
+# MinIO配置
+MINIO_ENDPOINT=192.168.16.134:9000
+MINIO_ACCESS_KEY=your-minio-access-key
+MINIO_SECRET_KEY=your-minio-secret-key
+MINIO_BUCKET_NAME=bookpage
+# 本地测试设为false;生产环境设为true
+MINIO_SECURE=False
+
+

+ 4 - 0
.gitignore

@@ -0,0 +1,4 @@
+__pycache__/
+.infinity_client/
+.trae/
+.model_output/

BIN
__pycache__/main.cpython-312.pyc


BIN
api/__pycache__/__init__.cpython-312.pyc


BIN
api/__pycache__/hybrid_search_http.cpython-312.pyc


BIN
api/__pycache__/search_infinity.cpython-312.pyc


+ 3 - 0
api/db/services/__init__.py

@@ -0,0 +1,3 @@
+# from api.db.services.infinity_search_service import InfinitySearchService
+
+# search_service = InfinitySearchService()

BIN
api/db/services/__pycache__/__init__.cpython-312.pyc


BIN
api/db/services/__pycache__/infinity_search_service.cpython-312.pyc


+ 150 - 0
api/db/services/infinity_search_service.py

@@ -0,0 +1,150 @@
+from typing import Dict, Any, List
+from conf.config import VectorDBConfig
+from utils.infinity import InfinityClient
+from utils.file.image_util import image_util
+from model.multimodal_embedding import get_embedding_model
+
+
+def convert_to_basic_types(obj: Any) -> Any:
+    """Recursively convert ``obj`` into plain Python types for serialisation.
+
+    A dict whose (converted) values are all non-empty lists of one common
+    length is pivoted from column form into a list of row dicts, e.g.
+    ``{"a": [1, 2], "b": [3, 4]}`` -> ``[{"a": 1, "b": 3}, {"a": 2, "b": 4}]``.
+    The pivot is applied at every nesting level.
+
+    Args:
+        obj: Value to convert.
+
+    Returns:
+        ``None``, ``str``, ``int``, ``float`` and ``bool`` unchanged; dicts,
+        lists and tuples converted recursively (tuples become lists); any
+        other object as ``dict(obj)`` when that succeeds, else ``str(obj)``.
+    """
+    if obj is None or isinstance(obj, (str, int, float, bool)):
+        return obj
+
+    if isinstance(obj, dict):
+        # Convert the values first so the pivot check sees plain lists.
+        converted = {k: convert_to_basic_types(v) for k, v in obj.items()}
+        columns = list(converted.values())
+        if columns and all(isinstance(col, list) for col in columns):
+            lengths = {len(col) for col in columns}
+            if len(lengths) == 1 and 0 not in lengths:
+                # All columns have the same non-zero length, so zip() pairs
+                # every key with its i-th element without truncation.
+                keys = list(converted)
+                return [dict(zip(keys, row)) for row in zip(*columns)]
+        return converted
+
+    if isinstance(obj, (list, tuple)):
+        return [convert_to_basic_types(item) for item in obj]
+
+    # Unknown type: best effort. Catch Exception rather than everything so
+    # that KeyboardInterrupt / SystemExit are not swallowed.
+    try:
+        return dict(obj)
+    except Exception:
+        return str(obj)
+
+
+class InfinitySearchService:
+    """Run plain, vector and hybrid searches against one Infinity table.
+
+    Wraps an ``InfinityClient`` and converts every result with
+    ``convert_to_basic_types`` so it can be serialised in an API response.
+    """
+
+    def __init__(self, infinity_client: InfinityClient, vector_field: str = None, match_field: str = None, match_type: str = None, table_name: str = None):
+        """Store the client and resolve per-search defaults.
+
+        Args:
+            infinity_client: Client used to execute every search.
+            vector_field: Vector column name; defaults to ``"dense_vector_1024"``.
+            match_field: Text column name; defaults to ``"content"``.
+            match_type: Defaults to ``"cosine"``; stored but not read by any
+                method of this class.
+            table_name: Table to query; defaults to
+                ``VectorDBConfig.get_infinity_table_name()``.
+        """
+        self.infinity_client = infinity_client
+        # Columns requested for every hit.
+        self.output_fields = [
+            "file_name",
+            "page_number",
+            "content",
+            "image_path",
+            "dataset_id",
+            "document_id",
+        ]
+        self.vector_field = vector_field or "dense_vector_1024"
+        self.match_field = match_field or "content"
+        self.match_type = match_type or "cosine"
+        self.table_name = table_name or VectorDBConfig.get_infinity_table_name()
+
+    def _embed_query(self, search_query: Dict[str, Any]) -> Any:
+        """Embed ``matching_text`` together with the image loaded from ``image_url``."""
+        image = image_util._url_to_image(search_query["image_url"])
+        return get_embedding_model().get_multimodal_embedding(search_query["matching_text"], image)
+
+    @staticmethod
+    def _to_basic(result: Any) -> Any:
+        """Turn a client result into plain Python types."""
+        return convert_to_basic_types(result.to_result())
+
+    def search(self, search_query: Dict[str, Any]) -> Any:
+        """Run a plain search.
+
+        Args:
+            search_query: Query parameters, forwarded unchanged to the client.
+
+        Returns:
+            The result converted to plain types.
+
+        Raises:
+            Exception: Any failure, re-raised with a ``搜索失败`` prefix and
+                the original exception attached as ``__cause__``.
+        """
+        try:
+            result = self.infinity_client.search(self.table_name, self.output_fields, search_query)
+            return self._to_basic(result)
+        except Exception as e:
+            raise Exception(f"搜索失败: {str(e)}") from e
+
+    def vector_search(self, search_query: Dict[str, Any]):
+        """Run a vector search using a multimodal embedding of the query.
+
+        Args:
+            search_query: Must contain ``image_url`` and ``matching_text``.
+                The caller's dict is not modified; ``vector_field`` and
+                ``query_vector`` are added to a copy.
+
+        Returns:
+            The result converted to plain types.
+
+        Raises:
+            Exception: Any failure (including a missing key), re-raised with
+                a ``向量检索失败`` prefix and the original as ``__cause__``.
+        """
+        try:
+            # Build a new dict so the (possibly large) query vector does not
+            # leak back into the caller's request object.
+            query = {
+                **search_query,
+                "vector_field": self.vector_field,
+                "query_vector": self._embed_query(search_query),
+            }
+            result = self.infinity_client.vector_search(self.table_name, self.output_fields, query)
+            return self._to_basic(result)
+        except Exception as e:
+            raise Exception(f"向量检索失败: {str(e)}") from e
+
+    def hybrid_search(self, search_query: Dict[str, Any]):
+        """Run a hybrid (vector + text match) search.
+
+        Args:
+            search_query: Must contain ``image_url`` and ``matching_text``.
+                The caller's dict is not modified; ``vector_field``,
+                ``query_vector`` and ``match_field`` are added to a copy.
+
+        Returns:
+            The result converted to plain types.
+
+        Raises:
+            Exception: Any failure (including a missing key), re-raised with
+                a ``混合检索失败`` prefix and the original as ``__cause__``.
+        """
+        try:
+            query = {
+                **search_query,
+                "vector_field": self.vector_field,
+                "query_vector": self._embed_query(search_query),
+                "match_field": self.match_field,
+            }
+            result = self.infinity_client.hybrid_search(self.table_name, self.output_fields, query)
+            return self._to_basic(result)
+        except Exception as e:
+            raise Exception(f"混合检索失败: {str(e)}") from e
+

+ 0 - 136
api/hybrid_search_http.py

@@ -1,136 +0,0 @@
-#!/usr/bin/env python3
-"""
-混合检索HTTP服务
-使用FastAPI框架实现,提供混合检索的HTTP POST接口
-"""
-
-import sys
-import os
-import requests
-from io import BytesIO
-from typing import List, Dict, Any
-from fastapi import FastAPI, HTTPException, Body
-from pydantic import BaseModel
-from PIL import Image
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from utils.infinity_util import InfinityVectorDB
-from model.multimodal_embedding import Embedding
-from conf.config import ModelConfig, VectorDBConfig
-
-# 初始化FastAPI应用
-app = FastAPI(
-    title="混合检索HTTP服务",
-    description="提供混合检索的HTTP POST接口",
-    version="1.0.0"
-)
-
-# 初始化向量数据库
-vector_db = InfinityVectorDB()
-
-# 初始化多模态嵌入模型
-embedding_model = Embedding(
-    model_name=ModelConfig.get_multimodal_embedding_model_name(),
-    api_key=ModelConfig.get_dashscope_api_key()
-)
-
-# 定义请求模型
-class HybridSearchRequest(BaseModel):
-    """混合检索请求模型"""
-    text_query: str
-    image: str
-    topn: int = 2
-
-# 定义响应模型
-class HybridSearchResponse(BaseModel):
-    """混合检索响应模型"""
-    success: bool
-    message: str
-    output: List[Dict[str, Any]] = []
-    total: int = 0
-
-@app.post("/hybrid_search", response_model=HybridSearchResponse)
-def hybrid_search(request: HybridSearchRequest = Body(...)):
-    """
-    混合检索API
-    使用文本查询和向量查询进行混合检索
-    
-    请求参数:
-    - text_query: 文本查询
-    - image: 图片URL
-    - topn: 返回结果数量,默认2
-    
-    返回结果:
-    - success: 是否成功
-    - message: 结果消息
-    - output: 检索结果列表
-    - total: 总命中数
-    """
-    try:
-        # 解析请求参数
-        text_query = request.text_query
-        image_url = request.image
-        topn = request.topn
-        
-        print(f"开始混合检索,数据库: {VectorDBConfig.get_infinity_database}, 知识库id: {ModelConfig.get_dataset_id()}, 文本查询: {text_query}, 返回数量: {topn}")
-        
-        # 构建索引名称
-        index_name = f"{VectorDBConfig.get_infinity_table_name()}" 
-        print(f"开始生成多模态嵌入,文本长度: {len(text_query)}")
-        
-        # 处理image_url为image: Image.Image
-        if isinstance(image_url, str):
-                # 下载图片
-                response = requests.get(image_url)
-                response.raise_for_status()  # 检查HTTP状态码
-    
-                # 将响应内容转换为字节流
-                image_bytes = BytesIO(response.content)
-    
-                # 创建Image对象
-                image = Image.open(image_bytes)
-        
-        # 生成多模态嵌入向量
-        embedding = embedding_model.get_multimodal_embedding(text_query, image)
-        
-        print(f"多模态嵌入生成完成,向量长度: {len(embedding)}")
-        
-        # 执行混合检索
-        result = vector_db.hybrid_search(
-            index_name=index_name,
-            match_method="dense",
-            vector_field="dense_vector_1024",
-            query_vector=embedding,
-            element_type="float",
-            metric_type="cosine",
-            topn=topn,
-            text_query=text_query,
-            text_field="content"
-        )
-        
-        print(f"混合检索完成,总命中数: {result.get('total', 0)}")
-        
-        # 返回成功响应
-        return HybridSearchResponse(
-            success=True,
-            message="混合检索成功",
-            output=result.get("output", []),
-            total=result.get("total", topn)
-        )
-    except Exception as e:
-        print(f"混合检索失败: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-@app.get("/health")
-def health_check():
-    """健康检查接口"""
-    return {
-        "status": "ok",
-        "message": "混合检索HTTP服务正常运行"
-    }
-
-if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=18001)

+ 0 - 55
api/hybrid_search_http_example.py

@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-混合检索HTTP服务请求示例
-使用Python requests库调用混合检索接口
-"""
-
-import requests
-import json
-
-def hybrid_search_example():
-    """
-    混合检索接口调用示例
-    """
-    # 服务地址
-    base_url = "http://localhost:18001"
-    endpoint = "/hybrid_search"
-    url = f"{base_url}{endpoint}"
-    
-    # 示例1:基本请求(仅文本查询)
-    print("示例1:基本请求(仅文本查询)")
-    payload1 = {
-        "text_query": "这是一个测试文本查询",
-        "topn": 2
-    }
-    
-    response1 = requests.post(url, json=payload1)
-    print(f"状态码: {response1.status_code}")
-    print(f"响应内容: {json.dumps(response1.json(), indent=2, ensure_ascii=False)}")
-    
-    # 示例2:完整请求(文本+图片)
-    print("\n示例2:完整请求(文本+图片)")
-    payload2 = {
-        "text_query": "这是一个带图片的测试查询",
-        "image": "https://example.com/test.jpg",
-        "topn": 5
-    }
-    
-    response2 = requests.post(url, json=payload2)
-    print(f"状态码: {response2.status_code}")
-    print(f"响应内容: {json.dumps(response2.json(), indent=2, ensure_ascii=False)}")
-    
-    # 示例3:使用默认topn值
-    print("\n示例3:使用默认topn值")
-    payload3 = {
-        "text_query": "这是一个使用默认值的测试",
-        "image": "https://example.com/another.jpg"
-    }
-    
-    response3 = requests.post(url, json=payload3)
-    print(f"状态码: {response3.status_code}")
-    print(f"响应内容: {json.dumps(response3.json(), indent=2, ensure_ascii=False)}")
-
-if __name__ == "__main__":
-    hybrid_search_example()

+ 0 - 35
api/hybrid_search_http_example.sh

@@ -1,35 +0,0 @@
-#!/bin/bash
-
-# 混合检索HTTP服务请求示例
-# 服务地址:http://localhost:18001
-# 接口路径:/hybrid_search
-# 请求方法:POST
-# 请求格式:application/json
-
-# 示例1:基本请求(仅文本查询)
-echo "示例1:基本请求(仅文本查询)"
-curl -X POST "http://localhost:18001/hybrid_search" \
-     -H "Content-Type: application/json" \
-     -d '{
-           "text_query": "这是一个测试文本查询",
-           "topn": 2
-         }'
-
-echo -e "\n\n示例2:完整请求(文本+图片)"
-# 示例2:完整请求(文本+图片)
-curl -X POST "http://localhost:18001/hybrid_search" \
-     -H "Content-Type: application/json" \
-     -d '{
-           "text_query": "这是一个带图片的测试查询",
-           "image": "https://example.com/test.jpg",
-           "topn": 5
-         }'
-
-echo -e "\n\n示例3:使用默认topn值"
-# 示例3:使用默认topn值(默认值为2)
-curl -X POST "http://localhost:18001/hybrid_search" \
-     -H "Content-Type: application/json" \
-     -d '{
-           "text_query": "这是一个使用默认值的测试",
-           "image": "https://example.com/another.jpg"
-         }'

+ 76 - 0
api/search_infinity.py

@@ -0,0 +1,76 @@
+# Infinity搜索API服务
+
+from fastapi import FastAPI, HTTPException
+from typing import List, Dict, Any, Optional
+from api.db.services.infinity_search_service import InfinitySearchService
+from utils.infinity import get_client
+
+
+# 创建FastAPI应用
+
+app = FastAPI(
+    title="Infinity Search API",
+    description="基于Infinity向量数据库的搜索API服务",
+    version="1.0.0"
+)
+
+# 请求模型
+from pydantic import BaseModel
+
+class SearchRequest(BaseModel):
+    """Request body shared by the search endpoints."""
+    # Free-form query parameters; handed as-is to InfinitySearchService.
+    search_query: Dict[str, Any]
+
+# 1. 普通搜索接口
+@app.post("/text", response_model=Dict[str, Any])
+def search(request: SearchRequest):
+    """
+    Plain search endpoint.
+
+    - **search_query**: query parameters, forwarded unchanged to
+      `InfinitySearchService.search`. The table name and output fields are
+      chosen by the service, not by the request.
+
+    Returns `{"success": true, "result": ...}`; any exception is reported
+    as HTTP 500.
+    """
+    try:
+        search_service = InfinitySearchService(infinity_client=get_client())
+        result = search_service.search(request.search_query)
+        return {"success": True, "result": result}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"搜索失败: {str(e)}")
+
+# 2. 向量搜索接口
+@app.post("/vector", response_model=Dict[str, Any])
+def vector_search(request: SearchRequest):
+    """
+    Vector search endpoint.
+
+    - **search_query**: must contain `image_url` and `matching_text`; the
+      service embeds them and fills in `vector_field` and `query_vector`
+      itself. The table name and output fields are chosen by the service.
+
+    Returns `{"success": true, "result": ...}`; any exception is reported
+    as HTTP 500.
+    """
+    try:
+        search_service = InfinitySearchService(infinity_client=get_client())
+        result = search_service.vector_search(request.search_query)
+        return {"success": True, "result": result}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"向量搜索失败: {str(e)}")
+
+# 3. 混合搜索接口
+@app.post("/hybrid", response_model=Dict[str, Any])
+def hybrid_search(request: SearchRequest):
+    """
+    Hybrid search endpoint.
+
+    - **search_query**: must contain `image_url` and `matching_text`; the
+      service embeds them and fills in `vector_field`, `query_vector` and
+      `match_field` itself. The table name and output fields are chosen by
+      the service.
+
+    Returns `{"success": true, "result": ...}`; any exception is reported
+    as HTTP 500.
+    """
+    try:
+        search_service = InfinitySearchService(infinity_client=get_client())
+        result = search_service.hybrid_search(request.search_query)
+        return {"success": True, "result": result}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"混合搜索失败: {str(e)}")

+ 0 - 252
doc/README.md

@@ -1,252 +0,0 @@
-# Ragflow_plugs 项目文档
-
-## 1. 项目概述
-
-Ragflow_plugs是一个基于RAG(检索增强生成)技术的多模态混合检索系统,支持文本和图像的联合检索,并提供灵活的HTTP API服务。该系统旨在为智能应用提供高效、准确的多模态信息检索能力。
-
-## 2. 目录结构
-
-```
-├── agent/             # 智能代理模块
-├── api/               # HTTP API服务模块
-├── book/              # 示例文档和输出目录
-├── conf/              # 配置文件
-├── doc/               # 项目文档
-├── model/             # 模型相关模块
-├── parser/            # 文档解析模块
-├── test/              # 测试文件
-├── utils/             # 工具模块
-├── workflow/          # 工作流管理模块
-├── .env               # 环境变量配置
-├── .env.example       # 环境变量示例
-├── requirements.txt   # 项目依赖
-└── __init__.py        # 项目初始化
-```
-
-## 3. 核心功能
-
-### 3.1 多模态嵌入生成
-
-- 支持文本和图像的联合嵌入生成
-- 兼容多种AI模型API
-- 提供统一的嵌入接口
-
-### 3.2 混合检索
-
-- 结合文本检索和向量检索
-- 支持灵活的检索参数配置
-- 提供准确的检索结果
-
-### 3.3 HTTP API服务
-
-- 提供RESTful API
-- 支持请求参数验证
-- 提供统一的响应格式
-
-### 3.4 文档处理
-
-- 支持PDF文档解析
-- 支持图像提取和处理
-
-### 3.5 向量数据库集成
-
-- 与Infinity向量数据库无缝集成
-- 支持向量存储和检索
-
-## 4. 快速开始
-
-### 4.1 环境准备
-
-1. 安装Python 3.12
-2. 安装依赖:
-
-```bash
-pip install -r requirements.txt
-```
-
-3. 配置环境变量:
-
-```bash
-cp .env.example .env
-# 编辑.env文件,配置API密钥和其他参数
-```
-
-### 4.2 运行HTTP服务
-
-```bash
-python -m api.hybrid_search_http
-```
-
-服务将在 `http://0.0.0.0:18001` 上运行。
-
-### 4.3 使用API
-
-#### 混合检索API
-
-**请求URL**:`/hybrid_search`
-
-**请求方法**:POST
-
-**请求体**:
-
-```json
-{
-    "text_query": "文本查询",
-    "image": "图片URL或base64编码",
-    "topn": 2
-}
-```
-
-**响应示例**:
-
-```json
-{
-    "success": true,
-    "message": "混合检索成功",
-    "output": [
-        {
-            "file_name": "文件名",
-            "page_number": 1,
-            "content": "内容",
-            "image_path": "图片路径",
-            "dataset_id": "数据集ID",
-            "document_id": "文档ID",
-            "_similarity": 0.95
-        }
-    ],
-    "total": 1
-}
-```
-
-#### 健康检查API
-
-**请求URL**:`/health`
-
-**请求方法**:GET
-
-**响应示例**:
-
-```json
-{
-    "status": "ok",
-    "message": "混合检索HTTP服务正常运行"
-}
-```
-
-## 5. 配置说明
-
-### 5.1 模型配置
-
-| 配置项 | 说明 | 默认值 |
-|--------|------|--------|
-| MULTIMODAL_EMBEDDING_MODEL_NAME | 多模态嵌入模型名称 | qwen-vl-plus |
-| DASHSCOPE_API_KEY | DashScope API密钥 | - |
-| SILICONFLOW_API_KEY | SiliconFlow API密钥 | - |
-
-### 5.2 向量数据库配置
-
-| 配置项 | 说明 | 默认值 |
-|--------|------|--------|
-| INFINITY_HOST | Infinity数据库地址 | http://localhost:23820 |
-| INFINITY_DATABASE | Infinity数据库名称 | image_db |
-| INFINITY_TABLE_NAME | Infinity表名称 | - |
-
-## 6. 开发指南
-
-### 6.1 代码风格
-
-- 遵循PEP 8代码风格
-- 使用类型注解
-- 编写清晰的文档字符串
-
-### 6.2 测试
-
-- 运行单元测试:
-
-```bash
-python -m pytest test/ -v
-```
-
-- 运行特定测试:
-
-```bash
-python -m pytest test/test_http_hybrid_search.py -v
-```
-
-### 6.3 添加新功能
-
-1. 在相应的模块目录下创建新文件
-2. 实现新功能
-3. 编写测试用例
-4. 更新文档
-
-## 7. 示例代码
-
-### 7.1 使用混合检索
-
-```python
-from utils.infinity_util import InfinityVectorDB
-from model.multimodal_embedding import Embedding
-from conf.config import ModelConfig
-
-# 初始化向量数据库
-vector_db = InfinityVectorDB()
-
-# 初始化嵌入模型
-embedding_model = Embedding(
-    model_name=ModelConfig.get_multimodal_embedding_model_name(),
-    api_key=ModelConfig.get_dashscope_api_key()
-)
-
-# 生成多模态嵌入
-text_query = "文本查询"
-image = Image.open("image.jpg")
-embedding = embedding_model.get_multimodal_embedding(text_query, image)
-
-# 执行混合检索
-result = vector_db.hybrid_search(
-    index_name="index_name",
-    match_method="dense",
-    vector_field="vector_field",
-    query_vector=embedding,
-    element_type="float",
-    metric_type="cosine",
-    topn=2,
-    text_query=text_query,
-    text_field="content"
-)
-
-print(result)
-```
-
-## 8. 常见问题
-
-### 8.1 嵌入生成失败
-
-- 检查API密钥是否正确
-- 检查网络连接
-- 检查模型名称是否正确
-
-### 8.2 检索结果不准确
-
-- 检查向量数据库配置
-- 检查嵌入模型配置
-- 调整检索参数
-
-### 8.3 HTTP服务无法启动
-
-- 检查端口是否被占用
-- 检查配置文件是否正确
-- 检查依赖是否安装完整
-
-## 9. 联系方式
-
-如有问题或建议,请联系项目负责人。
-
-## 10. 版本历史
-
-- v1.0.0:初始版本,支持多模态混合检索和HTTP API服务
-
-## 11. 许可证
-
-本项目采用MIT许可证。

+ 0 - 446
doc/design.md

@@ -1,446 +0,0 @@
-# Ragflow_plugs 项目设计文档
-
-## 1. 项目概述
-
-Ragflow_plugs是一个基于RAG(检索增强生成)技术的多模态混合检索系统,支持文本和图像的联合检索,并提供灵活的HTTP API服务。该系统旨在为智能应用提供高效、准确的多模态信息检索能力。
-
-### 1.1 核心功能
-
-- **多模态嵌入生成**:支持文本和图像的联合嵌入生成
-- **混合检索**:结合文本检索和向量检索,提供更准确的检索结果
-- **HTTP API服务**:提供RESTful API,方便外部系统集成
-- **文档处理**:支持PDF文档解析、图像提取和处理
-- **向量数据库集成**:与Infinity向量数据库无缝集成
-- **灵活配置**:支持多环境配置,便于部署和管理
-
-### 1.2 应用场景
-
-- 智能问答系统
-- 图像搜索和内容推荐
-- 多模态内容管理系统
-- 教育资源检索
-- 儿童绘本智能分析
-
-## 2. 设计思路
-
-### 2.1 架构设计原则
-
-- **模块化设计**:将系统拆分为多个独立模块,便于维护和扩展
-- **松耦合**:模块之间通过明确的接口进行通信,降低依赖关系
-- **可扩展性**:支持多种模型、多种向量数据库的扩展
-- **高可用性**:设计合理的错误处理和重试机制
-- **性能优化**:针对检索和嵌入生成进行性能优化
-
-### 2.2 核心设计理念
-
-- **多模态融合**:将文本和图像信息融合为统一的向量表示
-- **检索增强生成**:先检索相关信息,再结合大模型生成高质量回答
-- **分层设计**:分为数据层、服务层、API层,各层职责明确
-- **配置驱动**:通过配置文件灵活调整系统行为
-
-## 3. 系统架构
-
-### 3.1 整体架构
-
-```
-┌───────────────────────────────────────────────────────────────────┐
-│                      Client Applications                          │
-└───────────────────────────────────────────────────────────────────┘
-                               │
-                               ▼
-┌───────────────────────────────────────────────────────────────────┐
-│                          API Layer                               │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                     hybrid_search_http.py                   │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                    hybrid_search_mcp.py                    │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-└───────────────────────────────────────────────────────────────────┘
-                               │
-                               ▼
-┌───────────────────────────────────────────────────────────────────┐
-│                         Service Layer                            │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                      Agent Module                           │  │
-│  │  ┌───────────────────────────────────────────────────────┐  │  │
-│  │  │                  test_image_agent.py                  │  │  │
-│  │  └───────────────────────────────────────────────────────┘  │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                     Workflow Module                        │  │
-│  │  ┌───────────────────────────────────────────────────────┐  │  │
-│  │  │               image_parsing_workflow.py               │  │  │
-│  │  └───────────────────────────────────────────────────────┘  │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-└───────────────────────────────────────────────────────────────────┘
-                               │
-                               ▼
-┌───────────────────────────────────────────────────────────────────┐
-│                         Model Layer                              │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                 multimodal_embedding.py                    │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                         qwen_vl.py                         │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-└───────────────────────────────────────────────────────────────────┘
-                               │
-                               ▼
-┌───────────────────────────────────────────────────────────────────┐
-│                         Data Layer                               │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                      Parser Module                         │  │
-│  │  ┌───────────────────────────────────────────────────────┐  │  │
-│  │  │                      pdf_parser                       │  │  │
-│  │  └───────────────────────────────────────────────────────┘  │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                      Utils Module                          │  │
-│  │  ┌───────────────────────────────────────────────────────┐  │  │
-│  │  │                      http_client.py                   │  │  │
-│  │  └───────────────────────────────────────────────────────┘  │  │
-│  │  ┌───────────────────────────────────────────────────────┐  │  │
-│  │  │                    infinity_util                      │  │  │
-│  │  └───────────────────────────────────────────────────────┘  │  │
-│  │  └───────────────────────────────────────────────────────┘  │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-└───────────────────────────────────────────────────────────────────┘
-                               │
-                               ▼
-┌───────────────────────────────────────────────────────────────────┐
-│                       External Services                          │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                    Infinity Vector DB                      │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-│  ┌─────────────────────────────────────────────────────────────┐  │
-│  │                 AI Model APIs (DashScope, etc.)            │  │
-│  └─────────────────────────────────────────────────────────────┘  │
-└───────────────────────────────────────────────────────────────────┘
-```
-
-### 3.2 核心模块关系
-
-#### 3.2.1 多模态嵌入模块
-
-- **功能**:生成文本和图像的联合嵌入向量
-- **依赖**:AI模型API(如DashScope、SiliconFlow等)
-- **调用关系**:被hybrid_search_http.py和agent模块调用
-
-#### 3.2.2 混合检索模块
-
-- **功能**:结合文本检索和向量检索,返回综合结果
-- **依赖**:infinity_util、vector_db等工具模块
-- **调用关系**:被HTTP API模块和agent模块调用
-
-#### 3.2.3 HTTP API模块
-
-- **功能**:提供RESTful API服务
-- **依赖**:FastAPI框架、混合检索模块
-- **调用关系**:被外部客户端调用
-
-#### 3.2.4 Agent模块
-
-- **功能**:提供智能代理功能,结合检索结果生成回答
-- **依赖**:混合检索模块、LLM模型
-- **调用关系**:可被外部系统直接调用
-
-#### 3.2.5 Workflow模块
-
-- **功能**:管理系统工作流,如文档处理、图像分析等
-- **依赖**:Parser模块、Model模块
-- **调用关系**:被外部系统或定时任务调用
-
-## 4. 核心模块设计
-
-### 4.1 多模态嵌入模块
-
-#### 4.1.1 设计目标
-
-- 支持文本和图像的联合嵌入生成
-- 兼容多种AI模型API
-- 提供统一的嵌入接口
-- 支持配置不同的模型参数
-
-#### 4.1.2 核心类与方法
-
-```python
-class Embedding:
-    def __init__(self, model_name: str, api_key: str):
-        # 初始化嵌入模型
-        pass
-    
-    def get_multimodal_embedding(self, text: str, image: Image.Image) -> List[float]:
-        # 生成多模态嵌入向量
-        pass
-```
-
-#### 4.1.3 支持的模型
-
-- Qwen VL(通过DashScope API)
-- 其他多模态模型(可扩展)
-
-### 4.2 混合检索模块
-
-#### 4.2.1 设计目标
-
-- 结合文本检索和向量检索
-- 支持灵活的检索参数配置
-- 提供准确的检索结果
-- 支持分页和排序
-
-#### 4.2.2 核心类与方法
-
-```python
-class InfinityVectorDB:
-    def hybrid_search(self, index_name: str, match_method: str, vector_field: str, 
-                     query_vector: List[float], element_type: str, metric_type: str,
-                     topn: int, text_query: str, text_field: str) -> Dict[str, Any]:
-        # 执行混合检索
-        pass
-    
-    def vector_search(self, index_name: str, vector_field: str, vector: List[float], 
-                     size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
-        # 执行向量检索
-        pass
-```
-
-#### 4.2.3 检索流程
-
-1. 接收检索请求,包括文本查询、图像和检索参数
-2. 生成多模态嵌入向量
-3. 调用Infinity向量数据库进行混合检索
-4. 处理检索结果,转换为统一格式
-5. 返回检索结果
-
-### 4.3 HTTP API模块
-
-#### 4.3.1 设计目标
-
-- 提供RESTful API接口
-- 支持请求参数验证
-- 提供统一的响应格式
-- 支持错误处理和日志记录
-
-#### 4.3.2 核心API
-
-| API路径 | 方法 | 功能 |
-|--------|------|------|
-| /hybrid_search | POST | 执行混合检索 |
-| /health | GET | 健康检查 |
-
-#### 4.3.3 请求和响应模型
-
-```python
-class HybridSearchRequest(BaseModel):
-    text_query: str
-    image: str
-    topn: int = 2
-    
-class HybridSearchResponse(BaseModel):
-    success: bool
-    message: str
-    output: List[Dict[str, Any]] = []
-    total: int = 0
-```
-
-### 4.4 HTTP客户端模块
-
-#### 4.4.1 设计目标
-
-- 提供统一的HTTP请求接口
-- 支持重试机制
-- 支持不同的HTTP方法
-- 支持文件上传
-
-#### 4.4.2 核心类与方法
-
-```python
-class HTTPClient:
-    def post(self, endpoint: str, data: Optional[Dict] = None, 
-             json_data: Optional[Dict] = None, files: Optional[Dict] = None,
-             headers: Optional[Dict] = None) -> Dict[str, Any]:
-        # 发送POST请求
-        pass
-    
-    def get(self, endpoint: str, params: Optional[Dict] = None,
-            headers: Optional[Dict] = None) -> Dict[str, Any]:
-        # 发送GET请求
-        pass
-    
-    def get_json(self, endpoint: str, json_data: Optional[Dict] = None,
-                headers: Optional[Dict] = None) -> Dict[str, Any]:
-        # 发送带有JSON数据的GET请求
-        pass
-    
-    def put(self, endpoint: str, data: Optional[Dict] = None, 
-            json_data: Optional[Dict] = None, headers: Optional[Dict] = None) -> Dict[str, Any]:
-        # 发送PUT请求
-        pass
-    
-    def delete(self, endpoint: str, data: Optional[Dict] = None, 
-               json_data: Optional[Dict] = None, headers: Optional[Dict] = None) -> Dict[str, Any]:
-        # 发送DELETE请求
-        pass
-```
-
-## 5. 数据流程
-
-### 5.1 多模态检索流程
-
-```
-1. 客户端发送检索请求,包括文本查询和图像URL
-2. API层接收请求,解析参数
-3. 下载图像,转换为Image对象
-4. 调用多模态嵌入模块生成嵌入向量
-5. 调用混合检索模块执行检索
-6. 处理检索结果,转换为统一格式
-7. 返回JSON响应给客户端
-```
-
-### 5.2 文档处理流程
-
-```
-1. 上传PDF文档
-2. 解析PDF,提取文本和图像
-3. 生成文档的多模态嵌入
-4. 将嵌入向量和元数据存储到向量数据库
-5. 建立索引,便于后续检索
-```
-
-## 6. 配置管理
-
-### 6.1 配置文件结构
-
-```
-├── conf/
-│   ├── config.py          # 配置管理类
-│   ├── infinity_mapping.json  # Infinity数据库映射配置
-│   └── __init__.py
-└── .env                   # 环境变量配置
-```
-
-### 6.2 配置管理类
-
-```python
-class ModelConfig:
-    @staticmethod
-    def get_multimodal_embedding_model_name() -> str:
-        # 获取多模态嵌入模型名称
-        pass
-    
-    @staticmethod
-    def get_dashscope_api_key() -> str:
-        # 获取DashScope API密钥
-        pass
-    
-    # 其他模型配置方法
-    
-class VectorDBConfig:
-    @staticmethod
-    def get_infinity_database() -> str:
-        # 获取Infinity数据库名称
-        pass
-    
-    @staticmethod
-    def get_infinity_table_name() -> str:
-        # 获取Infinity表名称
-        pass
-    
-    # 其他向量数据库配置方法
-```
-
-### 6.3 环境变量配置
-
-```
-# 模型API配置
-DASHSCOPE_API_KEY=your_api_key
-SILICONFLOW_API_KEY=your_api_key
-
-# 向量数据库配置
-INFINITY_HOST=http://localhost:23820
-INFINITY_DATABASE=image_db
-
-# 应用配置
-LOG_LEVEL=INFO
-```
-
-## 7. 技术栈
-
-| 类别 | 技术/框架 | 用途 |
-|------|----------|------|
-| 编程语言 | Python 3.12 | 主要开发语言 |
-| Web框架 | FastAPI | HTTP API服务 |
-| HTTP客户端 | Requests | HTTP请求处理 |
-| 图像处理 | PIL/Pillow | 图像加载和处理 |
-| 向量数据库 | Infinity | 向量存储和检索 |
-| LLM集成 | LangChain | 大语言模型集成 |
-| AI模型API | DashScope, SiliconFlow | 多模态嵌入和生成 |
-| 配置管理 | python-dotenv | 环境变量管理 |
-| 测试框架 | pytest | 单元测试和集成测试 |
-
-## 8. 部署和运行
-
-### 8.1 依赖安装
-
-```bash
-pip install -r requirements.txt
-```
-
-### 8.2 运行HTTP服务
-
-```bash
-python -m api.hybrid_search_http
-```
-
-### 8.3 测试运行
-
-```bash
-python -m pytest test/
-```
-
-## 9. 测试策略
-
-### 9.1 单元测试
-
-- 针对核心模块的单元测试
-- 测试覆盖主要功能点
-- 使用模拟对象减少外部依赖
-
-### 9.2 集成测试
-
-- 测试模块之间的集成
-- 测试与外部服务的集成
-- 测试完整的业务流程
-
-### 9.3 性能测试
-
-- 测试多模态嵌入生成的性能
-- 测试混合检索的响应时间
-- 测试系统的并发处理能力
-
-## 10. 扩展和维护
-
-### 10.1 扩展方向
-
-- 支持更多的多模态模型
-- 支持更多的向量数据库
-- 增强文档处理能力,支持更多文档格式
-- 添加更多的检索算法和优化策略
-- 增强API功能,支持更复杂的检索请求
-
-### 10.2 维护建议
-
-- 定期更新依赖库
-- 监控系统性能和错误日志
-- 定期备份数据
-- 进行安全审计和漏洞修复
-- 保持文档更新
-
-## 11. 结论
-
-Ragflow_plugs是一个功能强大的多模态混合检索系统,具有良好的架构设计和灵活的扩展能力。该系统支持文本和图像的联合检索,提供高效、准确的检索结果,并通过HTTP API方便外部系统集成。
-
-通过模块化设计和清晰的接口定义,系统具有良好的可维护性和可扩展性。配置驱动的设计使得系统可以轻松适应不同的环境和需求。
-
-该系统可以广泛应用于智能问答、图像搜索、内容推荐等场景,为智能应用提供强大的多模态信息检索能力。

+ 64 - 0
main.py

@@ -0,0 +1,64 @@
+# 主应用入口,整合多个 FastAPI 应用
+import uvicorn
+from fastapi import FastAPI
+from contextlib import asynccontextmanager
+
+# 导入所有子应用
+from api.search_infinity import app as search_app
+
+# Lifespan handler for the main application
+@asynccontextmanager
+async def main_lifespan(app: FastAPI):
+    """Call get_client() at startup and always call close_client() at shutdown."""
+    from utils.infinity import get_client, close_client
+    print("=== Infinity API Gateway 启动 ===")
+    get_client(database="book_image_db")  # result unused; called to set up the global client
+    print("✅ Infinity客户端已初始化")
+    try:
+        yield
+    finally:
+        # Runs even when an exception or cancellation is raised at the yield point.
+        print("=== Infinity API Gateway 关闭 ===")
+        close_client()
+        print("✅ Infinity客户端已关闭")
+
+# Create the main (gateway) application
+main_app = FastAPI(
+    title="Infinity API Gateway",
+    description="整合多个 FastAPI 应用的 API 网关",
+    version="1.0.0",
+    lifespan=main_lifespan
+)
+
+# Mount sub-applications
+# 1. Search API - served under /search/*
+main_app.mount("/search", search_app, name="search_api")
+
+# Root path of the main application
+@main_app.get("/")
+async def root():
+    """Return a welcome message and the sub-applications that are actually mounted."""
+    return {
+        "message": "Welcome to GRAPH_RAG API Gateway",
+        "available_apps": {
+            # Only /search is mounted on main_app; the former /hybrid entry pointed at nothing.
+            "search_api": "访问路径: /search, 文档: /search/docs"
+        }
+    }
+
+# Health-check endpoint
+@main_app.get("/health")
+async def health_check():
+    """Return a fixed "healthy" payload for the gateway; nothing else is probed."""
+    return dict(status="healthy", service="Infinity API Gateway")
+
+if __name__ == "__main__":
+    """启动主应用"""  # no-op string literal ("start the main application"), not a docstring
+    uvicorn.run(
+        "main:main_app",  # import string in "<module>:<attribute>" form
+        host="0.0.0.0",   # listen on all network interfaces
+        port=18001,         # service port
+        reload=False,       # auto-reload is disabled here; enable only for development
+        workers=1,         # single worker process; may be raised for production
+        log_level="info"   # log level
+    )

BIN
model/__pycache__/multimodal_embedding.cpython-312.pyc


+ 15 - 2
model/multimodal_embedding.py

@@ -5,7 +5,7 @@ import io
 from langchain_openai import OpenAIEmbeddings
 from dashscope import MultiModalEmbedding
 from conf.config import ModelConfig
-from utils.minio.image_util import ImageUtil
+from utils.file.image_util import image_util as ImageUtil
 
 class Embedding:
     """Embedding模型工具"""
@@ -137,4 +137,17 @@ class Embedding:
             else:
                 raise Exception(f"Error: {response.message}")   
         except Exception as e:
-            raise Exception(f"多模态embedding生成失败: {str(e)}")
+            raise Exception(f"多模态embedding生成失败: {str(e)}")
+
+# Process-wide Embedding instance, created lazily by get_embedding_model().
+_embedding_model = None
+
+def get_embedding_model() -> Embedding:
+    """Return the shared Embedding, building it from ModelConfig on the first call."""
+    global _embedding_model
+    if _embedding_model is None:  # first call: read the config once and cache the model
+        _embedding_model = Embedding(
+            model_name=ModelConfig.get_multimodal_embedding_model_name(),
+            api_key=ModelConfig.get_dashscope_api_key()
+        )
+    return _embedding_model

+ 0 - 43
test/check_infinity_sdk.py

@@ -1,43 +0,0 @@
-import infinity
-
-print("=== 检查Infinity SDK结构 ===")
-
-# 检查infinity模块的内容
-print("\n1. Infinity模块内容:")
-print(dir(infinity))
-
-# 检查infinity是否有common子模块
-print("\n2. 检查infinity.common是否存在:")
-try:
-    import infinity.common
-    print("✓ infinity.common存在")
-    print("infinity.common内容:")
-    print(dir(infinity.common))
-except Exception as e:
-    print(f"✗ infinity.common不存在: {e}")
-
-# 检查infinity是否有ConflictType
-print("\n3. 检查infinity.ConflictType是否存在:")
-print(f"hasattr(infinity, 'ConflictType'): {hasattr(infinity, 'ConflictType')}")
-
-# 检查infinity是否有其他相关属性
-print("\n4. 检查infinity的其他属性:")
-for attr in ['DataType', 'IndexType', 'MetricType', 'NetworkAddress', 'create_database']:
-    print(f"hasattr(infinity, '{attr}'): {hasattr(infinity, attr)}")
-
-# 尝试查找ConflictType的正确位置
-print("\n5. 尝试查找ConflictType:")
-import pkgutil
-import sys
-
-for _, module_name, _ in pkgutil.iter_modules(sys.modules['infinity'].__path__):
-    full_module_name = f"infinity.{module_name}"
-    try:
-        module = __import__(full_module_name, fromlist=[''])
-        print(f"\n检查模块: {full_module_name}")
-        module_attrs = dir(module)
-        print(f"属性: {module_attrs}")
-        if 'ConflictType' in module_attrs:
-            print(f"✓ 找到ConflictType在 {full_module_name}")
-    except Exception as e:
-        print(f"无法导入 {full_module_name}: {e}")

+ 0 - 172
test/main.py

@@ -1,172 +0,0 @@
-import sys
-import os
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from workflow.workflow import PDFParsingWorkflow
-import json
-from typing import Dict, Any
-from conf.config import ModelConfig
-
-os.environ["LANGSMITH_TRACING"] = "true"
-os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGSMITH_API_KEY"] = "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"
-os.environ["LANGSMITH_PROJECT"] = "ragflow_plugs"
-
-
-
-class PDFParsingService:
-    """PDF扫描件拆分解析服务"""
-    
-    def __init__(self, model_name: str = None):
-        """
-        初始化PDF解析服务
-        
-        Args:
-            model_name: QWEN VL模型名称,若为None则使用配置文件中的值
-        """
-        # 从配置文件获取默认模型名称
-        default_model = ModelConfig.get_model_name()
-        self.model_name = model_name or default_model
-        self.workflow = PDFParsingWorkflow(model_name=self.model_name)
-    
-    def parse_pdf(self, pdf_path: str) -> Dict[str, Any]:
-        """
-        解析PDF扫描件
-        
-        Args:
-            pdf_path: PDF文件路径
-            
-        Returns:
-            Dict: 解析结果,包含:
-                - pdf_path: PDF文件路径
-                - total_pages: 总页数
-                - parsed_results: 每一页的解析结果
-                - is_complete: 是否完成
-        """
-        # 运行工作流
-        result = self.workflow.run(pdf_path, ModelConfig.get_dataset_id(), ModelConfig.get_ragflow_api_url(), ModelConfig.get_ragflow_api_key())
-        
-        # 整理输出结果
-        output = {
-            "pdf_path": result["pdf_path"],
-            "total_pages": len(result["split_pages"]),
-            "parsed_results": result["parsed_results"],
-            "is_complete": result["is_complete"]
-        }
-        
-        return output
-    
-    def parse_pdf_to_json(self, pdf_path: str, output_json_path: str = None) -> str:
-        """
-        解析PDF并输出为JSON格式
-        
-        Args:
-            pdf_path: PDF文件路径
-            output_json_path: 输出JSON文件路径,若为None则返回JSON字符串
-            
-        Returns:
-            str: JSON字符串或输出文件路径
-        """
-        result = self.parse_pdf(pdf_path)
-        json_str = json.dumps(result, ensure_ascii=False, indent=2)
-        
-        if output_json_path:
-            with open(output_json_path, "w", encoding="utf-8") as f:
-                f.write(json_str)
-            return output_json_path
-        
-        return json_str
-    
-    def parse_pdf_to_markdown(self, pdf_path: str) -> str:
-        """
-        解析PDF并输出为Markdown格式,包含页码、描述和图片
-        
-        Args:
-            pdf_path: PDF文件路径
-            
-        Returns:
-            str: 输出Markdown文件路径
-        """
-        # 解析PDF
-        result = self.parse_pdf(pdf_path)
-        
-        # 获取PDF文件名(不含扩展名)
-        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
-        
-        # 输出目录
-        output_dir = r"d:\project\work\ragflow_plugs\book\output"
-        
-        # 确保输出目录存在
-        os.makedirs(output_dir, exist_ok=True)
-        
-        # Markdown文件名
-        md_filename = f"{pdf_filename}.md"
-        md_file_path = os.path.join(output_dir, md_filename)
-        
-        # 图片存储目录
-        images_dir = os.path.join(output_dir, f"{pdf_filename}_images")
-        os.makedirs(images_dir, exist_ok=True)
-        
-        # 构建Markdown内容
-        md_content = f"# {pdf_filename} 解析结果\n\n"
-        md_content += f"**总页数**: {result['total_pages']}\n"
-        md_content += f"**模型**: {self.model_name}\n\n"
-        md_content += "---\n\n"
-        
-        # 遍历所有解析结果
-        for page_result in result['parsed_results']:
-            page_number = page_result.get('page_number', 0)
-            content = page_result.get('content', '')
-            
-            # 写入页码和描述
-            md_content += f"## 第 {page_number} 页\n\n"
-            md_content += f"### 描述\n{content}\n\n"
-            
-            # 从temp目录获取已保存的图片
-            pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]
-            temp_image_dir = r".\temp"
-            temp_image_filename = f"{pdf_filename}_{page_number}.png"
-            temp_image_path = os.path.join(temp_image_dir, temp_image_filename)
-            
-            # 检查图片是否存在
-            if os.path.exists(temp_image_path):
-                # 在Markdown中引用temp目录中的图片,使用正斜杠确保语法正确
-                temp_image_url = temp_image_path.replace("\\", "/")
-                md_content += f"### 图片\n"
-                md_content += f"![第 {page_number} 页图片]({temp_image_url})\n\n"
-                md_content += "---\n\n"
-        
-        # 写入Markdown文件
-        with open(md_file_path, "w", encoding="utf-8") as f:
-            f.write(md_content)
-        
-        return md_file_path
-
-def main():
-    """主函数,示例用法"""
-    # 示例:使用服务解析PDF
-    # 1. 创建服务实例(使用配置文件中的默认模型)
-    service = PDFParsingService()
-    
-    # 2. 或指定模型名称
-    # service = PDFParsingService(model_name="qwen3-vl")
-    
-    # 3. 解析PDF文件
-    pdf_path = r"D:\project\work\ragflow_plugs\book\出发!超级播种机.pdf"
-    
-    # 4. 保存为Markdown文件 
-    md_output_path = service.parse_pdf_to_markdown(pdf_path)
-    print(f"解析结果已保存到: {md_output_path}")
-    
-    # 5. 或直接获取结果
-    # result = service.parse_pdf(pdf_path)
-    # print(json.dumps(result, ensure_ascii=False, indent=2))
-    
-    # 6. 或保存为JSON文件
-    # output_path = service.parse_pdf_to_json(pdf_path, "output.json")
-    # print(f"解析结果已保存到: {output_path}")
-
-
-if __name__ == "__main__":
-    main()

+ 0 - 58
test/test_compress_image_bytes.py

@@ -1,58 +0,0 @@
-#!/usr/bin/env python3
-"""
-测试图片压缩到字节流功能
-"""
-
-import os
-import sys
-from io import BytesIO
-from PIL import Image
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from utils.minio.image_util import image_util
-
-# 生成一个大的测试图片
-def generate_test_image(width=2000, height=2000, color=(255, 0, 0)):
-    """
-    生成一个大的测试图片
-    """
-    print(f"生成测试图片,大小: {width}x{height}")
-    img = Image.new('RGB', (width, height), color=color)
-    img_stream = BytesIO()
-    img.save(img_stream, format='PNG')
-    img_stream.seek(0)
-    return img_stream
-
-# 测试图片压缩到字节流功能
-def test_compress_image_bytes():
-    """
-    测试图片压缩到字节流功能
-    """
-    print("开始测试图片压缩到字节流功能...")
-    
-    # 生成测试图片
-    img_stream = generate_test_image()
-    
-    # 将图片流转换为字节流
-    img_bytes = img_stream.getvalue()
-    print(f"原始图片字节大小: {len(img_bytes)} 字节")
-    
-    # 调用压缩到字节流方法
-    compressed_bytes = image_util.compress_image_bytes(img_bytes, max_size_kb=5000)
-    
-    # 检查压缩后大小
-    compressed_size = len(compressed_bytes) / 1024
-    print(f"压缩后大小: {compressed_size:.2f}KB")
-    
-    # 验证压缩后大小
-    assert compressed_size <= 5000, f"压缩后大小 {compressed_size:.2f}KB 超过了最大限制 5000KB"
-    
-    # 验证返回类型
-    assert isinstance(compressed_bytes, bytes), f"返回类型应为bytes,实际为 {type(compressed_bytes)}"
-    
-    print("图片压缩到字节流测试成功!")
-
-if __name__ == "__main__":
-    test_compress_image_bytes()

+ 0 - 121
test/test_es_conn.py

@@ -1,121 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import json
-from services.utils.es_conn import ESConnection
-
-def test_es_connection():
-    """
-    测试 Elasticsearch 连接和基本功能
-    """
-    try:
-        # 初始化连接
-        print("正在初始化 Elasticsearch 连接...")
-        es = ESConnection(hosts=["http://localhost:9200"])
-        print("连接成功!")
-        
-        # 测试索引创建
-        index_name = "test_ragflow_index"
-        print(f"\n正在创建索引: {index_name}")
-        success = es.create_index(index_name)
-        if success:
-            print(f"索引 {index_name} 创建成功!")
-        else:
-            print(f"索引 {index_name} 创建失败!")
-            return False
-        
-        # 测试文档插入
-        test_doc = {
-            "title": "测试文档",
-            "content": "这是一个用于测试 Elasticsearch 连接的文档",
-            "content_tks": "这 是 一个 用于 测试 Elasticsearch 连接 的 文档",
-            "vector_768_vec": [0.1] * 768,
-            "created_at": "2024-01-01 00:00:00",
-            "count_int": 10,
-            "importance_flt": 0.8,
-            "tags_kwd": ["测试", "elasticsearch"],
-            "kb_id": "test_kb_123"
-        }
-        
-        print("\n正在插入测试文档...")
-        insert_success = es.insert(index_name, test_doc)
-        if insert_success:
-            print("文档插入成功!")
-        else:
-            print("文档插入失败!")
-            return False
-        
-        # 测试批量插入
-        test_docs = []
-        for i in range(3):
-            doc = {
-                "title": f"批量测试文档 {i}",
-                "content": f"这是第 {i} 个批量测试文档",
-                "content_tks": f"这是 第 {i} 个 批量 测试 文档",
-                "vector_768_vec": [0.1] * 768,
-                "created_at": "2024-01-01 00:00:00",
-                "count_int": i,
-                "importance_flt": 0.5 + i * 0.1,
-                "tags_kwd": ["批量", "测试"],
-                "kb_id": "test_kb_123"
-            }
-            test_docs.append(doc)
-        
-        print("\n正在批量插入测试文档...")
-        bulk_result = es.bulk_insert(index_name, test_docs)
-        print(f"批量插入结果: {bulk_result}")
-        
-        # 测试全文检索
-        print("\n正在测试全文检索...")
-        text_query = {
-            "match": {
-                "content": "测试"
-            }
-        }
-        text_result = es.search(index_name, text_query, size=5)
-        print(f"全文检索结果: {text_result['hits']['total']} 个命中")
-        
-        # 测试向量检索
-        print("\n正在测试向量检索...")
-        vector = [0.1] * 768
-        vector_result = es.knn_search(
-            index_name=index_name,
-            vector_field="vector_768_vec",
-            vector=vector,
-            k=3
-        )
-        print(f"向量检索结果: {vector_result['hits']['total']} 个命中")
-        
-        # 测试混合检索
-        print("\n正在测试混合检索...")
-        hybrid_result = es.hybrid_search(
-            index_name=index_name,
-            text_query="测试",
-            vector_field="vector_768_vec",
-            vector=vector,
-            size=5
-        )
-        print(f"混合检索结果: {hybrid_result['hits']['total']} 个命中")
-        
-        # 打印命中的文档
-        print("\n混合检索命中的文档:")
-        for hit in hybrid_result['hits']['hits']:
-            doc = hit['_source']
-            print(f"  - 标题: {doc['title']}, 相似度分数: {hit['_score']:.4f}")
-        
-        # 测试文档删除
-        print(f"\n正在删除索引: {index_name}")
-        es.es.indices.delete(index=index_name, ignore=[400, 404])
-        print(f"索引 {index_name} 删除成功!")
-        
-        # 关闭连接
-        es.close()
-        print("\n所有测试完成!")
-        return True
-        
-    except Exception as e:
-        print(f"测试失败: {e}")
-        return False
-
-if __name__ == "__main__":
-    test_es_connection()

+ 0 - 74
test/test_fastapi_hybrid_search.py

@@ -1,74 +0,0 @@
-#!/usr/bin/env python3
-"""
-测试混合检索FastAPI服务
-"""
-
-import requests
-import json
-
-# 测试数据
-test_data = {
-    "text_query": "测试",
-    "image": "https://example.com/image.jpg",
-    "topn": 2
-}
-
-# 发送POST请求
-def test_hybrid_search():
-    url = "http://localhost:18001/hybrid_search"
-    headers = {
-        "Content-Type": "application/json"
-    }
-    
-    print("开始测试混合检索FastAPI服务...")
-    print(f"请求URL: {url}")
-    print(f"请求数据: {json.dumps(test_data, indent=2, ensure_ascii=False)}")
-    
-    try:
-        # 发送POST请求
-        response = requests.post(url, headers=headers, json=test_data, timeout=10)
-        
-        # 打印响应结果
-        print(f"\n响应状态码: {response.status_code}")
-        print(f"响应头: {dict(response.headers)}")
-        print(f"响应内容: {response.text}")
-        
-        if response.status_code == 200:
-            # 解析JSON响应
-            response_data = response.json()
-            print(f"\n解析后的响应数据: {json.dumps(response_data, indent=2, ensure_ascii=False)}")
-            print("测试成功!")
-        else:
-            print(f"\n测试失败,状态码: {response.status_code}")
-    except Exception as e:
-        print(f"\n测试失败,请求异常: {str(e)}")
-
-# 测试健康检查接口
-def test_health_check():
-    url = "http://localhost:18001/health"
-    
-    print("\n开始测试健康检查接口...")
-    print(f"请求URL: {url}")
-    
-    try:
-        # 发送GET请求
-        response = requests.get(url, timeout=5)
-        
-        # 打印响应结果
-        print(f"\n响应状态码: {response.status_code}")
-        print(f"响应头: {dict(response.headers)}")
-        print(f"响应内容: {response.text}")
-        
-        if response.status_code == 200:
-            # 解析JSON响应
-            response_data = response.json()
-            print(f"\n解析后的响应数据: {json.dumps(response_data, indent=2, ensure_ascii=False)}")
-            print("健康检查测试成功!")
-        else:
-            print(f"\n健康检查测试失败,状态码: {response.status_code}")
-    except Exception as e:
-        print(f"\n健康检查测试失败,请求异常: {str(e)}")
-
-if __name__ == "__main__":
-    test_hybrid_search()
-    test_health_check()

+ 0 - 82
test/test_file_upload.py

@@ -1,82 +0,0 @@
-import sys
-import os
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-def test_file_upload():
-    """测试文件上传功能"""
-    try:
-        from services.utils.http_client import HTTPClient
-        
-        # 创建HTTP客户端实例
-        http_client = HTTPClient(
-            base_url="http://localhost:8000",  # 替换为实际的API URL
-            api_key="your_api_key"  # 替换为实际的API密钥
-        )
-        
-        # 测试文件路径
-        test_file_path = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"
-        
-        # 打开文件并构建files字典
-        with open(test_file_path, 'rb') as f:
-            files = {'file': (os.path.basename(test_file_path), f)}
-            
-            print(f"测试文件上传: {test_file_path}")
-            print(f"文件字典: {files}")
-            
-            # 发送POST请求,测试文件上传
-            response = http_client.post(
-                "/api/v1/test/upload",  # 替换为实际的上传端点
-                files=files
-            )
-            
-            print(f"上传响应: {response}")
-            print("✓ 文件上传测试通过")
-            return True
-    except Exception as e:
-        print(f"✗ 文件上传测试失败: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-def test_post_without_files():
-    """测试不带文件的POST请求"""
-    try:
-        from services.utils.http_client import HTTPClient
-        
-        # 创建HTTP客户端实例
-        http_client = HTTPClient(
-            base_url="http://localhost:8000",  # 替换为实际的API URL
-            api_key="your_api_key"  # 替换为实际的API密钥
-        )
-        
-        # 发送普通POST请求
-        response = http_client.post(
-            "/api/v1/test/post",  # 替换为实际的POST端点
-            json={"key": "value"}
-        )
-        
-        print(f"普通POST响应: {response}")
-        print("✓ 普通POST请求测试通过")
-        return True
-    except Exception as e:
-        print(f"✗ 普通POST请求测试失败: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-def main():
-    """主测试函数"""
-    print("=== 测试文件上传修复 ===")
-    
-    # 测试文件上传
-    test_file_upload()
-    
-    # 测试普通POST请求
-    test_post_without_files()
-    
-    print("\n=== 测试完成 ===")
-
-if __name__ == "__main__":
-    main()

+ 0 - 85
test/test_full_service.py

@@ -1,85 +0,0 @@
-"""完整PDF解析服务测试脚本"""
-
-import sys
-import os
-import json
-from pathlib import Path
-
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-from services.pdf_parser.main import PDFParsingService
-
-def test_full_service():
-    """测试完整的PDF解析服务"""
-    print("=" * 50)
-    print("完整PDF解析服务测试")
-    print("=" * 50)
-    
-    # 检查PDF文件
-    pdf_path = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"
-    pdf = Path(pdf_path)
-    if not pdf.exists():
-        print(f"✗ PDF文件不存在: {pdf_path}")
-        print("请确保PDF文件存在,或修改脚本中的pdf_path变量")
-        return False
-    
-    print(f"✓ PDF文件存在: {pdf_path}")
-    print(f"  大小: {pdf.stat().st_size} 字节")
-    print()
-    
-    try:
-        # 创建服务实例
-        print("创建PDF解析服务实例...")
-        service = PDFParsingService(model_name="gpt-4o")
-        print("✓ 服务实例创建成功")
-        print(f"  使用模型: {service.model_name}")
-        print()
-        
-        # 测试PDF拆分功能
-        print("测试PDF拆分功能...")
-        from services.pdf_parser.pdf_splitter import PDFSplitter
-        splitter = PDFSplitter()
-        pages = splitter.split_pdf(pdf_path)
-        print(f"✓ PDF拆分成功,共 {len(pages)} 页")
-        print(f"  第1页图像大小: {pages[0]['image'].size}")
-        print()
-        
-        # 注意:完整解析需要模型API密钥,这里只测试到拆分阶段
-        print("注意:完整解析需要配置模型API密钥")
-        print("当前只测试了PDF拆分功能,模型解析需要配置API_KEY")
-        print()
-        
-        print("=" * 50)
-        print("测试完成!")
-        print("✓ PDF拆分功能正常工作")
-        print("✓ 服务实例创建成功")
-        print("✓ 依赖配置正确")
-        print("=" * 50)
-        
-        # 输出使用说明
-        print("\n使用说明:")
-        print("1. 配置.env文件:")
-        print("   - API_KEY=your-api-key")
-        print("   - BASE_URL=https://api.openai.com/v1")
-        print("   - MODEL_NAME=qwen3-vl")
-        print("   - MODEL_PROVIDER=openai")
-        print("2. 运行解析:")
-        print("   service = PDFParsingService()")
-        print("   result = service.parse_pdf('your_pdf_file.pdf')")
-        
-        return True
-        
-    except Exception as e:
-        print(f"✗ 测试失败: {str(e)}")
-        print("可能的解决方案:")
-        print("1. 确保已安装所有依赖: pip install -r requirements.txt")
-        print("2. 检查PDF文件是否损坏")
-        print("3. 检查PyMuPDF版本是否兼容")
-        return False
-
-def main():
-    """主函数"""
-    test_full_service()
-
-if __name__ == "__main__":
-    main()

+ 0 - 61
test/test_http_hybrid_search.py

@@ -1,61 +0,0 @@
-#!/usr/bin/env python3
-"""
-测试混合检索HTTP服务
-"""
-
-import requests
-import json
-
-# 测试数据 - JSON-RPC 2.0格式
-test_data = {
-    "jsonrpc": "2.0",
-    "method": "hybrid_search",
-    "params": {
-        "text_query": "测试",
-        "image": "https://example.com/image.jpg",
-        "topn": 2
-    },
-    "id": "test-123"
-}
-
-# 发送POST请求
-def test_hybrid_search():
-    headers = {
-        "Content-Type": "application/json",
-        "Accept": "application/json, text/event-stream"
-    }
-    
-    # 尝试不同的URL路径
-    test_urls = [
-        "http://localhost:18000",
-        "http://localhost:18000/mcp",
-        "http://localhost:18000/tools/hybrid_search",
-        "http://localhost:18000/api/hybrid_search"
-    ]
-    
-    print("开始测试混合检索HTTP服务...")
-    print(f"请求数据: {json.dumps(test_data, indent=2, ensure_ascii=False)}")
-    
-    for url in test_urls:
-        print(f"\n尝试URL: {url}")
-        try:
-            # 发送POST请求
-            response = requests.post(url, headers=headers, json=test_data, timeout=10)
-            
-            # 打印响应结果
-            print(f"响应状态码: {response.status_code}")
-            print(f"响应内容: {response.text[:200]}...")
-            
-            if response.status_code == 200:
-                # 解析JSON响应
-                response_data = response.json()
-                print(f"解析后的响应数据: {json.dumps(response_data, indent=2, ensure_ascii=False)}")
-                print("测试成功!")
-                return
-        except Exception as e:
-            print(f"请求失败: {str(e)}")
-    
-    print("所有URL路径都测试失败")
-
-if __name__ == "__main__":
-    test_hybrid_search()

+ 0 - 58
test/test_image_compression.py

@@ -1,58 +0,0 @@
-#!/usr/bin/env python3
-"""
-测试图片压缩功能
-"""
-
-import os
-import sys
-from io import BytesIO
-from PIL import Image
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from utils.minio.image_util import image_util
-
-# 生成一个大的测试图片
-def generate_test_image(width=2000, height=2000, color=(255, 0, 0)):
-    """
-    生成一个大的测试图片
-    """
-    print(f"生成测试图片,大小: {width}x{height}")
-    img = Image.new('RGB', (width, height), color=color)
-    img_stream = BytesIO()
-    img.save(img_stream, format='PNG')
-    img_stream.seek(0)
-    return img_stream
-
-# 测试图片压缩功能
-def test_image_compression():
-    """
-    测试图片压缩功能
-    """
-    print("开始测试图片压缩功能...")
-    
-    # 生成测试图片
-    img_stream = generate_test_image()
-    
-    # 检查压缩前大小
-    img_stream.seek(0, 2)
-    original_size = img_stream.tell() / 1024
-    img_stream.seek(0)
-    print(f"压缩前大小: {original_size:.2f}KB")
-    
-    # 调用压缩方法
-    compressed_stream = image_util._compress_image(img_stream, "test_image.png", max_size_kb=5000)
-    
-    # 检查压缩后大小
-    compressed_stream.seek(0, 2)
-    compressed_size = compressed_stream.tell() / 1024
-    compressed_stream.seek(0)
-    print(f"压缩后大小: {compressed_size:.2f}KB")
-    
-    # 验证压缩后大小
-    assert compressed_size <= 5000, f"压缩后大小 {compressed_size:.2f}KB 超过了最大限制 5000KB"
-    print("图片压缩测试成功!")
-
-if __name__ == "__main__":
-    test_image_compression()

+ 0 - 60
test/test_image_compression_bytes.py

@@ -1,60 +0,0 @@
-#!/usr/bin/env python3
-"""
-测试图片压缩到字节流功能
-"""
-
-import os
-import sys
-from io import BytesIO
-from PIL import Image
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from utils.minio.image_util import image_util
-
-# 生成一个大的测试图片
-def generate_test_image(width=2000, height=2000, color=(255, 0, 0)):
-    """
-    生成一个大的测试图片
-    """
-    print(f"生成测试图片,大小: {width}x{height}")
-    img = Image.new('RGB', (width, height), color=color)
-    img_stream = BytesIO()
-    img.save(img_stream, format='PNG')
-    img_stream.seek(0)
-    return img_stream
-
-# 测试图片压缩到字节流功能
-def test_image_compression_to_bytes():
-    """
-    测试图片压缩到字节流功能
-    """
-    print("开始测试图片压缩到字节流功能...")
-    
-    # 生成测试图片
-    img_stream = generate_test_image()
-    
-    # 检查压缩前大小
-    img_stream.seek(0, 2)
-    original_size = img_stream.tell() / 1024
-    img_stream.seek(0)
-    print(f"压缩前大小: {original_size:.2f}KB")
-    
-    # 调用压缩到字节流方法
-    compressed_bytes = image_util._compress_image_to_bytes(img_stream, "test_image.png", max_size_kb=5000)
-    
-    # 检查压缩后大小
-    compressed_size = len(compressed_bytes) / 1024
-    print(f"压缩后大小: {compressed_size:.2f}KB")
-    
-    # 验证压缩后大小
-    assert compressed_size <= 5000, f"压缩后大小 {compressed_size:.2f}KB 超过了最大限制 5000KB"
-    
-    # 验证返回类型
-    assert isinstance(compressed_bytes, bytes), f"返回类型应为bytes,实际为 {type(compressed_bytes)}"
-    
-    print("图片压缩到字节流测试成功!")
-
-if __name__ == "__main__":
-    test_image_compression_to_bytes()

+ 0 - 100
test/test_image_compression_fix.py

@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-测试图片压缩修复
-验证修改后的压缩方法是否能成功将图片压缩到5000KB以内
-"""
-import sys
-import os
-from io import BytesIO
-from PIL import Image
-import random
-
-# 添加项目根目录到Python路径
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from utils.minio.image_util import ImageUtil
-
-def create_large_test_image(width=3000, height=3000) -> BytesIO:
-    """
-    创建一个大尺寸测试图片
-    
-    Args:
-        width: 图片宽度
-        height: 图片高度
-        
-    Returns:
-        BytesIO: 大尺寸图片流
-    """
-    print(f"创建 {width}x{height} 的测试图片...")
-    
-    # 创建一个大尺寸图片,使用随机颜色填充
-    img = Image.new('RGB', (width, height), color=(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)))
-    
-    # 将图片保存到BytesIO流
-    img_stream = BytesIO()
-    img.save(img_stream, format='PNG')
-    img_stream.seek(0)
-    
-    # 检查图片大小
-    img_stream.seek(0, 2)
-    size_kb = img_stream.tell() / 1024
-    img_stream.seek(0)
-    
-    print(f"测试图片创建完成,大小为 {size_kb:.2f}KB")
-    return img_stream
-
-def test_image_compression():
-    """
-    测试图片压缩方法
-    """
-    print("开始测试图片压缩方法...")
-    
-    # 创建ImageUtil实例
-    image_util = ImageUtil()
-    
-    # 测试不同尺寸的图片压缩
-    test_sizes = [
-        (3000, 3000),   # 约 25MB
-        (4000, 4000),   # 约 45MB
-        (5000, 5000)    # 约 70MB
-    ]
-    
-    for width, height in test_sizes:
-        print(f"\n=== 测试 {width}x{height} 图片压缩 ===")
-        
-        # 创建大尺寸测试图片
-        img_stream = create_large_test_image(width, height)
-        
-        # 调用压缩方法
-        compressed_stream = image_util._compress_image(img_stream, "test_large_image.png")
-        
-        # 检查压缩后的大小
-        compressed_stream.seek(0, 2)
-        compressed_size_kb = compressed_stream.tell() / 1024
-        compressed_stream.seek(0)
-        
-        print(f"压缩后大小: {compressed_size_kb:.2f}KB")
-        
-        # 验证压缩结果
-        if compressed_size_kb <= 5000:
-            print("✅ 压缩成功!压缩后大小小于等于5000KB")
-        else:
-            print("❌ 压缩失败!压缩后大小仍大于5000KB")
-    
-    # 测试_compress_image_to_bytes方法
-    print(f"\n=== 测试 _compress_image_to_bytes 方法 ===")
-    img_stream = create_large_test_image(4000, 4000)
-    compressed_bytes = image_util._compress_image_to_bytes(img_stream)
-    compressed_size_kb = len(compressed_bytes) / 1024
-    print(f"压缩后字节大小: {compressed_size_kb:.2f}KB")
-    
-    if compressed_size_kb <= 5000:
-        print("✅ _compress_image_to_bytes 压缩成功!")
-    else:
-        print("❌ _compress_image_to_bytes 压缩失败!")
-    
-    print("\n=== 所有测试完成 ===")
-
-if __name__ == "__main__":
-    test_image_compression()

+ 0 - 132
test/test_image_compression_real.py

@@ -1,132 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-测试图片压缩修复 - 真实场景
-使用更真实的大图片验证压缩方法
-"""
-import sys
-import os
-from io import BytesIO
-from PIL import Image, ImageDraw, ImageFont
-import random
-
-# 添加项目根目录到Python路径
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from utils.minio.image_util import ImageUtil
-
-def create_complex_test_image(width=3000, height=3000) -> BytesIO:
-    """
-    创建一个复杂的大尺寸测试图片,包含多种元素以增加文件大小
-    
-    Args:
-        width: 图片宽度
-        height: 图片高度
-        
-    Returns:
-        BytesIO: 复杂大尺寸图片流
-    """
-    print(f"创建 {width}x{height} 的复杂测试图片...")
-    
-    # 创建一个白色背景图片
-    img = Image.new('RGB', (width, height), color=(255, 255, 255))
-    draw = ImageDraw.Draw(img)
-    
-    # 添加大量随机形状和颜色,增加图片复杂度
-    for _ in range(10000):
-        # 随机位置
-        x1 = random.randint(0, width)
-        y1 = random.randint(0, height)
-        x2 = x1 + random.randint(10, 100)
-        y2 = y1 + random.randint(10, 100)
-        
-        # 随机颜色
-        color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
-        
-        # 随机形状
-        shape_type = random.choice(['rectangle', 'ellipse', 'line'])
-        if shape_type == 'rectangle':
-            draw.rectangle([x1, y1, x2, y2], fill=color)
-        elif shape_type == 'ellipse':
-            draw.ellipse([x1, y1, x2, y2], fill=color)
-        else:
-            draw.line([x1, y1, x2, y2], fill=color, width=random.randint(1, 5))
-    
-    # 添加一些随机文本
-    try:
-        # 尝试使用默认字体
-        font = ImageFont.load_default()
-        for _ in range(1000):
-            text = f"Test {random.randint(1, 1000)}"
-            x = random.randint(0, width - 100)
-            y = random.randint(0, height - 20)
-            draw.text((x, y), text, fill=(0, 0, 0), font=font)
-    except Exception as e:
-        print(f"添加文本失败: {e}")
-    
-    # 将图片保存到BytesIO流,使用JPEG格式以获得更大的文件大小
-    img_stream = BytesIO()
-    img.save(img_stream, format='JPEG', quality=100)  # 使用最高质量生成大文件
-    img_stream.seek(0)
-    
-    # 检查图片大小
-    img_stream.seek(0, 2)
-    size_kb = img_stream.tell() / 1024
-    img_stream.seek(0)
-    
-    print(f"测试图片创建完成,大小为 {size_kb:.2f}KB")
-    return img_stream
-
-def test_image_compression():
-    """
-    测试图片压缩方法
-    """
-    print("开始测试图片压缩方法...")
-    
-    # 创建ImageUtil实例
-    image_util = ImageUtil()
-    
-    # 测试不同尺寸的复杂图片压缩
-    test_sizes = [
-        (3000, 3000),   # 约 25MB
-        (4000, 4000),   # 约 45MB
-    ]
-    
-    for width, height in test_sizes:
-        print(f"\n=== 测试 {width}x{height} 复杂图片压缩 ===")
-        
-        # 创建大尺寸测试图片
-        img_stream = create_complex_test_image(width, height)
-        
-        # 调用压缩方法
-        compressed_stream = image_util._compress_image(img_stream, "test_complex_image.jpg")
-        
-        # 检查压缩后的大小
-        compressed_stream.seek(0, 2)
-        compressed_size_kb = compressed_stream.tell() / 1024
-        compressed_stream.seek(0)
-        
-        print(f"压缩后大小: {compressed_size_kb:.2f}KB")
-        
-        # 验证压缩结果
-        if compressed_size_kb <= 5000:
-            print("✅ 压缩成功!压缩后大小小于等于5000KB")
-        else:
-            print("❌ 压缩失败!压缩后大小仍大于5000KB")
-    
-    # 测试_compress_image_to_bytes方法
-    print(f"\n=== 测试 _compress_image_to_bytes 方法 ===")
-    img_stream = create_complex_test_image(4000, 4000)
-    compressed_bytes = image_util._compress_image_to_bytes(img_stream)
-    compressed_size_kb = len(compressed_bytes) / 1024
-    print(f"压缩后字节大小: {compressed_size_kb:.2f}KB")
-    
-    if compressed_size_kb <= 5000:
-        print("✅ _compress_image_to_bytes 压缩成功!")
-    else:
-        print("❌ _compress_image_to_bytes 压缩失败!")
-    
-    print("\n=== 所有测试完成 ===")
-
-if __name__ == "__main__":
-    test_image_compression()

+ 0 - 180
test/test_infinity_encapsulation.py

@@ -1,180 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-测试Infinity向量数据库封装
-"""
-
-from services.utils.infinity_util import InfinityVectorDB
-from conf.config import VectorDBConfig
-
-def test_infinity_connection():
-    """
-    测试Infinity连接
-    """
-    print("=== 测试Infinity连接 ===")
-    
-    try:
-        # 初始化InfinityVectorDB
-        infinity_db = InfinityVectorDB()
-        print("✅ InfinityVectorDB初始化成功")
-        
-        # 测试索引创建
-        index_name = "test_collection"
-        print(f"\n测试创建索引: {index_name}")
-        result = infinity_db.create_index(index_name)
-        if result:
-            print(f"✅ 索引 {index_name} 创建成功")
-        else:
-            print(f"❌ 索引 {index_name} 创建失败")
-            return False
-        
-        # 测试索引存在检查
-        print(f"\n测试检查索引存在: {index_name}")
-        exists = infinity_db.index_exists(index_name)
-        if exists:
-            print(f"✅ 索引 {index_name} 存在")
-        else:
-            print(f"❌ 索引 {index_name} 不存在")
-            return False
-        
-        # 测试插入文档
-        print(f"\n测试插入文档")
-        document = {
-            "file_name": "test.pdf",
-            "file_page_count": 10,
-            "page_number": 1,
-            "text": "这是一个测试文档",
-            "image_path": "test.png",
-            "sparse_vector": [],
-            "dense_vector_1024": [0.1] * 1024,
-            "dataset_id": "test_dataset",
-            "document_id": "test_doc_id"
-        }
-        
-        insert_result = infinity_db.insert_document(index_name, document)
-        if insert_result:
-            print(f"✅ 文档插入成功")
-        else:
-            print(f"❌ 文档插入失败")
-            return False
-        
-        # 测试批量插入
-        print(f"\n测试批量插入文档")
-        documents = []
-        for i in range(2, 5):
-            doc = {
-                "file_name": "test.pdf",
-                "file_page_count": 10,
-                "page_number": i,
-                "text": f"这是第 {i} 页",
-                "image_path": f"test_{i}.png",
-                "sparse_vector": [],
-                "dense_vector_1024": [0.1] * 1024,
-                "dataset_id": "test_dataset",
-                "document_id": "test_doc_id"
-            }
-            documents.append(doc)
-        
-        bulk_result = infinity_db.bulk_insert(index_name, documents)
-        if bulk_result["success"] == len(documents):
-            print(f"✅ 批量插入成功,共插入 {bulk_result['success']} 个文档")
-        else:
-            print(f"❌ 批量插入失败,成功 {bulk_result['success']} 个,失败 {bulk_result['failed']} 个")
-            return False
-        
-        # 测试向量检索
-        print(f"\n测试向量检索")
-        vector = [0.1] * 1024
-        search_result = infinity_db.vector_search(index_name, "dense_vector_1024", vector, size=5)
-        if search_result["hits"]["total"] > 0:
-            print(f"✅ 向量检索成功,找到 {search_result['hits']['total']} 个结果")
-        else:
-            print(f"❌ 向量检索失败,未找到结果")
-        
-        # 测试混合检索
-        print(f"\n测试混合检索")
-        hybrid_result = infinity_db.hybrid_search(
-            index_name,
-            text_query="测试",
-            vector_field="dense_vector_1024",
-            vector=vector,
-            size=5
-        )
-        if hybrid_result["hits"]["total"] > 0:
-            print(f"✅ 混合检索成功,找到 {hybrid_result['hits']['total']} 个结果")
-        else:
-            print(f"❌ 混合检索失败,未找到结果")
-        
-        # 测试删除索引
-        print(f"\n测试删除索引: {index_name}")
-        delete_result = infinity_db.delete_index(index_name)
-        if delete_result:
-            print(f"✅ 索引 {index_name} 删除成功")
-        else:
-            print(f"❌ 索引 {index_name} 删除失败")
-            return False
-        
-        # 关闭连接
-        infinity_db.close()
-        print(f"\n✅ 成功关闭连接")
-        
-        return True
-        
-    except Exception as e:
-        print(f"\n❌ 测试失败: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-
-def test_vector_db_factory():
-    """
-    测试VectorDBFactory
-    """
-    print("\n=== 测试VectorDBFactory ===")
-    
-    try:
-        from services.utils.vector_db import VectorDBFactory
-        
-        # 获取向量数据库实例
-        vector_db = VectorDBFactory.get_vector_db()
-        print(f"✅ 成功获取向量数据库实例: {type(vector_db).__name__}")
-        
-        # 测试创建索引
-        index_name = "test_factory_collection"
-        result = vector_db.create_index(index_name)
-        if result:
-            print(f"✅ 通过工厂创建索引 {index_name} 成功")
-        else:
-            print(f"❌ 通过工厂创建索引 {index_name} 失败")
-        
-        vector_db.close()
-        print(f"✅ 成功关闭通过工厂获取的连接")
-        
-        return True
-        
-    except Exception as e:
-        print(f"\n❌ 工厂测试失败: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-
-if __name__ == "__main__":
-    print("开始测试Infinity向量数据库封装...")
-    
-    # 测试Infinity连接
-    connection_result = test_infinity_connection()
-    
-    # 测试VectorDBFactory
-    factory_result = test_vector_db_factory()
-    
-    # 总结
-    print("\n=== 测试总结 ===")
-    if connection_result and factory_result:
-        print("✅ 所有测试通过!")
-        exit(0)
-    else:
-        print("❌ 部分测试失败!")
-        exit(1)

+ 0 - 56
test/test_infinity_http.py

@@ -1,56 +0,0 @@
-import sys
-import os
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-def test_infinity_http_import():
-    """测试Infinity HTTP实现导入"""
-    try:
-        from services.utils.infinity_util import InfinityVectorDB
-        print("✓ InfinityVectorDB导入成功")
-        return True
-    except Exception as e:
-        print(f"✗ InfinityVectorDB导入失败: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-def test_infinity_http_init():
-    """测试Infinity HTTP实现初始化"""
-    try:
-        from services.utils.infinity_util import InfinityVectorDB
-        from conf.config import VectorDBConfig
-        
-        # 打印配置信息,方便调试
-        print(f"\nInfinity配置:")
-        print(f"  Host: {VectorDBConfig.get_infinity_host()}")
-        print(f"  Port: {VectorDBConfig.get_infinity_port()}")
-        print(f"  User: {VectorDBConfig.get_infinity_user()}")
-        print(f"  Password: {VectorDBConfig.get_infinity_password()}")
-        print(f"  Database: {VectorDBConfig.get_infinity_database()}")
-        
-        # 尝试初始化,但不实际连接
-        # 这里只检查初始化逻辑是否正确
-        print("\n✓ InfinityVectorDB初始化逻辑检查通过")
-        return True
-    except Exception as e:
-        print(f"✗ InfinityVectorDB初始化失败: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-def main():
-    """主测试函数"""
-    print("=== 测试Infinity HTTP实现 ===")
-    
-    # 测试导入
-    test_infinity_http_import()
-    
-    # 测试初始化
-    test_infinity_http_init()
-    
-    print("\n=== 测试完成 ===")
-
-if __name__ == "__main__":
-    main()

+ 0 - 60
test/test_infinity_sdk.py

@@ -1,60 +0,0 @@
-import sys
-import os
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-def test_infinity_import():
-    """测试Infinity SDK导入"""
-    try:
-        from services.utils.infinity_util import InfinityVectorDB
-        print("✓ InfinityVectorDB导入成功")
-        return True
-    except Exception as e:
-        print(f"✗ InfinityVectorDB导入失败: {str(e)}")
-        return False
-
-def test_infinity_sdk_version():
-    """测试Infinity SDK版本"""
-    try:
-        import infinity
-        print(f"✓ Infinity SDK版本: {infinity.__version__}")
-        return True
-    except Exception as e:
-        print(f"✗ 无法获取Infinity SDK版本: {str(e)}")
-        return False
-
-def test_infinity_api():
-    """测试Infinity API可用性"""
-    try:
-        import infinity
-        from infinity.common import ConflictType
-        
-        print(f"✓ infinity模块: {infinity}")
-        print(f"✓ ConflictType: {ConflictType}")
-        print(f"✓ NetworkAddress: {infinity.NetworkAddress}")
-        print(f"✓ DataType: {infinity.DataType}")
-        print(f"✓ IndexType: {infinity.IndexType}")
-        print(f"✓ MetricType: {infinity.MetricType}")
-        return True
-    except Exception as e:
-        print(f"✗ 无法访问Infinity API: {str(e)}")
-        return False
-
-def main():
-    """主测试函数"""
-    print("=== 测试Infinity SDK实现 ===")
-    
-    # 测试导入
-    test_infinity_import()
-    
-    # 测试SDK版本
-    test_infinity_sdk_version()
-    
-    # 测试API可用性
-    test_infinity_api()
-    
-    print("\n=== 测试完成 ===")
-
-if __name__ == "__main__":
-    main()

+ 0 - 6
test/test_mcp.py

@@ -1,6 +0,0 @@
-from PIL import Image
-
-if __name__ == "__main__":
-    image = Image.open("http://image.dawn-infinite.cn/file/1.png")
-    # 打开一个网络图片转换为Image.Image
-    

+ 0 - 72
test/test_mcp_hybrid_search.py

@@ -1,72 +0,0 @@
-#!/usr/bin/env python3
-"""
-测试MCP服务的混合检索功能
-"""
-
-import sys
-import os
-import json
-import unittest
-import requests
-from typing import Dict, Any
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-class TestMCPHybridSearch(unittest.TestCase):
-    """测试MCP服务的混合检索功能"""
-    
-    def setUp(self):
-        """设置测试环境"""
-        # MCP服务的基础URL
-        self.base_url = "http://localhost:18000"
-        
-        # 测试数据
-        # 测试图片
-        self.test_image = r"D:\project\work\ragflow_plugs\book\output\temp\2.png"
-        # 测试文本查询
-        self.test_text_query = "卡梅拉"
-
-    
-    def test_hybrid_search(self):
-        """测试混合检索API"""
-        print("测试混合检索API...")
-        
-        # 构建请求数据
-        data = {
-            "text_query": self.test_text_query,
-            "image": self.test_image,
-            "topn": 2
-        }
-        
-        # 发送请求
-        response = requests.post(
-            f"{self.base_url}/tools/hybrid_search",
-            json=data
-        )
-        
-        # 验证响应
-        self.assertEqual(response.status_code, 200, f"请求失败: {response.text}")
-        result = response.json()
-        self.assertTrue(result["success"], f"API调用失败: {result.get('message', '未知错误')}")
-        self.assertIn("output", result, "响应中缺少hits字段")
-        self.assertIn("total", result, "响应中缺少total字段")
-        self.assertIsInstance(result["output"], list, "hits字段应该是一个列表")
-        self.assertIsInstance(result["total"], int, "total字段应该是一个整数")
-        
-        print(f"✓ 混合检索API测试通过,命中数量: {result['total']}")
-    
- 
-
-if __name__ == "__main__":
-    """运行测试"""
-    print("开始测试MCP服务的混合检索功能...\n")
-    
-    # 创建测试套件
-    suite = unittest.TestLoader().loadTestsFromTestCase(TestMCPHybridSearch)
-    
-    # 运行测试
-    runner = unittest.TextTestRunner(verbosity=2)
-    result = runner.run(suite)
-    
-    print(f"\n测试完成,共运行 {result.testsRun} 个测试,成功 {result.testsRun - len(result.failures) - len(result.errors)} 个,失败 {len(result.failures)} 个,错误 {len(result.errors)} 个")

+ 0 - 47
test/test_mcp_simple.py

@@ -1,47 +0,0 @@
-#!/usr/bin/env python3
-"""
-简单测试MCP服务的API路径
-"""
-
-import requests
-
-# MCP服务的基础URL
-BASE_URL = "http://localhost:18000"
-
-# 测试不同的API路径格式
-test_paths = [
-    "/vectorize_store",
-    "/tools/vectorize_store",
-    "/mcp/tools/vectorize_store",
-    "/api/vectorize_store",
-    "/",
-    "/docs",
-    "/openapi.json"
-]
-
-# 测试数据
-test_data = {
-    "dataset_id": "test_dataset_001",
-    "book_name": "测试书籍",
-    "document_id": "test_doc_001",
-    "parsed_results": [
-        {
-            "page_number": 1,
-            "content": "这是测试书籍的第1页内容",
-            "image_url": "https://example.com/image1.jpg"
-        }
-    ]
-}
-
-print("开始测试MCP服务的API路径...\n")
-
-for path in test_paths:
-    url = f"{BASE_URL}{path}"
-    print(f"测试路径: {url}")
-    try:
-        response = requests.post(url, json=test_data, timeout=5)
-        print(f"状态码: {response.status_code}")
-        print(f"响应内容: {response.text[:100]}...")
-    except Exception as e:
-        print(f"请求失败: {str(e)}")
-    print("-" * 50)

+ 0 - 59
test/test_multimodal_embedding.py

@@ -1,59 +0,0 @@
-#!/usr/bin/env python3
-"""
-测试MultimodalEmbedding类的修复
-"""
-
-from services.model.multimodal_embedding import MultimodalEmbedding
-
-def test_multimodal_embedding_init():
-    """测试MultimodalEmbedding实例化"""
-    print("=== 测试MultimodalEmbedding实例化 ===")
-    try:
-        # 尝试实例化MultimodalEmbedding类
-        embedding = MultimodalEmbedding()
-        print("✓ MultimodalEmbedding实例化成功")
-        print(f"  模型提供商: {embedding.model_provider}")
-        print(f"  模型名称: {embedding.model_name}")
-        return True
-    except Exception as e:
-        print(f"✗ MultimodalEmbedding实例化失败: {str(e)}")
-        return False
-
-def test_multimodal_embedding_methods():
-    """测试MultimodalEmbedding方法"""
-    print("\n=== 测试MultimodalEmbedding方法 ===")
-    try:
-        embedding = MultimodalEmbedding()
-        
-        # 测试方法是否存在
-        methods_to_test = [
-            'get_text_embedding',
-            'get_texts_embedding',
-            'get_image_embedding',
-            'get_multimodal_embedding'
-        ]
-        
-        for method_name in methods_to_test:
-            if hasattr(embedding, method_name) and callable(getattr(embedding, method_name)):
-                print(f"✓ 方法 {method_name} 存在且可调用")
-            else:
-                print(f"✗ 方法 {method_name} 不存在或不可调用")
-                return False
-        
-        return True
-    except Exception as e:
-        print(f"✗ 测试方法存在性失败: {str(e)}")
-        return False
-
-if __name__ == "__main__":
-    print("开始测试MultimodalEmbedding修复...")
-    
-    test1 = test_multimodal_embedding_init()
-    test2 = test_multimodal_embedding_methods()
-    
-    if test1 and test2:
-        print("\n🎉 所有测试通过!MultimodalEmbedding修复成功。")
-        exit(0)
-    else:
-        print("\n❌ 测试失败!MultimodalEmbedding修复存在问题。")
-        exit(1)

+ 0 - 126
test/test_mysql_conn.py

@@ -1,126 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-MySQL连接工具类测试脚本
-"""
-import sys
-import os
-
-# 添加项目根目录到Python路径
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from utils.mysql_conn import get_mysql_conn
-
-def test_mysql_conn():
-    """
-    测试MySQL连接工具类
-    """
-    print("测试MySQL连接工具类...")
-    
-    try:
-        # 获取MySQL连接管理器实例
-        mysql_conn = get_mysql_conn(
-            host="localhost",
-            port=3306,
-            user="root",
-            password="password",
-            database="test_db",
-            pool_size=3
-        )
-        
-        print("✓ 成功获取MySQL连接管理器实例")
-        
-        # 测试创建表
-        create_table_sql = """
-        CREATE TABLE IF NOT EXISTS test_users (
-            id INT AUTO_INCREMENT PRIMARY KEY,
-            name VARCHAR(50) NOT NULL,
-            email VARCHAR(100) NOT NULL UNIQUE,
-            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
-        """
-        
-        result = mysql_conn.execute(create_table_sql)
-        print("✓ 成功创建测试表")
-        
-        # 测试插入数据
-        insert_sql = "INSERT INTO test_users (name, email) VALUES (%s, %s)"
-        insert_params = ("测试用户", "test@example.com")
-        
-        row_count = mysql_conn.execute(insert_sql, insert_params)
-        print(f"✓ 成功插入 {row_count} 条数据")
-        
-        # 测试查询数据
-        select_sql = "SELECT * FROM test_users WHERE name = %s"
-        select_params = ("测试用户",)
-        
-        user = mysql_conn.fetch_one(select_sql, select_params)
-        if user:
-            print(f"✓ 成功查询数据: {user}")
-        else:
-            print("✗ 查询数据失败")
-        
-        # 测试批量插入
-        bulk_insert_sql = "INSERT INTO test_users (name, email) VALUES (%s, %s)"
-        bulk_params = [
-            ("批量用户1", "batch1@example.com"),
-            ("批量用户2", "batch2@example.com"),
-            ("批量用户3", "batch3@example.com")
-        ]
-        
-        bulk_row_count = mysql_conn.bulk_insert(bulk_insert_sql, bulk_params)
-        print(f"✓ 成功批量插入 {bulk_row_count} 条数据")
-        
-        # 测试查询所有数据
-        select_all_sql = "SELECT * FROM test_users"
-        all_users = mysql_conn.fetch_all(select_all_sql)
-        print(f"✓ 成功查询所有数据,共 {len(all_users)} 条")
-        
-        # 测试更新数据
-        update_sql = "UPDATE test_users SET name = %s WHERE id = %s"
-        update_params = ("更新后的测试用户", user["id"])
-        
-        update_row_count = mysql_conn.execute(update_sql, update_params)
-        print(f"✓ 成功更新 {update_row_count} 条数据")
-        
-        # 测试删除数据
-        delete_sql = "DELETE FROM test_users WHERE id = %s"
-        delete_params = (user["id"],)
-        
-        delete_row_count = mysql_conn.execute(delete_sql, delete_params)
-        print(f"✓ 成功删除 {delete_row_count} 条数据")
-        
-        # 测试事务
-        print("测试事务处理...")
-        conn, cursor = mysql_conn.begin_transaction()
-        try:
-            # 在事务中执行多个操作
-            cursor.execute("INSERT INTO test_users (name, email) VALUES (%s, %s)", ("事务用户1", "transaction1@example.com"))
-            cursor.execute("INSERT INTO test_users (name, email) VALUES (%s, %s)", ("事务用户2", "transaction2@example.com"))
-            mysql_conn.commit_transaction(conn, cursor)
-            print("✓ 事务提交成功")
-        except Exception as e:
-            mysql_conn.rollback_transaction(conn, cursor)
-            print(f"✗ 事务回滚: {e}")
-        
-        # 清理测试数据
-        drop_table_sql = "DROP TABLE IF EXISTS test_users"
-        mysql_conn.execute(drop_table_sql)
-        print("✓ 成功清理测试表")
-        
-        # 关闭连接池
-        mysql_conn.close()
-        print("✓ 成功关闭连接池")
-        
-        print("\n🎉 所有测试通过!MySQL连接工具类工作正常。")
-        
-    except Exception as e:
-        print(f"\n❌ 测试失败: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-    
-    return True
-
-if __name__ == "__main__":
-    test_mysql_conn()

+ 0 - 34
test/test_simple.py

@@ -1,34 +0,0 @@
-"""简单测试脚本,直接测试PDF解析服务"""
-
-import sys
-import os
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-from services.pdf_parser.main import PDFParsingService
-
-def test_simple():
-    """简单测试函数"""
-    print("开始简单测试PDF解析服务...")
-    print("=" * 50)
-    
-    try:
-        # 测试服务初始化
-        service = PDFParsingService()
-        print("✓ 服务初始化成功")
-        
-        # 测试核心功能
-        print("✓ 核心功能可用")
-        
-        print("\n服务功能测试完成!")
-        print("使用示例:")
-        print("python -m services.pdf_parser.main --pdf_path <pdf文件路径> --output <输出json路径>")
-        print("\n例如:")
-        print("python -m services.pdf_parser.main --pdf_path sample.pdf --output result.json")
-        
-        return True
-    except Exception as e:
-        print(f"✗ 测试失败: {str(e)}")
-        return False
-
-if __name__ == "__main__":
-    test_simple()

+ 0 - 39
test/test_upload_document.py

@@ -1,39 +0,0 @@
-import sys
-import os
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-from services.utils.http_client import HTTPClient
-from services.ragflow.document_service import DocumentService
-
-# 配置信息
-API_URL = "http://localhost:8000"  # 替换为实际的RAGFlow API URL
-API_KEY = "your_api_key"  # 替换为实际的API密钥
-DATASET_ID = "your_dataset_id"  # 替换为实际的数据集ID
-PDF_PATH = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"  # 使用已有的测试PDF文件
-
-def test_upload_document():
-    """测试上传文档功能"""
-    try:
-        # 创建HTTP客户端实例
-        http_client = HTTPClient(base_url=API_URL, api_key=API_KEY)
-        
-        # 创建文档服务实例
-        document_service = DocumentService(http_client)
-        
-        # 调用上传文档方法
-        print(f"开始上传文档: {PDF_PATH}")
-        result = document_service.upload_document(DATASET_ID, PDF_PATH)
-        
-        # 打印结果
-        print(f"文档上传成功: {result}")
-        return True
-    except Exception as e:
-        print(f"文档上传失败: {str(e)}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-if __name__ == "__main__":
-    test_upload_document()

+ 0 - 104
test/test_vector_db.py

@@ -1,104 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-向量数据库测试脚本
-测试向量数据库工厂类的基本功能和配置切换
-"""
-
-from services.utils.vector_db import VectorDBFactory
-from conf.config import VectorDBConfig
-
-def test_vector_db_factory():
-    """
-    测试向量数据库工厂类
-    """
-    print("=== 测试向量数据库工厂类 ===")
-    
-    # 获取配置的向量数据库类型
-    vector_db_type = VectorDBConfig.get_vector_db_type()
-    print(f"当前配置的向量数据库类型: {vector_db_type}")
-    
-    try:
-        # 获取向量数据库实例
-        vector_db = VectorDBFactory.get_vector_db()
-        print(f"成功获取向量数据库实例: {type(vector_db).__name__}")
-        
-        # 测试创建索引
-        index_name = "test_index"
-        print(f"\n测试创建索引: {index_name}")
-        result = vector_db.create_index(index_name)
-        print(f"创建索引结果: {result}")
-        
-        # 测试向量检索接口
-        print(f"\n测试向量检索接口")
-        vector = [0.1] * 768
-        result = vector_db.vector_search(index_name, "vector_768_vec", vector, size=5)
-        print(f"向量检索结果: {result}")
-        
-        # 测试混合检索接口
-        print(f"\n测试混合检索接口")
-        result = vector_db.hybrid_search(
-            index_name, 
-            text_query="测试", 
-            vector_field="vector_768_vec", 
-            vector=vector, 
-            size=5
-        )
-        print(f"混合检索结果: {result}")
-        
-        # 关闭连接
-        vector_db.close()
-        print(f"\n成功关闭向量数据库连接")
-        
-        return True
-        
-    except Exception as e:
-        print(f"测试失败: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-def test_vector_db_switch():
-    """
-    测试向量数据库切换功能
-    """
-    print("\n=== 测试向量数据库切换功能 ===")
-    
-    # 测试不同类型的向量数据库
-    test_types = ["es", "infinity"]
-    
-    for db_type in test_types:
-        print(f"\n测试向量数据库类型: {db_type}")
-        try:
-            # 临时修改配置(实际使用时通过环境变量配置)
-            from conf.config import VectorDBConfig
-            
-            # 注意:这里我们不能直接修改配置类的静态方法返回值
-            # 所以我们通过工厂类的实现来测试
-            
-            # 这里只测试工厂类是否能正确创建不同类型的向量数据库
-            if db_type == "es":
-                from services.utils.vector_db import ElasticsearchVectorDB
-                vector_db = ElasticsearchVectorDB()
-            else:
-                from services.utils.vector_db import InfinityVectorDB
-                vector_db = InfinityVectorDB()
-            
-            print(f"成功创建{db_type}向量数据库实例: {type(vector_db).__name__}")
-            vector_db.close()
-            print(f"成功关闭{db_type}向量数据库连接")
-            
-        except Exception as e:
-            print(f"测试{db_type}失败: {e}")
-            import traceback
-            traceback.print_exc()
-
-if __name__ == "__main__":
-    # 测试向量数据库工厂类
-    test_vector_db_factory()
-    
-    # 测试向量数据库切换功能
-    test_vector_db_switch()
-    
-    print("\n=== 测试完成 ===")

+ 0 - 71
test/test_workflow.py

@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-PDF解析工作流测试脚本
-测试包含向量化入库的完整工作流
-"""
-
-import os
-import sys
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-from services.pdf_parser.workflow import PDFParsingWorkflow
-
-
-def test_pdf_parsing_workflow():
-    """
-    测试PDF解析工作流,包括向量化入库
-    """
-    print("=== 测试PDF解析工作流 ===")
-    
-    # 测试参数
-    pdf_path = "test/sample.pdf"  # 替换为实际的测试PDF路径
-    dataset_id = "test_dataset"
-    ragflow_api_url = "http://localhost:8000/"  # 替换为实际的RAGFLOW API URL
-    rag_flow_api_key = "test_api_key"  # 替换为实际的API密钥
-    
-    try:
-        # 检查测试PDF文件是否存在
-        if not os.path.exists(pdf_path):
-            print(f"测试PDF文件不存在: {pdf_path}")
-            print("请将测试PDF文件放置在指定位置")
-            return False
-        
-        # 初始化工作流
-        workflow = PDFParsingWorkflow()
-        print(f"工作流初始化成功")
-        
-        # 运行工作流
-        print(f"开始运行工作流,解析PDF: {pdf_path}")
-        result = workflow.run(
-            pdf_path=pdf_path,
-            dataset_id=dataset_id,
-            ragflow_api_url=ragflow_api_url,
-            rag_flow_api_key=rag_flow_api_key
-        )
-        
-        # 打印结果
-        print(f"\n工作流运行完成")
-        print(f"解析页面数量: {len(result.get('parsed_results', []))}")
-        print(f"向量化页面数量: {result.get('vectorized_pages', 0)}")
-        print(f"向量化结果数量: {len(result.get('vectorized_results', []))}")
-        
-        # 检查结果
-        if result.get('is_complete', False):
-            print("\n✅ 工作流运行成功!")
-            return True
-        else:
-            print("\n❌ 工作流运行失败!")
-            return False
-            
-    except Exception as e:
-        print(f"\n❌ 测试失败: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-
-if __name__ == "__main__":
-    test_pdf_parsing_workflow()

+ 0 - 99
test/verify_pdf_splitter.py

@@ -1,99 +0,0 @@
-"""PDF拆分功能验证脚本"""
-
-import sys
-import os
-from pathlib import Path
-
-sys.path.append(os.path.dirname(os.path.abspath(__file__)))
-
-def check_pymupdf():
-    """检查PyMuPDF是否安装"""
-    print("检查PyMuPDF是否安装...")
-    try:
-        import fitz
-        print(f"✓ PyMuPDF已安装")
-        print(f"  版本: {fitz.__version__}")
-        return True
-    except ImportError:
-        print("✗ 未安装PyMuPDF,请运行: pip install PyMuPDF")
-        return False
-    except Exception as e:
-        print(f"✗ 检查PyMuPDF时出错: {str(e)}")
-        return False
-
-def check_pdf_file(pdf_path):
-    """检查PDF文件是否存在"""
-    print(f"检查PDF文件: {pdf_path}")
-    pdf = Path(pdf_path)
-    if pdf.exists():
-        print(f"✓ PDF文件存在,大小: {pdf.stat().st_size} 字节")
-        return True
-    else:
-        print(f"✗ PDF文件不存在: {pdf_path}")
-        return False
-
-def test_pdf_splitter():
-    """测试PDF拆分功能"""
-    print("=" * 50)
-    print("PDF拆分功能验证")
-    print("=" * 50)
-    
-    # 检查PyMuPDF
-    pymupdf_ok = check_pymupdf()
-    print()
-    
-    # 检查示例PDF文件
-    pdf_path = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"
-    pdf_ok = check_pdf_file(pdf_path)
-    print()
-    
-    if not pymupdf_ok:
-        print("=" * 50)
-        print("PyMuPDF安装指南:")
-        print("1. 运行命令安装: pip install PyMuPDF")
-        print("2. 安装完成后重试")
-        print("=" * 50)
-        return False
-    
-    if not pdf_ok:
-        print("=" * 50)
-        print(r"请确保PDF文件存在: D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf")
-        print("或修改脚本中的pdf_path变量为实际的PDF文件路径")
-        print("=" * 50)
-        return False
-    
-    # 测试PDF拆分功能
-    print("开始测试PDF拆分功能...")
-    try:
-        from services.pdf_parser.pdf_splitter import PDFSplitter
-        
-        splitter = PDFSplitter()
-        print(f"正在拆分PDF: {pdf_path}")
-        pages = splitter.split_pdf(pdf_path)
-        
-        print(f"✓ PDF拆分成功,共 {len(pages)} 页")
-        for page in pages[:3]:  # 只显示前3页
-            print(f"  - 页码: {page['page_number']}, 图像大小: {page['image'].size}")
-        
-        if len(pages) > 3:
-            print(f"  ... 以及 {len(pages) - 3} 页")
-        
-        return True
-    except Exception as e:
-        print(f"✗ PDF拆分失败: {str(e)}")
-        print("可能的解决方案:")
-        print("1. 确保poppler已正确安装并在PATH中")
-        print("2. 检查PDF文件是否损坏")
-        print("3. 检查pdf2image库版本是否兼容")
-        return False
-
-def main():
-    """主函数"""
-    test_pdf_splitter()
-    
-    print("\n" + "=" * 50)
-    print("验证完成")
-    print("=" * 50)
-
-if __name__ == "__main__":
-    main()

+ 0 - 38
test/vl_embedding_test.py

@@ -1,38 +0,0 @@
-import sys
-import os
-from PIL import Image
-
-# 添加项目根目录到Python路径
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-from model.multimodal_embedding import Embedding
-from conf.config import ModelConfig
-
-def main():
-    """测试主函数"""
-    print("开始测试VL嵌入...")
-    print("=" * 50)
-    
-    # 初始化OpenAIEmbedding模型
-    embedding_model = Embedding("qwen2.5-vl-embedding", "sk-bc0f1026a41c4c92beb014be8973e4e2")
-    # 图片
-    image_path = r"D:\project\work\ragflow_plugs\book\output\temp\美美.png"
-    
-    # 检查图片文件是否存在
-    if not os.path.exists(image_path):
-        print(f"图片文件不存在: {image_path}")
-        return
-    
-    try:
-        # 打开图像文件
-        image = Image.open(image_path)
-        text = "美美"
-        res = embedding_model.get_multimodal_embedding(text, image)
-        print(f"图片embedding值: {res}")
-    except Exception as e:
-        print(f"测试失败: {str(e)}")
-    
-    print("=" * 50)
-
-if __name__ == "__main__":
-    main()

BIN
utils/__pycache__/__init__.cpython-312.pyc


+ 0 - 17
utils/es/__init__.py

@@ -1,17 +0,0 @@
-"""
-Elasticsearch 工具模块
-"""
-
-from .base import ESConnection
-from .constants import ES_DEFAULT_CONFIG
-from .document import DocumentManager
-from .index import IndexManager
-from .search import SearchManager
-
-__all__ = [
-    "ESConnection",
-    "ES_DEFAULT_CONFIG",
-    "DocumentManager",
-    "IndexManager",
-    "SearchManager"
-]

BIN
utils/es/__pycache__/__init__.cpython-312.pyc


BIN
utils/es/__pycache__/base.cpython-312.pyc


BIN
utils/es/__pycache__/constants.cpython-312.pyc


BIN
utils/es/__pycache__/document.cpython-312.pyc


BIN
utils/es/__pycache__/index.cpython-312.pyc


BIN
utils/es/__pycache__/search.cpython-312.pyc


BIN
utils/es/__pycache__/templates.cpython-312.pyc


+ 0 - 68
utils/es/base.py

@@ -1,68 +0,0 @@
-"""
-Elasticsearch 连接基础类
-"""
-from typing import List, Dict, Any, Optional
-from elasticsearch import Elasticsearch
-from elastic_transport import ConnectionTimeout
-from utils.decorators import singleton
-from utils.es.constants import ES_DEFAULT_CONFIG
-from utils.es.templates import get_dynamic_templates
-
-
-@singleton
-class ESConnection:
-    """
-    Elasticsearch 连接管理器
-    支持:
-    - 单例模式
-    - 连接池管理
-    - 基础配置管理
-    """
-    
-    def __init__(self, hosts: List[str] = None, **kwargs):
-        """
-        初始化 Elasticsearch 连接
-        
-        Args:
-            hosts: Elasticsearch 主机列表,格式如 ["http://localhost:9200"]
-            **kwargs: 其他 Elasticsearch 客户端配置参数
-        """
-        # 合并配置
-        self.config = {**ES_DEFAULT_CONFIG, **kwargs}
-        self.hosts = hosts or ES_DEFAULT_CONFIG.get("hosts", ["http://localhost:9200"])
-        
-        # 初始化 Elasticsearch 客户端
-        self.es = Elasticsearch(
-            hosts=self.hosts,
-            **self.config
-        )
-        
-        # 动态模板映射
-        self.dynamic_templates = get_dynamic_templates()
-    
-    def ping(self) -> bool:
-        """
-        检查 ES 连接是否正常
-        
-        Returns:
-            bool: 连接是否正常
-        """
-        try:
-            return self.es.ping()
-        except Exception:
-            return False
-    
-    def get_client(self) -> Elasticsearch:
-        """
-        获取 ES 客户端实例
-        
-        Returns:
-            Elasticsearch: ES 客户端实例
-        """
-        return self.es
-    
-    def close(self):
-        """
-        关闭 Elasticsearch 连接
-        """
-        self.es.close()

+ 0 - 25
utils/es/constants.py

@@ -1,25 +0,0 @@
-"""
-Elasticsearch 常量配置
-"""
-
-# 默认配置
-ES_DEFAULT_CONFIG = {
-    "http_compress": True,
-    "max_retries": 3,
-    "retry_on_timeout": True,
-    "timeout": 60,
-    "sniff_on_start": False,
-    "sniff_on_connection_fail": False,
-    "sniffer_timeout": 0,
-    "connections_per_node": 5,  # 每个节点的连接数
-    "randomize_nodes_in_pool": True
-}
-
-# 连接池大小
-ES_CONNECTIONS_PER_NODE = 5
-
-# 默认超时时间
-ES_DEFAULT_TIMEOUT = 60
-
-# 默认主机
-ES_DEFAULT_HOSTS = ["http://localhost:9200"]

+ 0 - 192
utils/es/document.py

@@ -1,192 +0,0 @@
-"""
-Elasticsearch 文档管理
-"""
-from typing import List, Dict, Any, Optional
-from elasticsearch.helpers import bulk, BulkIndexError
-from elasticsearch.exceptions import NotFoundError
-from utils.es.base import ESConnection
-
-
-class DocumentManager:
-    """
-    Elasticsearch 文档管理器
-    负责:
-    - 文档插入(单条和批量)
-    - 文档更新
-    - 文档删除(单条和批量)
-    - 文档获取
-    """
-    
-    def __init__(self, es_connection: Optional[ESConnection] = None):
-        """
-        初始化文档管理器
-        
-        Args:
-            es_connection: ES 连接实例,可选
-        """
-        self.es_conn = es_connection or ESConnection()
-        self.es = self.es_conn.get_client()
-    
-    def insert(self, index_name: str, document: Dict[str, Any], id: str = None, refresh: bool = False) -> bool:
-        """
-        插入单个文档
-        
-        Args:
-            index_name: 索引名称
-            document: 文档内容
-            id: 文档ID,可选
-            refresh: 是否立即刷新
-        
-        Returns:
-            bool: 插入是否成功
-        """
-        try:
-            self.es.index(index=index_name, body=document, id=id, refresh=refresh)
-            return True
-        except Exception as e:
-            print(f"插入文档失败: {e}")
-            return False
-    
-    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]], refresh: bool = False) -> Dict[str, Any]:
-        """
-        批量插入文档
-        
-        Args:
-            index_name: 索引名称
-            documents: 文档列表,每个文档可以包含"_id"字段指定ID
-            refresh: 是否立即刷新
-        
-        Returns:
-            Dict: 包含成功和失败信息的字典
-        """
-        try:
-            # 准备批量操作
-            actions = []
-            for doc in documents:
-                action = {
-                    "_index": index_name,
-                    "_source": doc.copy()
-                }
-                # 如果文档包含"_id"字段,将其作为文档ID
-                if "_id" in doc:
-                    action["_id"] = doc["_id"]
-                    del action["_source"]["_id"]
-                actions.append(action)
-            
-            # 执行批量操作
-            success, failed = bulk(self.es, actions, refresh=refresh, stats_only=False)
-            
-            return {
-                "success": success,
-                "failed": len(failed) if failed else 0,
-                "errors": failed if failed else []
-            }
-        except BulkIndexError as e:
-            print(f"批量插入失败: {e}")
-            return {
-                "success": 0,
-                "failed": len(e.errors),
-                "errors": e.errors
-            }
-        except Exception as e:
-            print(f"批量插入失败: {e}")
-            return {
-                "success": 0,
-                "failed": len(documents),
-                "errors": [str(e)] * len(documents)
-            }
-    
-    def update(self, index_name: str, id: str, update_body: Dict[str, Any], refresh: bool = False) -> bool:
-        """
-        更新文档
-        
-        Args:
-            index_name: 索引名称
-            id: 文档ID
-            update_body: 更新内容,格式如 {"doc": {"field": "value"}}
-            refresh: 是否立即刷新
-        
-        Returns:
-            bool: 更新是否成功
-        """
-        try:
-            self.es.update(index=index_name, id=id, body=update_body, refresh=refresh)
-            return True
-        except NotFoundError:
-            print(f"文档不存在: {id}")
-            return False
-        except Exception as e:
-            print(f"更新文档失败: {e}")
-            return False
-    
-    def delete(self, index_name: str, id: str, refresh: bool = False) -> bool:
-        """
-        删除单个文档
-        
-        Args:
-            index_name: 索引名称
-            id: 文档ID
-            refresh: 是否立即刷新
-        
-        Returns:
-            bool: 删除是否成功
-        """
-        try:
-            self.es.delete(index=index_name, id=id, refresh=refresh)
-            return True
-        except NotFoundError:
-            print(f"文档不存在: {id}")
-            return False
-        except Exception as e:
-            print(f"删除文档失败: {e}")
-            return False
-    
-    def delete_by_query(self, index_name: str, query: Dict[str, Any], refresh: bool = False) -> Dict[str, Any]:
-        """
-        按查询条件删除文档
-        
-        Args:
-            index_name: 索引名称
-            query: 查询条件
-            refresh: 是否立即刷新
-        
-        Returns:
-            Dict: 删除结果
-        """
-        try:
-            result = self.es.delete_by_query(index=index_name, body={"query": query}, refresh=refresh)
-            return {
-                "deleted": result["deleted"],
-                "failed": 0
-            }
-        except Exception as e:
-            print(f"按条件删除失败: {e}")
-            return {
-                "deleted": 0,
-                "failed": 1,
-                "error": str(e)
-            }
-    
-    def get(self, index_name: str, id: str, fields: List[str] = None) -> Optional[Dict[str, Any]]:
-        """
-        获取单个文档
-        
-        Args:
-            index_name: 索引名称
-            id: 文档ID
-            fields: 要返回的字段列表,可选
-        
-        Returns:
-            Dict: 文档内容,不存在则返回None
-        """
-        try:
-            params = {}
-            if fields:
-                params["_source"] = fields
-            result = self.es.get(index=index_name, id=id, **params)
-            return result["_source"]
-        except NotFoundError:
-            return None
-        except Exception as e:
-            print(f"获取文档失败: {e}")
-            return None

+ 0 - 131
utils/es/index.py

@@ -1,131 +0,0 @@
-"""
-Elasticsearch 索引管理
-"""
-from typing import Dict, Any, Optional
-from utils.es.base import ESConnection
-
-
-class IndexManager:
-    """
-    Elasticsearch 索引管理器
-    负责:
-    - 索引创建
-    - 索引删除
-    - 索引检查
-    """
-    
-    def __init__(self, es_connection: Optional[ESConnection] = None):
-        """
-        初始化索引管理器
-        
-        Args:
-            es_connection: ES 连接实例,可选
-        """
-        self.es_conn = es_connection or ESConnection()
-        self.es = self.es_conn.get_client()
-    
-    def create_index(self, index_name: str, mappings: Dict[str, Any] = None, settings: Dict[str, Any] = None) -> bool:
-        """
-        创建索引
-        
-        Args:
-            index_name: 索引名称
-            mappings: 自定义映射,会与动态模板合并
-            settings: 索引设置
-        
-        Returns:
-            bool: 创建是否成功
-        """
-        try:
-            # 如果索引已存在,返回True
-            if self.es.indices.exists(index=index_name):
-                return True
-            
-            # 合并动态模板和自定义映射
-            final_mappings = self.es_conn.dynamic_templates.copy()
-            if mappings:
-                if "dynamic_templates" in mappings:
-                    final_mappings["dynamic_templates"] += mappings["dynamic_templates"]
-                if "properties" in mappings:
-                    final_mappings["properties"] = mappings["properties"]
-            
-            body = {}
-            if settings:
-                body["settings"] = settings
-            body["mappings"] = final_mappings
-            
-            self.es.indices.create(index=index_name, body=body)
-            return True
-        except Exception as e:
-            print(f"创建索引失败: {e}")
-            return False
-    
-    def delete_index(self, index_name: str) -> bool:
-        """
-        删除索引
-        
-        Args:
-            index_name: 索引名称
-        
-        Returns:
-            bool: 删除是否成功
-        """
-        try:
-            if self.es.indices.exists(index=index_name):
-                self.es.indices.delete(index=index_name)
-            return True
-        except Exception as e:
-            print(f"删除索引失败: {e}")
-            return False
-    
-    def exists(self, index_name: str) -> bool:
-        """
-        检查索引是否存在
-        
-        Args:
-            index_name: 索引名称
-        
-        Returns:
-            bool: 索引是否存在
-        """
-        try:
-            return self.es.indices.exists(index=index_name)
-        except Exception as e:
-            print(f"检查索引存在失败: {e}")
-            return False
-    
-    def get_mappings(self, index_name: str) -> Optional[Dict[str, Any]]:
-        """
-        获取索引映射
-        
-        Args:
-            index_name: 索引名称
-        
-        Returns:
-            Dict[str, Any]: 索引映射,不存在则返回None
-        """
-        try:
-            if self.exists(index_name):
-                return self.es.indices.get_mapping(index=index_name)
-            return None
-        except Exception as e:
-            print(f"获取索引映射失败: {e}")
-            return None
-    
-    def get_settings(self, index_name: str) -> Optional[Dict[str, Any]]:
-        """
-        获取索引设置
-        
-        Args:
-            index_name: 索引名称
-        
-        Returns:
-            Dict[str, Any]: 索引设置,不存在则返回None
-        """
-        try:
-            if self.exists(index_name):
-                return self.es.indices.get_settings(index=index_name)
-            return None
-        except Exception as e:
-            print(f"获取索引设置失败: {e}")
-            return None

+ 0 - 202
utils/es/search.py

@@ -1,202 +0,0 @@
-"""
-Elasticsearch 搜索管理
-"""
-from typing import List, Dict, Any, Optional
-from utils.es.base import ESConnection
-
-
-class SearchManager:
-    """
-    Elasticsearch 搜索管理器
-    负责:
-    - 全文检索
-    - 向量相似度检索(k-NN)
-    - 混合检索(文本+向量)
-    - 高亮显示
-    """
-    
-    def __init__(self, es_connection: Optional[ESConnection] = None):
-        """
-        初始化搜索管理器
-        
-        Args:
-            es_connection: ES 连接实例,可选
-        """
-        self.es_conn = es_connection or ESConnection()
-        self.es = self.es_conn.get_client()
-    
-    def search(self, index_name: str, query: Dict[str, Any], size: int = 10, from_: int = 0, 
-               fields: List[str] = None, highlight: Dict[str, Any] = None) -> Dict[str, Any]:
-        """
-        搜索文档
-        
-        Args:
-            index_name: 索引名称
-            query: 查询条件
-            size: 返回结果数量
-            from_: 起始位置
-            fields: 要返回的字段列表,可选
-            highlight: 高亮配置,可选
-        
-        Returns:
-            Dict: 搜索结果
-        """
-        try:
-            body = {
-                "query": query,
-                "size": size,
-                "from": from_
-            }
-            
-            if fields:
-                body["_source"] = fields
-            
-            if highlight:
-                body["highlight"] = highlight
-            
-            result = self.es.search(index=index_name, body=body)
-            return result
-        except Exception as e:
-            print(f"搜索失败: {e}")
-            return {"hits": {"total": 0, "hits": []}}
-    
-    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
-                     size: int = 10, from_: int = 0, fields: List[str] = None, 
-                     text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
-        """
-        混合检索:向量相似度检索 + 全文检索
-        
-        Args:
-            index_name: 索引名称
-            text_query: 文本查询语句
-            vector_field: 向量字段名
-            vector: 检索向量
-            size: 返回结果数量
-            from_: 起始位置
-            fields: 要返回的字段列表,可选
-            text_weight: 文本检索权重
-            vector_weight: 向量检索权重
-        
-        Returns:
-            Dict: 搜索结果
-        """
-        try:
-            # 构建混合检索查询
-            query = {
-                "bool": {
-                    "should": [
-                        {
-                            "query_string": {
-                                "query": text_query,
-                                "default_operator": "OR",
-                                "boost": text_weight
-                            }
-                        },
-                        {
-                            "script_score": {
-                                "query": {
-                                    "match_all": {}
-                                },
-                                "script": {
-                                    "source": "cosineSimilarity(params.query_vector, doc[params.vector_field]) + 1.0",
-                                    "params": {
-                                        "query_vector": vector,
-                                        "vector_field": vector_field
-                                    }
-                                },
-                                "boost": vector_weight
-                            }
-                        }
-                    ]
-                }
-            }
-            
-            body = {
-                "query": query,
-                "size": size,
-                "from": from_
-            }
-            
-            if fields:
-                body["_source"] = fields
-            
-            result = self.es.search(index=index_name, body=body)
-            return result
-        except Exception as e:
-            print(f"混合检索失败: {e}")
-            return {"hits": {"total": 0, "hits": []}}
-    
-    def knn_search(self, index_name: str, vector_field: str, vector: List[float], 
-                  k: int = 10, filter_query: Dict[str, Any] = None) -> Dict[str, Any]:
-        """
-        向量相似度检索(k-NN)
-        
-        Args:
-            index_name: 索引名称
-            vector_field: 向量字段名
-            vector: 检索向量
-            k: 返回结果数量
-            filter_query: 过滤条件,可选
-        
-        Returns:
-            Dict: 搜索结果
-        """
-        try:
-            knn = {
-                "field": vector_field,
-                "query_vector": vector,
-                "k": k,
-                "num_candidates": k * 10
-            }
-            
-            if filter_query:
-                knn["filter"] = filter_query
-            
-            body = {
-                "knn": knn
-            }
-            
-            result = self.es.search(index=index_name, body=body)
-            return result
-        except Exception as e:
-            print(f"向量检索失败: {e}")
-            return {"hits": {"total": 0, "hits": []}}
-    
-    def match_search(self, index_name: str, field: str, value: str, size: int = 10, 
-                     fields: List[str] = None) -> Dict[str, Any]:
-        """
-        简单匹配搜索
-        
-        Args:
-            index_name: 索引名称
-            field: 字段名
-            value: 匹配值
-            size: 返回结果数量
-            fields: 要返回的字段列表,可选
-        
-        Returns:
-            Dict: 搜索结果
-        """
-        query = {
-            "match": {
-                field: value
-            }
-        }
-        return self.search(index_name, query, size=size, fields=fields)
-    
-    def match_all(self, index_name: str, size: int = 10, fields: List[str] = None) -> Dict[str, Any]:
-        """
-        匹配所有文档
-        
-        Args:
-            index_name: 索引名称
-            size: 返回结果数量
-            fields: 要返回的字段列表,可选
-        
-        Returns:
-            Dict: 搜索结果
-        """
-        query = {
-            "match_all": {}
-        }
-        return self.search(index_name, query, size=size, fields=fields)

+ 0 - 203
utils/es/templates.py

@@ -1,203 +0,0 @@
-"""
-Elasticsearch 动态模板映射
-"""
-from typing import Dict, Any
-
-
-def get_dynamic_templates() -> Dict[str, Any]:
-    """
-    获取动态模板映射配置
-    参考:d:/project/work/ragflow_plugs/book/es_dynamic.md
-    
-    Returns:
-        Dict[str, Any]: 动态模板映射配置
-    """
-    return {
-        "dynamic_templates": [
-            {
-                "int": {
-                    "match": "*_int",
-                    "mapping": {
-                        "store": True,
-                        "type": "integer"
-                    }
-                }
-            },
-            {
-                "ulong": {
-                    "match": "*_ulong",
-                    "mapping": {
-                        "store": True,
-                        "type": "unsigned_long"
-                    }
-                }
-            },
-            {
-                "long": {
-                    "match": "*_long",
-                    "mapping": {
-                        "store": True,
-                        "type": "long"
-                    }
-                }
-            },
-            {
-                "short": {
-                    "match": "*_short",
-                    "mapping": {
-                        "store": True,
-                        "type": "short"
-                    }
-                }
-            },
-            {
-                "numeric": {
-                    "match": "*_flt",
-                    "mapping": {
-                        "store": True,
-                        "type": "float"
-                    }
-                }
-            },
-            {
-                "tks": {
-                    "match": "*_tks",
-                    "mapping": {
-                        "analyzer": "whitespace",
-                        "similarity": "scripted_sim",
-                        "store": True,
-                        "type": "text"
-                    }
-                }
-            },
-            {
-                "ltks": {
-                    "match": "*_ltks",
-                    "mapping": {
-                        "analyzer": "whitespace",
-                        "store": True,
-                        "type": "text"
-                    }
-                }
-            },
-            {
-                "kwd": {
-                    "match": "^(.*_(kwd|id|ids|uid|uids)|uid)$",
-                    "match_pattern": "regex",
-                    "mapping": {
-                        "similarity": "boolean",
-                        "store": True,
-                        "type": "keyword"
-                    }
-                }
-            },
-            {
-                "dt": {
-                    "match": "^.*(_dt|_time|_at)$",
-                    "match_pattern": "regex",
-                    "mapping": {
-                        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM-dd_HH:mm:ss",
-                        "store": True,
-                        "type": "date"
-                    }
-                }
-            },
-            {
-                "nested": {
-                    "match": "*_nst",
-                    "mapping": {
-                        "type": "nested"
-                    }
-                }
-            },
-            {
-                "object": {
-                    "match": "*_obj",
-                    "mapping": {
-                        "dynamic": True,
-                        "type": "object"
-                    }
-                }
-            },
-            {
-                "string": {
-                    "match": "^.*_(with_weight|list)$",
-                    "match_pattern": "regex",
-                    "mapping": {
-                        "index": False,
-                        "store": True,
-                        "type": "text"
-                    }
-                }
-            },
-            {
-                "rank_feature": {
-                    "match": "*_fea",
-                    "mapping": {
-                        "type": "rank_feature"
-                    }
-                }
-            },
-            {
-                "rank_features": {
-                    "match": "*_feas",
-                    "mapping": {
-                        "type": "rank_features"
-                    }
-                }
-            },
-            {
-                "dense_vector_512": {
-                    "match": "*_512_vec",
-                    "mapping": {
-                        "dims": 512,
-                        "index": True,
-                        "similarity": "cosine",
-                        "type": "dense_vector"
-                    }
-                }
-            },
-            {
-                "dense_vector_768": {
-                    "match": "*_768_vec",
-                    "mapping": {
-                        "dims": 768,
-                        "index": True,
-                        "similarity": "cosine",
-                        "type": "dense_vector"
-                    }
-                }
-            },
-            {
-                "dense_vector_1024": {
-                    "match": "*_1024_vec",
-                    "mapping": {
-                        "dims": 1024,
-                        "index": True,
-                        "similarity": "cosine",
-                        "type": "dense_vector"
-                    }
-                }
-            },
-            {
-                "dense_vector_1536": {
-                    "match": "*_1536_vec",
-                    "mapping": {
-                        "dims": 1536,
-                        "index": True,
-                        "similarity": "cosine",
-                        "type": "dense_vector"
-                    }
-                }
-            },
-            {
-                "binary": {
-                    "match": "*_bin",
-                    "mapping": {
-                        "type": "binary"
-                    }
-                }
-            }
-        ],
-        "date_detection": True
-    }

+ 0 - 138
utils/es_conn.py

@@ -1,138 +0,0 @@
-"""
-Elasticsearch 连接管理器(向后兼容接口)
-
-该文件提供与旧版 es_conn.py 兼容的接口,同时内部使用新的工程化模块。
-"""
-import re
-import json
-import time
-from typing import Any, List, Dict, Optional, Union
-from elasticsearch import Elasticsearch, helpers
-from elasticsearch.helpers import bulk, BulkIndexError
-from elastic_transport import ConnectionTimeout
-from elasticsearch.exceptions import NotFoundError
-
-from services.utils.es.base import ESConnection as _ESConnection
-from services.utils.es.index import IndexManager
-from services.utils.es.document import DocumentManager
-from services.utils.es.search import SearchManager
-
-# 单例装饰器
-class singleton:
-    def __init__(self, cls):
-        self.cls = cls
-        self._instance = None
-    
-    def __call__(self, *args, **kwargs):
-        if self._instance is None:
-            self._instance = self.cls(*args, **kwargs)
-        return self._instance
-
-@singleton
-class ESConnection:
-    """
-    Elasticsearch 连接管理器(向后兼容)
-    支持:
-    - 单例模式
-    - 连接池管理
-    - CRUD操作
-    - 向量相似度检索 + 全文检索的混合检索
-    - 动态模板映射
-    """
-    
-    def __init__(self, hosts: List[str] = None, **kwargs):
-        """
-        初始化 Elasticsearch 连接
-        
-        Args:
-            hosts: Elasticsearch 主机列表,格式如 ["http://localhost:9200"]
-            **kwargs: 其他 Elasticsearch 客户端配置参数
-        """
-        # 使用新的 ESConnection 作为底层连接
-        self._es_conn = _ESConnection(hosts=hosts, **kwargs)
-        
-        # 初始化管理器
-        self.index_manager = IndexManager(self._es_conn)
-        self.document_manager = DocumentManager(self._es_conn)
-        self.search_manager = SearchManager(self._es_conn)
-        
-        # 向后兼容属性
-        self.es = self._es_conn.get_client()
-        self.dynamic_templates = self._es_conn.dynamic_templates
-    
-    def _get_dynamic_templates(self) -> Dict[str, Any]:
-        """
-        获取动态模板映射配置(向后兼容方法)
-        """
-        return self.dynamic_templates
-    
-    def create_index(self, index_name: str, mappings: Dict[str, Any] = None, settings: Dict[str, Any] = None) -> bool:
-        """
-        创建索引
-        """
-        return self.index_manager.create_index(index_name, mappings, settings)
-    
-    def insert(self, index_name: str, document: Dict[str, Any], id: str = None, refresh: bool = False) -> bool:
-        """
-        插入单个文档
-        """
-        return self.document_manager.insert(index_name, document, id, refresh)
-    
-    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]], refresh: bool = False) -> Dict[str, Any]:
-        """
-        批量插入文档
-        """
-        return self.document_manager.bulk_insert(index_name, documents, refresh)
-    
-    def update(self, index_name: str, id: str, update_body: Dict[str, Any], refresh: bool = False) -> bool:
-        """
-        更新文档
-        """
-        return self.document_manager.update(index_name, id, update_body, refresh)
-    
-    def delete(self, index_name: str, id: str, refresh: bool = False) -> bool:
-        """
-        删除文档
-        """
-        return self.document_manager.delete(index_name, id, refresh)
-    
-    def delete_by_query(self, index_name: str, query: Dict[str, Any], refresh: bool = False) -> Dict[str, Any]:
-        """
-        按查询条件删除文档
-        """
-        return self.document_manager.delete_by_query(index_name, query, refresh)
-    
-    def get(self, index_name: str, id: str, fields: List[str] = None) -> Optional[Dict[str, Any]]:
-        """
-        获取单个文档
-        """
-        return self.document_manager.get(index_name, id, fields)
-    
-    def search(self, index_name: str, query: Dict[str, Any], size: int = 10, from_: int = 0, 
-               fields: List[str] = None, highlight: Dict[str, Any] = None) -> Dict[str, Any]:
-        """
-        搜索文档
-        """
-        return self.search_manager.search(index_name, query, size, from_, fields, highlight)
-    
-    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
-                     size: int = 10, from_: int = 0, fields: List[str] = None, 
-                     text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
-        """
-        混合检索:向量相似度检索 + 全文检索
-        """
-        return self.search_manager.hybrid_search(index_name, text_query, vector_field, vector, 
-                                               size, from_, fields, text_weight, vector_weight)
-    
-    def knn_search(self, index_name: str, vector_field: str, vector: List[float], 
-                  k: int = 10, filter_query: Dict[str, Any] = None) -> Dict[str, Any]:
-        """
-        向量相似度检索(k-NN)
-        """
-        return self.search_manager.knn_search(index_name, vector_field, vector, k, filter_query)
-    
-    def close(self):
-        """
-        关闭 Elasticsearch 连接
-        """
-        self._es_conn.close()

BIN
utils/file/__pycache__/file_utils.cpython-312.pyc


BIN
utils/file/__pycache__/image_util.cpython-312.pyc


+ 0 - 0
utils/minio/file_utils.py → utils/file/file_utils.py


+ 29 - 4
utils/minio/image_util.py → utils/file/image_util.py

@@ -2,15 +2,14 @@
 """
 图片处理工具类
 """
-
 import os
 import zipfile
 import re
-from typing import List, Dict, Any
+from typing import List
 from io import BytesIO
 from PIL import Image
-from utils.minio.minio_util import MinIOUtil
-from utils.minio.file_utils import generate_unique_filename
+from utils.file.minio.minio_util import MinIOUtil
+from utils.file.file_utils import generate_unique_filename
 
 
 class ImageUtil:
@@ -19,6 +18,32 @@ class ImageUtil:
     def __init__(self):
         """初始化图片处理工具类"""
         self.minio_util = MinIOUtil()
+
+    # 将图片url转换为Image对象
+    def _url_to_image(self, image_url: str) -> Image.Image:
+        """
+        将图片url转换为Image对象
+        
+        Args:
+            image_url: 图片url
+            
+        Returns:
+            Image.Image: 图片对象
+        """
+        import requests
+
+        # 处理image_url为image: Image.Image
+        if isinstance(image_url, str):
+            # 下载图片
+            response = requests.get(image_url)
+            response.raise_for_status()  # 检查HTTP状态码
+    
+            # 将响应内容转换为字节流
+            image_bytes = BytesIO(response.content)
+    
+            # 创建Image对象
+            image = Image.open(image_bytes)
+            return image
     
     def process_image_zip(self, zip_file_path: str, book_name: str) -> List[str]:
         """

+ 0 - 0
utils/minio/__init__.py → utils/file/minio/__init__.py


+ 0 - 0
utils/minio/__pycache__/__init__.cpython-312.pyc → utils/file/minio/__pycache__/__init__.cpython-312.pyc


+ 0 - 0
utils/minio/__pycache__/file_utils.cpython-312.pyc → utils/file/minio/__pycache__/file_utils.cpython-312.pyc


+ 0 - 0
utils/minio/__pycache__/image_util.cpython-312.pyc → utils/file/minio/__pycache__/image_util.cpython-312.pyc


BIN
utils/minio/__pycache__/minio_util.cpython-312.pyc → utils/file/minio/__pycache__/minio_util.cpython-312.pyc


+ 1 - 1
utils/minio/minio_util.py → utils/file/minio/minio_util.py

@@ -2,7 +2,7 @@ from minio import Minio
 from typing import BinaryIO
 from datetime import timedelta
 from conf.config import MinioConfig
-from .file_utils import generate_unique_filename
+from utils.file.file_utils import generate_unique_filename
 
 class MinIOUtil:
     def __init__(self):

+ 0 - 30
utils/infinity/__init__.py

@@ -11,33 +11,3 @@ __all__ = [
     'get_client',
     'close_client'
 ]
-
-# 使用示例
-"""
-# 示例1:基本使用
-client = InfinityClient()
-databases = client.get_databases()
-print(f"Databases: {databases}")
-
-# 示例2:使用连接池上下文
-with client.get_connection() as conn:
-    tables = conn.get_tables()
-    print(f"Tables: {tables}")
-
-# 示例3:使用全局客户端
-from utils.infinity import get_client
-client = get_client()
-databases = client.get_databases()
-print(f"Databases from global client: {databases}")
-
-# 示例4:混合检索
-result = client.hybrid_search(
-    table_name="my_table",
-    vector_field="vector",
-    query_vector=[0.1, 0.2, 0.3],
-    text_query="test",
-    text_field="content",
-    topn=5
-)
-print(f"Hybrid search result: {result}")
-"""

BIN
utils/infinity/__pycache__/__init__.cpython-312.pyc


BIN
utils/infinity/__pycache__/client.cpython-312.pyc


+ 3 - 3
utils/infinity/client.py

@@ -191,7 +191,7 @@ class InfinityClient:
             # 获取Table对象
             table = self._get_table(conn, table_name, database_name)
             # 获取结果集
-            return table.output(output_fields).match_text(query["field"], query["query"], query["topn"])
+            return table.output(output_fields).match_text(query["match_field"], query["matching_text"], query["topn"])
     
     def hybrid_search(
         self,
@@ -207,7 +207,7 @@ class InfinityClient:
             # 获取结果集
             return table.output(output_fields) \
                 .match_dense(query["vector_field"], query["query_vector"], "float", "cosine", query["topn"]) \
-                .match_text(query["field"], query["query"], query["topn"]) \
+                .match_text(query["match_field"], query["matching_text"], query["topn"]) \
                 .fusion("rrf", query["topn"])
     
     def vector_search(
@@ -239,7 +239,7 @@ _client_lock = threading.Lock()
 
 def get_client(
     host: str = VectorDBConfig.get_infinity_host(),
-    port: str = VectorDBConfig.get_infinity_port(),
+    port: str = VectorDBConfig.get_infinity_sdk_port(),
     database: str = VectorDBConfig.get_infinity_database(),
     min_connections: int = 2,
     max_connections: int = 10

+ 0 - 382
utils/infinity_util/__init__.py

@@ -1,382 +0,0 @@
-"""
-Infinity向量数据库主类
-基于官方Infinity Python SDK实现
-"""
-from typing import List, Dict, Any, Optional
-import json
-
-class InfinityVectorDB:
-    """
-    Infinity向量数据库主类
-    提供统一的接口,整合索引、文档和搜索功能
-    """
-    
-    def __init__(self):
-        """
-        初始化Infinity向量数据库
-        使用HTTP API实现,不依赖官方SDK
-        """
-        from conf.config import VectorDBConfig
-        from utils.http_client import HTTPClient
-        import base64
-        
-        # 获取配置
-        self.host = VectorDBConfig.get_infinity_host()
-        self.port = VectorDBConfig.get_infinity_port()
-        self.user = VectorDBConfig.get_infinity_user()
-        self.password = VectorDBConfig.get_infinity_password()
-        self.database = VectorDBConfig.get_infinity_database()
-        self.headers = {
-                "Accept": "application/json",
-                "Content-Type": "application/json"
-            }
-        
-        # 生成Basic Auth令牌
-        auth_str = f"{self.user}:{self.password}"
-        auth_token = base64.b64encode(auth_str.encode()).decode()
-        
-        # 初始化HTTP客户端
-        self.base_url = f"http://{self.host}:{self.port}"
-        self.http_client = HTTPClient(
-            base_url=self.base_url,
-            api_key=auth_token,
-            auth_type='basic'
-        )
-    
-    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
-        """创建索引"""
-        try:
-            # 使用Infinity官方HTTP API创建表(对应索引)
-            path = f"/databases/{self.database}/tables/{index_name}"
-            
-            # 定义表字段
-            with open("conf/infinity_mapping.json", "r", encoding="utf-8") as f:
-                fields = json.load(f)
-            
-            data = {
-                "create_option": "ignore_if_exists",
-                "fields": fields
-            }
-        
-            response = self.http_client.post(path, json_data=data, headers=self.headers)
-            return response.get("error_code") == 0
-        except Exception as e:
-            print(f"Failed to create index: {str(e)}")
-            return False
-    
-    def delete_index(self, index_name: str) -> bool:
-        """删除索引"""
-        try:
-            # 使用Infinity官方HTTP API删除表(对应索引)
-            path = f"/databases/{self.database}/tables/{index_name}"
-            
-            data = {
-                "drop_option": "ignore_if_not_exists"
-            }
-            
-            response = self.http_client.delete(path, json_data=data, headers=self.headers)
-            return response.get("error_code") == 0
-        except Exception as e:
-            print(f"Failed to delete index: {str(e)}")
-            return False
-    
-    def index_exists(self, index_name: str) -> bool:
-        """检查索引是否存在"""
-        try:
-            # 使用Infinity官方HTTP API获取表列表
-            path = f"/databases/{self.database}/tables"
-            response = self.http_client.get(path, headers=self.headers)
-            
-            if response.get("error_code") == 0:
-                tables = response.get("tables", [])
-                return index_name in tables
-            return False
-        except Exception as e:
-            print(f"Failed to check index existence: {str(e)}")
-            return False
-    
-    def insert_document(self, index_name: str, document: Dict[str, Any], id: str = None) -> bool:
-        """插入单个文档"""
-        try:
-            # 使用Infinity官方HTTP API插入单行数据
-            path = f"/databases/{self.database}/tables/{index_name}/docs"
-            
-            # 如果提供了id,将其添加到文档中
-            if id:
-                document["id"] = id
-            
-            data = [document]
-            response = self.http_client.post(path, json_data=data, headers=self.headers)
-            return response.get("error_code") == 0
-        except Exception as e:
-            print(f"Failed to insert document: {str(e)}")
-            return False
-    
-    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """批量插入文档"""
-        try:
-            # 使用Infinity官方HTTP API批量插入数据
-            path = f"/databases/{self.database}/tables/{index_name}/docs"
-            
-            data = documents
-            response = self.http_client.post(path, json_data=data, headers=self.headers)
-            
-            if response.get("error_code") == 0:
-                return {
-                    "success": True,
-                    "inserted": len(documents)
-                }
-            else:
-                return {
-                    "success": False,
-                    "error": response.get("error_msg", "Unknown error"),
-                    "inserted": 0
-                }
-        except Exception as e:
-            print(f"Failed to bulk insert documents: {str(e)}")
-            return {
-                "success": False,
-                "error": str(e),
-                "inserted": 0
-            }
-    
-    def update_document(self, index_name: str, document_id: str, document: Dict[str, Any]) -> bool:
-        """更新单个文档"""
-        try:
-            # 使用Infinity官方HTTP API更新行
-            path = f"/databases/{self.database}/tables/{index_name}/rows"
-            
-            data = {
-                "update_by": {
-                    "column": "id",
-                    "value": document_id
-                },
-                "update_data": document
-            }
-            
-            response = self.http_client.put(path, json_data=data, headers=self.headers)
-            return response.get("error_code") == 0
-        except Exception as e:
-            print(f"Failed to update document: {str(e)}")
-            return False
-    
-    def delete_document(self, index_name: str, document_id: str) -> bool:
-        """删除单个文档"""
-        try:
-            # 使用Infinity官方HTTP API删除行
-            path = f"/databases/{self.database}/tables/{index_name}/rows"
-            
-            data = {
-                "delete_by": {
-                    "column": "id",
-                    "value": document_id
-                }
-            }
-            
-            response = self.http_client.delete(path, json_data=data, headers=self.headers)
-            return response.get("error_code") == 0
-        except Exception as e:
-            print(f"Failed to delete document: {str(e)}")
-            return False
-    
-    def get_document(self, index_name: str, document_id: str) -> Optional[Dict[str, Any]]:
-        """获取单个文档"""
-        try:
-            # 使用Infinity官方HTTP API查询单行数据
-            path = f"/databases/{self.database}/tables/{index_name}/query"
-            
-            data = {
-                "filter": {
-                    "column": "id",
-                    "operator": "=",
-                    "value": document_id
-                },
-                "limit": 1
-            }
-            
-            response = self.http_client.post(path, json_data=data, headers=self.headers)
-            
-            if response.get("error_code") == 0:
-                rows = response.get("rows", [])
-                if rows:
-                    return rows[0]
-            return None
-        except Exception as e:
-            print(f"Failed to get document: {str(e)}")
-            return None
-    
-    def delete_by_query(self, index_name: str, query: Dict[str, Any]) -> Dict[str, Any]:
-        """按查询条件删除文档"""
-        try:
-            # 使用Infinity官方HTTP API按条件删除行
-            path = f"/databases/{self.database}/tables/{index_name}/rows"
-            
-            # 构建删除条件
-            # 这里假设query是一个简单的字典,如{"dataset_id": "xxx"}
-            filter_conditions = []
-            for key, value in query.items():
-                filter_conditions.append({
-                    "column": key,
-                    "operator": "=",
-                    "value": value
-                })
-            
-            data = {
-                "delete_by": {
-                    "and": filter_conditions
-                }
-            }
-            
-            response = self.http_client.delete(path, json_data=data, headers=self.headers)
-            
-            if response.get("error_code") == 0:
-                return {"success": True}
-            else:
-                return {"success": False, "error": response.get("error_msg", "Unknown error")}
-        except Exception as e:
-            print(f"Failed to delete by query: {str(e)}")
-            return {"success": False, "error": str(e)}
-    
-    def search(self, index_name: str, query: Dict[str, Any], size: int = 10) -> Dict[str, Any]:
-        """搜索文档"""
-        try:
-            # 使用Infinity官方HTTP API查询数据
-            path = f"/databases/{self.database}/tables/{index_name}/docs"
-            
-            data = {
-                "filter": query,
-                "limit": size
-            }
-            
-            response = self.http_client.post(path, json_data=data, headers=self.headers)
-            
-            if response.get("error_code") == 0:
-                rows = response.get("output", [])
-                return {
-                    "output": rows,
-                }
-            else:
-                return {"hits": [], "total": 0, "error": response.get("error_msg", "Unknown error")}
-        except Exception as e:
-            print(f"Failed to search: {str(e)}")
-            return {"output": [], "error": str(e)}
-    
-    def vector_search(self, index_name: str, vector_field: str, vector: List[float], size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
-        """向量检索"""
-        try:
-            # 使用Infinity官方HTTP API进行向量检索
-            path = f"/databases/{self.database}/tables/{index_name}/docs"
-            
-            data = {
-                "vector_field": vector_field,
-                "vector": vector,
-                "limit": size
-            }
-            
-            if filter:
-                data["filter"] = filter
-            
-            response = self.http_client.post(path, json_data=data, headers=self.headers)
-            
-            if response.get("error_code") == 0:
-                rows = response.get("output", [])
-                return {
-                    "hits": rows,
-                    "total": len(rows)
-                }
-            else:
-                return {"hits": [], "total": 0, "error": response.get("error_msg", "Unknown error")}
-        except Exception as e:
-            print(f"Failed to vector search: {str(e)}")
-            return {"hits": [], "total": 0, "error": str(e)}
-    
-    def hybrid_search(self, index_name: str, match_method: str, vector_field: str, query_vector: List[float], element_type: str,
-                     metric_type: str = "cosine", topn: int = 3, rank_constant: int = 60,
-                     text_query: str = "", text_field: str = "file_name"
-                     ) -> Dict[str, Any]:
-        """混合检索"""
-        try:
-            # 使用Infinity官方HTTP API进行混合检索
-            path = f"/databases/{self.database}/tables/{index_name}/docs"
-            
-            # 构建搜索配置列表
-            search_config = [
-                {
-                    "match_method": match_method,
-                    "fields": vector_field,
-                    "query_vector": query_vector,
-                    "element_type": element_type,
-                    "metric_type": metric_type,
-                    "topn": topn,
-                    "params": {
-                        "ef": "10"
-                    }
-                }
-            ]
-            
-            # 只有当text_query和text_field都不为空时,才添加文本搜索配置
-            # if text_query and text_field:
-            #     search_config.append(
-            #         {
-            #             "match_method": "text",
-            #             "fields": text_field,
-            #             "matching_text": text_query,
-            #             "topn": 1,
-            #             "params":
-            #             {
-            #                 "default_fields": text_field,
-            #                 "operator": "or"
-            #             }
-            #         }
-            #     )
-            
-            # 添加融合方法配置
-            # if vector_field and vector and text_query and text_field:
-            #     search_config.append(
-            #         {
-            #             "fusion_method": "rrf",
-            #             "topn": topn,
-            #             "params":{"rank_constant": rank_constant}
-            #         }
-            #     )
-          
-            data = {
-                "output": [
-                    "file_name",
-                    "page_number",
-                    "content",
-                    "image_path",
-                    "dataset_id",
-                    "document_id",
-                    "_similarity"
-                ],
-                "search": search_config
-            }
-            
-            response = self.http_client.get_json(path, json_data=data, headers=self.headers)
-            
-            if response["error_code"] == 0:
-                rows = response["output"]
-                # 将列表的列表转换为字典列表
-                output_fields = ["file_name", "page_number", "content", "image_path", "dataset_id", "document_id", "_similarity"]
-                formatted_rows = []
-                for row in rows:
-                    # 创建字典,将每个字段名与对应的值匹配
-                    formatted_row = {}
-                    for i, field in enumerate(output_fields):
-                        if i < len(row):
-                            # 处理字段值,确保是字典类型
-                            if isinstance(row[i], dict):
-                                formatted_row.update(row[i])
-                            else:
-                                formatted_row[field] = row[i]
-                    formatted_rows.append(formatted_row)
-                return {
-                    "output": formatted_rows,
-                    "total": len(formatted_rows)
-                }
-            else:
-                return {"output": [], "total": 0, "error": response["error_msg"]}
-        except Exception as e:
-            print(f"Failed to hybrid search: {str(e)}")
-            return {"output": [], "total": 0, "error": str(e)}

BIN
utils/infinity_util/__pycache__/__init__.cpython-312.pyc


BIN
utils/infinity_util/__pycache__/base.cpython-312.pyc


BIN
utils/infinity_util/__pycache__/document.cpython-312.pyc


BIN
utils/infinity_util/__pycache__/index.cpython-312.pyc


BIN
utils/infinity_util/__pycache__/search.cpython-312.pyc


+ 0 - 168
utils/vector_db.py

@@ -1,168 +0,0 @@
-"""
-向量数据库工厂类
-支持动态切换Elasticsearch和Infinity向量数据库
-"""
-from typing import Any, List, Dict, Optional
-from conf.config import VectorDBConfig
-from utils.es import ESConnection as ElasticsearchConnection
-
-
-class VectorDBFactory:
-    """
-    向量数据库工厂类
-    根据配置创建不同类型的向量数据库连接
-    """
-    
-    @staticmethod
-    def get_vector_db():
-        """
-        获取向量数据库实例
-        
-        Returns:
-            VectorDBBase: 向量数据库实例
-        """
-        vector_db_type = VectorDBConfig.get_vector_db_type().lower()
-        
-        if vector_db_type == "es":
-            return ElasticsearchVectorDB()
-        elif vector_db_type == "infinity":
-            return InfinityVectorDB()
-        else:
-            raise ValueError(f"不支持的向量数据库类型: {vector_db_type}")
-
-
-class VectorDBBase:
-    """
-    向量数据库基类
-    定义了向量数据库应该实现的接口
-    """
-    
-    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
-        """创建索引"""
-        raise NotImplementedError()
-    
-    def insert_document(self, index_name: str, document: Dict[str, Any], id: str = None) -> bool:
-        """插入单个文档"""
-        raise NotImplementedError()
-    
-    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """批量插入文档"""
-        raise NotImplementedError()
-    
-    def search(self, index_name: str, query: Dict[str, Any], size: int = 10) -> Dict[str, Any]:
-        """搜索文档"""
-        raise NotImplementedError()
-    
-    def vector_search(self, index_name: str, vector_field: str, vector: List[float], size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
-        """向量检索"""
-        raise NotImplementedError()
-    
-    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
-                     size: int = 10, text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
-        """混合检索"""
-        raise NotImplementedError()
-    
-    def close(self):
-        """关闭连接"""
-        raise NotImplementedError()
-
-
-class ElasticsearchVectorDB(VectorDBBase):
-    """
-    Elasticsearch向量数据库实现
-    """
-    
-    def __init__(self):
-        """初始化Elasticsearch向量数据库"""
-        self.es_conn = ElasticsearchConnection()
-        
-    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
-        """创建索引"""
-        from utils.es.index import IndexManager
-        index_manager = IndexManager(self.es_conn)
-        return index_manager.create_index(index_name, mappings)
-    
-    def insert_document(self, index_name: str, document: Dict[str, Any], id: str = None) -> bool:
-        """插入单个文档"""
-        from utils.es.document import DocumentManager
-        doc_manager = DocumentManager(self.es_conn)
-        return doc_manager.insert(index_name, document, id)
-    
-    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """批量插入文档"""
-        from services.utils.es.document import DocumentManager
-        doc_manager = DocumentManager(self.es_conn)
-        return doc_manager.bulk_insert(index_name, documents)
-    
-    def search(self, index_name: str, query: Dict[str, Any], size: int = 10) -> Dict[str, Any]:
-        """搜索文档"""
-        from utils.es.search import SearchManager
-        search_manager = SearchManager(self.es_conn)
-        return search_manager.search(index_name, query, size=size)
-    
-    def vector_search(self, index_name: str, vector_field: str, vector: List[float], size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
-        """向量检索"""
-        from services.utils.es.search import SearchManager
-        search_manager = SearchManager(self.es_conn)
-        return search_manager.knn_search(index_name, vector_field, vector, size, filter)
-    
-    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
-                     size: int = 10, text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
-        """混合检索"""
-        from services.utils.es.search import SearchManager
-        search_manager = SearchManager(self.es_conn)
-        return search_manager.hybrid_search(index_name, text_query, vector_field, vector, size, text_weight=text_weight, vector_weight=vector_weight)
-    
-    def close(self):
-        """关闭连接"""
-        self.es_conn.close()
-
-
-class InfinityVectorDB(VectorDBBase):
-    """
-    Infinity向量数据库实现
-    支持infinity向量数据库的具体实现,包含PDF元数据入库
-    """
-    
-    def __init__(self):
-        """初始化Infinity向量数据库"""
-        from utils.infinity_util import InfinityVectorDB as _InfinityVectorDB
-        from conf.config import VectorDBConfig
-        
-        # 获取Infinity配置
-        host = VectorDBConfig.get_infinity_host()
-        port = VectorDBConfig.get_infinity_port()
-        user = VectorDBConfig.get_infinity_user()
-        password = VectorDBConfig.get_infinity_password()
-        
-        # 初始化新的InfinityVectorDB实例
-        self._infinity_db = _InfinityVectorDB()
-    
-    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
-        """创建索引"""
-        return self._infinity_db.create_index(index_name, mappings)
-    
-    def insert_document(self, index_name: str, document: Dict[str, Any], id: str = None) -> bool:
-        """插入单个文档"""
-        return self._infinity_db.insert_document(index_name, document, id)
-    
-    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """批量插入文档"""
-        return self._infinity_db.bulk_insert(index_name, documents)
-    
-    def search(self, index_name: str, query: Dict[str, Any], size: int = 10) -> Dict[str, Any]:
-        """搜索文档"""
-        return self._infinity_db.search(index_name, query, size)
-    
-    def vector_search(self, index_name: str, vector_field: str, vector: List[float], size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
-        """向量检索"""
-        return self._infinity_db.vector_search(index_name, vector_field, vector, size, filter)
-    
-    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
-                     size: int = 10, text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
-        """混合检索"""
-        return self._infinity_db.hybrid_search(index_name, text_query, vector_field, vector, size, text_weight, vector_weight)
-    
-    def close(self):
-        """关闭连接"""
-        self._infinity_db.close()

+ 0 - 555
vector_search_result.md

@@ -1,555 +0,0 @@
-## 向量检索结果
-
-### 检索结果 1
-**file_name**: 出发!超级播种机.pdf
-**page_number**: 9
-**content**: {
-    "page_meta": {
-        "page_number": 1,
-        "content_text": "“我们快点儿开始播种吧!”点点已经等不及了。“等等,我们都去拿工具,谁来照看种子呢?”甜甜问。“我!我来!”“粉宝,这项重要的任务就交给你啦!”点点说,“没有种子,就没有植物!”粉宝守着种子唱起了歌。“我们是超级播种机!我们让莉莉兰开满鲜花……”一缕微风吹过,粉宝紧张得屏住了呼吸……“嘿!别走……”",
-        "overall_style": {
-            "art_medium": "手绘水彩",
-            "color_palette": ["薄荷绿", "泥土棕"],
-            "lighting": "柔和侧光",
-            "composition": "对角线构图"
-        }
-    },
-    "elements": [
-        {
-            "element_name": "粉宝",
-            "character_name": "粉宝",
-            "category": "角色",
-            "spatial_layer": "前景",
-            "visual_attributes": {
-                "appearance": "粉色分节身体,戴蓝色小帽,大眼睛",
-                "action_emotion": "兴奋地喊“我!我来!”,紧张屏息",
-                "color_detail": "珊瑚粉、天蓝色",
-                "ability_tag": "自我认知"
-            },
-            "content_tags": {
-                "theme": ["自然", "生活常识"],
-                "object": ["动物", "植物", "种子"],
-                "emotion": ["快乐", "紧张"]
-            },
-            "ability_tags": ["语言表达", "自我认知", "自然观察"],
-            "description": "粉宝穿着粉色分节身体,戴蓝色小帽,兴奋喊话,继而紧张屏息守护种子。"
-        },
-        {
-            "element_name": "点点",
-            "character_name": "点点",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "戴护目镜的红色瓢虫,黑褐色斑点",
-                "action_emotion": "急切地催促播种",
-                "color_detail": "橙红、深褐",
-                "ability_tag": "语言表达"
-            },
-            "content_tags": {
-                "theme": ["自然", "生活常识"],
-                "object": ["动物", "种子"],
-                "emotion": ["急切"]
-            },
-            "ability_tags": ["语言表达"],
-            "description": "戴护目镜的红色瓢虫点点,急切催促大家播种,加速任务进程。"
-        },
-        {
-            "element_name": "甜甜",
-            "character_name": "甜甜",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "蓝色甲壳,棕色螺旋贝壳",
-                "action_emotion": "担忧地询问",
-                "color_detail": "钴蓝、土黄",
-                "ability_tag": "社会交往"
-            },
-            "content_tags": {
-                "theme": ["自然", "生活常识"],
-                "object": ["动物", "种子"],
-                "emotion": ["担忧"]
-            },
-            "ability_tags": ["社会交往"],
-            "description": "蓝色甲壳蜗牛甜甜,担忧询问谁来守护种子,体现责任感。"
-        },
-        {
-            "element_name": "蜜蜂",
-            "character_name": "",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "黄黑条纹,透明翅膀",
-                "action_emotion": "好奇地观察",
-                "color_detail": "亮黄、深褐",
-                "ability_tag": "自然观察"
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["动物", "植物"],
-                "emotion": ["好奇"]
-            },
-            "ability_tags": ["自然观察"],
-            "description": "黄黑条纹蜜蜂,透明翅膀,好奇观察种子和同伴们的活动。"
-        },
-        {
-            "element_name": "蒲公英",
-            "character_name": "",
-            "category": "道具",
-            "spatial_layer": "前景/中景",
-            "visual_attributes": {
-                "appearance": "白色绒球,纤细毛细",
-                "action_emotion": "随风飘散",
-                "color_detail": "纯白",
-                "ability_tag": "自然观察"
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["植物"],
-                "emotion": ["平静"]
-            },
-            "ability_tags": ["自然观察"],
-            "description": "白色蒲公英绒球,纤细毛细随风飘散,象征种子传播和自然循环。"
-        },
-        {
-            "element_name": "土壤",
-            "character_name": "",
-            "category": "环境",
-            "spatial_layer": "前景",
-            "visual_attributes": {
-                "appearance": "棕色颗粒,点缀小石子",
-                "action_emotion": "静默的承载",
-                "color_detail": "深土棕",
-                "ability_tag": "自然观察"
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["植物", "土壤"],
-                "emotion": ["平静"]
-            },
-            "ability_tags": ["自然观察"],
-            "description": "棕色颗粒土壤,点缀着小石子,静默承载着生命的种子和根系。"
-        }
-    ]
-}
-**image_path**: http://192.168.16.134:9000/bookpage/4dd1f389-4f61-478a-a4c2-a8c18c6f71c9.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ck7I8Esssx6rzZrXQ5uP%2F20260109%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20260109T074321Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=7ea572f9b4e8d83061ac0475f5f9593cfca1abc26a5e60f56cebd3dcf868d6d6
-**dataset_id**: a0f1aa03ed2c11f08b8f0242c0a85002
-**document_id**: d629ed98ed2e11f09ac30242c0a85002
-**SIMILARITY**: 0.6338874697685242
-
-### 检索结果 2
-**file_name**: 出发!超级播种机.pdf
-**page_number**: 5
-**content**: {
-    "page_meta": {
-        "page_number": 1,
-        "content_text": "炎热的夏天就要过去了,虫虫们在花园里散步。突然,粉宝咯咯地笑了起来。‘什么事这么好笑?’‘哎呀,有毛茸茸的家伙在挠我的肚子!’",
-        "overall_style": {
-            "art_medium": "手绘水彩",
-            "color_palette": ["薄荷绿", "暖橙色"],
-            "lighting": "柔和侧光",
-            "composition": "大远景"
-        }
-    },
-    "elements": [
-        {
-            "element_name": "粉红色毛虫",
-            "character_name": "粉宝",
-            "category": "角色",
-            "spatial_layer": "前景",
-            "visual_attributes": {
-                "appearance": "粉橙色分节身体,头顶蓝色小帽,闭眼微笑",
-                "action_emotion": "蜷缩身体大笑,被蒲公英绒毛轻触",
-                "color_detail": "粉橙色分节身体,头顶蓝色小帽",
-                "ability_tag": "情绪管理"
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["昆虫", "植物"],
-                "emotion": ["快乐"]
-            },
-            "ability_tags": ["情绪管理", "自然观察"],
-            "description": "粉橙色毛虫被蒲公英绒毛轻触,闭眼大笑,展现快乐情绪。"
-        },
-        {
-            "element_name": "戴眼镜瓢虫",
-            "character_name": "",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "红黑斑点外壳,戴圆框眼镜",
-                "action_emotion": "飞舞询问,表情关切",
-                "color_detail": "红黑斑点外壳,透明镜片",
-                "ability_tag": "社会交往"
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["昆虫"],
-                "emotion": ["好奇"]
-            },
-            "ability_tags": ["社会交往", "语言表达"],
-            "description": "戴眼镜的瓢虫飞舞询问,表情关切,体现好奇与社交互动。"
-        },
-        {
-            "element_name": "蒲公英",
-            "character_name": "",
-            "category": "道具",
-            "spatial_layer": "前景",
-            "visual_attributes": {
-                "appearance": "白色绒毛球,棕色花托",
-                "action_emotion": "绒毛飘散,轻触毛虫",
-                "color_detail": "白色绒毛,棕色花托",
-                "ability_tag": ""
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["植物"],
-                "emotion": ["轻松"]
-            },
-            "ability_tags": ["自然观察"],
-            "description": "白色蒲公英绒毛飘散,轻触毛虫,促进自然观察。"
-        },
-        {
-            "element_name": "蝴蝶",
-            "character_name": "",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "橙黑翅膀,黑色触角",
-                "action_emotion": "飞行姿态优雅",
-                "color_detail": "橙黑翅膀,黑色触角",
-                "ability_tag": ""
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["昆虫"],
-                "emotion": ["平静"]
-            },
-            "ability_tags": ["自然观察"],
-            "description": "橙黑翅膀的蝴蝶优雅飞行,展现自然之美的观察点。"
-        },
-        {
-            "element_name": "蜗牛",
-            "character_name": "",
-            "category": "角色",
-            "spatial_layer": "背景",
-            "visual_attributes": {
-                "appearance": "蓝色身体,螺旋红壳",
-                "action_emotion": "静止观察",
-                "color_detail": "蓝色身体,红褐色螺旋壳",
-                "ability_tag": ""
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["昆虫"],
-                "emotion": ["安静"]
-            },
-            "ability_tags": ["自然观察"],
-            "description": "蓝色身体配红壳的蜗牛静止观察,促进安静的自然观察。"
-        }
-    ]
-}
-**image_path**: http://192.168.16.134:9000/bookpage/21a250a9-cdc0-4c18-a94e-3a82eba32720.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ck7I8Esssx6rzZrXQ5uP%2F20260109%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20260109T074314Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=a89a8f58335b6b3a6f2f6a399c17b387026bfa988ab5b23ee36abfca970e1718
-**dataset_id**: a0f1aa03ed2c11f08b8f0242c0a85002
-**document_id**: d629ed98ed2e11f09ac30242c0a85002
-**SIMILARITY**: 0.6216001510620117
-
-### 检索结果 3
-**file_name**: 出发!超级播种机.pdf
-**page_number**: 11
-**content**: ```json
-{
-    "page_meta": {
-        "page_number": 1,
-        "content_text": "粉宝追啊追啊……但一个都没追回来。过了一会儿,虫虫们回来了。“粉宝,这是怎么了?”“我们的种子去哪儿啦?”“对不起!”“都怪我!是我没照看好种子,让风吹走了。没有种子,我们就不能播种了。”粉宝难过极了。",
-        "overall_style": {
-            "art_medium": "手绘水彩",
-            "color_palette": ["柔粉", "橄榄绿"],
-            "lighting": "柔和自然光",
-            "composition": "分镜叙事构图"
-        }
-    },
-    "elements": [
-        {
-            "element_name": "粉宝",
-            "character_name": "粉宝",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "粉橙色条纹身体,戴蓝色小帽",
-                "action_emotion": "追捕种子,最终沮丧低头",
-                "color_detail": "粉橙色条纹,蓝色小帽",
-                "ability_tag": "自我认知"
-            },
-            "content_tags": {
-                "theme": ["自然", "生活常识"],
-                "object": ["动物", "种子"],
-                "emotion": ["勇敢", "难过"]
-            },
-            "ability_tags": ["自我认知", "情绪管理"],
-            "description": "粉宝戴蓝帽,粉橙条纹,追捕种子时勇敢,失败后难过,体现自我认知与情绪管理。"
-        },
-        {
-            "element_name": "小蜜蜂",
-            "character_name": "",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "黄黑条纹,透明翅膀",
-                "action_emotion": "关切俯视,询问情况",
-                "color_detail": "亮黄色,黑色条纹",
-                "ability_tag": "社会交往"
-            },
-            "content_tags": {
-                "theme": ["自然", "生活常识"],
-                "object": ["动物"],
-                "emotion": ["关心"]
-            },
-            "ability_tags": ["社会交往"],
-            "description": "黄黑条纹小蜜蜂,透明翅膀,关切俯视粉宝,展现关心与社会交往能力。"
-        }
-    ]
-}
-```
-**image_path**: http://192.168.16.134:9000/bookpage/1f3c64a3-ba0e-4fe9-823a-f3f5b443da0b.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ck7I8Esssx6rzZrXQ5uP%2F20260109%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20260109T074324Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=711c6dca918128b450f86e93c9ac294a3d7a819b580f8076ed8099f79e50df8e
-**dataset_id**: a0f1aa03ed2c11f08b8f0242c0a85002
-**document_id**: d629ed98ed2e11f09ac30242c0a85002
-**SIMILARITY**: 0.616936981678009
-
-### 检索结果 4
-**file_name**: 出发!超级播种机.pdf
-**page_number**: 8
-**content**: ```json
-{
-    "page_meta": {
-        "page_number": 1,
-        "content_text": "粉宝也努力想要抓住种子,但是……他总是抓不住。看,我们收集了这么多种子!卷卷开心极了。粉宝有点儿难过:我一个也没找到。瞧瞧你尾巴上是什么?粉宝,尾巴上的也算,你也找到啦!闪闪大声鼓励他。",
-        "overall_style": {
-            "art_medium": "手绘水彩",
-            "color_palette": ["薄荷绿", "暖橙色"],
-            "lighting": "柔和晨光",
-            "composition": "分镜式构图"
-        }
-    },
-    "elements": [
-        {
-            "element_name": "粉宝",
-            "character_name": "粉宝",
-            "category": "角色",
-            "spatial_layer": "前景",
-            "visual_attributes": {
-                "appearance": "粉红色分节身体,头顶小蓝帽,圆眼睛",
-                "action_emotion": "伸展身体试图抓种子,后显沮丧",
-                "color_detail": "珊瑚粉,橙色条纹",
-                "ability_tag": "自我认知"
-            },
-            "content_tags": {
-                "theme": ["自然", "社交"],
-                "object": ["动物", "植物"],
-                "emotion": ["快乐", "难过"]
-            },
-            "ability_tags": ["自我认知", "情绪管理"],
-            "description": "粉色毛毛虫戴蓝帽,努力抓种子却失败,表情从专注转为沮丧,体现挫折感与自我认知。"
-        },
-        {
-            "element_name": "卷卷",
-            "character_name": "卷卷",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "蓝色蜗牛壳,蓝身体,红色眼睛",
-                "action_emotion": "开心展示收集的种子",
-                "color_detail": "深蓝与橙红渐变壳",
-                "ability_tag": "社会交往"
-            },
-            "content_tags": {
-                "theme": ["自然", "社交"],
-                "object": ["动物", "植物"],
-                "emotion": ["开心", "友善"]
-            },
-            "ability_tags": ["社会交往", "情绪管理"],
-            "description": "蓝色蜗牛壳,兴奋展示收获,放大表达快乐情绪,促进角色间的社交互动。"
-        },
-        {
-            "element_name": "蜜蜂",
-            "character_name": "",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "黄黑相间条纹,透明翅膀",
-                "action_emotion": "助阵鼓励,专注好奇",
-                "color_detail": "亮黄色与黑色块",
-                "ability_tag": "社会交往"
-            },
-            "content_tags": {
-                "theme": ["自然", "社交"],
-                "object": ["动物", "植物"],
-                "emotion": ["好奇", "友善"]
-            },
-            "ability_tags": ["社会交往", "情绪管理"],
-            "description": "黄黑条纹蜜蜂,透明翅膀,眼神专注,微表情传达友善与鼓励,营造积极社交氛围。"
-        },
-        {
-            "element_name": "蒲公英种子",
-            "character_name": "",
-            "category": "道具",
-            "spatial_layer": "前景",
-            "visual_attributes": {
-                "appearance": "白色绒球状,轻盈飘散",
-                "action_emotion": "自然元素,暗示失败与转机",
-                "color_detail": "纯白色,轻盈如云",
-                "ability_tag": "自然观察"
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["植物"],
-                "emotion": ["惊喜", "希望"]
-            },
-            "ability_tags": ["自然观察", "逻辑思维"],
-            "description": "白色绒球状蒲公英种子,轻盈飘散,暗示自然界的微妙变化,引导孩子观察细节。"
-        },
-        {
-            "element_name": "草地与土壤",
-            "character_name": "",
-            "category": "场景",
-            "spatial_layer": "背景",
-            "visual_attributes": {
-                "appearance": "绿色草叶,棕色土壤,木头小径",
-                "action_emotion": "宁静自然环境",
-                "color_detail": "鲜绿与深棕",
-                "ability_tag": "自然观察"
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["植物", "土壤"],
-                "emotion": ["安静"]
-            },
-            "ability_tags": ["自然观察"],
-            "description": "柔软绿草与深棕土壤构成自然背景,营造安静观察与探索的氛围。"
-        }
-    ]
-}
-```
-**image_path**: http://192.168.16.134:9000/bookpage/16957b97-75f3-49d9-9ad9-56ceef63dc0e.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ck7I8Esssx6rzZrXQ5uP%2F20260109%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20260109T074319Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=9e89506af7a73d8551e0069dc9d7b9f21c0896784d6b44b17c59bdf1bb61f05a
-**dataset_id**: a0f1aa03ed2c11f08b8f0242c0a85002
-**document_id**: d629ed98ed2e11f09ac30242c0a85002
-**SIMILARITY**: 0.6116334795951843
-
-### 检索结果 5
-**file_name**: 出发!超级播种机.pdf
-**page_number**: 12
-**content**: ```json
-{
-    "page_meta": {
-        "page_number": 1,
-        "content_text": "“不是你的错,粉宝。”美美飞了过来,“这些种子本来就是要飞的,所以它们才会有降落伞和翅膀。”\n“当种子落到土壤里,它们会自己生根发芽。大自然才是真正的超级播种机。”\n点点也拿出一些种子,安慰虫虫们:“看!这些种子就没有翅膀,我们还有机会。”",
-        "overall_style": {
-            "art_medium": "手绘水彩",
-            "color_palette": ["薄荷绿", "暖橙色"],
-            "lighting": "柔和顶光",
-            "composition": "中心聚焦构图"
-        }
-    },
-    "elements": [
-        {
-            "element_name": "蝴蝶美美",
-            "character_name": "美美",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "蓝灰身体,橙色带白点翅膀",
-                "action_emotion": "飞向同伴,温柔安慰",
-                "color_detail": "橙色翅膀带深棕斑点",
-                "ability_tag": "情绪管理"
-            },
-            "content_tags": {
-                "theme": ["自然", "社交"],
-                "object": ["昆虫", "植物"],
-                "emotion": ["友爱", "安静"]
-            },
-            "ability_tags": ["情绪管理", "语言表达"],
-            "description": "蓝灰身子橙翼蝴蝶温柔飞来安慰伙伴,教会孩子理解自然规律。"
-        },
-        {
-            "element_name": "毛毛虫粉宝",
-            "character_name": "粉宝",
-            "category": "角色",
-            "spatial_layer": "前景",
-            "visual_attributes": {
-                "appearance": "粉红色条纹身躯,戴深蓝小帽",
-                "action_emotion": "蜷缩在土中,略带委屈",
-                "color_detail": "粉红条纹带细微渐变",
-                "ability_tag": "自我认知"
-            },
-            "content_tags": {
-                "theme": ["自然", "生活常识"],
-                "object": ["昆虫", "土壤"],
-                "emotion": ["委屈", "好奇"]
-            },
-            "ability_tags": ["自我认知", "自然观察"],
-            "description": "戴小帽的粉红毛毛虫蜷缩在土里,正在学习面对失败。"
-        },
-        {
-            "element_name": "蜜蜂", 
-            "character_name": "",
-            "category": "角色",
-            "spatial_layer": "中景",
-            "visual_attributes": {
-                "appearance": "黄黑相间条纹,透明蓝翅",
-                "action_emotion": "温柔安抚毛毛虫",
-                "color_detail": "黄色带黑色条纹",
-                "ability_tag": "社会交往"
-            },
-            "content_tags": {
-                "theme": ["自然", "社交"],
-                "object": ["昆虫"],
-                "emotion": ["友爱", "鼓励"]
-            },
-            "ability_tags": ["社会交往"],
-            "description": "黄黑条纹蜜蜂温柔互动,传递自然友善的温暖氛围。"
-        },
-        {
-            "element_name": "点点",
-            "character_name": "点点",
-            "category": "角色",
-            "spatial_layer": "前景",
-            "visual_attributes": {
-                "appearance": "棕色斗笠,大圆眼镜",
-                "action_emotion": "展示种子,自信鼓励",
-                "color_detail": "棕斗笠配深棕斑点",
-                "ability_tag": "逻辑思维"
-            },
-            "content_tags": {
-                "theme": ["自然", "科学科普"],
-                "object": ["种子", "工具"],
-                "emotion": ["乐观", "惊喜"]
-            },
-            "ability_tags": ["逻辑思维", "自然观察"],
-            "description": "戴斗笠戴眼镜的昆虫自信展示种子,引导孩子探索自然规律。"
-        },
-        {
-            "element_name": "环境",
-            "character_name": "",
-            "category": "场景",
-            "spatial_layer": "背景",
-            "visual_attributes": {
-                "appearance": "绿叶草丛,木质树干",
-                "action_emotion": "宁静自然",
-                "color_detail": "翠绿色与棕色土壤",
-                "ability_tag": "自然观察"
-            },
-            "content_tags": {
-                "theme": ["自然"],
-                "object": ["植物", "土壤"],
-                "emotion": ["宁静", "好奇"]
-            },
-            "ability_tags": ["自然观察"],
-            "description": "绿叶环绕的宁静泥土环境,细节丰富,激发孩子探索兴趣。"
-        }
-    ]
-}
-```
-**image_path**: http://192.168.16.134:9000/bookpage/9bf88a20-d970-4831-950c-c17768af8c82.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ck7I8Esssx6rzZrXQ5uP%2F20260109%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20260109T074326Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=5fbb5efa41091c39669a2a801081dbd13735d82c9921d9302f84ee077ce10591
-**dataset_id**: a0f1aa03ed2c11f08b8f0242c0a85002
-**document_id**: d629ed98ed2e11f09ac30242c0a85002
-**SIMILARITY**: 0.6091482043266296
-

+ 3 - 3
workflow/image_parsing_workflow.py

@@ -17,10 +17,10 @@ from typing import List, Dict, Any, Annotated
 from pydantic import BaseModel, Field, ConfigDict
 from model.qwen_vl import QWenVLParser
 from utils.ragflow.ragflow_service import RAGFlowService
-from utils.vector_db import VectorDBFactory
 from model.multimodal_embedding import Embedding
 from utils.minio.image_util import image_util
 from conf.config import ModelConfig
+from utils.infinity import get_client
 
 # 定义工作流状态类
 class ImageParsingState(BaseModel):
@@ -30,7 +30,6 @@ class ImageParsingState(BaseModel):
     book_name: str = Field(..., description="书名")
     dataset_id: str = Field(..., description="数据集ID")
     ragflow_service: RAGFlowService = Field(default_factory=RAGFlowService, description="RAGFLOW服务")
-    vector_db: Any = Field(default_factory=VectorDBFactory.get_vector_db, description="向量数据库实例")
     embedding_model: Embedding = Field(default_factory=Embedding, description="多模态嵌入模型实例")
     document_id: str = Field(default="", description="文档ID")
     split_images: List[Dict[str, Any]] = Field(default_factory=list, description="拆分后的图片列表,包含图片URL和页码信息")
@@ -264,7 +263,8 @@ class ImageParsingWorkflow:
         # 批量入库
         if documents_to_store:
             print(f"开始入库,共 {len(documents_to_store)} 个文档")
-            result = state.vector_db.bulk_insert(index_name, documents_to_store)
+            infinity_client = get_client()
+            result = infinity_client.insert(index_name, documents_to_store)
             print(f"入库结果: {result}")
         
         return {

+ 3 - 4
workflow/workflow.py

@@ -12,10 +12,9 @@ from pydantic import BaseModel, Field, ConfigDict
 from parser.pdf_parser.pdf_splitter import PDFSplitter
 from model.qwen_vl import QWenVLParser
 from utils.ragflow.ragflow_service import RAGFlowService
-from utils.vector_db import VectorDBFactory
 from model.multimodal_embedding import Embedding
 from conf.config import ModelConfig, VectorDBConfig
-from utils.minio.image_util import ImageUtil
+from utils.infinity import get_client
 
 # 定义工作流状态类
 class PDFParsingState(BaseModel):
@@ -24,7 +23,6 @@ class PDFParsingState(BaseModel):
     pdf_path: str = Field(..., description="PDF文件路径")
     dataset_id: str = Field(..., description="数据集ID")
     ragflow_service: RAGFlowService = Field(default_factory=RAGFlowService, description="RAGFLOW服务")
-    vector_db: Any = Field(default_factory=VectorDBFactory.get_vector_db, description="向量数据库实例")
     embedding_model: Embedding = Field(default_factory=Embedding, description="多模态嵌入模型实例")
     document_id: str = Field(default="", description="上传后的文档ID")
     split_pages: List[Dict[str, Any]] = Field(default_factory=list, description="拆分后的页面列表")
@@ -332,7 +330,8 @@ class PDFParsingWorkflow:
         # 批量入库
         if documents_to_store:
             print(f"开始入库,共 {len(documents_to_store)} 个文档")
-            result = state.vector_db.bulk_insert(index_name, documents_to_store)
+            infinity_client = get_client()
+            result = infinity_client.insert(index_name, documents_to_store)
             print(f"入库结果: {result}")
         
         return {