yingge 4 months ago
parent
commit
7298cb85f2
70 changed files with 9649 additions and 16 deletions
  1. 16 1
      .env
  2. 247 0
      VectorDB_GUIDE.md
  3. 355 0
      book/es_dynamic.md
  4. 4577 0
      book/ragflow_api.md
  5. BIN
      conf/__pycache__/config.cpython-312.pyc
  6. 65 0
      conf/config.py
  7. 686 0
      es_conn_analysis.md
  8. 3 1
      requirements.txt
  9. BIN
      services/model/__pycache__/multimodal_embedding.cpython-312.pyc
  10. BIN
      services/model/__pycache__/qwen_vl.cpython-312.pyc
  11. 3 3
      services/model/multimodal_embedding.py
  12. BIN
      services/pdf_parser/__pycache__/__init__.cpython-312.pyc
  13. BIN
      services/pdf_parser/__pycache__/main.cpython-312.pyc
  14. BIN
      services/pdf_parser/__pycache__/workflow.cpython-312.pyc
  15. 1 1
      services/pdf_parser/main.py
  16. 156 10
      services/pdf_parser/workflow.py
  17. 0 0
      services/ragflow/__init__.py
  18. BIN
      services/ragflow/__pycache__/__init__.cpython-312.pyc
  19. BIN
      services/ragflow/__pycache__/agent_service.cpython-312.pyc
  20. BIN
      services/ragflow/__pycache__/chat_service.cpython-312.pyc
  21. BIN
      services/ragflow/__pycache__/chunk_service.cpython-312.pyc
  22. BIN
      services/ragflow/__pycache__/dataset_service.cpython-312.pyc
  23. BIN
      services/ragflow/__pycache__/document_service.cpython-312.pyc
  24. BIN
      services/ragflow/__pycache__/file_service.cpython-312.pyc
  25. BIN
      services/ragflow/__pycache__/openai_service.cpython-312.pyc
  26. BIN
      services/ragflow/__pycache__/ragflow_service.cpython-312.pyc
  27. 139 0
      services/ragflow/agent_service.py
  28. 146 0
      services/ragflow/chat_service.py
  29. 78 0
      services/ragflow/chunk_service.py
  30. 177 0
      services/ragflow/dataset_service.py
  31. 126 0
      services/ragflow/document_service.py
  32. 141 0
      services/ragflow/file_service.py
  33. 45 0
      services/ragflow/openai_service.py
  34. 298 0
      services/ragflow/ragflow_service.py
  35. BIN
      services/utils/__pycache__/decorators.cpython-312.pyc
  36. BIN
      services/utils/__pycache__/es_conn.cpython-312.pyc
  37. BIN
      services/utils/__pycache__/http_client.cpython-312.pyc
  38. BIN
      services/utils/__pycache__/vector_db.cpython-312.pyc
  39. 13 0
      services/utils/decorators.py
  40. 17 0
      services/utils/es/__init__.py
  41. BIN
      services/utils/es/__pycache__/__init__.cpython-312.pyc
  42. BIN
      services/utils/es/__pycache__/base.cpython-312.pyc
  43. BIN
      services/utils/es/__pycache__/constants.cpython-312.pyc
  44. BIN
      services/utils/es/__pycache__/document.cpython-312.pyc
  45. BIN
      services/utils/es/__pycache__/index.cpython-312.pyc
  46. BIN
      services/utils/es/__pycache__/search.cpython-312.pyc
  47. BIN
      services/utils/es/__pycache__/templates.cpython-312.pyc
  48. 68 0
      services/utils/es/base.py
  49. 25 0
      services/utils/es/constants.py
  50. 192 0
      services/utils/es/document.py
  51. 131 0
      services/utils/es/index.py
  52. 202 0
      services/utils/es/search.py
  53. 203 0
      services/utils/es/templates.py
  54. 138 0
      services/utils/es_conn.py
  55. 105 0
      services/utils/http_client.py
  56. 94 0
      services/utils/infinity/__init__.py
  57. BIN
      services/utils/infinity/__pycache__/__init__.cpython-312.pyc
  58. BIN
      services/utils/infinity/__pycache__/base.cpython-312.pyc
  59. BIN
      services/utils/infinity/__pycache__/document.cpython-312.pyc
  60. BIN
      services/utils/infinity/__pycache__/index.cpython-312.pyc
  61. BIN
      services/utils/infinity/__pycache__/search.cpython-312.pyc
  62. 63 0
      services/utils/infinity/base.py
  63. 168 0
      services/utils/infinity/document.py
  64. 140 0
      services/utils/infinity/index.py
  65. 187 0
      services/utils/infinity/search.py
  66. 168 0
      services/utils/vector_db.py
  67. 121 0
      test_es_conn.py
  68. 180 0
      test_infinity_encapsulation.py
  69. 104 0
      test_vector_db.py
  70. 71 0
      test_workflow.py

+ 16 - 1
.env

@@ -3,9 +3,24 @@ MODEL_PROVIDER=openai
 MODEL_NAME=Qwen/Qwen3-VL-8B-Instruct
 BASE_URL=https://api.siliconflow.cn/v1
 API_KEY=sk-xvrfniafyxprllrgedsgosdwcmfmrbnrvhhztssqsmnzacfj
-
+DASHSCOPE=sk-bc0f1026a41c4c92beb014be8973e4e2
 # Embedding model configuration
 EMBEDDING_MODEL_NAME=Qwen/Qwen3-Embedding-0.6B
+MULTIMODAL_EMBEDDING_MODEL_NAME=qwen2.5-vl-embedding
+# RAGFlow configuration
+RAGFLOW_API_URL=http://192.168.16.134/
+RAGFLOW_API_KEY=ragflow-sPJ06xiUdRrcfDRlOD-GN2gl-U2DLB-PbgNGckUu0KM
+DATASET_ID=9f85dbeae94611f0b6e00242ac1d0002
 
 # Application configuration
 LOG_LEVEL=INFO
+
+# Vector database configuration
+# Allowed values: es, infinity
+VECTOR_DB_TYPE=infinity
+
+# Infinity vector database configuration
+INFINITY_HOST=192.168.16.134
+INFINITY_PORT=23817
+INFINITY_USER=admin
+INFINITY_PASSWORD=admin

+ 247 - 0
VectorDB_GUIDE.md

@@ -0,0 +1,247 @@
+# Vector Database Usage Guide
+
+## 1. Configuration
+
+Add the following settings to the `.env` file to select and configure the vector database:
+
+```env
+# Vector database configuration
+VECTOR_DB_TYPE=es  # allowed values: es, infinity
+
+# Infinity vector database configuration
+INFINITY_HOST=localhost
+INFINITY_PORT=23817
+INFINITY_USER=admin
+INFINITY_PASSWORD=admin
+```
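+
+A minimal sketch of how these settings might be read at startup (assuming `python-dotenv` is installed; the variable names mirror the block above):
+
+```python
+import os
+from dotenv import load_dotenv
+
+load_dotenv()  # read .env from the working directory
+
+vector_db_type = os.getenv("VECTOR_DB_TYPE", "es").lower()  # fall back to "es"
+infinity_host = os.getenv("INFINITY_HOST", "localhost")
+infinity_port = int(os.getenv("INFINITY_PORT", "23817"))
+```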
+
+## 2. Basic Usage
+
+### 2.1 Get a vector database instance
+
+```python
+from services.utils.vector_db import VectorDBFactory
+
+# Get a vector database instance based on the configuration
+vector_db = VectorDBFactory.get_vector_db()
+```
+
+### 2.2 Create an index
+
+```python
+# Create an index
+index_name = "test_index"
+mappings = {
+    "properties": {
+        "content": {
+            "type": "text"
+        },
+        "vector_768_vec": {
+            "type": "dense_vector",
+            "dims": 768,
+            "similarity": "cosine"
+        }
+    }
+}
+
+vector_db.create_index(index_name, mappings)
+```
+
+### 2.3 插入文档
+
+```python
+# 插入单个文档
+document = {
+    "content": "这是一个测试文档",
+    "vector_768_vec": [0.1] * 768,
+    "metadata": {
+        "title": "测试文档",
+        "category": "test"
+    }
+}
+
+vector_db.insert_document(index_name, document, id="1")
+
+# Bulk insert documents
+documents = [
+    {
+        "content": f"Bulk test document {i}",
+        "vector_768_vec": [0.1] * 768,
+        "metadata": {
+            "title": f"Bulk test document {i}",
+            "category": "test"
+        }
+    }
+    for i in range(5)
+]
+
+vector_db.bulk_insert(index_name, documents)
+```
+
+### 2.4 Vector search
+
+```python
+# Vector search
+query_vector = [0.1] * 768
+result = vector_db.vector_search(
+    index_name,
+    vector_field="vector_768_vec",
+    vector=query_vector,
+    size=10
+)
+```
+
+### 2.5 Hybrid search
+
+```python
+# Hybrid search (text + vector)
+result = vector_db.hybrid_search(
+    index_name,
+    text_query="test",
+    vector_field="vector_768_vec",
+    vector=query_vector,
+    size=10,
+    text_weight=0.5,  # weight of the text match
+    vector_weight=0.5  # weight of the vector match
+)
+```
+
+### 2.6 Close the connection
+
+```python
+# Close the connection
+vector_db.close()
+```
+
+## 3. Switching Vector Databases
+
+### 3.1 Switch via environment variable
+
+Change the `VECTOR_DB_TYPE` setting in the `.env` file:
+
+```env
+# Use Elasticsearch
+VECTOR_DB_TYPE=es
+
+# Use Infinity
+VECTOR_DB_TYPE=infinity
+```
+
+### 3.2 Switch dynamically in code
+
+```python
+# Note: this approach is not recommended; prefer configuring via environment variables
+from services.utils.vector_db import ElasticsearchVectorDB, InfinityVectorDB
+
+# Create an instance of a specific vector database type directly
+es_db = ElasticsearchVectorDB()
+infinity_db = InfinityVectorDB()
+```
+
+## 4. Integrating into Existing Code
+
+### 4.1 Replace the original ES connection
+
+Before:
+```python
+from services.utils.es_conn import ESConnection
+
+# Initialize the ES connection
+es = ESConnection()
+```
+
+After:
+```python
+from services.utils.vector_db import VectorDBFactory
+
+# Initialize the vector database connection (selected automatically from the configuration)
+vector_db = VectorDBFactory.get_vector_db()
+```
+
+### 4.2 Replace the retrieval logic
+
+Before:
+```python
+result = es.hybrid_search(
+    index_name, 
+    text_query="test", 
+    vector_field="vector_768_vec", 
+    vector=vector,
+    size=10
+)
+```
+
+After:
+```python
+result = vector_db.hybrid_search(
+    index_name, 
+    text_query="测试", 
+    vector_field="vector_768_vec", 
+    vector=vector,
+    size=10
+)
+```
+
+## 5. Notes
+
+1. **Dependencies**:
+   - When using Elasticsearch, make sure `elasticsearch==8.11.1` is installed
+   - When using Infinity, watch for version compatibility issues between `infinity-emb` and `huggingface_hub`
+     - Recommended: `huggingface_hub==0.21.4`
+     - Install with: `pip install huggingface_hub==0.21.4`
+
+2. **Configuration**:
+   - Make sure the settings in the `.env` file are correct
+   - Make sure the vector database service is running and reachable
+
+3. **Performance**:
+   - For large volumes of data, prefer bulk operations (see the sketch after this list)
+   - Tune the `size` parameter of vector search to balance performance against the number of results
+
+4. **Migration**:
+   - Migrating from one vector database to another requires recreating indexes and re-importing the data
+   - Mappings and query syntax may differ between vector databases
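+
+A minimal batching sketch for the bulk-operation note above (`batch_size=500` is an arbitrary illustration, not a tuned value):
+
+```python
+def insert_in_batches(vector_db, index_name, documents, batch_size=500):
+    """Insert documents in fixed-size batches instead of one at a time."""
+    for start in range(0, len(documents), batch_size):
+        vector_db.bulk_insert(index_name, documents[start:start + batch_size])
+```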
+
+## 6. Adding Support for a New Vector Database
+
+To add support for a new vector database:
+
+1. Create a new class that inherits from `VectorDBBase`
+2. Implement all of its abstract methods
+3. Add a new type check in `VectorDBFactory.get_vector_db()`
+
+Example:
+
+```python
+class NewVectorDB(VectorDBBase):
+    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
+        # Implement the index-creation logic here
+        pass
+    
+    # Implement the remaining methods...
+
+# Add this branch in VectorDBFactory.get_vector_db()
+if vector_db_type == "new":
+    return NewVectorDB()
+```
+
+## 7. Troubleshooting
+
+### 7.1 Elasticsearch connection timeouts
+
+- Check that the Elasticsearch service is running
+- Check that the `ES_HOST` setting in the `.env` file is correct
+- Check firewall settings to make sure the Elasticsearch port is reachable
+
+### 7.2 Infinity dependency errors
+
+- Try installing a compatible version of `huggingface_hub`: `pip install huggingface_hub==0.21.4`
+- Check that the Infinity service is running
+- Check that the Infinity settings in the `.env` file are correct
+
+### 7.3 Vector search returns empty results
+
+- Check that the index has been created
+- Check that the documents were inserted successfully
+- Check that the vector field name is correct
+- Check that the query vector's dimensionality matches the vectors stored in the index (a quick check is sketched below)
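+
+A quick sanity check for the last two points (a sketch; assumes the 768-dimension mapping from section 2.2):
+
+```python
+EXPECTED_DIMS = 768  # must match the "dims" value in the index mapping
+
+def check_query_vector(vector, field="vector_768_vec"):
+    assert len(vector) == EXPECTED_DIMS, (
+        f"{field} expects {EXPECTED_DIMS} dims, got {len(vector)}"
+    )
+```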

+ 355 - 0
book/es_dynamic.md

@@ -0,0 +1,355 @@
+{
+  "ragflow_92162247e93e11f084830242ac1d0002": {
+    "mappings": {
+      "dynamic_templates": [
+        {
+          "int": {
+            "match": "*_int",
+            "mapping": {
+              "store": "true",
+              "type": "integer"
+            }
+          }
+        },
+        {
+          "ulong": {
+            "match": "*_ulong",
+            "mapping": {
+              "store": "true",
+              "type": "unsigned_long"
+            }
+          }
+        },
+        {
+          "long": {
+            "match": "*_long",
+            "mapping": {
+              "store": "true",
+              "type": "long"
+            }
+          }
+        },
+        {
+          "short": {
+            "match": "*_short",
+            "mapping": {
+              "store": "true",
+              "type": "short"
+            }
+          }
+        },
+        {
+          "numeric": {
+            "match": "*_flt",
+            "mapping": {
+              "store": true,
+              "type": "float"
+            }
+          }
+        },
+        {
+          "tks": {
+            "match": "*_tks",
+            "mapping": {
+              "analyzer": "whitespace",
+              "similarity": "scripted_sim",
+              "store": true,
+              "type": "text"
+            }
+          }
+        },
+        {
+          "ltks": {
+            "match": "*_ltks",
+            "mapping": {
+              "analyzer": "whitespace",
+              "store": true,
+              "type": "text"
+            }
+          }
+        },
+        {
+          "kwd": {
+            "match": "^(.*_(kwd|id|ids|uid|uids)|uid)$",
+            "match_pattern": "regex",
+            "mapping": {
+              "similarity": "boolean",
+              "store": true,
+              "type": "keyword"
+            }
+          }
+        },
+        {
+          "dt": {
+            "match": "^.*(_dt|_time|_at)$",
+            "match_pattern": "regex",
+            "mapping": {
+              "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM-dd_HH:mm:ss",
+              "store": true,
+              "type": "date"
+            }
+          }
+        },
+        {
+          "nested": {
+            "match": "*_nst",
+            "mapping": {
+              "type": "nested"
+            }
+          }
+        },
+        {
+          "object": {
+            "match": "*_obj",
+            "mapping": {
+              "dynamic": "true",
+              "type": "object"
+            }
+          }
+        },
+        {
+          "string": {
+            "match": "^.*_(with_weight|list)$",
+            "match_pattern": "regex",
+            "mapping": {
+              "index": "false",
+              "store": true,
+              "type": "text"
+            }
+          }
+        },
+        {
+          "rank_feature": {
+            "match": "*_fea",
+            "mapping": {
+              "type": "rank_feature"
+            }
+          }
+        },
+        {
+          "rank_features": {
+            "match": "*_feas",
+            "mapping": {
+              "type": "rank_features"
+            }
+          }
+        },
+        {
+          "dense_vector": {
+            "match": "*_512_vec",
+            "mapping": {
+              "dims": 512,
+              "index": true,
+              "similarity": "cosine",
+              "type": "dense_vector"
+            }
+          }
+        },
+        {
+          "dense_vector": {
+            "match": "*_768_vec",
+            "mapping": {
+              "dims": 768,
+              "index": true,
+              "similarity": "cosine",
+              "type": "dense_vector"
+            }
+          }
+        },
+        {
+          "dense_vector": {
+            "match": "*_1024_vec",
+            "mapping": {
+              "dims": 1024,
+              "index": true,
+              "similarity": "cosine",
+              "type": "dense_vector"
+            }
+          }
+        },
+        {
+          "dense_vector": {
+            "match": "*_1536_vec",
+            "mapping": {
+              "dims": 1536,
+              "index": true,
+              "similarity": "cosine",
+              "type": "dense_vector"
+            }
+          }
+        },
+        {
+          "binary": {
+            "match": "*_bin",
+            "mapping": {
+              "type": "binary"
+            }
+          }
+        }
+      ],
+      "date_detection": true,
+      "properties": {
+        "authors_sm_tks": {
+          "type": "text",
+          "store": true,
+          "analyzer": "whitespace",
+          "similarity": "scripted_sim"
+        },
+        "authors_tks": {
+          "type": "text",
+          "store": true,
+          "analyzer": "whitespace",
+          "similarity": "scripted_sim"
+        },
+        "available_int": {
+          "type": "integer",
+          "store": true
+        },
+        "content_ltks": {
+          "type": "text",
+          "store": true,
+          "analyzer": "whitespace"
+        },
+        "content_sm_ltks": {
+          "type": "text",
+          "store": true,
+          "analyzer": "whitespace"
+        },
+        "content_with_weight": {
+          "type": "text",
+          "index": false,
+          "store": true
+        },
+        "create_time": {
+          "type": "date",
+          "store": true,
+          "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM-dd_HH:mm:ss"
+        },
+        "create_timestamp_flt": {
+          "type": "float",
+          "store": true
+        },
+        "doc_id": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "doc_type_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "docnm_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "entity_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "entity_type_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "from_entity_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "img_id": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "important_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "important_tks": {
+          "type": "text",
+          "store": true,
+          "analyzer": "whitespace",
+          "similarity": "scripted_sim"
+        },
+        "kb_id": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "knowledge_graph_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "lat_lon": {
+          "type": "geo_point",
+          "store": true
+        },
+        "page_num_int": {
+          "type": "integer",
+          "store": true
+        },
+        "position_int": {
+          "type": "integer",
+          "store": true
+        },
+        "q_1024_vec": {
+          "type": "dense_vector",
+          "dims": 1024,
+          "index": true,
+          "similarity": "cosine"
+        },
+        "question_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "question_tks": {
+          "type": "text",
+          "store": true,
+          "analyzer": "whitespace",
+          "similarity": "scripted_sim"
+        },
+        "removed_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "source_id": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "title_sm_tks": {
+          "type": "text",
+          "store": true,
+          "analyzer": "whitespace",
+          "similarity": "scripted_sim"
+        },
+        "title_tks": {
+          "type": "text",
+          "store": true,
+          "analyzer": "whitespace",
+          "similarity": "scripted_sim"
+        },
+        "to_entity_kwd": {
+          "type": "keyword",
+          "store": true,
+          "similarity": "boolean"
+        },
+        "top_int": {
+          "type": "integer",
+          "store": true
+        },
+        "weight_int": {
+          "type": "integer",
+          "store": true
+        }
+      }
+    }
+  }
+}

+ 4577 - 0
book/ragflow_api.md

@@ -0,0 +1,4577 @@
+HTTP API
+A complete reference for RAGFlow's RESTful API. Before proceeding, please ensure you have your RAGFlow API key ready for authentication.
+
+ERROR CODES
+
+| Code | Message               | Description                |
+|------|-----------------------|----------------------------|
+| 400  | Bad Request           | Invalid request parameters |
+| 401  | Unauthorized          | Unauthorized access        |
+| 403  | Forbidden             | Access denied              |
+| 404  | Not Found             | Resource not found         |
+| 500  | Internal Server Error | Server internal error      |
+| 1001 | Invalid Chunk ID      | Invalid Chunk ID           |
+| 1002 | Chunk Update Failed   | Chunk update failed        |
+OpenAI-Compatible API
+Create chat completion
+POST /api/v1/chats_openai/{chat_id}/chat/completions
+
+Creates a model response for a given chat conversation.
+
+This API follows the same request and response format as OpenAI's API. It allows you to interact with the model in a manner similar to how you would with OpenAI's API.
+
+Request
+Method: POST
+URL: /api/v1/chats_openai/{chat_id}/chat/completions
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"model": string
+"messages": object list
+"stream": boolean
+"extra_body": object (optional)
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/chats_openai/{chat_id}/chat/completions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+        "model": "model",
+        "messages": [{"role": "user", "content": "Say this is a test!"}],
+        "stream": true,
+        "extra_body": {
+          "reference": true,
+          "metadata_condition": {
+            "logic": "and",
+            "conditions": [
+              {
+                "name": "author",
+                "comparison_operator": "is",
+                "value": "bob"
+              }
+            ]
+          }
+        }
+      }'
+Request Parameters
+model (Body parameter) string, Required
+The model used to generate the response. The server will parse this automatically, so you can set it to any value for now.
+
+messages (Body parameter) list[object], Required
+A list of historical chat messages used to generate the response. This must contain at least one message with the user role.
+
+stream (Body parameter) boolean
+Whether to receive the response as a stream. Set this to false explicitly if you prefer to receive the entire response in one go instead of as a stream.
+
+extra_body (Body parameter) object
+Extra request parameters:
+
+reference: boolean - include reference in the final chunk (stream) or in the final message (non-stream).
+metadata_condition: object - metadata filter conditions applied to retrieval results.
+Response
+Stream:
+
+data:{
+    "id": "chatcmpl-3b0397f277f511f0b47f729e3aa55728",
+    "choices": [
+        {
+            "delta": {
+                "content": "Hello! It seems like you're just greeting me. If you have a specific",
+                "role": "assistant",
+                "function_call": null,
+                "tool_calls": null,
+                "reasoning_content": null
+            },
+            "finish_reason": null,
+            "index": 0,
+            "logprobs": null
+        }
+    ],
+    "created": 1755084508,
+    "model": "model",
+    "object": "chat.completion.chunk",
+    "system_fingerprint": "",
+    "usage": null
+}
+
+data:{"id": "chatcmpl-3b0397f277f511f0b47f729e3aa55728", "choices": [{"delta": {"content": " question or need information, feel free to ask, and I'll do my best", "role": "assistant", "function_call": null, "tool_calls": null, "reasoning_content": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 1755084508, "model": "model", "object": "chat.completion.chunk", "system_fingerprint": "", "usage": null}
+
+data:{"id": "chatcmpl-3b0397f277f511f0b47f729e3aa55728", "choices": [{"delta": {"content": " to assist you based on the knowledge base provided.", "role": "assistant", "function_call": null, "tool_calls": null, "reasoning_content": null}, "finish_reason": null, "index": 0, "logprobs": null}], "created": 1755084508, "model": "model", "object": "chat.completion.chunk", "system_fingerprint": "", "usage": null}
+
+data:{"id": "chatcmpl-3b0397f277f511f0b47f729e3aa55728", "choices": [{"delta": {"content": null, "role": "assistant", "function_call": null, "tool_calls": null, "reasoning_content": null}, "finish_reason": "stop", "index": 0, "logprobs": null}], "created": 1755084508, "model": "model", "object": "chat.completion.chunk", "system_fingerprint": "", "usage": {"prompt_tokens": 5, "completion_tokens": 188, "total_tokens": 193}}
+
+data:[DONE]
+Non-stream:
+
+{
+    "choices": [
+        {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+                "content": "Hello! I'm your smart assistant. What can I do for you?",
+                "role": "assistant"
+            }
+        }
+    ],
+    "created": 1755084403,
+    "id": "chatcmpl-3b0397f277f511f0b47f729e3aa55728",
+    "model": "model",
+    "object": "chat.completion",
+    "usage": {
+        "completion_tokens": 55,
+        "completion_tokens_details": {
+            "accepted_prediction_tokens": 55,
+            "reasoning_tokens": 5,
+            "rejected_prediction_tokens": 0
+        },
+        "prompt_tokens": 5,
+        "total_tokens": 60
+    }
+}
+Failure:
+
+{
+  "code": 102,
+  "message": "The last content of this conversation is not from user."
+}
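+Because the endpoint is OpenAI-compatible, it can also be called with the official openai Python client (a sketch; the address, chat_id, and key are placeholders):
+
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://{address}/api/v1/chats_openai/{chat_id}",
+    api_key="<YOUR_API_KEY>",
+)
+completion = client.chat.completions.create(
+    model="model",
+    messages=[{"role": "user", "content": "Say this is a test!"}],
+    stream=False,
+)
+print(completion.choices[0].message.content)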
+Create agent completion
+POST /api/v1/agents_openai/{agent_id}/chat/completions
+
+Creates a model response for a given chat conversation.
+
+This API follows the same request and response format as OpenAI's API. It allows you to interact with the model in a manner similar to how you would with OpenAI's API.
+
+Request
+Method: POST
+URL: /api/v1/agents_openai/{agent_id}/chat/completions
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"model": string
+"messages": object list
+"stream": boolean
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/agents_openai/{agent_id}/chat/completions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+        "model": "model",
+        "messages": [{"role": "user", "content": "Say this is a test!"}],
+        "stream": true
+      }'
+Request Parameters
+model (Body parameter) string, Required
+The model used to generate the response. The server will parse this automatically, so you can set it to any value for now.
+
+messages (Body parameter) list[object], Required
+A list of historical chat messages used to generate the response. This must contain at least one message with the user role.
+
+stream (Body parameter) boolean
+Whether to receive the response as a stream. Set this to false explicitly if you prefer to receive the entire response in one go instead of as a stream.
+
+session_id (Body parameter) string
+Agent session id.
+
+Response
+Stream:
+
+...
+
+data: {
+    "id": "c39f6f9c83d911f0858253708ecb6573",
+    "object": "chat.completion.chunk",
+    "model": "d1f79142831f11f09cc51795b9eb07c0",
+    "choices": [
+        {
+            "delta": {
+                "content": " terminal"
+            },
+            "finish_reason": null,
+            "index": 0
+        }
+    ]
+}
+
+data: {
+    "id": "c39f6f9c83d911f0858253708ecb6573",
+    "object": "chat.completion.chunk",
+    "model": "d1f79142831f11f09cc51795b9eb07c0",
+    "choices": [
+        {
+            "delta": {
+                "content": "."
+            },
+            "finish_reason": null,
+            "index": 0
+        }
+    ]
+}
+
+data: {
+    "id": "c39f6f9c83d911f0858253708ecb6573",
+    "object": "chat.completion.chunk",
+    "model": "d1f79142831f11f09cc51795b9eb07c0",
+    "choices": [
+        {
+            "delta": {
+                "content": "",
+                "reference": {
+                    "chunks": {
+                        "20": {
+                            "id": "4b8935ac0a22deb1",
+                            "content": "```cd /usr/ports/editors/neovim/ && make install```## Android[Termux](https://github.com/termux/termux-app) offers a Neovim package.",
+                            "document_id": "4bdd2ff65e1511f0907f09f583941b45",
+                            "document_name": "INSTALL22.md",
+                            "dataset_id": "456ce60c5e1511f0907f09f583941b45",
+                            "image_id": "",
+                            "positions": [
+                                [
+                                    12,
+                                    11,
+                                    11,
+                                    11,
+                                    11
+                                ]
+                            ],
+                            "url": null,
+                            "similarity": 0.5697155305154673,
+                            "vector_similarity": 0.7323851005515574,
+                            "term_similarity": 0.5000000005,
+                            "doc_type": ""
+                        }
+                    },
+                    "doc_aggs": {
+                        "INSTALL22.md": {
+                            "doc_name": "INSTALL22.md",
+                            "doc_id": "4bdd2ff65e1511f0907f09f583941b45",
+                            "count": 3
+                        },
+                        "INSTALL.md": {
+                            "doc_name": "INSTALL.md",
+                            "doc_id": "4bd7fdd85e1511f0907f09f583941b45",
+                            "count": 2
+                        },
+                        "INSTALL(1).md": {
+                            "doc_name": "INSTALL(1).md",
+                            "doc_id": "4bdfb42e5e1511f0907f09f583941b45",
+                            "count": 2
+                        },
+                        "INSTALL3.md": {
+                            "doc_name": "INSTALL3.md",
+                            "doc_id": "4bdab5825e1511f0907f09f583941b45",
+                            "count": 1
+                        }
+                    }
+                }
+            },
+            "finish_reason": null,
+            "index": 0
+        }
+    ]
+}
+
+data: [DONE]
+Non-stream:
+
+{
+    "choices": [
+        {
+            "finish_reason": "stop",
+            "index": 0,
+            "logprobs": null,
+            "message": {
+                "content": "\nTo install Neovim, the process varies depending on your operating system:\n\n### For Windows:\n1. **Download from GitHub**: \n   - Visit the [Neovim releases page](https://github.com/neovim/neovim/releases)\n   - Download the latest Windows installer (nvim-win64.msi)\n   - Run the installer and follow the prompts\n\n2. **Using winget** (Windows Package Manager):\n...",
+                "reference": {
+                    "chunks": {
+                        "20": {
+                            "content": "```cd /usr/ports/editors/neovim/ && make install```## Android[Termux](https://github.com/termux/termux-app) offers a Neovim package.",
+                            "dataset_id": "456ce60c5e1511f0907f09f583941b45",
+                            "doc_type": "",
+                            "document_id": "4bdd2ff65e1511f0907f09f583941b45",
+                            "document_name": "INSTALL22.md",
+                            "id": "4b8935ac0a22deb1",
+                            "image_id": "",
+                            "positions": [
+                                [
+                                    12,
+                                    11,
+                                    11,
+                                    11,
+                                    11
+                                ]
+                            ],
+                            "similarity": 0.5697155305154673,
+                            "term_similarity": 0.5000000005,
+                            "url": null,
+                            "vector_similarity": 0.7323851005515574
+                        }
+                    },
+                    "doc_aggs": {
+                        "INSTALL(1).md": {
+                            "count": 2,
+                            "doc_id": "4bdfb42e5e1511f0907f09f583941b45",
+                            "doc_name": "INSTALL(1).md"
+                        },
+                        "INSTALL.md": {
+                            "count": 2,
+                            "doc_id": "4bd7fdd85e1511f0907f09f583941b45",
+                            "doc_name": "INSTALL.md"
+                        },
+                        "INSTALL22.md": {
+                            "count": 3,
+                            "doc_id": "4bdd2ff65e1511f0907f09f583941b45",
+                            "doc_name": "INSTALL22.md"
+                        },
+                        "INSTALL3.md": {
+                            "count": 1,
+                            "doc_id": "4bdab5825e1511f0907f09f583941b45",
+                            "doc_name": "INSTALL3.md"
+                        }
+                    }
+                },
+                "role": "assistant"
+            }
+        }
+    ],
+    "created": null,
+    "id": "c39f6f9c83d911f0858253708ecb6573",
+    "model": "d1f79142831f11f09cc51795b9eb07c0",
+    "object": "chat.completion",
+    "param": null,
+    "usage": {
+        "completion_tokens": 415,
+        "completion_tokens_details": {
+            "accepted_prediction_tokens": 0,
+            "reasoning_tokens": 0,
+            "rejected_prediction_tokens": 0
+        },
+        "prompt_tokens": 6,
+        "total_tokens": 421
+    }
+}
+Failure:
+
+{
+  "code": 102,
+  "message": "The last content of this conversation is not from user."
+}
+DATASET MANAGEMENT
+Create dataset
+POST /api/v1/datasets
+
+Creates a dataset.
+
+Request
+Method: POST
+URL: /api/v1/datasets
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"name": string
+"avatar": string
+"description": string
+"embedding_model": string
+"permission": string
+"chunk_method": string
+"parser_config": object
+"parse_type": int
+"pipeline_id": string
+A basic request example
+curl --request POST \
+     --url http://{address}/api/v1/datasets \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+      "name": "test_1"
+      }'
+A request example specifying ingestion pipeline
+:::caution WARNING You must not include "chunk_method" or "parser_config" when specifying an ingestion pipeline. :::
+
+curl --request POST \
+  --url http://{address}/api/v1/datasets \
+  --header 'Content-Type: application/json' \
+  --header 'Authorization: Bearer <YOUR_API_KEY>' \
+  --data '{
+   "name": "test-sdk",
+   "parse_type": <NUMBER_OF_PARSERS_IN_YOUR_PARSER_COMPONENT>,
+   "pipeline_id": "<PIPELINE_ID_32_HEX>"
+  }'
+Request parameters
+"name": (Body parameter), string, Required
+The unique name of the dataset to create. It must adhere to the following requirements:
+
+Basic Multilingual Plane (BMP) only
+Maximum 128 characters
+Case-insensitive
+"avatar": (Body parameter), string
+Base64 encoding of the avatar.
+
+Maximum 65535 characters
+"description": (Body parameter), string
+A brief description of the dataset to create.
+
+Maximum 65535 characters
+"embedding_model": (Body parameter), string
+The name of the embedding model to use. For example: "BAAI/bge-large-zh-v1.5@BAAI"
+
+Maximum 255 characters
+Must follow model_name@model_factory format
+"permission": (Body parameter), string
+Specifies who can access the dataset to create. Available options:
+
+"me": (Default) Only you can manage the dataset.
+"team": All team members can manage the dataset.
+"chunk_method": (Body parameter), enum<string>
+The default chunk method of the dataset to create. Mutually exclusive with "parse_type" and "pipeline_id". If you set "chunk_method", do not include "parse_type" or "pipeline_id".
+Available options:
+
+"naive": General (default)
+"book": Book
+"email": Email
+"laws": Laws
+"manual": Manual
+"one": One
+"paper": Paper
+"picture": Picture
+"presentation": Presentation
+"qa": Q&A
+"table": Table
+"tag": Tag
+"parser_config": (Body parameter), object
+The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected "chunk_method":
+
+If "chunk_method" is "naive", the "parser_config" object contains the following attributes:
+"auto_keywords": int
+Defaults to 0
+Minimum: 0
+Maximum: 32
+"auto_questions": int
+Defaults to 0
+Minimum: 0
+Maximum: 10
+"chunk_token_num": int
+Defaults to 512
+Minimum: 1
+Maximum: 2048
+"delimiter": string
+Defaults to "\n".
+"html4excel": bool
+Whether to convert Excel documents into HTML format.
+Defaults to false
+"layout_recognize": string
+Defaults to DeepDOC
+"tag_kb_ids": array<string>
+IDs of datasets to be parsed using the Tag chunk method.
+Before setting this, ensure a tag set is created and properly configured. For details, see Use tag set.
+"task_page_size": int
+For PDFs only.
+Defaults to 12
+Minimum: 1
+"raptor": object RAPTOR-specific settings.
+Defaults to: {"use_raptor": false}
+"graphrag": object GRAPHRAG-specific settings.
+Defaults to: {"use_graphrag": false}
+If "chunk_method" is "qa", "manuel", "paper", "book", "laws", or "presentation", the "parser_config" object contains the following attribute:
+"raptor": object RAPTOR-specific settings.
+Defaults to: {"use_raptor": false}.
+If "chunk_method" is "table", "picture", "one", or "email", "parser_config" is an empty JSON object.
+"parse_type": (Body parameter), int
+The ingestion pipeline parse type identifier, i.e., the number of parsers in your Parser component.
+
+Required (along with "pipeline_id") if specifying an ingestion pipeline.
+Must not be included when "chunk_method" is specified.
+"pipeline_id": (Body parameter), string
+The ingestion pipeline ID. Can be found in the corresponding URL in the RAGFlow UI.
+
+Required (along with "parse_type") if specifying an ingestion pipeline.
+Must be a 32-character lowercase hexadecimal string, e.g., "d0bebe30ae2211f0970942010a8e0005".
+Must not be included when "chunk_method" is specified.
+:::caution WARNING You can choose either of the following ingestion options when creating a dataset, but not both:
+
+Use a built-in chunk method -- specify "chunk_method" (optionally with "parser_config").
+Use an ingestion pipeline -- specify both "parse_type" and "pipeline_id".
+If none of "chunk_method", "parse_type", or "pipeline_id" are provided, the system defaults to chunk_method = "naive". :::
+
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "avatar": null,
+        "chunk_count": 0,
+        "chunk_method": "naive",
+        "create_date": "Mon, 28 Apr 2025 18:40:41 GMT",
+        "create_time": 1745836841611,
+        "created_by": "3af81804241d11f0a6a79f24fc270c7f",
+        "description": null,
+        "document_count": 0,
+        "embedding_model": "BAAI/bge-large-zh-v1.5@BAAI",
+        "id": "3b4de7d4241d11f0a6a79f24fc270c7f",
+        "language": "English",
+        "name": "RAGFlow example",
+        "pagerank": 0,
+        "parser_config": {
+            "chunk_token_num": 128, 
+            "delimiter": "\\n!?;。;!?", 
+            "html4excel": false, 
+            "layout_recognize": "DeepDOC", 
+            "raptor": {
+                "use_raptor": false
+                }
+            },
+        "permission": "me",
+        "similarity_threshold": 0.2,
+        "status": "1",
+        "tenant_id": "3af81804241d11f0a6a79f24fc270c7f",
+        "token_num": 0,
+        "update_date": "Mon, 28 Apr 2025 18:40:41 GMT",
+        "update_time": 1745836841611,
+        "vector_similarity_weight": 0.3,
+    },
+}
+Failure:
+
+{
+    "code": 101,
+    "message": "Dataset name 'RAGFlow example' already exists"
+}
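+The same request from Python (a sketch using the requests library; the address and key are placeholders):
+
+import requests
+
+resp = requests.post(
+    "http://{address}/api/v1/datasets",
+    headers={"Authorization": "Bearer <YOUR_API_KEY>"},
+    json={"name": "test_1"},
+)
+print(resp.json())  # {"code": 0, "data": {...}} on success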
+Delete datasets
+DELETE /api/v1/datasets
+
+Deletes datasets by ID.
+
+Request
+Method: DELETE
+URL: /api/v1/datasets
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"ids": list[string] or null
+Request example
+curl --request DELETE \
+     --url http://{address}/api/v1/datasets \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+     "ids": ["d94a8dc02c9711f0930f7fbc369eab6d", "e94a8dc02c9711f0930f7fbc369eab6e"]
+     }'
+Request parameters
+"ids": (Body parameter), list[string] or null, Required
+Specifies the datasets to delete:
+If null, all datasets will be deleted.
+If an array of IDs, only the specified datasets will be deleted.
+If an empty array, no datasets will be deleted.
+Response
+Success:
+
+{
+    "code": 0 
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "You don't own the dataset."
+}
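+Note the asymmetry in "ids": null deletes every dataset, while an empty list deletes nothing. A defensive sketch in Python (requests assumed):
+
+import requests
+
+ids = ["d94a8dc02c9711f0930f7fbc369eab6d"]
+assert ids is not None, "refusing to send null, which would delete ALL datasets"
+resp = requests.delete(
+    "http://{address}/api/v1/datasets",
+    headers={"Authorization": "Bearer <YOUR_API_KEY>"},
+    json={"ids": ids},
+)
+print(resp.json())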
+Update dataset
+PUT /api/v1/datasets/{dataset_id}
+
+Updates configurations for a specified dataset.
+
+Request
+Method: PUT
+URL: /api/v1/datasets/{dataset_id}
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"name": string
+"avatar": string
+"description": string
+"embedding_model": string
+"permission": string
+"chunk_method": string
+"pagerank": int
+"parser_config": object
+Request example
+curl --request PUT \
+     --url http://{address}/api/v1/datasets/{dataset_id} \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "name": "updated_dataset"
+     }'
+Request parameters
+dataset_id: (Path parameter)
+The ID of the dataset to update.
+"name": (Body parameter), string
+The revised name of the dataset.
+Basic Multilingual Plane (BMP) only
+Maximum 128 characters
+Case-insensitive
+"avatar": (Body parameter), string
+The updated base64 encoding of the avatar.
+Maximum 65535 characters
+"embedding_model": (Body parameter), string
+The updated embedding model name.
+Ensure that "chunk_count" is 0 before updating "embedding_model".
+Maximum 255 characters
+Must follow model_name@model_factory format
+"permission": (Body parameter), string
+The updated dataset permission. Available options:
+"me": (Default) Only you can manage the dataset.
+"team": All team members can manage the dataset.
+"pagerank": (Body parameter), int
+refer to Set page rank
+Default: 0
+Minimum: 0
+Maximum: 100
+"chunk_method": (Body parameter), enum<string>
+The chunking method for the dataset. Available options:
+"naive": General (default)
+"book": Book
+"email": Email
+"laws": Laws
+"manual": Manual
+"one": One
+"paper": Paper
+"picture": Picture
+"presentation": Presentation
+"qa": Q&A
+"table": Table
+"tag": Tag
+"parser_config": (Body parameter), object
+The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected "chunk_method":
+If "chunk_method" is "naive", the "parser_config" object contains the following attributes:
+"auto_keywords": int
+Defaults to 0
+Minimum: 0
+Maximum: 32
+"auto_questions": int
+Defaults to 0
+Minimum: 0
+Maximum: 10
+"chunk_token_num": int
+Defaults to 512
+Minimum: 1
+Maximum: 2048
+"delimiter": string
+Defaults to "\n".
+"html4excel": bool Indicates whether to convert Excel documents into HTML format.
+Defaults to false
+"layout_recognize": string
+Defaults to DeepDOC
+"tag_kb_ids": array<string> refer to Use tag set
+Must include a list of dataset IDs, where each dataset is parsed using the Tag chunk method.
+"task_page_size": int For PDF only.
+Defaults to 12
+Minimum: 1
+"raptor": object RAPTOR-specific settings.
+Defaults to: {"use_raptor": false}
+"graphrag": object GRAPHRAG-specific settings.
+Defaults to: {"use_graphrag": false}
+If "chunk_method" is "qa", "manuel", "paper", "book", "laws", or "presentation", the "parser_config" object contains the following attribute:
+"raptor": object RAPTOR-specific settings.
+Defaults to: {"use_raptor": false}.
+If "chunk_method" is "table", "picture", "one", or "email", "parser_config" is an empty JSON object.
+Response
+Success:
+
+{
+    "code": 0 
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Can't change tenant_id."
+}
+List datasets
+GET /api/v1/datasets?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}
+
+Lists datasets.
+
+Request
+Method: GET
+URL: /api/v1/datasets?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url 'http://{address}/api/v1/datasets?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={dataset_name}&id={dataset_id}' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+page: (Filter parameter)
+Specifies the page on which the datasets will be displayed. Defaults to 1.
+page_size: (Filter parameter)
+The number of datasets on each page. Defaults to 30.
+orderby: (Filter parameter)
+The field by which datasets should be sorted. Available options:
+create_time (default)
+update_time
+desc: (Filter parameter)
+Indicates whether the retrieved datasets should be sorted in descending order. Defaults to true.
+name: (Filter parameter)
+The name of the dataset to retrieve.
+id: (Filter parameter)
+The ID of the dataset to retrieve.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": [
+        {
+            "avatar": "",
+            "chunk_count": 59,
+            "create_date": "Sat, 14 Sep 2024 01:12:37 GMT",
+            "create_time": 1726276357324,
+            "created_by": "69736c5e723611efb51b0242ac120007",
+            "description": null,
+            "document_count": 1,
+            "embedding_model": "BAAI/bge-large-zh-v1.5",
+            "id": "6e211ee0723611efa10a0242ac120007",
+            "language": "English",
+            "name": "mysql",
+            "chunk_method": "naive",
+            "parser_config": {
+                "chunk_token_num": 8192,
+                "delimiter": "\\n",
+                "entity_types": [
+                    "organization",
+                    "person",
+                    "location",
+                    "event",
+                    "time"
+                ]
+            },
+            "permission": "me",
+            "similarity_threshold": 0.2,
+            "status": "1",
+            "tenant_id": "69736c5e723611efb51b0242ac120007",
+            "token_num": 12744,
+            "update_date": "Thu, 10 Oct 2024 04:07:23 GMT",
+            "update_time": 1728533243536,
+            "vector_similarity_weight": 0.3
+        }
+    ],
+    "total": 1
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "The dataset doesn't exist"
+}
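+Filter parameters go in the query string; with the requests library they can be passed via params (a sketch):
+
+import requests
+
+resp = requests.get(
+    "http://{address}/api/v1/datasets",
+    headers={"Authorization": "Bearer <YOUR_API_KEY>"},
+    params={"page": 1, "page_size": 30, "orderby": "create_time", "desc": "true"},
+)
+for ds in resp.json()["data"]:
+    print(ds["id"], ds["name"])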
+Get knowledge graph
+GET /api/v1/datasets/{dataset_id}/knowledge_graph
+
+Retrieves the knowledge graph of a specified dataset.
+
+Request
+Method: GET
+URL: /api/v1/datasets/{dataset_id}/knowledge_graph
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url http://{address}/api/v1/datasets/{dataset_id}/knowledge_graph \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+dataset_id: (Path parameter)
+The ID of the target dataset.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "graph": {
+            "directed": false,
+            "edges": [
+                {
+                    "description": "The notice is a document issued to convey risk warnings and operational alerts.<SEP>The notice is a specific instance of a notification document issued under the risk warning framework.",
+                    "keywords": ["9", "8"],
+                    "source": "notice",
+                    "source_id": ["8a46cdfe4b5c11f0a5281a58e595aa1c"],
+                    "src_id": "xxx",
+                    "target": "xxx",
+                    "tgt_id": "xxx",
+                    "weight": 17.0
+                }
+            ],
+            "graph": {
+                "source_id": ["8a46cdfe4b5c11f0a5281a58e595aa1c", "8a7eb6424b5c11f0a5281a58e595aa1c"]
+            },
+            "multigraph": false,
+            "nodes": [
+                {
+                    "description": "xxx",
+                    "entity_name": "xxx",
+                    "entity_type": "ORGANIZATION",
+                    "id": "xxx",
+                    "pagerank": 0.10804906590624092,
+                    "rank": 3,
+                    "source_id": ["8a7eb6424b5c11f0a5281a58e595aa1c"]
+                }
+            ]
+        },
+        "mind_map": {}
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "The dataset doesn't exist"
+}
+Delete knowledge graph
+DELETE /api/v1/datasets/{dataset_id}/knowledge_graph
+
+Removes the knowledge graph of a specified dataset.
+
+Request
+Method: DELETE
+URL: /api/v1/datasets/{dataset_id}/knowledge_graph
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request DELETE \
+     --url http://{address}/api/v1/datasets/{dataset_id}/knowledge_graph \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+dataset_id: (Path parameter)
+The ID of the target dataset.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": true
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "The dataset doesn't exist"
+}
+Construct knowledge graph
+POST /api/v1/datasets/{dataset_id}/run_graphrag
+
+Constructs a knowledge graph from a specified dataset.
+
+Request
+Method: POST
+URL: /api/v1/datasets/{dataset_id}/run_graphrag
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/datasets/{dataset_id}/run_graphrag \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+dataset_id: (Path parameter)
+The ID of the target dataset.
+Response
+Success:
+
+{
+    "code":0,
+    "data":{
+      "graphrag_task_id":"e498de54bfbb11f0ba028f704583b57b"
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Invalid Dataset ID"
+}
+Get knowledge graph construction status
+GET /api/v1/datasets/{dataset_id}/trace_graphrag
+
+Retrieves the knowledge graph construction status for a specified dataset.
+
+Request
+Method: GET
+URL: /api/v1/datasets/{dataset_id}/trace_graphrag
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url http://{address}/api/v1/datasets/{dataset_id}/trace_graphrag \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+dataset_id: (Path parameter)
+The ID of the target dataset.
+Response
+Success:
+
+{
+    "code":0,
+    "data":{
+        "begin_at":"Wed, 12 Nov 2025 19:36:56 GMT",
+        "chunk_ids":"",
+        "create_date":"Wed, 12 Nov 2025 19:36:56 GMT",
+        "create_time":1762947416350,
+        "digest":"39e43572e3dcd84f",
+        "doc_id":"44661c10bde211f0bc93c164a47ffc40",
+        "from_page":100000000,
+        "id":"e498de54bfbb11f0ba028f704583b57b",
+        "priority":0,
+        "process_duration":2.45419,
+        "progress":1.0,
+        "progress_msg":"19:36:56 created task graphrag\n19:36:57 Task has been received.\n19:36:58 [GraphRAG] doc:083661febe2411f0bc79456921e5745f has no available chunks, skip generation.\n19:36:58 [GraphRAG] build_subgraph doc:44661c10bde211f0bc93c164a47ffc40 start (chunks=1, timeout=10000000000s)\n19:36:58 Graph already contains 44661c10bde211f0bc93c164a47ffc40\n19:36:58 [GraphRAG] build_subgraph doc:44661c10bde211f0bc93c164a47ffc40 empty\n19:36:58 [GraphRAG] kb:33137ed0bde211f0bc93c164a47ffc40 no subgraphs generated successfully, end.\n19:36:58 Knowledge Graph done (0.72s)","retry_count":1,
+        "task_type":"graphrag",
+        "to_page":100000000,
+        "update_date":"Wed, 12 Nov 2025 19:36:58 GMT",
+        "update_time":1762947418454
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Invalid Dataset ID"
+}
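+A typical pattern is to start construction with run_graphrag and then poll trace_graphrag until "progress" reaches 1.0 (a sketch; requests assumed, endpoints as documented above):
+
+import time
+import requests
+
+base = "http://{address}/api/v1/datasets/{dataset_id}"
+headers = {"Authorization": "Bearer <YOUR_API_KEY>"}
+
+requests.post(f"{base}/run_graphrag", headers=headers)
+while True:
+    task = requests.get(f"{base}/trace_graphrag", headers=headers).json()["data"]
+    if task.get("progress", 0.0) >= 1.0:
+        break
+    time.sleep(5)  # poll interval is arbitrary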
+Construct RAPTOR
+POST /api/v1/datasets/{dataset_id}/run_raptor
+
+Construct a RAPTOR from a specified dataset.
+
+Request
+Method: POST
+URL: /api/v1/datasets/{dataset_id}/run_raptor
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/datasets/{dataset_id}/run_raptor \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+dataset_id: (Path parameter)
+The ID of the target dataset.
+Response
+Success:
+
+{
+    "code":0,
+    "data":{
+        "raptor_task_id":"50d3c31cbfbd11f0ba028f704583b57b"
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Invalid Dataset ID"
+}
+Get RAPTOR construction status
+GET /api/v1/datasets/{dataset_id}/trace_raptor
+
+Retrieves the RAPTOR construction status for a specified dataset.
+
+Request
+Method: GET
+URL: /api/v1/datasets/{dataset_id}/trace_raptor
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url http://{address}/api/v1/datasets/{dataset_id}/trace_raptor \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+dataset_id: (Path parameter)
+The ID of the target dataset.
+Response
+Success:
+
+{
+    "code":0,
+    "data":{
+        "begin_at":"Wed, 12 Nov 2025 19:47:07 GMT",
+        "chunk_ids":"",
+        "create_date":"Wed, 12 Nov 2025 19:47:07 GMT",
+        "create_time":1762948027427,
+        "digest":"8b279a6248cb8fc6",
+        "doc_id":"44661c10bde211f0bc93c164a47ffc40",
+        "from_page":100000000,
+        "id":"50d3c31cbfbd11f0ba028f704583b57b",
+        "priority":0,
+        "process_duration":0.948244,
+        "progress":1.0,
+        "progress_msg":"19:47:07 created task raptor\n19:47:07 Task has been received.\n19:47:07 Processing...\n19:47:07 Processing...\n19:47:07 Indexing done (0.01s).\n19:47:07 Task done (0.29s)",
+        "retry_count":1,
+        "task_type":"raptor",
+        "to_page":100000000,
+        "update_date":"Wed, 12 Nov 2025 19:47:07 GMT",
+        "update_time":1762948027948
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Invalid Dataset ID"
+}
+FILE MANAGEMENT WITHIN DATASET
+Upload documents
+POST /api/v1/datasets/{dataset_id}/documents
+
+Uploads documents to a specified dataset.
+
+Request
+Method: POST
+URL: /api/v1/datasets/{dataset_id}/documents
+Headers:
+'Content-Type: multipart/form-data'
+'Authorization: Bearer <YOUR_API_KEY>'
+Form:
+'file=@{FILE_PATH}'
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents \
+     --header 'Content-Type: multipart/form-data' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --form 'file=@./test1.txt' \
+     --form 'file=@./test2.pdf'
+Request parameters
+dataset_id: (Path parameter)
+The ID of the dataset to which the documents will be uploaded.
+'file': (Body parameter)
+A document to upload.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": [
+        {
+            "chunk_method": "naive",
+            "created_by": "69736c5e723611efb51b0242ac120007",
+            "dataset_id": "527fa74891e811ef9c650242ac120006",
+            "id": "b330ec2e91ec11efbc510242ac120004",
+            "location": "1.txt",
+            "name": "1.txt",
+            "parser_config": {
+                "chunk_token_num": 128,
+                "delimiter": "\\n",
+                "html4excel": false,
+                "layout_recognize": true,
+                "raptor": {
+                    "use_raptor": false
+                }
+            },
+            "run": "UNSTART",
+            "size": 17966,
+            "thumbnail": "",
+            "type": "doc"
+        }
+    ]
+}
+Failure:
+
+{
+    "code": 101,
+    "message": "No file part!"
+}
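+The multipart upload from Python (a sketch with requests; the file path is a placeholder):
+
+import requests
+
+with open("./test1.txt", "rb") as f:
+    resp = requests.post(
+        "http://{address}/api/v1/datasets/{dataset_id}/documents",
+        headers={"Authorization": "Bearer <YOUR_API_KEY>"},
+        files=[("file", f)],  # repeat the "file" field to upload several documents
+    )
+print(resp.json())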
+Update document
+PUT /api/v1/datasets/{dataset_id}/documents/{document_id}
+
+Updates configurations for a specified document.
+
+Request
+Method: PUT
+URL: /api/v1/datasets/{dataset_id}/documents/{document_id}
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"name":string
+"meta_fields":object
+"chunk_method":string
+"parser_config":object
+Request example
+curl --request PUT \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id} \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --header 'Content-Type: application/json' \
+     --data '
+     {
+          "name": "manual.txt", 
+          "chunk_method": "manual", 
+          "parser_config": {"chunk_token_num": 128}
+     }'
+
+Request parameters
+dataset_id: (Path parameter)
+The ID of the associated dataset.
+document_id: (Path parameter)
+The ID of the document to update.
+"name": (Body parameter), string
+"meta_fields": (Body parameter), dict[str, Any] The meta fields of the document.
+"chunk_method": (Body parameter), string
+The parsing method to apply to the document:
+"naive": General
+"manual: Manual
+"qa": Q&A
+"table": Table
+"paper": Paper
+"book": Book
+"laws": Laws
+"presentation": Presentation
+"picture": Picture
+"one": One
+"email": Email
+"parser_config": (Body parameter), object
+The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected "chunk_method":
+If "chunk_method" is "naive", the "parser_config" object contains the following attributes:
+"chunk_token_num": Defaults to 256.
+"layout_recognize": Defaults to true.
+"html4excel": Indicates whether to convert Excel documents into HTML format. Defaults to false.
+"delimiter": Defaults to "\n".
+"task_page_size": Defaults to 12. For PDF only.
+"raptor": RAPTOR-specific settings. Defaults to: {"use_raptor": false}.
+If "chunk_method" is "qa", "manuel", "paper", "book", "laws", or "presentation", the "parser_config" object contains the following attribute:
+"raptor": RAPTOR-specific settings. Defaults to: {"use_raptor": false}.
+If "chunk_method" is "table", "picture", "one", or "email", "parser_config" is an empty JSON object.
+"enabled": (Body parameter), integer
+Whether the document should be available in the knowledge base.
+1 → (available)
+0 → (unavailable)
+Response
+Success:
+
+{
+  "code": 0,
+  "data": {
+    "id": "cd38dd72d4a611f0af9c71de94a988ef",
+    "name": "large.md",
+    "type": "doc",
+    "suffix": "md",
+    "size": 2306906,
+    "location": "large.md",
+    "source_type": "local",
+    "status": "1",
+    "run": "DONE",
+    "dataset_id": "5f546a1ad4a611f0af9c71de94a988ef",
+
+    "chunk_method": "naive",
+    "chunk_count": 2,
+    "token_count": 8126,
+
+    "created_by": "eab7f446cb5a11f0ab334fbc3aa38f35",
+    "create_date": "Tue, 09 Dec 2025 10:28:52 GMT",
+    "create_time": 1765247332122,
+    "update_date": "Wed, 17 Dec 2025 10:51:16 GMT",
+    "update_time": 1765939876819,
+
+    "process_begin_at": "Wed, 17 Dec 2025 10:33:55 GMT",
+    "process_duration": 14.8615,
+    "progress": 1.0,
+
+    "progress_msg": [
+      "10:33:58 Task has been received.",
+      "10:33:59 Page(1~100000001): Start to parse.",
+      "10:33:59 Page(1~100000001): Finish parsing.",
+      "10:34:07 Page(1~100000001): Generate 2 chunks",
+      "10:34:09 Page(1~100000001): Embedding chunks (2.13s)",
+      "10:34:09 Page(1~100000001): Indexing done (0.31s).",
+      "10:34:09 Page(1~100000001): Task done (11.68s)"
+    ],
+
+    "parser_config": {
+      "chunk_token_num": 512,
+      "delimiter": "\n",
+      "auto_keywords": 0,
+      "auto_questions": 0,
+      "topn_tags": 3,
+
+      "layout_recognize": "DeepDOC",
+      "html4excel": false,
+      "image_context_size": 0,
+      "table_context_size": 0,
+
+      "graphrag": {
+        "use_graphrag": true,
+        "method": "light",
+        "entity_types": [
+          "organization",
+          "person",
+          "geo",
+          "event",
+          "category"
+        ]
+      },
+
+      "raptor": {
+        "use_raptor": true,
+        "max_cluster": 64,
+        "max_token": 256,
+        "threshold": 0.1,
+        "random_seed": 0,
+        "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize."
+      }
+    },
+
+    "meta_fields": {},
+    "pipeline_id": "",
+    "thumbnail": ""
+  }
+}
+
+Failure:
+
+{
+    "code": 102,
+    "message": "The dataset does not have the document."
+}
+Download document
+GET /api/v1/datasets/{dataset_id}/documents/{document_id}
+
+Downloads a document from a specified dataset.
+
+Request
+Method: GET
+URL: /api/v1/datasets/{dataset_id}/documents/{document_id}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Output:
+'{PATH_TO_THE_FILE}'
+Request example
+curl --request GET \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id} \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --output ./ragflow.txt
+Request parameters
+dataset_id: (Path parameter)
+The associated dataset ID.
+document_id: (Path parameter)
+The ID of the document to download.
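+A minimal Python sketch of the download, assuming the requests library; it streams the raw file bytes to disk, mirroring curl's --output:
+
+import requests
+
+ADDRESS = "http://<address>"      # your RAGFlow server address
+API_KEY = "<YOUR_API_KEY>"
+DATASET_ID = "<dataset_id>"
+DOCUMENT_ID = "<document_id>"
+
+url = f"{ADDRESS}/api/v1/datasets/{DATASET_ID}/documents/{DOCUMENT_ID}"
+with requests.get(url, headers={"Authorization": f"Bearer {API_KEY}"}, stream=True) as resp:
+    resp.raise_for_status()
+    with open("ragflow.txt", "wb") as f:
+        # iterate the body in chunks so large files are not held in memory
+        for chunk in resp.iter_content(chunk_size=8192):
+            f.write(chunk)
+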
+Response
+Success:
+
+This is a test to verify the file download feature.
+Failure:
+
+{
+    "code": 102,
+    "message": "You do not own the dataset 7898da028a0511efbf750242ac1220005."
+}
+List documents
+GET /api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}&metadata_condition={json}
+
+Lists documents in a specified dataset.
+
+Request
+Method: GET
+URL: /api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}&suffix={file_suffix}&run={run_status}
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Request examples
+A basic request with pagination:
+
+curl --request GET \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents?page=1&page_size=10 \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+dataset_id: (Path parameter)
+The associated dataset ID.
+keywords: (Filter parameter), string
+The keywords used to match document titles.
+page: (Filter parameter), integer Specifies the page on which the documents will be displayed. Defaults to 1.
+page_size: (Filter parameter), integer
+The maximum number of documents on each page. Defaults to 30.
+orderby: (Filter parameter), string
+The field by which documents should be sorted. Available options:
+create_time (default)
+update_time
+desc: (Filter parameter), boolean
+Indicates whether the retrieved documents should be sorted in descending order. Defaults to true.
+id: (Filter parameter), string
+The ID of the document to retrieve.
+create_time_from: (Filter parameter), integer
+Unix timestamp for filtering documents created after this time. 0 means no filter. Defaults to 0.
+create_time_to: (Filter parameter), integer
+Unix timestamp for filtering documents created before this time. 0 means no filter. Defaults to 0.
+suffix: (Filter parameter), array[string]
+Filter by file suffix. Supports multiple values, e.g., pdf, txt, and docx. Defaults to all suffixes.
+run: (Filter parameter), array[string]
+Filter by document processing status. Supports numeric, text, and mixed formats:
+Numeric format: ["0", "1", "2", "3", "4"]
+Text format: [UNSTART, RUNNING, CANCEL, DONE, FAIL]
+Mixed format: [UNSTART, 1, DONE] (mixing numeric and text formats)
+Status mapping:
+0 / UNSTART: Document not yet processed
+1 / RUNNING: Document is currently being processed
+2 / CANCEL: Document processing was cancelled
+3 / DONE: Document processing completed successfully
+4 / FAIL: Document processing failed
+Defaults to all statuses.
+metadata_condition: (Filter parameter), object (JSON in query) Optional metadata filter applied to the documents being listed. Uses the same structure as retrieval:
+logic: "and" (default) or "or"
+conditions: array of { "name": string, "comparison_operator": string, "value": string }
+comparison_operator supports: is, not is, contains, not contains, in, not in, start with, end with, >, <, ≥, ≤, empty, not empty
+Usage examples
+A request with multiple filtering parameters
+
+curl --request GET \
+     --url 'http://{address}/api/v1/datasets/{dataset_id}/documents?suffix=pdf&run=DONE&page=1&page_size=10' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Filter by metadata (query JSON):
+
+curl -G \
+  --url "http://localhost:9222/api/v1/datasets/{{KB_ID}}/documents" \
+  --header 'Authorization: Bearer <YOUR_API_KEY>' \
+  --data-urlencode 'metadata_condition={"logic":"and","conditions":[{"name":"tags","comparison_operator":"is","value":"bar"},{"name":"author","comparison_operator":"is","value":"alice"}]}'
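+A Python sketch of the same filtered listing, assuming the requests library; passing metadata_condition through the params argument lets requests URL-encode the JSON string, just as --data-urlencode does for curl:
+
+import json
+import requests
+
+ADDRESS = "http://<address>"
+API_KEY = "<YOUR_API_KEY>"
+DATASET_ID = "<dataset_id>"
+
+params = {
+    "page": 1,
+    "page_size": 10,
+    # requests URL-encodes this JSON string for the query
+    "metadata_condition": json.dumps({
+        "logic": "and",
+        "conditions": [
+            {"name": "tags", "comparison_operator": "is", "value": "bar"},
+            {"name": "author", "comparison_operator": "is", "value": "alice"},
+        ],
+    }),
+}
+resp = requests.get(
+    f"{ADDRESS}/api/v1/datasets/{DATASET_ID}/documents",
+    headers={"Authorization": f"Bearer {API_KEY}"},
+    params=params,
+)
+for doc in resp.json()["data"]["docs"]:
+    print(doc["id"], doc["name"], doc["run"])
+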
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "docs": [
+            {
+                "chunk_count": 0,
+                "create_date": "Mon, 14 Oct 2024 09:11:01 GMT",
+                "create_time": 1728897061948,
+                "created_by": "69736c5e723611efb51b0242ac120007",
+                "id": "3bcfbf8a8a0c11ef8aba0242ac120006",
+                "knowledgebase_id": "7898da028a0511efbf750242ac120005",
+                "location": "Test_2.txt",
+                "name": "Test_2.txt",
+                "parser_config": {
+                    "chunk_token_count": 128,
+                    "delimiter": "\n",
+                    "layout_recognize": true,
+                    "task_page_size": 12
+                },
+                "chunk_method": "naive",
+                "process_begin_at": null,
+                "process_duration": 0.0,
+                "progress": 0.0,
+                "progress_msg": "",
+                "run": "UNSTART",
+                "size": 7,
+                "source_type": "local",
+                "status": "1",
+                "thumbnail": null,
+                "token_count": 0,
+                "type": "doc",
+                "update_date": "Mon, 14 Oct 2024 09:11:01 GMT",
+                "update_time": 1728897061948
+            }
+        ],
+        "total_datasets": 1
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "You don't own the dataset 7898da028a0511efbf750242ac1220005. "
+}
+Delete documents
+DELETE /api/v1/datasets/{dataset_id}/documents
+
+Deletes documents by ID.
+
+Request
+Method: DELETE
+URL: /api/v1/datasets/{dataset_id}/documents
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"ids": list[string]
+Request example
+curl --request DELETE \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "ids": ["id_1","id_2"]
+     }'
+Request parameters
+dataset_id: (Path parameter)
+The associated dataset ID.
+"ids": (Body parameter), list[string]
+The IDs of the documents to delete. If it is not specified, all documents in the specified dataset will be deleted.
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "You do not own the dataset 7898da028a0511efbf750242ac1220005."
+}
+Parse documents
+POST /api/v1/datasets/{dataset_id}/chunks
+
+Parses documents in a specified dataset.
+
+Request
+Method: POST
+URL: /api/v1/datasets/{dataset_id}/chunks
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"document_ids": list[string]
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/datasets/{dataset_id}/chunks \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]
+     }'
+Request parameters
+dataset_id: (Path parameter)
+The dataset ID.
+"document_ids": (Body parameter), list[string], Required
+The IDs of the documents to parse.
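+Parsing runs asynchronously, so a client typically triggers it and then watches the run status. The following Python sketch, assuming the requests library, starts parsing and polls the List documents endpoint until every document leaves the running state; the polling loop is an illustrative pattern, not part of the API, and the placeholders must be replaced with real values:
+
+import time
+import requests
+
+ADDRESS = "http://<address>"
+HEADERS = {"Authorization": "Bearer <YOUR_API_KEY>"}
+DATASET_ID = "<dataset_id>"
+DOC_IDS = ["<document_id_1>", "<document_id_2>"]
+
+# POST /api/v1/datasets/{dataset_id}/chunks starts parsing asynchronously
+requests.post(f"{ADDRESS}/api/v1/datasets/{DATASET_ID}/chunks",
+              headers=HEADERS, json={"document_ids": DOC_IDS})
+
+# Poll the run status (UNSTART/RUNNING/CANCEL/DONE/FAIL) documented above
+while True:
+    resp = requests.get(f"{ADDRESS}/api/v1/datasets/{DATASET_ID}/documents",
+                        headers=HEADERS, params={"page_size": 100})
+    runs = {d["id"]: d["run"] for d in resp.json()["data"]["docs"]}
+    if all(runs.get(i) in ("DONE", "FAIL", "CANCEL") for i in DOC_IDS):
+        break
+    time.sleep(2)
+print(runs)
+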
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "`document_ids` is required"
+}
+Stop parsing documents
+DELETE /api/v1/datasets/{dataset_id}/chunks
+
+Stops parsing specified documents.
+
+Request
+Method: DELETE
+URL: /api/v1/datasets/{dataset_id}/chunks
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"document_ids": list[string]
+Request example
+curl --request DELETE \
+     --url http://{address}/api/v1/datasets/{dataset_id}/chunks \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "document_ids": ["97a5f1c2759811efaa500242ac120004","97ad64b6759811ef9fc30242ac120004"]
+     }'
+Request parameters
+dataset_id: (Path parameter)
+The associated dataset ID.
+"document_ids": (Body parameter), list[string], Required
+The IDs of the documents for which the parsing should be stopped.
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "`document_ids` is required"
+}
+CHUNK MANAGEMENT WITHIN DATASET
+Add chunk
+POST /api/v1/datasets/{dataset_id}/documents/{document_id}/chunks
+
+Adds a chunk to a specified document in a specified dataset.
+
+Request
+Method: POST
+URL: /api/v1/datasets/{dataset_id}/documents/{document_id}/chunks
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"content": string
+"important_keywords": list[string]
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "content": "<CHUNK_CONTENT_HERE>"
+     }'
+Request parameters
+dataset_id: (Path parameter)
+The associated dataset ID.
+document_id: (Path parameter)
+The associated document ID.
+"content": (Body parameter), string, Required
+The text content of the chunk.
+"important_keywords(Body parameter), list[string]
+The key terms or phrases to tag with the chunk.
+"questions"(Body parameter), list[string] If there is a given question, the embedded chunks will be based on them
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "chunk": {
+            "content": "who are you",
+            "create_time": "2024-12-30 16:59:55",
+            "create_timestamp": 1735549195.969164,
+            "dataset_id": "72f36e1ebdf411efb7250242ac120006",
+            "document_id": "61d68474be0111ef98dd0242ac120006",
+            "id": "12ccdc56e59837e5",
+            "important_keywords": [],
+            "questions": []
+        }
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "`content` is required"
+}
+List chunks
+GET /api/v1/datasets/{dataset_id}/documents/{document_id}/chunks?keywords={keywords}&page={page}&page_size={page_size}&id={id}
+
+Lists chunks in a specified document.
+
+Request
+Method: GET
+URL: /api/v1/datasets/{dataset_id}/documents/{document_id}/chunks?keywords={keywords}&page={page}&page_size={page_size}&id={chunk_id}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks?keywords={keywords}&page={page}&page_size={page_size}&id={chunk_id} \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' 
+Request parameters
+dataset_id: (Path parameter)
+The associated dataset ID.
+document_id: (Path parameter)
+The associated document ID.
+keywords: (Filter parameter), string
+The keywords used to match chunk content.
+page: (Filter parameter), integer
+Specifies the page on which the chunks will be displayed. Defaults to 1.
+page_size: (Filter parameter), integer
+The maximum number of chunks on each page. Defaults to 1024.
+id: (Filter parameter), string
+The ID of the chunk to retrieve.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "chunks": [
+            {
+                "available": true,
+                "content": "This is a test content.",
+                "docnm_kwd": "1.txt",
+                "document_id": "b330ec2e91ec11efbc510242ac120004",
+                "id": "b48c170e90f70af998485c1065490726",
+                "image_id": "",
+                "important_keywords": "",
+                "positions": [
+                    ""
+                ]
+            }
+        ],
+        "doc": {
+            "chunk_count": 1,
+            "chunk_method": "naive",
+            "create_date": "Thu, 24 Oct 2024 09:45:27 GMT",
+            "create_time": 1729763127646,
+            "created_by": "69736c5e723611efb51b0242ac120007",
+            "dataset_id": "527fa74891e811ef9c650242ac120006",
+            "id": "b330ec2e91ec11efbc510242ac120004",
+            "location": "1.txt",
+            "name": "1.txt",
+            "parser_config": {
+                "chunk_token_num": 128,
+                "delimiter": "\\n",
+                "html4excel": false,
+                "layout_recognize": true,
+                "raptor": {
+                    "use_raptor": false
+                }
+            },
+            "process_begin_at": "Thu, 24 Oct 2024 09:56:44 GMT",
+            "process_duration": 0.54213,
+            "progress": 0.0,
+            "progress_msg": "Task dispatched...",
+            "run": "2",
+            "size": 17966,
+            "source_type": "local",
+            "status": "1",
+            "thumbnail": "",
+            "token_count": 8,
+            "type": "doc",
+            "update_date": "Thu, 24 Oct 2024 11:03:15 GMT",
+            "update_time": 1729767795721
+        },
+        "total": 1
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "You don't own the document 5c5999ec7be811ef9cab0242ac12000e5."
+}
+Delete chunks
+DELETE /api/v1/datasets/{dataset_id}/documents/{document_id}/chunks
+
+Deletes chunks by ID.
+
+Request
+Method: DELETE
+URL: /api/v1/datasets/{dataset_id}/documents/{document_id}/chunks
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"chunk_ids": list[string]
+Request example
+curl --request DELETE \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "chunk_ids": ["test_1", "test_2"]
+     }'
+Request parameters
+dataset_id: (Path parameter)
+The associated dataset ID.
+document_id: (Path parameter)
+The associated document ID.
+"chunk_ids": (Body parameter), list[string]
+The IDs of the chunks to delete. If it is not specified, all chunks of the specified document will be deleted.
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "`chunk_ids` is required"
+}
+Update chunk
+PUT /api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id}
+
+Updates content or configurations for a specified chunk.
+
+Request
+Method: PUT
+URL: /api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id}
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"content": string
+"important_keywords": list[string]
+"available": boolean
+Request example
+curl --request PUT \
+     --url http://{address}/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks/{chunk_id} \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {   
+          "content": "ragflow123",  
+          "important_keywords": []  
+     }'
+Request parameters
+dataset_id: (Path parameter)
+The associated dataset ID.
+document_id: (Path parameter)
+The associated document ID.
+chunk_id: (Path parameter)
+The ID of the chunk to update.
+"content": (Body parameter), string
+The text content of the chunk.
+"important_keywords": (Body parameter), list[string]
+A list of key terms or phrases to tag with the chunk.
+"available": (Body parameter) boolean
+The chunk's availability status in the dataset. Value options:
+true: Available (default)
+false: Unavailable
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Can't find this chunk 29a2d9987e16ba331fb4d7d30d99b71d2"
+}
+Retrieve a metadata summary from a dataset
+GET /api/v1/datasets/{dataset_id}/metadata/summary
+
+Aggregates metadata values across all documents in a dataset.
+
+Request
+Method: GET
+URL: /api/v1/datasets/{dataset_id}/metadata/summary
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Response
+Success:
+
+{
+  "code": 0,
+  "data": {
+    "summary": {
+      "tags": [["bar", 2], ["foo", 1], ["baz", 1]],
+      "author": [["alice", 2], ["bob", 1]]
+    }
+  }
+}
+Update or delete metadata
+POST /api/v1/datasets/{dataset_id}/metadata/update
+
+Batch-updates or deletes document-level metadata within a specified dataset. If both document_ids and metadata_condition are omitted, all documents in the dataset are selected. When both are provided, the intersection is used.
+
+Request
+Method: POST
+URL: /api/v1/datasets/{dataset_id}/metadata/update
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+selector: object
+updates: list[object]
+deletes: list[object]
+Request parameters
+dataset_id: (Path parameter)
+The associated dataset ID.
+"selector": (Body parameter), object, optional
+A document selector:
+"document_ids": list[string] optional
+The associated document ID.
+"metadata_condition": object, optional
+"logic": Defines the logic relation between conditions if multiple conditions are provided. Options:
+"and" (default)
+"or"
+"conditions": list[object] optional
+Each object: { "name": string, "comparison_operator": string, "value": string }
+"name": string The key name to search by.
+"comparison_operator": string Available options:
+"is"
+"not is"
+"contains"
+"not contains"
+"in"
+"not in"
+"start with"
+"end with"
+">"
+"<"
+"≥"
+"≤"
+"empty"
+"not empty"
+"value": string The key value to search by.
+"updates": (Body parameter), list[object], optional
+Replaces metadata of the retrieved documents. Each object: { "key": string, "match": string, "value": string }.
+"key": string The name of the key to update.
+"match": string optional The current value of the key to update. When omitted, the corresponding keys are updated to "value" regardless of their current values.
+"value": string The new value to set for the specified keys.
+"deletes: (Body parameter), list[ojbect], optional
+Deletes metadata of the retrieved documents. Each object: { "key": string, "value": string }.
+"key": string The name of the key to delete.
+"value": string Optional The value of the key to delete.
+When provided, only keys with a matching value are deleted.
+When omitted, all specified keys are deleted.
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/datasets/{dataset_id}/metadata/update \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+       "selector": {
+         "metadata_condition": {
+           "logic": "and",
+           "conditions": [
+             {"name": "author", "comparison_operator": "is", "value": "alice"}
+           ]
+         }
+       },
+       "updates": [
+         {"key": "tags", "match": "foo", "value": "foo_new"}
+       ],
+       "deletes": [
+         {"key": "obsolete_key"},
+         {"key": "author", "value": "alice"}
+       ]
+     }'
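+The equivalent call in Python, as a sketch assuming the requests library; it selects alice's documents, renames the "foo" tag, deletes the listed keys, and reads the counters returned in data:
+
+import requests
+
+ADDRESS = "http://<address>"
+HEADERS = {"Authorization": "Bearer <YOUR_API_KEY>"}
+DATASET_ID = "<dataset_id>"
+
+payload = {
+    "selector": {
+        "metadata_condition": {
+            "logic": "and",
+            "conditions": [
+                {"name": "author", "comparison_operator": "is", "value": "alice"},
+            ],
+        }
+    },
+    "updates": [{"key": "tags", "match": "foo", "value": "foo_new"}],
+    "deletes": [{"key": "obsolete_key"}, {"key": "author", "value": "alice"}],
+}
+resp = requests.post(f"{ADDRESS}/api/v1/datasets/{DATASET_ID}/metadata/update",
+                     headers=HEADERS, json=payload)
+data = resp.json()["data"]
+print(data["updated"], data["matched_docs"])
+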
+Response
+Success:
+
+{
+  "code": 0,
+  "data": {
+    "updated": 1,
+    "matched_docs": 2
+  }
+}
+Retrieve chunks
+POST /api/v1/retrieval
+
+Retrieves chunks from specified datasets.
+
+Request
+Method: POST
+URL: /api/v1/retrieval
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"question": string
+"dataset_ids": list[string]
+"document_ids": list[string]
+"page": integer
+"page_size": integer
+"similarity_threshold": float
+"vector_similarity_weight": float
+"top_k": integer
+"rerank_id": string
+"keyword": boolean
+"highlight": boolean
+"cross_languages": list[string]
+"metadata_condition": object
+"use_kg": boolean
+"toc_enhance": boolean
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/retrieval \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "question": "What is advantage of ragflow?",
+          "dataset_ids": ["b2a62730759d11ef987d0242ac120004"],
+          "document_ids": ["77df9ef4759a11ef8bdd0242ac120004"],
+          "metadata_condition": {
+            "logic": "and",
+            "conditions": [
+              {
+                "name": "author",
+                "comparison_operator": "=",
+                "value": "Toby"
+              },
+              {
+                "name": "url",
+                "comparison_operator": "not contains",
+                "value": "amd"
+              }
+            ]
+          }
+     }'
+Request parameters
+"question": (Body parameter), string, Required
+The user query or query keywords.
+"dataset_ids": (Body parameter) list[string]
+The IDs of the datasets to search. If you do not set this argument, ensure that you set "document_ids".
+"document_ids": (Body parameter), list[string]
+The IDs of the documents to search. Ensure that all selected documents use the same embedding model. Otherwise, an error will occur. If you do not set this argument, ensure that you set "dataset_ids".
+"page": (Body parameter), integer
+Specifies the page on which the chunks will be displayed. Defaults to 1.
+"page_size": (Body parameter)
+The maximum number of chunks on each page. Defaults to 30.
+"similarity_threshold": (Body parameter)
+The minimum similarity score. Defaults to 0.2.
+"vector_similarity_weight": (Body parameter), float
+The weight of vector cosine similarity. Defaults to 0.3. If x represents the weight of vector cosine similarity, then (1 - x) is the term similarity weight.
+"top_k": (Body parameter), integer
+The number of chunks engaged in vector cosine computation. Defaults to 1024.
+"use_kg": (Body parameter), boolean
+Whether to search chunks related to the generated knowledge graph for multi-hop queries. Defaults to False. Before enabling this, ensure you have successfully constructed a knowledge graph for the specified datasets. See here for details.
+"toc_enhance": (Body parameter), boolean
+Whether to search chunks using the extracted table of contents. Defaults to False. Before enabling this, ensure you have enabled TOC_Enhance and successfully extracted tables of contents for the specified datasets. See here for details.
+"rerank_id": (Body parameter), integer
+The ID of the rerank model.
+"keyword": (Body parameter), boolean
+Indicates whether to enable keyword-based matching:
+true: Enable keyword-based matching.
+false: Disable keyword-based matching (default).
+"highlight": (Body parameter), boolean
+Specifies whether to enable highlighting of matched terms in the results:
+true: Enable highlighting of matched terms.
+false: Disable highlighting of matched terms (default).
+"cross_languages": (Body parameter) list[string]
+The target languages the query should be translated into, so that keyword retrieval can be performed across languages.
+"metadata_condition": (Body parameter), object
+The metadata condition used for filtering chunks:
+"logic": (Body parameter), string
+"and": Return only results that satisfy every condition (default).
+"or": Return results that satisfy any condition.
+"conditions": (Body parameter), array
+A list of metadata filter conditions.
+"name": string - The metadata field name to filter by, e.g., "author", "company", "url". Ensure this parameter before use. See Set metadata for details.
+comparison_operator: string - The comparison operator. Can be one of:
+"contains"
+"not contains"
+"start with"
+"empty"
+"not empty"
+"="
+"≠"
+">"
+"<"
+"≥"
+"≤"
+"value": string - The value to compare.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "chunks": [
+            {
+                "content": "ragflow content",
+                "content_ltks": "ragflow content",
+                "document_id": "5c5999ec7be811ef9cab0242ac120005",
+                "document_keyword": "1.txt",
+                "highlight": "<em>ragflow</em> content",
+                "id": "d78435d142bd5cf6704da62c778795c5",
+                "image_id": "",
+                "important_keywords": [
+                    ""
+                ],
+                "kb_id": "c7ee74067a2c11efb21c0242ac120006",
+                "positions": [
+                    ""
+                ],
+                "similarity": 0.9669436601210759,
+                "term_similarity": 1.0,
+                "vector_similarity": 0.8898122004035864
+            }
+        ],
+        "doc_aggs": [
+            {
+                "count": 1,
+                "doc_id": "5c5999ec7be811ef9cab0242ac120005",
+                "doc_name": "1.txt"
+            }
+        ],
+        "total": 1
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "`datasets` is required."
+}
+CHAT ASSISTANT MANAGEMENT
+Create chat assistant
+POST /api/v1/chats
+
+Creates a chat assistant.
+
+Request
+Method: POST
+URL: /api/v1/chats
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"name": string
+"avatar": string
+"dataset_ids": list[string]
+"llm": object
+"prompt": object
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/chats \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+    "dataset_ids": ["0b2cbc8c877f11ef89070242ac120005"],
+    "name":"new_chat_1"
+}'
+Request parameters
+"name": (Body parameter), string, Required
+The name of the chat assistant.
+
+"avatar": (Body parameter), string
+Base64 encoding of the avatar.
+
+"dataset_ids": (Body parameter), list[string]
+The IDs of the associated datasets.
+
+"llm": (Body parameter), object
+The LLM settings for the chat assistant to create. If it is not explicitly set, a JSON object with the following values will be generated as the default. An llm JSON object contains the following attributes:
+
+"model_name", string
+The chat model name. If not set, the user's default chat model will be used.
+:::caution WARNING
+model_type is an internal parameter, serving solely as a temporary workaround for the current model-configuration design limitations.
+
+Its main purpose is to let multimodal models (stored in the database as "image2text") pass backend validation/dispatching. Be mindful that:
+
+Do not treat it as a stable public API.
+
+It is subject to change or removal in future releases.
+:::
+
+"model_type": string
+A model type specifier. Only "chat" and "image2text" are recognized; any other value, or an omitted field, is treated as "chat".
+
+"model_name", string
+
+"temperature": float
+Controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses. Defaults to 0.1.
+
+"top_p": float
+Also known as “nucleus sampling”, this parameter sets a threshold to select a smaller set of words to sample from. It focuses on the most likely words, cutting off the less probable ones. Defaults to 0.3.
+
+"presence_penalty": float
+This discourages the model from repeating the same information by penalizing words that have already appeared in the conversation. Defaults to 0.4.
+
+"frequency penalty": float
+Similar to the presence penalty, this reduces the model’s tendency to repeat the same words frequently. Defaults to 0.7.
+
+"prompt": (Body parameter), object
+Instructions for the LLM to follow. If it is not explicitly set, a JSON object with the following values will be generated as the default. A prompt JSON object contains the following attributes:
+
+"similarity_threshold": float RAGFlow employs either a combination of weighted keyword similarity and weighted vector cosine similarity, or a combination of weighted keyword similarity and weighted reranking score during retrieval. This argument sets the threshold for similarities between the user query and chunks. If a similarity score falls below this threshold, the corresponding chunk will be excluded from the results. The default value is 0.2.
+"keywords_similarity_weight": float This argument sets the weight of keyword similarity in the hybrid similarity score with vector cosine similarity or reranking model similarity. By adjusting this weight, you can control the influence of keyword similarity in relation to other similarity measures. The default value is 0.7.
+"top_n": int This argument specifies the number of top chunks with similarity scores above the similarity_threshold that are fed to the LLM. The LLM will only access these 'top N' chunks. The default value is 6.
+"variables": object[] This argument lists the variables to use in the 'System' field of Chat Configurations. Note that:
+"knowledge" is a reserved variable, which represents the retrieved chunks.
+All the variables in 'System' should be curly bracketed.
+The default value is [{"key": "knowledge", "optional": true}].
+"rerank_model": string If it is not specified, vector cosine similarity will be used; otherwise, reranking score will be used.
+"top_k": int The number of top items to keep when reordering or selecting candidates based on the ranking criterion. Defaults to 1024.
+"empty_response": string If nothing is retrieved in the dataset for the user's question, this will be used as the response. To allow the LLM to improvise when nothing is found, leave this blank.
+"opener": string The opening greeting for the user. Defaults to "Hi! I am your assistant, can I help you?".
+"show_quote: boolean Indicates whether the source of text should be displayed. Defaults to true.
+"prompt": string The prompt content.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "avatar": "",
+        "create_date": "Thu, 24 Oct 2024 11:18:29 GMT",
+        "create_time": 1729768709023,
+        "dataset_ids": [
+            "527fa74891e811ef9c650242ac120006"
+        ],
+        "description": "A helpful Assistant",
+        "do_refer": "1",
+        "id": "b1f2f15691f911ef81180242ac120003",
+        "language": "English",
+        "llm": {
+            "frequency_penalty": 0.7,
+            "model_name": "qwen-plus@Tongyi-Qianwen",
+            "presence_penalty": 0.4,
+            "temperature": 0.1,
+            "top_p": 0.3
+        },
+        "name": "12234",
+        "prompt": {
+            "empty_response": "Sorry! No relevant content was found in the knowledge base!",
+            "keywords_similarity_weight": 0.3,
+            "opener": "Hi! I'm your assistant. What can I do for you?",
+            "prompt": "You are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\n ",
+            "rerank_model": "",
+            "similarity_threshold": 0.2,
+            "top_n": 6,
+            "variables": [
+                {
+                    "key": "knowledge",
+                    "optional": false
+                }
+            ]
+        },
+        "prompt_type": "simple",
+        "status": "1",
+        "tenant_id": "69736c5e723611efb51b0242ac120007",
+        "top_k": 1024,
+        "update_date": "Thu, 24 Oct 2024 11:18:29 GMT",
+        "update_time": 1729768709023
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Duplicated chat name in creating dataset."
+}
+Update chat assistant
+PUT /api/v1/chats/{chat_id}
+
+Updates configurations for a specified chat assistant.
+
+Request
+Method: PUT
+URL: /api/v1/chats/{chat_id}
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"name": string
+"avatar": string
+"dataset_ids": list[string]
+"llm": object
+"prompt": object
+Request example
+curl --request PUT \
+     --url http://{address}/api/v1/chats/{chat_id} \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "name":"Test"
+     }'
+Parameters
+chat_id: (Path parameter)
+The ID of the chat assistant to update.
+"name": (Body parameter), string, Required
+The revised name of the chat assistant.
+"avatar": (Body parameter), string
+Base64 encoding of the avatar.
+"dataset_ids": (Body parameter), list[string]
+The IDs of the associated datasets.
+"llm": (Body parameter), object
+The LLM settings for the chat assistant to update. If it is not explicitly set, a dictionary with the following values will be generated as the default. An llm object contains the following attributes:
+"model_name", string
+The chat model name. If not set, the user's default chat model will be used.
+"temperature": float
+Controls the randomness of the model's predictions. A lower temperature results in more conservative responses, while a higher temperature yields more creative and diverse responses. Defaults to 0.1.
+"top_p": float
+Also known as “nucleus sampling”, this parameter sets a threshold to select a smaller set of words to sample from. It focuses on the most likely words, cutting off the less probable ones. Defaults to 0.3.
+"presence_penalty": float
+This discourages the model from repeating the same information by penalizing words that have already appeared in the conversation. Defaults to 0.2.
+"frequency penalty": float
+Similar to the presence penalty, this reduces the model’s tendency to repeat the same words frequently. Defaults to 0.7.
+"prompt": (Body parameter), object
+Instructions for the LLM to follow. A prompt object contains the following attributes:
+"similarity_threshold": float RAGFlow employs either a combination of weighted keyword similarity and weighted vector cosine similarity, or a combination of weighted keyword similarity and weighted rerank score during retrieval. This argument sets the threshold for similarities between the user query and chunks. If a similarity score falls below this threshold, the corresponding chunk will be excluded from the results. The default value is 0.2.
+"keywords_similarity_weight": float This argument sets the weight of keyword similarity in the hybrid similarity score with vector cosine similarity or reranking model similarity. By adjusting this weight, you can control the influence of keyword similarity in relation to other similarity measures. The default value is 0.7.
+"top_n": int This argument specifies the number of top chunks with similarity scores above the similarity_threshold that are fed to the LLM. The LLM will only access these 'top N' chunks. The default value is 8.
+"variables": object[] This argument lists the variables to use in the 'System' field of Chat Configurations. Note that:
+"knowledge" is a reserved variable, which represents the retrieved chunks.
+All the variables in 'System' should be curly bracketed.
+The default value is [{"key": "knowledge", "optional": true}]
+"rerank_model": string If it is not specified, vector cosine similarity will be used; otherwise, reranking score will be used.
+"empty_response": string If nothing is retrieved in the dataset for the user's question, this will be used as the response. To allow the LLM to improvise when nothing is found, leave this blank.
+"opener": string The opening greeting for the user. Defaults to "Hi! I am your assistant, can I help you?".
+"show_quote: boolean Indicates whether the source of text should be displayed. Defaults to true.
+"prompt": string The prompt content.
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Duplicated chat name in updating dataset."
+}
+Delete chat assistants
+DELETE /api/v1/chats
+
+Deletes chat assistants by ID.
+
+Request
+Method: DELETE
+URL: /api/v1/chats
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"ids": list[string]
+Request example
+curl --request DELETE \
+     --url http://{address}/api/v1/chats \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "ids": ["test_1", "test_2"]
+     }'
+Request parameters
+"ids": (Body parameter), list[string]
+The IDs of the chat assistants to delete. If it is not specified, all chat assistants in the system will be deleted.
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "ids are required"
+}
+List chat assistants
+GET /api/v1/chats?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={chat_name}&id={chat_id}
+
+Lists chat assistants.
+
+Request
+Method: GET
+URL: /api/v1/chats?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={chat_name}&id={chat_id}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url http://{address}/api/v1/chats?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={chat_name}&id={chat_id} \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+page: (Filter parameter), integer
+Specifies the page on which the chat assistants will be displayed. Defaults to 1.
+page_size: (Filter parameter), integer
+The number of chat assistants on each page. Defaults to 30.
+orderby: (Filter parameter), string
+The attribute by which the results are sorted. Available options:
+create_time (default)
+update_time
+desc: (Filter parameter), boolean
+Indicates whether the retrieved chat assistants should be sorted in descending order. Defaults to true.
+id: (Filter parameter), string
+The ID of the chat assistant to retrieve.
+name: (Filter parameter), string
+The name of the chat assistant to retrieve.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": [
+        {
+            "avatar": "",
+            "create_date": "Fri, 18 Oct 2024 06:20:06 GMT",
+            "create_time": 1729232406637,
+            "description": "A helpful Assistant",
+            "do_refer": "1",
+            "id": "04d0d8e28d1911efa3630242ac120006",
+            "dataset_ids": ["527fa74891e811ef9c650242ac120006"],
+            "language": "English",
+            "llm": {
+                "frequency_penalty": 0.7,
+                "model_name": "qwen-plus@Tongyi-Qianwen",
+                "presence_penalty": 0.4,
+                "temperature": 0.1,
+                "top_p": 0.3
+            },
+            "name": "13243",
+            "prompt": {
+                "empty_response": "Sorry! No relevant content was found in the knowledge base!",
+                "keywords_similarity_weight": 0.3,
+                "opener": "Hi! I'm your assistant. What can I do for you?",
+                "prompt": "You are an intelligent assistant. Please summarize the content of the knowledge base to answer the question. Please list the data in the knowledge base and answer in detail. When all knowledge base content is irrelevant to the question, your answer must include the sentence \"The answer you are looking for is not found in the knowledge base!\" Answers need to consider chat history.\n",
+                "rerank_model": "",
+                "similarity_threshold": 0.2,
+                "top_n": 6,
+                "variables": [
+                    {
+                        "key": "knowledge",
+                        "optional": false
+                    }
+                ]
+            },
+            "prompt_type": "simple",
+            "status": "1",
+            "tenant_id": "69736c5e723611efb51b0242ac120007",
+            "top_k": 1024,
+            "update_date": "Fri, 18 Oct 2024 06:20:06 GMT",
+            "update_time": 1729232406638
+        }
+    ]
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "The chat doesn't exist"
+}
+SESSION MANAGEMENT
+Create session with chat assistant
+POST /api/v1/chats/{chat_id}/sessions
+
+Creates a session with a chat assistant.
+
+Request
+Method: POST
+URL: /api/v1/chats/{chat_id}/sessions
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"name": string
+"user_id": string (optional)
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/chats/{chat_id}/sessions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "name": "new session"
+     }'
+Request parameters
+chat_id: (Path parameter)
+The ID of the associated chat assistant.
+"name": (Body parameter), string
+The name of the chat session to create.
+"user_id": (Body parameter), string
+Optional user-defined ID.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "chat_id": "2ca4b22e878011ef88fe0242ac120005",
+        "create_date": "Fri, 11 Oct 2024 08:46:14 GMT",
+        "create_time": 1728636374571,
+        "id": "4606b4ec87ad11efbc4f0242ac120006",
+        "messages": [
+            {
+                "content": "Hi! I am your assistant, can I help you?",
+                "role": "assistant"
+            }
+        ],
+        "name": "new session",
+        "update_date": "Fri, 11 Oct 2024 08:46:14 GMT",
+        "update_time": 1728636374571
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Name cannot be empty."
+}
+Update chat assistant's session
+PUT /api/v1/chats/{chat_id}/sessions/{session_id}
+
+Updates a session of a specified chat assistant.
+
+Request
+Method: PUT
+URL: /api/v1/chats/{chat_id}/sessions/{session_id}
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"name: string
+"user_id: string (optional)
+Request example
+curl --request PUT \
+     --url http://{address}/api/v1/chats/{chat_id}/sessions/{session_id} \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "name": "<REVISED_SESSION_NAME_HERE>"
+     }'
+Request Parameters
+chat_id: (Path parameter)
+The ID of the associated chat assistant.
+session_id: (Path parameter)
+The ID of the session to update.
+"name": (Body Parameter), string
+The revised name of the session.
+"user_id": (Body parameter), string
+Optional user-defined ID.
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Name cannot be empty."
+}
+List chat assistant's sessions
+GET /api/v1/chats/{chat_id}/sessions?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={session_name}&id={session_id}
+
+Lists sessions associated with a specified chat assistant.
+
+Request
+Method: GET
+URL: /api/v1/chats/{chat_id}/sessions?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={session_name}&id={session_id}&user_id={user_id}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url http://{address}/api/v1/chats/{chat_id}/sessions?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&name={session_name}&id={session_id} \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request Parameters
+chat_id: (Path parameter)
+The ID of the associated chat assistant.
+page: (Filter parameter), integer
+Specifies the page on which the sessions will be displayed. Defaults to 1.
+page_size: (Filter parameter), integer
+The number of sessions on each page. Defaults to 30.
+orderby: (Filter parameter), string
+The field by which sessions should be sorted. Available options:
+create_time (default)
+update_time
+desc: (Filter parameter), boolean
+Indicates whether the retrieved sessions should be sorted in descending order. Defaults to true.
+name: (Filter parameter) string
+The name of the chat session to retrieve.
+id: (Filter parameter), string
+The ID of the chat session to retrieve.
+user_id: (Filter parameter), string
+The optional user-defined ID passed in when creating a session.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": [
+        {
+            "chat": "2ca4b22e878011ef88fe0242ac120005",
+            "create_date": "Fri, 11 Oct 2024 08:46:43 GMT",
+            "create_time": 1728636403974,
+            "id": "578d541e87ad11ef96b90242ac120006",
+            "messages": [
+                {
+                    "content": "Hi! I am your assistant, can I help you?",
+                    "role": "assistant"
+                }
+            ],
+            "name": "new session",
+            "update_date": "Fri, 11 Oct 2024 08:46:43 GMT",
+            "update_time": 1728636403974
+        }
+    ]
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "The session doesn't exist"
+}
+Delete chat assistant's sessions
+DELETE /api/v1/chats/{chat_id}/sessions
+
+Deletes sessions of a chat assistant by ID.
+
+Request
+Method: DELETE
+URL: /api/v1/chats/{chat_id}/sessions
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"ids": list[string]
+Request example
+curl --request DELETE \
+     --url http://{address}/api/v1/chats/{chat_id}/sessions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "ids": ["test_1", "test_2"]
+     }'
+Request Parameters
+chat_id: (Path parameter)
+The ID of the associated chat assistant.
+"ids": (Body Parameter), list[string]
+The IDs of the sessions to delete. If it is not specified, all sessions associated with the specified chat assistant will be deleted.
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "The chat doesn't own the session"
+}
+Converse with chat assistant
+POST /api/v1/chats/{chat_id}/completions
+
+Asks a specified chat assistant a question to start an AI-powered conversation.
+
+:::tip NOTE
+
+In streaming mode, not all responses include a reference, as this depends on the system's judgement.
+
+In streaming mode, the last message is an empty message:
+
+data:
+{
+  "code": 0,
+  "data": true
+}
+:::
+
+Request
+Method: POST
+URL: /api/v1/chats/{chat_id}/completions
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"question": string
+"stream": boolean
+"session_id": string (optional)
+"user_id: string (optional)
+"metadata_condition": object (optional)
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/chats/{chat_id}/completions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data-binary '
+     {
+     }'
+curl --request POST \
+     --url http://{address}/api/v1/chats/{chat_id}/completions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data-binary '
+     {
+          "question": "Who are you",
+          "stream": true,
+          "session_id":"9fa7691cb85c11ef9c5f0242ac120005",
+          "metadata_condition": {
+            "logic": "and",
+            "conditions": [
+              {
+                "name": "author",
+                "comparison_operator": "is",
+                "value": "bob"
+              }
+            ]
+          }
+     }'
+Request Parameters
+chat_id: (Path parameter)
+The ID of the associated chat assistant.
+"question": (Body Parameter), string, Required
+The question to start an AI-powered conversation.
+"stream": (Body Parameter), boolean
+Indicates whether to output responses in a streaming way:
+true: Enable streaming (default).
+false: Disable streaming.
+"session_id": (Body Parameter)
+The ID of session. If it is not provided, a new session will be generated.
+"user_id": (Body parameter), string
+The optional user-defined ID. Valid only when no session_id is provided.
+"metadata_condition": (Body parameter), object
+Optional metadata filter conditions applied to retrieval results.
+logic: string, one of and / or
+conditions: list[object] where each condition contains:
+name: string metadata key
+comparison_operator: string (e.g. is, not is, contains, not contains, start with, end with, empty, not empty, >, <, ≥, ≤)
+value: string|number|boolean (optional for empty/not empty)
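+A Python sketch of consuming the streaming response, assuming the requests library; as documented in the note above, each streamed line is prefixed with "data:", the answer field grows cumulatively from message to message, and a final message carrying "data": true closes the stream. The chat ID is a placeholder:
+
+import json
+import requests
+
+ADDRESS = "http://<address>"
+HEADERS = {"Authorization": "Bearer <YOUR_API_KEY>"}
+CHAT_ID = "<chat_id>"
+
+with requests.post(
+    f"{ADDRESS}/api/v1/chats/{CHAT_ID}/completions",
+    headers=HEADERS,
+    json={"question": "Who are you", "stream": True},
+    stream=True,
+) as resp:
+    for line in resp.iter_lines(decode_unicode=True):
+        if not line or not line.startswith("data:"):
+            continue
+        payload = json.loads(line[len("data:"):])
+        data = payload.get("data")
+        if data is True:               # empty terminal message ends the stream
+            break
+        if isinstance(data, dict):
+            print(data["answer"])      # cumulative answer so far
+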
+Response
+Success without session_id:
+
+data:{
+    "code": 0,
+    "message": "",
+    "data": {
+        "answer": "Hi! I'm your assistant. What can I do for you?",
+        "reference": {},
+        "audio_binary": null,
+        "id": null,
+        "session_id": "b01eed84b85611efa0e90242ac120005"
+    }
+}
+data:{
+    "code": 0,
+    "message": "",
+    "data": true
+}
+Success with session_id:
+
+data:{
+    "code": 0,
+    "data": {
+        "answer": "I am an intelligent assistant designed to help answer questions by summarizing content from a",
+        "reference": {},
+        "audio_binary": null,
+        "id": "a84c5dd4-97b4-4624-8c3b-974012c8000d",
+        "session_id": "82b0ab2a9c1911ef9d870242ac120006"
+    }
+}
+data:{
+    "code": 0,
+    "data": {
+        "answer": "I am an intelligent assistant designed to help answer questions by summarizing content from a knowledge base. My responses are based on the information available in the knowledge base and",
+        "reference": {},
+        "audio_binary": null,
+        "id": "a84c5dd4-97b4-4624-8c3b-974012c8000d",
+        "session_id": "82b0ab2a9c1911ef9d870242ac120006"
+    }
+}
+data:{
+    "code": 0,
+    "data": {
+        "answer": "I am an intelligent assistant designed to help answer questions by summarizing content from a knowledge base. My responses are based on the information available in the knowledge base and any relevant chat history.",
+        "reference": {},
+        "audio_binary": null,
+        "id": "a84c5dd4-97b4-4624-8c3b-974012c8000d",
+        "session_id": "82b0ab2a9c1911ef9d870242ac120006"
+    }
+}
+data:{
+    "code": 0,
+    "data": {
+        "answer": "I am an intelligent assistant designed to help answer questions by summarizing content from a knowledge base ##0$$. My responses are based on the information available in the knowledge base and any relevant chat history.",
+        "reference": {
+            "total": 1,
+            "chunks": [
+                {
+                    "id": "faf26c791128f2d5e821f822671063bd",
+                    "content": "xxxxxxxx",
+                    "document_id": "dd58f58e888511ef89c90242ac120006",
+                    "document_name": "1.txt",
+                    "dataset_id": "8e83e57a884611ef9d760242ac120006",
+                    "image_id": "",
+                    "url": null,
+                    "similarity": 0.7,
+                    "vector_similarity": 0.0,
+                    "term_similarity": 1.0,
+                    "doc_type": [],
+                    "positions": [
+                        ""
+                    ]
+                }
+            ],
+            "doc_aggs": [
+                {
+                    "doc_name": "1.txt",
+                    "doc_id": "dd58f58e888511ef89c90242ac120006",
+                    "count": 1
+                }
+            ]
+        },
+        "prompt": "xxxxxxxxxxx",
+        "created_at": 1755055623.6401553,
+        "id": "a84c5dd4-97b4-4624-8c3b-974012c8000d",
+        "session_id": "82b0ab2a9c1911ef9d870242ac120006"
+    }
+}
+data:{
+    "code": 0,
+    "data": true
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Please input your question."
+}
+Create session with agent
+:::danger DEPRECATED
+This method is deprecated and not recommended. You can still call it, but be mindful that calling Converse with agent will automatically generate a session ID for the associated agent.
+:::
+
+POST /api/v1/agents/{agent_id}/sessions
+
+Creates a session with an agent.
+
+Request
+Method: POST
+URL: /api/v1/agents/{agent_id}/sessions?user_id={user_id}
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+The required parameters: string
+Other parameters: the variables specified in the Begin component.
+Request example
+If the Begin component in your agent does not take required parameters:
+
+curl --request POST \
+     --url http://{address}/api/v1/agents/{agent_id}/sessions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+     }'
+Request parameters
+agent_id: (Path parameter)
+The ID of the associated agent.
+user_id: (Filter parameter)
+The optional user-defined ID for parsing docs (especially images) when creating a session while uploading files.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "agent_id": "dbb4ed366e8611f09690a55a6daec4ef",
+        "dsl": {
+            "components": {
+                "Message:EightyJobsAsk": {
+                    "downstream": [],
+                    "obj": {
+                        "component_name": "Message",
+                        "params": {
+                            "content": [
+                                "{begin@var1}{begin@var2}"
+                            ],
+                            "debug_inputs": {},
+                            "delay_after_error": 2.0,
+                            "description": "",
+                            "exception_default_value": null,
+                            "exception_goto": null,
+                            "exception_method": null,
+                            "inputs": {},
+                            "max_retries": 0,
+                            "message_history_window_size": 22,
+                            "outputs": {
+                                "content": {
+                                    "type": "str",
+                                    "value": null
+                                }
+                            },
+                            "stream": true
+                        }
+                    },
+                    "upstream": [
+                        "begin"
+                    ]
+                },
+                "begin": {
+                    "downstream": [
+                        "Message:EightyJobsAsk"
+                    ],
+                    "obj": {
+                        "component_name": "Begin",
+                        "params": {
+                            "debug_inputs": {},
+                            "delay_after_error": 2.0,
+                            "description": "",
+                            "enablePrologue": true,
+                            "enable_tips": true,
+                            "exception_default_value": null,
+                            "exception_goto": null,
+                            "exception_method": null,
+                            "inputs": {
+                                "var1": {
+                                    "name": "var1",
+                                    "optional": false,
+                                    "options": [],
+                                    "type": "line",
+                                    "value": null
+                                },
+                                "var2": {
+                                    "name": "var2",
+                                    "optional": false,
+                                    "options": [],
+                                    "type": "line",
+                                    "value": null
+                                }
+                            },
+                            "max_retries": 0,
+                            "message_history_window_size": 22,
+                            "mode": "conversational",
+                            "outputs": {},
+                            "prologue": "Hi! I'm your assistant. What can I do for you?",
+                            "tips": "Please fill in the form"
+                        }
+                    },
+                    "upstream": []
+                }
+            },
+            "globals": {
+                "sys.conversation_turns": 0,
+                "sys.files": [],
+                "sys.query": "",
+                "sys.user_id": ""
+            },
+            "graph": {
+                "edges": [
+                    {
+                        "data": {
+                            "isHovered": false
+                        },
+                        "id": "xy-edge__beginstart-Message:EightyJobsAskend",
+                        "markerEnd": "logo",
+                        "source": "begin",
+                        "sourceHandle": "start",
+                        "style": {
+                            "stroke": "rgba(151, 154, 171, 1)",
+                            "strokeWidth": 1
+                        },
+                        "target": "Message:EightyJobsAsk",
+                        "targetHandle": "end",
+                        "type": "buttonEdge",
+                        "zIndex": 1001
+                    }
+                ],
+                "nodes": [
+                    {
+                        "data": {
+                            "form": {
+                                "enablePrologue": true,
+                                "inputs": {
+                                    "var1": {
+                                        "name": "var1",
+                                        "optional": false,
+                                        "options": [],
+                                        "type": "line"
+                                    },
+                                    "var2": {
+                                        "name": "var2",
+                                        "optional": false,
+                                        "options": [],
+                                        "type": "line"
+                                    }
+                                },
+                                "mode": "conversational",
+                                "prologue": "Hi! I'm your assistant. What can I do for you?"
+                            },
+                            "label": "Begin",
+                            "name": "begin"
+                        },
+                        "dragging": false,
+                        "id": "begin",
+                        "measured": {
+                            "height": 112,
+                            "width": 200
+                        },
+                        "position": {
+                            "x": 270.64098070942583,
+                            "y": -56.320928437811176
+                        },
+                        "selected": false,
+                        "sourcePosition": "left",
+                        "targetPosition": "right",
+                        "type": "beginNode"
+                    },
+                    {
+                        "data": {
+                            "form": {
+                                "content": [
+                                    "{begin@var1}{begin@var2}"
+                                ]
+                            },
+                            "label": "Message",
+                            "name": "Message_0"
+                        },
+                        "dragging": false,
+                        "id": "Message:EightyJobsAsk",
+                        "measured": {
+                            "height": 57,
+                            "width": 200
+                        },
+                        "position": {
+                            "x": 279.5,
+                            "y": 190
+                        },
+                        "selected": true,
+                        "sourcePosition": "right",
+                        "targetPosition": "left",
+                        "type": "messageNode"
+                    }
+                ]
+            },
+            "history": [],
+            "memory": [],
+            "messages": [],
+            "path": [],
+            "retrieval": [],
+            "task_id": "dbb4ed366e8611f09690a55a6daec4ef"
+        },
+        "id": "0b02fe80780e11f084adcfdc3ed1d902",
+        "message": [
+            {
+                "content": "Hi! I'm your assistant. What can I do for you?",
+                "role": "assistant"
+            }
+        ],
+        "source": "agent",
+        "user_id": "c3fb861af27a11efa69751e139332ced"
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Agent not found."
+}
+Converse with agent
+POST /api/v1/agents/{agent_id}/completions
+
+Asks a specified agent a question to start an AI-powered conversation.
+
+:::tip NOTE
+
+In streaming mode, not all responses include a reference, as this depends on the system's judgement.
+
+In streaming mode, the last message is an empty message:
+
+[DONE]
+You can optionally return step-by-step trace logs (see return_trace below).
+
+:::
+
+Request
+Method: POST
+URL: /api/v1/agents/{agent_id}/completions
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"question": string
+"stream": boolean
+"session_id": string (optional)
+"inputs": object (optional)
+"user_id": string (optional)
+"return_trace": boolean (optional, default false) — include execution trace logs.
+Streaming events to handle
+When stream=true, the server sends Server-Sent Events (SSE). Clients should handle these event types:
+
+message: streaming content from Message components.
+message_end: end of a Message component; may include reference/attachment.
+node_finished: a component finishes; data.inputs/outputs/error/elapsed_time describe the node result. If return_trace=true, the trace is attached inside the same node_finished event (data.trace).
+The stream terminates with [DONE].
+
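+A minimal Python client sketch for consuming this stream, assuming the requests library and that each data: line carries one complete JSON event:
+
+import json
+import requests
+
+ADDRESS = "http://<address>"
+AGENT_ID = "<agent_id>"
+API_KEY = "<YOUR_API_KEY>"
+
+resp = requests.post(
+    f"{ADDRESS}/api/v1/agents/{AGENT_ID}/completions",
+    headers={"Authorization": f"Bearer {API_KEY}"},
+    json={"question": "Hello", "stream": True, "return_trace": True},
+    stream=True,  # keep the HTTP connection open for SSE
+)
+for raw in resp.iter_lines(decode_unicode=True):
+    if not raw or not raw.startswith("data:"):
+        continue                          # skip blank keep-alive lines
+    payload = raw[len("data:"):].strip()
+    if payload == "[DONE]":               # stream terminator
+        break
+    event = json.loads(payload)
+    if event["event"] == "message":
+        print(event["data"]["content"], end="", flush=True)
+    elif event["event"] == "message_end":
+        refs = event["data"].get("reference")   # not always present
+        if refs:
+            print("\nreferenced docs:", list(refs.get("doc_aggs", {})))
+    elif event["event"] == "node_finished":
+        d = event["data"]
+        print(f"\n[{d['component_name']}] elapsed {d['elapsed_time']}s")
+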
+:::info IMPORTANT You can include custom parameters in the request body, but first ensure they are defined in the Begin component. :::
+
+Request example
+If the Begin component does not take parameters:
+curl --request POST \
+     --url http://{address}/api/v1/agents/{agent_id}/completions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data-binary '
+     {
+        "question": "Hello",
+        "stream": false
+     }'
+If the Begin component takes parameters, include their values in the "inputs" object of the request body:
+curl --request POST \
+     --url http://{address}/api/v1/agents/{agent_id}/completions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data-binary '
+    {
+        "question": "Hello",
+        "stream": false,
+        "inputs": {
+            "line_var": {
+                "type": "line",
+                "value": "I am line_var"
+            },
+            "int_var": {
+                "type": "integer",
+                "value": 1
+            },
+            "paragraph_var": {
+                "type": "paragraph",
+                "value": "a\nb\nc"
+            },
+            "option_var": {
+                "type": "options",
+                "value": "option 2"
+            },
+            "boolean_var": {
+                "type": "boolean",
+                "value": true
+            }
+        }
+    }'
+To continue the conversation in an existing session, include its session_id:
+
+curl --request POST \
+     --url http://{address}/api/v1/agents/{agent_id}/completions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data-binary '
+     {
+          "question": "Hello",
+          "stream": true,
+          "session_id": "cb2f385cb86211efa36e0242ac120005"
+     }'
+Request Parameters
+agent_id: (Path parameter), string
+The ID of the associated agent.
+"question": (Body Parameter), string, Required
+The question to start an AI-powered conversation.
+"stream": (Body Parameter), boolean
+Indicates whether to output responses in a streaming way:
+true: Enable streaming (default).
+false: Disable streaming.
+"session_id": (Body Parameter)
+The ID of the session. If it is not provided, a new session will be generated.
+"inputs": (Body Parameter)
+Variables specified in the Begin component.
+"user_id": (Body parameter), string
+The optional user-defined ID. Valid only when no session_id is provided.
+:::tip NOTE For now, this method does not support a file type input/variable. As a workaround, use the following to upload a file to an agent:
+http://{address}/v1/canvas/upload/{agent_id}
+You will get a corresponding file ID from its response body. :::
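+A hedged sketch of this workaround, assuming the endpoint accepts a standard multipart upload with a field named file (verify the field name against your deployment):
+
+import requests
+
+ADDRESS = "http://<address>"
+AGENT_ID = "<agent_id>"
+API_KEY = "<YOUR_API_KEY>"
+
+# Assumption: the multipart field is named "file".
+with open("./report.pdf", "rb") as f:
+    resp = requests.post(
+        f"{ADDRESS}/v1/canvas/upload/{AGENT_ID}",
+        headers={"Authorization": f"Bearer {API_KEY}"},
+        files={"file": f},
+    )
+print(resp.json())  # the response body carries the file ID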
+
+Response
+Success without session_id provided and with no variables specified in the Begin component:
+
+Stream:
+
+...
+
+data: {
+    "event": "message",
+    "message_id": "cecdcb0e83dc11f0858253708ecb6573",
+    "created_at": 1756364483,
+    "task_id": "d1f79142831f11f09cc51795b9eb07c0",
+    "data": {
+        "content": " themes"
+    },
+    "session_id": "cd097ca083dc11f0858253708ecb6573"
+}
+
+data: {
+    "event": "message",
+    "message_id": "cecdcb0e83dc11f0858253708ecb6573",
+    "created_at": 1756364483,
+    "task_id": "d1f79142831f11f09cc51795b9eb07c0",
+    "data": {
+        "content": "."
+    },
+    "session_id": "cd097ca083dc11f0858253708ecb6573"
+}
+
+data: {
+    "event": "message_end",
+    "message_id": "cecdcb0e83dc11f0858253708ecb6573",
+    "created_at": 1756364483,
+    "task_id": "d1f79142831f11f09cc51795b9eb07c0",
+    "data": {
+        "reference": {
+            "chunks": {
+                "20": {
+                    "id": "4b8935ac0a22deb1",
+                    "content": "```cd /usr/ports/editors/neovim/ && make install```## Android[Termux](https://github.com/termux/termux-app) offers a Neovim package.",
+                    "document_id": "4bdd2ff65e1511f0907f09f583941b45",
+                    "document_name": "INSTALL22.md",
+                    "dataset_id": "456ce60c5e1511f0907f09f583941b45",
+                    "image_id": "",
+                    "positions": [
+                        [
+                            12,
+                            11,
+                            11,
+                            11,
+                            11
+                        ]
+                    ],
+                    "url": null,
+                    "similarity": 0.5705525104787287,
+                    "vector_similarity": 0.7351750337624289,
+                    "term_similarity": 0.5000000005,
+                    "doc_type": ""
+                }
+            },
+            "doc_aggs": {
+                "INSTALL22.md": {
+                    "doc_name": "INSTALL22.md",
+                    "doc_id": "4bdd2ff65e1511f0907f09f583941b45",
+                    "count": 3
+                },
+                "INSTALL.md": {
+                    "doc_name": "INSTALL.md",
+                    "doc_id": "4bd7fdd85e1511f0907f09f583941b45",
+                    "count": 2
+                },
+                "INSTALL(1).md": {
+                    "doc_name": "INSTALL(1).md",
+                    "doc_id": "4bdfb42e5e1511f0907f09f583941b45",
+                    "count": 2
+                },
+                "INSTALL3.md": {
+                    "doc_name": "INSTALL3.md",
+                    "doc_id": "4bdab5825e1511f0907f09f583941b45",
+                    "count": 1
+                }
+            }
+        }
+    },
+    "session_id": "cd097ca083dc11f0858253708ecb6573"
+}
+
+data: {
+    "event": "node_finished",
+    "message_id": "cecdcb0e83dc11f0858253708ecb6573",
+    "created_at": 1756364483,
+    "task_id": "d1f79142831f11f09cc51795b9eb07c0",
+    "data": {
+        "inputs": {
+            "sys.query": "how to install neovim?"
+        },
+        "outputs": {
+            "content": "xxxxxxx",
+            "_created_time": 15294.0382,
+            "_elapsed_time": 0.00017
+        },
+        "component_id": "Agent:EveryHairsChew",
+        "component_name": "Agent_1",
+        "component_type": "Agent",
+        "error": null,
+        "elapsed_time": 11.2091,
+        "created_at": 15294.0382,
+        "trace": [
+            {
+                "component_id": "begin",
+                "trace": [
+                    {
+                        "inputs": {},
+                        "outputs": {
+                            "_created_time": 15257.7949,
+                            "_elapsed_time": 0.00070
+                        },
+                        "component_id": "begin",
+                        "component_name": "begin",
+                        "component_type": "Begin",
+                        "error": null,
+                        "elapsed_time": 0.00085,
+                        "created_at": 15257.7949
+                    }
+                ]
+            },
+            {
+                "component_id": "Agent:WeakDragonsRead",
+                "trace": [
+                    {
+                        "inputs": {
+                            "sys.query": "how to install neovim?"
+                        },
+                        "outputs": {
+                            "content": "xxxxxxx",
+                            "_created_time": 15257.7982,
+                            "_elapsed_time": 36.2382
+                        },
+                        "component_id": "Agent:WeakDragonsRead",
+                        "component_name": "Agent_0",
+                        "component_type": "Agent",
+                        "error": null,
+                        "elapsed_time": 36.2385,
+                        "created_at": 15257.7982
+                    }
+                ]
+            },
+            {
+                "component_id": "Agent:EveryHairsChew",
+                "trace": [
+                    {
+                        "inputs": {
+                            "sys.query": "how to install neovim?"
+                        },
+                        "outputs": {
+                            "content": "xxxxxxxxxxxxxxxxx",
+                            "_created_time": 15294.0382,
+                            "_elapsed_time": 0.00017
+                        },
+                        "component_id": "Agent:EveryHairsChew",
+                        "component_name": "Agent_1",
+                        "component_type": "Agent",
+                        "error": null,
+                        "elapsed_time": 11.2091,
+                        "created_at": 15294.0382
+                    }
+                ]
+            }
+        ]
+    },
+    "session_id": "cd097ca083dc11f0858253708ecb6573"
+}
+
+data:[DONE]
+Non-stream:
+
+{
+    "code": 0,
+    "data": {
+        "created_at": 1756363177,
+        "data": {
+            "content": "\nTo install Neovim, the process varies depending on your operating system:\n\n### For macOS:\nUsing Homebrew:\n```bash\nbrew install neovim\n```\n\n### For Linux (Debian/Ubuntu):\n```bash\nsudo apt update\nsudo apt install neovim\n```\n\nFor other Linux distributions, you can use their respective package managers or build from source.\n\n### For Windows:\n1. Download the latest Windows installer from the official Neovim GitHub releases page\n2. Run the installer and follow the prompts\n3. Add Neovim to your PATH if not done automatically\n\n### From source (Unix-like systems):\n```bash\ngit clone https://github.com/neovim/neovim.git\ncd neovim\nmake CMAKE_BUILD_TYPE=Release\nsudo make install\n```\n\nAfter installation, you can verify it by running `nvim --version` in your terminal.",
+            "created_at": 18129.044975627,
+            "elapsed_time": 10.0157331670016,
+            "inputs": {
+                "var1": {
+                    "value": "I am var1"
+                },
+                "var2": {
+                    "value": "I am var2"
+                }
+            },
+            "outputs": {
+                "_created_time": 18129.502422278,
+                "_elapsed_time": 0.00013378599760471843,
+                "content": "\nTo install Neovim, the process varies depending on your operating system:\n\n### For macOS:\nUsing Homebrew:\n```bash\nbrew install neovim\n```\n\n### For Linux (Debian/Ubuntu):\n```bash\nsudo apt update\nsudo apt install neovim\n```\n\nFor other Linux distributions, you can use their respective package managers or build from source.\n\n### For Windows:\n1. Download the latest Windows installer from the official Neovim GitHub releases page\n2. Run the installer and follow the prompts\n3. Add Neovim to your PATH if not done automatically\n\n### From source (Unix-like systems):\n```bash\ngit clone https://github.com/neovim/neovim.git\ncd neovim\nmake CMAKE_BUILD_TYPE=Release\nsudo make install\n```\n\nAfter installation, you can verify it by running `nvim --version` in your terminal."
+            },
+            "reference": {
+                "chunks": {
+                    "20": {
+                        "content": "```cd /usr/ports/editors/neovim/ && make install```## Android[Termux](https://github.com/termux/termux-app) offers a Neovim package.",
+                        "dataset_id": "456ce60c5e1511f0907f09f583941b45",
+                        "doc_type": "",
+                        "document_id": "4bdd2ff65e1511f0907f09f583941b45",
+                        "document_name": "INSTALL22.md",
+                        "id": "4b8935ac0a22deb1",
+                        "image_id": "",
+                        "positions": [
+                            [
+                                12,
+                                11,
+                                11,
+                                11,
+                                11
+                            ]
+                        ],
+                        "similarity": 0.5705525104787287,
+                        "term_similarity": 0.5000000005,
+                        "url": null,
+                        "vector_similarity": 0.7351750337624289
+                    }
+                },
+                "doc_aggs": {
+                    "INSTALL(1).md": {
+                        "count": 2,
+                        "doc_id": "4bdfb42e5e1511f0907f09f583941b45",
+                        "doc_name": "INSTALL(1).md"
+                    },
+                    "INSTALL.md": {
+                        "count": 2,
+                        "doc_id": "4bd7fdd85e1511f0907f09f583941b45",
+                        "doc_name": "INSTALL.md"
+                    },
+                    "INSTALL22.md": {
+                        "count": 3,
+                        "doc_id": "4bdd2ff65e1511f0907f09f583941b45",
+                        "doc_name": "INSTALL22.md"
+                    },
+                    "INSTALL3.md": {
+                        "count": 1,
+                        "doc_id": "4bdab5825e1511f0907f09f583941b45",
+                        "doc_name": "INSTALL3.md"
+                    }
+                }
+            },
+            "trace": [
+                {
+                    "component_id": "begin",
+                    "trace": [
+                        {
+                            "component_id": "begin",
+                            "component_name": "begin",
+                            "component_type": "Begin",
+                            "created_at": 15926.567517862,
+                            "elapsed_time": 0.0008189299987861887,
+                            "error": null,
+                            "inputs": {},
+                            "outputs": {
+                                "_created_time": 15926.567517862,
+                                "_elapsed_time": 0.0006958619997021742
+                            }
+                        }
+                    ]
+                },
+                {
+                    "component_id": "Agent:WeakDragonsRead",
+                    "trace": [
+                        {
+                            "component_id": "Agent:WeakDragonsRead",
+                            "component_name": "Agent_0",
+                            "component_type": "Agent",
+                            "created_at": 15926.569121755,
+                            "elapsed_time": 53.49016142000073,
+                            "error": null,
+                            "inputs": {
+                                "sys.query": "how to install neovim?"
+                            },
+                            "outputs": {
+                                "_created_time": 15926.569121755,
+                                "_elapsed_time": 53.489981256001556,
+                                "content": "xxxxxxxxxxxxxx",
+                                "use_tools": [
+                                    {
+                                        "arguments": {
+                                            "query": "xxxx"
+                                        },
+                                        "name": "search_my_dateset",
+                                        "results": "xxxxxxxxxxx"
+                                    }
+                                ]
+                            }
+                        }
+                    ]
+                },
+                {
+                    "component_id": "Agent:EveryHairsChew",
+                    "trace": [
+                        {
+                            "component_id": "Agent:EveryHairsChew",
+                            "component_name": "Agent_1",
+                            "component_type": "Agent",
+                            "created_at": 15980.060569101,
+                            "elapsed_time": 23.61718057500002,
+                            "error": null,
+                            "inputs": {
+                                "sys.query": "how to install neovim?"
+                            },
+                            "outputs": {
+                                "_created_time": 15980.060569101,
+                                "_elapsed_time": 0.0003451630000199657,
+                                "content": "xxxxxxxxxxxx"
+                            }
+                        }
+                    ]
+                },
+                {
+                    "component_id": "Message:SlickDingosHappen",
+                    "trace": [
+                        {
+                            "component_id": "Message:SlickDingosHappen",
+                            "component_name": "Message_0",
+                            "component_type": "Message",
+                            "created_at": 15980.061302513,
+                            "elapsed_time": 23.61655923699982,
+                            "error": null,
+                            "inputs": {
+                                "Agent:EveryHairsChew@content": "xxxxxxxxx",
+                                "Agent:WeakDragonsRead@content": "xxxxxxxxxxx"
+                            },
+                            "outputs": {
+                                "_created_time": 15980.061302513,
+                                "_elapsed_time": 0.0006695749998471001,
+                                "content": "xxxxxxxxxxx"
+                            }
+                        }
+                    ]
+                }
+            ]
+        },
+        "event": "workflow_finished",
+        "message_id": "c4692a2683d911f0858253708ecb6573",
+        "session_id": "c39f6f9c83d911f0858253708ecb6573",
+        "task_id": "d1f79142831f11f09cc51795b9eb07c0"
+    }
+}
+Success without session_id provided and with variables specified in the Begin component:
+
+Stream:
+
+data:{
+    "event": "message",
+    "message_id": "0e273472783711f0806e1a6272e682d8",
+    "created_at": 1755083830,
+    "task_id": "99ee29d6783511f09c921a6272e682d8",
+    "data": {
+        "content": "Hello"
+    },
+    "session_id": "0e0d1542783711f0806e1a6272e682d8"
+}
+
+data:{
+    "event": "message",
+    "message_id": "0e273472783711f0806e1a6272e682d8",
+    "created_at": 1755083830,
+    "task_id": "99ee29d6783511f09c921a6272e682d8",
+    "data": {
+        "content": "!"
+    },
+    "session_id": "0e0d1542783711f0806e1a6272e682d8"
+}
+
+data:{
+    "event": "message",
+    "message_id": "0e273472783711f0806e1a6272e682d8",
+    "created_at": 1755083830,
+    "task_id": "99ee29d6783511f09c921a6272e682d8",
+    "data": {
+        "content": " How"
+    },
+    "session_id": "0e0d1542783711f0806e1a6272e682d8"
+}
+
+...
+
+data:[DONE]
+Non-stream:
+
+{
+    "code": 0,
+    "data": {
+        "created_at": 1755083779,
+        "data": {
+            "created_at": 547400.868004651,
+            "elapsed_time": 3.5037803899031132,
+            "inputs": {
+                "boolean_var": {
+                    "type": "boolean",
+                    "value": true
+                },
+                "int_var": {
+                    "type": "integer",
+                    "value": 1
+                },
+                "line_var": {
+                    "type": "line",
+                    "value": "I am line_var"
+                },
+                "option_var": {
+                    "type": "options",
+                    "value": "option 2"
+                },
+                "paragraph_var": {
+                    "type": "paragraph",
+                    "value": "a\nb\nc"
+                }
+            },
+            "outputs": {
+                "_created_time": 547400.869271305,
+                "_elapsed_time": 0.0001251999055966735,
+                "content": "Hello there! How can I assist you today?"
+            }
+        },
+        "event": "workflow_finished",
+        "message_id": "effdad8c783611f089261a6272e682d8",
+        "session_id": "efe523b6783611f089261a6272e682d8",
+        "task_id": "99ee29d6783511f09c921a6272e682d8"
+    }
+}
+Success with session_id provided:
+
+Stream:
+
+data:{
+    "event": "message",
+    "message_id": "5b62e790783711f0bc531a6272e682d8",
+    "created_at": 1755083960,
+    "task_id": "99ee29d6783511f09c921a6272e682d8",
+    "data": {
+        "content": "Hello"
+    },
+    "session_id": "979e450c781d11f095cb729e3aa55728"
+}
+
+data:{
+    "event": "message",
+    "message_id": "5b62e790783711f0bc531a6272e682d8",
+    "created_at": 1755083960,
+    "task_id": "99ee29d6783511f09c921a6272e682d8",
+    "data": {
+        "content": "!"
+    },
+    "session_id": "979e450c781d11f095cb729e3aa55728"
+}
+
+data:{
+    "event": "message",
+    "message_id": "5b62e790783711f0bc531a6272e682d8",
+    "created_at": 1755083960,
+    "task_id": "99ee29d6783511f09c921a6272e682d8",
+    "data": {
+        "content": " You"
+    },
+    "session_id": "979e450c781d11f095cb729e3aa55728"
+}
+
+...
+
+data:[DONE]
+Non-stream:
+
+{
+    "code": 0,
+    "data": {
+        "created_at": 1755084029,
+        "data": {
+            "created_at": 547650.750818867,
+            "elapsed_time": 1.6227330720284954,
+            "inputs": {},
+            "outputs": {
+                "_created_time": 547650.752800839,
+                "_elapsed_time": 9.628792759031057e-05,
+                "content": "Hello! It appears you've sent another \"Hello\" without additional context. I'm here and ready to respond to any requests or questions you may have. Is there something specific you'd like to discuss or learn about?"
+            }
+        },
+        "event": "workflow_finished",
+        "message_id": "84eec534783711f08db41a6272e682d8",
+        "session_id": "979e450c781d11f095cb729e3aa55728",
+        "task_id": "99ee29d6783511f09c921a6272e682d8"
+    }
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "`question` is required."
+}
+List agent sessions
+GET /api/v1/agents/{agent_id}/sessions?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&id={session_id}&user_id={user_id}&dsl={dsl}
+
+Lists sessions associated with a specified agent.
+
+Request
+Method: GET
+URL: /api/v1/agents/{agent_id}/sessions?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&id={session_id}&user_id={user_id}&dsl={dsl}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url 'http://{address}/api/v1/agents/{agent_id}/sessions?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&id={session_id}&user_id={user_id}' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request Parameters
+agent_id: (Path parameter)
+The ID of the associated agent.
+page: (Filter parameter), integer
+Specifies the page on which the sessions will be displayed. Defaults to 1.
+page_size: (Filter parameter), integer
+The number of sessions on each page. Defaults to 30.
+orderby: (Filter parameter), string
+The field by which sessions should be sorted. Available options:
+create_time (default)
+update_time
+desc: (Filter parameter), boolean
+Indicates whether the retrieved sessions should be sorted in descending order. Defaults to true.
+id: (Filter parameter), string
+The ID of the agent session to retrieve.
+user_id: (Filter parameter), string
+The optional user-defined ID passed in when creating the session.
+dsl: (Filter parameter), boolean
+Indicates whether to include the dsl field of the sessions in the response. Defaults to true (see the sketch at the end of this section).
+Response
+Success:
+
+{
+    "code": 0,
+    "data": [{
+        "agent_id": "e9e2b9c2b2f911ef801d0242ac120006",
+        "dsl": {
+            "answer": [],
+            "components": {
+                "Answer:OrangeTermsBurn": {
+                    "downstream": [],
+                    "obj": {
+                        "component_name": "Answer",
+                        "params": {}
+                    },
+                    "upstream": []
+                },
+                "Generate:SocialYearsRemain": {
+                    "downstream": [],
+                    "obj": {
+                        "component_name": "Generate",
+                        "params": {
+                            "cite": true,
+                            "frequency_penalty": 0.7,
+                            "llm_id": "gpt-4o___OpenAI-API@OpenAI-API-Compatible",
+                            "message_history_window_size": 12,
+                            "parameters": [],
+                            "presence_penalty": 0.4,
+                            "prompt": "Please summarize the following paragraph. Pay attention to the numbers and do not make things up. The paragraph is as follows:\n{input}\nThis is what you need to summarize.",
+                            "temperature": 0.1,
+                            "top_p": 0.3
+                        }
+                    },
+                    "upstream": []
+                },
+                "begin": {
+                    "downstream": [],
+                    "obj": {
+                        "component_name": "Begin",
+                        "params": {}
+                    },
+                    "upstream": []
+                }
+            },
+            "graph": {
+                "edges": [],
+                "nodes": [
+                    {
+                        "data": {
+                            "label": "Begin",
+                            "name": "begin"
+                        },
+                        "height": 44,
+                        "id": "begin",
+                        "position": {
+                            "x": 50,
+                            "y": 200
+                        },
+                        "sourcePosition": "left",
+                        "targetPosition": "right",
+                        "type": "beginNode",
+                        "width": 200
+                    },
+                    {
+                        "data": {
+                            "form": {
+                                "cite": true,
+                                "frequencyPenaltyEnabled": true,
+                                "frequency_penalty": 0.7,
+                                "llm_id": "gpt-4o___OpenAI-API@OpenAI-API-Compatible",
+                                "maxTokensEnabled": true,
+                                "message_history_window_size": 12,
+                                "parameters": [],
+                                "presencePenaltyEnabled": true,
+                                "presence_penalty": 0.4,
+                                "prompt": "Please summarize the following paragraph. Pay attention to the numbers and do not make things up. The paragraph is as follows:\n{input}\nThis is what you need to summarize.",
+                                "temperature": 0.1,
+                                "temperatureEnabled": true,
+                                "topPEnabled": true,
+                                "top_p": 0.3
+                            },
+                            "label": "Generate",
+                            "name": "Generate Answer_0"
+                        },
+                        "dragging": false,
+                        "height": 105,
+                        "id": "Generate:SocialYearsRemain",
+                        "position": {
+                            "x": 561.3457829707513,
+                            "y": 178.7211182312641
+                        },
+                        "positionAbsolute": {
+                            "x": 561.3457829707513,
+                            "y": 178.7211182312641
+                        },
+                        "selected": true,
+                        "sourcePosition": "right",
+                        "targetPosition": "left",
+                        "type": "generateNode",
+                        "width": 200
+                    },
+                    {
+                        "data": {
+                            "form": {},
+                            "label": "Answer",
+                            "name": "Dialogue_0"
+                        },
+                        "height": 44,
+                        "id": "Answer:OrangeTermsBurn",
+                        "position": {
+                            "x": 317.2368194777658,
+                            "y": 218.30635555445093
+                        },
+                        "sourcePosition": "right",
+                        "targetPosition": "left",
+                        "type": "logicNode",
+                        "width": 200
+                    }
+                ]
+            },
+            "history": [],
+            "messages": [],
+            "path": [],
+            "reference": []
+        },
+        "id": "792dde22b2fa11ef97550242ac120006",
+        "message": [
+            {
+                "content": "Hi! I'm your smart assistant. What can I do for you?",
+                "role": "assistant"
+            }
+        ],
+        "source": "agent",
+        "user_id": ""
+    }]
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "You don't own the agent ccd2f856b12311ef94ca0242ac1200052."
+}
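+Because the dsl field can dominate the payload, a common pattern is to list sessions with dsl=false and fetch the full graph only when needed. A minimal sketch, assuming the requests library:
+
+import requests
+
+ADDRESS = "http://<address>"
+AGENT_ID = "<agent_id>"
+API_KEY = "<YOUR_API_KEY>"
+
+resp = requests.get(
+    f"{ADDRESS}/api/v1/agents/{AGENT_ID}/sessions",
+    headers={"Authorization": f"Bearer {API_KEY}"},
+    params={"page": 1, "page_size": 30, "orderby": "update_time",
+            "desc": "true", "dsl": "false"},  # omit the heavy dsl field
+)
+for session in resp.json().get("data", []):
+    print(session["id"], session.get("user_id", ""))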
+Delete agent's sessions
+DELETE /api/v1/agents/{agent_id}/sessions
+
+Deletes sessions of an agent by ID.
+
+Request
+Method: DELETE
+URL: /api/v1/agents/{agent_id}/sessions
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"ids": list[string]
+Request example
+curl --request DELETE \
+     --url http://{address}/api/v1/agents/{agent_id}/sessions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '
+     {
+          "ids": ["test_1", "test_2"]
+     }'
+Request Parameters
+agent_id: (Path parameter)
+The ID of the associated agent.
+"ids": (Body Parameter), list[string]
+The IDs of the sessions to delete. If it is not specified, all sessions associated with the specified agent will be deleted.
+Response
+Success:
+
+{
+    "code": 0
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "The agent doesn't own the session cbd31e52f73911ef93b232903b842af6"
+}
+Generate related questions
+POST /api/v1/sessions/related_questions
+
+Generates five to ten alternative question strings from the user's original query to retrieve more relevant search results.
+
+This operation requires a Bearer Login Token, which typically expires within 24 hours. You can find it in the request headers of any authenticated request in your browser's developer tools.
+
+:::tip NOTE The chat model autonomously determines the number of questions to generate based on the instruction, typically between five and ten. :::
+
+Request
+Method: POST
+URL: /api/v1/sessions/related_questions
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_LOGIN_TOKEN>'
+Body:
+"question": string
+"industry": string
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/sessions/related_questions \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_LOGIN_TOKEN>' \
+     --data '
+     {
+          "question": "What are the key advantages of Neovim over Vim?",
+          "industry": "software_development"
+     }'
+Request Parameters
+"question": (Body Parameter), string
+The original user question.
+"industry": (Body Parameter), string
+The industry context of the question.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": [
+        "What makes Neovim superior to Vim in terms of features?",
+        "How do the benefits of Neovim compare to those of Vim?",
+        "What advantages does Neovim offer that are not present in Vim?",
+        "In what ways does Neovim outperform Vim in functionality?",
+        "What are the most significant improvements in Neovim compared to Vim?",
+        "What unique advantages does Neovim bring to the table over Vim?",
+        "How does the user experience in Neovim differ from Vim in terms of benefits?",
+        "What are the top reasons to switch from Vim to Neovim?",
+        "What features of Neovim are considered more advanced than those in Vim?"
+    ],
+    "message": "success"
+}
+Failure:
+
+{
+    "code": 401,
+    "data": null,
+    "message": "<Unauthorized '401: Unauthorized'>"
+}
+AGENT MANAGEMENT
+List agents
+GET /api/v1/agents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&title={agent_name}&id={agent_id}
+
+Lists agents.
+
+Request
+Method: GET
+URL: /api/v1/agents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&title={agent_name}&id={agent_id}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url 'http://{address}/api/v1/agents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&title={agent_name}&id={agent_id}' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+page: (Filter parameter), integer
+Specifies the page on which the agents will be displayed. Defaults to 1.
+page_size: (Filter parameter), integer
+The number of agents on each page. Defaults to 30.
+orderby: (Filter parameter), string
+The attribute by which the results are sorted. Available options:
+create_time (default)
+update_time
+desc: (Filter parameter), boolean
+Indicates whether the retrieved agents should be sorted in descending order. Defaults to true.
+id: (Filter parameter), string
+The ID of the agent to retrieve.
+title: (Filter parameter), string
+The name of the agent to retrieve.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": [
+        {
+            "avatar": null,
+            "canvas_type": null,
+            "create_date": "Thu, 05 Dec 2024 19:10:36 GMT",
+            "create_time": 1733397036424,
+            "description": null,
+            "dsl": {
+                "answer": [],
+                "components": {
+                    "begin": {
+                        "downstream": [],
+                        "obj": {
+                            "component_name": "Begin",
+                            "params": {}
+                        },
+                        "upstream": []
+                    }
+                },
+                "graph": {
+                    "edges": [],
+                    "nodes": [
+                        {
+                            "data": {
+                                "label": "Begin",
+                                "name": "begin"
+                            },
+                            "height": 44,
+                            "id": "begin",
+                            "position": {
+                                "x": 50,
+                                "y": 200
+                            },
+                            "sourcePosition": "left",
+                            "targetPosition": "right",
+                            "type": "beginNode",
+                            "width": 200
+                        }
+                    ]
+                },
+                "history": [],
+                "messages": [],
+                "path": [],
+                "reference": []
+            },
+            "id": "8d9ca0e2b2f911ef9ca20242ac120006",
+            "title": "123465",
+            "update_date": "Thu, 05 Dec 2024 19:10:56 GMT",
+            "update_time": 1733397056801,
+            "user_id": "69736c5e723611efb51b0242ac120007"
+        }
+    ]
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "The agent doesn't exist."
+}
+Create agent
+POST /api/v1/agents
+
+Creates an agent.
+
+Request
+Method: POST
+URL: /api/v1/agents
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"title": string
+"description": string
+"dsl": object
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/agents \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+         "title": "Test Agent",
+         "description": "A test agent",
+         "dsl": {
+           // ... Canvas DSL here ...
+         }
+     }'
+Request parameters
+title: (Body parameter), string, Required
+The title of the agent.
+description: (Body parameter), string
+The description of the agent. Defaults to None.
+dsl: (Body parameter), object, Required
+The canvas DSL object of the agent.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": true,
+    "message": "success"
+}
+Failure:
+
+{
+    "code": 102,
+    "message": "Agent with title test already exists."
+}
+Update agent
+PUT /api/v1/agents/{agent_id}
+
+Updates an agent by ID.
+
+Request
+Method: PUT
+URL: /api/v1/agents/{agent_id}
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"title": string
+"description": string
+"dsl": object
+Request example
+curl --request PUT \
+     --url http://{address}/api/v1/agents/58af890a2a8911f0a71a11b922ed82d6 \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+         "title": "Test Agent",
+         "description": "A test agent",
+         "dsl": {
+           // ... Canvas DSL here ...
+         }
+     }'
+Request parameters
+agent_id: (Path parameter), string
+The ID of the agent to be updated.
+title: (Body parameter), string
+The title of the agent.
+description: (Body parameter), string
+The description of the agent.
+dsl: (Body parameter), object
+The canvas DSL object of the agent.
+Specify only the parameters you want to change in the request body. Parameters that are omitted or set to None are left unchanged (see the sketch after the response examples below).
+
+Response
+Success:
+
+{
+    "code": 0,
+    "data": true,
+    "message": "success"
+}
+Failure:
+
+{
+    "code": 103,
+    "message": "Only owner of canvas authorized for this operation."
+}
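+Given the partial-update semantics noted above, renaming an agent without touching its description or DSL only requires sending title. A sketch, assuming the requests library:
+
+import requests
+
+ADDRESS = "http://<address>"
+API_KEY = "<YOUR_API_KEY>"
+AGENT_ID = "58af890a2a8911f0a71a11b922ed82d6"
+
+# Only "title" is sent, so description and dsl stay unchanged.
+resp = requests.put(
+    f"{ADDRESS}/api/v1/agents/{AGENT_ID}",
+    headers={"Authorization": f"Bearer {API_KEY}"},
+    json={"title": "Renamed Agent"},
+)
+print(resp.json())  # {"code": 0, "data": true, "message": "success"} on success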
+Delete agent
+DELETE /api/v1/agents/{agent_id}
+
+Deletes an agent by ID.
+
+Request
+Method: DELETE
+URL: /api/v1/agents/{agent_id}
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request DELETE \
+     --url http://{address}/api/v1/agents/58af890a2a8911f0a71a11b922ed82d6 \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{}'
+Request parameters
+agent_id: (Path parameter), string
+The ID of the agent to be deleted.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": true,
+    "message": "success"
+}
+Failure:
+
+{
+    "code": 103,
+    "message": "Only owner of canvas authorized for this operation."
+}
+System
+Check system health
+GET /v1/system/healthz
+
+Check the health status of RAGFlow’s dependencies (database, Redis, document engine, object storage).
+
+Request
+Method: GET
+URL: /v1/system/healthz
+Headers:
+'Content-Type: application/json' (no Authorization required)
+Request example
+curl --request GET \
+     --url http://{address}/v1/system/healthz \
+     --header 'Content-Type: application/json'
+Request parameters
+address: (URL placeholder), string
+The host and port of the backend service (e.g., localhost:7897).
+Responses
+200 OK – All services healthy
+HTTP/1.1 200 OK
+Content-Type: application/json
+
+{
+  "db": "ok",
+  "redis": "ok",
+  "doc_engine": "ok",
+  "storage": "ok",
+  "status": "ok"
+}
+500 Internal Server Error – At least one service unhealthy
+HTTP/1.1 500 INTERNAL SERVER ERROR
+Content-Type: application/json
+
+{
+  "db": "ok",
+  "redis": "nok",
+  "doc_engine": "ok",
+  "storage": "ok",
+  "status": "nok",
+  "_meta": {
+    "redis": {
+      "elapsed": "5.2",
+      "error": "Lost connection!"
+    }
+  }
+}
+Explanation:
+
+Each service is reported as "ok" or "nok".
+The top-level status reflects overall health.
+If any service is "nok", detailed error info appears in _meta.
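+A small monitoring sketch built on this contract, assuming the requests library; anything other than an overall "ok" status is treated as unhealthy and the _meta details are printed:
+
+import requests
+
+def check_health(address: str) -> bool:
+    resp = requests.get(f"http://{address}/v1/system/healthz", timeout=5)
+    body = resp.json()
+    if body.get("status") == "ok":
+        return True
+    # On failure, per-service errors are nested under "_meta".
+    for service, detail in body.get("_meta", {}).items():
+        print(f"{service}: {detail.get('error')} (elapsed {detail.get('elapsed')}s)")
+    return False
+
+print("healthy" if check_health("localhost:7897") else "unhealthy")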
+FILE MANAGEMENT
+Upload file
+POST /api/v1/file/upload
+
+Uploads one or multiple files to the system.
+
+Request
+Method: POST
+URL: /api/v1/file/upload
+Headers:
+'Content-Type: multipart/form-data'
+'Authorization: Bearer <YOUR_API_KEY>'
+Form:
+'file=@{FILE_PATH}'
+'parent_id': string (optional)
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/file/upload \
+     --header 'Content-Type: multipart/form-data' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --form 'file=@./test1.txt' \
+     --form 'file=@./test2.pdf' \
+     --form 'parent_id={folder_id}'
+Request parameters
+'file': (Form parameter), file, Required
+The file(s) to upload. Multiple files can be uploaded in a single request.
+'parent_id': (Form parameter), string
+The parent folder ID where the file will be uploaded. If not specified, files will be uploaded to the root folder.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": [
+        {
+            "id": "b330ec2e91ec11efbc510242ac120004",
+            "name": "test1.txt",
+            "size": 17966,
+            "type": "doc",
+            "parent_id": "527fa74891e811ef9c650242ac120006",
+            "location": "test1.txt",
+            "create_time": 1729763127646
+        }
+    ]
+}
+Failure:
+
+{
+    "code": 400,
+    "message": "No file part!"
+}
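+In Python's requests library, the repeated --form 'file=@...' flags above correspond to a list of (field, file) tuples. A sketch:
+
+import requests
+
+ADDRESS = "http://<address>"
+API_KEY = "<YOUR_API_KEY>"
+
+# Repeat the "file" field once per file, mirroring curl's repeated --form flags.
+files = [
+    ("file", open("./test1.txt", "rb")),
+    ("file", open("./test2.pdf", "rb")),
+]
+resp = requests.post(
+    f"{ADDRESS}/api/v1/file/upload",
+    headers={"Authorization": f"Bearer {API_KEY}"},
+    files=files,
+    data={"parent_id": "<folder_id>"},  # optional destination folder
+)
+print(resp.json())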
+Create file or folder
+POST /api/v1/file/create
+
+Creates a new file or folder in the system.
+
+Request
+Method: POST
+URL: /api/v1/file/create
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"name": string
+"parent_id": string (optional)
+"type": string
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/file/create \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+          "name": "New Folder",
+          "type": "FOLDER",
+          "parent_id": "{folder_id}"
+     }'
+Request parameters
+"name": (Body parameter), string, Required
+The name of the file or folder to create.
+"parent_id": (Body parameter), string
+The parent folder ID. If not specified, the file/folder will be created in the root folder.
+"type": (Body parameter), string
+The type of the file to create. Available options:
+"FOLDER": Create a folder
+"VIRTUAL": Create a virtual file
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "id": "b330ec2e91ec11efbc510242ac120004",
+        "name": "New Folder",
+        "type": "FOLDER",
+        "parent_id": "527fa74891e811ef9c650242ac120006",
+        "size": 0,
+        "create_time": 1729763127646
+    }
+}
+Failure:
+
+{
+    "code": 409,
+    "message": "Duplicated folder name in the same folder."
+}
+List files
+GET /api/v1/file/list?parent_id={parent_id}&keywords={keywords}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}
+
+Lists files and folders under a specific folder.
+
+Request
+Method: GET
+URL: /api/v1/file/list?parent_id={parent_id}&keywords={keywords}&page={page}&page_size={page_size}&orderby={orderby}&desc={desc}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url 'http://{address}/api/v1/file/list?parent_id={folder_id}&page=1&page_size=15' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+parent_id: (Filter parameter), string
+The folder ID to list files from. If not specified, the root folder is used by default.
+keywords: (Filter parameter), string
+Search keyword to filter files by name.
+page: (Filter parameter), integer
+Specifies the page on which the files will be displayed. Defaults to 1.
+page_size: (Filter parameter), integer
+The number of files on each page. Defaults to 15.
+orderby: (Filter parameter), string
+The field by which files should be sorted. Available options:
+create_time (default)
+desc: (Filter parameter), boolean
+Indicates whether the retrieved files should be sorted in descending order. Defaults to true.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "total": 10,
+        "files": [
+            {
+                "id": "b330ec2e91ec11efbc510242ac120004",
+                "name": "test1.txt",
+                "type": "doc",
+                "size": 17966,
+                "parent_id": "527fa74891e811ef9c650242ac120006",
+                "create_time": 1729763127646
+            }
+        ],
+        "parent_folder": {
+            "id": "527fa74891e811ef9c650242ac120006",
+            "name": "Parent Folder"
+        }
+    }
+}
+Failure:
+
+{
+    "code": 404,
+    "message": "Folder not found!"
+}
+Get root folder
+GET /api/v1/file/root_folder
+
+Retrieves the user's root folder information.
+
+Request
+Method: GET
+URL: /api/v1/file/root_folder
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url http://{address}/api/v1/file/root_folder \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+No parameters required.
+
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "root_folder": {
+            "id": "527fa74891e811ef9c650242ac120006",
+            "name": "root",
+            "type": "FOLDER"
+        }
+    }
+}
+Get parent folder
+GET /api/v1/file/parent_folder?file_id={file_id}
+
+Retrieves the immediate parent folder information of a specified file.
+
+Request
+Method: GET
+URL: /api/v1/file/parent_folder?file_id={file_id}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url 'http://{address}/api/v1/file/parent_folder?file_id={file_id}' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+file_id: (Filter parameter), string, Required
+The ID of the file whose immediate parent folder to retrieve.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "parent_folder": {
+            "id": "527fa74891e811ef9c650242ac120006",
+            "name": "Parent Folder"
+        }
+    }
+}
+Failure:
+
+{
+    "code": 404,
+    "message": "Folder not found!"
+}
+Get all parent folders
+GET /api/v1/file/all_parent_folder?file_id={file_id}
+
+Retrieves all parent folders of a specified file in the folder hierarchy.
+
+Request
+Method: GET
+URL: /api/v1/file/all_parent_folder?file_id={file_id}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url 'http://{address}/api/v1/file/all_parent_folder?file_id={file_id}' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>'
+Request parameters
+file_id: (Filter parameter), string, Required
+The ID of the file whose parent folders to retrieve.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": {
+        "parent_folders": [
+            {
+                "id": "527fa74891e811ef9c650242ac120006",
+                "name": "Parent Folder 1"
+            },
+            {
+                "id": "627fa74891e811ef9c650242ac120007",
+                "name": "Parent Folder 2"
+            }
+        ]
+    }
+}
+Failure:
+
+{
+    "code": 404,
+    "message": "Folder not found!"
+}
+Delete files
+POST /api/v1/file/rm
+
+Deletes one or multiple files or folders.
+
+Request
+Method: POST
+URL: /api/v1/file/rm
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"file_ids": list[string]
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/file/rm \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+          "file_ids": ["file_id_1", "file_id_2"]
+     }'
+Request parameters
+"file_ids": (Body parameter), list[string], Required
+The IDs of the files or folders to delete.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": true
+}
+Failure:
+
+{
+    "code": 404,
+    "message": "File or Folder not found!"
+}
+Rename file
+POST /api/v1/file/rename
+
+Renames a file or folder.
+
+Request
+Method: POST
+URL: /api/v1/file/rename
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"file_id": string
+"name": string
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/file/rename \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+          "file_id": "{file_id}",
+          "name": "new_name.txt"
+     }'
+Request parameters
+"file_id": (Body parameter), string, Required
+The ID of the file or folder to rename.
+"name": (Body parameter), string, Required
+The new name for the file or folder. Note: Changing file extensions is not supported.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": true
+}
+Failure:
+
+{
+    "code": 400,
+    "message": "The extension of file can't be changed"
+}
+or
+
+{
+    "code": 409,
+    "message": "Duplicated file name in the same folder."
+}
+Download file
+GET /api/v1/file/get/{file_id}
+
+Downloads a file from the system.
+
+Request
+Method: GET
+URL: /api/v1/file/get/{file_id}
+Headers:
+'Authorization: Bearer <YOUR_API_KEY>'
+Request example
+curl --request GET \
+     --url http://{address}/api/v1/file/get/{file_id} \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --output ./downloaded_file.txt
+Request parameters
+file_id: (Path parameter), string, Required
+The ID of the file to download.
+Response
+Success:
+
+Returns the file content as a binary stream with appropriate Content-Type headers.
+
+Failure:
+
+{
+    "code": 404,
+    "message": "Document not found!"
+}
+Move files
+POST /api/v1/file/mv
+
+Moves one or multiple files or folders to a specified folder.
+
+Request
+Method: POST
+URL: /api/v1/file/mv
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"src_file_ids": list[string]
+"dest_file_id": string
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/file/mv \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+          "src_file_ids": ["file_id_1", "file_id_2"],
+          "dest_file_id": "{destination_folder_id}"
+     }'
+Request parameters
+"src_file_ids": (Body parameter), list[string], Required
+The IDs of the files or folders to move.
+"dest_file_id": (Body parameter), string, Required
+The ID of the destination folder.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": true
+}
+Failure:
+
+{
+    "code": 404,
+    "message": "File or Folder not found!"
+}
+or
+
+{
+    "code": 404,
+    "message": "Parent Folder not found!"
+}
+Convert files to documents and link them to datasets
+POST /api/v1/file/convert
+
+Converts files to documents and links them to specified datasets.
+
+Request
+Method: POST
+URL: /api/v1/file/convert
+Headers:
+'Content-Type: application/json'
+'Authorization: Bearer <YOUR_API_KEY>'
+Body:
+"file_ids": list[string]
+"kb_ids": list[string]
+Request example
+curl --request POST \
+     --url http://{address}/api/v1/file/convert \
+     --header 'Content-Type: application/json' \
+     --header 'Authorization: Bearer <YOUR_API_KEY>' \
+     --data '{
+          "file_ids": ["file_id_1", "file_id_2"],
+          "kb_ids": ["dataset_id_1", "dataset_id_2"]
+     }'
+Request parameters
+"file_ids": (Body parameter), list[string], Required
+The IDs of the files to convert. If a folder ID is provided, all files within that folder will be converted.
+"kb_ids": (Body parameter), list[string], Required
+The IDs of the target datasets.
+Response
+Success:
+
+{
+    "code": 0,
+    "data": [
+        {
+            "id": "file2doc_id_1",
+            "file_id": "file_id_1",
+            "document_id": "document_id_1"
+        }
+    ]
+}
+Failure:
+
+{
+    "code": 404,
+    "message": "File not found!"
+}
+or
+
+{
+    "code": 404,
+    "message": "Can't find this dataset!"
+}
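+Python example
+A minimal sketch of the same request using the requests library (the address, API key, and IDs are placeholders):
+
+import requests
+
+resp = requests.post(
+    "http://{address}/api/v1/file/convert",
+    headers={"Authorization": "Bearer <YOUR_API_KEY>"},
+    json={"file_ids": ["file_id_1"], "kb_ids": ["dataset_id_1"]},
+)
+print(resp.json())  # {"code": 0, "data": [...]} on success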

BIN
conf/__pycache__/config.cpython-312.pyc


+ 65 - 0
conf/config.py

@@ -54,6 +54,32 @@ class ModelConfig:
             "api_key": ModelConfig.get_api_key()
         }
 
+    @staticmethod
+    def get_multimodal_embedding_model_name() -> str:
+        """Get the multimodal embedding model name."""
+        return os.getenv("MULTIMODAL_EMBEDDING_MODEL_NAME", "qwen2.5-vl-embedding")
+    
+    @staticmethod
+    def get_dashscope_api_key() -> str:
+        """Get the DashScope API key."""
+        return os.getenv("DASHSCOPE", "")
+
+    # RAGFlow configuration
+    @staticmethod
+    def get_ragflow_api_url() -> str:
+        """Get the RAGFlow API base URL."""
+        return os.getenv("RAGFLOW_API_URL", "http://192.168.16.134/")
+    
+    @staticmethod
+    def get_ragflow_api_key() -> str:
+        """Get the RAGFlow API key."""
+        return os.getenv("RAGFLOW_API_KEY", "")
+    
+    @staticmethod
+    def get_dataset_id() -> str:
+        """Get the dataset ID."""
+        return os.getenv("DATASET_ID", "")
+
 class AppConfig:
     """应用配置类"""
     
@@ -62,8 +88,47 @@ class AppConfig:
         """获取日志级别"""
         return os.getenv("LOG_LEVEL", "INFO")
 
+
+class VectorDBConfig:
+    """向量数据库配置类"""
+    
+    @staticmethod
+    def get_vector_db_type() -> str:
+        """获取向量数据库类型"""
+        return os.getenv("VECTOR_DB_TYPE", "es")
+    
+    @staticmethod
+    def get_infinity_host() -> str:
+        """获取Infinity向量数据库主机"""
+        return os.getenv("INFINITY_HOST", "localhost")
+    
+    @staticmethod
+    def get_infinity_port() -> int:
+        """获取Infinity向量数据库端口"""
+        return int(os.getenv("INFINITY_PORT", "23810"))
+    
+    @staticmethod
+    def get_infinity_user() -> str:
+        """获取Infinity向量数据库用户名"""
+        return os.getenv("INFINITY_USER", "admin")
+    
+    @staticmethod
+    def get_infinity_password() -> str:
+        """获取Infinity向量数据库密码"""
+        return os.getenv("INFINITY_PASSWORD", "admin")
+
 # Export configuration instances
 model_config = ModelConfig.get_model_config()
 app_config = {
     "log_level": AppConfig.get_log_level()
 }
+
+vector_db_config = {
+    "type": VectorDBConfig.get_vector_db_type(),
+    "infinity": {
+        "host": VectorDBConfig.get_infinity_host(),
+        "port": VectorDBConfig.get_infinity_port(),
+        "user": VectorDBConfig.get_infinity_user(),
+        "password": VectorDBConfig.get_infinity_password()
+    }
+}

+ 686 - 0
es_conn_analysis.md

@@ -0,0 +1,686 @@
+# Elasticsearch Connector Technical Documentation
+
+## Overview
+
+This document walks through the design and implementation of `es_conn.py`. The file implements an Elasticsearch-backed database connector with full CRUD support, plus advanced capabilities such as hybrid (text + vector) search, bulk processing, and a retry mechanism.
+
+---
+
+## File Structure Overview
+
+### 1. Copyright and License (lines 1-15)
+
+The Apache License 2.0 header at the top of the file marks this as an open-source project. The license permits free use, modification, and distribution of the code, provided the original copyright notice and license text are retained.
+
+### 2. Imports (lines 17-28)
+
+```python
+import re              # regular expressions
+import json            # JSON serialization
+import time            # timing control
+
+import copy            # deep copies
+from elasticsearch_dsl import UpdateByQuery, Q, Search  # ES DSL query builders
+from elastic_transport import ConnectionTimeout         # connection timeout exception
+from common.decorator import singleton                  # singleton decorator
+from common.doc_store.doc_store_base import MatchTextExpr, OrderByExpr, MatchExpr, MatchDenseExpr, FusionExpr  # query expression base classes
+from common.doc_store.es_conn_base import ESConnectionBase  # base connection class
+from common.float_utils import get_float                # float parsing helper
+from common.constants import PAGERANK_FLD, TAG_FLD      # constants
+```
+
+**Design notes**:
+- The `elasticsearch_dsl` library builds type-safe query DSL (domain-specific language) objects
+- The `singleton` decorator guarantees a single connection instance application-wide, avoiding wasted connection pools
+- The `ConnectionTimeout` exception type lets timeouts be handled gracefully
+- Inheriting from `ESConnectionBase` follows the open/closed principle, keeping the class easy to extend
+
+### 3. Constants (line 30)
+
+```python
+ATTEMPT_TIME = 2
+```
+
+**Design notes**: the retry count is set to 2, balancing reliability against performance. Network operations can fail transiently, and a lightweight retry loop raises the success rate.
+
+---
+
+## Core Class Design
+
+### The ESConnection Class (lines 33-375)
+
+#### Inheritance
+
+```python
+@singleton
+class ESConnection(ESConnectionBase):
+```
+
+**Design notes**:
+- Inherits from the abstract base class `ESConnectionBase`, following the dependency inversion principle
+- The `@singleton` decorator enforces the singleton pattern: exactly one instance exists for the application's lifetime
+- The singleton avoids the overhead of repeatedly creating and tearing down connections, improving resource utilization
+
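+Since `common.decorator` is not shown in this file, here is a minimal sketch of what such a singleton decorator typically looks like (an illustrative assumption, not the project's actual code):
+
+```python
+def singleton(cls):
+    """Create the wrapped class once and hand back the same instance afterwards."""
+    instances = {}
+
+    def wrapper(*args, **kwargs):
+        if cls not in instances:
+            instances[cls] = cls(*args, **kwargs)
+        return instances[cls]
+
+    return wrapper
+```
+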
+---
+
+## Search in Detail (lines 39-169)
+
+### Method Signature
+
+```python
+def search(
+        self, select_fields: list[str],
+        highlight_fields: list[str],
+        condition: dict,
+        match_expressions: list[MatchExpr],
+        order_by: OrderByExpr,
+        offset: int,
+        limit: int,
+        index_names: str | list[str],
+        knowledgebase_ids: list[str],
+        agg_fields: list[str] | None = None,
+        rank_feature: dict | None = None
+):
+```
+
+**Parameter design**:
+- `select_fields` and `highlight_fields` separate the concerns of field selection and highlighting
+- `condition` carries query conditions in a dict, allowing flexible combinations
+- `match_expressions` uses an expression pattern so several match types can be combined
+- `knowledgebase_ids` supports searching across multiple knowledge bases
+
+### Index Name Handling (lines 55-58)
+
+```python
+if isinstance(index_names, str):
+    index_names = index_names.split(",")
+assert isinstance(index_names, list) and len(index_names) > 0
+assert "_id" not in condition"
+```
+
+**Design notes**:
+- Accepts either a comma-separated string or a list, making the API easier to use
+- Assertions validate the arguments so mistakes are caught early in development
+- Using `_id` directly as a query condition is forbidden, closing off a potential injection vector
+
+### Building Query Conditions (lines 60-78)
+
+```python
+bool_query = Q("bool", must=[])
+condition["kb_id"] = knowledgebase_ids
+for k, v in condition.items():
+    if k == "available_int":
+        if v == 0:
+            bool_query.filter.append(Q("range", available_int={"lt": 1}))
+        else:
+            bool_query.filter.append(
+                Q("bool", must_not=Q("range", available_int={"lt": 1}")))
+        continue
+    if not v:
+        continue
+    if isinstance(v, list):
+        bool_query.filter.append(Q("terms", **{k: v}))
+    elif isinstance(v, str) or isinstance(v, int):
+        bool_query.filter.append(Q("term", **{k: v}))
+    else:
+        raise Exception(...)
+```
+
+**Design notes**:
+- An Elasticsearch bool query forms the base, flexibly combining must/filter/must_not clauses
+- The `available_int` field gets special handling to implement availability filtering
+- The value type picks the query automatically: `terms` for lists, `term` for single values
+- Empty values are skipped, avoiding unnecessary query overhead
+
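+For intuition, a hypothetical condition dict and what the loop above produces for it (field names invented for illustration):
+
+```python
+condition = {"doc_id": ["d1", "d2"], "status": "active", "note": None}
+# After kb_id is injected, the filter clauses become roughly:
+#   terms(kb_id=knowledgebase_ids)
+#   terms(doc_id=["d1", "d2"])   # list value -> terms
+#   term(status="active")        # scalar value -> term
+# "note" contributes nothing because its value is empty.
+```
+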
+### Hybrid Search Weighting (lines 80-89)
+
+```python
+s = Search()
+vector_similarity_weight = 0.5
+for m in match_expressions:
+    if isinstance(m, FusionExpr) and m.method == "weighted_sum" and "weights" in m.fusion_params:
+        assert len(match_expressions) == 3 and isinstance(match_expressions[0], MatchTextExpr) and isinstance(
+            match_expressions[1],
+            MatchDenseExpr) and isinstance(
+            match_expressions[2], FusionExpr)
+        weights = m.fusion_params["weights"]
+        vector_similarity_weight = get_float(weights.split(",")[1])
+```
+
+**Design notes**:
+- Supports hybrid search, handling text matching and vector similarity in one request
+- Parses the vector-search weight out of the fusion `weights` parameter
+- Assertions ensure the expression combination matches the expected hybrid pattern (text, dense, fusion)
+
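+A concrete reading of the weight parsing (the weight string is hypothetical):
+
+```python
+weights = "0.3,0.7"  # hypothetical fusion_params["weights"]: text weight, vector weight
+vector_similarity_weight = get_float(weights.split(",")[1])  # -> 0.7
+# the text query below then gets boost = 1.0 - 0.7 = 0.3
+```
+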
+### Text Matching (lines 91-99)
+
+```python
+for m in match_expressions:
+    if isinstance(m, MatchTextExpr):
+        minimum_should_match = m.extra_options.get("minimum_should_match", 0.0)
+        if isinstance(minimum_should_match, float):
+            minimum_should_match = str(int(minimum_should_match * 100)) + "%"
+        bool_query.must.append(Q("query_string", fields=m.fields,
+                                 type="best_fields", query=m.matching_text,
+                                 minimum_should_match=minimum_should_match,
+                                 boost=1))
+        bool_query.boost = 1.0 - vector_similarity_weight
+```
+
+**Design notes**:
+- `query_string` queries support rich query syntax
+- The `best_fields` type scores multi-field searches by the best-matching field
+- Floating-point percentages are converted to the string format Elasticsearch expects
+- The text query's boost is adjusted according to the hybrid search weight
+
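+For example, the float-to-percent conversion above works like this:
+
+```python
+minimum_should_match = 0.3
+str(int(minimum_should_match * 100)) + "%"  # -> "30%"
+```
+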
+### Vector Search (lines 101-112)
+
+```python
+elif isinstance(m, MatchDenseExpr):
+    assert (bool_query is not None)
+    similarity = 0.0
+    if "similarity" in m.extra_options:
+        similarity = m.extra_options["similarity"]
+    s = s.knn(m.vector_column_name,
+              m.topn,
+              m.topn * 2,
+              query_vector=list(m.embedding_data),
+              filter=bool_query.to_dict(),
+              similarity=similarity,
+              )
+```
+
+**Design notes**:
+- Uses Elasticsearch's k-NN (nearest neighbor) search for vector similarity matching
+- `topn * 2` candidates ensure enough results survive once the other filters are applied
+- The bool query is passed to the k-NN search as a filter, combining vector search with filtering
+- A custom similarity threshold is supported
+
+### Rank Feature Boosting (lines 114-118)
+
+```python
+if bool_query and rank_feature:
+    for fld, sc in rank_feature.items():
+        if fld != PAGERANK_FLD:
+            fld = f"{TAG_FLD}.{fld}"
+        bool_query.should.append(Q("rank_feature", field=fld, linear={}, boost=sc))
+```
+
+**Design notes**:
+- `rank_feature` queries boost the weight of specific features
+- Non-pagerank fields are prefixed with `TAG_FLD`, matching the data model's naming convention
+- `should` clauses never exclude results; they only adjust relevance scores
+
+### Highlighting and Sorting (lines 120-137)
+
+```python
+for field in highlight_fields:
+    s = s.highlight(field)
+
+if order_by:
+    orders = list()
+    for field, order in order_by.fields:
+        order = "asc" if order == 0 else "desc"
+        if field in ["page_num_int", "top_int"]:
+            order_info = {"order": order, "unmapped_type": "float",
+                          "mode": "avg", "numeric_type": "double"}
+        elif field.endswith("_int") or field.endswith("_flt"):
+            order_info = {"order": order, "unmapped_type": "float"}
+        else:
+            order_info = {"order": order, "unmapped_type": "text"}
+        orders.append({field: order_info})
+    s = s.sort(*orders)
+```
+
+**Design notes**:
+- Highlighting and sorting configuration are kept as separate concerns
+- `unmapped_type` is set per field type so sorting does not fail when a field is missing
+- Numeric fields get special handling so they sort numerically
+
+### Aggregations (lines 138-140)
+
+```python
+if agg_fields:
+    for fld in agg_fields:
+        s.aggs.bucket(f'aggs_{fld}', 'terms', field=fld, size=1000000)
+```
+
+**Design notes**:
+- `terms` aggregations collect the distribution of field values
+- `size=1000000` sets a very large bucket count to keep the results complete
+
+### Pagination (lines 142-143)
+
+```python
+if limit > 0:
+    s = s[offset:offset + limit]
+```
+
+**Design notes**:
+- The Elasticsearch DSL slicing syntax is concise and readable
+- Pagination is applied only when limit > 0, supporting the "return everything" case
+
+### Retries and Timeout Handling (lines 147-169)
+
+```python
+for i in range(ATTEMPT_TIME):
+    try:
+        res = self.es.search(index=index_names,
+                             body=q,
+                             timeout="600s",
+                             track_total_hits=True,
+                             _source=True)
+        if str(res.get("timed_out", "")).lower() == "true":
+            raise Exception("Es Timeout.")
+        self.logger.debug(f"ESConnection.search {str(index_names)} res: " + str(res))
+        return res
+    except ConnectionTimeout:
+        self.logger.exception("ES request timeout")
+        self._connect()
+        continue
+    except Exception as e:
+        self.logger.exception(f"ESConnection.search {str(index_names)} query: " + str(q) + str(e))
+        raise e
+
+self.logger.error(f"ESConnection.search timeout for {ATTEMPT_TIME} times!")
+raise Exception("ESConnection.search timeout.")
+```
+
+**Design notes**:
+- The 600-second timeout accommodates complex queries
+- `track_total_hits=True` guarantees an accurate total hit count
+- On `ConnectionTimeout`, the connector reconnects and retries, maximizing fault tolerance
+- Non-timeout exceptions are re-raised for the caller to handle
+- Once all retries are exhausted, a clear timeout exception is raised
+
+---
+
+## Insert in Detail (lines 171-207)
+
+### Method Signature
+
+```python
+def insert(self, documents: list[dict], index_name: str, knowledgebase_id: str = None) -> list[str]:
+```
+
+**Design notes**:
+- `documents` takes a list of dicts, supporting batch inserts
+- `knowledgebase_id` is injected into every document, isolating data per knowledge base
+
+### Building Bulk Operations (lines 173-182)
+
+```python
+operations = []
+for d in documents:
+    assert "_id" not in d
+    assert "id" in d
+    d_copy = copy.deepcopy(d)
+    d_copy["kb_id"] = knowledgebase_id
+    meta_id = d_copy.pop("id", "")
+    operations.append(
+        {"index": {"_index": index_name, "_id": meta_id}})
+    operations.append(d_copy)
+```
+
+**Design notes**:
+- The Elasticsearch Bulk API makes batch inserts efficient
+- `_id` is forbidden as a field name to avoid clashing with the internal ES ID
+- Every document must carry an `id` field, which becomes the ES document ID
+- A deep copy avoids mutating the caller's data
+- The `id` field is popped out to become the ES `_id`, and a `kb_id` field is added
+
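+The operations list alternates action metadata and document bodies. For a single document (values invented for illustration):
+
+```python
+# documents=[{"id": "c1", "text": "hello"}], knowledgebase_id="kb1" produces:
+operations = [
+    {"index": {"_index": "my_index", "_id": "c1"}},  # action line
+    {"text": "hello", "kb_id": "kb1"},               # document body ("id" popped out)
+]
+```
+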
+### Bulk Execution and Error Handling (lines 184-207)
+
+```python
+res = []  # collects per-document error messages; an empty list means success
+for _ in range(ATTEMPT_TIME):
+    try:
+        r = self.es.bulk(index=index_name, operations=operations,
+                         refresh=False, timeout="60s")
+        if re.search(r"False", str(r["errors"]), re.IGNORECASE):
+            return res
+
+        for item in r["items"]:
+            for action in ["create", "delete", "index", "update"]:
+                if action in item and "error" in item[action]:
+                    res.append(str(item[action]["_id"]) + ":" + str(item[action]["error"]))
+        return res
+    except ConnectionTimeout:
+        self.logger.exception("ES request timeout")
+        time.sleep(3)
+        self._connect()
+        continue
+    except Exception as e:
+        res.append(str(e))
+        self.logger.warning("ESConnection.insert got exception: " + str(e))
+
+return res
+```
+
+**Design notes**:
+- `refresh=False` improves write throughput by not forcing an immediate refresh to disk
+- The return value is a list of error messages; an empty list means everything succeeded
+- Each operation in the bulk response is inspected and its error, if any, is collected
+- After a timeout the code sleeps for 3 seconds before retrying, giving the system time to recover
+
+---
+
+## Update in Detail (lines 209-301)
+
+### Method Signature
+
+```python
+def update(self, condition: dict, new_value: dict, index_name: str, knowledgebase_id: str) -> bool:
+```
+
+**Design notes**:
+- Supports two update scenarios: a single document addressed by ID, and multiple documents matched by condition
+- Returns a boolean indicating whether the update succeeded
+
+### Single-Document Update (lines 213-233)
+
+```python
+if "id" in condition and isinstance(condition["id"], str):
+    chunk_id = condition["id"]
+    for i in range(ATTEMPT_TIME):
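+        # "doc" is derived from new_value earlier in the method (not shown in this excerpt)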
+        for k in doc.keys():
+            if "feas" != k.split("_")[-1]:
+                continue
+            try:
+                self.es.update(index=index_name, id=chunk_id, script=f"ctx._source.remove(\"{k}\");")
+            except Exception:
+                self.logger.exception(...)
+        try:
+            self.es.update(index=index_name, id=chunk_id, doc=doc)
+            return True
+        except Exception as e:
+            self.logger.exception(...)
+            break
+    return False
+```
+
+**Design notes**:
+- Fields whose names end in `feas` are removed via a script before the main update
+- This supports cleanup of special feature fields
+- The main update itself is a plain document update
+
+### Building Multi-Document Update Conditions (lines 236-249)
+
+```python
+bool_query = Q("bool")
+for k, v in condition.items():
+    if not isinstance(k, str) or not v:
+        continue
+    if k == "exists":
+        bool_query.filter.append(Q("exists", field=v))
+        continue
+    if isinstance(v, list):
+        bool_query.filter.append(Q("terms", **{k: v}))
+    elif isinstance(v, str) or isinstance(v, int):
+        bool_query.filter.append(Q("term", **{k: v}))
+    else:
+        raise Exception(...)
+```
+
+**Design notes**:
+- A bool query expresses the composite condition
+- An `exists` entry checks whether a field is present
+- The uniform condition handling supports several value types
+
+### Script Generation (lines 250-280)
+
+```python
+scripts = []
+params = {}
+for k, v in new_value.items():
+    if k == "remove":
+        if isinstance(v, str):
+            scripts.append(f"ctx._source.remove('{v}');")
+        if isinstance(v, dict):
+            for kk, vv in v.items():
+                scripts.append(f"int i=ctx._source.{kk}.indexOf(params.p_{kk});ctx._source.{kk}.remove(i);")
+                params[f"p_{kk}"] = vv
+        continue
+    if k == "add":
+        if isinstance(v, dict):
+            for kk, vv in v.items():
+                scripts.append(f"ctx._source.{kk}.add(params.pp_{kk});")
+                params[f"pp_{kk}"] = vv.strip()
+        continue
+    if (not isinstance(k, str) or not v) and k != "available_int":
+        continue
+    if isinstance(v, str):
+        v = re.sub(r"(['\n\r]|\\.)", " ", v)
+        params[f"pp_{k}"] = v
+        scripts.append(f"ctx._source.{k}=params.pp_{k};")
+    elif isinstance(v, int) or isinstance(v, float):
+        scripts.append(f"ctx._source.{k}={v};")
+    elif isinstance(v, list):
+        scripts.append(f"ctx._source.{k}=params.pp_{k};")
+        params[f"pp_{k}"] = json.dumps(v, ensure_ascii=False)
+    else:
+        raise Exception(...)
+```
+
+**Design notes**:
+- Elasticsearch's Painless scripting language implements the complex update logic
+- `remove` supports deleting a whole field or removing a single element from an array
+- `add` appends elements to arrays
+- String values are escaped, preventing script injection
+- List values are JSON-serialized before being handed to the script
+- Values are passed as script parameters rather than concatenated in, avoiding injection risk
+
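+A hypothetical new_value and the script fragments the generator above would emit for it:
+
+```python
+new_value = {"remove": "old_field", "add": {"tags": " new-tag "}, "available_int": 1}
+# scripts: ["ctx._source.remove('old_field');",
+#           "ctx._source.tags.add(params.pp_tags);",
+#           "ctx._source.available_int=1;"]
+# params:  {"pp_tags": "new-tag"}   # note the .strip()
+```
+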
+### UpdateByQuery Execution (lines 281-301)
+
+```python
+ubq = UpdateByQuery(
+    index=index_name).using(
+    self.es).query(bool_query)
+ubq = ubq.script(source="".join(scripts), params=params)
+ubq = ubq.params(refresh=True)
+ubq = ubq.params(slices=5)
+ubq = ubq.params(conflicts="proceed")
+
+for _ in range(ATTEMPT_TIME):
+    try:
+        _ = ubq.execute()
+        return True
+    except ConnectionTimeout:
+        self.logger.exception("ES request timeout")
+        time.sleep(3)
+        self._connect()
+        continue
+    except Exception as e:
+        self.logger.error("ESConnection.update got exception: " + str(e) + "\n".join(scripts))
+        break
+return False
+```
+
+**Design notes**:
+- `UpdateByQuery` updates every document matching the condition in one call
+- `refresh=True` makes the update visible to searches immediately
+- `slices=5` enables parallel processing, speeding up large update volumes
+- `conflicts="proceed"` keeps going when a document is modified concurrently
+
+---
+
+## Delete in Detail (lines 303-349)
+
+### Method Signature
+
+```python
+def delete(self, condition: dict, index_name: str, knowledgebase_id: str) -> int:
+```
+
+**Design notes**:
+- Returns the number of deleted documents so callers can verify the outcome
+- Conditional deletes support flexible data cleanup
+
+### Condition Handling (lines 304-331)
+
+```python
+assert "_id" not in condition
+condition["kb_id"] = knowledgebase_id
+if "id" in condition:
+    chunk_ids = condition["id"]
+    if not isinstance(chunk_ids, list):
+        chunk_ids = [chunk_ids]
+    if not chunk_ids:
+        qry = Q("match_all")
+    else:
+        qry = Q("ids", values=chunk_ids)
+else:
+    qry = Q("bool")
+    for k, v in condition.items():
+        if k == "exists":
+            qry.filter.append(Q("exists", field=v))
+        elif k == "must_not":
+            if isinstance(v, dict):
+                for kk, vv in v.items():
+                    if kk == "exists":
+                        qry.must_not.append(Q("exists", field=vv))
+        elif isinstance(v, list):
+            qry.must.append(Q("terms", **{k: v}))
+        elif isinstance(v, str) or isinstance(v, int):
+            qry.must.append(Q("term", **{k: v}))
+        else:
+            raise Exception("Condition value must be int, str or list.")
+```
+
+**Design notes**:
+- `id` accepts a single ID or a list, controlling the delete scope flexibly
+- An empty ID list falls back to `match_all`, deleting every document in scope
+- The `ids` query deletes specific documents efficiently
+- `exists` and `must_not` conditions are supported for advanced cases
+
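+The branching on `id` in compact form (IDs invented for illustration):
+
+```python
+# condition={"id": "c1"}          -> Q("ids", values=["c1"])
+# condition={"id": ["c1", "c2"]}  -> Q("ids", values=["c1", "c2"])
+# condition={"id": []}            -> Q("match_all")  # deletes everything in scope
+```
+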
+### Delete Execution (lines 333-349)
+
+```python
+self.logger.debug("ESConnection.delete query: " + json.dumps(qry.to_dict()))
+for _ in range(ATTEMPT_TIME):
+    try:
+        res = self.es.delete_by_query(
+            index=index_name,
+            body=Search().query(qry).to_dict(),
+            refresh=True)
+        return res["deleted"]
+    except ConnectionTimeout:
+        self.logger.exception("ES request timeout")
+        time.sleep(3)
+        self._connect()
+        continue
+    except Exception as e:
+        self.logger.warning("ESConnection.delete got exception: " + str(e))
+        if re.search(r"(not_found)", str(e), re.IGNORECASE):
+            return 0
+return 0
+```
+
+**Design notes**:
+- The `delete_by_query` API removes matching documents in bulk
+- `refresh=True` makes the deletion take effect immediately
+- The `deleted` field in the response reports how many documents were actually removed
+- "not_found" errors are ignored, since deleting a missing document is expected behavior
+
+---
+
+## Helpers in Detail (lines 355-375)
+
+### The get_fields Method
+
+```python
+def get_fields(self, res, fields: list[str]) -> dict[str, dict]:
+    res_fields = {}
+    if not fields:
+        return {}
+    for d in self._get_source(res):
+        m = {n: d.get(n) for n in fields if d.get(n) is not None}
+        for n, v in m.items():
+            if isinstance(v, list):
+                m[n] = v
+                continue
+            if n == "available_int" and isinstance(v, (int, float)):
+                m[n] = v
+                continue
+            if not isinstance(v, str):
+                m[n] = str(v)
+            # if n.find("tks") > 0:
+            #     m[n] = remove_redundant_spaces(m[n])
+
+        if m:
+            res_fields[d["id"]] = m
+    return res_fields
+```
+
+**Design notes**:
+- Converts search results into a dict keyed by document ID, convenient for downstream use
+- Only the requested fields are extracted; everything else is dropped
+- List values and numeric `available_int` values are preserved as-is
+- Other types are converted to strings for consistent output
+- `_get_source` (provided by the base class) extracts the source documents from the response
+
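+A sketch of the input-to-output mapping (hit contents invented for illustration):
+
+```python
+# A hit source of {"id": "c1", "title_tks": "hello world", "page_num_int": [3]}
+# with fields=["title_tks", "page_num_int"] yields:
+# {"c1": {"title_tks": "hello world", "page_num_int": [3]}}
+```
+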
+---
+
+## Design Pattern Summary
+
+### 1. Singleton
+The `@singleton` decorator guarantees a single global connection instance, avoiding wasted resources.
+
+### 2. Strategy
+The `match_expressions` parameter combines multiple matching strategies (text, vector, fusion).
+
+### 3. Template Method
+Inheriting from `ESConnectionBase` keeps the class within the interface the base class defines.
+
+### 4. Chain of Responsibility
+The retry mechanism hands each failed attempt back to itself in turn, degrading gracefully.
+
+### 5. Builder
+The Elasticsearch DSL assembles complex query objects step by step.
+
+---
+
+## Performance Optimizations
+
+### 1. Bulk Operations
+`insert` uses the Bulk API for batch writes, cutting network round trips.
+
+### 2. Parallel Processing
+The `slices=5` parameter on `UpdateByQuery` enables multi-slice parallel processing.
+
+### 3. Deferred Refresh
+`refresh=False` skips the immediate refresh after each write, improving write throughput.
+
+### 4. Timeout Control
+The 600-second timeout accommodates complex queries without waiting indefinitely.
+
+### 5. Condition Skipping
+Empty conditions are skipped automatically, trimming useless query clauses.
+
+---
+
+## Error Handling
+
+### 1. Connection Timeouts
+On `ConnectionTimeout`, the connector reconnects and retries.
+
+### 2. Bulk Operation Errors
+Per-document errors from bulk operations are collected and returned.
+
+### 3. Argument Validation
+Assertions validate the critical arguments.
+
+### 4. Logging
+Every exception and key operation is logged, simplifying troubleshooting.
+
+---
+
+## Extensibility
+
+### 1. Expression System
+The `MatchExpr` family of classes can be extended with new match types.
+
+### 2. Connector Base Class
+Connectors for other databases can be implemented by inheriting from `ESConnectionBase`.
+
+### 3. Externalized Configuration
+Parameters such as the timeout and retry count can be made configurable to suit different environments.
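+
+Finally, a minimal usage sketch (the index name and document values are invented; connection settings come from `ESConnectionBase`):
+
+```python
+conn = ESConnection()  # singleton: repeated calls return the same instance
+
+# Insert one chunk; an empty error list means success
+errors = conn.insert([{"id": "c1", "text": "hello"}],
+                     index_name="ragflow_idx", knowledgebase_id="kb1")
+assert errors == []
+
+# Delete it again; returns the number of documents removed
+deleted = conn.delete({"id": "c1"}, index_name="ragflow_idx", knowledgebase_id="kb1")
+```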

+ 3 - 1
requirements.txt

@@ -4,4 +4,6 @@ langgraph
 pydantic
 PyMuPDF
 Pillow
-python-dotenv
+python-dotenv
+elasticsearch==8.11.1
+infinity-emb

BIN
services/model/__pycache__/multimodal_embedding.cpython-312.pyc


BIN
services/model/__pycache__/qwen_vl.cpython-312.pyc


+ 3 - 3
services/model/multimodal_embedding.py

@@ -1,4 +1,4 @@
-from typing import Dict, Any, List
+from typing import List
 from PIL import Image
 import base64
 import io
@@ -7,11 +7,11 @@ from dashscope import MultiModalEmbedding
 from conf.config import ModelConfig
 
 class MultimodalEmbedding:
-    """OpenAI Embedding模型工具"""
+    """Embedding模型工具"""
     
     def __init__(self, model_name: str = None, api_key: str = None):
         """
-        Initialize the OpenAI embedding model
+        Initialize the embedding model
         
         Args:
             model_name: Model name; if None, the value from the config file is used

BIN
services/pdf_parser/__pycache__/__init__.cpython-312.pyc


BIN
services/pdf_parser/__pycache__/main.cpython-312.pyc


BIN
services/pdf_parser/__pycache__/workflow.cpython-312.pyc


+ 1 - 1
services/pdf_parser/main.py

@@ -45,7 +45,7 @@ class PDFParsingService:
                 - is_complete: whether processing is complete
         """
         # Run the workflow
-        result = self.workflow.run(pdf_path)
+        result = self.workflow.run(pdf_path, ModelConfig.get_dataset_id(), ModelConfig.get_ragflow_api_url(), ModelConfig.get_ragflow_api_key())
         
         # Assemble the output
         output = {

+ 156 - 10
services/pdf_parser/workflow.py

@@ -8,18 +8,30 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(
 from langgraph.graph import StateGraph, START, END
 from langgraph.graph.message import add_messages
 from typing import List, Dict, Any, Annotated
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, ConfigDict
 from services.pdf_parser.pdf_splitter import PDFSplitter
 from services.model.qwen_vl import QWenVLParser
+from services.ragflow.ragflow_service import RAGFlowService
+from services.utils.vector_db import VectorDBFactory
+from services.model.multimodal_embedding import MultimodalEmbedding
+from conf.config import ModelConfig
 
# Workflow state definition
 class PDFParsingState(BaseModel):
     """State for the PDF parsing workflow."""
+    model_config = ConfigDict(arbitrary_types_allowed=True)
     pdf_path: str = Field(..., description="Path to the PDF file")
+    dataset_id: str = Field(..., description="Dataset ID")
+    ragflow_service: RAGFlowService = Field(default_factory=RAGFlowService, description="RAGFlow service")
+    vector_db: Any = Field(default_factory=VectorDBFactory.get_vector_db, description="Vector database instance")
+    embedding_model: MultimodalEmbedding = Field(default_factory=MultimodalEmbedding, description="Multimodal embedding model instance")
+    document_id: str = Field(default="", description="Document ID assigned after upload")
     split_pages: List[Dict[str, Any]] = Field(default_factory=list, description="Pages after splitting")
     current_page: Dict[str, Any] = Field(default_factory=dict, description="Page currently being processed")
     parsed_results: List[Dict[str, Any]] = Field(default_factory=list, description="Parsing results")
+    vectorized_results: List[Dict[str, Any]] = Field(default_factory=list, description="Vectorization results")
     processed_pages: int = Field(default=0, description="Number of pages processed")
+    vectorized_pages: int = Field(default=0, description="Number of pages vectorized")
     is_complete: bool = Field(default=False, description="Whether processing is complete")
 
# Create the workflow builder
@@ -41,17 +53,31 @@ class PDFParsingWorkflow:
         # Create the state graph
         graph = StateGraph(PDFParsingState)
         
+        # Add the document upload node
+        graph.add_node("upload_document", self._upload_document_node)
+        
+        # Add the document parsing node
+        graph.add_node("parse_document", self._parse_document_node)
+        
         # Add the PDF splitting node
         graph.add_node("split_pdf", self._split_pdf_node)
         
         # Add the image parsing node
         graph.add_node("parse_image", self._parse_image_node)
         
+        # Add the vectorize-and-store node
+        graph.add_node("vectorize_store", self._vectorize_store_node)
+        
         # Add the completion node
         graph.add_node("complete", self._complete_node)
         
         # Define the edges
-        graph.add_edge(START, "split_pdf")
+        graph.add_edge(START, "upload_document")
+        
+        # Add the parse-document edge
+        graph.add_edge("upload_document", "parse_document")
+        
+        graph.add_edge("parse_document", "split_pdf")
         graph.add_edge("split_pdf", "parse_image")
         
         # Conditional edge: decide whether to continue parsing
@@ -60,15 +86,67 @@ class PDFParsingWorkflow:
             self._should_continue_parsing,
             {
                 "continue": "parse_image",
-                "complete": "complete"
+                "complete": "vectorize_store"
             }
         )
         
+        # Add the vectorize-and-store edge
+        graph.add_edge("vectorize_store", "complete")
+        
         graph.add_edge("complete", END)
         
         # Compile the workflow
         return graph.compile()
     
+    def _upload_document_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """RAGFlow document upload node."""
+        print(f"Uploading document to dataset {state.dataset_id}: {state.pdf_path}")
+        
+        try:
+            # Upload the document
+            document_info_list = state.ragflow_service.upload_document(
+                dataset_id=state.dataset_id,
+                file_path=state.pdf_path
+            )
+            
+            # Check the response
+            if document_info_list and len(document_info_list) > 0:
+                document_id = document_info_list[0].id
+                print(f"Document uploaded, document ID: {document_id}")
+                return {
+                    "document_id": document_id
+                }
+            else:
+                print("Document upload failed: no valid document info returned")
+                raise Exception("Document upload failed: no valid document info returned")
+        except Exception as e:
+            print(f"Error uploading document: {str(e)}")
+            raise
+
+    def _parse_document_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """RAGFlow document parsing node."""
+        print(f"Parsing document {state.document_id} in dataset {state.dataset_id}")
+        
+        try:
+            # Trigger parsing on the RAGFlow side
+            parsed_results = state.ragflow_service.parse_document(
+                dataset_id=state.dataset_id,
+                document_ids=[state.document_id]
+            )
+            
+            # parse_document returns a bool; there is no chunk payload to store here,
+            # and assigning a bool to parsed_results (a List[Dict] field) would fail validation
+            if parsed_results:
+                print(f"Document parsed, document ID: {state.document_id}")
+                return {}
+            else:
+                print("Document parsing failed: no valid result returned")
+                raise Exception("Document parsing failed: no valid result returned")
+        except Exception as e:
+            print(f"Error parsing document: {str(e)}")
+            raise
+    
     def _split_pdf_node(self, state: PDFParsingState) -> Dict[str, Any]:
         """拆分PDF节点"""
         print(f"开始拆分PDF: {state.pdf_path}")
@@ -89,9 +167,8 @@ class PDFParsingWorkflow:
     def _parse_single_page(self, page: Dict[str, Any], model_name: str) -> Dict[str, Any]:
         """解析单个页面(用于并行处理)"""
         prompt = """
-            You are a creator of illustrated children's books, writing content suitable for children aged 0-12
-            Task: from the existing illustrations and text, extract the elements, actions, and emotions in each illustration, describing each element separately
-            Note: descriptions must be positive and in line with core socialist values
+            You are a text extraction assistant; your task is to extract the text content from the image.
+            Note: do not alter the original text or punctuation in any way; extract only.
             """
         
         page_number = page["page_number"]
@@ -116,7 +193,7 @@ class PDFParsingWorkflow:
         parsed_results = []
         
         # Parallelize page parsing with a ThreadPoolExecutor
-        with ThreadPoolExecutor(max_workers=6) as executor:
+        with ThreadPoolExecutor(max_workers=10) as executor:
             # Submit a parsing task for every page
             future_to_page = {
                 executor.submit(self._parse_single_page, page, self.model_name): page
@@ -152,24 +229,93 @@ class PDFParsingWorkflow:
         # 所以这里总是返回"complete"
         return "complete"
     
+    def _vectorize_store_node(self, state: PDFParsingState) -> Dict[str, Any]:
+        """Vectorize-and-store node."""
+        print(f"Vectorizing and storing {len(state.parsed_results)} pages")
+        
+        # Create the index (if it does not already exist)
+        index_name = f"pdf_documents_{state.dataset_id}"
+        state.vector_db.create_index(index_name)
+        
+        # Documents staged for insertion
+        documents_to_store = []
+        
+        # File name and total page count
+        file_name = os.path.basename(state.pdf_path)
+        file_page_count = len(state.split_pages)
+        
+        # Build a vectorized document from every parsing result
+        for i, parsed_result in enumerate(state.parsed_results):
+            try:
+                page_number = parsed_result.get("page_number")
+                text = parsed_result.get("text", "")
+                image = parsed_result.get("image")
+                
+                # Image path (assumes the image has already been saved)
+                image_path = parsed_result.get("image_path", f"temp/{file_name}_{page_number}.png")
+                
+                # Compute the multimodal embedding
+                print(f"Generating the multimodal embedding for page {page_number}...")
+                embedding = state.embedding_model.get_multimodal_embedding(text, image)
+                
+                # Produce a 1024-dim dense vector (if the embedding is not 1024-dim, real handling is needed here)
+                dense_vector_1024 = embedding[:1024]  # keep the first 1024 dimensions
+                
+                # Assemble the document
+                document = {
+                    "file_name": file_name,
+                    "file_page_count": file_page_count,
+                    "page_number": page_number,
+                    "text": text,
+                    "image_path": image_path,
+                    "sparse_vector": [],  # sparse vector, empty for now
+                    "dense_vector_1024": dense_vector_1024,
+                    "dataset_id": state.dataset_id,
+                    "document_id": state.document_id
+                }
+                
+                documents_to_store.append(document)
+                print(f"Page {page_number} vectorized")
+            except Exception as e:
+                print(f"Vectorization failed for page {i+1}: {str(e)}")
+        
+        # Bulk insert
+        if documents_to_store:
+            print(f"Bulk inserting {len(documents_to_store)} documents")
+            result = state.vector_db.bulk_insert(index_name, documents_to_store)
+            print(f"Bulk insert result: {result}")
+        
+        return {
+            "vectorized_results": documents_to_store,
+            "vectorized_pages": len(documents_to_store),
+            "is_complete": True
+        }
+    
     def _complete_node(self, state: PDFParsingState) -> Dict[str, Any]:
         """完成节点"""
-        print(f"PDF解析工作流完成,共解析 {len(state.parsed_results)} 页")
+        print(f"PDF解析工作流完成,共解析 {len(state.parsed_results)} 页,向量化 {state.vectorized_pages} 页")
         return {
             "is_complete": True
         }
     
-    def run(self, pdf_path: str) -> Dict[str, Any]:
+    def run(self, pdf_path: str, dataset_id: str, ragflow_api_url: str, ragflow_api_key: str) -> Dict[str, Any]:
         """
         Run the PDF parsing workflow
         
         Args:
             pdf_path: Path to the PDF file
+            dataset_id: Dataset ID
+            ragflow_api_url: RAGFlow API URL
+            ragflow_api_key: RAGFlow API key
             
         Returns:
             Dict: A dictionary containing the final state
         """
-        initial_state = PDFParsingState(pdf_path=pdf_path)
+        initial_state = PDFParsingState(
+            pdf_path=pdf_path,
+            dataset_id=dataset_id,
+            ragflow_service=RAGFlowService(base_url=ragflow_api_url, api_key=ragflow_api_key)
+        )
         result = self.workflow.invoke(initial_state)
         
        # If the result is already a dict, return it as-is; otherwise call dict()

+ 0 - 0
services/ragflow/__init__.py


BIN
services/ragflow/__pycache__/__init__.cpython-312.pyc


BIN
services/ragflow/__pycache__/agent_service.cpython-312.pyc


BIN
services/ragflow/__pycache__/chat_service.cpython-312.pyc


BIN
services/ragflow/__pycache__/chunk_service.cpython-312.pyc


BIN
services/ragflow/__pycache__/dataset_service.cpython-312.pyc


BIN
services/ragflow/__pycache__/document_service.cpython-312.pyc


BIN
services/ragflow/__pycache__/file_service.cpython-312.pyc


BIN
services/ragflow/__pycache__/openai_service.cpython-312.pyc


BIN
services/ragflow/__pycache__/ragflow_service.cpython-312.pyc


+ 139 - 0
services/ragflow/agent_service.py

@@ -0,0 +1,139 @@
+from typing import Dict, Any, List, Optional
+
+class AgentService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def create_agent(self, name: str, llm: Dict[str, Any], description: str = None) -> Dict[str, Any]:
+        endpoint = "/api/v1/agents"
+        
+        data = {"name": name, "llm": llm}
+        if description is not None:
+            data["description"] = description
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建代理失败: {response.get('message', '未知错误')}")
+    
+    def update_agent(self, agent_id: str, name: str = None, llm: Dict[str, Any] = None,
+                    description: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/agents/{agent_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if llm is not None:
+            data["llm"] = llm
+        if description is not None:
+            data["description"] = description
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新代理失败: {response.get('message', '未知错误')}")
+    
+    def delete_agent(self, agent_id: str) -> bool:
+        endpoint = f"/api/v1/agents/{agent_id}"
+        
+        response = self.http_client.post(endpoint, json={})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除代理失败: {response.get('message', '未知错误')}")
+    
+    def list_agents(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                   desc: bool = True, name: str = None, agent_id: str = None) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/agents"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if name is not None:
+            params["name"] = name
+        if agent_id is not None:
+            params["id"] = agent_id
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出代理失败: {response.get('message', '未知错误')}")
+    
+    def create_agent_session(self, agent_id: str, name: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/agents/{agent_id}/sessions"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建代理会话失败: {response.get('message', '未知错误')}")
+    
+    def list_agent_sessions(self, agent_id: str, page: int = 1, size: int = 20,
+                           orderby: str = "create_time", desc: bool = True,
+                           session_id: str = None, user_id: str = None,
+                           dsl: str = None) -> List[Dict[str, Any]]:
+        endpoint = f"/api/v1/agents/{agent_id}/sessions"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if session_id is not None:
+            params["id"] = session_id
+        if user_id is not None:
+            params["user_id"] = user_id
+        if dsl is not None:
+            params["dsl"] = dsl
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出代理会话失败: {response.get('message', '未知错误')}")
+    
+    def delete_agent_session(self, agent_id: str, session_id: str) -> bool:
+        endpoint = f"/api/v1/agents/{agent_id}/sessions"
+        
+        response = self.http_client.post(endpoint, json={"session_ids": [session_id]})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除代理会话失败: {response.get('message', '未知错误')}")
+    
+    def agent_completion(self, agent_id: str, query: str, stream: bool = False,
+                        session_id: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/agents/{agent_id}/completions"
+        
+        data = {"query": query, "stream": stream}
+        if session_id is not None:
+            data["session_id"] = session_id
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"代理完成失败: {response.get('message', '未知错误')}")
+    
+    def get_related_questions(self, dataset_id: str, question: str, top: int = 10) -> List[str]:
+        endpoint = "/api/v1/sessions/related_questions"
+        
+        response = self.http_client.post(endpoint, json={
+            "dataset_id": dataset_id,
+            "question": question,
+            "top": top
+        })
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取相关问题失败: {response.get('message', '未知错误')}")

+ 146 - 0
services/ragflow/chat_service.py

@@ -0,0 +1,146 @@
+from typing import Dict, Any, List, Optional
+
+class ChatService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def create_chat(self, name: str, dataset_ids: List[str], llm: Dict[str, Any],
+                   prompt: str = None) -> Dict[str, Any]:
+        endpoint = "/api/v1/chats"
+        
+        data = {
+            "name": name,
+            "dataset_ids": dataset_ids,
+            "llm": llm
+        }
+        if prompt is not None:
+            data["prompt"] = prompt
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建聊天失败: {response.get('message', '未知错误')}")
+    
+    def update_chat(self, chat_id: str, name: str = None, dataset_ids: List[str] = None,
+                   llm: Dict[str, Any] = None, prompt: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats/{chat_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if dataset_ids is not None:
+            data["dataset_ids"] = dataset_ids
+        if llm is not None:
+            data["llm"] = llm
+        if prompt is not None:
+            data["prompt"] = prompt
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新聊天失败: {response.get('message', '未知错误')}")
+    
+    def delete_chats(self, chat_ids: List[str]) -> bool:
+        endpoint = "/api/v1/chats"
+        
+        response = self.http_client.post(endpoint, json={"chat_ids": chat_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除聊天失败: {response.get('message', '未知错误')}")
+    
+    def list_chats(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                  desc: bool = True, name: str = None, chat_id: str = None) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/chats"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if name is not None:
+            params["name"] = name
+        if chat_id is not None:
+            params["id"] = chat_id
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出聊天失败: {response.get('message', '未知错误')}")
+    
+    def create_chat_session(self, chat_id: str, name: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats/{chat_id}/sessions"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建会话失败: {response.get('message', '未知错误')}")
+    
+    def update_chat_session(self, chat_id: str, session_id: str, 
+                           name: str = None, message: List[Dict] = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats/{chat_id}/sessions/{session_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if message is not None:
+            data["message"] = message
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新会话失败: {response.get('message', '未知错误')}")
+    
+    def list_chat_sessions(self, chat_id: str, page: int = 1, size: int = 20,
+                          orderby: str = "create_time", desc: bool = True,
+                          session_id: str = None, session_name: str = None) -> List[Dict[str, Any]]:
+        endpoint = f"/api/v1/chats/{chat_id}/sessions"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if session_id is not None:
+            params["id"] = session_id
+        if session_name is not None:
+            params["name"] = session_name
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出会话失败: {response.get('message', '未知错误')}")
+    
+    def delete_chat_session(self, chat_id: str, session_id: str) -> bool:
+        endpoint = f"/api/v1/chats/{chat_id}/sessions"
+        
+        response = self.http_client.post(endpoint, json={"session_ids": [session_id]})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除会话失败: {response.get('message', '未知错误')}")
+    
+    def chat_completion(self, chat_id: str, query: str, stream: bool = False,
+                       session_id: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats/{chat_id}/completions"
+        
+        data = {"query": query, "stream": stream}
+        if session_id is not None:
+            data["session_id"] = session_id
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"聊天完成失败: {response.get('message', '未知错误')}")

+ 78 - 0
services/ragflow/chunk_service.py

@@ -0,0 +1,78 @@
+from typing import Dict, Any, List, Optional
+
+class ChunkService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def create_chunk(self, dataset_id: str, document_id: str, content: str, 
+                    meta_fields: Dict = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks"
+        
+        data = {"content": content}
+        if meta_fields is not None:
+            data["meta_fields"] = meta_fields
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建切片失败: {response.get('message', '未知错误')}")
+    
+    def update_chunk(self, dataset_id: str, chunk_id: str, content: str = None,
+                    meta_fields: Dict = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/chunks/{chunk_id}"
+        
+        data = {}
+        if content is not None:
+            data["content"] = content
+        if meta_fields is not None:
+            data["meta_fields"] = meta_fields
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新切片失败: {response.get('message', '未知错误')}")
+    
+    def delete_chunk(self, dataset_id: str, chunk_id: str) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/chunks/{chunk_id}"
+        
+        response = self.http_client.post(endpoint, json={})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除切片失败: {response.get('message', '未知错误')}")
+    
+    def delete_chunks(self, dataset_id: str, document_id: str, chunk_ids: List[str]) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks"
+        
+        response = self.http_client.post(endpoint, json={"chunk_ids": chunk_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"批量删除切片失败: {response.get('message', '未知错误')}")
+    
+    def retrieval(self, dataset_ids: List[str], query: str, top_k: int = 5,
+                 similarity_threshold: float = 0.1, vector_similarity_weight: float = 0.3,
+                 refine: bool = False) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/retrieval"
+        
+        data = {
+            "dataset_ids": dataset_ids,
+            "query": query,
+            "top_k": top_k,
+            "similarity_threshold": similarity_threshold,
+            "vector_similarity_weight": vector_similarity_weight,
+            "refine": refine
+        }
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"检索失败: {response.get('message', '未知错误')}")

+ 177 - 0
services/ragflow/dataset_service.py

@@ -0,0 +1,177 @@
+from typing import Dict, Any, List, Optional
+
+class DatasetService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def create_dataset(self, name: str, description: str = None, 
+                      embedding_model: str = None, permission: str = None,
+                      chunk_method: str = None) -> Dict[str, Any]:
+        endpoint = "/api/v1/datasets"
+        
+        data = {"name": name}
+        if description is not None:
+            data["description"] = description
+        if embedding_model is not None:
+            data["embedding_model"] = embedding_model
+        if permission is not None:
+            data["permission"] = permission
+        if chunk_method is not None:
+            data["chunk_method"] = chunk_method
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"创建数据集失败: {response.get('message', '未知错误')}")
+    
+    def delete_datasets(self, dataset_ids: List[str]) -> bool:
+        endpoint = "/api/v1/datasets"
+        
+        response = self.http_client.post(endpoint, json={"dataset_ids": dataset_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除数据集失败: {response.get('message', '未知错误')}")
+    
+    def update_dataset(self, dataset_id: str, name: str = None, 
+                      description: str = None, embedding_model: str = None,
+                      permission: str = None, chunk_method: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if description is not None:
+            data["description"] = description
+        if embedding_model is not None:
+            data["embedding_model"] = embedding_model
+        if permission is not None:
+            data["permission"] = permission
+        if chunk_method is not None:
+            data["chunk_method"] = chunk_method
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新数据集失败: {response.get('message', '未知错误')}")
+    
+    def list_datasets(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                     desc: bool = True, name: str = None, dataset_id: str = None) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/datasets"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if name is not None:
+            params["name"] = name
+        if dataset_id is not None:
+            params["id"] = dataset_id
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出数据集失败: {response.get('message', '未知错误')}")
+    
+    def get_dataset(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取数据集失败: {response.get('message', '未知错误')}")
+    
+    def get_knowledge_graph(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/knowledge_graph"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"获取知识图谱失败: {response.get('message', '未知错误')}")
+    
+    def delete_knowledge_graph(self, dataset_id: str) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/knowledge_graph"
+        
+        response = self.http_client.post(endpoint, json={})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除知识图谱失败: {response.get('message', '未知错误')}")
+    
+    def trace_graphrag(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/trace_graphrag"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"获取GraphRAG追踪失败: {response.get('message', '未知错误')}")
+    
+    def trace_raptor(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/trace_raptor"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"获取RAPTOR追踪失败: {response.get('message', '未知错误')}")
+    
+    def get_metadata_summary(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/metadata/summary"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"获取元数据摘要失败: {response.get('message', '未知错误')}")
+    
+    def update_metadata(self, dataset_id: str, metadata: Dict = None,
+                       document_ids: List[str] = None, metadata_condition: Dict = None) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/metadata/update"
+        
+        data = {}
+        if metadata is not None:
+            data["metadata"] = metadata
+        if document_ids is not None:
+            data["document_ids"] = document_ids
+        if metadata_condition is not None:
+            data["metadata_condition"] = metadata_condition
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"更新元数据失败: {response.get('message', '未知错误')}")
+    
+    def run_graphrag(self, dataset_id: str, mode: str = "light") -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/run_graphrag"
+        
+        response = self.http_client.post(endpoint, json={"mode": mode})
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"运行GraphRAG失败: {response.get('message', '未知错误')}")
+    
+    def run_raptor(self, dataset_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/run_raptor"
+        
+        response = self.http_client.post(endpoint)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"运行RAPTOR失败: {response.get('message', '未知错误')}")

+ 126 - 0
services/ragflow/document_service.py

@@ -0,0 +1,126 @@
+from typing import Dict, Any, List, Optional
+
+class DocumentService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def upload_document(self, dataset_id: str, file_path: str) -> List[Dict[str, Any]]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents"
+        
+        with open(file_path, 'rb') as f:
+            files = {'file': (file_path.split('/')[-1], f)}
+            # Do not set Content-Type by hand: a multipart request needs a boundary,
+            # which the underlying HTTP client generates only when it sets the header itself
+            response = self.http_client.post(endpoint, files=files)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"上传文档失败: {response.get('message', '未知错误')}")
+    
+    def update_document(self, dataset_id: str, document_id: str, 
+                       name: str = None, meta_fields: Dict = None, 
+                       chunk_method: str = None, parser_config: Dict = None,
+                       enabled: int = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}"
+        
+        data = {}
+        if name is not None:
+            data["name"] = name
+        if meta_fields is not None:
+            data["meta_fields"] = meta_fields
+        if chunk_method is not None:
+            data["chunk_method"] = chunk_method
+        if parser_config is not None:
+            data["parser_config"] = parser_config
+        if enabled is not None:
+            data["enabled"] = enabled
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"更新文档失败: {response.get('message', '未知错误')}")
+    
+    def delete_document(self, dataset_id: str, document_id: str) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}"
+        
+        response = self.http_client.post(endpoint, json={})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除文档失败: {response.get('message', '未知错误')}")
+    
+    def delete_documents(self, dataset_id: str, document_ids: List[str]) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents"
+        
+        response = self.http_client.post(endpoint, json={"document_ids": document_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"批量删除文档失败: {response.get('message', '未知错误')}")
+    
+    def get_document(self, dataset_id: str, document_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取文档失败: {response.get('message', '未知错误')}")
+    
+    def list_documents(self, dataset_id: str, page: int = 1, size: int = 20,
+                      keywords: str = None, document_id: str = None, document_name: str = None,
+                      suffix: str = None, run: str = None) -> List[Dict[str, Any]]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents"
+        
+        params = {"page": page, "page_size": size}
+        if keywords is not None:
+            params["keywords"] = keywords
+        if document_id is not None:
+            params["id"] = document_id
+        if document_name is not None:
+            params["name"] = document_name
+        if suffix is not None:
+            params["suffix"] = suffix
+        if run is not None:
+            params["run"] = run
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出文档失败: {response.get('message', '未知错误')}")
+    
+    def get_document_chunks(self, dataset_id: str, document_id: str,
+                           keywords: str = None, page: int = 1, size: int = 20,
+                           chunk_id: str = None) -> List[Dict[str, Any]]:
+        endpoint = f"/api/v1/datasets/{dataset_id}/documents/{document_id}/chunks"
+        
+        params = {"page": page, "page_size": size}
+        if keywords is not None:
+            params["keywords"] = keywords
+        if chunk_id is not None:
+            params["id"] = chunk_id
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取文档切片失败: {response.get('message', '未知错误')}")
+    
+    def parse_document(self, dataset_id: str, document_ids: List[str]) -> bool:
+        endpoint = f"/api/v1/datasets/{dataset_id}/chunks"
+        
+        response = self.http_client.post(endpoint, json={"document_ids": document_ids})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"解析文档失败: {response.get('message', '未知错误')}")

+ 141 - 0
services/ragflow/file_service.py

@@ -0,0 +1,141 @@
+import os
+from typing import Dict, Any, List, Optional
+
+class FileService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def list_files(self, parent_id: str = None, keywords: str = None,
+                  page: int = 1, size: int = 20, orderby: str = "create_time",
+                  desc: bool = True) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/file/list"
+        
+        params = {"page": page, "page_size": size, "orderby": orderby, "desc": int(desc)}
+        if parent_id is not None:
+            params["parent_id"] = parent_id
+        if keywords is not None:
+            params["keywords"] = keywords
+        
+        response = self.http_client.get(endpoint, params=params)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"列出文件失败: {response.get('message', '未知错误')}")
+    
+    def get_root_folder(self) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/root_folder"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取根目录失败: {response.get('message', '未知错误')}")
+    
+    def get_parent_folder(self, file_id: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/parent_folder"
+        
+        response = self.http_client.get(endpoint, params={"file_id": file_id})
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取父目录失败: {response.get('message', '未知错误')}")
+    
+    def get_all_parent_folders(self, file_id: str) -> List[Dict[str, Any]]:
+        endpoint = "/api/v1/file/all_parent_folder"
+        
+        response = self.http_client.get(endpoint, params={"file_id": file_id})
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取所有父目录失败: {response.get('message', '未知错误')}")
+    
+    def get_file(self, file_id: str) -> Dict[str, Any]:
+        endpoint = f"/api/v1/file/get/{file_id}"
+        
+        response = self.http_client.get(endpoint)
+        
+        if response.get("code") == 0 and response.get("data"):
+            return response["data"]
+        else:
+            raise Exception(f"获取文件失败: {response.get('message', '未知错误')}")
+    
+    def upload_file(self, file_path: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/upload"
+        
+        with open(file_path, 'rb') as f:
+            # Let requests generate the multipart Content-Type and boundary;
+            # a hand-set header would omit the boundary and fail.
+            files = {'file': (os.path.basename(file_path), f)}
+            response = self.http_client.post(endpoint, files=files)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"Failed to upload file: {response.get('message', 'unknown error')}")
+    
+    def create_file(self, file_id: str, tenant_id: str = None) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/create"
+        
+        data = {"file_id": file_id}
+        if tenant_id is not None:
+            data["tenant_id"] = tenant_id
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"创建文件失败: {response.get('message', '未知错误')}")
+    
+    def delete_file(self, file_id: str) -> bool:
+        endpoint = "/api/v1/file/rm"
+        
+        response = self.http_client.post(endpoint, json={"file_id": file_id})
+        
+        if response.get("code") == 0:
+            return True
+        else:
+            raise Exception(f"删除文件失败: {response.get('message', '未知错误')}")
+    
+    def rename_file(self, file_id: str, new_name: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/rename"
+        
+        data = {
+            "file_id": file_id,
+            "new_name": new_name
+        }
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"重命名文件失败: {response.get('message', '未知错误')}")
+    
+    def move_file(self, file_id: str, parent_id: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/mv"
+        
+        data = {
+            "file_id": file_id,
+            "parent_id": parent_id
+        }
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"移动文件失败: {response.get('message', '未知错误')}")
+    
+    def convert_file(self, file_id: str) -> Dict[str, Any]:
+        endpoint = "/api/v1/file/convert"
+        
+        response = self.http_client.post(endpoint, json={"file_id": file_id})
+        
+        if response.get("code") == 0:
+            return response.get("data", {})
+        else:
+            raise Exception(f"转换文件失败: {response.get('message', '未知错误')}")

+ 45 - 0
services/ragflow/openai_service.py

@@ -0,0 +1,45 @@
+from typing import Dict, Any, List, Optional
+
+class OpenAICompatibleService:
+    def __init__(self, http_client):
+        self.http_client = http_client
+    
+    def chat_completion(self, chat_id: str, messages: List[Dict[str, Any]], 
+                       stream: bool = False, model: str = "model",
+                       extra_body: Dict = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/chats_openai/{chat_id}/chat/completions"
+        
+        data = {
+            "model": model,
+            "messages": messages,
+            "stream": stream
+        }
+        if extra_body is not None:
+            data["extra_body"] = extra_body
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", response)
+        else:
+            raise Exception(f"聊天完成失败: {response.get('message', '未知错误')}")
+    
+    def agent_completion(self, agent_id: str, messages: List[Dict[str, Any]], 
+                        stream: bool = False, model: str = "model",
+                        session_id: str = None) -> Dict[str, Any]:
+        endpoint = f"/api/v1/agents_openai/{agent_id}/chat/completions"
+        
+        data = {
+            "model": model,
+            "messages": messages,
+            "stream": stream
+        }
+        if session_id is not None:
+            data["session_id"] = session_id
+        
+        response = self.http_client.post(endpoint, json=data)
+        
+        if response.get("code") == 0:
+            return response.get("data", response)
+        else:
+            raise Exception(f"代理完成失败: {response.get('message', '未知错误')}")

+ 298 - 0
services/ragflow/ragflow_service.py

@@ -0,0 +1,298 @@
+import sys
+import os
+from typing import Dict, Any, List, Optional
+from dataclasses import dataclass
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from services.utils.http_client import HTTPClient
+from conf.config import ModelConfig
+from services.ragflow.dataset_service import DatasetService
+from services.ragflow.document_service import DocumentService
+from services.ragflow.chunk_service import ChunkService
+from services.ragflow.chat_service import ChatService
+from services.ragflow.agent_service import AgentService
+from services.ragflow.file_service import FileService
+from services.ragflow.openai_service import OpenAICompatibleService
+
+@dataclass
+class DocumentInfo:
+    id: str
+    name: str
+    type: str
+    size: int
+    location: str
+    dataset_id: str
+    chunk_method: str
+    chunk_count: Optional[int] = None
+    token_count: Optional[int] = None
+    run: str = "UNSTART"
+    status: str = "1"
+
+@dataclass
+class ChunkInfo:
+    id: str
+    document_id: str
+    content: str
+    document_name: str
+    dataset_id: str
+    similarity: float = 0.0
+    vector_similarity: float = 0.0
+    term_similarity: float = 0.0
+
+@dataclass
+class DatasetInfo:
+    id: str
+    name: str
+    description: Optional[str] = None
+    embedding_model: Optional[str] = None
+    permission: Optional[str] = None
+    chunk_method: Optional[str] = None
+    chunk_count: int = 0
+    document_count: int = 0
+    token_count: int = 0
+    status: str = "1"
+
+@dataclass
+class ChatInfo:
+    id: str
+    name: str
+    dataset_ids: List[str]
+    llm: Dict[str, Any]
+    prompt: str
+
+@dataclass
+class AgentInfo:
+    id: str
+    name: str
+    llm: Dict[str, Any]
+    description: Optional[str] = None
+
+@dataclass
+class FileInfo:
+    id: str
+    parent_id: str
+    name: str
+    type: str
+    size: int
+
+class RAGFlowService:
+    def __init__(self, base_url: str = None, api_key: str = None):
+        base_url = base_url or ModelConfig.get_ragflow_api_url()
+        api_key = api_key or ModelConfig.get_ragflow_api_key()
+        self.http_client = HTTPClient(base_url=base_url, api_key=api_key)
+        
+        self.dataset_service = DatasetService(self.http_client)
+        self.document_service = DocumentService(self.http_client)
+        self.chunk_service = ChunkService(self.http_client)
+        self.chat_service = ChatService(self.http_client)
+        self.agent_service = AgentService(self.http_client)
+        self.file_service = FileService(self.http_client)
+        self.openai_service = OpenAICompatibleService(self.http_client)
+    
+    def create_dataset(self, name: str, description: str = None, 
+                      embedding_model: str = None, permission: str = None,
+                      chunk_method: str = None) -> DatasetInfo:
+        return self.dataset_service.create_dataset(name, description, embedding_model, permission, chunk_method)
+    
+    def delete_datasets(self, dataset_ids: List[str]) -> bool:
+        return self.dataset_service.delete_datasets(dataset_ids)
+    
+    def update_dataset(self, dataset_id: str, name: str = None, 
+                      description: str = None, embedding_model: str = None,
+                      permission: str = None, chunk_method: str = None) -> DatasetInfo:
+        return self.dataset_service.update_dataset(dataset_id, name, description, embedding_model, permission, chunk_method)
+    
+    def list_datasets(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                     desc: bool = True, name: str = None, dataset_id: str = None) -> List[DatasetInfo]:
+        return self.dataset_service.list_datasets(page, size, orderby, desc, name, dataset_id)
+    
+    def get_dataset(self, dataset_id: str) -> DatasetInfo:
+        return self.dataset_service.get_dataset(dataset_id)
+    
+    def get_knowledge_graph(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.get_knowledge_graph(dataset_id)
+    
+    def delete_knowledge_graph(self, dataset_id: str) -> bool:
+        return self.dataset_service.delete_knowledge_graph(dataset_id)
+    
+    def trace_graphrag(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.trace_graphrag(dataset_id)
+    
+    def trace_raptor(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.trace_raptor(dataset_id)
+    
+    def get_metadata_summary(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.get_metadata_summary(dataset_id)
+    
+    def update_metadata(self, dataset_id: str, metadata: Dict = None,
+                       document_ids: List[str] = None, metadata_condition: Dict = None) -> bool:
+        return self.dataset_service.update_metadata(dataset_id, metadata, document_ids, metadata_condition)
+    
+    def run_graphrag(self, dataset_id: str, mode: str = "light") -> Dict[str, Any]:
+        return self.dataset_service.run_graphrag(dataset_id, mode)
+    
+    def run_raptor(self, dataset_id: str) -> Dict[str, Any]:
+        return self.dataset_service.run_raptor(dataset_id)
+    
+    def upload_document(self, dataset_id: str, file_path: str) -> List[DocumentInfo]:
+        return self.document_service.upload_document(dataset_id, file_path)
+    
+    def update_document(self, dataset_id: str, document_id: str, 
+                       name: str = None, meta_fields: Dict = None, 
+                       chunk_method: str = None, parser_config: Dict = None,
+                       enabled: int = None) -> DocumentInfo:
+        return self.document_service.update_document(dataset_id, document_id, name, meta_fields, chunk_method, parser_config, enabled)
+    
+    def delete_document(self, dataset_id: str, document_id: str) -> bool:
+        return self.document_service.delete_document(dataset_id, document_id)
+    
+    def delete_documents(self, dataset_id: str, document_ids: List[str]) -> bool:
+        return self.document_service.delete_documents(dataset_id, document_ids)
+    
+    def get_document(self, dataset_id: str, document_id: str) -> DocumentInfo:
+        return self.document_service.get_document(dataset_id, document_id)
+    
+    def list_documents(self, dataset_id: str, page: int = 1, size: int = 20,
+                      keywords: str = None, document_id: str = None, document_name: str = None,
+                      suffix: str = None, run: str = None) -> List[DocumentInfo]:
+        return self.document_service.list_documents(dataset_id, page, size, keywords, document_id, document_name, suffix, run)
+    
+    def get_document_chunks(self, dataset_id: str, document_id: str,
+                           keywords: str = None, page: int = 1, size: int = 20,
+                           chunk_id: str = None) -> List[ChunkInfo]:
+        return self.document_service.get_document_chunks(dataset_id, document_id, keywords, page, size, chunk_id)
+    
+    def parse_document(self, dataset_id: str, document_ids: List[str]) -> bool:
+        return self.document_service.parse_document(dataset_id, document_ids)
+    
+    def create_chunk(self, dataset_id: str, document_id: str, content: str, 
+                    meta_fields: Dict = None) -> ChunkInfo:
+        return self.chunk_service.create_chunk(dataset_id, document_id, content, meta_fields)
+    
+    def update_chunk(self, dataset_id: str, chunk_id: str, content: str = None,
+                    meta_fields: Dict = None) -> ChunkInfo:
+        return self.chunk_service.update_chunk(dataset_id, chunk_id, content, meta_fields)
+    
+    def delete_chunk(self, dataset_id: str, chunk_id: str) -> bool:
+        return self.chunk_service.delete_chunk(dataset_id, chunk_id)
+    
+    def delete_chunks(self, dataset_id: str, document_id: str, chunk_ids: List[str]) -> bool:
+        return self.chunk_service.delete_chunks(dataset_id, document_id, chunk_ids)
+    
+    def retrieval(self, dataset_ids: List[str], query: str, top_k: int = 5,
+                 similarity_threshold: float = 0.1, vector_similarity_weight: float = 0.3,
+                 refine: bool = False) -> List[ChunkInfo]:
+        return self.chunk_service.retrieval(dataset_ids, query, top_k, similarity_threshold, vector_similarity_weight, refine)
+    
+    def create_chat(self, name: str, dataset_ids: List[str], llm: Dict[str, Any],
+                   prompt: str = None) -> ChatInfo:
+        return self.chat_service.create_chat(name, dataset_ids, llm, prompt)
+    
+    def update_chat(self, chat_id: str, name: str = None, dataset_ids: List[str] = None,
+                   llm: Dict[str, Any] = None, prompt: str = None) -> ChatInfo:
+        return self.chat_service.update_chat(chat_id, name, dataset_ids, llm, prompt)
+    
+    def delete_chats(self, chat_ids: List[str]) -> bool:
+        return self.chat_service.delete_chats(chat_ids)
+    
+    def list_chats(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                  desc: bool = True, name: str = None, chat_id: str = None) -> List[ChatInfo]:
+        return self.chat_service.list_chats(page, size, orderby, desc, name, chat_id)
+    
+    def create_chat_session(self, chat_id: str, name: str = None) -> Dict[str, Any]:
+        return self.chat_service.create_chat_session(chat_id, name)
+    
+    def update_chat_session(self, chat_id: str, session_id: str, 
+                           name: str = None, message: List[Dict] = None) -> Dict[str, Any]:
+        return self.chat_service.update_chat_session(chat_id, session_id, name, message)
+    
+    def list_chat_sessions(self, chat_id: str, page: int = 1, size: int = 20,
+                          orderby: str = "create_time", desc: bool = True,
+                          session_id: str = None, session_name: str = None) -> List[Dict[str, Any]]:
+        return self.chat_service.list_chat_sessions(chat_id, page, size, orderby, desc, session_id, session_name)
+    
+    def delete_chat_session(self, chat_id: str, session_id: str) -> bool:
+        return self.chat_service.delete_chat_session(chat_id, session_id)
+    
+    def chat_completion(self, chat_id: str, query: str, stream: bool = False,
+                       session_id: str = None) -> Dict[str, Any]:
+        return self.chat_service.chat_completion(chat_id, query, stream, session_id)
+    
+    def create_agent(self, name: str, llm: Dict[str, Any], description: str = None) -> AgentInfo:
+        return self.agent_service.create_agent(name, llm, description)
+    
+    def update_agent(self, agent_id: str, name: str = None, llm: Dict[str, Any] = None,
+                    description: str = None) -> AgentInfo:
+        return self.agent_service.update_agent(agent_id, name, llm, description)
+    
+    def delete_agent(self, agent_id: str) -> bool:
+        return self.agent_service.delete_agent(agent_id)
+    
+    def list_agents(self, page: int = 1, size: int = 20, orderby: str = "create_time",
+                   desc: bool = True, name: str = None, agent_id: str = None) -> List[AgentInfo]:
+        return self.agent_service.list_agents(page, size, orderby, desc, name, agent_id)
+    
+    def create_agent_session(self, agent_id: str, name: str = None) -> Dict[str, Any]:
+        return self.agent_service.create_agent_session(agent_id, name)
+    
+    def list_agent_sessions(self, agent_id: str, page: int = 1, size: int = 20,
+                           orderby: str = "create_time", desc: bool = True,
+                           session_id: str = None, user_id: str = None,
+                           dsl: str = None) -> List[Dict[str, Any]]:
+        return self.agent_service.list_agent_sessions(agent_id, page, size, orderby, desc, session_id, user_id, dsl)
+    
+    def delete_agent_session(self, agent_id: str, session_id: str) -> bool:
+        return self.agent_service.delete_agent_session(agent_id, session_id)
+    
+    def agent_completion(self, agent_id: str, query: str, stream: bool = False,
+                        session_id: str = None) -> Dict[str, Any]:
+        return self.agent_service.agent_completion(agent_id, query, stream, session_id)
+    
+    def get_related_questions(self, dataset_id: str, question: str, top: int = 10) -> List[str]:
+        return self.agent_service.get_related_questions(dataset_id, question, top)
+    
+    def list_files(self, parent_id: str = None, keywords: str = None,
+                  page: int = 1, size: int = 20, orderby: str = "create_time",
+                  desc: bool = True) -> List[FileInfo]:
+        return self.file_service.list_files(parent_id, keywords, page, size, orderby, desc)
+    
+    def get_root_folder(self) -> Dict[str, Any]:
+        return self.file_service.get_root_folder()
+    
+    def get_parent_folder(self, file_id: str) -> Dict[str, Any]:
+        return self.file_service.get_parent_folder(file_id)
+    
+    def get_all_parent_folders(self, file_id: str) -> List[Dict[str, Any]]:
+        return self.file_service.get_all_parent_folders(file_id)
+    
+    def get_file(self, file_id: str) -> Dict[str, Any]:
+        return self.file_service.get_file(file_id)
+    
+    def upload_file(self, file_path: str) -> Dict[str, Any]:
+        return self.file_service.upload_file(file_path)
+    
+    def create_file(self, file_id: str, tenant_id: str = None) -> Dict[str, Any]:
+        return self.file_service.create_file(file_id, tenant_id)
+    
+    def delete_file(self, file_id: str) -> bool:
+        return self.file_service.delete_file(file_id)
+    
+    def rename_file(self, file_id: str, new_name: str) -> Dict[str, Any]:
+        return self.file_service.rename_file(file_id, new_name)
+    
+    def move_file(self, file_id: str, parent_id: str) -> Dict[str, Any]:
+        return self.file_service.move_file(file_id, parent_id)
+    
+    def convert_file(self, file_id: str) -> Dict[str, Any]:
+        return self.file_service.convert_file(file_id)
+    
+    def openai_chat_completion(self, chat_id: str, messages: List[Dict[str, Any]], 
+                              stream: bool = False, model: str = "model",
+                              extra_body: Dict = None) -> Dict[str, Any]:
+        return self.openai_service.chat_completion(chat_id, messages, stream, model, extra_body)
+    
+    def openai_agent_completion(self, agent_id: str, messages: List[Dict[str, Any]], 
+                               stream: bool = False, model: str = "model",
+                               session_id: str = None) -> Dict[str, Any]:
+        return self.openai_service.agent_completion(agent_id, messages, stream, model, session_id)
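
The facade wires every sub-service to a single `HTTPClient`, so an end-to-end ingest-and-retrieve flow stays short. A sketch with placeholder values, assuming `create_dataset` returns the `DatasetInfo` its annotation promises:

```python
from services.ragflow.ragflow_service import RAGFlowService

rag = RAGFlowService(base_url="http://localhost:9380", api_key="ragflow-xxx")
ds = rag.create_dataset("manuals", chunk_method="naive")
rag.upload_document(ds.id, "./manual.pdf")
hits = rag.retrieval([ds.id], "How do I reset the device?", top_k=5)
```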

BIN
services/utils/__pycache__/decorators.cpython-312.pyc


BIN
services/utils/__pycache__/es_conn.cpython-312.pyc


BIN
services/utils/__pycache__/http_client.cpython-312.pyc


BIN
services/utils/__pycache__/vector_db.cpython-312.pyc


+ 13 - 0
services/utils/decorators.py

@@ -0,0 +1,13 @@
+# Singleton decorator
+class singleton:
+    """
+    Singleton decorator: ensures a class has only one instance
+    """
+    def __init__(self, cls):
+        self.cls = cls
+        self._instance = None
+    
+    def __call__(self, *args, **kwargs):
+        if self._instance is None:
+            self._instance = self.cls(*args, **kwargs)
+        return self._instance
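
Worth noting before reusing the decorator: the first construction wins, and later constructor arguments are silently ignored. A self-contained sketch:

```python
from services.utils.decorators import singleton

@singleton
class Config:
    def __init__(self, name):
        self.name = name

a = Config("first")
b = Config("second")          # arguments ignored; cached instance returned
assert a is b and b.name == "first"
```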

+ 17 - 0
services/utils/es/__init__.py

@@ -0,0 +1,17 @@
+"""
+Elasticsearch utilities module
+"""
+
+from .base import ESConnection
+from .constants import ES_DEFAULT_CONFIG
+from .document import DocumentManager
+from .index import IndexManager
+from .search import SearchManager
+
+__all__ = [
+    "ESConnection",
+    "ES_DEFAULT_CONFIG",
+    "DocumentManager",
+    "IndexManager",
+    "SearchManager"
+]

BIN
services/utils/es/__pycache__/__init__.cpython-312.pyc


BIN
services/utils/es/__pycache__/base.cpython-312.pyc


BIN
services/utils/es/__pycache__/constants.cpython-312.pyc


BIN
services/utils/es/__pycache__/document.cpython-312.pyc


BIN
services/utils/es/__pycache__/index.cpython-312.pyc


BIN
services/utils/es/__pycache__/search.cpython-312.pyc


BIN
services/utils/es/__pycache__/templates.cpython-312.pyc


+ 68 - 0
services/utils/es/base.py

@@ -0,0 +1,68 @@
+"""
+Elasticsearch 连接基础类
+"""
+from typing import List, Dict, Any, Optional
+from elasticsearch import Elasticsearch
+from elastic_transport import ConnectionTimeout
+from services.utils.decorators import singleton
+from services.utils.es.constants import ES_DEFAULT_CONFIG
+from services.utils.es.templates import get_dynamic_templates
+
+
+@singleton
+class ESConnection:
+    """
+    Elasticsearch 连接管理器
+    支持:
+    - 单例模式
+    - 连接池管理
+    - 基础配置管理
+    """
+    
+    def __init__(self, hosts: List[str] = None, **kwargs):
+        """
+        初始化 Elasticsearch 连接
+        
+        Args:
+            hosts: Elasticsearch 主机列表,格式如 ["http://localhost:9200"]
+            **kwargs: 其他 Elasticsearch 客户端配置参数
+        """
+        # 合并配置
+        self.config = {**ES_DEFAULT_CONFIG, **kwargs}
+        self.hosts = hosts or ES_DEFAULT_CONFIG.get("hosts", ["http://localhost:9200"])
+        
+        # 初始化 Elasticsearch 客户端
+        self.es = Elasticsearch(
+            hosts=self.hosts,
+            **self.config
+        )
+        
+        # 动态模板映射
+        self.dynamic_templates = get_dynamic_templates()
+    
+    def ping(self) -> bool:
+        """
+        Check whether the ES connection is healthy
+        
+        Returns:
+            bool: True if the cluster is reachable
+        """
+        try:
+            return self.es.ping()
+        except Exception:
+            return False
+    
+    def get_client(self) -> Elasticsearch:
+        """
+        Get the underlying ES client instance
+        
+        Returns:
+            Elasticsearch: the client instance
+        """
+        return self.es
+    
+    def close(self):
+        """
+        Close the Elasticsearch connection
+        """
+        self.es.close()
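
Because of the singleton decorator, every module that constructs `ESConnection` shares one client. A sketch (the cluster address is a placeholder):

```python
from services.utils.es.base import ESConnection

conn = ESConnection(hosts=["http://localhost:9200"])
if conn.ping():
    es = conn.get_client()    # raw elasticsearch-py client for anything not wrapped
    print(es.info()["cluster_name"])
```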

+ 25 - 0
services/utils/es/constants.py

@@ -0,0 +1,25 @@
+"""
+Elasticsearch 常量配置
+"""
+
+# 默认配置
+ES_DEFAULT_CONFIG = {
+    "http_compress": True,
+    "max_retries": 3,
+    "retry_on_timeout": True,
+    "timeout": 60,
+    "sniff_on_start": False,
+    "sniff_on_connection_fail": False,
+    "sniffer_timeout": 0,
+    "connections_per_node": 5,  # 每个节点的连接数
+    "randomize_nodes_in_pool": True
+}
+
+# 连接池大小
+ES_CONNECTIONS_PER_NODE = 5
+
+# 默认超时时间
+ES_DEFAULT_TIMEOUT = 60
+
+# 默认主机
+ES_DEFAULT_HOSTS = ["http://localhost:9200"]

+ 192 - 0
services/utils/es/document.py

@@ -0,0 +1,192 @@
+"""
+Elasticsearch document management
+"""
+from typing import List, Dict, Any, Optional
+from elasticsearch.helpers import bulk, BulkIndexError
+from elasticsearch.exceptions import NotFoundError
+from services.utils.es.base import ESConnection
+
+
+class DocumentManager:
+    """
+    Elasticsearch document manager
+    Responsible for:
+    - document insertion (single and bulk)
+    - document updates
+    - document deletion (single and bulk)
+    - document retrieval
+    """
+    
+    def __init__(self, es_connection: Optional[ESConnection] = None):
+        """
+        Initialize the document manager
+        
+        Args:
+            es_connection: optional ES connection instance
+        """
+        self.es_conn = es_connection or ESConnection()
+        self.es = self.es_conn.get_client()
+    
+    def insert(self, index_name: str, document: Dict[str, Any], id: str = None, refresh: bool = False) -> bool:
+        """
+        Insert a single document
+        
+        Args:
+            index_name: index name
+            document: document body
+            id: optional document ID
+            refresh: refresh the index immediately
+        
+        Returns:
+            bool: True on success
+        """
+        try:
+            self.es.index(index=index_name, body=document, id=id, refresh=refresh)
+            return True
+        except Exception as e:
+            print(f"Failed to insert document: {e}")
+            return False
+    
+    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]], refresh: bool = False) -> Dict[str, Any]:
+        """
+        Bulk-insert documents
+        
+        Args:
+            index_name: index name
+            documents: list of documents; a document may carry an "_id" field to set its ID
+            refresh: refresh the index immediately
+        
+        Returns:
+            Dict: success / failure summary
+        """
+        try:
+            # Build the bulk actions
+            actions = []
+            for doc in documents:
+                action = {
+                    "_index": index_name,
+                    "_source": doc.copy()
+                }
+                # Promote an "_id" field to the document ID
+                if "_id" in doc:
+                    action["_id"] = doc["_id"]
+                    del action["_source"]["_id"]
+                actions.append(action)
+            
+            # Execute the bulk request
+            success, failed = bulk(self.es, actions, refresh=refresh, stats_only=False)
+            
+            return {
+                "success": success,
+                "failed": len(failed) if failed else 0,
+                "errors": failed if failed else []
+            }
+        except BulkIndexError as e:
+            print(f"Bulk insert failed: {e}")
+            return {
+                "success": 0,
+                "failed": len(e.errors),
+                "errors": e.errors
+            }
+        except Exception as e:
+            print(f"Bulk insert failed: {e}")
+            return {
+                "success": 0,
+                "failed": len(documents),
+                "errors": [str(e)] * len(documents)
+            }
+    
+    def update(self, index_name: str, id: str, update_body: Dict[str, Any], refresh: bool = False) -> bool:
+        """
+        Update a document
+        
+        Args:
+            index_name: index name
+            id: document ID
+            update_body: update payload, e.g. {"doc": {"field": "value"}}
+            refresh: refresh the index immediately
+        
+        Returns:
+            bool: True on success
+        """
+        try:
+            self.es.update(index=index_name, id=id, body=update_body, refresh=refresh)
+            return True
+        except NotFoundError:
+            print(f"Document not found: {id}")
+            return False
+        except Exception as e:
+            print(f"Failed to update document: {e}")
+            return False
+    
+    def delete(self, index_name: str, id: str, refresh: bool = False) -> bool:
+        """
+        Delete a single document
+        
+        Args:
+            index_name: index name
+            id: document ID
+            refresh: refresh the index immediately
+        
+        Returns:
+            bool: True on success
+        """
+        try:
+            self.es.delete(index=index_name, id=id, refresh=refresh)
+            return True
+        except NotFoundError:
+            print(f"Document not found: {id}")
+            return False
+        except Exception as e:
+            print(f"Failed to delete document: {e}")
+            return False
+    
+    def delete_by_query(self, index_name: str, query: Dict[str, Any], refresh: bool = False) -> Dict[str, Any]:
+        """
+        Delete documents matching a query
+        
+        Args:
+            index_name: index name
+            query: query clause
+            refresh: refresh the index immediately
+        
+        Returns:
+            Dict: deletion summary
+        """
+        try:
+            result = self.es.delete_by_query(index=index_name, body={"query": query}, refresh=refresh)
+            return {
+                "deleted": result["deleted"],
+                "failed": 0
+            }
+        except Exception as e:
+            print(f"Delete-by-query failed: {e}")
+            return {
+                "deleted": 0,
+                "failed": 1,
+                "error": str(e)
+            }
+    
+    def get(self, index_name: str, id: str, fields: List[str] = None) -> Optional[Dict[str, Any]]:
+        """
+        Get a single document
+        
+        Args:
+            index_name: index name
+            id: document ID
+            fields: optional list of fields to return
+        
+        Returns:
+            Dict: the document source, or None if it does not exist
+        """
+        try:
+            params = {}
+            if fields:
+                # elasticsearch-py 8.x renamed the "_source" kwarg to "source"
+                params["source"] = fields
+            result = self.es.get(index=index_name, id=id, **params)
+            return result["_source"]
+        except NotFoundError:
+            return None
+        except Exception as e:
+            print(f"Failed to get document: {e}")
+            return None
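
A sketch of the `_id` promotion in `bulk_insert`. The index name and fields are placeholders; the `_tks`/`_int` suffixes match the dynamic templates defined later in this commit:

```python
from services.utils.es.document import DocumentManager

docs = DocumentManager()   # reuses the singleton ESConnection
report = docs.bulk_insert(
    "articles",
    [
        {"_id": "a1", "title_tks": "hello world", "views_int": 10},
        {"title_tks": "auto id", "views_int": 3},   # no "_id": ES assigns one
    ],
    refresh=True,
)
print(report["success"], report["failed"])
```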

+ 131 - 0
services/utils/es/index.py

@@ -0,0 +1,131 @@
+"""
+Elasticsearch index management
+"""
+from typing import Dict, Any, Optional
+from services.utils.es.base import ESConnection
+
+
+class IndexManager:
+    """
+    Elasticsearch index manager
+    Responsible for:
+    - index creation
+    - index deletion
+    - index existence checks
+    """
+    
+    def __init__(self, es_connection: Optional[ESConnection] = None):
+        """
+        Initialize the index manager
+        
+        Args:
+            es_connection: optional ES connection instance
+        """
+        self.es_conn = es_connection or ESConnection()
+        self.es = self.es_conn.get_client()
+    
+    def create_index(self, index_name: str, mappings: Dict[str, Any] = None, settings: Dict[str, Any] = None) -> bool:
+        """
+        Create an index
+        
+        Args:
+            index_name: index name
+            mappings: custom mappings, merged with the dynamic templates
+            settings: index settings
+        
+        Returns:
+            bool: True on success
+        """
+        try:
+            # If the index already exists, treat it as success
+            if self.es.indices.exists(index=index_name):
+                return True
+            
+            # Merge the dynamic templates with the custom mappings.
+            # Build a new list instead of extending in place: the shallow
+            # copy still shares the template list with the singleton
+            # connection, so "+=" would mutate it for every later caller.
+            final_mappings = self.es_conn.dynamic_templates.copy()
+            if mappings:
+                if "dynamic_templates" in mappings:
+                    final_mappings["dynamic_templates"] = (
+                        final_mappings["dynamic_templates"] + mappings["dynamic_templates"]
+                    )
+                if "properties" in mappings:
+                    final_mappings["properties"] = mappings["properties"]
+            
+            body = {}
+            if settings:
+                body["settings"] = settings
+            body["mappings"] = final_mappings
+            
+            self.es.indices.create(index=index_name, body=body)
+            return True
+        except Exception as e:
+            print(f"Failed to create index: {e}")
+            return False
+    
+    def delete_index(self, index_name: str) -> bool:
+        """
+        Delete an index
+        
+        Args:
+            index_name: index name
+        
+        Returns:
+            bool: True on success
+        """
+        try:
+            if self.es.indices.exists(index=index_name):
+                self.es.indices.delete(index=index_name)
+            return True
+        except Exception as e:
+            print(f"Failed to delete index: {e}")
+            return False
+    
+    def exists(self, index_name: str) -> bool:
+        """
+        Check whether an index exists
+        
+        Args:
+            index_name: index name
+        
+        Returns:
+            bool: True if the index exists
+        """
+        try:
+            return self.es.indices.exists(index=index_name)
+        except Exception as e:
+            print(f"Failed to check index existence: {e}")
+            return False
+    
+    def get_mappings(self, index_name: str) -> Optional[Dict[str, Any]]:
+        """
+        Get the index mappings
+        
+        Args:
+            index_name: index name
+        
+        Returns:
+            Dict[str, Any]: the mappings, or None if the index does not exist
+        """
+        try:
+            if self.exists(index_name):
+                return self.es.indices.get_mapping(index=index_name)
+            return None
+        except Exception as e:
+            print(f"Failed to get index mappings: {e}")
+            return None
+    
+    def get_settings(self, index_name: str) -> Optional[Dict[str, Any]]:
+        """
+        Get the index settings
+        
+        Args:
+            index_name: index name
+        
+        Returns:
+            Dict[str, Any]: the settings, or None if the index does not exist
+        """
+        try:
+            if self.exists(index_name):
+                return self.es.indices.get_settings(index=index_name)
+            return None
+        except Exception as e:
+            print(f"Failed to get index settings: {e}")
+            return None
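
A sketch of merging explicit `properties` with the suffix-driven dynamic templates. Note that the `*_tks` template references a custom `scripted_sim` similarity, which a real cluster must define in the index settings:

```python
from services.utils.es.index import IndexManager

idx = IndexManager()
idx.create_index(
    "articles",
    mappings={"properties": {"body": {"type": "text"}}},
    settings={"number_of_shards": 1},   # the "scripted_sim" similarity would also go here
)
```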

+ 202 - 0
services/utils/es/search.py

@@ -0,0 +1,202 @@
+"""
+Elasticsearch search management
+"""
+from typing import List, Dict, Any, Optional
+from services.utils.es.base import ESConnection
+
+
+class SearchManager:
+    """
+    Elasticsearch search manager
+    Responsible for:
+    - full-text search
+    - vector similarity search (k-NN)
+    - hybrid search (text + vector)
+    - highlighting
+    """
+    
+    def __init__(self, es_connection: Optional[ESConnection] = None):
+        """
+        Initialize the search manager
+        
+        Args:
+            es_connection: optional ES connection instance
+        """
+        self.es_conn = es_connection or ESConnection()
+        self.es = self.es_conn.get_client()
+    
+    def search(self, index_name: str, query: Dict[str, Any], size: int = 10, from_: int = 0, 
+               fields: List[str] = None, highlight: Dict[str, Any] = None) -> Dict[str, Any]:
+        """
+        Search documents
+        
+        Args:
+            index_name: index name
+            query: query clause
+            size: number of results to return
+            from_: starting offset
+            fields: optional list of fields to return
+            highlight: optional highlight configuration
+        
+        Returns:
+            Dict: search result
+        """
+        try:
+            body = {
+                "query": query,
+                "size": size,
+                "from": from_
+            }
+            
+            if fields:
+                body["_source"] = fields
+            
+            if highlight:
+                body["highlight"] = highlight
+            
+            result = self.es.search(index=index_name, body=body)
+            return result
+        except Exception as e:
+            print(f"Search failed: {e}")
+            return {"hits": {"total": 0, "hits": []}}
+    
+    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
+                     size: int = 10, from_: int = 0, fields: List[str] = None, 
+                     text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
+        """
+        Hybrid search: vector similarity + full-text
+        
+        Args:
+            index_name: index name
+            text_query: full-text query string
+            vector_field: vector field name
+            vector: query vector
+            size: number of results to return
+            from_: starting offset
+            fields: optional list of fields to return
+            text_weight: boost of the text clause
+            vector_weight: boost of the vector clause
+        
+        Returns:
+            Dict: search result
+        """
+        try:
+            # Build the hybrid query
+            query = {
+                "bool": {
+                    "should": [
+                        {
+                            "query_string": {
+                                "query": text_query,
+                                "default_operator": "OR",
+                                "boost": text_weight
+                            }
+                        },
+                        {
+                            "script_score": {
+                                "query": {
+                                    "match_all": {}
+                                },
+                                "script": {
+                                    "source": "cosineSimilarity(params.query_vector, doc[params.vector_field]) + 1.0",
+                                    "params": {
+                                        "query_vector": vector,
+                                        "vector_field": vector_field
+                                    }
+                                },
+                                "boost": vector_weight
+                            }
+                        }
+                    ]
+                }
+            }
+            
+            body = {
+                "query": query,
+                "size": size,
+                "from": from_
+            }
+            
+            if fields:
+                body["_source"] = fields
+            
+            result = self.es.search(index=index_name, body=body)
+            return result
+        except Exception as e:
+            print(f"Hybrid search failed: {e}")
+            return {"hits": {"total": 0, "hits": []}}
+    
+    def knn_search(self, index_name: str, vector_field: str, vector: List[float], 
+                  k: int = 10, filter_query: Dict[str, Any] = None) -> Dict[str, Any]:
+        """
+        Vector similarity search (k-NN)
+        
+        Args:
+            index_name: index name
+            vector_field: vector field name
+            vector: query vector
+            k: number of results to return
+            filter_query: optional filter clause
+        
+        Returns:
+            Dict: search result
+        """
+        try:
+            knn = {
+                "field": vector_field,
+                "query_vector": vector,
+                "k": k,
+                "num_candidates": k * 10
+            }
+            
+            if filter_query:
+                knn["filter"] = filter_query
+            
+            body = {
+                "knn": knn
+            }
+            
+            result = self.es.search(index=index_name, body=body)
+            return result
+        except Exception as e:
+            print(f"k-NN search failed: {e}")
+            return {"hits": {"total": 0, "hits": []}}
+    
+    def match_search(self, index_name: str, field: str, value: str, size: int = 10, 
+                     fields: List[str] = None) -> Dict[str, Any]:
+        """
+        Simple match search
+        
+        Args:
+            index_name: index name
+            field: field name
+            value: value to match
+            size: number of results to return
+            fields: optional list of fields to return
+        
+        Returns:
+            Dict: search result
+        """
+        query = {
+            "match": {
+                field: value
+            }
+        }
+        return self.search(index_name, query, size=size, fields=fields)
+    
+    def match_all(self, index_name: str, size: int = 10, fields: List[str] = None) -> Dict[str, Any]:
+        """
+        Match all documents
+        
+        Args:
+            index_name: index name
+            size: number of results to return
+            fields: optional list of fields to return
+        
+        Returns:
+            Dict: search result
+        """
+        query = {
+            "match_all": {}
+        }
+        return self.search(index_name, query, size=size, fields=fields)
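
A sketch of the hybrid call. The weights act as relative boosts on the two `should` clauses rather than a normalized blend, and the vector below is a placeholder embedding:

```python
from services.utils.es.search import SearchManager

search = SearchManager()
result = search.hybrid_search(
    "articles",
    text_query="reset device",
    vector_field="q_1024_vec",   # matched by the *_1024_vec dynamic template
    vector=[0.0] * 1024,         # placeholder; use a real embedding
    text_weight=0.3,
    vector_weight=0.7,
)
for hit in result["hits"]["hits"]:
    print(hit["_score"], hit["_source"].get("title_tks"))
```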

+ 203 - 0
services/utils/es/templates.py

@@ -0,0 +1,203 @@
+"""
+Elasticsearch 动态模板映射
+"""
+from typing import Dict, Any
+
+
+def get_dynamic_templates() -> Dict[str, Any]:
+    """
+    获取动态模板映射配置
+    参考:d:/project/work/ragflow_plugs/book/es_dynamic.md
+    
+    Returns:
+        Dict[str, Any]: 动态模板映射配置
+    """
+    return {
+        "dynamic_templates": [
+            {
+                "int": {
+                    "match": "*_int",
+                    "mapping": {
+                        "store": True,
+                        "type": "integer"
+                    }
+                }
+            },
+            {
+                "ulong": {
+                    "match": "*_ulong",
+                    "mapping": {
+                        "store": True,
+                        "type": "unsigned_long"
+                    }
+                }
+            },
+            {
+                "long": {
+                    "match": "*_long",
+                    "mapping": {
+                        "store": True,
+                        "type": "long"
+                    }
+                }
+            },
+            {
+                "short": {
+                    "match": "*_short",
+                    "mapping": {
+                        "store": True,
+                        "type": "short"
+                    }
+                }
+            },
+            {
+                "numeric": {
+                    "match": "*_flt",
+                    "mapping": {
+                        "store": True,
+                        "type": "float"
+                    }
+                }
+            },
+            {
+                "tks": {
+                    "match": "*_tks",
+                    "mapping": {
+                        "analyzer": "whitespace",
+                        "similarity": "scripted_sim",
+                        "store": True,
+                        "type": "text"
+                    }
+                }
+            },
+            {
+                "ltks": {
+                    "match": "*_ltks",
+                    "mapping": {
+                        "analyzer": "whitespace",
+                        "store": True,
+                        "type": "text"
+                    }
+                }
+            },
+            {
+                "kwd": {
+                    "match": "^(.*_(kwd|id|ids|uid|uids)|uid)$",
+                    "match_pattern": "regex",
+                    "mapping": {
+                        "similarity": "boolean",
+                        "store": True,
+                        "type": "keyword"
+                    }
+                }
+            },
+            {
+                "dt": {
+                    "match": "^.*(_dt|_time|_at)$",
+                    "match_pattern": "regex",
+                    "mapping": {
+                        "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyy-MM-dd_HH:mm:ss",
+                        "store": True,
+                        "type": "date"
+                    }
+                }
+            },
+            {
+                "nested": {
+                    "match": "*_nst",
+                    "mapping": {
+                        "type": "nested"
+                    }
+                }
+            },
+            {
+                "object": {
+                    "match": "*_obj",
+                    "mapping": {
+                        "dynamic": True,
+                        "type": "object"
+                    }
+                }
+            },
+            {
+                "string": {
+                    "match": "^.*_(with_weight|list)$",
+                    "match_pattern": "regex",
+                    "mapping": {
+                        "index": False,
+                        "store": True,
+                        "type": "text"
+                    }
+                }
+            },
+            {
+                "rank_feature": {
+                    "match": "*_fea",
+                    "mapping": {
+                        "type": "rank_feature"
+                    }
+                }
+            },
+            {
+                "rank_features": {
+                    "match": "*_feas",
+                    "mapping": {
+                        "type": "rank_features"
+                    }
+                }
+            },
+            {
+                "dense_vector_512": {
+                    "match": "*_512_vec",
+                    "mapping": {
+                        "dims": 512,
+                        "index": True,
+                        "similarity": "cosine",
+                        "type": "dense_vector"
+                    }
+                }
+            },
+            {
+                "dense_vector_768": {
+                    "match": "*_768_vec",
+                    "mapping": {
+                        "dims": 768,
+                        "index": True,
+                        "similarity": "cosine",
+                        "type": "dense_vector"
+                    }
+                }
+            },
+            {
+                "dense_vector_1024": {
+                    "match": "*_1024_vec",
+                    "mapping": {
+                        "dims": 1024,
+                        "index": True,
+                        "similarity": "cosine",
+                        "type": "dense_vector"
+                    }
+                }
+            },
+            {
+                "dense_vector_1536": {
+                    "match": "*_1536_vec",
+                    "mapping": {
+                        "dims": 1536,
+                        "index": True,
+                        "similarity": "cosine",
+                        "type": "dense_vector"
+                    }
+                }
+            },
+            {
+                "binary": {
+                    "match": "*_bin",
+                    "mapping": {
+                        "type": "binary"
+                    }
+                }
+            }
+        ],
+        "date_detection": True
+    }
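
The practical effect of these templates: mapping is driven entirely by field-name suffixes, so no per-field mapping call is needed. A sketch of a document that exercises several of the rules above (field names and values are illustrative):

```python
# Suffixes decide the mapping, per the templates above:
doc = {
    "doc_id": "a1",                        # kwd regex -> keyword
    "title_tks": "user manual",            # *_tks -> whitespace-analyzed text
    "create_time": "2024-01-01 08:00:00",  # *_time suffix -> date
    "q_1024_vec": [0.0] * 1024,            # *_1024_vec -> dense_vector(1024, cosine)
}
```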

+ 138 - 0
services/utils/es_conn.py

@@ -0,0 +1,138 @@
+"""
+Elasticsearch 连接管理器(向后兼容接口)
+
+该文件提供与旧版 es_conn.py 兼容的接口,同时内部使用新的工程化模块。
+"""
+import re
+import json
+import time
+from typing import Any, List, Dict, Optional, Union
+from elasticsearch import Elasticsearch, helpers
+from elasticsearch.helpers import bulk, BulkIndexError
+from elastic_transport import ConnectionTimeout
+from elasticsearch.exceptions import NotFoundError
+
+from services.utils.es.base import ESConnection as _ESConnection
+from services.utils.es.index import IndexManager
+from services.utils.es.document import DocumentManager
+from services.utils.es.search import SearchManager
+
+# 单例装饰器
+class singleton:
+    def __init__(self, cls):
+        self.cls = cls
+        self._instance = None
+    
+    def __call__(self, *args, **kwargs):
+        if self._instance is None:
+            self._instance = self.cls(*args, **kwargs)
+        return self._instance
+
+@singleton
+class ESConnection:
+    """
+    Elasticsearch connection manager (backward compatible)
+    Supports:
+    - singleton pattern
+    - connection-pool management
+    - CRUD operations
+    - hybrid search (vector similarity + full text)
+    - dynamic template mappings
+    """
+    
+    def __init__(self, hosts: List[str] = None, **kwargs):
+        """
+        Initialize the Elasticsearch connection
+        
+        Args:
+            hosts: list of Elasticsearch hosts, e.g. ["http://localhost:9200"]
+            **kwargs: additional Elasticsearch client options
+        """
+        # Use the new ESConnection as the underlying connection
+        self._es_conn = _ESConnection(hosts=hosts, **kwargs)
+        
+        # Initialize the managers
+        self.index_manager = IndexManager(self._es_conn)
+        self.document_manager = DocumentManager(self._es_conn)
+        self.search_manager = SearchManager(self._es_conn)
+        
+        # Backward-compatible attributes
+        self.es = self._es_conn.get_client()
+        self.dynamic_templates = self._es_conn.dynamic_templates
+    
+    def _get_dynamic_templates(self) -> Dict[str, Any]:
+        """
+        Get the dynamic template mapping configuration (backward-compatible method)
+        """
+        return self.dynamic_templates
+    
+    def create_index(self, index_name: str, mappings: Dict[str, Any] = None, settings: Dict[str, Any] = None) -> bool:
+        """
+        Create an index
+        """
+        return self.index_manager.create_index(index_name, mappings, settings)
+    
+    def insert(self, index_name: str, document: Dict[str, Any], id: str = None, refresh: bool = False) -> bool:
+        """
+        Insert a single document
+        """
+        return self.document_manager.insert(index_name, document, id, refresh)
+    
+    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]], refresh: bool = False) -> Dict[str, Any]:
+        """
+        Bulk-insert documents
+        """
+        return self.document_manager.bulk_insert(index_name, documents, refresh)
+    
+    def update(self, index_name: str, id: str, update_body: Dict[str, Any], refresh: bool = False) -> bool:
+        """
+        Update a document
+        """
+        return self.document_manager.update(index_name, id, update_body, refresh)
+    
+    def delete(self, index_name: str, id: str, refresh: bool = False) -> bool:
+        """
+        Delete a document
+        """
+        return self.document_manager.delete(index_name, id, refresh)
+    
+    def delete_by_query(self, index_name: str, query: Dict[str, Any], refresh: bool = False) -> Dict[str, Any]:
+        """
+        Delete documents matching a query
+        """
+        return self.document_manager.delete_by_query(index_name, query, refresh)
+    
+    def get(self, index_name: str, id: str, fields: List[str] = None) -> Optional[Dict[str, Any]]:
+        """
+        Get a single document
+        """
+        return self.document_manager.get(index_name, id, fields)
+    
+    def search(self, index_name: str, query: Dict[str, Any], size: int = 10, from_: int = 0, 
+               fields: List[str] = None, highlight: Dict[str, Any] = None) -> Dict[str, Any]:
+        """
+        Search documents
+        """
+        return self.search_manager.search(index_name, query, size, from_, fields, highlight)
+    
+    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
+                     size: int = 10, from_: int = 0, fields: List[str] = None, 
+                     text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
+        """
+        Hybrid search: vector similarity + full text
+        """
+        return self.search_manager.hybrid_search(index_name, text_query, vector_field, vector, 
+                                               size, from_, fields, text_weight, vector_weight)
+    
+    def knn_search(self, index_name: str, vector_field: str, vector: List[float], 
+                  k: int = 10, filter_query: Dict[str, Any] = None) -> Dict[str, Any]:
+        """
+        Vector similarity search (k-NN)
+        """
+        return self.search_manager.knn_search(index_name, vector_field, vector, k, filter_query)
+    
+    def close(self):
+        """
+        Close the Elasticsearch connection
+        """
+        self._es_conn.close()
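
A sketch of the compatibility surface: old call sites keep the flat API while the work is delegated to the new managers (cluster address is a placeholder):

```python
from services.utils.es_conn import ESConnection

es = ESConnection(hosts=["http://localhost:9200"])
es.create_index("articles")
es.insert("articles", {"title_tks": "hello"}, id="a1", refresh=True)
print(es.get("articles", "a1"))
```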

+ 105 - 0
services/utils/http_client.py

@@ -0,0 +1,105 @@
+import os
+import requests
+from typing import Dict, Any, Optional
+
+class HTTPClient:
+    """HTTP client helper for issuing API requests"""
+    
+    def __init__(self, base_url: str, api_key: str = None):
+        """
+        Initialize the HTTP client
+        
+        Args:
+            base_url: API base URL
+            api_key: API key (Bearer token)
+        """
+        self.base_url = base_url.rstrip('/')
+        self.api_key = api_key
+        self.session = requests.Session()
+        
+        # Set the default headers
+        if self.api_key:
+            self.session.headers.update({
+                'Authorization': f'Bearer {self.api_key}'
+            })
+    
+    def post(self, endpoint: str, data: Optional[Dict] = None, 
+             json: Optional[Dict] = None, files: Optional[Dict] = None,
+             headers: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        发送POST请求
+        
+        Args:
+            endpoint: API端点路径(以/开头)
+            data: 表单数据
+            json: JSON数据
+            files: 文件数据
+            headers: 自定义请求头
+            
+        Returns:
+            Dict: 响应JSON数据
+        
+        Raises:
+            requests.exceptions.RequestException: 请求失败时抛出
+        """
+        url = f"{self.base_url}{endpoint}"
+        response = self.session.post(
+            url=url,
+            data=data,
+            json=json,
+            files=files,
+            headers=headers
+        )
+        response.raise_for_status()  # raise on HTTP errors
+        return response.json()
+    
+    def get(self, endpoint: str, params: Optional[Dict] = None,
+            headers: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Send a GET request.
+        
+        Args:
+            endpoint: API endpoint path (starting with /)
+            params: query parameters
+            headers: custom request headers
+            
+        Returns:
+            Dict: parsed JSON response
+        
+        Raises:
+            requests.exceptions.RequestException: raised when the request fails
+        """
+        url = f"{self.base_url}{endpoint}"
+        response = self.session.get(
+            url=url,
+            params=params,
+            headers=headers
+        )
+        response.raise_for_status()  # raise on HTTP errors
+        return response.json()
+    
+    def upload_file(self, endpoint: str, file_path: str, file_field_name: str = 'file',
+                   data: Optional[Dict] = None, headers: Optional[Dict] = None) -> Dict[str, Any]:
+        """
+        Upload a file.
+        
+        Args:
+            endpoint: API endpoint path (starting with /)
+            file_path: local file path
+            file_field_name: form field name for the file
+            data: extra form data
+            headers: custom request headers
+            
+        Returns:
+            Dict: parsed JSON response
+        
+        Raises:
+            requests.exceptions.RequestException: raised when the request fails
+        """
+        # Open the file and build the files dict
+        with open(file_path, 'rb') as f:
+            files = {
+                file_field_name: (file_path.split('/')[-1], f)
+            }
+            
+            # Send the POST request
+            return self.post(endpoint, data=data, files=files, headers=headers)
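
A short sketch of HTTPClient in use; the endpoint paths and payloads below are placeholders for whatever API the client is pointed at, not a documented contract:

```python
from services.utils.http_client import HTTPClient

client = HTTPClient(base_url="http://localhost:8000", api_key="YOUR_API_KEY")

# GET with query parameters, POST with a JSON body, and a multipart upload
datasets = client.get("/api/v1/datasets", params={"page": 1})
created = client.post("/api/v1/datasets", json={"name": "demo"})
uploaded = client.upload_file("/api/v1/documents", "./sample.pdf",
                              file_field_name="file", data={"dataset_id": "demo"})
```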

+ 94 - 0
services/utils/infinity/__init__.py

@@ -0,0 +1,94 @@
+"""
+Main class for the Infinity vector database.
+"""
+from typing import List, Dict, Any, Optional
+from services.utils.infinity.base import InfinityConnection
+from services.utils.infinity.index import InfinityIndexManager
+from services.utils.infinity.document import InfinityDocumentManager
+from services.utils.infinity.search import InfinitySearchManager
+
+
+class InfinityVectorDB:
+    """
+    Main entry point for the Infinity vector database.
+    Provides a unified interface combining index, document, and search operations.
+    """
+    
+    def __init__(self, host: str = None, port: int = None, user: str = None, password: str = None):
+        """
+        Initialize the Infinity vector database.
+        
+        Args:
+            host: Infinity host address
+            port: Infinity port
+            user: username
+            password: password
+        """
+        # Initialize the connection
+        self.connection = InfinityConnection(host, port, user, password)
+        
+        # Initialize the managers
+        self.index_manager = InfinityIndexManager(self.connection)
+        self.document_manager = InfinityDocumentManager(self.connection)
+        self.search_manager = InfinitySearchManager(self.connection)
+    
+    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
+        """Create an index."""
+        return self.index_manager.create_index(index_name, mappings)
+    
+    def delete_index(self, index_name: str) -> bool:
+        """Delete an index."""
+        return self.index_manager.delete_index(index_name)
+    
+    def index_exists(self, index_name: str) -> bool:
+        """Check whether an index exists."""
+        return self.index_manager.index_exists(index_name)
+    
+    def insert_document(self, index_name: str, document: Dict[str, Any], id: str = None) -> bool:
+        """Insert a single document."""
+        return self.document_manager.insert_document(index_name, document, id)
+    
+    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Bulk-insert documents."""
+        return self.document_manager.bulk_insert(index_name, documents)
+    
+    def update_document(self, index_name: str, document_id: str, document: Dict[str, Any]) -> bool:
+        """Update a single document."""
+        return self.document_manager.update_document(index_name, document_id, document)
+    
+    def delete_document(self, index_name: str, document_id: str) -> bool:
+        """Delete a single document."""
+        return self.document_manager.delete_document(index_name, document_id)
+    
+    def get_document(self, index_name: str, document_id: str) -> Optional[Dict[str, Any]]:
+        """Get a single document."""
+        return self.document_manager.get_document(index_name, document_id)
+    
+    def delete_by_query(self, index_name: str, query: Dict[str, Any]) -> Dict[str, Any]:
+        """Delete documents matching a query."""
+        return self.document_manager.delete_by_query(index_name, query)
+    
+    def search(self, index_name: str, query: Dict[str, Any], size: int = 10) -> Dict[str, Any]:
+        """Search documents."""
+        return self.search_manager.search(index_name, query, size)
+    
+    def vector_search(self, index_name: str, vector_field: str, vector: List[float], size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
+        """Vector search."""
+        return self.search_manager.vector_search(index_name, vector_field, vector, size, filter)
+    
+    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
+                     size: int = 10, text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
+        """Hybrid search."""
+        return self.search_manager.hybrid_search(index_name, text_query, vector_field, vector, size, text_weight, vector_weight)
+    
+    def match_search(self, index_name: str, field: str, value: str, size: int = 10) -> Dict[str, Any]:
+        """Simple match search."""
+        return self.search_manager.match_search(index_name, field, value, size)
+    
+    def match_all(self, index_name: str, size: int = 10) -> Dict[str, Any]:
+        """Match all documents."""
+        return self.search_manager.match_all(index_name, size)
+    
+    def close(self):
+        """Close the connection."""
+        self.connection.close()
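
A minimal end-to-end sketch of the facade (collection name and document fields are illustrative; the endpoints hit underneath are the ones this wrapper assumes, not an official Infinity client):

```python
from services.utils.infinity import InfinityVectorDB

db = InfinityVectorDB()  # falls back to the VectorDBConfig defaults

if not db.index_exists("pages"):
    db.create_index("pages")

db.insert_document("pages", {
    "file_name": "sample.pdf",
    "page_number": 1,
    "text": "hello world",
    "dense_vector_1024": [0.0] * 1024,  # stand-in for a real embedding
})

hits = db.vector_search("pages", "dense_vector_1024", [0.0] * 1024, size=3)
db.close()
```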

BIN
services/utils/infinity/__pycache__/__init__.cpython-312.pyc


BIN
services/utils/infinity/__pycache__/base.cpython-312.pyc


BIN
services/utils/infinity/__pycache__/document.cpython-312.pyc


BIN
services/utils/infinity/__pycache__/index.cpython-312.pyc


BIN
services/utils/infinity/__pycache__/search.cpython-312.pyc


+ 63 - 0
services/utils/infinity/base.py

@@ -0,0 +1,63 @@
+"""
+Base connection class for the Infinity vector database.
+"""
+import base64
+import http.client
+import json
+from typing import Dict, Any
+from conf.config import VectorDBConfig
+
+
+class InfinityConnection:
+    """
+    Connection manager for the Infinity vector database.
+    Owns the HTTP connection to the Infinity server.
+    """
+    
+    def __init__(self, host: str = None, port: int = None, user: str = None, password: str = None):
+        """
+        Initialize the Infinity connection.
+        
+        Args:
+            host: Infinity host address
+            port: Infinity port
+            user: username
+            password: password
+        """
+        # Fall back to configured defaults when arguments are omitted
+        self.host = host or VectorDBConfig.get_infinity_host()
+        self.port = port or VectorDBConfig.get_infinity_port()
+        self.user = user or VectorDBConfig.get_infinity_user()
+        self.password = password or VectorDBConfig.get_infinity_password()
+        self.base_url = f"http://{self.host}:{self.port}"
+        
+        # Initialize the HTTP client
+        self.http_client = http.client.HTTPConnection(self.host, self.port)
+        self.headers = {
+            'Content-Type': 'application/json',
+            'Authorization': f'Basic {self._get_auth_token()}'
+        }
+    
+    def _get_auth_token(self) -> str:
+        """Build the Basic Auth token."""
+        auth_str = f"{self.user}:{self.password}"
+        return base64.b64encode(auth_str.encode()).decode()
+    
+    def _make_request(self, method: str, path: str, data: dict = None) -> dict:
+        """Send an HTTP request and return the decoded JSON response."""
+        try:
+            body = json.dumps(data) if data else None
+            self.http_client.request(method, path, body, self.headers)
+            response = self.http_client.getresponse()
+            response_data = json.loads(response.read().decode())
+            return response_data
+        except Exception as e:
+            print(f"HTTP request failed: {e}")
+            return {"error": str(e)}
+    
+    def close(self):
+        """Close the connection."""
+        try:
+            self.http_client.close()
+        except Exception as e:
+            print(f"Failed to close the Infinity connection: {e}")
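
In effect the connection issues plain JSON-over-HTTP calls carrying a Basic Auth header. A sketch (host, port, and credentials are placeholders for whatever conf/config.py supplies):

```python
from services.utils.infinity.base import InfinityConnection

conn = InfinityConnection(host="localhost", port=23820, user="admin", password="admin")

# List collections via the REST layout this wrapper assumes
resp = conn._make_request("GET", "/api/collections")
print(resp)

conn.close()
```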

+ 168 - 0
services/utils/infinity/document.py

@@ -0,0 +1,168 @@
+"""
+Document manager for the Infinity vector database.
+"""
+from typing import List, Dict, Any, Optional
+from services.utils.infinity.base import InfinityConnection
+
+
+class InfinityDocumentManager:
+    """
+    Document manager for the Infinity vector database.
+    Handles create, read, update, and delete operations on documents.
+    """
+    
+    def __init__(self, infinity_connection: Optional[InfinityConnection] = None):
+        """
+        Initialize the document manager.
+        
+        Args:
+            infinity_connection: optional InfinityConnection instance
+        """
+        self.infinity_conn = infinity_connection or InfinityConnection()
+    
+    def insert_document(self, index_name: str, document: Dict[str, Any], id: str = None) -> bool:
+        """
+        Insert a single document.
+        
+        Args:
+            index_name: index name
+            document: document body
+            id: optional document ID (note: currently not forwarded to the server)
+        
+        Returns:
+            bool: whether the insert succeeded
+        """
+        try:
+            path = f"/api/collections/{index_name}/documents"
+            response = self.infinity_conn._make_request("POST", path, {"documents": [document]})
+            return "error" not in response
+        except Exception as e:
+            print(f"Failed to insert Infinity document: {e}")
+            return False
+    
+    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """
+        Bulk-insert documents.
+        
+        Args:
+            index_name: index name
+            documents: list of documents
+        
+        Returns:
+            Dict: success and failure counts plus any errors
+        """
+        try:
+            path = f"/api/collections/{index_name}/documents"
+            response = self.infinity_conn._make_request("POST", path, {"documents": documents})
+            
+            if "error" not in response:
+                return {
+                    "success": len(documents),
+                    "failed": 0,
+                    "errors": []
+                }
+            else:
+                return {
+                    "success": 0,
+                    "failed": len(documents),
+                    "errors": [response["error"]] * len(documents)
+                }
+        except Exception as e:
+            print(f"Failed to bulk-insert Infinity documents: {e}")
+            return {
+                "success": 0,
+                "failed": len(documents),
+                "errors": [str(e)] * len(documents)
+            }
+    
+    def update_document(self, index_name: str, document_id: str, document: Dict[str, Any]) -> bool:
+        """
+        Update a single document.
+        
+        Args:
+            index_name: index name
+            document_id: document ID
+            document: fields to update
+        
+        Returns:
+            bool: whether the update succeeded
+        """
+        try:
+            path = f"/api/collections/{index_name}/documents/{document_id}"
+            response = self.infinity_conn._make_request("PUT", path, document)
+            return "error" not in response
+        except Exception as e:
+            print(f"Failed to update Infinity document: {e}")
+            return False
+    
+    def delete_document(self, index_name: str, document_id: str) -> bool:
+        """
+        Delete a single document.
+        
+        Args:
+            index_name: index name
+            document_id: document ID
+        
+        Returns:
+            bool: whether the delete succeeded
+        """
+        try:
+            path = f"/api/collections/{index_name}/documents/{document_id}"
+            response = self.infinity_conn._make_request("DELETE", path)
+            return "error" not in response
+        except Exception as e:
+            print(f"Failed to delete Infinity document: {e}")
+            return False
+    
+    def get_document(self, index_name: str, document_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get a single document.
+        
+        Args:
+            index_name: index name
+            document_id: document ID
+        
+        Returns:
+            Optional[Dict[str, Any]]: the document, or None if it does not exist
+        """
+        try:
+            path = f"/api/collections/{index_name}/documents/{document_id}"
+            response = self.infinity_conn._make_request("GET", path)
+            return response if "error" not in response else None
+        except Exception as e:
+            print(f"Failed to get Infinity document: {e}")
+            return None
+    
+    def delete_by_query(self, index_name: str, query: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Delete documents matching a query.
+        
+        Args:
+            index_name: index name
+            query: query conditions
+        
+        Returns:
+            Dict: deletion result
+        """
+        try:
+            path = f"/api/collections/{index_name}/delete_by_query"
+            response = self.infinity_conn._make_request("POST", path, {"query": query})
+            
+            if "error" not in response:
+                return {
+                    "deleted": response.get("total", 0),
+                    "failed": 0
+                }
+            else:
+                return {
+                    "deleted": 0,
+                    "failed": 1,
+                    "error": response["error"]
+                }
+        except Exception as e:
+            print(f"Failed to delete Infinity documents by query: {e}")
+            return {
+                "deleted": 0,
+                "failed": 1,
+                "error": str(e)
+            }
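
Note that error reporting here is per batch rather than per document: a single failed request marks every document in the batch as failed, with the same error repeated. A usage sketch with illustrative data:

```python
from services.utils.infinity.document import InfinityDocumentManager

manager = InfinityDocumentManager()

result = manager.bulk_insert("pages", [{"text": "a"}, {"text": "b"}])
# On success: {"success": 2, "failed": 0, "errors": []}
# On an API error: {"success": 0, "failed": 2, "errors": [<same error>, <same error>]}
print(result["success"], result["failed"])
```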

+ 140 - 0
services/utils/infinity/index.py

@@ -0,0 +1,140 @@
+"""
+Index manager for the Infinity vector database.
+"""
+from typing import Dict, Any, Optional
+from services.utils.infinity.base import InfinityConnection
+
+
+class InfinityIndexManager:
+    """
+    Index manager for the Infinity vector database.
+    Handles creating, deleting, and checking indexes (collections).
+    """
+    
+    def __init__(self, infinity_connection: Optional[InfinityConnection] = None):
+        """
+        Initialize the index manager.
+        
+        Args:
+            infinity_connection: optional InfinityConnection instance
+        """
+        self.infinity_conn = infinity_connection or InfinityConnection()
+    
+    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
+        """
+        Create an index (a collection in Infinity).
+        
+        Args:
+            index_name: index name
+            mappings: custom mappings, merged over the defaults
+        
+        Returns:
+            bool: whether creation succeeded
+        """
+        try:
+            # API path for creating the collection
+            path = f"/api/collections/{index_name}"
+            
+            # Default mapping configuration
+            default_mappings = {
+                "fields": [
+                    {"name": "file_name", "type": "string"},
+                    {"name": "file_page_count", "type": "integer"},
+                    {"name": "page_number", "type": "integer"},
+                    {"name": "text", "type": "string"},
+                    {"name": "image_path", "type": "string"},
+                    {"name": "sparse_vector", "type": "array<array<{string, float}>>"},
+                    {"name": "dense_vector_1024", "type": "array<float>"},
+                    {"name": "dataset_id", "type": "string"},
+                    {"name": "document_id", "type": "string"}
+                ],
+                "indexes": [
+                    {
+                        "name": f"{index_name}_vector_index",
+                        "field": "dense_vector_1024",
+                        "type": "vector",
+                        "params": {
+                            "dimension": 1024,
+                            "metric": "cosine"
+                        }
+                    }
+                ]
+            }
+            
+            # Merge user mappings over the defaults (a shallow merge:
+            # top-level keys in mappings replace the default values)
+            final_mappings = {**default_mappings, **(mappings or {})}
+            
+            response = self.infinity_conn._make_request("PUT", path, final_mappings)
+            return "error" not in response
+        except Exception as e:
+            print(f"Failed to create Infinity index: {e}")
+            return False
+    
+    def delete_index(self, index_name: str) -> bool:
+        """
+        Delete an index (collection).
+        
+        Args:
+            index_name: index name
+        
+        Returns:
+            bool: whether deletion succeeded
+        """
+        try:
+            path = f"/api/collections/{index_name}"
+            response = self.infinity_conn._make_request("DELETE", path)
+            return "error" not in response
+        except Exception as e:
+            print(f"Failed to delete Infinity index: {e}")
+            return False
+    
+    def index_exists(self, index_name: str) -> bool:
+        """
+        Check whether an index exists.
+        
+        Args:
+            index_name: index name
+        
+        Returns:
+            bool: whether the index exists
+        """
+        try:
+            path = f"/api/collections/{index_name}"
+            response = self.infinity_conn._make_request("GET", path)
+            return "error" not in response
+        except Exception as e:
+            print(f"Failed to check whether the Infinity index exists: {e}")
+            return False
+    
+    def get_index_info(self, index_name: str) -> Dict[str, Any]:
+        """
+        Get index information.
+        
+        Args:
+            index_name: index name
+        
+        Returns:
+            Dict[str, Any]: index information
+        """
+        try:
+            path = f"/api/collections/{index_name}"
+            response = self.infinity_conn._make_request("GET", path)
+            return response
+        except Exception as e:
+            print(f"Failed to get Infinity index info: {e}")
+            return {}
+    
+    def list_indexes(self) -> list:
+        """
+        List all indexes.
+        
+        Returns:
+            list: the index list
+        """
+        try:
+            path = "/api/collections"
+            response = self.infinity_conn._make_request("GET", path)
+            return response.get("collections", [])
+        except Exception as e:
+            print(f"Failed to list Infinity indexes: {e}")
+            return []
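
Because create_index merges with {**default_mappings, **(mappings or {})}, the merge is shallow: supplying a top-level key such as "indexes" replaces the default vector index wholesale rather than extending it. A sketch (collection and metric are illustrative):

```python
from services.utils.infinity.index import InfinityIndexManager

manager = InfinityIndexManager()

# Swap the default cosine index for an inner-product one; "fields" is not
# supplied, so the default field list above is kept as-is.
manager.create_index("pages_ip", mappings={
    "indexes": [{
        "name": "pages_ip_vector_index",
        "field": "dense_vector_1024",
        "type": "vector",
        "params": {"dimension": 1024, "metric": "ip"},
    }]
})
```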

+ 187 - 0
services/utils/infinity/search.py

@@ -0,0 +1,187 @@
+"""
+Search manager for the Infinity vector database.
+"""
+from typing import List, Dict, Any, Optional
+from services.utils.infinity.base import InfinityConnection
+
+
+class InfinitySearchManager:
+    """
+    Search manager for the Infinity vector database.
+    Handles the various search operations.
+    """
+    
+    def __init__(self, infinity_connection: Optional[InfinityConnection] = None):
+        """
+        Initialize the search manager.
+        
+        Args:
+            infinity_connection: optional InfinityConnection instance
+        """
+        self.infinity_conn = infinity_connection or InfinityConnection()
+    
+    def search(self, index_name: str, query: Dict[str, Any], size: int = 10) -> Dict[str, Any]:
+        """
+        Full-text search.
+        
+        Args:
+            index_name: index name
+            query: query conditions
+            size: number of results to return
+        
+        Returns:
+            Dict: search results
+        """
+        try:
+            path = f"/api/collections/{index_name}/search"
+            response = self.infinity_conn._make_request("POST", path, {
+                "query": query,
+                "limit": size
+            })
+            
+            if "error" not in response:
+                return {
+                    "hits": {
+                        "total": response.get("total", 0),
+                        "hits": [{
+                            "_source": doc
+                        } for doc in response.get("documents", [])]
+                    }
+                }
+            return {"hits": {"total": 0, "hits": []}}
+        except Exception as e:
+            print(f"Infinity search failed: {e}")
+            return {"hits": {"total": 0, "hits": []}}
+    
+    def vector_search(self, index_name: str, vector_field: str, vector: List[float], 
+                     size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
+        """
+        Vector search.
+        
+        Args:
+            index_name: index name
+            vector_field: vector field name
+            vector: query vector
+            size: number of results to return
+            filter: optional filter conditions
+        
+        Returns:
+            Dict: search results
+        """
+        try:
+            path = f"/api/collections/{index_name}/search"
+            
+            search_query = {
+                "vector": {
+                    "field": vector_field,
+                    "query": vector,
+                    "limit": size
+                }
+            }
+            
+            if filter:
+                search_query["filter"] = filter
+            
+            response = self.infinity_conn._make_request("POST", path, search_query)
+            
+            if "error" not in response:
+                return {
+                    "hits": {
+                        "total": response.get("total", 0),
+                        "hits": [{
+                            "_source": doc
+                        } for doc in response.get("documents", [])]
+                    }
+                }
+            return {"hits": {"total": 0, "hits": []}}
+        except Exception as e:
+            print(f"Infinity vector search failed: {e}")
+            return {"hits": {"total": 0, "hits": []}}
+    
+    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
+                     size: int = 10, text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
+        """
+        Hybrid retrieval: full-text search combined with vector search.
+        
+        Args:
+            index_name: index name
+            text_query: text query
+            vector_field: vector field name
+            vector: query vector
+            size: number of results to return
+            text_weight: weight of the text score
+            vector_weight: weight of the vector score
+        
+        Returns:
+            Dict: search results
+        """
+        try:
+            path = f"/api/collections/{index_name}/search"
+            
+            search_query = {
+                "hybrid": {
+                    "text": {
+                        "query": text_query,
+                        "fields": ["text"],
+                        "weight": text_weight
+                    },
+                    "vector": {
+                        "field": vector_field,
+                        "query": vector,
+                        "weight": vector_weight
+                    },
+                    "limit": size
+                }
+            }
+            
+            response = self.infinity_conn._make_request("POST", path, search_query)
+            
+            if "error" not in response:
+                return {
+                    "hits": {
+                        "total": response.get("total", 0),
+                        "hits": [{
+                            "_source": doc
+                        } for doc in response.get("documents", [])]
+                    }
+                }
+            return {"hits": {"total": 0, "hits": []}}
+        except Exception as e:
+            print(f"Infinity hybrid search failed: {e}")
+            return {"hits": {"total": 0, "hits": []}}
+    
+    def match_search(self, index_name: str, field: str, value: str, size: int = 10) -> Dict[str, Any]:
+        """
+        Simple match search.
+        
+        Args:
+            index_name: index name
+            field: field name
+            value: value to match
+            size: number of results to return
+        
+        Returns:
+            Dict: search results
+        """
+        query = {
+            "match": {
+                field: value
+            }
+        }
+        return self.search(index_name, query, size=size)
+    
+    def match_all(self, index_name: str, size: int = 10) -> Dict[str, Any]:
+        """
+        Match all documents.
+        
+        Args:
+            index_name: index name
+            size: number of results to return
+        
+        Returns:
+            Dict: search results
+        """
+        query = {
+            "match_all": {}
+        }
+        return self.search(index_name, query, size=size)
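
Every search method normalizes the response into an Elasticsearch-style envelope (hits.total plus hits.hits entries with _source), so callers can treat both backends alike. A sketch against an illustrative collection:

```python
from services.utils.infinity.search import InfinitySearchManager

manager = InfinitySearchManager()

query_vector = [0.0] * 1024  # stand-in for a real embedding
res = manager.hybrid_search("pages", text_query="contract terms",
                            vector_field="dense_vector_1024", vector=query_vector,
                            size=5, text_weight=0.3, vector_weight=0.7)

for hit in res["hits"]["hits"]:
    print(hit["_source"].get("text"))
```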

+ 168 - 0
services/utils/vector_db.py

@@ -0,0 +1,168 @@
+"""
+Vector database factory.
+Supports switching between the Elasticsearch and Infinity vector databases.
+"""
+from typing import Any, List, Dict, Optional
+from conf.config import VectorDBConfig
+from services.utils.es import ESConnection as ElasticsearchConnection
+
+
+class VectorDBFactory:
+    """
+    Vector database factory.
+    Creates the configured vector database backend.
+    """
+    
+    @staticmethod
+    def get_vector_db():
+        """
+        Get a vector database instance.
+        
+        Returns:
+            VectorDBBase: a vector database instance
+        """
+        vector_db_type = VectorDBConfig.get_vector_db_type().lower()
+        
+        if vector_db_type == "es":
+            return ElasticsearchVectorDB()
+        elif vector_db_type == "infinity":
+            return InfinityVectorDB()
+        else:
+            raise ValueError(f"Unsupported vector database type: {vector_db_type}")
+
+
+class VectorDBBase:
+    """
+    Vector database base class.
+    Defines the interface every vector database backend must implement.
+    """
+    
+    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
+        """Create an index."""
+        raise NotImplementedError()
+    
+    def insert_document(self, index_name: str, document: Dict[str, Any], id: str = None) -> bool:
+        """Insert a single document."""
+        raise NotImplementedError()
+    
+    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Bulk-insert documents."""
+        raise NotImplementedError()
+    
+    def search(self, index_name: str, query: Dict[str, Any], size: int = 10) -> Dict[str, Any]:
+        """Search documents."""
+        raise NotImplementedError()
+    
+    def vector_search(self, index_name: str, vector_field: str, vector: List[float], size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
+        """Vector search."""
+        raise NotImplementedError()
+    
+    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
+                     size: int = 10, text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
+        """Hybrid search."""
+        raise NotImplementedError()
+    
+    def close(self):
+        """Close the connection."""
+        raise NotImplementedError()
+
+
+class ElasticsearchVectorDB(VectorDBBase):
+    """
+    Elasticsearch implementation of the vector database interface.
+    """
+    
+    def __init__(self):
+        """Initialize the Elasticsearch vector database."""
+        self.es_conn = ElasticsearchConnection()
+        
+    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
+        """Create an index."""
+        from services.utils.es.index import IndexManager
+        index_manager = IndexManager(self.es_conn)
+        return index_manager.create_index(index_name, mappings)
+    
+    def insert_document(self, index_name: str, document: Dict[str, Any], id: str = None) -> bool:
+        """Insert a single document."""
+        from services.utils.es.document import DocumentManager
+        doc_manager = DocumentManager(self.es_conn)
+        return doc_manager.insert(index_name, document, id)
+    
+    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Bulk-insert documents."""
+        from services.utils.es.document import DocumentManager
+        doc_manager = DocumentManager(self.es_conn)
+        return doc_manager.bulk_insert(index_name, documents)
+    
+    def search(self, index_name: str, query: Dict[str, Any], size: int = 10) -> Dict[str, Any]:
+        """Search documents."""
+        from services.utils.es.search import SearchManager
+        search_manager = SearchManager(self.es_conn)
+        return search_manager.search(index_name, query, size=size)
+    
+    def vector_search(self, index_name: str, vector_field: str, vector: List[float], size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
+        """Vector search."""
+        from services.utils.es.search import SearchManager
+        search_manager = SearchManager(self.es_conn)
+        return search_manager.knn_search(index_name, vector_field, vector, size, filter)
+    
+    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
+                     size: int = 10, text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
+        """Hybrid search."""
+        from services.utils.es.search import SearchManager
+        search_manager = SearchManager(self.es_conn)
+        return search_manager.hybrid_search(index_name, text_query, vector_field, vector, size, text_weight=text_weight, vector_weight=vector_weight)
+    
+    def close(self):
+        """Close the connection."""
+        self.es_conn.close()
+
+
+class InfinityVectorDB(VectorDBBase):
+    """
+    Infinity implementation of the vector database interface.
+    Wraps the Infinity client, including PDF metadata ingestion.
+    """
+    
+    def __init__(self):
+        """Initialize the Infinity vector database."""
+        from services.utils.infinity import InfinityVectorDB as _InfinityVectorDB
+        
+        # Fetch the Infinity configuration (VectorDBConfig is imported at module level)
+        host = VectorDBConfig.get_infinity_host()
+        port = VectorDBConfig.get_infinity_port()
+        user = VectorDBConfig.get_infinity_user()
+        password = VectorDBConfig.get_infinity_password()
+        
+        # Initialize the underlying InfinityVectorDB instance
+        self._infinity_db = _InfinityVectorDB(host=host, port=port, user=user, password=password)
+    
+    def create_index(self, index_name: str, mappings: Dict[str, Any] = None) -> bool:
+        """Create an index."""
+        return self._infinity_db.create_index(index_name, mappings)
+    
+    def insert_document(self, index_name: str, document: Dict[str, Any], id: str = None) -> bool:
+        """Insert a single document."""
+        return self._infinity_db.insert_document(index_name, document, id)
+    
+    def bulk_insert(self, index_name: str, documents: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Bulk-insert documents."""
+        return self._infinity_db.bulk_insert(index_name, documents)
+    
+    def search(self, index_name: str, query: Dict[str, Any], size: int = 10) -> Dict[str, Any]:
+        """Search documents."""
+        return self._infinity_db.search(index_name, query, size)
+    
+    def vector_search(self, index_name: str, vector_field: str, vector: List[float], size: int = 10, filter: Dict[str, Any] = None) -> Dict[str, Any]:
+        """Vector search."""
+        return self._infinity_db.vector_search(index_name, vector_field, vector, size, filter)
+    
+    def hybrid_search(self, index_name: str, text_query: str, vector_field: str, vector: List[float], 
+                     size: int = 10, text_weight: float = 0.5, vector_weight: float = 0.5) -> Dict[str, Any]:
+        """Hybrid search."""
+        return self._infinity_db.hybrid_search(index_name, text_query, vector_field, vector, size, text_weight, vector_weight)
+    
+    def close(self):
+        """Close the connection."""
+        self._infinity_db.close()
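
Backend selection happens once, inside VectorDBConfig.get_vector_db_type(); callers only ever see the common interface. The type presumably comes from the .env file (the exact variable name lives in conf/config.py). A sketch with an illustrative index and field:

```python
from services.utils.vector_db import VectorDBFactory

db = VectorDBFactory.get_vector_db()  # ElasticsearchVectorDB or InfinityVectorDB

db.create_index("pages")
# The vector field name and dimension must match whatever mapping the chosen
# backend created (e.g. dense_vector_1024 for the Infinity defaults above)
hits = db.hybrid_search("pages", "test", "dense_vector_1024",
                        [0.0] * 1024, size=5)

db.close()
```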

+ 121 - 0
test_es_conn.py

@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from services.utils.es_conn import ESConnection
+
+def test_es_connection():
+    """
+    Test the Elasticsearch connection and basic functionality.
+    """
+    try:
+        # Initialize the connection
+        print("Initializing the Elasticsearch connection...")
+        es = ESConnection(hosts=["http://localhost:9200"])
+        print("Connected!")
+        
+        # Test index creation
+        index_name = "test_ragflow_index"
+        print(f"\nCreating index: {index_name}")
+        success = es.create_index(index_name)
+        if success:
+            print(f"Index {index_name} created!")
+        else:
+            print(f"Failed to create index {index_name}!")
+            return False
+        
+        # Test document insertion (sample strings kept in Chinese so the "测试" queries below match)
+        test_doc = {
+            "title": "测试文档",
+            "content": "这是一个用于测试 Elasticsearch 连接的文档",
+            "content_tks": "这 是 一个 用于 测试 Elasticsearch 连接 的 文档",
+            "vector_768_vec": [0.1] * 768,
+            "created_at": "2024-01-01 00:00:00",
+            "count_int": 10,
+            "importance_flt": 0.8,
+            "tags_kwd": ["测试", "elasticsearch"],
+            "kb_id": "test_kb_123"
+        }
+        
+        print("\nInserting a test document...")
+        insert_success = es.insert(index_name, test_doc)
+        if insert_success:
+            print("Document inserted!")
+        else:
+            print("Document insert failed!")
+            return False
+        
+        # Test bulk insert
+        test_docs = []
+        for i in range(3):
+            doc = {
+                "title": f"批量测试文档 {i}",
+                "content": f"这是第 {i} 个批量测试文档",
+                "content_tks": f"这是 第 {i} 个 批量 测试 文档",
+                "vector_768_vec": [0.1] * 768,
+                "created_at": "2024-01-01 00:00:00",
+                "count_int": i,
+                "importance_flt": 0.5 + i * 0.1,
+                "tags_kwd": ["批量", "测试"],
+                "kb_id": "test_kb_123"
+            }
+            test_docs.append(doc)
+        
+        print("\nBulk-inserting test documents...")
+        bulk_result = es.bulk_insert(index_name, test_docs)
+        print(f"Bulk insert result: {bulk_result}")
+        
+        # Test full-text search
+        print("\nTesting full-text search...")
+        text_query = {
+            "match": {
+                "content": "测试"
+            }
+        }
+        text_result = es.search(index_name, text_query, size=5)
+        print(f"Full-text search: {text_result['hits']['total']} hits")
+        
+        # Test vector search
+        print("\nTesting vector search...")
+        vector = [0.1] * 768
+        vector_result = es.knn_search(
+            index_name=index_name,
+            vector_field="vector_768_vec",
+            vector=vector,
+            k=3
+        )
+        print(f"Vector search: {vector_result['hits']['total']} hits")
+        
+        # Test hybrid search
+        print("\nTesting hybrid search...")
+        hybrid_result = es.hybrid_search(
+            index_name=index_name,
+            text_query="测试",
+            vector_field="vector_768_vec",
+            vector=vector,
+            size=5
+        )
+        print(f"Hybrid search: {hybrid_result['hits']['total']} hits")
+        
+        # Print the matching documents
+        print("\nHybrid search hits:")
+        for hit in hybrid_result['hits']['hits']:
+            doc = hit['_source']
+            print(f"  - title: {doc['title']}, score: {hit['_score']:.4f}")
+        
+        # Clean up: delete the test index (note: this removes the whole index, not individual documents)
+        print(f"\nDeleting index: {index_name}")
+        es.es.indices.delete(index=index_name, ignore=[400, 404])
+        print(f"Index {index_name} deleted!")
+        
+        # Close the connection
+        es.close()
+        print("\nAll tests finished!")
+        return True
+        
+    except Exception as e:
+        print(f"Test failed: {e}")
+        return False
+
+if __name__ == "__main__":
+    test_es_connection()

+ 180 - 0
test_infinity_encapsulation.py

@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Tests for the Infinity vector database wrapper.
+"""
+
+from services.utils.infinity import InfinityVectorDB
+
+def test_infinity_connection():
+    """
+    Test the Infinity connection.
+    """
+    print("=== Testing the Infinity connection ===")
+    
+    try:
+        # Initialize InfinityVectorDB
+        infinity_db = InfinityVectorDB()
+        print("✅ InfinityVectorDB initialized")
+        
+        # Test index creation
+        index_name = "test_collection"
+        print(f"\nTesting index creation: {index_name}")
+        result = infinity_db.create_index(index_name)
+        if result:
+            print(f"✅ Index {index_name} created")
+        else:
+            print(f"❌ Failed to create index {index_name}")
+            return False
+        
+        # Test the index-existence check
+        print(f"\nChecking that the index exists: {index_name}")
+        exists = infinity_db.index_exists(index_name)
+        if exists:
+            print(f"✅ Index {index_name} exists")
+        else:
+            print(f"❌ Index {index_name} does not exist")
+            return False
+        
+        # Test single-document insert (sample text kept in Chinese so the hybrid query below matches)
+        print("\nTesting document insert")
+        document = {
+            "file_name": "test.pdf",
+            "file_page_count": 10,
+            "page_number": 1,
+            "text": "这是一个测试文档",
+            "image_path": "test.png",
+            "sparse_vector": [],
+            "dense_vector_1024": [0.1] * 1024,
+            "dataset_id": "test_dataset",
+            "document_id": "test_doc_id"
+        }
+        
+        insert_result = infinity_db.insert_document(index_name, document)
+        if insert_result:
+            print("✅ Document inserted")
+        else:
+            print("❌ Document insert failed")
+            return False
+        
+        # Test bulk insert
+        print("\nTesting bulk document insert")
+        documents = []
+        for i in range(2, 5):
+            doc = {
+                "file_name": "test.pdf",
+                "file_page_count": 10,
+                "page_number": i,
+                "text": f"这是第 {i} 页",
+                "image_path": f"test_{i}.png",
+                "sparse_vector": [],
+                "dense_vector_1024": [0.1] * 1024,
+                "dataset_id": "test_dataset",
+                "document_id": "test_doc_id"
+            }
+            documents.append(doc)
+        
+        bulk_result = infinity_db.bulk_insert(index_name, documents)
+        if bulk_result["success"] == len(documents):
+            print(f"✅ Bulk insert succeeded: {bulk_result['success']} documents inserted")
+        else:
+            print(f"❌ Bulk insert failed: {bulk_result['success']} succeeded, {bulk_result['failed']} failed")
+            return False
+        
+        # Test vector search
+        print("\nTesting vector search")
+        vector = [0.1] * 1024
+        search_result = infinity_db.vector_search(index_name, "dense_vector_1024", vector, size=5)
+        if search_result["hits"]["total"] > 0:
+            print(f"✅ Vector search found {search_result['hits']['total']} results")
+        else:
+            print("❌ Vector search returned no results")
+        
+        # Test hybrid search
+        print("\nTesting hybrid search")
+        hybrid_result = infinity_db.hybrid_search(
+            index_name,
+            text_query="测试",
+            vector_field="dense_vector_1024",
+            vector=vector,
+            size=5
+        )
+        if hybrid_result["hits"]["total"] > 0:
+            print(f"✅ Hybrid search found {hybrid_result['hits']['total']} results")
+        else:
+            print("❌ Hybrid search returned no results")
+        
+        # Test index deletion
+        print(f"\nTesting index deletion: {index_name}")
+        delete_result = infinity_db.delete_index(index_name)
+        if delete_result:
+            print(f"✅ Index {index_name} deleted")
+        else:
+            print(f"❌ Failed to delete index {index_name}")
+            return False
+        
+        # Close the connection
+        infinity_db.close()
+        print("\n✅ Connection closed")
+        
+        return True
+        
+    except Exception as e:
+        print(f"\n❌ Test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def test_vector_db_factory():
+    """
+    Test VectorDBFactory.
+    """
+    print("\n=== Testing VectorDBFactory ===")
+    
+    try:
+        from services.utils.vector_db import VectorDBFactory
+        
+        # Fetch a vector database instance
+        vector_db = VectorDBFactory.get_vector_db()
+        print(f"✅ Got vector database instance: {type(vector_db).__name__}")
+        
+        # Test index creation through the factory
+        index_name = "test_factory_collection"
+        result = vector_db.create_index(index_name)
+        if result:
+            print(f"✅ Created index {index_name} through the factory")
+        else:
+            print(f"❌ Failed to create index {index_name} through the factory")
+        
+        vector_db.close()
+        print("✅ Closed the factory-provided connection")
+        
+        return True
+        
+    except Exception as e:
+        print(f"\n❌ Factory test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+if __name__ == "__main__":
+    print("Starting the Infinity vector database wrapper tests...")
+    
+    # Test the Infinity connection
+    connection_result = test_infinity_connection()
+    
+    # Test VectorDBFactory
+    factory_result = test_vector_db_factory()
+    
+    # Summary
+    print("\n=== Test summary ===")
+    if connection_result and factory_result:
+        print("✅ All tests passed!")
+        exit(0)
+    else:
+        print("❌ Some tests failed!")
+        exit(1)

+ 104 - 0
test_vector_db.py

@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Vector database test script.
+Exercises the factory's basic functionality and backend switching.
+"""
+
+from services.utils.vector_db import VectorDBFactory
+from conf.config import VectorDBConfig
+
+def test_vector_db_factory():
+    """
+    Test the vector database factory.
+    """
+    print("=== Testing the vector database factory ===")
+    
+    # Read the configured vector database type
+    vector_db_type = VectorDBConfig.get_vector_db_type()
+    print(f"Configured vector database type: {vector_db_type}")
+    
+    try:
+        # Fetch a vector database instance
+        vector_db = VectorDBFactory.get_vector_db()
+        print(f"Got vector database instance: {type(vector_db).__name__}")
+        
+        # Test index creation
+        index_name = "test_index"
+        print(f"\nTesting index creation: {index_name}")
+        result = vector_db.create_index(index_name)
+        print(f"Index creation result: {result}")
+        
+        # Test the vector search interface
+        print("\nTesting the vector search interface")
+        vector = [0.1] * 768
+        result = vector_db.vector_search(index_name, "vector_768_vec", vector, size=5)
+        print(f"Vector search result: {result}")
+        
+        # Test the hybrid search interface
+        print("\nTesting the hybrid search interface")
+        result = vector_db.hybrid_search(
+            index_name, 
+            text_query="测试", 
+            vector_field="vector_768_vec", 
+            vector=vector, 
+            size=5
+        )
+        print(f"Hybrid search result: {result}")
+        
+        # Close the connection
+        vector_db.close()
+        print("\nClosed the vector database connection")
+        
+        return True
+        
+    except Exception as e:
+        print(f"Test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+def test_vector_db_switch():
+    """
+    Test switching between vector database backends.
+    """
+    print("\n=== Testing vector database switching ===")
+    
+    # Exercise both vector database types
+    test_types = ["es", "infinity"]
+    
+    for db_type in test_types:
+        print(f"\nTesting vector database type: {db_type}")
+        try:
+            # The configured type normally comes from environment variables, and
+            # the static getters on VectorDBConfig cannot be patched here, so we
+            # instantiate the concrete classes directly to verify that the
+            # factory's backends construct correctly.
+            if db_type == "es":
+                from services.utils.vector_db import ElasticsearchVectorDB
+                vector_db = ElasticsearchVectorDB()
+            else:
+                from services.utils.vector_db import InfinityVectorDB
+                vector_db = InfinityVectorDB()
+            
+            print(f"Created {db_type} vector database instance: {type(vector_db).__name__}")
+            vector_db.close()
+            print(f"Closed the {db_type} vector database connection")
+            
+        except Exception as e:
+            print(f"Test for {db_type} failed: {e}")
+            import traceback
+            traceback.print_exc()
+
+if __name__ == "__main__":
+    # Test the vector database factory
+    test_vector_db_factory()
+    
+    # Test backend switching
+    test_vector_db_switch()
+    
+    print("\n=== Tests complete ===")

+ 71 - 0
test_workflow.py

@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Test script for the PDF parsing workflow.
+Exercises the full workflow, including vectorization and ingestion.
+"""
+
+import os
+import sys
+# Add the project root to the Python path
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from services.pdf_parser.workflow import PDFParsingWorkflow
+
+
+def test_pdf_parsing_workflow():
+    """
+    Test the PDF parsing workflow, including vectorization and ingestion.
+    """
+    print("=== Testing the PDF parsing workflow ===")
+    
+    # Test parameters
+    pdf_path = "test/sample.pdf"  # replace with a real test PDF path
+    dataset_id = "test_dataset"
+    ragflow_api_url = "http://localhost:8000/"  # replace with the actual RAGFlow API URL
+    rag_flow_api_key = "test_api_key"  # replace with a real API key
+    
+    try:
+        # Check that the test PDF exists
+        if not os.path.exists(pdf_path):
+            print(f"Test PDF not found: {pdf_path}")
+            print("Place a test PDF at the expected location")
+            return False
+        
+        # Initialize the workflow
+        workflow = PDFParsingWorkflow()
+        print("Workflow initialized")
+        
+        # Run the workflow
+        print(f"Running the workflow on PDF: {pdf_path}")
+        result = workflow.run(
+            pdf_path=pdf_path,
+            dataset_id=dataset_id,
+            ragflow_api_url=ragflow_api_url,
+            rag_flow_api_key=rag_flow_api_key
+        )
+        
+        # Print the results
+        print("\nWorkflow finished")
+        print(f"Parsed pages: {len(result.get('parsed_results', []))}")
+        print(f"Vectorized pages: {result.get('vectorized_pages', 0)}")
+        print(f"Vectorization results: {len(result.get('vectorized_results', []))}")
+        
+        # Check the outcome
+        if result.get('is_complete', False):
+            print("\n✅ Workflow run succeeded!")
+            return True
+        else:
+            print("\n❌ Workflow run failed!")
+            return False
+            
+    except Exception as e:
+        print(f"\n❌ Test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+if __name__ == "__main__":
+    test_pdf_parsing_workflow()