Quellcode durchsuchen

dify外部知识库接口提交

yingge vor 3 Monaten
Ursprung
Commit
b8fb77df30

+ 2 - 2
docker/Dockerfile

@@ -12,11 +12,11 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
 # 安装系统依赖
 # build-essential: 编译依赖
 # curl: 网络工具
-# libgl1-mesa-glx: OpenCV 等库可能需要
+# libgl1: OpenCV 等库可能需要 (替换已弃用的 libgl1-mesa-glx)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     curl \
-    libgl1-mesa-glx \
+    libgl1 \
     libglib2.0-0 \
     && rm -rf /var/lib/apt/lists/*
 

+ 1 - 1
docker/docker-compose.yml

@@ -10,7 +10,7 @@ services:
     ports:
       - "18001:18001"
     volumes:
-      - ./.env:/app/.env
+      - ../.env:/app/.env
       # 开发环境下可取消注释以下行以挂载源码
       # - ../src:/app/src
     environment:

+ 37 - 21
src/api/dataset/services/dify_knowledge_service.py

@@ -1,4 +1,4 @@
-import json
+import re
 from src.conf.settings import vector_db_settings
 from src.utils.infinity import InfinityClient
 from src.utils.file.image_util import image_util
@@ -8,18 +8,19 @@ from src.api.dataset.models.dify_models import RetrievalRequest
 from src.conf.settings import vector_db_settings
 
 class DifyKnowledgeService:
-    def __init__(self, infinity_client: InfinityClient, vector_field: str = None, match_field: str = None, match_type: str = None, table_name: str = None):
+    def __init__(self, infinity_client: InfinityClient, vector_field: str = None, match_field: str = None,
+                 match_type: str = None, table_name: str = None):
         self.infinity_client = infinity_client
         # 输出字段
         self.output_fields = [
-                    "file_name",
-                    "page_number",
-                    "content",
-                    "image_path",
-                    "dataset_id",
-                    "document_id",
-                    "_similarity"
-                ]
+            "file_name",
+            "page_number",
+            "content",
+            "image_path",
+            "dataset_id",
+            "document_id",
+            "_similarity"
+        ]
         self.vector_field = vector_field or "dense_vector_1024"
         self.match_field = match_field or "content"
         self.match_type = match_type or "cosine"
@@ -28,10 +29,10 @@ class DifyKnowledgeService:
     def dify_database_search(self, request: RetrievalRequest):
         """
         执行Dify数据库搜索
-    
+
         Args:
             retrievalRequest: 搜索查询参数
-        
+
         Returns:
             搜索结果,转换为基本类型以便序列化
         """
@@ -42,14 +43,29 @@ class DifyKnowledgeService:
             else:
                 # 抛出异常
                 raise Exception("knowledge_id不能为空")
-            # 获取检索参数,并解析为json
-            try:
-                query = json.loads(request.query)
-                # 检查query是否包含match_image或match_text
-                if "match_image" in query or "matching_text" in query:
-                    input_image = query.get("match_image")
-                    input_text = query.get("matching_text")
-            except json.JSONDecodeError:
+            # 解析格式如: matching_text:点点,match_image:http://xxx 或 matching_text:点点,match_image:http://xxx
+            # 支持中文和英文的逗号、冒号
+            input_image = None
+            input_text = None
+            query_str = request.query
+            # 将中文逗号替换为英文逗号,用于分割
+            query_str_normalized = re.sub(r'[,]', ',', query_str)
+            # 按逗号分割为多个键值对
+            pairs = query_str_normalized.split(',')
+            for pair in pairs:
+                # 将中文冒号替换为英文冒号,用于分割键值
+                pair_normalized = re.sub(r'[:]', ':', pair, count=1)
+                if ':' in pair_normalized:
+                    # 只分割第一个冒号,避免URL中的冒号被分割
+                    key, value = pair_normalized.split(':', 1)
+                    key = key.strip()
+                    value = value.strip()
+                    if key == 'match_image':
+                        input_image = value
+                    elif key == 'matching_text':
+                        input_text = value
+            # 如果没有解析出任何参数,将整个query作为input_text
+            if input_image is None and input_text is None:
                 input_text = request.query
 
             retrieval_setting = request.retrieval_setting
@@ -69,7 +85,7 @@ class DifyKnowledgeService:
                 }
             }
             # 执行搜索
-            result = self.infinity_client.vector_search(table_name, self.output_fields, search_query)  
+            result = self.infinity_client.vector_search(table_name, self.output_fields, search_query)
             # 将结果转换为基本类型,处理可能的复杂类型
             result_dict = result.to_result()
             # 递归转换所有复杂类型为基本类型

+ 2 - 9
src/utils/infinity/result_util.py

@@ -5,13 +5,10 @@ from langchain_core.documents import Document
 def convert_to_basic_types(obj: Any) -> Any:
     """
     递归将对象转换为基本类型,以便Pydantic能够序列化
-    
     特殊处理:当字典中的子项包含相同长度的数组时,将其转换为数组对象结构
     例如:{"a": [1,2], "b": [3,4]} -> [{"a":1, "b":3}, {"a":2, "b":4}]
-    
     Args:
         obj: 要转换的对象
-    
     Returns:
         转换后的基本类型对象
     """
@@ -22,7 +19,6 @@ def convert_to_basic_types(obj: Any) -> Any:
     elif isinstance(obj, dict):
         # 先递归转换所有值
         converted = {k: convert_to_basic_types(v) for k, v in obj.items()}
-        
         # 检查是否需要转换为数组对象结构
         # 条件:所有值都是列表,且长度一致,且长度大于0
         values = list(converted.values())
@@ -42,7 +38,6 @@ def convert_to_basic_types(obj: Any) -> Any:
                             item[key] = None
                     result.append(item)
                 return result
-        
         return converted
     elif isinstance(obj, (list, tuple)):
         return [convert_to_basic_types(item) for item in obj]
@@ -56,18 +51,16 @@ def convert_to_basic_types(obj: Any) -> Any:
 def convert_to_langchain_docs(obj: Any) -> List[Document]:
     """
     将Infinity搜索结果转换为LangChain的Document格式
-    
     Args:
         obj: 要转换的对象
-    
     Returns:
         转换后的Document列表
     """
     res = convert_to_basic_types(obj=obj)
     # 将数据转换为 LangChain 的 Document 格式
     candidate_docs = [
-        Document(page_content=item["content"], 
-            metadata={k: v for k, v in item.items() if k != "content"}) 
+        Document(page_content=item["content"],
+                 metadata={k: v for k, v in item.items() if k != "content"})
         for item in res[0]
     ]
     return candidate_docs