import re
from typing import Optional, Tuple

from src.conf.settings import vector_db_settings
from src.utils.infinity import InfinityClient
from src.utils.file.image_util import image_util
from src.model.multimodal_embedding import get_embedding_model
from src.utils.infinity.result_util import convert_to_langchain_docs
from src.api.dataset.models.dify_models import RetrievalRequest

# Full-width (CJK) punctuation accepted as an alternative to ASCII "," and ":"
# in the structured query syntax. Written as escapes so that the characters
# cannot be silently normalised to ASCII by an editor or re-encoding step.
_FULLWIDTH_COMMA = "\uff0c"
_KEY_VALUE_SEPARATOR = re.compile("[:\uff1a]")


class DifyKnowledgeService:
    """Answer Dify retrieval requests with a vector search on an Infinity table."""

    def __init__(
        self,
        infinity_client: InfinityClient,
        vector_field: Optional[str] = None,
        match_field: Optional[str] = None,
        match_type: Optional[str] = None,
        table_name: Optional[str] = None,
    ):
        """
        Store the client and the search configuration.

        Args:
            infinity_client: Client used to run the vector search.
            vector_field: Name of the vector column; defaults to
                "dense_vector_1024".
            match_field: Defaults to "content".
            match_type: Defaults to "cosine".
            table_name: Table to search; defaults to
                ``vector_db_settings.infinity_table_name``.
        """
        self.infinity_client = infinity_client
        # Columns requested from the search for every hit.
        self.output_fields = [
            "file_name",
            "page_number",
            "content",
            "image_path",
            "dataset_id",
            "document_id",
            "_similarity",
        ]
        self.vector_field = vector_field or "dense_vector_1024"
        self.match_field = match_field or "content"
        self.match_type = match_type or "cosine"
        self.table_name = table_name or vector_db_settings.infinity_table_name

    @staticmethod
    def _parse_query(query: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Split a query string into its text and image-URL parts.

        The structured form is a comma-separated list of ``key:value`` pairs,
        e.g. ``matching_text:cat,match_image:http://host/a.png``. Both ASCII
        and full-width commas/colons are accepted. Only the keys
        ``matching_text`` and ``match_image`` are recognised; when a key is
        repeated the last occurrence wins. If neither key is found, the whole
        query is used as the text.

        Args:
            query: Raw query string from the request.

        Returns:
            A ``(input_text, input_image)`` tuple; either element may be None.
        """
        input_text = None
        input_image = None

        for pair in query.replace(_FULLWIDTH_COMMA, ",").split(","):
            # Split on the first colon only (ASCII or full-width), so that
            # colons inside the value, such as the one in "http://", survive.
            parts = _KEY_VALUE_SEPARATOR.split(pair, maxsplit=1)
            if len(parts) != 2:
                continue
            key = parts[0].strip()
            value = parts[1].strip()
            if key == 'match_image':
                input_image = value
            elif key == 'matching_text':
                input_text = value

        # No recognised key at all: treat the query as plain text.
        if input_image is None and input_text is None:
            input_text = query

        return input_text, input_image

    def dify_database_search(self, request: RetrievalRequest):
        """
        Run a multimodal vector search for a Dify retrieval request.

        Args:
            request: Retrieval request. ``knowledge_id`` must be non-empty,
                ``query`` holds the text and/or image URL (see
                ``_parse_query``), and ``retrieval_setting`` supplies
                ``top_k`` and ``score_threshold``.

        Returns:
            The search result passed through ``convert_to_langchain_docs``.

        Raises:
            Exception: On any failure, including an empty ``knowledge_id``.
                The message is prefixed with "搜索失败: " and the original
                exception is attached as the cause.
        """
        try:
            if not request.knowledge_id:
                raise Exception("knowledge_id不能为空")
            # NOTE: knowledge_id is only validated here; every knowledge base
            # is searched in the single configured table.
            table_name = self.table_name

            input_text, input_image = self._parse_query(request.query)
            retrieval_setting = request.retrieval_setting

            # Only resolve an image when the query actually carried a URL.
            # NOTE(review): assumes the helper would yield None for a None
            # URL anyway - confirm against image_util.
            image = (
                image_util._url_to_image(input_image)
                if input_image is not None
                else None
            )

            # Embed text and image together into one query vector.
            query_vector = get_embedding_model().get_multimodal_embedding(
                text=input_text, image=image
            )

            search_query = {
                "vector_field": self.vector_field,
                "query_vector": query_vector,
                "topn": retrieval_setting.top_k,
                "knn_params": {
                    "ef": str(retrieval_setting.top_k * 10),
                    "threshold": str(retrieval_setting.score_threshold)
                }
            }

            result = self.infinity_client.vector_search(
                table_name, self.output_fields, search_query
            )
            result_dict = result.to_result()
            return convert_to_langchain_docs(result_dict)
        except Exception as e:
            # Same exception type and message as before; the cause is chained
            # so that the original traceback is not lost.
            raise Exception(f"搜索失败: {e}") from e