il y a 3 mois · f0b5960c74
--- a/src/api/db/models/search_request_models.py
+++ b/src/api/db/models/search_request_models.py
@@ -46,7 +46,7 @@ class SearchRequest(BaseModel):
 
															         le=1.0,
														
 
															         description="相似度分数阈值"
														
 
															     )
														
 
															-    kb_id: Optional[str] = Field(
														
 
															+    kb_id: List[str] = Field(
														
 
															         default=None,
														
 
															         description="知识库id",
														
 
															         examples=["kb_id1", "kb_id2"]
														
@@ -72,11 +72,3 @@ class SearchRequest(BaseModel):
 
															         """模型初始化后验证：确保至少提供一个搜索条件"""
														
 
															         if self.image_url is None and self.matching_text is None:
														
 
															             raise ValueError("必须提供 image_url 或 matching_text 其中至少一个")
														
 
															-
														
 
															-    @field_validator('kb_id')
														
 
															-    @classmethod
														
 
															-    def validate_kb_id(cls, v: Optional[str]) -> Optional[str]:
														
 
															-        """验证知识库id"""
														
 
															-        if v is not None and v.strip() == "":
														
 
															-            return None
														
 
															-        return v
														
--- a/src/api/db/services/vector_search_service.py
+++ b/src/api/db/services/vector_search_service.py
@@ -132,7 +132,6 @@ class VectorSearchService:
 
															                     query_vector = get_embedding_model().get_multimodal_embedding(request.matching_text, image)
														
 
															                 else:
														
 
															                     query_vector = get_embedding_model().get_text_embedding(request.matching_text)
														
 
															-            
														
 
															             search_query = {
														
 
															                 "match_field": self.match_field,
														
 
															                 "matching_text": request.matching_text,
														
--- a/src/api/sdk/search_infinity.py
+++ b/src/api/sdk/search_infinity.py
@@ -8,6 +8,41 @@ from src.utils.async_utils import run_in_threadpool
 
															 from src.utils.ragflow.ragflow_user_service import get_ragflow_user_service
														
 
															 from src.api.db.models import SearchRequest
														
 
															 from src.conf.settings import ragflow_settings
														
 
															+from src.common.logging_config import get_logger
														
 
															+
														
 
															+logger = get_logger(__name__)
														
 
															+
														
 
															+# 全局搜索服务实例
														
 
															+_search_service = None
														
 
															+
														
 
															+
														
 
															+def get_search_service():
														
 
															+    """获取搜索服务实例（单例模式）"""
														
 
															+    global _search_service
														
 
															+    if _search_service is None:
														
 
															+        _search_service = VectorSearchService(
														
 
															+            client=get_vector_db_client(database="ragflow_db"),
														
 
															+            vector_field="q_1024_vec",
														
 
															+            match_field="content_sm_ltks",
														
 
															+            output_fields=["content_with_weight"]
														
 
															+        )
														
 
															+        logger.info("Search service initialized")
														
 
															+    return _search_service
														
 
															+
														
 
															+
														
 
															+def _get_question_table_name():
														
 
															+    """获取问答对表名"""
														
 
															+    try:
														
 
															+        ragflow_user = get_ragflow_user_service().get_ragflow_id_and_api_key(2)
														
 
															+        ragflow_id = ragflow_user.get("ragflow_id")
														
 
															+        if not ragflow_id:
														
 
															+            raise Exception("未找到ragflow_id")
														
 
															+        table_name = f"{ragflow_settings.ragflow_dataset_prefix}_{ragflow_id}"
														
 
															+        logger.info(f"Question table name: {table_name}")
														
 
															+        return table_name
														
 
															+    except Exception as e:
														
 
															+        logger.error(f"Error getting question table name: {str(e)}")
														
 
															+        raise
														
 
															 # 创建FastAPI应用
														
@@ -82,48 +117,47 @@ async def question_search(request: SearchRequest):
 
															     - **database_name**: 数据库名称（可选，默认使用客户端配置的数据库）
														
 
															     """
														
 
															     try:
														
 
															-        output_fields = ["content_with_weight"]
														
 
															-        search_service = VectorSearchService(
														
 
															-            client=get_vector_db_client(database="ragflow_db"), 
														
 
															-            vector_field="q_1024_vec",
														
 
															-            output_fields=output_fields
														
 
															-        )
														
 
															-        ragflow_user = get_ragflow_user_service().get_ragflow_id_and_api_key(2)
														
 
															-        ragflow_id = ragflow_user.get("ragflow_id")
														
 
															-        if not ragflow_id:
														
 
															-            return Result.error(code=500, message="未找到ragflow_id")
														
 
															-        question_table_name = f"{ragflow_settings.ragflow_dataset_prefix}_{ragflow_id}"
														
 
															-        request.kb_id = "2b0ac35df80e11f096160242ac180002"
														
 
															-        request.database_ids = [question_table_name]
														
 
															+        logger.info(f"Received question search request: {request.matching_text[:100]}...")
														
 
															+        
														
 
															+        # 获取搜索服务实例
														
 
															+        search_service = get_search_service()
														
 
															+        
														
 
															+        # 执行搜索
														
 
															         result = await run_in_threadpool(search_service.hybrid_search, request)
														
 
															+        
														
 
															+        logger.info(f"Question search completed successfully, found {len(result) if result else 0} results")
														
 
															         return Result.success(data=result, message="问答对检索成功")
														
 
															     except Exception as e:
														
 
															+        logger.error(f"问答对检索失败: {str(e)}")
														
 
															         return Result.error(code=500, message=f"问答对检索失败: {str(e)}")
														
 
															 @app.post("/resource")
														
 
															-async def question_search(request: SearchRequest):
														
 
															+async def resource_search(request: SearchRequest):
														
 
															     """
														
 
															-    问答对检索接口
														
 
															+    资源检索接口
														
 
															     - **output_fields**: 要返回的字段列表
														
 
															     - **query**: 查询条件，包含vector_field、query_vector、field、query、topn和fusion_weight字段
														
 
															     - **database_name**: 数据库名称（可选，默认使用客户端配置的数据库）
														
 
															     """
														
 
															     try:
														
 
															-        output_fields = ["content_with_weight"]
														
 
															-        search_service = VectorSearchService(
														
 
															-            client=get_vector_db_client(database="ragflow_db"), 
														
 
															-            vector_field="q_1024_vec",
														
 
															-            output_fields=output_fields
														
 
															-        )
														
 
															-        ragflow_user = get_ragflow_user_service().get_ragflow_id_and_api_key(2)
														
 
															-        ragflow_id = ragflow_user.get("ragflow_id")
														
 
															-        if not ragflow_id:
														
 
															-            return Result.error(code=500, message="未找到ragflow_id")
														
 
															-        question_table_name = f"{ragflow_settings.ragflow_dataset_prefix}_{ragflow_id}"
														
 
															+        logger.info(f"Received resource search request: {request.matching_text[:100]}...")
														
 
															+        
														
 
															+        # 获取搜索服务实例
														
 
															+        search_service = get_search_service()
														
 
															+        
														
 
															+        # 获取资源表名
														
 
															+        resource_table_name = await run_in_threadpool(_get_question_table_name)
														
 
															+        
														
 
															+        # 设置请求参数
														
 
															         request.kb_id = "f59936fbfb1f11f095280242ac180002"
														
 
															-        request.database_ids = [question_table_name]
														
 
															+        request.database_ids = [resource_table_name]
														
 
															+        
														
 
															+        # 执行搜索
														
 
															         result = await run_in_threadpool(search_service.hybrid_search, request)
														
 
															-        return Result.success(data=result, message="问答对检索成功")
														
 
															+        
														
 
															+        logger.info(f"Resource search completed successfully, found {len(result) if result else 0} results")
														
 
															+        return Result.success(data=result, message="资源检索成功")
														
 
															     except Exception as e:
														
 
															-        return Result.error(code=500, message=f"问答对检索失败: {str(e)}")
														
 
															+        logger.error(f"资源检索失败: {str(e)}")
														
 
															+        return Result.error(code=500, message=f"资源检索失败: {str(e)}")
														
--- a/src/common/logging_config.py
+++ b/src/common/logging_config.py
@@ -19,8 +19,7 @@ DEFAULT_LOG_LEVEL = logging.INFO
 
															 # 日志文件配置（可选）
														
 
															 LOG_DIR = Path("logs")
														
 
															-LOG_FILE = LOG_DIR / "app.log"
														
 
															-
														
 
															+LOG_FILE = r"D:\project\work\graph_rag_server\logs\app.log"
														
 
															 def setup_logging(
														
 
															     level: int = DEFAULT_LOG_LEVEL,
														
@@ -54,6 +53,8 @@ def setup_logging(
 
															     # 添加文件处理器（如果需要）
														
 
															     if log_to_file:
														
 
															         file_path = log_file or LOG_FILE
														
 
															+        # 转换为Path对象
														
 
															+        file_path = Path(file_path)
														
 
															         # 确保日志目录存在
														
 
															         file_path.parent.mkdir(parents=True, exist_ok=True)
														
@@ -81,4 +82,4 @@ def get_logger(name: str) -> logging.Logger:
 
															 # 在模块导入时自动配置日志
														
 
															-setup_logging()
														
 
															+setup_logging(log_to_file=True)
														
--- a/src/model/multimodal_embedding.py
+++ b/src/model/multimodal_embedding.py
@@ -26,6 +26,14 @@ class Embedding:
 
															         self.base_url = model_settings.base_url
														
 
															         self.api_key = api_key or model_settings.api_key
														
 
															         self.dashscope_api_key =  model_settings.dashscope_api_key
														
 
															+        # 初始化OpenAI Embeddings客户端（只初始化一次）
														
 
															+        self._embeddings_client = OpenAIEmbeddings(
														
 
															+            model=self.embedding_model_name,
														
 
															+            base_url=self.base_url,
														
 
															+            api_key=self.api_key
														
 
															+        )
														
 
															+        # 初始化MultiModal Embedding客户端（只初始化一次）
														
 
															+        self._multimodal_client = MultiModalEmbedding
														
 
															     @observe(name="text_embedding", as_type="embedding")
														
 
															     def get_text_embedding(self, text: str) -> List[float]:
														
@@ -39,13 +47,8 @@ class Embedding:
 
															             List[float]: 文本的embedding向量
														
 
															         """
														
 
															         try:
														
 
															-            # 使用langchain_openai初始化OpenAI Embeddings模型
														
 
															-            embeddings = OpenAIEmbeddings(
														
 
															-                model=self.embedding_model_name,
														
 
															-                base_url=self.base_url,
														
 
															-                api_key=self.api_key
														
 
															-            )
														
 
															-            embedding = embeddings.embed_query(text)
														
 
															+            # 使用预先初始化的客户端
														
 
															+            embedding = self._embeddings_client.embed_query(text)
														
 
															             return embedding
														
 
															         except Exception as e:
														
 
															             raise Exception(f"文本embedding生成失败: {str(e)}")
														
@@ -62,13 +65,8 @@ class Embedding:
 
															             List[List[float]]: 文本列表的embedding向量列表
														
 
															         """
														
 
															         try:
														
 
															-            # 使用langchain_openai初始化OpenAI Embeddings模型
														
 
															-            embeddings = OpenAIEmbeddings(
														
 
															-                model=self.embedding_model_name,
														
 
															-                base_url=self.base_url,
														
 
															-                api_key=self.api_key
														
 
															-            )
														
 
															-            embeddings = embeddings.embed_documents(texts)
														
 
															+            # 使用预先初始化的客户端
														
 
															+            embeddings = self._embeddings_client.embed_documents(texts)
														
 
															             return embeddings
														
 
															         except Exception as e:
														
 
															             raise Exception(f"多个文本embedding生成失败: {str(e)}")
														
@@ -100,7 +98,7 @@ class Embedding:
 
															                     "image": f"data:image/png;base64,{image_base64}"
														
 
															                 }
														
 
															             ]
														
 
															-            response = MultiModalEmbedding.call(
														
 
															+            response = self._multimodal_client.call(
														
 
															                 model=self.multi_embedding_model_name,
														
 
															                 api_key=self.dashscope_api_key,
														
 
															                 input=item
														
@@ -138,7 +136,7 @@ class Embedding:
 
															             if text is not None and text.strip() != "":
														
 
															                 item.append({'text': text})
														
 
															-            response = MultiModalEmbedding.call(
														
 
															+            response = self._multimodal_client.call(
														
 
															                 model=self.multi_embedding_model_name,
														
 
															                 api_key=self.dashscope_api_key,
														
 
															                 input=item
														
@@ -151,7 +149,10 @@ class Embedding:
 
															         except Exception as e:
														
 
															             raise Exception(f"多模态embedding生成失败: {str(e)}")
														
 
															-# 全局单例
														
 
															+# 全局单例实例
														
 
															+_embedding_model_instance = None
														
 
															+
														
 
															+
														
 
															 def get_embedding_model() -> Embedding:
														
 
															     """
														
 
															     获取全局单例的Embedding模型
														
@@ -159,7 +160,10 @@ def get_embedding_model() -> Embedding:
 
															     Returns:
														
 
															         Embedding: 全局单例的Embedding模型
														
 
															     """
														
 
															-    return Embedding(
														
 
															-        model_name=model_settings.embedding_model_name,
														
 
															-        api_key=model_settings.api_key
														
 
															-    )
														
 
															+    global _embedding_model_instance
														
 
															+    if _embedding_model_instance is None:
														
 
															+        _embedding_model_instance = Embedding(
														
 
															+            model_name=model_settings.embedding_model_name,
														
 
															+            api_key=model_settings.api_key
														
 
															+        )
														
 
															+    return _embedding_model_instance
														
--- a/src/utils/chinese_text_processor.py
+++ b/src/utils/chinese_text_processor.py
@@ -0,0 +1,213 @@
 
															+"""
														
 
															+中文文本处理工具
														
 
															+
														
 
															+提供中文文本标准化、分词、权重计算和同义词扩展功能。
														
 
															+"""
														
 
															+
														
 
															+import re
														
 
															+import string
														
 
															+from typing import List, Dict, Tuple, Any
														
 
															+import jieba
														
 
															+from src.common.logging_config import get_logger
														
 
															+
														
 
															+logger = get_logger(__name__)
														
 
															+
														
 
															+
														
 
															+class ChineseTextProcessor:
														
 
															+    """
														
 
															+    中文文本处理器
														
 
															+    
														
 
															+    提供以下功能：
														
 
															+    1. 文本标准化
														
 
															+    2. 中文分词
														
 
															+    3. 停用词过滤
														
 
															+    4. 权重计算
														
 
															+    5. 同义词扩展
														
 
															+    """
														
 
															+    
														
 
															+    def __init__(self):
														
 
															+        """
														
 
															+        初始化中文文本处理器
														
 
															+        """
														
 
															+        # 加载停用词
														
 
															+        self.stopwords = self._load_stopwords()
														
 
															+    
														
 
															+    def _load_stopwords(self) -> set:
														
 
															+        """
														
 
															+        加载停用词表
														
 
															+        
														
 
															+        Returns:
														
 
															+            set: 停用词集合
														
 
															+        """
														
 
															+        # 基本停用词
														
 
															+        stopwords = {
														
 
															+            '的', '了', '是', '在', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也',
														
 
															+            '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '个',
														
 
															+            '得', '地', '天', '子', '日', '月', '年', '时', '分', '秒', '中', '国', '人', '民',
														
 
															+            '大', '小', '多', '少', '上', '下', '左', '右', '前', '后', '里', '外', '内', '外',
														
 
															+            '高', '低', '长', '短', '宽', '窄', '厚', '薄', '远', '近', '早', '晚', '今', '明',
														
 
															+            '昨', '天', '周', '月', '年', '春', '夏', '秋', '冬', '东', '南', '西', '北',
														
 
															+            # 英文停用词
														
 
															+            'a', 'an', 'the', 'and', 'or', 'but', 'if', 'because', 'as', 'what', 'which', 'this',
														
 
															+            'that', 'these', 'those', 'then', 'just', 'so', 'than', 'such', 'both', 'through',
														
 
															+            'about', 'for', 'is', 'of', 'while', 'during', 'to', 'from', 'in', 'on'
														
 
															+        }
														
 
															+        return stopwords
														
 
															+    
														
 
															+
														
 
															+    
														
 
															+    def normalize_text(self, text: str) -> str:
														
 
															+        """
														
 
															+        文本标准化
														
 
															+        
														
 
															+        Args:
														
 
															+            text: 原始文本
														
 
															+            
														
 
															+        Returns:
														
 
															+            str: 标准化后的文本
														
 
															+        """
														
 
															+        if not text:
														
 
															+            return ""
														
 
															+        
														
 
															+        # 1. 移除 WWW 相关内容
														
 
															+        text = re.sub(r'https?://\S+', '', text)
														
 
															+        text = re.sub(r'www\.\S+', '', text)
														
 
															+        
														
 
															+        # 2. 中英文之间添加空格
														
 
															+        text = re.sub(r'([a-zA-Z])([\u4e00-\u9fa5])', r'\1 \2', text)
														
 
															+        text = re.sub(r'([\u4e00-\u9fa5])([a-zA-Z])', r'\1 \2', text)
														
 
															+        
														
 
															+        # 3. 替换特殊字符为空格
														
 
															+        text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', ' ', text)
														
 
															+        
														
 
															+        # 4. 全角转半角
														
 
															+        def full_to_half(s):
														
 
															+            result = []
														
 
															+            for char in s:
														
 
															+                code = ord(char)
														
 
															+                if code == 12288:  # 全角空格
														
 
															+                    result.append(' ')
														
 
															+                elif 65281 <= code <= 65374:  # 全角字符
														
 
															+                    result.append(chr(code - 65248))
														
 
															+                else:
														
 
															+                    result.append(char)
														
 
															+            return ''.join(result)
														
 
															+        
														
 
															+        text = full_to_half(text)
														
 
															+        
														
 
															+        # 5. 转换为小写
														
 
															+        text = text.lower()
														
 
															+        
														
 
															+        # 6. 移除多余空格
														
 
															+        text = re.sub(r'\s+', ' ', text).strip()
														
 
															+        
														
 
															+        return text
														
 
															+    
														
 
															+    def split(self, text: str) -> List[str]:
														
 
															+        """
														
 
															+        中文分词
														
 
															+        
														
 
															+        Args:
														
 
															+            text: 标准化后的文本
														
 
															+            
														
 
															+        Returns:
														
 
															+            List[str]: 分词结果
														
 
															+        """
														
 
															+        if not text:
														
 
															+            return []
														
 
															+        
														
 
															+        # 使用 jieba 分词
														
 
															+        words = jieba.cut(text)
														
 
															+        
														
 
															+        # 过滤停用词和无意义字符
														
 
															+        filtered_words = []
														
 
															+        for word in words:
														
 
															+            # 过滤停用词
														
 
															+            if word in self.stopwords:
														
 
															+                continue
														
 
															+            # 过滤无意义字符
														
 
															+            if len(word) < 1:
														
 
															+                continue
														
 
															+            # 过滤纯数字
														
 
															+            if word.isdigit():
														
 
															+                continue
														
 
															+            # 过滤纯空格
														
 
															+            if word.isspace():
														
 
															+                continue
														
 
															+            filtered_words.append(word)
														
 
															+        
														
 
															+        return filtered_words
														
 
															+    
														
 
															+    def weights(self, words: List[str]) -> Dict[str, float]:
														
 
															+        """
														
 
															+        计算词权重
														
 
															+        
														
 
															+        Args:
														
 
															+            words: 分词结果
														
 
															+            
														
 
															+        Returns:
														
 
															+            Dict[str, float]: 词权重映射
														
 
															+        """
														
 
															+        if not words:
														
 
															+            return {}
														
 
															+        
														
 
															+        # 简单的词频权重计算
														
 
															+        word_freq = {}
														
 
															+        for word in words:
														
 
															+            word_freq[word] = word_freq.get(word, 0) + 1
														
 
															+        
														
 
															+        # 归一化权重
														
 
															+        total_freq = sum(word_freq.values())
														
 
															+        weights = {}
														
 
															+        for word, freq in word_freq.items():
														
 
															+            # 词长因子：长词权重更高
														
 
															+            length_factor = min(len(word) / 4, 1.0)
														
 
															+            # 频率因子
														
 
															+            freq_factor = freq / total_freq
														
 
															+            # 综合权重
														
 
															+            weight = (freq_factor * 0.7) + (length_factor * 0.3)
														
 
															+            weights[word] = round(weight, 4)
														
 
															+        
														
 
															+        return weights
														
 
															+    
														
 
															+    def lookup(self, word: str) -> List[str]:
														
 
															+        """
														
 
															+        查找同义词（已禁用）
														
 
															+        
														
 
															+        Args:
														
 
															+            word: 原始词
														
 
															+            
														
 
															+        Returns:
														
 
															+            List[str]: 空列表（同义词功能已禁用）
														
 
															+        """
														
 
															+        return []
														
 
															+    
														
 
															+    def process_text(self, text: str) -> Dict[str, Any]:
														
 
															+        """
														
 
															+        完整文本处理流程
														
 
															+        
														
 
															+        Args:
														
 
															+            text: 原始文本
														
 
															+            
														
 
															+        Returns:
														
 
															+            Dict[str, Any]: 处理结果
														
 
															+        """
														
 
															+        # 1. 文本标准化
														
 
															+        normalized_text = self.normalize_text(text)
														
 
															+        
														
 
															+        # 2. 分词
														
 
															+        words = self.split(normalized_text)
														
 
															+        
														
 
															+        # 3. 权重计算
														
 
															+        word_weights = self.weights(words)
														
 
															+        
														
 
															+        return {
														
 
															+            'original_text': text,
														
 
															+            'normalized_text': normalized_text,
														
 
															+            'words': words,
														
 
															+            'word_weights': word_weights
														
 
															+        }
														
 
															+
														
 
															+
														
 
															+# 全局实例
														
 
															+chinese_processor = ChineseTextProcessor()
														
--- a/src/utils/vector_db/elasticsearch_adapter.py
+++ b/src/utils/vector_db/elasticsearch_adapter.py
@@ -11,6 +11,7 @@ from .base import VectorDBClient
 
															 from .result_util import UnifiedSearchResult, build_unified_result
														
 
															 from src.conf.settings import es_settings, vector_db_settings
														
 
															 from src.common.logging_config import get_logger
														
 
															+from src.utils.chinese_text_processor import chinese_processor
														
 
															 logger = get_logger(__name__)
														
@@ -214,14 +215,15 @@ class ElasticsearchAdapter(VectorDBClient):
 
															                 "query_vector": query_vector,
														
 
															                 "k": topn,
														
 
															                 "num_candidates": topn * 10,
														
 
															+                "similarity": threshold,
														
 
															                 "filter": kb_id_filter  # kb_id 精准匹配（支持单个或数组）
														
 
															             },
														
 
															             "_source": output_fields
														
 
															         }
														
 
															-        # 如果有阈值，添加 min_score
														
 
															-        if threshold > 0:
														
 
															-            search_params["min_score"] = threshold
														
 
															+        # # 如果有阈值，添加 min_score
														
 
															+        # if threshold > 0:
														
 
															+        #     search_params["min_score"] = threshold
														
 
															         result = self._client.search(**search_params)
														
 
															         return self._convert_result(result, output_fields)
														
@@ -252,17 +254,23 @@ class ElasticsearchAdapter(VectorDBClient):
 
															         # 构建 kb_id 过滤条件（支持单个值或数组）
														
 
															         kb_id_filter = self._build_kb_id_filter(kb_id)
														
 
															-        # index_name =  "ragbook_1_f3c87b89f82711f0b7450242ac180002,ragbook_2_16db7236f82911f093e20242ac180002"
														
 
															         index_name = table_name
														
 
															         vector_field = query.get("vector_field", "dense_vector")
														
 
															         query_vector = query.get("query_vector", [])
														
 
															         # 确保 query_vector 是 Python 原生列表，兼容 numpy 数组等类型
														
 
															-        if query_vector is not None and len(query_vector) > 0:
														
 
															-            # 强制转换为 Python 原生 float 列表
														
 
															-            query_vector = [float(x) for x in query_vector]
														
 
															-            logger.info(f"混合搜索: 已转换 query_vector 类型, 长度={len(query_vector)}, 前3个元素={query_vector[:3]}")
														
 
															-        else:
														
 
															-            query_vector = []
														
 
															+        # if query_vector is not None and len(query_vector) > 0:
														
 
															+        #     # 强制转换为 Python 原生 float 列表
														
 
															+        #     query_vector = [float(x) for x in query_vector]
														
 
															+        #     logger.info(f"混合搜索: 已转换 query_vector 类型, 长度={len(query_vector)}, 前3个元素={query_vector[:3]}")
														
 
															+        # else:
														
 
															+        #     query_vector = []
														
 
															+
														
 
															+        # 将query_vector的值写入D:\project\work\graph_rag_server\logs\app.log
														
 
															+        logger.info(f"混合搜索: query_vector={query_vector}")
														
 
															+        # 获取阈值
														
 
															+        knn_params = query.get("knn_params", {})
														
 
															+        threshold = float(knn_params.get("threshold", 0.0))
														
 
															+
														
 
															         match_field = query.get("match_field", "content")
														
 
															         matching_text = query.get("matching_text")
														
 
															         if matching_text is not None and not isinstance(matching_text, str):
														
@@ -275,24 +283,21 @@ class ElasticsearchAdapter(VectorDBClient):
 
															         # match 的 query 不能为 None，否则 ES 报 VALUE_NULL。无文案时仅用向量（match_all）
														
 
															         if matching_text:
														
 
															-            text_clause = {
														
 
															-                "match": {
														
 
															-                    match_field: {"query": matching_text, "boost": 1.0}
														
 
															-                }
														
 
															-            }
														
 
															+            # 处理中文文本
														
 
															+            text_clause = self._build_chinese_query(match_field, matching_text)
														
 
															         else:
														
 
															-            text_clause = {"match_all": {"boost": 1.0}}
														
 
															-        
														
 
															+            text_clause = {"match_all": {"boost": 2.0}}
														
 
															         # 构建混合查询（ES 8.x 使用关键字参数而非 body）
														
 
															         # 在 query 的 bool.must 中添加 kb_id 精准匹配，在 knn 的 filter 中也添加
														
 
															+        # 打印所有查询条件
														
 
															         result = self._client.search(
														
 
															             index=index_name,
														
 
															             query={
														
 
															                 "bool": {
														
 
															-                    "must": [
														
 
															+                    "filter": [
														
 
															                         kb_id_filter,  # kb_id 精准匹配（支持单个或数组）
														
 
															                     ],
														
 
															-                    "should": [text_clause]
														
 
															+                    "must": [text_clause]
														
 
															                 }
														
 
															             },
														
 
															             knn={
														
@@ -300,7 +305,8 @@ class ElasticsearchAdapter(VectorDBClient):
 
															                 "query_vector": query_vector,
														
 
															                 "k": topn,
														
 
															                 "num_candidates": topn * 10,
														
 
															-                "boost": 1.0,
														
 
															+                "similarity": threshold,
														
 
															+                "boost": 8.0,
														
 
															                 "filter": kb_id_filter  # kb_id 精准匹配（支持单个或数组）
														
 
															             },
														
 
															             size=topn,
														
@@ -328,6 +334,82 @@ class ElasticsearchAdapter(VectorDBClient):
 
															             return result
														
 
															         return None
														
 
															+    def _build_chinese_query(self, field: str, text: str) -> Dict[str, Any]:
														
 
															+        """
														
 
															+        构建中文查询
														
 
															+        
														
 
															+        Args:
														
 
															+            field: 查询字段
														
 
															+            text: 原始文本
														
 
															+            
														
 
															+        Returns:
														
 
															+            Dict[str, Any]: ES查询DSL
														
 
															+        """
														
 
															+        # 处理中文文本
														
 
															+        processed_result = chinese_processor.process_text(text)
														
 
															+        
														
 
															+        logger.info(f"Chinese text processed: {processed_result}")
														
 
															+        
														
 
															+        # 获取处理结果
														
 
															+        words = processed_result.get('words', [])
														
 
															+        word_weights = processed_result.get('word_weights', {})
														
 
															+        
														
 
															+        # 构建布尔查询
														
 
															+        bool_query = {
														
 
															+            "bool": {
														
 
															+                "should": []
														
 
															+            }
														
 
															+        }
														
 
															+        
														
 
															+        # 添加原始词查询
														
 
															+        for word in words:
														
 
															+            weight = word_weights.get(word, 1.0)
														
 
															+            # 添加 term 查询（精确匹配）
														
 
															+            bool_query["bool"]["should"].append({
														
 
															+                "term": {
														
 
															+                    field: {
														
 
															+                        "value": word,
														
 
															+                        "boost": 2.0 * weight
														
 
															+                    }
														
 
															+                }
														
 
															+            })
														
 
															+            # 添加 match 查询（模糊匹配）
														
 
															+            bool_query["bool"]["should"].append({
														
 
															+                "match": {
														
 
															+                    field: {
														
 
															+                        "query": word,
														
 
															+                        "boost": 1.0 * weight
														
 
															+                    }
														
 
															+                }
														
 
															+            })
														
 
															+        
														
 
															+        # 添加短语查询（保留词序）
														
 
															+        if len(words) > 1:
														
 
															+            phrase_query = " ".join(words)
														
 
															+            bool_query["bool"]["should"].append({
														
 
															+                "match_phrase": {
														
 
															+                    field: {
														
 
															+                        "query": phrase_query,
														
 
															+                        "boost": 3.0
														
 
															+                    }
														
 
															+                }
														
 
															+            })
														
 
															+        
														
 
															+        # 添加原始文本匹配
														
 
															+        bool_query["bool"]["should"].append({
														
 
															+            "match": {
														
 
															+                field: {
														
 
															+                    "query": text,
														
 
															+                    "boost": 1.5
														
 
															+                }
														
 
															+            }
														
 
															+        })
														
 
															+        
														
 
															+        # 设置 minimum_should_match，确保至少匹配一个条件
														
 
															+        bool_query["bool"]["minimum_should_match"] = 1
														
 
															+        
														
 
															+        return bool_query
														
 
															+    
														
 
															     @staticmethod
														
 
															     def _parse_cond_for_term(cond: str) -> Optional[Tuple[str, str]]:
														
 
															         """