| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- from typing import Dict, Any, List
- from conf.config import VectorDBConfig
- from utils.infinity import InfinityClient
- from utils.file.image_util import image_util
- from model.multimodal_embedding import get_embedding_model
def convert_to_basic_types(obj: Any) -> Any:
    """Recursively convert ``obj`` into plain, serializable Python types.

    Special case: a non-empty dict whose values are *all* lists of the same
    non-zero length is transposed from a "columns" layout into a list of
    "row" dicts, e.g. ``{"a": [1, 2], "b": [3, 4]}`` becomes
    ``[{"a": 1, "b": 3}, {"a": 2, "b": 4}]``.

    Args:
        obj: The object to convert.

    Returns:
        ``None``, ``str``, ``int``, ``float`` and ``bool`` values unchanged;
        dicts with converted values (or the transposed row list described
        above); lists/tuples as lists of converted items; any other object
        as ``dict(obj)`` when that succeeds, otherwise ``str(obj)``.
    """
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj

    if isinstance(obj, dict):
        # Convert every value first so the shape check below sees tuples
        # as lists too.
        converted = {key: convert_to_basic_types(value) for key, value in obj.items()}
        columns = list(converted.values())

        if columns and all(isinstance(column, list) for column in columns):
            lengths = {len(column) for column in columns}
            # Transpose only when every column has the same, non-zero length.
            if len(lengths) == 1 and 0 not in lengths:
                keys = list(converted)
                return [dict(zip(keys, row)) for row in zip(*columns)]

        return converted

    if isinstance(obj, (list, tuple)):
        return [convert_to_basic_types(item) for item in obj]

    # Fallback for arbitrary objects: prefer a mapping view, otherwise the
    # string form. ``except Exception`` (rather than a bare ``except``) keeps
    # the best-effort behavior while letting KeyboardInterrupt and SystemExit
    # propagate.
    try:
        return dict(obj)
    except Exception:
        return str(obj)
class InfinitySearchService:
    """Run plain, vector and hybrid searches against one Infinity table.

    Each public method delegates to the matching method of the injected
    client, calls ``to_result()`` on what comes back and passes that through
    ``convert_to_basic_types`` so the response is serializable.
    """

    def __init__(self, infinity_client: "InfinityClient", vector_field: str = None, match_field: str = None, match_type: str = None, table_name: str = None):
        """Store the client and resolve search settings.

        Args:
            infinity_client: Client used to execute the searches.
            vector_field: Vector column name; defaults to
                ``"dense_vector_1024"``.
            match_field: Text column name used by hybrid search; defaults to
                ``"content"``.
            match_type: Defaults to ``"cosine"``. Stored on the instance but
                not read by any method of this class.
            table_name: Table to query; defaults to
                ``VectorDBConfig.get_infinity_table_name()``.
        """
        self.infinity_client = infinity_client
        # Columns requested from every search.
        self.output_fields = [
            "file_name",
            "page_number",
            "content",
            "image_path",
            "dataset_id",
            "document_id",
        ]
        self.vector_field = vector_field or "dense_vector_1024"
        self.match_field = match_field or "content"
        self.match_type = match_type or "cosine"
        self.table_name = table_name or VectorDBConfig.get_infinity_table_name()

    def _with_query_vector(self, search_query: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of ``search_query`` extended with the query embedding.

        Reads ``search_query["image_url"]`` and
        ``search_query["matching_text"]``. A shallow copy is returned so the
        caller's dict is not modified.
        """
        image = image_util._url_to_image(search_query["image_url"])
        query_vector = get_embedding_model().get_multimodal_embedding(search_query["matching_text"], image)
        enriched = dict(search_query)
        enriched["vector_field"] = self.vector_field
        enriched["query_vector"] = query_vector
        return enriched

    @staticmethod
    def _to_basic(result: Any) -> Any:
        """Turn a client result into basic types via ``to_result()``."""
        return convert_to_basic_types(result.to_result())

    def search(self, search_query: Dict[str, Any]) -> Dict[str, Any]:
        """Execute a search against the Infinity table.

        Args:
            search_query: Search parameters, passed to the client unchanged.

        Returns:
            The search result converted to basic types for serialization.

        Raises:
            Exception: On any failure, with the original error chained as
                ``__cause__``.
        """
        try:
            result = self.infinity_client.search(self.table_name, self.output_fields, search_query)
            return self._to_basic(result)
        except Exception as e:
            raise Exception(f"搜索失败: {str(e)}") from e

    def vector_search(self, search_query: Dict[str, Any]):
        """Execute a vector search against the Infinity table.

        Args:
            search_query: Query parameters; must contain ``"image_url"`` and
                ``"matching_text"``. The dict itself is left unmodified.

        Returns:
            The vector search result converted to basic types.

        Raises:
            Exception: On any failure, with the original error chained as
                ``__cause__``.
        """
        try:
            query = self._with_query_vector(search_query)
            result = self.infinity_client.vector_search(self.table_name, self.output_fields, query)
            return self._to_basic(result)
        except Exception as e:
            raise Exception(f"向量检索失败: {str(e)}") from e

    def hybrid_search(self, search_query: Dict[str, Any]):
        """Execute a hybrid search against the Infinity table.

        Args:
            search_query: Query parameters; must contain ``"image_url"`` and
                ``"matching_text"``. The dict itself is left unmodified.

        Returns:
            The hybrid search result converted to basic types.

        Raises:
            Exception: On any failure, with the original error chained as
                ``__cause__``.
        """
        try:
            query = self._with_query_vector(search_query)
            query["match_field"] = self.match_field
            result = self.infinity_client.hybrid_search(self.table_name, self.output_fields, query)
            return self._to_basic(result)
        except Exception as e:
            raise Exception(f"混合检索失败: {str(e)}") from e
|