|
@@ -8,7 +8,7 @@
|
|
|
from typing import List, Dict, Any, Optional
|
|
from typing import List, Dict, Any, Optional
|
|
|
from src.utils.mysql import get_global_mysql_client
|
|
from src.utils.mysql import get_global_mysql_client
|
|
|
from src.utils.vector_db import get_vector_db_client
|
|
from src.utils.vector_db import get_vector_db_client
|
|
|
-from src.conf.settings import vector_db_settings
|
|
|
|
|
|
|
+from src.conf.settings import vector_db_settings, ragflow_settings
|
|
|
from src.common.logging_config import get_logger
|
|
from src.common.logging_config import get_logger
|
|
|
|
|
|
|
|
logger = get_logger(__name__)
|
|
logger = get_logger(__name__)
|
|
@@ -20,12 +20,12 @@ INFINITY_TABLE_COLUMNS = [
|
|
|
{"name": "file_name", "type": "varchar", "default": ""},
|
|
{"name": "file_name", "type": "varchar", "default": ""},
|
|
|
{"name": "page_number", "type": "int", "default": 0},
|
|
{"name": "page_number", "type": "int", "default": 0},
|
|
|
{"name": "content", "type": "varchar", "default": ""},
|
|
{"name": "content", "type": "varchar", "default": ""},
|
|
|
- {"name": "doc_content", "type": "varchar", "default": ""},
|
|
|
|
|
{"name": "image_path", "type": "varchar", "default": ""},
|
|
{"name": "image_path", "type": "varchar", "default": ""},
|
|
|
- {"name": "dataset_id", "type": "varchar", "default": ""},
|
|
|
|
|
{"name": "document_id", "type": "varchar", "default": ""},
|
|
{"name": "document_id", "type": "varchar", "default": ""},
|
|
|
|
|
+ {"name": "chunk_id", "type": "varchar", "default": ""},
|
|
|
{"name": "ability_tags", "type": "varchar", "default": ""},
|
|
{"name": "ability_tags", "type": "varchar", "default": ""},
|
|
|
{"name": "content_tag", "type": "varchar", "default": ""},
|
|
{"name": "content_tag", "type": "varchar", "default": ""},
|
|
|
|
|
+ {"name": "metadata", "type": "varchar", "default": ""},
|
|
|
{"name": "dense_vector_1024", "type": "vector,1024,float"},
|
|
{"name": "dense_vector_1024", "type": "vector,1024,float"},
|
|
|
]
|
|
]
|
|
|
|
|
|
|
@@ -35,13 +35,13 @@ ES_INDEX_MAPPINGS = {
|
|
|
"id": {"type": "keyword"},
|
|
"id": {"type": "keyword"},
|
|
|
"file_name": {"type": "keyword"},
|
|
"file_name": {"type": "keyword"},
|
|
|
"page_number": {"type": "integer"},
|
|
"page_number": {"type": "integer"},
|
|
|
- "content": {"type": "text", "analyzer": "standard"},
|
|
|
|
|
- "doc_content": {"type": "text", "analyzer": "standard"},
|
|
|
|
|
|
|
+ "content": {"type": "text", "analyzer": "ik_smart"},
|
|
|
"image_path": {"type": "keyword"},
|
|
"image_path": {"type": "keyword"},
|
|
|
- "dataset_id": {"type": "keyword"},
|
|
|
|
|
"document_id": {"type": "keyword"},
|
|
"document_id": {"type": "keyword"},
|
|
|
|
|
+ "chunk_id": {"type": "keyword"},
|
|
|
"ability_tags": {"type": "keyword"},
|
|
"ability_tags": {"type": "keyword"},
|
|
|
"content_tag": {"type": "keyword"},
|
|
"content_tag": {"type": "keyword"},
|
|
|
|
|
+ "metadata": {"type": "object"},
|
|
|
"dense_vector_1024": {
|
|
"dense_vector_1024": {
|
|
|
"type": "dense_vector",
|
|
"type": "dense_vector",
|
|
|
"dims": 1024,
|
|
"dims": 1024,
|
|
@@ -50,6 +50,13 @@ ES_INDEX_MAPPINGS = {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
# Elasticsearch index settings, passed as `settings=` when an index is created.
ES_INDEX_SETTINGS = {
    "index": {
        # Two primary shards per index.
        "number_of_shards": 2,
        # No replica copies are configured.
        "number_of_replicas": 0,
    },
}
|
|
|
|
|
|
|
|
|
|
|
|
|
class PromptService:
|
|
class PromptService:
|
|
@@ -66,9 +73,9 @@ class PromptService:
|
|
|
self._vector_client = get_vector_db_client()
|
|
self._vector_client = get_vector_db_client()
|
|
|
return self._vector_client
|
|
return self._vector_client
|
|
|
|
|
|
|
|
def _get_table_name(self, dimension: Dict) -> str:
    """Build the table/index name used for a dimension.

    Args:
        dimension: Dimension record; its ``id`` and ``dataset_id`` entries
            are read (a missing key raises ``KeyError``).

    Returns:
        ``<prefix>_<id>_<dataset_id>``, where ``<prefix>`` is
        ``ragflow_settings.custom_dataset_prefix``.
    """
    prefix = ragflow_settings.custom_dataset_prefix
    dimension_id = dimension['id']
    dataset_id = dimension['dataset_id']
    return f"{prefix}_{dimension_id}_{dataset_id}"
|
|
|
|
|
|
|
|
def init_vector_db_tables(self):
|
|
def init_vector_db_tables(self):
|
|
|
"""
|
|
"""
|
|
@@ -115,7 +122,7 @@ class PromptService:
|
|
|
client = self._get_vector_client()
|
|
client = self._get_vector_client()
|
|
|
|
|
|
|
|
for dim in dimensions:
|
|
for dim in dimensions:
|
|
|
- index_name = self._get_table_name(dim['id'])
|
|
|
|
|
|
|
+ index_name = self._get_table_name(dim)
|
|
|
if not client.index_exists(index_name):
|
|
if not client.index_exists(index_name):
|
|
|
self._create_es_index(index_name)
|
|
self._create_es_index(index_name)
|
|
|
logger.info(f"✅ 创建 ES 索引: {index_name} (维度: {dim['name']})")
|
|
logger.info(f"✅ 创建 ES 索引: {index_name} (维度: {dim['name']})")
|
|
@@ -135,27 +142,29 @@ class PromptService:
|
|
|
def _create_es_index(self, index_name: str):
    """Create an Elasticsearch index with the module-level mappings and settings.

    Args:
        index_name: Name of the index to create.
    """
    es_client = self._get_vector_client()
    create_kwargs = {
        "index_name": index_name,
        "mappings": ES_INDEX_MAPPINGS,
        "settings": ES_INDEX_SETTINGS,
    }
    response = es_client.create_index(**create_kwargs)
    # The client's response is only logged; it is not inspected here.
    logger.info(f"✅ 创建 ES 索引: {index_name} (响应: {response})")
|
|
|
|
|
|
|
|
def _create_vector_db_table(self, dimension: Dict):
    """Create the vector-database table/index that backs a dimension.

    The backend is selected from ``vector_db_settings.vector_db_type``:
    ``"infinity"`` creates an Infinity table and ``"es"`` creates an
    Elasticsearch index. Any other value is reported with a warning and
    nothing is created.

    Creation failures are logged with their traceback and are not
    re-raised, so the caller continues even when the table/index could
    not be created.

    Args:
        dimension: Dimension record. ``id`` and ``dataset_id`` determine the
            table/index name (via ``_get_table_name``); ``name`` is used in
            log messages.
    """
    db_type = vector_db_settings.vector_db_type
    # Name resolution stays outside the try block so a malformed dimension
    # record still surfaces as an error to the caller.
    table_name = self._get_table_name(dimension)
    try:
        if db_type == "infinity":
            self._create_infinity_table(table_name)
            logger.info(f"✅ 创建 Infinity 表: {table_name} (维度: {dimension['name']})")
        elif db_type == "es":
            self._create_es_index(table_name)
            logger.info(f"✅ 创建 ES 索引: {table_name} (维度: {dimension['name']})")
        else:
            # Previously an unknown backend fell through without any trace.
            logger.warning(f"不支持的向量数据库类型: {db_type},未创建表/索引: {table_name}")
    except Exception as e:
        # Best-effort by design: record the failure (with traceback) and go on.
        logger.exception(f"创建向量数据库表/索引失败: {str(e)}")
|
|
|
|
|
|
|
@@ -165,7 +174,7 @@ class PromptService:
|
|
|
"""
|
|
"""
|
|
|
添加维度
|
|
添加维度
|
|
|
|
|
|
|
|
- 创建维度后会自动创建对应的向量数据库表/索引 (book_{dimension_id})。
|
|
|
|
|
|
|
+ 创建维度后会自动创建对应的向量数据库表/索引 (book_{dataset_id})。
|
|
|
|
|
|
|
|
Args:
|
|
Args:
|
|
|
name: 维度名称
|
|
name: 维度名称
|
|
@@ -174,17 +183,36 @@ class PromptService:
|
|
|
Returns:
|
|
Returns:
|
|
|
新建的维度信息
|
|
新建的维度信息
|
|
|
"""
|
|
"""
|
|
|
|
|
+ # 1. 先创建 RAGFlow 数据集
|
|
|
|
|
+ from src.utils.ragflow.ragflow_service import RAGFlowService
|
|
|
|
|
+ from src.conf.rag_parser_config import RagParserDefaults
|
|
|
|
|
+
|
|
|
|
|
+ ragflow_service = RAGFlowService(api_key="ragflow-XelVBvv8Uc6dZLNb1aBIKdbsupucEjESotOPTZZBrG4")
|
|
|
|
|
+ logger.info(f"开始创建 RAGFlow 数据集: {name}")
|
|
|
|
|
+
|
|
|
|
|
+ dataset = ragflow_service.create_dataset(
|
|
|
|
|
+ name=name,
|
|
|
|
|
+ description=description or f"维度: {name}",
|
|
|
|
|
+ permission=RagParserDefaults.DATASET_PERMISSION,
|
|
|
|
|
+ chunk_method=RagParserDefaults.DATASET_CHUNK_METHOD,
|
|
|
|
|
+ parser_config=RagParserDefaults.DATASET_CONFIG_DICT
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ dataset_id = dataset['id']
|
|
|
|
|
+ logger.info(f"RAGFlow 数据集创建成功,ID: {dataset_id}")
|
|
|
|
|
+
|
|
|
|
|
+ # 2. 将维度信息存入 MySQL,同时保存 dataset_id
|
|
|
sql = """
|
|
sql = """
|
|
|
- INSERT INTO prompt_dimensions (name, description)
|
|
|
|
|
- VALUES (%s, %s)
|
|
|
|
|
|
|
+ INSERT INTO prompt_dimensions (name, description, dataset_id)
|
|
|
|
|
+ VALUES (%s, %s, %s)
|
|
|
"""
|
|
"""
|
|
|
- self._db.execute(sql, [name, description])
|
|
|
|
|
|
|
+ self._db.execute(sql, [name, description, dataset_id])
|
|
|
|
|
|
|
|
- # 获取新建的维度
|
|
|
|
|
|
|
+ # 3. 获取新建的维度
|
|
|
dimension = self.get_dimension_by_name(name)
|
|
dimension = self.get_dimension_by_name(name)
|
|
|
-
|
|
|
|
|
- # 创建对应的向量数据库表/索引
|
|
|
|
|
- self._create_vector_db_table(dimension['id'], name)
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 2. 创建对应的向量数据库表/索引(使用 dataset_id 作为维度ID)
|
|
|
|
|
+ self._create_vector_db_table(dimension)
|
|
|
|
|
|
|
|
return dimension
|
|
return dimension
|
|
|
|
|
|
|
@@ -203,6 +231,12 @@ class PromptService:
|
|
|
sql = "SELECT * FROM prompt_dimensions ORDER BY created_at DESC"
|
|
sql = "SELECT * FROM prompt_dimensions ORDER BY created_at DESC"
|
|
|
return self._db.fetch_all(sql)
|
|
return self._db.fetch_all(sql)
|
|
|
|
|
|
|
|
|
|
def get_dataset_id_by_dimension_id(self, dimension_id: int) -> Optional[str]:
    """Look up the RAGFlow dataset ID stored for a dimension.

    Args:
        dimension_id: Value matched against ``prompt_dimensions.id``.

    Returns:
        The ``dataset_id`` of the matching row, or ``None`` when the query
        returns no row.
    """
    row = self._db.fetch_one(
        "SELECT dataset_id FROM prompt_dimensions WHERE id = %s",
        [dimension_id],
    )
    if not row:
        return None
    return row['dataset_id']
|
|
|
|
|
+
|
|
|
def update_dimension(self, dimension_id: int, name: str = None, description: str = None) -> int:
|
|
def update_dimension(self, dimension_id: int, name: str = None, description: str = None) -> int:
|
|
|
"""更新维度信息"""
|
|
"""更新维度信息"""
|
|
|
updates = []
|
|
updates = []
|
|
@@ -323,6 +357,25 @@ class PromptService:
|
|
|
result = self._db.fetch_one(sql, [dimension_name])
|
|
result = self._db.fetch_one(sql, [dimension_name])
|
|
|
return result['content'] if result else None
|
|
return result['content'] if result else None
|
|
|
|
|
|
|
|
|
|
def get_active_dimension_by_id(self, dimension_id: int) -> Optional[Dict[str, Any]]:
    """Fetch the active prompt content and dataset ID for a dimension.

    Args:
        dimension_id: Dimension ID matched against
            ``prompt_versions.dimension_id``.

    Returns:
        The row produced by the query (columns ``content`` and
        ``dataset_id``), or ``None`` when the lookup yields nothing.
    """
    query = """
        SELECT pv.content, pd.dataset_id
        FROM prompt_versions pv
        JOIN prompt_dimensions pd ON pv.dimension_id = pd.id
        WHERE pv.dimension_id = %s AND pv.is_active = 1
    """
    row = self._db.fetch_one(query, [dimension_id])
    # Any falsy result (no matching row) is normalised to None.
    return row or None
|
|
|
|
|
+
|
|
|
def get_active_prompt_by_id(self, dimension_id: int) -> Optional[str]:
|
|
def get_active_prompt_by_id(self, dimension_id: int) -> Optional[str]:
|
|
|
"""
|
|
"""
|
|
|
根据维度ID获取当前激活的提示词内容
|
|
根据维度ID获取当前激活的提示词内容
|
|
@@ -339,7 +392,7 @@ class PromptService:
|
|
|
"""
|
|
"""
|
|
|
result = self._db.fetch_one(sql, [dimension_id])
|
|
result = self._db.fetch_one(sql, [dimension_id])
|
|
|
return result['content'] if result else None
|
|
return result['content'] if result else None
|
|
|
-
|
|
|
|
|
|
|
+
|
|
|
def set_active_version(self, dimension_id: int, version_number: int) -> int:
|
|
def set_active_version(self, dimension_id: int, version_number: int) -> int:
|
|
|
"""设置激活版本"""
|
|
"""设置激活版本"""
|
|
|
# 先取消当前激活版本
|
|
# 先取消当前激活版本
|
|
@@ -372,4 +425,4 @@ def get_prompt_service() -> PromptService:
|
|
|
global _prompt_service
|
|
global _prompt_service
|
|
|
if _prompt_service is None:
|
|
if _prompt_service is None:
|
|
|
_prompt_service = PromptService()
|
|
_prompt_service = PromptService()
|
|
|
- return _prompt_service
|
|
|
|
|
|
|
+ return _prompt_service
|