|
|
@@ -10,6 +10,8 @@
|
|
|
"""
|
|
|
|
|
|
from typing import Dict, Any, List
|
|
|
+import os
|
|
|
+import psutil
|
|
|
from langfuse.langchain import CallbackHandler
|
|
|
|
|
|
from src.datasets.parser.core.workflow_builder import WorkflowBuilder
|
|
|
@@ -45,36 +47,117 @@ class DynamicDimensionWorkflow:
|
|
|
... )
|
|
|
"""
|
|
|
|
|
|
- def __init__(self, model_name: str = "Qwen/Qwen3-VL-8B-Instruct", max_workers: int = 5):
|
|
|
+ def __init__(
|
|
|
+ self,
|
|
|
+ model_name: str = "Qwen/Qwen3-VL-8B-Instruct",
|
|
|
+ max_workers: int = 5,
|
|
|
+ max_concurrent_dimensions: int = 2, # 最大并发维度数
|
|
|
+ memory_threshold_percent: float = 80.0, # 内存使用阈值(百分比)
|
|
|
+ use_async: bool = False, # 是否使用异步解析节点
|
|
|
+ max_concurrent: int = 50 # 异步版本的最大并发数
|
|
|
+ ):
|
|
|
"""
|
|
|
初始化工作流
|
|
|
|
|
|
Args:
|
|
|
model_name: VL模型名称
|
|
|
- max_workers: 每个维度内部图片解析的并行线程数
|
|
|
+ max_workers: 每个维度内部图片解析的并行线程数(同步版本)
|
|
|
+ max_concurrent_dimensions: 最大并发维度数,默认2(避免资源耗尽)
|
|
|
+ memory_threshold_percent: 内存使用阈值百分比,超过则降级
|
|
|
+ use_async: 是否使用异步解析节点,默认False
|
|
|
+ max_concurrent: 异步版本的最大并发数,默认50
|
|
|
"""
|
|
|
self.model_name = model_name
|
|
|
self.max_workers = max_workers
|
|
|
+ self.max_concurrent_dimensions = max_concurrent_dimensions
|
|
|
+ self.memory_threshold_percent = memory_threshold_percent
|
|
|
+ self.use_async = use_async
|
|
|
+ self.max_concurrent = max_concurrent
|
|
|
self.langfuse_handler = CallbackHandler()
|
|
|
# 懒加载维度分解方法(初始化时加载一次)
|
|
|
self.decomposition_methods = self._get_decomposition_method()
|
|
|
+
|
|
|
+ logger.info(f"工作流初始化: 解析模式={'异步' if use_async else '同步'}, "
|
|
|
+ f"{'最大并发=' + str(max_concurrent) if use_async else '线程数=' + str(max_workers)}")
|
|
|
|
|
|
def _get_decomposition_method(self):
|
|
|
prompt_service = get_prompt_service()
|
|
|
decomposition_methods = prompt_service.get_decomposition_method()
|
|
|
return decomposition_methods
|
|
|
|
|
|
- def _build_workflow_for_dimensions(self, dimension_ids: List[int]):
|
|
|
+ def _get_memory_usage(self) -> float:
|
|
|
+ """获取当前内存使用百分比"""
|
|
|
+ try:
|
|
|
+ memory = psutil.virtual_memory()
|
|
|
+ return memory.percent
|
|
|
+ except Exception as e:
|
|
|
+ logger.warning(f"获取内存使用失败: {e}")
|
|
|
+ return 0.0
|
|
|
+
|
|
|
+ def _calculate_optimal_workers(self, dimension_count: int) -> int:
|
|
|
+ """
|
|
|
+ 根据维度数量和系统资源动态计算最优线程数
|
|
|
+
|
|
|
+ 保证工作流完整执行:
|
|
|
+ - 最小线程数为2,确保基本并行能力
|
|
|
+ - 只调整并行度,不影响任务完整性
|
|
|
+ - 所有页面都会被处理,只是速度可能变慢
|
|
|
+
|
|
|
+ Args:
|
|
|
+ dimension_count: 维度数量
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 优化后的线程数(最小为2)
|
|
|
+ """
|
|
|
+ # 获取CPU核心数
|
|
|
+ cpu_count = os.cpu_count() or 4
|
|
|
+
|
|
|
+ # 获取内存使用情况
|
|
|
+ memory_percent = self._get_memory_usage()
|
|
|
+
|
|
|
+ # 基础线程数:CPU核心数 * 2
|
|
|
+ base_workers = cpu_count * 2
|
|
|
+
|
|
|
+ # 根据维度数量调整(维度越多,每个维度的线程数越少)
|
|
|
+ if dimension_count <= 2:
|
|
|
+ adjusted_workers = base_workers
|
|
|
+ elif dimension_count <= 4:
|
|
|
+ adjusted_workers = max(base_workers // 2, 4) # 最小4个线程
|
|
|
+ else:
|
|
|
+ adjusted_workers = max(base_workers // 3, 3) # 最小3个线程
|
|
|
+
|
|
|
+ # 根据内存使用情况降级
|
|
|
+ if memory_percent > self.memory_threshold_percent:
|
|
|
+ logger.warning(f"内存使用率 {memory_percent:.1f}% 超过阈值 {self.memory_threshold_percent}%,降低线程数")
|
|
|
+ adjusted_workers = max(adjusted_workers // 2, 2) # 极端情况最小2个线程
|
|
|
+ elif memory_percent > self.memory_threshold_percent - 10:
|
|
|
+ logger.info(f"内存使用率 {memory_percent:.1f}% 接近阈值,适度降低线程数")
|
|
|
+ adjusted_workers = max(int(adjusted_workers * 0.7), 3) # 最小3个线程
|
|
|
+
|
|
|
+ # 限制最大值
|
|
|
+ max_allowed = min(self.max_workers, 10)
|
|
|
+ final_workers = min(adjusted_workers, max_allowed)
|
|
|
+
|
|
|
+ # 确保最小值为2,保证工作流能够完整执行
|
|
|
+ final_workers = max(final_workers, 2)
|
|
|
+
|
|
|
+ logger.info(f"动态计算线程数: CPU核心={cpu_count}, 维度数={dimension_count}, "
|
|
|
+ f"内存使用={memory_percent:.1f}%, 最终线程数={final_workers}")
|
|
|
+
|
|
|
+ return final_workers
|
|
|
+
|
|
|
+ def _build_workflow_for_dimensions(self, dimension_ids: List[int], optimal_workers: int):
|
|
|
"""
|
|
|
根据维度ID列表动态构建 LangGraph 工作流
|
|
|
|
|
|
Args:
|
|
|
dimension_ids: 维度ID列表,决定节点和执行顺序
|
|
|
+ optimal_workers: 优化后的线程数
|
|
|
|
|
|
Returns:
|
|
|
编译后的 LangGraph 工作流
|
|
|
"""
|
|
|
- logger.info(f"动态构建工作流,维度: {dimension_ids}")
|
|
|
+ logger.info(f"动态构建工作流,维度: {dimension_ids}, 线程数: {optimal_workers}")
|
|
|
|
|
|
# 创建固定节点
|
|
|
split_node = PDFSplitNode()
|
|
|
@@ -113,34 +196,36 @@ class DynamicDimensionWorkflow:
|
|
|
skill_node = DimensionBookSplitNode(
|
|
|
dimension_id=dim_id,
|
|
|
model_name=self.model_name,
|
|
|
- max_workers=self.max_workers
|
|
|
+ max_workers=optimal_workers # 使用优化后的线程数
|
|
|
)
|
|
|
elif decomposition_method == 1:
|
|
|
skill_node = DimensionPageSplitNode(
|
|
|
dimension_id=dim_id,
|
|
|
model_name=self.model_name,
|
|
|
- max_workers=self.max_workers
|
|
|
+ max_workers=optimal_workers # 使用优化后的线程数
|
|
|
)
|
|
|
elif decomposition_method == 2:
|
|
|
# 只有第一个滑动窗口维度需要执行拆分,后续维度复用结果
|
|
|
skill_node = DimensionSlidingWindowNode(
|
|
|
dimension_id=dim_id,
|
|
|
model_name=self.model_name,
|
|
|
- max_workers=self.max_workers,
|
|
|
+ max_workers=optimal_workers, # 使用优化后的线程数(同步版本)
|
|
|
+ max_concurrent=self.max_concurrent, # 异步版本的并发数
|
|
|
window_size=3, # 滑动窗口大小:当前页+前后各1页
|
|
|
- skip_stitching=not first_sliding_window # 非第一个维度跳过拆分
|
|
|
+ skip_stitching=not first_sliding_window, # 非第一个维度跳过拆分
|
|
|
+ use_async=self.use_async # 是否使用异步版本
|
|
|
)
|
|
|
if first_sliding_window:
|
|
|
first_sliding_window = False
|
|
|
- logger.info(f"维度 {dim_id} 将执行滑动窗口拆分")
|
|
|
+ logger.info(f"维度 {dim_id} 将执行滑动窗口拆分 ({'异步' if self.use_async else '同步'}模式)")
|
|
|
else:
|
|
|
- logger.info(f"维度 {dim_id} 将复用已有的滑动窗口拆分结果")
|
|
|
+ logger.info(f"维度 {dim_id} 将复用已有的滑动窗口拆分结果 ({'异步' if self.use_async else '同步'}模式)")
|
|
|
else:
|
|
|
logger.warning(f"未知的分解方法: {decomposition_method},使用默认分页模式")
|
|
|
skill_node = DimensionPageSplitNode(
|
|
|
dimension_id=dim_id,
|
|
|
model_name=self.model_name,
|
|
|
- max_workers=self.max_workers
|
|
|
+ max_workers=optimal_workers # 使用优化后的线程数
|
|
|
)
|
|
|
builder.add_node(skill_node)
|
|
|
builder.add_edge(prev_node, skill_node.name)
|
|
|
@@ -185,6 +270,16 @@ class DynamicDimensionWorkflow:
|
|
|
|
|
|
logger.info(f"开始运行动态多维度解析: {pdf_path}")
|
|
|
logger.info(f"维度执行顺序: {dimension_ids}")
|
|
|
+
|
|
|
+ # 检查内存使用情况(仅警告,不阻止执行)
|
|
|
+ memory_percent = self._get_memory_usage()
|
|
|
+ logger.info(f"当前内存使用: {memory_percent:.1f}%")
|
|
|
+
|
|
|
+ if memory_percent > self.memory_threshold_percent:
|
|
|
+ logger.warning(f"内存使用率 {memory_percent:.1f}% 超过阈值 {self.memory_threshold_percent}%,将使用最小线程数执行")
|
|
|
+ logger.warning("工作流将继续执行,但速度可能较慢,请监控系统资源")
|
|
|
+ elif memory_percent > self.memory_threshold_percent - 10:
|
|
|
+ logger.info(f"内存使用率 {memory_percent:.1f}% 接近阈值,将适度降低线程数")
|
|
|
|
|
|
# 查询维度知识库对应的user_id 和 api-key
|
|
|
ragflow_user = get_ragflow_user_service().get_ragflow_id_and_api_key(3)
|
|
|
@@ -197,10 +292,13 @@ class DynamicDimensionWorkflow:
|
|
|
logger.error("未找到维度知识库对应的user_id和api-key")
|
|
|
return {"success": False, "error": "ragflow_user_not_found"}
|
|
|
|
|
|
- # 1. 根据维度列表动态构建工作流
|
|
|
- workflow = self._build_workflow_for_dimensions(dimension_ids)
|
|
|
+ # 1. 动态计算最优线程数
|
|
|
+ optimal_workers = self._calculate_optimal_workers(len(dimension_ids))
|
|
|
+
|
|
|
+ # 2. 根据维度列表动态构建工作流
|
|
|
+ workflow = self._build_workflow_for_dimensions(dimension_ids, optimal_workers)
|
|
|
|
|
|
- # 2. 创建初始状态
|
|
|
+ # 3. 创建初始状态
|
|
|
initial_state = DynamicDimensionState(
|
|
|
pdf_path=pdf_path,
|
|
|
dimension_ids=dimension_ids,
|
|
|
@@ -214,19 +312,19 @@ class DynamicDimensionWorkflow:
|
|
|
total_vectorized_pages=0
|
|
|
)
|
|
|
|
|
|
- # 3. 执行工作流
|
|
|
+ # 4. 执行工作流
|
|
|
result = workflow.invoke(
|
|
|
initial_state,
|
|
|
config={"callbacks": [self.langfuse_handler]}
|
|
|
)
|
|
|
|
|
|
- # 4. 处理结果
|
|
|
+ # 5. 处理结果
|
|
|
if isinstance(result, dict):
|
|
|
final_result = result
|
|
|
else:
|
|
|
final_result = result.dict() if hasattr(result, 'dict') else dict(result)
|
|
|
|
|
|
- # 5. 添加统计信息
|
|
|
+ # 6. 添加统计信息
|
|
|
dim_results = final_result.get('dimension_results', {})
|
|
|
success_count = sum(1 for r in dim_results.values() if r.get("success"))
|
|
|
|