|
@@ -315,9 +315,32 @@ class ResultAggregationNode(BaseNode):
|
|
|
|
|
|
|
|
# 检查该窗口是否包含该分块内容
|
|
# 检查该窗口是否包含该分块内容
|
|
|
content_found = False
|
|
content_found = False
|
|
|
- # 直接进行文本匹配
|
|
|
|
|
- if chunk_keywords in parsed_content:
|
|
|
|
|
- content_found = True
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 处理parsed_content可能是字典或字符串的情况
|
|
|
|
|
+ if isinstance(parsed_content, str):
|
|
|
|
|
+ # Markdown字符串:直接进行文本匹配
|
|
|
|
|
+ if chunk_keywords in parsed_content:
|
|
|
|
|
+ content_found = True
|
|
|
|
|
+ elif isinstance(parsed_content, dict):
|
|
|
|
|
+ # JSON字典:遍历所有值
|
|
|
|
|
+ for key, value in parsed_content.items():
|
|
|
|
|
+ if isinstance(value, str):
|
|
|
|
|
+ if chunk_keywords in value:
|
|
|
|
|
+ content_found = True
|
|
|
|
|
+ break
|
|
|
|
|
+ elif isinstance(value, list):
|
|
|
|
|
+ for item in value:
|
|
|
|
|
+ if isinstance(item, dict):
|
|
|
|
|
+ item_content = item.get("内容", "")
|
|
|
|
|
+ if chunk_keywords in str(item_content):
|
|
|
|
|
+ content_found = True
|
|
|
|
|
+ break
|
|
|
|
|
+ elif isinstance(item, str):
|
|
|
|
|
+ if chunk_keywords in item:
|
|
|
|
|
+ content_found = True
|
|
|
|
|
+ break
|
|
|
|
|
+ if content_found:
|
|
|
|
|
+ break
|
|
|
|
|
|
|
|
# 如果找到,添加该窗口的页码范围
|
|
# 如果找到,添加该窗口的页码范围
|
|
|
if content_found:
|
|
if content_found:
|
|
@@ -354,64 +377,149 @@ class ResultAggregationNode(BaseNode):
|
|
|
parsed_results = []
|
|
parsed_results = []
|
|
|
|
|
|
|
|
if not aggregated_result:
|
|
if not aggregated_result:
|
|
|
|
|
+ logger.warning("汇总结果为空,返回空列表")
|
|
|
return parsed_results
|
|
return parsed_results
|
|
|
|
|
|
|
|
- # 创建页码到图片的映射
|
|
|
|
|
- page_map = {page.get('page_number'): page.get('image') for page in split_pages}
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ # 创建页码到图片的映射
|
|
|
|
|
+ page_map = {page.get('page_number'): page.get('image') for page in split_pages}
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f"创建页码映射失败: {str(e)}")
|
|
|
|
|
+ page_map = {}
|
|
|
|
|
|
|
|
chunk_index = 0
|
|
chunk_index = 0
|
|
|
-
|
|
|
|
|
- # Markdown格式:按 --- 分割分块
|
|
|
|
|
- logger.info("检测到Markdown格式的汇总结果,按 --- 分割分块")
|
|
|
|
|
-
|
|
|
|
|
- # 按 --- 分割
|
|
|
|
|
- chunks = aggregated_result.split('---')
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 判断aggregated_result的类型
|
|
|
|
|
+ if isinstance(aggregated_result, str):
|
|
|
|
|
+ # Markdown格式:按 --- 分割分块
|
|
|
|
|
+ logger.info("检测到Markdown格式的汇总结果,按 --- 分割分块")
|
|
|
|
|
|
|
|
- for chunk_text in chunks:
|
|
|
|
|
- chunk_text = chunk_text.strip()
|
|
|
|
|
- if not chunk_text:
|
|
|
|
|
- continue
|
|
|
|
|
|
|
+ try:
|
|
|
|
|
+ # 按 --- 分割
|
|
|
|
|
+ chunks = aggregated_result.split('---')
|
|
|
|
|
|
|
|
- # 从文本中提取页码范围标记 [页码范围]:(2-5)
|
|
|
|
|
- page_numbers = self._extract_page_range_from_text(chunk_text)
|
|
|
|
|
-
|
|
|
|
|
- # 如果没有找到页码范围标记,尝试从窗口结果中匹配
|
|
|
|
|
- if not page_numbers:
|
|
|
|
|
- logger.debug("未找到页码范围标记,尝试从窗口结果中匹配")
|
|
|
|
|
- page_numbers = self._extract_page_numbers_from_windows(
|
|
|
|
|
- windowed_results,
|
|
|
|
|
- chunk_text
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- # 使用第一个页码作为主页码
|
|
|
|
|
- page_number = page_numbers[0] if page_numbers else 0
|
|
|
|
|
-
|
|
|
|
|
- # 拼接对应页码的图片
|
|
|
|
|
- stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
|
|
|
|
|
-
|
|
|
|
|
- # 上传到MinIO
|
|
|
|
|
- image_url = None
|
|
|
|
|
- if stitched_image:
|
|
|
|
|
- image_url = self._upload_image_to_minio(
|
|
|
|
|
- stitched_image,
|
|
|
|
|
- self.dimension_id or 0,
|
|
|
|
|
- chunk_index
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- # 统一格式:直接存储chunk文本
|
|
|
|
|
- parsed_results.append({
|
|
|
|
|
- "page_number": page_number,
|
|
|
|
|
- "chunk_id": "",
|
|
|
|
|
- "content": chunk_text, # Markdown格式的文本
|
|
|
|
|
- "model": self.model_name,
|
|
|
|
|
- "image_path": image_url or "",
|
|
|
|
|
- # 额外信息
|
|
|
|
|
- "_page_numbers": page_numbers,
|
|
|
|
|
- "_image": stitched_image,
|
|
|
|
|
- "_image_url": image_url
|
|
|
|
|
- })
|
|
|
|
|
|
|
+ for chunk_text in chunks:
|
|
|
|
|
+ try:
|
|
|
|
|
+ chunk_text = chunk_text.strip()
|
|
|
|
|
+ if not chunk_text:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ # 从文本中提取页码范围标记 [页码范围]:(2-5)
|
|
|
|
|
+ page_numbers = self._extract_page_range_from_text(chunk_text)
|
|
|
|
|
+
|
|
|
|
|
+ # 如果没有找到页码范围标记,尝试从窗口结果中匹配
|
|
|
|
|
+ if not page_numbers:
|
|
|
|
|
+ logger.debug("未找到页码范围标记,尝试从窗口结果中匹配")
|
|
|
|
|
+ page_numbers = self._extract_page_numbers_from_windows(
|
|
|
|
|
+ windowed_results,
|
|
|
|
|
+ chunk_text
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # 使用第一个页码作为主页码
|
|
|
|
|
+ page_number = page_numbers[0] if page_numbers else 0
|
|
|
|
|
+
|
|
|
|
|
+ # 拼接对应页码的图片
|
|
|
|
|
+ stitched_image = None
|
|
|
|
|
+ image_url = None
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
|
|
|
|
|
+
|
|
|
|
|
+ # 上传到MinIO
|
|
|
|
|
+ if stitched_image:
|
|
|
|
|
+ image_url = self._upload_image_to_minio(
|
|
|
|
|
+ stitched_image,
|
|
|
|
|
+ self.dimension_id or 0,
|
|
|
|
|
+ chunk_index
|
|
|
|
|
+ )
|
|
|
|
|
+ except Exception as img_error:
|
|
|
|
|
+ logger.error(f"处理分块 {chunk_index} 的图片失败: {str(img_error)}")
|
|
|
|
|
+
|
|
|
|
|
+ # 统一格式:直接存储chunk文本
|
|
|
|
|
+ parsed_results.append({
|
|
|
|
|
+ "page_number": page_number,
|
|
|
|
|
+ "chunk_id": "",
|
|
|
|
|
+ "content": chunk_text, # Markdown格式的文本
|
|
|
|
|
+ "model": self.model_name,
|
|
|
|
|
+ "image_path": image_url or "",
|
|
|
|
|
+ # 额外信息
|
|
|
|
|
+ "_page_numbers": page_numbers,
|
|
|
|
|
+ "_image": stitched_image,
|
|
|
|
|
+ "_image_url": image_url
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ chunk_index += 1
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as chunk_error:
|
|
|
|
|
+ logger.error(f"处理Markdown分块 {chunk_index} 失败: {str(chunk_error)}")
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f"Markdown格式处理失败: {str(e)}")
|
|
|
|
|
|
|
|
- chunk_index += 1
|
|
|
|
|
|
|
+ elif isinstance(aggregated_result, dict):
|
|
|
|
|
+ # JSON格式:遍历维度字段(向后兼容)
|
|
|
|
|
+ logger.info("检测到JSON格式的汇总结果,按维度字段拆分")
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ for key, value in aggregated_result.items():
|
|
|
|
|
+ if key in ["页码", "page_number"] or not isinstance(value, list):
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ # 将每个维度的分块对象展开到parsed_results中
|
|
|
|
|
+ for chunk in value:
|
|
|
|
|
+ try:
|
|
|
|
|
+ if isinstance(chunk, dict):
|
|
|
|
|
+ chunk_content = chunk.get("内容", "")
|
|
|
|
|
+
|
|
|
|
|
+ # 提取该分块对应的页码范围
|
|
|
|
|
+ page_numbers = self._extract_page_numbers_from_windows(
|
|
|
|
|
+ windowed_results,
|
|
|
|
|
+ chunk_content
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # 使用第一个页码作为主页码
|
|
|
|
|
+ page_number = page_numbers[0] if page_numbers else 0
|
|
|
|
|
+
|
|
|
|
|
+ # 拼接对应页码的图片
|
|
|
|
|
+ stitched_image = None
|
|
|
|
|
+ image_url = None
|
|
|
|
|
+
|
|
|
|
|
+ try:
|
|
|
|
|
+ stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
|
|
|
|
|
+
|
|
|
|
|
+ # 上传到MinIO
|
|
|
|
|
+ if stitched_image:
|
|
|
|
|
+ image_url = self._upload_image_to_minio(
|
|
|
|
|
+ stitched_image,
|
|
|
|
|
+ self.dimension_id or 0,
|
|
|
|
|
+ chunk_index
|
|
|
|
|
+ )
|
|
|
|
|
+ except Exception as img_error:
|
|
|
|
|
+ logger.error(f"处理分块 {chunk_index} 的图片失败: {str(img_error)}")
|
|
|
|
|
+
|
|
|
|
|
+ # 统一格式:直接存储chunk的JSON对象
|
|
|
|
|
+ parsed_results.append({
|
|
|
|
|
+ "page_number": page_number,
|
|
|
|
|
+ "chunk_id": "",
|
|
|
|
|
+ "content": chunk, # JSON对象
|
|
|
|
|
+ "model": self.model_name,
|
|
|
|
|
+ "image_path": image_url or "",
|
|
|
|
|
+ # 额外信息
|
|
|
|
|
+ "_dimension": key,
|
|
|
|
|
+ "_page_numbers": page_numbers,
|
|
|
|
|
+ "_image": stitched_image,
|
|
|
|
|
+ "_image_url": image_url
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ chunk_index += 1
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as chunk_error:
|
|
|
|
|
+ logger.error(f"处理JSON分块 {chunk_index} 失败: {str(chunk_error)}")
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.error(f"JSON格式处理失败: {str(e)}")
|
|
|
else:
|
|
else:
|
|
|
logger.warning(f"未知的汇总结果格式: {type(aggregated_result)}")
|
|
logger.warning(f"未知的汇总结果格式: {type(aggregated_result)}")
|
|
|
|
|
|
|
@@ -457,9 +565,12 @@ class ResultAggregationNode(BaseNode):
|
|
|
|
|
|
|
|
logger.debug(f"Chat模型返回内容: {aggregated_content[:500]}...")
|
|
logger.debug(f"Chat模型返回内容: {aggregated_content[:500]}...")
|
|
|
|
|
|
|
|
|
|
+ # 直接使用Chat模型返回的内容(Markdown字符串)
|
|
|
|
|
+ aggregated_result = aggregated_content
|
|
|
|
|
+
|
|
|
# 按"分块"拆分结果,并为每个分块拼接图片上传到MinIO
|
|
# 按"分块"拆分结果,并为每个分块拼接图片上传到MinIO
|
|
|
parsed_results = self._split_by_chunks(
|
|
parsed_results = self._split_by_chunks(
|
|
|
- aggregated_content,
|
|
|
|
|
|
|
+ aggregated_result,
|
|
|
windowed_results,
|
|
windowed_results,
|
|
|
split_pages
|
|
split_pages
|
|
|
)
|
|
)
|
|
@@ -468,7 +579,7 @@ class ResultAggregationNode(BaseNode):
|
|
|
|
|
|
|
|
return {
|
|
return {
|
|
|
"parsed_results": parsed_results,
|
|
"parsed_results": parsed_results,
|
|
|
- "aggregated_result": aggregated_content # 保留完整的汇总结果供调试
|
|
|
|
|
|
|
+ "aggregated_result": aggregated_result # 保留完整的汇总结果供调试
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
@@ -528,45 +639,93 @@ class ResultAggregationNode(BaseNode):
|
|
|
parsed_content = result.get("parsed_content", {})
|
|
parsed_content = result.get("parsed_content", {})
|
|
|
center_page = result.get("center_page", 0)
|
|
center_page = result.get("center_page", 0)
|
|
|
page_numbers = result.get("page_numbers", [center_page])
|
|
page_numbers = result.get("page_numbers", [center_page])
|
|
|
-
|
|
|
|
|
- # 遍历每个维度字段
|
|
|
|
|
- for key, value in parsed_content.items():
|
|
|
|
|
- if key in ["页码", "page_number"] or not isinstance(value, list):
|
|
|
|
|
- continue
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # 判断parsed_content的类型
|
|
|
|
|
+ if isinstance(parsed_content, str):
|
|
|
|
|
+ # Markdown格式:按 --- 分割分块
|
|
|
|
|
+ chunks = parsed_content.split('---')
|
|
|
|
|
|
|
|
- # 展开分块
|
|
|
|
|
- for chunk in value:
|
|
|
|
|
- if isinstance(chunk, dict):
|
|
|
|
|
- # 使用第一个页码作为主页码
|
|
|
|
|
- page_number = page_numbers[0] if page_numbers else center_page
|
|
|
|
|
-
|
|
|
|
|
- # 拼接图片
|
|
|
|
|
- stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
|
|
|
|
|
-
|
|
|
|
|
- # 上传到MinIO
|
|
|
|
|
- image_url = None
|
|
|
|
|
- if stitched_image:
|
|
|
|
|
- image_url = self._upload_image_to_minio(
|
|
|
|
|
- stitched_image,
|
|
|
|
|
- self.dimension_id or 0,
|
|
|
|
|
- chunk_index
|
|
|
|
|
- )
|
|
|
|
|
-
|
|
|
|
|
- # 统一格式:直接存储chunk的JSON对象
|
|
|
|
|
- parsed_results.append({
|
|
|
|
|
- "page_number": page_number,
|
|
|
|
|
- "chunk_id": "",
|
|
|
|
|
- "content": chunk, # 直接存储chunk的JSON对象
|
|
|
|
|
- "model": self.model_name,
|
|
|
|
|
- "image_path": image_url or "", # 拼接后图片的MinIO URL
|
|
|
|
|
- # 额外信息
|
|
|
|
|
- "_dimension": key,
|
|
|
|
|
- "_page_numbers": page_numbers,
|
|
|
|
|
- "_image": stitched_image,
|
|
|
|
|
- "_image_url": image_url
|
|
|
|
|
- })
|
|
|
|
|
-
|
|
|
|
|
- chunk_index += 1
|
|
|
|
|
|
|
+ for chunk_text in chunks:
|
|
|
|
|
+ chunk_text = chunk_text.strip()
|
|
|
|
|
+ if not chunk_text:
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ # 从文本中提取页码范围标记
|
|
|
|
|
+ chunk_page_numbers = self._extract_page_range_from_text(chunk_text)
|
|
|
|
|
+
|
|
|
|
|
+ # 如果没有找到页码范围标记,使用窗口的页码范围
|
|
|
|
|
+ if not chunk_page_numbers:
|
|
|
|
|
+ chunk_page_numbers = page_numbers
|
|
|
|
|
+
|
|
|
|
|
+ # 使用第一个页码作为主页码
|
|
|
|
|
+ page_number = chunk_page_numbers[0] if chunk_page_numbers else center_page
|
|
|
|
|
+
|
|
|
|
|
+ # 拼接图片
|
|
|
|
|
+ stitched_image = self._stitch_images_for_pages(chunk_page_numbers, page_map)
|
|
|
|
|
+
|
|
|
|
|
+ # 上传到MinIO
|
|
|
|
|
+ image_url = None
|
|
|
|
|
+ if stitched_image:
|
|
|
|
|
+ image_url = self._upload_image_to_minio(
|
|
|
|
|
+ stitched_image,
|
|
|
|
|
+ self.dimension_id or 0,
|
|
|
|
|
+ chunk_index
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # 统一格式:存储Markdown文本
|
|
|
|
|
+ parsed_results.append({
|
|
|
|
|
+ "page_number": page_number,
|
|
|
|
|
+ "chunk_id": "",
|
|
|
|
|
+ "content": chunk_text, # Markdown文本
|
|
|
|
|
+ "model": self.model_name,
|
|
|
|
|
+ "image_path": image_url or "",
|
|
|
|
|
+ # 额外信息
|
|
|
|
|
+ "_page_numbers": chunk_page_numbers,
|
|
|
|
|
+ "_image": stitched_image,
|
|
|
|
|
+ "_image_url": image_url
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ chunk_index += 1
|
|
|
|
|
+
|
|
|
|
|
+ elif isinstance(parsed_content, dict):
|
|
|
|
|
+ # JSON格式:遍历每个维度字段
|
|
|
|
|
+ for key, value in parsed_content.items():
|
|
|
|
|
+ if key in ["页码", "page_number"] or not isinstance(value, list):
|
|
|
|
|
+ continue
|
|
|
|
|
+
|
|
|
|
|
+ # 展开分块
|
|
|
|
|
+ for chunk in value:
|
|
|
|
|
+ if isinstance(chunk, dict):
|
|
|
|
|
+ # 使用第一个页码作为主页码
|
|
|
|
|
+ page_number = page_numbers[0] if page_numbers else center_page
|
|
|
|
|
+
|
|
|
|
|
+ # 拼接图片
|
|
|
|
|
+ stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
|
|
|
|
|
+
|
|
|
|
|
+ # 上传到MinIO
|
|
|
|
|
+ image_url = None
|
|
|
|
|
+ if stitched_image:
|
|
|
|
|
+ image_url = self._upload_image_to_minio(
|
|
|
|
|
+ stitched_image,
|
|
|
|
|
+ self.dimension_id or 0,
|
|
|
|
|
+ chunk_index
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # 统一格式:直接存储chunk的JSON对象
|
|
|
|
|
+ parsed_results.append({
|
|
|
|
|
+ "page_number": page_number,
|
|
|
|
|
+ "chunk_id": "",
|
|
|
|
|
+ "content": chunk, # 直接存储chunk的JSON对象
|
|
|
|
|
+ "model": self.model_name,
|
|
|
|
|
+ "image_path": image_url or "", # 拼接后图片的MinIO URL
|
|
|
|
|
+ # 额外信息
|
|
|
|
|
+ "_dimension": key,
|
|
|
|
|
+ "_page_numbers": page_numbers,
|
|
|
|
|
+ "_image": stitched_image,
|
|
|
|
|
+ "_image_url": image_url
|
|
|
|
|
+ })
|
|
|
|
|
+
|
|
|
|
|
+ chunk_index += 1
|
|
|
|
|
|
|
|
logger.info(f"备用合并完成,共 {len(parsed_results)} 个分块")
|
|
logger.info(f"备用合并完成,共 {len(parsed_results)} 个分块")
|
|
|
|
|
|