Explorar el Código

19维度图书解析流程调整-新增滑动窗口模式

yingge hace 3 meses
padre
commit
fa3c10e971
Se ha modificado 1 fichero con 253 adiciones y 94 borrados
  1. 253 94
      src/datasets/parser/nodes/result_aggregation_node.py

+ 253 - 94
src/datasets/parser/nodes/result_aggregation_node.py

@@ -315,9 +315,32 @@ class ResultAggregationNode(BaseNode):
             
             # 检查该窗口是否包含该分块内容
             content_found = False
-            # 直接进行文本匹配
-            if chunk_keywords in parsed_content:
-                content_found = True
+            
+            # 处理parsed_content可能是字典或字符串的情况
+            if isinstance(parsed_content, str):
+                # Markdown字符串:直接进行文本匹配
+                if chunk_keywords in parsed_content:
+                    content_found = True
+            elif isinstance(parsed_content, dict):
+                # JSON字典:遍历所有值
+                for key, value in parsed_content.items():
+                    if isinstance(value, str):
+                        if chunk_keywords in value:
+                            content_found = True
+                            break
+                    elif isinstance(value, list):
+                        for item in value:
+                            if isinstance(item, dict):
+                                item_content = item.get("内容", "")
+                                if chunk_keywords in str(item_content):
+                                    content_found = True
+                                    break
+                            elif isinstance(item, str):
+                                if chunk_keywords in item:
+                                    content_found = True
+                                    break
+                        if content_found:
+                            break
             
             # 如果找到,添加该窗口的页码范围
             if content_found:
@@ -354,64 +377,149 @@ class ResultAggregationNode(BaseNode):
         parsed_results = []
         
         if not aggregated_result:
+            logger.warning("汇总结果为空,返回空列表")
             return parsed_results
         
-        # 创建页码到图片的映射
-        page_map = {page.get('page_number'): page.get('image') for page in split_pages}
+        try:
+            # 创建页码到图片的映射
+            page_map = {page.get('page_number'): page.get('image') for page in split_pages}
+        except Exception as e:
+            logger.error(f"创建页码映射失败: {str(e)}")
+            page_map = {}
         
         chunk_index = 0
-    
-        # Markdown格式:按 --- 分割分块
-        logger.info("检测到Markdown格式的汇总结果,按 --- 分割分块")
-            
-        # 按 --- 分割
-        chunks = aggregated_result.split('---')
+        
+        # 判断aggregated_result的类型
+        if isinstance(aggregated_result, str):
+            # Markdown格式:按 --- 分割分块
+            logger.info("检测到Markdown格式的汇总结果,按 --- 分割分块")
             
-        for chunk_text in chunks:
-            chunk_text = chunk_text.strip()
-            if not chunk_text:
-                continue
+            try:
+                # 按 --- 分割
+                chunks = aggregated_result.split('---')
                 
-            # 从文本中提取页码范围标记 [页码范围]:(2-5)
-            page_numbers = self._extract_page_range_from_text(chunk_text)
-                
-            # 如果没有找到页码范围标记,尝试从窗口结果中匹配
-            if not page_numbers:
-                logger.debug("未找到页码范围标记,尝试从窗口结果中匹配")
-                page_numbers = self._extract_page_numbers_from_windows(
-                    windowed_results, 
-                    chunk_text
-                )
-                
-            # 使用第一个页码作为主页码
-            page_number = page_numbers[0] if page_numbers else 0
-                
-            # 拼接对应页码的图片
-            stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
-                
-            # 上传到MinIO
-            image_url = None
-            if stitched_image:
-                image_url = self._upload_image_to_minio(
-                    stitched_image,
-                    self.dimension_id or 0,
-                    chunk_index
-                )
-                
-            # 统一格式:直接存储chunk文本
-            parsed_results.append({
-                "page_number": page_number,
-                "chunk_id": "",
-                "content": chunk_text,  # Markdown格式的文本
-                "model": self.model_name,
-                "image_path": image_url or "",
-                # 额外信息
-                "_page_numbers": page_numbers,
-                "_image": stitched_image,
-                "_image_url": image_url
-            })
+                for chunk_text in chunks:
+                    try:
+                        chunk_text = chunk_text.strip()
+                        if not chunk_text:
+                            continue
+                        
+                        # 从文本中提取页码范围标记 [页码范围]:(2-5)
+                        page_numbers = self._extract_page_range_from_text(chunk_text)
+                        
+                        # 如果没有找到页码范围标记,尝试从窗口结果中匹配
+                        if not page_numbers:
+                            logger.debug("未找到页码范围标记,尝试从窗口结果中匹配")
+                            page_numbers = self._extract_page_numbers_from_windows(
+                                windowed_results, 
+                                chunk_text
+                            )
+                        
+                        # 使用第一个页码作为主页码
+                        page_number = page_numbers[0] if page_numbers else 0
+                        
+                        # 拼接对应页码的图片
+                        stitched_image = None
+                        image_url = None
+                        
+                        try:
+                            stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
+                            
+                            # 上传到MinIO
+                            if stitched_image:
+                                image_url = self._upload_image_to_minio(
+                                    stitched_image,
+                                    self.dimension_id or 0,
+                                    chunk_index
+                                )
+                        except Exception as img_error:
+                            logger.error(f"处理分块 {chunk_index} 的图片失败: {str(img_error)}")
+                        
+                        # 统一格式:直接存储chunk文本
+                        parsed_results.append({
+                            "page_number": page_number,
+                            "chunk_id": "",
+                            "content": chunk_text,  # Markdown格式的文本
+                            "model": self.model_name,
+                            "image_path": image_url or "",
+                            # 额外信息
+                            "_page_numbers": page_numbers,
+                            "_image": stitched_image,
+                            "_image_url": image_url
+                        })
+                        
+                        chunk_index += 1
+                        
+                    except Exception as chunk_error:
+                        logger.error(f"处理Markdown分块 {chunk_index} 失败: {str(chunk_error)}")
+                        continue
+                        
+            except Exception as e:
+                logger.error(f"Markdown格式处理失败: {str(e)}")
                 
-            chunk_index += 1
+        elif isinstance(aggregated_result, dict):
+            # JSON格式:遍历维度字段(向后兼容)
+            logger.info("检测到JSON格式的汇总结果,按维度字段拆分")
+            
+            try:
+                for key, value in aggregated_result.items():
+                    if key in ["页码", "page_number"] or not isinstance(value, list):
+                        continue
+                    
+                    # 将每个维度的分块对象展开到parsed_results中
+                    for chunk in value:
+                        try:
+                            if isinstance(chunk, dict):
+                                chunk_content = chunk.get("内容", "")
+                                
+                                # 提取该分块对应的页码范围
+                                page_numbers = self._extract_page_numbers_from_windows(
+                                    windowed_results, 
+                                    chunk_content
+                                )
+                                
+                                # 使用第一个页码作为主页码
+                                page_number = page_numbers[0] if page_numbers else 0
+                                
+                                # 拼接对应页码的图片
+                                stitched_image = None
+                                image_url = None
+                                
+                                try:
+                                    stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
+                                    
+                                    # 上传到MinIO
+                                    if stitched_image:
+                                        image_url = self._upload_image_to_minio(
+                                            stitched_image,
+                                            self.dimension_id or 0,
+                                            chunk_index
+                                        )
+                                except Exception as img_error:
+                                    logger.error(f"处理分块 {chunk_index} 的图片失败: {str(img_error)}")
+                                
+                                # 统一格式:直接存储chunk的JSON对象
+                                parsed_results.append({
+                                    "page_number": page_number,
+                                    "chunk_id": "",
+                                    "content": chunk,  # JSON对象
+                                    "model": self.model_name,
+                                    "image_path": image_url or "",
+                                    # 额外信息
+                                    "_dimension": key,
+                                    "_page_numbers": page_numbers,
+                                    "_image": stitched_image,
+                                    "_image_url": image_url
+                                })
+                                
+                                chunk_index += 1
+                                
+                        except Exception as chunk_error:
+                            logger.error(f"处理JSON分块 {chunk_index} 失败: {str(chunk_error)}")
+                            continue
+                            
+            except Exception as e:
+                logger.error(f"JSON格式处理失败: {str(e)}")
         else:
             logger.warning(f"未知的汇总结果格式: {type(aggregated_result)}")
         
@@ -457,9 +565,12 @@ class ResultAggregationNode(BaseNode):
             
             logger.debug(f"Chat模型返回内容: {aggregated_content[:500]}...")
             
+            # 直接使用Chat模型返回的内容(Markdown字符串)
+            aggregated_result = aggregated_content
+            
             # 按"分块"拆分结果,并为每个分块拼接图片上传到MinIO
             parsed_results = self._split_by_chunks(
-                aggregated_content,
+                aggregated_result, 
                 windowed_results,
                 split_pages
             )
@@ -468,7 +579,7 @@ class ResultAggregationNode(BaseNode):
             
             return {
                 "parsed_results": parsed_results,
-                "aggregated_result": aggregated_content  # 保留完整的汇总结果供调试
+                "aggregated_result": aggregated_result  # 保留完整的汇总结果供调试
             }
             
         except Exception as e:
@@ -528,45 +639,93 @@ class ResultAggregationNode(BaseNode):
             parsed_content = result.get("parsed_content", {})
             center_page = result.get("center_page", 0)
             page_numbers = result.get("page_numbers", [center_page])
-            
-            # 遍历每个维度字段
-            for key, value in parsed_content.items():
-                if key in ["页码", "page_number"] or not isinstance(value, list):
-                    continue
+
+            # 判断parsed_content的类型
+            if isinstance(parsed_content, str):
+                # Markdown格式:按 --- 分割分块
+                chunks = parsed_content.split('---')
                 
-                # 展开分块
-                for chunk in value:
-                    if isinstance(chunk, dict):
-                        # 使用第一个页码作为主页码
-                        page_number = page_numbers[0] if page_numbers else center_page
-                        
-                        # 拼接图片
-                        stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
-                        
-                        # 上传到MinIO
-                        image_url = None
-                        if stitched_image:
-                            image_url = self._upload_image_to_minio(
-                                stitched_image,
-                                self.dimension_id or 0,
-                                chunk_index
-                            )
-                        
-                        # 统一格式:直接存储chunk的JSON对象
-                        parsed_results.append({
-                            "page_number": page_number,
-                            "chunk_id": "",
-                            "content": chunk,  # 直接存储chunk的JSON对象
-                            "model": self.model_name,
-                            "image_path": image_url or "",  # 拼接后图片的MinIO URL
-                            # 额外信息
-                            "_dimension": key,
-                            "_page_numbers": page_numbers,
-                            "_image": stitched_image,
-                            "_image_url": image_url
-                        })
-                        
-                        chunk_index += 1
+                for chunk_text in chunks:
+                    chunk_text = chunk_text.strip()
+                    if not chunk_text:
+                        continue
+                    
+                    # 从文本中提取页码范围标记
+                    chunk_page_numbers = self._extract_page_range_from_text(chunk_text)
+                    
+                    # 如果没有找到页码范围标记,使用窗口的页码范围
+                    if not chunk_page_numbers:
+                        chunk_page_numbers = page_numbers
+                    
+                    # 使用第一个页码作为主页码
+                    page_number = chunk_page_numbers[0] if chunk_page_numbers else center_page
+                    
+                    # 拼接图片
+                    stitched_image = self._stitch_images_for_pages(chunk_page_numbers, page_map)
+                    
+                    # 上传到MinIO
+                    image_url = None
+                    if stitched_image:
+                        image_url = self._upload_image_to_minio(
+                            stitched_image,
+                            self.dimension_id or 0,
+                            chunk_index
+                        )
+                    
+                    # 统一格式:存储Markdown文本
+                    parsed_results.append({
+                        "page_number": page_number,
+                        "chunk_id": "",
+                        "content": chunk_text,  # Markdown文本
+                        "model": self.model_name,
+                        "image_path": image_url or "",
+                        # 额外信息
+                        "_page_numbers": chunk_page_numbers,
+                        "_image": stitched_image,
+                        "_image_url": image_url
+                    })
+                    
+                    chunk_index += 1
+                    
+            elif isinstance(parsed_content, dict):
+                # JSON格式:遍历每个维度字段
+                for key, value in parsed_content.items():
+                    if key in ["页码", "page_number"] or not isinstance(value, list):
+                        continue
+                    
+                    # 展开分块
+                    for chunk in value:
+                        if isinstance(chunk, dict):
+                            # 使用第一个页码作为主页码
+                            page_number = page_numbers[0] if page_numbers else center_page
+                            
+                            # 拼接图片
+                            stitched_image = self._stitch_images_for_pages(page_numbers, page_map)
+                            
+                            # 上传到MinIO
+                            image_url = None
+                            if stitched_image:
+                                image_url = self._upload_image_to_minio(
+                                    stitched_image,
+                                    self.dimension_id or 0,
+                                    chunk_index
+                                )
+                            
+                            # 统一格式:直接存储chunk的JSON对象
+                            parsed_results.append({
+                                "page_number": page_number,
+                                "chunk_id": "",
+                                "content": chunk,  # 直接存储chunk的JSON对象
+                                "model": self.model_name,
+                                "image_path": image_url or "",  # 拼接后图片的MinIO URL
+                                # 额外信息
+                                "_dimension": key,
+                                "_page_numbers": page_numbers,
+                                "_image": stitched_image,
+                                "_image_url": image_url
+                            })
+                            
+                            chunk_index += 1
         
         logger.info(f"备用合并完成,共 {len(parsed_results)} 个分块")