|
|
@@ -277,50 +277,60 @@ class RAGFlowChunkNode(BaseNode):
|
|
|
logger.info(f"开始创建Chunks,共 {len(parsed_results)} 页")
|
|
|
|
|
|
for i, parsed_result in enumerate(parsed_results):
|
|
|
- page_number = parsed_result.get("page_number", i + 1)
|
|
|
- content = parsed_result.get("content", "")
|
|
|
-
|
|
|
- # 处理content:如果是字典,转换为JSON字符串;如果是字符串,直接使用
|
|
|
- if isinstance(content, dict):
|
|
|
- text = json.dumps(content, ensure_ascii=False, indent=2)
|
|
|
- else:
|
|
|
- text = str(content)
|
|
|
-
|
|
|
- # 优先从parsed_result中获取image_path(滑动窗口模式)
|
|
|
- # 如果没有,则从split_pages中获取(分页模式)
|
|
|
- image_path = parsed_result.get("image_path", "")
|
|
|
- if not image_path and i < len(split_pages):
|
|
|
- image_path = split_pages[i].get("image_path", "")
|
|
|
-
|
|
|
- # 生成img_id
|
|
|
- img_id = ""
|
|
|
- if image_path:
|
|
|
- # 从URL或路径中提取文件名
|
|
|
- filename = os.path.basename(image_path)
|
|
|
- # 如果是MinIO URL,可能包含查询参数,需要清理
|
|
|
- if '?' in filename:
|
|
|
- filename = filename.split('?')[0]
|
|
|
+ try:
|
|
|
+ # 安全获取page_number
|
|
|
+ if not isinstance(parsed_result, dict):
|
|
|
+ logger.warning(f"解析结果 {i} 不是字典类型: {type(parsed_result)},跳过")
|
|
|
+ continue
|
|
|
+
|
|
|
+ page_number = parsed_result.get("page_number", i + 1)
|
|
|
+ content = parsed_result.get("content", "")
|
|
|
+
|
|
|
+ # 处理content:如果是字典,转换为JSON字符串;如果是字符串,直接使用
|
|
|
+ if isinstance(content, dict):
|
|
|
+ text = json.dumps(content, ensure_ascii=False, indent=2)
|
|
|
+ else:
|
|
|
+ text = str(content)
|
|
|
+
|
|
|
+ # 优先从parsed_result中获取image_path(滑动窗口模式)
|
|
|
+ # 如果没有,则从split_pages中获取(分页模式)
|
|
|
+ image_path = parsed_result.get("image_path", "")
|
|
|
+ if not image_path and i < len(split_pages):
|
|
|
+ if isinstance(split_pages[i], dict):
|
|
|
+ image_path = split_pages[i].get("image_path", "")
|
|
|
+
|
|
|
# 生成img_id
|
|
|
- img_id = f"bookpage-{filename.split('.')[0]}.png"
|
|
|
-
|
|
|
- chunk = self.ragflow_service.create_chunk(
|
|
|
- dataset_id=page_dataset_id,
|
|
|
- document_id=page_document_id,
|
|
|
- content=text
|
|
|
- )
|
|
|
- chunk_id = chunk["chunk"]["id"]
|
|
|
- parsed_result["chunk_id"] = chunk_id
|
|
|
- logger.debug(f"创建第 {page_number} 页Chunk,ID: {chunk_id}, img_id: {img_id}")
|
|
|
-
|
|
|
- # 记录到定时任务表
|
|
|
- if img_id:
|
|
|
- get_chunk_record_service().record_chunk_add(
|
|
|
- database_name=vector_db_settings.infinity_ragflow_database,
|
|
|
- table_name=ragflow_settings.ragflow_dataset_prefix + "_" + ragflow_id,
|
|
|
- chunk_id=chunk_id,
|
|
|
- cond=f"_id = '{chunk_id}'",
|
|
|
- data={"img_id": img_id}
|
|
|
+ img_id = ""
|
|
|
+ if image_path:
|
|
|
+ # 从URL或路径中提取文件名
|
|
|
+ filename = os.path.basename(image_path)
|
|
|
+ # 如果是MinIO URL,可能包含查询参数,需要清理
|
|
|
+ if '?' in filename:
|
|
|
+ filename = filename.split('?')[0]
|
|
|
+ # 生成img_id
|
|
|
+ img_id = f"bookpage-{filename.split('.')[0]}.png"
|
|
|
+
|
|
|
+ chunk = self.ragflow_service.create_chunk(
|
|
|
+ dataset_id=page_dataset_id,
|
|
|
+ document_id=page_document_id,
|
|
|
+ content=text
|
|
|
)
|
|
|
+ chunk_id = chunk["chunk"]["id"]
|
|
|
+ parsed_result["chunk_id"] = chunk_id
|
|
|
+ logger.debug(f"创建第 {page_number} 页Chunk,ID: {chunk_id}, img_id: {img_id}")
|
|
|
+
|
|
|
+ # 记录到定时任务表
|
|
|
+ if img_id:
|
|
|
+ get_chunk_record_service().record_chunk_add(
|
|
|
+ database_name=vector_db_settings.infinity_ragflow_database,
|
|
|
+ table_name=ragflow_settings.ragflow_dataset_prefix + "_" + ragflow_id,
|
|
|
+ chunk_id=chunk_id,
|
|
|
+ cond=f"_id = '{chunk_id}'",
|
|
|
+ data={"img_id": img_id}
|
|
|
+ )
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"创建第 {i + 1} 个Chunk时出错: {str(e)}")
|
|
|
+ continue
|
|
|
|
|
|
logger.info(f"Chunks创建完成")
|
|
|
return {
|