|
|
@@ -5,14 +5,15 @@ from concurrent.futures import ThreadPoolExecutor
|
|
|
from langgraph.graph import StateGraph, START, END
|
|
|
from typing import List, Dict, Any
|
|
|
from pydantic import BaseModel, Field, ConfigDict
|
|
|
-from parser.pdf_parser.pdf_splitter import PDFSplitter
|
|
|
-from model.qwen_vl import QWenVLParser
|
|
|
-from utils.ragflow.ragflow_service import RAGFlowService
|
|
|
-from model.multimodal_embedding import Embedding
|
|
|
-from conf.settings import model_settings, vector_db_settings
|
|
|
-from utils.infinity import get_client
|
|
|
+from src.parser.pdf_parser.pdf_splitter import PDFSplitter
|
|
|
+from src.model.qwen_vl import QWenVLParser
|
|
|
+from src.utils.ragflow.ragflow_service import RAGFlowService
|
|
|
+from src.utils.ragflow.chunk_record import get_chunk_record_service
|
|
|
+from src.model.multimodal_embedding import Embedding
|
|
|
+from src.conf.settings import model_settings, vector_db_settings, minio_settings
|
|
|
+from src.utils.infinity import get_client
|
|
|
from langfuse.langchain import CallbackHandler
|
|
|
-from conf.rag_parser_config import RagParserDefaults
|
|
|
+from src.conf.rag_parser_config import RagParserDefaults
|
|
|
|
|
|
# 定义工作流状态类
|
|
|
class PDFParsingState(BaseModel):
|
|
|
@@ -258,9 +259,12 @@ class PDFParsingWorkflow:
|
|
|
|
|
|
def _parse_single_page(self, page: Dict[str, Any], model_name: str) -> Dict[str, Any]:
|
|
|
"""解析单个页面(用于并行处理)"""
|
|
|
- prompt = """
|
|
|
+ page_number = page["page_number"]
|
|
|
+ image = page["image"]
|
|
|
+ prompt = f"""
|
|
|
角色定位:你是一位顶尖的儿童绘本分析师与视觉工程专家,擅长将插画视觉信息转化为高精度的结构化元数据。
|
|
|
任务描述:请深度解析提供的绘本页面,不仅提取基本要素,还要进行“像素级”的特征拆解。重点关注角色的微表情、服饰纹理、环境光效、构图视角及整体艺术风格。
|
|
|
+ 当前提取页码为:{page_number}
|
|
|
提取维度:
|
|
|
艺术风格 (Style):包括笔触(如水彩、蜡笔)、线条粗细、整体色调偏好。
|
|
|
角色特征 (Character):五官细节、肢体动作的动态感、衣物材质、标志性配饰。
|
|
|
@@ -277,44 +281,41 @@ class PDFParsingWorkflow:
|
|
|
描述精度:单条描述需包含具体视觉属性(颜色、形状、质感),字数控制在50字以内。
|
|
|
格式要求:严谨按照指定的JSON结构输出。
|
|
|
json格式:
|
|
|
- {
|
|
|
- "page_meta": {
|
|
|
- "page_number": 1,
|
|
|
+ {{
|
|
|
+ "page_meta": {{
|
|
|
+ "page_number": {page_number},
|
|
|
"content_text": "页面原文本内容",
|
|
|
- "overall_style": {
|
|
|
+ "overall_style": {{
|
|
|
"art_medium": "艺术媒介(如:手绘水彩、矢量平涂、3D渲染)",
|
|
|
"color_palette": ["主色调1", "主色调2"],
|
|
|
"lighting": "光影描述(如:柔和侧光、清晨自然光)",
|
|
|
"composition": "构图(如:三分法、对角线构图、大远景)"
|
|
|
- }
|
|
|
- },
|
|
|
+ }}
|
|
|
+ }},
|
|
|
"elements": [
|
|
|
- {
|
|
|
+ {{
|
|
|
"element_name": "元素名称(如:小兔子)",
|
|
|
"character_name": "角色名称(如果有,没有的话,角色名称为空字符串)",
|
|
|
"category": "分类(角色/场景/道具)",
|
|
|
"spatial_layer": "所在层级(前景/中景/背景)",
|
|
|
- "visual_attributes": {
|
|
|
+ "visual_attributes": {{
|
|
|
"appearance": "外貌细节描述(发型、五官、材质感)",
|
|
|
"action_emotion": "行为动作与情感流露",
|
|
|
"color_detail": "像素级颜色描述(如:淡茱萸粉、薄荷绿)",
|
|
|
"ability_tag": "如果为角色,其表现出的正面能力/特质"
|
|
|
- },
|
|
|
- "content_tags": {
|
|
|
+ }},
|
|
|
+ "content_tags": {{
|
|
|
"theme": ["自然", "社交", "生活常识"],
|
|
|
"object": ["动物", "服装", "植物"],
|
|
|
"emotion": ["快乐", "勇敢"]
|
|
|
- },
|
|
|
+ }},
|
|
|
"ability_tags": ["语言表达", "逻辑思维", "自我认知"],
|
|
|
"description": "综合性简洁描述(50字内)"
|
|
|
- }
|
|
|
+ }}
|
|
|
]
|
|
|
- }
|
|
|
+ }}
|
|
|
"""
|
|
|
|
|
|
- page_number = page["page_number"]
|
|
|
- image = page["image"]
|
|
|
-
|
|
|
print(f"开始解析第 {page_number} 页")
|
|
|
|
|
|
# 使用QWEN VL模型解析图像
|
|
|
@@ -380,6 +381,8 @@ class PDFParsingWorkflow:
|
|
|
page_number = parsed_result.get("page_number")
|
|
|
text = parsed_result.get("content", "")
|
|
|
image_path = state.split_pages[i].get("image_path")
|
|
|
+ # 截取url中的图片名
|
|
|
+ img_id = f"{vector_db_settings.infinity_page_dataset_id}-{os.path.basename(image_path).split(".")[0]}"
|
|
|
|
|
|
# 上传单页到RagFlow Chunk
|
|
|
chunk = self.ragflow_service.create_chunk(dataset_id=state.page_dataset_id,
|
|
|
@@ -387,10 +390,15 @@ class PDFParsingWorkflow:
|
|
|
content=text)
|
|
|
chunk_id = chunk["chunk"]["id"]
|
|
|
print(f"上传第 {page_number} 页,Chunk ID: {chunk_id}")
|
|
|
- # # 睡眠50ms,避免上传过快
|
|
|
- # time.sleep(0.05)
|
|
|
- # result = get_client().update(database_name=state.dataset_name, table_name="", cond=f"id = '{chunk_id}'", data={"img_id": img_id})
|
|
|
- # print(f"更新第 {page_number} 页,Chunk ID: {chunk_id},结果: {result}")
|
|
|
+
|
|
|
+ # 记录到定时任务表
|
|
|
+ get_chunk_record_service().record_chunk_add(
|
|
|
+ database_name=vector_db_settings.infinity_ragflow_database,
|
|
|
+ table_name=vector_db_settings.infinity_page_table_name,
|
|
|
+ chunk_id=chunk_id,
|
|
|
+ cond=f"id = '{chunk_id}'",
|
|
|
+ data={"img_id": img_id}
|
|
|
+ )
|
|
|
|
|
|
def _vectorize_store_node(self, state: PDFParsingState) -> Dict[str, Any]:
|
|
|
"""向量化入库节点"""
|