# alair/book_rag_server
"""
数据集管理服务

该文件提供数据集管理功能，支持：
- PDF文件解析
- 数据集创建和管理
- 调用PDF解析工作流
"""

import os
import tempfile
from typing import Dict, Any, Optional
from parser.pdf_parser.pdf_parser_workflow import PDFParsingWorkflow
from conf.settings import vector_db_settings


class DatasetManageService:
    """Dataset management service.

    Wraps a ``PDFParsingWorkflow`` instance and exposes ``parse_pdf``, which
    stages uploaded PDF bytes on disk and hands the resulting path to the
    workflow.
    """

    def __init__(self):
        """Create the service together with its PDF parsing workflow."""
        self.pdf_workflow = PDFParsingWorkflow()

    def parse_pdf(self, series_name: str, pdf_file: bytes, pdf_filename: str) -> Dict[str, Any]:
        """Parse a PDF supplied as raw bytes.

        The bytes are written to a temporary file that keeps the original
        file name (base name only), the PDF parsing workflow is run on that
        file, and the temporary file is removed afterwards.

        Args:
            series_name: Series name; forwarded to the workflow as
                ``dataset_name``.
            pdf_file: Raw bytes of the PDF file.
            pdf_filename: Original name of the PDF file. Only its final path
                component is used for the temporary file.

        Returns:
            Dict[str, Any]: Whatever ``self.pdf_workflow.run`` returns,
            passed through unchanged.

        Raises:
            Exception: If any step fails. The message is prefixed with
                "解析PDF文件失败: " and the original error is attached as
                ``__cause__``.
        """
        try:
            # Keep only the final path component so a caller-supplied name such
            # as "../../x" or an absolute path cannot escape the staging dir.
            safe_name = os.path.basename(pdf_filename)
            if safe_name in ("", ".", ".."):
                raise ValueError(f"invalid PDF filename: {pdf_filename!r}")

            # A private directory per call keeps the original file name while
            # preventing two uploads with the same name from clobbering (or
            # deleting) each other's file.
            temp_dir = tempfile.mkdtemp(prefix="pdf_parse_")
            temp_file_path = os.path.join(temp_dir, safe_name)
            try:
                with open(temp_file_path, 'wb') as temp_file:
                    temp_file.write(pdf_file)

                # Run the PDF parsing workflow on the staged file.
                return self.pdf_workflow.run(
                    pdf_path=temp_file_path,
                    page_dataset_id=vector_db_settings.infinity_page_dataset_id,
                    dataset_name=series_name
                )
            finally:
                # Best-effort cleanup: a failure here must not mask the parse
                # result or the original error. rmdir only succeeds when the
                # directory is empty, so anything else written next to the PDF
                # is left in place.
                for remove, target in ((os.unlink, temp_file_path), (os.rmdir, temp_dir)):
                    try:
                        remove(target)
                    except OSError:
                        pass
        except Exception as e:
            raise Exception(f"解析PDF文件失败: {str(e)}") from e