""" 数据集管理服务 该文件提供数据集管理功能,支持: - PDF文件解析 - 数据集创建和管理 - 调用PDF解析工作流 """ import os import tempfile from typing import Dict, Any, Optional from parser.pdf_parser.pdf_parser_workflow import PDFParsingWorkflow from conf.settings import vector_db_settings class DatasetManageService: """数据集管理服务类""" def __init__(self): """初始化数据集管理服务""" self.pdf_workflow = PDFParsingWorkflow() def parse_pdf(self, series_name: str, pdf_file: bytes, pdf_filename: str) -> Dict[str, Any]: """ 解析PDF文件 Args: series_name: 系列名 pdf_file: PDF文件字节数据 pdf_filename: PDF文件名 Returns: Dict[str, Any]: 解析结果 """ try: # 创建临时文件,使用原始文件名称 temp_dir = tempfile.gettempdir() temp_file_path = os.path.join(temp_dir, pdf_filename) with open(temp_file_path, 'wb') as temp_file: temp_file.write(pdf_file) try: # 运行PDF解析工作流 result = self.pdf_workflow.run( pdf_path=temp_file_path, page_dataset_id=vector_db_settings.infinity_page_dataset_id, dataset_name=series_name ) return result finally: # 删除临时文件 os.unlink(temp_file_path) except Exception as e: raise Exception(f"解析PDF文件失败: {str(e)}")