| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- """
- 数据集管理服务
- 该文件提供数据集管理功能,支持:
- - PDF文件解析
- - 数据集创建和管理
- - 调用PDF解析工作流
- """
- import os
- import tempfile
- from typing import Dict, Any, Optional
- from parser.pdf_parser.pdf_parser_workflow import PDFParsingWorkflow
- from conf.settings import vector_db_settings
class DatasetManageService:
    """Dataset management service.

    Holds a ``PDFParsingWorkflow`` instance and exposes PDF parsing for a
    named series on top of it.
    """

    def __init__(self):
        """Create the service together with its PDF parsing workflow."""
        self.pdf_workflow = PDFParsingWorkflow()

    def parse_pdf(self, series_name: str, pdf_file: bytes, pdf_filename: str) -> Dict[str, Any]:
        """Parse a PDF supplied as raw bytes.

        The bytes are written to a temporary file that keeps the original
        file name, the PDF parsing workflow is run on that file, and the
        temporary data is removed afterwards whether or not parsing succeeds.

        Args:
            series_name: Series name; forwarded to the workflow as
                ``dataset_name``.
            pdf_file: Raw content of the PDF.
            pdf_filename: Original file name of the PDF. Only its final path
                component is used when naming the temporary file.

        Returns:
            Whatever ``self.pdf_workflow.run`` returns.

        Raises:
            Exception: If the file name is unusable, or if writing or parsing
                fails. The message is prefixed with "解析PDF文件失败: " and
                the underlying error is chained as ``__cause__``.
        """
        try:
            # The name comes from the caller (typically an upload), so strip
            # any directory part: otherwise "../../x" or an absolute path
            # would make us write outside the temporary directory. Backslashes
            # are normalised first so Windows-style paths are also reduced to
            # their last component on POSIX hosts.
            safe_name = os.path.basename(pdf_filename.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                raise ValueError(f"invalid PDF file name: {pdf_filename!r}")

            # A private directory per call keeps the original file name while
            # preventing concurrent calls with the same name from overwriting
            # or deleting each other's file. Cleanup is handled by the context
            # manager, even if the workflow raises.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_file_path = os.path.join(temp_dir, safe_name)
                with open(temp_file_path, "wb") as temp_file:
                    temp_file.write(pdf_file)

                return self.pdf_workflow.run(
                    pdf_path=temp_file_path,
                    page_dataset_id=vector_db_settings.infinity_page_dataset_id,
                    dataset_name=series_name,
                )
        except Exception as e:
            # Same exception type and message as before; explicit chaining
            # keeps the root cause attached for debugging.
            raise Exception(f"解析PDF文件失败: {str(e)}") from e
|