dataset_manage_service.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
"""
Dataset management service.

This module provides dataset management features, including:
- Parsing PDF files
- Creating and managing datasets
- Invoking the PDF parsing workflow
"""
  8. import os
  9. import tempfile
  10. from typing import Dict, Any, Optional
  11. from parser.pdf_parser.pdf_parser_workflow import PDFParsingWorkflow
  12. from conf.settings import vector_db_settings
  13. class DatasetManageService:
  14. """数据集管理服务类"""
  15. def __init__(self):
  16. """初始化数据集管理服务"""
  17. self.pdf_workflow = PDFParsingWorkflow()
  18. def parse_pdf(self, series_name: str, pdf_file: bytes, pdf_filename: str) -> Dict[str, Any]:
  19. """
  20. 解析PDF文件
  21. Args:
  22. series_name: 系列名
  23. pdf_file: PDF文件字节数据
  24. pdf_filename: PDF文件名
  25. Returns:
  26. Dict[str, Any]: 解析结果
  27. """
  28. try:
  29. # 创建临时文件,使用原始文件名称
  30. temp_dir = tempfile.gettempdir()
  31. temp_file_path = os.path.join(temp_dir, pdf_filename)
  32. with open(temp_file_path, 'wb') as temp_file:
  33. temp_file.write(pdf_file)
  34. try:
  35. # 运行PDF解析工作流
  36. result = self.pdf_workflow.run(
  37. pdf_path=temp_file_path,
  38. page_dataset_id=vector_db_settings.infinity_page_dataset_id,
  39. dataset_name=series_name
  40. )
  41. return result
  42. finally:
  43. # 删除临时文件
  44. os.unlink(temp_file_path)
  45. except Exception as e:
  46. raise Exception(f"解析PDF文件失败: {str(e)}")