yingge 4 месяца назад
Родитель
Commit
a3a41451f0

BIN
services/pdf_parser/__pycache__/workflow.cpython-312.pyc


+ 2 - 2
services/pdf_parser/workflow.py

@@ -111,7 +111,7 @@ class PDFParsingWorkflow:
             
             # 检查响应
             if document_info_list and len(document_info_list) > 0:
-                document_id = document_info_list[0].id
+                document_id = document_info_list[0]["id"]
                 print(f"文档上传成功,文档ID: {document_id}")
                 return {
                     "document_id": document_id
@@ -193,7 +193,7 @@ class PDFParsingWorkflow:
         parsed_results = []
         
         # 使用ThreadPoolExecutor实现并行处理
-        with ThreadPoolExecutor(max_workers=10) as executor:
+        with ThreadPoolExecutor(max_workers=4) as executor:
             # 提交所有页面解析任务
             future_to_page = {
                 executor.submit(self._parse_single_page, page, self.model_name): page

BIN
services/ragflow/__pycache__/document_service.cpython-312.pyc


+ 5 - 4
services/ragflow/document_service.py

@@ -5,13 +5,14 @@ class DocumentService:
         self.http_client = http_client
     
     def upload_document(self, dataset_id: str, file_path: str) -> List[Dict[str, Any]]:
+        import os
         endpoint = f"/api/v1/datasets/{dataset_id}/documents"
         
         with open(file_path, 'rb') as f:
-            files = {'file': (file_path.split('/')[-1], f)}
-            headers = {'Content-Type': 'multipart/form-data'}
-            
-            response = self.http_client.post(endpoint, files=files, headers=headers)
+            # 使用os.path.basename获取文件名,兼容Windows和Linux
+            files = {'file': (os.path.basename(file_path), f)}
+            # 不设置Content-Type头,让requests库自动生成正确的multipart/form-data头
+            response = self.http_client.post(endpoint, files=files)
         
         if response.get("code") == 0 and response.get("data"):
             return response["data"]

+ 39 - 0
test_upload_document.py

@@ -0,0 +1,39 @@
+import sys
+import os
+
+# 添加项目根目录到Python路径
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from services.utils.http_client import HTTPClient
+from services.ragflow.document_service import DocumentService
+
+# 配置信息
+API_URL = "http://localhost:8000"  # 替换为实际的RAGFlow API URL
+API_KEY = "your_api_key"  # 替换为实际的API密钥
+DATASET_ID = "your_dataset_id"  # 替换为实际的数据集ID
+PDF_PATH = r"D:\project\work\ragflow_plugs\book\不一样的卡梅拉1-我想去看海.pdf"  # 使用已有的测试PDF文件
+
+def test_upload_document():
+    """测试上传文档功能"""
+    try:
+        # 创建HTTP客户端实例
+        http_client = HTTPClient(base_url=API_URL, api_key=API_KEY)
+        
+        # 创建文档服务实例
+        document_service = DocumentService(http_client)
+        
+        # 调用上传文档方法
+        print(f"开始上传文档: {PDF_PATH}")
+        result = document_service.upload_document(DATASET_ID, PDF_PATH)
+        
+        # 打印结果
+        print(f"文档上传成功: {result}")
+        return True
+    except Exception as e:
+        print(f"文档上传失败: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+if __name__ == "__main__":
+    test_upload_document()