from typing import Any, Dict, Optional

import base64
import io

from PIL import Image
from langchain.chat_models import init_chat_model

from conf.config import ModelConfig


class QWenVLParser:
    """QWen VL (vision-language) model image-parsing tool.

    Wraps a LangChain chat model (created via ``init_chat_model``) to
    describe images, using connection settings from ``ModelConfig``.
    """

    def __init__(self, model_name: Optional[str] = None):
        """Initialize the QWen VL parser.

        Args:
            model_name: Model name; when None, the value from the
                configuration file is used.
        """
        # Pull provider/endpoint/credentials from project configuration.
        self.model_provider = ModelConfig.get_model_provider()
        self.model_name = model_name or ModelConfig.get_model_name()
        self.base_url = ModelConfig.get_base_url()
        self.api_key = ModelConfig.get_api_key()

        # Build the chat model through LangChain's factory helper.
        self.model = init_chat_model(
            model_provider=self.model_provider,
            model=self.model_name,
            base_url=self.base_url,
            api_key=self.api_key,
        )

    def image_to_base64(self, image: Image.Image) -> str:
        """Encode a PIL image as a base64 string (PNG format).

        Args:
            image: PIL image object.

        Returns:
            str: Base64-encoded PNG bytes of the image.
        """
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    def parse_image(
        self,
        image: Image.Image,
        page_number: int,
        prompt: str = "请详细描述图像中的内容",
    ) -> Dict[str, Any]:
        """Parse image content with the configured vision-language model.

        Args:
            image: PIL image object.
            page_number: Page number (echoed back in the result).
            prompt: Instruction text sent alongside the image.

        Returns:
            Dict with keys:
                - page_number: the given page number
                - content: model-generated description
                - model: name of the model used

        Raises:
            Exception: if encoding or the model call fails; the original
                error is chained as the cause.
        """
        try:
            image_base64 = self.image_to_base64(image)

            # OpenAI-compatible multimodal message: one text part plus one
            # image part carried as a data URL.
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}"
                            },
                        },
                    ],
                }
            ]

            # Invoke the model through LangChain and extract the reply text.
            response = self.model.invoke(messages)
            content = response.content

            return {
                "page_number": page_number,
                "content": content,
                "model": self.model_name,
            }
        except Exception as e:
            # Chain the original exception instead of discarding its
            # traceback (the previous code dropped the cause).
            raise Exception(f"图像解析失败(页码:{page_number}): {str(e)}") from e

    def parse_image_bytes(
        self,
        image_bytes: io.BytesIO,
        page_number: int,
        prompt: str = "请详细描述图像中的内容",
    ) -> Dict[str, Any]:
        """Parse an image supplied as a byte stream.

        Args:
            image_bytes: Readable binary stream containing the image data.
            page_number: Page number.
            prompt: Instruction text sent alongside the image.

        Returns:
            Dict: Same structure as ``parse_image``.
        """
        # Decode the stream into a PIL image, then delegate.
        image = Image.open(image_bytes)
        return self.parse_image(image, page_number, prompt)