| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162 |
import base64
import io
from typing import Any, Dict, Optional

from PIL import Image

from langchain.chat_models import init_chat_model
from langfuse.langchain import CallbackHandler

from src.conf.settings import model_settings
class QWenVLParser:
    """Image-parsing helper backed by a QWEN vision-language (VL) chat model.

    Wraps a LangChain chat model configured from ``model_settings`` and
    exposes convenience methods that send an image (PIL object, path/URL,
    or byte stream) together with a text prompt and return the model's
    textual description. All invocations are traced through a Langfuse
    callback handler.
    """

    def __init__(self, model_name: Optional[str] = None):
        """Initialize the QWEN VL model parser.

        Args:
            model_name: Model identifier. When ``None``, the value from
                ``model_settings.model_name`` is used.
        """
        # Connection settings come from the project configuration.
        self.model_provider = model_settings.model_provider
        self.model_name = model_name or model_settings.model_name
        self.base_url = model_settings.base_url
        self.api_key = model_settings.api_key
        # Langfuse handler used to trace every model invocation.
        self.langfuse_handler = CallbackHandler()
        # Build the chat model via LangChain's generic factory so the
        # provider can be swapped through configuration alone.
        self.model = init_chat_model(
            model_provider=self.model_provider,
            model=self.model_name,
            base_url=self.base_url,
            api_key=self.api_key,
        )

    def image_to_base64(self, image: Image.Image) -> str:
        """Encode a PIL image as a base64 PNG string.

        Args:
            image: PIL image object.

        Returns:
            str: base64-encoded PNG bytes, decoded as UTF-8 text.
        """
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode("utf-8")

    def _parse_url(self, image_url: str, page_number: int, prompt: str) -> Dict[str, Any]:
        """Send one image URL (or data URI) plus a prompt to the model.

        Shared implementation behind ``parse_image`` and
        ``parse_image_path`` — both previously duplicated this logic, and
        only one of them passed the Langfuse callback; this helper applies
        it consistently.

        Args:
            image_url: ``data:`` URI or plain path/URL accepted by the API.
            page_number: Page number echoed back in the result.
            prompt: Instruction text sent alongside the image.

        Returns:
            Dict with ``page_number``, ``content`` (model output text) and
            ``model`` (model name used).

        Raises:
            RuntimeError: If the model invocation fails; the original
                exception is chained as the cause.
        """
        # OpenAI-style multimodal message: one text part + one image part.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ]
        try:
            # Route the call through Langfuse so every parse is traced.
            response = self.model.invoke(
                input=messages,
                config={"callbacks": [self.langfuse_handler]},
            )
        except Exception as e:
            # Chain the cause so the root failure is not lost.
            raise RuntimeError(f"图像解析失败(页码:{page_number}): {str(e)}") from e
        return {
            "page_number": page_number,
            "content": response.content,
            "model": self.model_name,
        }

    def parse_image(self, image: Image.Image, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
        """Parse the content of a PIL image with the VL model.

        Args:
            image: PIL image object.
            page_number: Page number echoed back in the result.
            prompt: Instruction text sent alongside the image.

        Returns:
            Dict with ``page_number``, ``content`` and ``model`` keys.

        Raises:
            RuntimeError: If the model invocation fails.
        """
        # Inline the image as a base64 data URI so no upload is needed.
        image_base64 = self.image_to_base64(image)
        return self._parse_url(
            f"data:image/png;base64,{image_base64}", page_number, prompt
        )

    def parse_image_path(self, image_path: str, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
        """Parse an image referenced by path/URL with the VL model.

        Args:
            image_path: Image path or URL passed through to the API
                unchanged.
            page_number: Page number echoed back in the result.
            prompt: Instruction text sent alongside the image.

        Returns:
            Dict with ``page_number``, ``content`` and ``model`` keys.

        Raises:
            RuntimeError: If the model invocation fails.
        """
        return self._parse_url(image_path, page_number, prompt)

    def parse_image_bytes(self, image_bytes: io.BytesIO, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
        """Parse an image supplied as a byte stream with the VL model.

        Args:
            image_bytes: Byte stream containing the encoded image.
            page_number: Page number echoed back in the result.
            prompt: Instruction text sent alongside the image.

        Returns:
            Dict with ``page_number``, ``content`` and ``model`` keys.

        Raises:
            RuntimeError: If the model invocation fails.
        """
        # Decode the stream into a PIL image, then reuse parse_image.
        image = Image.open(image_bytes)
        return self.parse_image(image, page_number, prompt)
|