# qwen_vl.py (3.6 KB)
import base64
import io
from typing import Any, Dict, Optional

from langchain.chat_models import init_chat_model
from PIL import Image

from conf.config import ModelConfig
  7. class QWenVLParser:
  8. """QWEN VL模型图像解析工具"""
  9. def __init__(self, model_name: str = None):
  10. """
  11. 初始化QWEN VL模型解析器
  12. Args:
  13. model_name: 模型名称,若为None则使用配置文件中的值
  14. """
  15. # 获取模型配置
  16. self.model_provider = ModelConfig.get_model_provider()
  17. self.model_name = model_name or ModelConfig.get_model_name()
  18. self.base_url = ModelConfig.get_base_url()
  19. self.api_key = ModelConfig.get_api_key()
  20. # 使用langchain的init_chat_model初始化模型
  21. self.model = init_chat_model(
  22. model_provider=self.model_provider,
  23. model=self.model_name,
  24. base_url=self.base_url,
  25. api_key=self.api_key
  26. )
  27. def image_to_base64(self, image: Image.Image) -> str:
  28. """
  29. 将PIL图像转换为base64编码字符串
  30. Args:
  31. image: PIL图像对象
  32. Returns:
  33. str: base64编码的图像字符串
  34. """
  35. buffer = io.BytesIO()
  36. image.save(buffer, format="PNG")
  37. return base64.b64encode(buffer.getvalue()).decode("utf-8")
  38. def parse_image(self, image: Image.Image, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
  39. """
  40. 使用OpenAI模型解析图像内容
  41. Args:
  42. image: PIL图像对象
  43. page_number: 页码
  44. prompt: 提示词
  45. Returns:
  46. Dict: 包含解析结果的字典,包含:
  47. - page_number: 页码
  48. - content: 解析内容
  49. - model: 使用的模型名称
  50. """
  51. try:
  52. # 将图像转换为base64
  53. image_base64 = self.image_to_base64(image)
  54. # 构建消息,符合OpenAI API格式
  55. messages = [
  56. {
  57. "role": "user",
  58. "content": [
  59. {
  60. "type": "text",
  61. "text": prompt
  62. },
  63. {
  64. "type": "image_url",
  65. "image_url": {
  66. "url": f"data:image/png;base64,{image_base64}"
  67. }
  68. }
  69. ]
  70. }
  71. ]
  72. # 使用langchain模型调用OpenAI API
  73. response = self.model.invoke(messages)
  74. # 提取解析结果
  75. content = response.content
  76. return {
  77. "page_number": page_number,
  78. "content": content,
  79. "model": self.model_name
  80. }
  81. except Exception as e:
  82. raise Exception(f"图像解析失败(页码:{page_number}): {str(e)}")
  83. def parse_image_bytes(self, image_bytes: io.BytesIO, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
  84. """
  85. 使用OpenAI模型解析图像字节流
  86. Args:
  87. image_bytes: 图像字节流
  88. page_number: 页码
  89. prompt: 提示词
  90. Returns:
  91. Dict: 包含解析结果的字典
  92. """
  93. # 将字节流转换为PIL图像
  94. image = Image.open(image_bytes)
  95. return self.parse_image(image, page_number, prompt)