# qwen_vl.py
import base64
import io
from typing import Any, Dict, Optional

from langchain.chat_models import init_chat_model
from PIL import Image

from conf.config import ModelConfig
  7. class QWenVLParser:
  8. """QWEN VL模型图像解析工具"""
  9. def __init__(self, model_name: str = None):
  10. """
  11. 初始化QWEN VL模型解析器
  12. Args:
  13. model_name: 模型名称,若为None则使用配置文件中的值
  14. """
  15. # 获取模型配置
  16. self.model_provider = ModelConfig.get_model_provider()
  17. self.model_name = model_name or ModelConfig.get_model_name()
  18. self.base_url = ModelConfig.get_base_url()
  19. self.api_key = ModelConfig.get_api_key()
  20. # 使用langchain的init_chat_model初始化模型
  21. self.model = init_chat_model(
  22. model_provider=self.model_provider,
  23. model=self.model_name,
  24. base_url=self.base_url,
  25. api_key=self.api_key
  26. )
  27. def image_to_base64(self, image: Image.Image) -> str:
  28. """
  29. 将PIL图像转换为base64编码字符串
  30. Args:
  31. image: PIL图像对象
  32. Returns:
  33. str: base64编码的图像字符串
  34. """
  35. buffer = io.BytesIO()
  36. image.save(buffer, format="PNG")
  37. return base64.b64encode(buffer.getvalue()).decode("utf-8")
  38. def parse_image(self, image: Image.Image, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
  39. """
  40. 使用OpenAI模型解析图像内容
  41. Args:
  42. image: PIL图像对象
  43. page_number: 页码
  44. prompt: 提示词
  45. Returns:
  46. Dict: 包含解析结果的字典,包含:
  47. - page_number: 页码
  48. - content: 解析内容
  49. - model: 使用的模型名称
  50. """
  51. try:
  52. # 将图像转换为base64
  53. image_base64 = self.image_to_base64(image)
  54. # 构建消息,符合OpenAI API格式
  55. messages = [
  56. {
  57. "role": "user",
  58. "content": [
  59. {
  60. "type": "text",
  61. "text": prompt
  62. },
  63. {
  64. "type": "image_url",
  65. "image_url": {
  66. "url": f"data:image/png;base64,{image_base64}"
  67. }
  68. }
  69. ]
  70. }
  71. ]
  72. # 使用langchain模型调用OpenAI API
  73. response = self.model.invoke(messages)
  74. # 提取解析结果
  75. content = response.content
  76. return {
  77. "page_number": page_number,
  78. "content": content,
  79. "model": self.model_name
  80. }
  81. except Exception as e:
  82. raise Exception(f"图像解析失败(页码:{page_number}): {str(e)}")
  83. def parse_image_path(self, image_path: str, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
  84. """
  85. 使用OpenAI模型解析图像内容
  86. Args:
  87. image_path: 图像路径
  88. page_number: 页码
  89. prompt: 提示词
  90. Returns:
  91. Dict: 包含解析结果的字典,包含:
  92. - page_number: 页码
  93. - content: 解析内容
  94. - model: 使用的模型名称
  95. """
  96. try:
  97. # 构建消息,符合OpenAI API格式
  98. messages = [
  99. {
  100. "role": "user",
  101. "content": [
  102. {
  103. "type": "text",
  104. "text": prompt
  105. },
  106. {
  107. "type": "image_url",
  108. "image_url": {
  109. "url": f"{image_path}"
  110. }
  111. }
  112. ]
  113. }
  114. ]
  115. # 使用langchain模型调用OpenAI API
  116. response = self.model.invoke(messages)
  117. # 提取解析结果
  118. content = response.content
  119. return {
  120. "page_number": page_number,
  121. "content": content,
  122. "model": self.model_name
  123. }
  124. except Exception as e:
  125. raise Exception(f"图像解析失败(页码:{page_number}): {str(e)}")
  126. def parse_image_bytes(self, image_bytes: io.BytesIO, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
  127. """
  128. 使用OpenAI模型解析图像字节流
  129. Args:
  130. image_bytes: 图像字节流
  131. page_number: 页码
  132. prompt: 提示词
  133. Returns:
  134. Dict: 包含解析结果的字典
  135. """
  136. # 将字节流转换为PIL图像
  137. image = Image.open(image_bytes)
  138. return self.parse_image(image, page_number, prompt)