# qwen_vl.py (5.4 KB)
import base64
import io
from typing import Any, Dict, Optional

from langchain.chat_models import init_chat_model
from langfuse.langchain import CallbackHandler
from PIL import Image

from src.conf.settings import model_settings
  8. class QWenVLParser:
  9. """QWEN VL模型图像解析工具"""
  10. def __init__(self, model_name: str = None):
  11. """
  12. 初始化QWEN VL模型解析器
  13. Args:
  14. model_name: 模型名称,若为None则使用配置文件中的值
  15. """
  16. # 获取模型配置
  17. self.model_provider = model_settings.model_provider
  18. self.model_name = model_name or model_settings.model_name
  19. self.base_url = model_settings.base_url
  20. self.api_key = model_settings.api_key
  21. self.langfuse_handler = CallbackHandler()
  22. # 使用langchain的init_chat_model初始化模型
  23. self.model = init_chat_model(
  24. model_provider=self.model_provider,
  25. model=self.model_name,
  26. base_url=self.base_url,
  27. api_key=self.api_key
  28. )
  29. def image_to_base64(self, image: Image.Image) -> str:
  30. """
  31. 将PIL图像转换为base64编码字符串
  32. Args:
  33. image: PIL图像对象
  34. Returns:
  35. str: base64编码的图像字符串
  36. """
  37. buffer = io.BytesIO()
  38. image.save(buffer, format="PNG")
  39. return base64.b64encode(buffer.getvalue()).decode("utf-8")
  40. def parse_image(self, image: Image.Image, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
  41. """
  42. 使用OpenAI模型解析图像内容
  43. Args:
  44. image: PIL图像对象
  45. page_number: 页码
  46. prompt: 提示词
  47. Returns:
  48. Dict: 包含解析结果的字典,包含:
  49. - page_number: 页码
  50. - content: 解析内容
  51. - model: 使用的模型名称
  52. """
  53. try:
  54. # 将图像转换为base64
  55. image_base64 = self.image_to_base64(image)
  56. # 构建消息,符合OpenAI API格式
  57. messages = [
  58. {
  59. "role": "user",
  60. "content": [
  61. {
  62. "type": "text",
  63. "text": prompt
  64. },
  65. {
  66. "type": "image_url",
  67. "image_url": {
  68. "url": f"data:image/png;base64,{image_base64}"
  69. }
  70. }
  71. ]
  72. }
  73. ]
  74. # 使用langchain模型调用OpenAI API
  75. response = self.model.invoke(input=messages, config={"callbacks": [self.langfuse_handler]})
  76. # 提取解析结果
  77. content = response.content
  78. return {
  79. "page_number": page_number,
  80. "content": content,
  81. "model": self.model_name
  82. }
  83. except Exception as e:
  84. raise Exception(f"图像解析失败(页码:{page_number}): {str(e)}")
  85. def parse_image_path(self, image_path: str, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
  86. """
  87. 使用OpenAI模型解析图像内容
  88. Args:
  89. image_path: 图像路径
  90. page_number: 页码
  91. prompt: 提示词
  92. Returns:
  93. Dict: 包含解析结果的字典,包含:
  94. - page_number: 页码
  95. - content: 解析内容
  96. - model: 使用的模型名称
  97. """
  98. try:
  99. # 构建消息,符合OpenAI API格式
  100. messages = [
  101. {
  102. "role": "user",
  103. "content": [
  104. {
  105. "type": "text",
  106. "text": prompt
  107. },
  108. {
  109. "type": "image_url",
  110. "image_url": {
  111. "url": f"{image_path}"
  112. }
  113. }
  114. ]
  115. }
  116. ]
  117. # 使用langchain模型调用OpenAI API
  118. response = self.model.invoke(messages)
  119. # 提取解析结果
  120. content = response.content
  121. return {
  122. "page_number": page_number,
  123. "content": content,
  124. "model": self.model_name
  125. }
  126. except Exception as e:
  127. raise Exception(f"图像解析失败(页码:{page_number}): {str(e)}")
  128. def parse_image_bytes(self, image_bytes: io.BytesIO, page_number: int, prompt: str = "请详细描述图像中的内容") -> Dict[str, Any]:
  129. """
  130. 使用OpenAI模型解析图像字节流
  131. Args:
  132. image_bytes: 图像字节流
  133. page_number: 页码
  134. prompt: 提示词
  135. Returns:
  136. Dict: 包含解析结果的字典
  137. """
  138. # 将字节流转换为PIL图像
  139. image = Image.open(image_bytes)
  140. return self.parse_image(image, page_number, prompt)