Skip to content

Commit c0349d0

Browse files
committed
support optional image prefix for vl fn-call
1 parent 5aa0070 commit c0349d0

File tree

9 files changed

+120
-48
lines changed

9 files changed

+120
-48
lines changed

examples/qwen2vl_assistant_video.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from qwen_agent.agents import Assistant
2+
3+
4+
def test():
    """Ask a Qwen-VL assistant to describe a video supplied as image frames.

    The 'video' content item carries a list of frame image URLs followed by
    a text instruction; every response yielded by ``bot.run`` is printed.
    """
    bot = Assistant(llm={'model': 'qwen-vl-max-latest'})

    # The video is passed as an ordered list of frame images.
    frame_urls = [
        'https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/xzsgiz/football1.jpg',
        'https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/tdescd/football2.jpg',
        'https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/zefdja/football3.jpg',
        'https://help-static-aliyun-doc.aliyuncs.com/file-manage-files/zh-CN/20241108/aedbqh/football4.jpg',
    ]
    user_message = {
        'role': 'user',
        'content': [
            {'video': frame_urls},
            {'text': 'Describe the specific process of this video'},
        ],
    }
    messages = [user_message]

    # Alternative input: a single video file URL instead of a frame list.
    # Uploading video files requires applying for permission on DashScope
    # messages = [{
    #     'role': 'user',
    #     'content': [{
    #         'video': 'https://www.runoob.com/try/demo_source/mov_bbb.mp4'
    #     }, {
    #         'text': 'Describe the specific process of this video'
    #     }]
    # }]

    for rsp in bot.run(messages):
        print(rsp)
35+
36+
37+
if __name__ == '__main__':
38+
test()

qwen_agent/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = '0.0.14'
1+
__version__ = '0.0.15'
22
from .agent import Agent
33
from .multi_agent_hub import MultiAgentHub
44

qwen_agent/llm/base.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,12 @@ def _preprocess_messages(
298298
generate_cfg: dict,
299299
functions: Optional[List[Dict]] = None,
300300
) -> List[Message]:
301-
messages = [format_as_multimodal_message(msg, add_upload_info=True, lang=lang) for msg in messages]
301+
messages = [
302+
format_as_multimodal_message(msg,
303+
add_upload_info=True,
304+
add_multimodel_upload_info=(functions is not None),
305+
lang=lang) for msg in messages
306+
]
302307
return messages
303308

304309
def _postprocess_messages(
@@ -307,7 +312,10 @@ def _postprocess_messages(
307312
fncall_mode: bool,
308313
generate_cfg: dict,
309314
) -> List[Message]:
310-
messages = [format_as_multimodal_message(msg, add_upload_info=False) for msg in messages]
315+
messages = [
316+
format_as_multimodal_message(msg, add_upload_info=False, add_multimodel_upload_info=False)
317+
for msg in messages
318+
]
311319
if not generate_cfg.get('skip_stopword_postproc', False):
312320
stop = generate_cfg.get('stop', [])
313321
messages = _postprocess_stop_words(messages, stop=stop)

qwen_agent/llm/fncall_prompts/base_fncall_prompt.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,10 @@ def format_plaintext_train_samples(
5252
if has_para:
5353
raise ValueError('This sample requires parallel_function_calls=True.')
5454

55-
messages = [format_as_multimodal_message(msg, add_upload_info=True, lang=lang) for msg in messages]
55+
messages = [
56+
format_as_multimodal_message(msg, add_upload_info=True, add_multimodel_upload_info=True, lang=lang)
57+
for msg in messages
58+
]
5659
for m in messages:
5760
for item in m.content:
5861
if item.type != 'text':

qwen_agent/llm/function_calling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def _preprocess_messages(
2929
generate_cfg: dict,
3030
functions: Optional[List[Dict]] = None,
3131
) -> List[Message]:
32-
messages = super()._preprocess_messages(messages, lang=lang, generate_cfg=generate_cfg)
32+
messages = super()._preprocess_messages(messages, lang=lang, generate_cfg=generate_cfg, functions=functions)
3333
if (not functions) or (generate_cfg.get('function_choice', 'auto') == 'none'):
3434
messages = self._remove_fncall_messages(messages, lang=lang)
3535
else:

qwen_agent/llm/qwenvl_dashscope.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -78,40 +78,41 @@ def _format_local_files(messages: List[Message]) -> List[Message]:
7878
if isinstance(msg.content, list):
7979
for item in msg.content:
8080
if item.image:
81-
fname = item.image
82-
if not fname.startswith((
83-
'http://',
84-
'https://',
85-
'file://',
86-
'data:', # base64 such as f"data:image/jpg;base64,{image_base64}"
87-
)):
88-
if fname.startswith('~'):
89-
fname = os.path.expanduser(fname)
90-
fname = os.path.abspath(fname)
91-
if os.path.isfile(fname):
92-
if re.match(r'^[A-Za-z]:\\', fname):
93-
fname = fname.replace('\\', '/')
94-
fname = 'file://' + fname
95-
item.image = fname
81+
item.image = _conv_fname(item.image)
9682
if item.audio:
97-
fname = item.audio
98-
if not fname.startswith((
99-
'http://',
100-
'https://',
101-
'file://',
102-
'data:', # base64 such as f"data:image/jpg;base64,{image_base64}"
103-
)):
104-
if fname.startswith('~'):
105-
fname = os.path.expanduser(fname)
106-
fname = os.path.abspath(fname)
107-
if os.path.isfile(fname):
108-
if re.match(r'^[A-Za-z]:\\', fname):
109-
fname = fname.replace('\\', '/')
110-
fname = 'file://' + fname
111-
item.audio = fname
83+
item.audio = _conv_fname(item.audio)
84+
if item.video:
85+
if isinstance(item.video, str):
86+
item.video = _conv_fname(item.video)
87+
else:
88+
assert isinstance(item.video, list)
89+
new_url = []
90+
for fname in item.video:
91+
new_url.append(_conv_fname(fname))
92+
item.video = new_url
11293
return messages
11394

11495

96+
def _conv_fname(fname: str) -> str:
    """Turn an existing local file path into a ``file://`` URL.

    Args:
        fname: A URL, a ``data:`` URI, or a local path (``~`` is expanded).

    Returns:
        ``fname`` unchanged when it already starts with ``http://``,
        ``https://``, ``file://`` or ``data:``, or when it does not point to
        an existing file. Otherwise ``'file://'`` followed by the absolute
        path, with backslashes turned into forward slashes for paths that
        begin with a Windows drive letter.
    """
    passthrough_prefixes = (
        'http://',
        'https://',
        'file://',
        'data:',  # base64 such as f"data:image/jpg;base64,{image_base64}"
    )
    if fname.startswith(passthrough_prefixes):
        return fname

    local_path = os.path.expanduser(fname) if fname.startswith('~') else fname
    local_path = os.path.abspath(local_path)
    if not os.path.isfile(local_path):
        # Not an existing file: hand the caller's value back untouched.
        return fname

    # Drive-letter paths (e.g. C:\dir\a.jpg) are rewritten with forward slashes.
    if re.match(r'^[A-Za-z]:\\', local_path):
        local_path = local_path.replace('\\', '/')
    return 'file://' + local_path
114+
115+
115116
def _extract_vl_response(response) -> List[Message]:
116117
output = response.output.choices[0].message
117118
text_content = []

qwen_agent/llm/schema.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
FILE = 'file'
1717
IMAGE = 'image'
1818
AUDIO = 'audio'
19+
VIDEO = 'video'
1920

2021

2122
class BaseModelCompatibleDict(BaseModel):
@@ -66,13 +67,15 @@ class ContentItem(BaseModelCompatibleDict):
6667
image: Optional[str] = None
6768
file: Optional[str] = None
6869
audio: Optional[str] = None
70+
video: Optional[Union[str, list]] = None
6971

7072
def __init__(self,
7173
text: Optional[str] = None,
7274
image: Optional[str] = None,
7375
file: Optional[str] = None,
74-
audio: Optional[str] = None):
75-
super().__init__(text=text, image=image, file=file, audio=audio)
76+
audio: Optional[str] = None,
77+
video: Optional[Union[str, list]] = None):
78+
super().__init__(text=text, image=image, file=file, audio=audio, video=video)
7679

7780
@model_validator(mode='after')
7881
def check_exclusivity(self):
@@ -85,21 +88,23 @@ def check_exclusivity(self):
8588
provided_fields += 1
8689
if self.audio:
8790
provided_fields += 1
91+
if self.video:
92+
provided_fields += 1
8893

8994
if provided_fields != 1:
90-
raise ValueError("Exactly one of 'text', 'image', 'file', or 'audio' must be provided.")
95+
raise ValueError("Exactly one of 'text', 'image', 'file', 'audio', or 'video' must be provided.")
9196
return self
9297

9398
def __repr__(self):
9499
return f'ContentItem({self.model_dump()})'
95100

96-
def get_type_and_value(self) -> Tuple[Literal['text', 'image', 'file', 'audio'], str]:
101+
def get_type_and_value(self) -> Tuple[Literal['text', 'image', 'file', 'audio', 'video'], str]:
97102
(t, v), = self.model_dump().items()
98-
assert t in ('text', 'image', 'file', 'audio')
103+
assert t in ('text', 'image', 'file', 'audio', 'video')
99104
return t, v
100105

101106
@property
102-
def type(self) -> Literal['text', 'image', 'file', 'audio']:
107+
def type(self) -> Literal['text', 'image', 'file', 'audio', 'video']:
103108
t, v = self.get_type_and_value()
104109
return t
105110

qwen_agent/utils/utils.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ def json_dumps_compact(obj: dict, ensure_ascii=False, indent=None, **kwargs) ->
313313
def format_as_multimodal_message(
314314
msg: Message,
315315
add_upload_info: bool,
316+
add_multimodel_upload_info: bool,
316317
lang: Literal['auto', 'en', 'zh'] = 'auto',
317318
) -> Message:
318319
assert msg.role in (USER, ASSISTANT, SYSTEM, FUNCTION)
@@ -324,24 +325,32 @@ def format_as_multimodal_message(
324325
files = []
325326
for item in msg.content:
326327
k, v = item.get_type_and_value()
327-
if k == 'text':
328-
content.append(ContentItem(text=v))
329-
if k in ('image', 'audio'):
328+
if k in ('text', 'image', 'audio', 'video'):
330329
content.append(item)
331330
if k == 'file':
332331
# Move 'file' out of 'content' since it's not natively supported by models
333332
files.append(v)
333+
if add_multimodel_upload_info and k == 'image':
334+
# Indicate the image name
335+
# Not considering audio and video for now
336+
files.append(v)
334337
if add_upload_info and files and (msg.role in (SYSTEM, USER)):
335338
if lang == 'auto':
336339
has_zh = has_chinese_chars(msg)
337340
else:
338341
has_zh = (lang == 'zh')
339342
upload = []
340343
for f in [get_basename_from_url(f) for f in files]:
341-
if has_zh:
342-
upload.append(f'[文件]({f})')
344+
if is_image(f):
345+
if has_zh:
346+
upload.append(f'![图片]({f})')
347+
else:
348+
upload.append(f'![image]({f})')
343349
else:
344-
upload.append(f'[file]({f})')
350+
if has_zh:
351+
upload.append(f'[文件]({f})')
352+
else:
353+
upload.append(f'[file]({f})')
345354
upload = ' '.join(upload)
346355
if has_zh:
347356
upload = f'(上传了 {upload}\n\n'
@@ -372,7 +381,10 @@ def format_as_text_message(
372381
add_upload_info: bool,
373382
lang: Literal['auto', 'en', 'zh'] = 'auto',
374383
) -> Message:
375-
msg = format_as_multimodal_message(msg, add_upload_info=add_upload_info, lang=lang)
384+
msg = format_as_multimodal_message(msg,
385+
add_upload_info=add_upload_info,
386+
add_multimodel_upload_info=add_upload_info,
387+
lang=lang)
376388
text = ''
377389
for item in msg.content:
378390
if item.type == 'text':

tests/examples/test_examples.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from examples.llm_vl_mix_text import test as llm_vl_mix_text # noqa
1818
from examples.multi_agent_router import test as multi_agent_router # noqa
1919
from examples.qwen2vl_assistant_tooluse import test as qwen2vl_assistant_tooluse # noqa
20+
from examples.qwen2vl_assistant_video import test as test_video # noqa
2021
from examples.react_data_analysis import test as react_data_analysis # noqa
2122
from examples.visual_storytelling import test as visual_storytelling # noqa
2223

@@ -86,3 +87,7 @@ def test_group_chat_demo():
8687

8788
def test_qwen2vl_assistant_tooluse():
8889
qwen2vl_assistant_tooluse()
90+
91+
92+
def test_video_understanding():
    """Run the video example imported from examples.qwen2vl_assistant_video."""
    test_video()

0 commit comments

Comments
 (0)