diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index 7e464255c6..94cfb6baf4 100644
--- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -535,6 +535,7 @@
|minicpm-v-v2_5-chat|[OpenBMB/MiniCPM-Llama3-V-2_5](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_5|✔|✔|✘|✘|timm, transformers>=4.36|vision|[openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)|
|minicpm-v-v2_6-chat|[OpenBMB/MiniCPM-V-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_6|✔|✔|✘|✘|timm, transformers>=4.36|vision, video|[openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)|
|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
+|pixtral-large-instruct|[AI-ModelScope/Pixtral-Large-Instruct-2411](https://modelscope.cn/models/AI-ModelScope/Pixtral-Large-Instruct-2411/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral-large|✘|✘|✘|✘|transformers>=4.45|vision|[mistralai/Pixtral-Large-Instruct-2411](https://huggingface.co/mistralai/Pixtral-Large-Instruct-2411)|
|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
|mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
|mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)|
diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md
index eb92e811c4..b60d25c55c 100644
--- a/docs/source_en/Instruction/Supported-models-datasets.md
+++ b/docs/source_en/Instruction/Supported-models-datasets.md
@@ -535,6 +535,7 @@ The table below introcudes all models supported by SWIFT:
|minicpm-v-v2_5-chat|[OpenBMB/MiniCPM-Llama3-V-2_5](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_5|✔|✔|✘|✘|timm, transformers>=4.36|vision|[openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)|
|minicpm-v-v2_6-chat|[OpenBMB/MiniCPM-V-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_6|✔|✔|✘|✘|timm, transformers>=4.36|vision, video|[openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)|
|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
+|pixtral-large-instruct|[AI-ModelScope/Pixtral-Large-Instruct-2411](https://modelscope.cn/models/AI-ModelScope/Pixtral-Large-Instruct-2411/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral-large|✘|✘|✘|✘|transformers>=4.45|vision|[mistralai/Pixtral-Large-Instruct-2411](https://huggingface.co/mistralai/Pixtral-Large-Instruct-2411)|
|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
|mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
|mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)|
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index 142d2d859d..d30b1d6bee 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -541,6 +541,7 @@ class ModelType:
ministral_8b_instruct_2410 = 'ministral-8b-instruct-2410'
pixtral_12b = 'pixtral-12b'
+ pixtral_large_instruct = 'pixtral-large-instruct'
# wizardlm
wizardlm2_7b_awq = 'wizardlm2-7b-awq'
wizardlm2_8x22b = 'wizardlm2-8x22b'
@@ -1124,6 +1125,15 @@ def _output_device_map_hook(module, input, output):
return output.to(input[0].device)
+@register_model(
+ ModelType.pixtral_large_instruct,
+ 'AI-ModelScope/Pixtral-Large-Instruct-2411',
+ LoRATM.llava,
+ TemplateType.pixtral_large,
+ requires=['transformers>=4.45'],
+ placeholder_tokens=['[IMG]'],
+ tags=['multi-modal', 'vision'],
+ hf_model_id='mistralai/Pixtral-Large-Instruct-2411')
@register_model(
ModelType.pixtral_12b,
'AI-ModelScope/pixtral-12b',
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index a238950d1c..742e8bc7e9 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -4,7 +4,7 @@
import re
from contextlib import contextmanager
from copy import deepcopy
-from datetime import datetime
+from datetime import datetime, timedelta
from functools import partial, wraps
from types import MethodType
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, TypeVar, Union
@@ -89,6 +89,7 @@ class TemplateType:
idefics3 = 'idefics3'
mistral_nemo = 'mistral-nemo'
pixtral = 'pixtral'
+ pixtral_large = 'pixtral-large'
openbuddy = 'openbuddy'
openbuddy2 = 'openbuddy2'
internlm = 'internlm'
@@ -1712,9 +1713,6 @@ def _gather_list(batch: List[Dict[str, Any]], attr_name: str) -> Optional[List[A
class PixtralTemplate(Template):
- def __init__(self):
- super().__init__(['{{SYSTEM}}'], ['[INST]{{QUERY}}[/INST]'], [''], [''], None)
-
def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
example: Dict[str, Any]) -> List[Context]:
return ['[IMG]']
@@ -1761,7 +1759,34 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] =
return res
-register_template(TemplateType.pixtral, PixtralTemplate(), lazy_tokenize=True)
+register_template(
+ TemplateType.pixtral,
+ PixtralTemplate(['[INST]'], ['{{SYSTEM}}\n\n', '{{QUERY}}[/INST]'], ['[INST]'], [''], None),
+ lazy_tokenize=True)
+
+
+class PixtralLargeTemplate(PixtralTemplate):
+
+ @staticmethod
+ def _load_system_prompt(model_dir: str) -> str:
+ file_path = os.path.join(model_dir, 'SYSTEM_PROMPT.txt')
+        with open(file_path, 'r', encoding='utf-8') as file:
+ system_prompt = file.read()
+ today = datetime.today().strftime('%Y-%m-%d')
+ yesterday = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')
+        model_name = os.path.basename(model_dir)
+ return system_prompt.format(name=model_name, today=today, yesterday=yesterday)
+
+ def _init_template(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs) -> None:
+ self.default_system = self._load_system_prompt(tokenizer.model_dir)
+ return super()._init_template(tokenizer, *args, **kwargs)
+
+
+register_template(
+ TemplateType.pixtral_large,
+ PixtralLargeTemplate(['[SYSTEM_PROMPT]{{SYSTEM}}[/SYSTEM_PROMPT]'], ['[INST]{{QUERY}}[/INST]'], [''],
+ [''], None),
+ lazy_tokenize=True)
class YiCoderTemplate(ChatmlTemplate):
@@ -1948,7 +1973,7 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] =
['<>\n{{SYSTEM}}\n<>\n\n']))
register_template(TemplateType.mistral_nemo,
- Template(['[INST] '], ['{{SYSTEM}}\n\n', '{{QUERY}}[/INST]'], ['[INST] '], ['']))
+ Template(['[INST]'], ['{{SYSTEM}}\n\n', '{{QUERY}}[/INST]'], ['[INST]'], ['']))
class Llama3TemplateMixin: