diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 7e464255c6..94cfb6baf4 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -535,6 +535,7 @@ |minicpm-v-v2_5-chat|[OpenBMB/MiniCPM-Llama3-V-2_5](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_5|✔|✔|✘|✘|timm, transformers>=4.36|vision|[openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)| |minicpm-v-v2_6-chat|[OpenBMB/MiniCPM-V-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_6|✔|✔|✘|✘|timm, transformers>=4.36|vision, video|[openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)| |pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)| +|pixtral-large-instruct|[AI-ModelScope/Pixtral-Large-Instruct-2411](https://modelscope.cn/models/AI-ModelScope/Pixtral-Large-Instruct-2411/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral-large|✘|✘|✘|✘|transformers>=4.45|vision|[mistralai/Pixtral-Large-Instruct-2411](https://huggingface.co/mistralai/Pixtral-Large-Instruct-2411)| 
|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)| |mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)| |mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)| diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md index eb92e811c4..b60d25c55c 100644 --- a/docs/source_en/Instruction/Supported-models-datasets.md +++ b/docs/source_en/Instruction/Supported-models-datasets.md @@ -535,6 +535,7 @@ The table below introcudes all models supported by SWIFT: |minicpm-v-v2_5-chat|[OpenBMB/MiniCPM-Llama3-V-2_5](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_5|✔|✔|✘|✘|timm, transformers>=4.36|vision|[openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)| |minicpm-v-v2_6-chat|[OpenBMB/MiniCPM-V-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_6|✔|✔|✘|✘|timm, transformers>=4.36|vision, video|[openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)| 
|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)| +|pixtral-large-instruct|[AI-ModelScope/Pixtral-Large-Instruct-2411](https://modelscope.cn/models/AI-ModelScope/Pixtral-Large-Instruct-2411/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral-large|✘|✘|✘|✘|transformers>=4.45|vision|[mistralai/Pixtral-Large-Instruct-2411](https://huggingface.co/mistralai/Pixtral-Large-Instruct-2411)| |mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)| |mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)| |mplug-owl3-1b-chat|[iic/mPLUG-Owl3-1B-241014](https://modelscope.cn/models/iic/mPLUG-Owl3-1B-241014/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-1B-241014](https://huggingface.co/mPLUG/mPLUG-Owl3-1B-241014)| diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 142d2d859d..d30b1d6bee 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -541,6 +541,7 @@ class ModelType: ministral_8b_instruct_2410 = 'ministral-8b-instruct-2410' pixtral_12b = 'pixtral-12b' + pixtral_large_instruct = 'pixtral-large-instruct' # wizardlm wizardlm2_7b_awq = 'wizardlm2-7b-awq' 
wizardlm2_8x22b = 'wizardlm2-8x22b' @@ -1124,6 +1125,15 @@ def _output_device_map_hook(module, input, output): return output.to(input[0].device) +@register_model( + ModelType.pixtral_large_instruct, + 'AI-ModelScope/Pixtral-Large-Instruct-2411', + LoRATM.llava, + TemplateType.pixtral_large, + requires=['transformers>=4.45'], + placeholder_tokens=['[IMG]'], + tags=['multi-modal', 'vision'], + hf_model_id='mistralai/Pixtral-Large-Instruct-2411') @register_model( ModelType.pixtral_12b, 'AI-ModelScope/pixtral-12b', diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index a238950d1c..742e8bc7e9 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -4,7 +4,7 @@ import re from contextlib import contextmanager from copy import deepcopy -from datetime import datetime +from datetime import datetime, timedelta from functools import partial, wraps from types import MethodType from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, TypeVar, Union @@ -89,6 +89,7 @@ class TemplateType: idefics3 = 'idefics3' mistral_nemo = 'mistral-nemo' pixtral = 'pixtral' + pixtral_large = 'pixtral-large' openbuddy = 'openbuddy' openbuddy2 = 'openbuddy2' internlm = 'internlm' @@ -1712,9 +1713,6 @@ def _gather_list(batch: List[Dict[str, Any]], attr_name: str) -> Optional[List[A class PixtralTemplate(Template): - def __init__(self): - super().__init__(['{{SYSTEM}}'], ['[INST]{{QUERY}}[/INST]'], [''], [''], None) - def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int, example: Dict[str, Any]) -> List[Context]: return ['[IMG]'] @@ -1761,7 +1759,34 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = return res -register_template(TemplateType.pixtral, PixtralTemplate(), lazy_tokenize=True) +register_template( + TemplateType.pixtral, + PixtralTemplate(['[INST]'], ['{{SYSTEM}}\n\n', '{{QUERY}}[/INST]'], ['[INST]'], [''], None), + lazy_tokenize=True) + + +class 
PixtralLargeTemplate(PixtralTemplate): + + @staticmethod + def _load_system_prompt(model_dir: str) -> str: + file_path = os.path.join(model_dir, 'SYSTEM_PROMPT.txt') + with open(file_path, 'r', encoding='utf-8') as file: + system_prompt = file.read() + today = datetime.today() + yesterday = (today - timedelta(days=1)).strftime('%Y-%m-%d') + model_name = model_dir.split('/')[-1] + return system_prompt.format(name=model_name, today=today.strftime('%Y-%m-%d'), yesterday=yesterday) + + def _init_template(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs) -> None: + self.default_system = self._load_system_prompt(tokenizer.model_dir) + return super()._init_template(tokenizer, *args, **kwargs) + + +register_template( + TemplateType.pixtral_large, + PixtralLargeTemplate(['[SYSTEM_PROMPT]{{SYSTEM}}[/SYSTEM_PROMPT]'], ['[INST]{{QUERY}}[/INST]'], [''], + [''], None), + lazy_tokenize=True) class YiCoderTemplate(ChatmlTemplate): @@ -1948,7 +1973,7 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = ['<>\n{{SYSTEM}}\n<>\n\n'])) register_template(TemplateType.mistral_nemo, - Template(['[INST] '], ['{{SYSTEM}}\n\n', '{{QUERY}}[/INST]'], ['[INST] '], [''])) + Template(['[INST]'], ['{{SYSTEM}}\n\n', '{{QUERY}}[/INST]'], ['[INST]'], [''])) class Llama3TemplateMixin: