support vit full llm lora (#3575)

Jintao-Huang · web-flow · commit a3db5c8fd89d · 2025-03-20T14:58:33.000+08:00
diff --git a/docs/source/Customization/插件化.md b/docs/source/Customization/插件化.md
@@ -145,7 +145,7 @@ tuner定制也是swift中有特色的能力之一，开发者可以无视复杂
 class IA3(Tuner):
 
     @staticmethod
-    def prepare_model(args: 'TrainArguments', model: torch.nn.Module):
+    def prepare_model(args: 'TrainArguments', model: torch.nn.Module) -> torch.nn.Module:
         model_arch: ModelKeys = MODEL_ARCH_MAPPING[model.model_meta.model_arch]
         ia3_config = IA3Config(
             target_modules=find_all_linears(model), feedforward_modules='.*' + model_arch.mlp.split('{}.')[1] + '.*')
@@ -155,14 +155,15 @@ class IA3(Tuner):
     def save_pretrained(
         model: torch.nn.Module,
         save_directory: str,
+        state_dict: Optional[dict] = None,
         safe_serialization: bool = True,
         **kwargs,
-    ):
+    ) -> None:
         model: PeftModel
-        model.save_pretrained(save_directory, safe_serialization=safe_serialization, **kwargs)
+        model.save_pretrained(save_directory, state_dict=state_dict, safe_serialization=safe_serialization, **kwargs)
 
     @staticmethod
-    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs):
+    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs) -> torch.nn.Module:
         return PeftModel.from_pretrained(model, model_id, **kwargs)
 ```
 
diff --git a/docs/source/Instruction/预训练与微调.md b/docs/source/Instruction/预训练与微调.md
@@ -55,6 +55,7 @@ ms-swift使用了分层式的设计思想，用户可以使用命令行界面、
   - 多机多卡训练: 我们书写了使用swift、torchrun、dlc、deepspeed、accelerate启动多节点运行的shell脚本示例。除了dlc和deepspeed，其他启动脚本都需要在所有节点中启动才可运行。具体参考[这里](https://github.com/modelscope/swift/blob/main/examples/train/multi-node)。
 - 量化训练：支持使用GPTQ、AWQ、AQLM、BNB、HQQ、EETQ量化技术的QLoRA训练。微调7B模型只需要9GB显存资源。具体参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora)。
 - 多模态训练：SWIFT支持多模态模型的预训练、微调和RLHF。支持Caption、VQA、OCR、[Grounding](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-vl-grounding/zh.ipynb)任务。支持图像、视频和音频三种模态。具体参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal)。多模态自定义数据集格式参考[自定义数据集文档](../Customization/自定义数据集.md)。
+  - 对ViT/Aligner使用全参数训练，LLM使用LoRA训练，并采用不同学习率的例子参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/custom_tuner)。
 - RLHF训练：参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf)。多模态模型参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/rlhf)。GRPO训练参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/grpo_zero2.sh)。强化微调查看[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/rft)。
 - Megatron训练：支持使用Megatron的并行技术来加速大模型的训练，包括数据并行、张量并行、流水线并行、序列并行，上下文并行。参考[Megatron-SWIFT训练文档](./Megatron-SWIFT训练.md)。
 - 序列分类模型训练：参考[这里](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls)。
diff --git a/docs/source_en/Customization/Pluginization.md b/docs/source_en/Customization/Pluginization.md
@@ -163,7 +163,7 @@ Tuner customization is another unique feature of SWIFT. Developers can bypass th
 class IA3(Tuner):
 
     @staticmethod
-    def prepare_model(args: 'TrainArguments', model: torch.nn.Module):
+    def prepare_model(args: 'TrainArguments', model: torch.nn.Module) -> torch.nn.Module:
         model_arch: ModelKeys = MODEL_ARCH_MAPPING[model.model_meta.model_arch]
         ia3_config = IA3Config(
             target_modules=find_all_linears(model), feedforward_modules='.*' + model_arch.mlp.split('{}.')[1] + '.*')
@@ -173,14 +173,15 @@ class IA3(Tuner):
     def save_pretrained(
         model: torch.nn.Module,
         save_directory: str,
+        state_dict: Optional[dict] = None,
         safe_serialization: bool = True,
         **kwargs,
-    ):
+    ) -> None:
         model: PeftModel
-        model.save_pretrained(save_directory, safe_serialization=safe_serialization, **kwargs)
+        model.save_pretrained(save_directory, state_dict=state_dict, safe_serialization=safe_serialization, **kwargs)
 
     @staticmethod
-    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs):
+    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs) -> torch.nn.Module:
         return PeftModel.from_pretrained(model, model_id, **kwargs)
 ```
 
diff --git a/docs/source_en/Instruction/Pre-training-and-Fine-tuning.md b/docs/source_en/Instruction/Pre-training-and-Fine-tuning.md
@@ -58,6 +58,7 @@ Additionally, we offer a series of scripts to help you understand the training c
   - Multi-node Multi-GPU Training: We have provided example shell scripts for launching multi-node runs using swift, torchrun, dlc, deepspeed, and accelerate. Except for dlc and deepspeed, the other launch scripts need to be started on all nodes to run properly. Please refer to [here](https://github.com/modelscope/swift/blob/main/examples/train/multi-node) for details.
 - Quantization Training: Supports QLoRA training using quantization techniques such as GPTQ, AWQ, AQLM, BNB, HQQ, and EETQ. Fine-tuning a 7B model only requires 9GB of memory. For more details, refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora).
 - Multi-modal Training: SWIFT supports pre-training, fine-tuning, and RLHF for multi-modal models. It supports tasks such as Captioning, VQA, OCR, and [Grounding](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-vl-grounding/zh.ipynb). It supports three modalities: images, videos, and audio. For more details, refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal). The format for custom multi-modal datasets can be found in the [Custom Dataset Documentation](../Customization/Custom-dataset.md).
+  - For an example of using full-parameter training for ViT/Aligner, LoRA training for LLM, and adopting different learning rates, refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/custom_tuner).
 - RLHF Training: Refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf). For multi-modal models, refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal/rlhf). For GRPO training, refer to [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/grpo_zero2.sh). For reinforcement fine-tuning, see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rft).
 - Megatron Training: Supports the use of Megatron's parallelization techniques to accelerate the training of large models, including data parallelism, tensor parallelism, pipeline parallelism, sequence parallelism, and context parallelism. Refer to the [Megatron-SWIFT Training Documentation](./Megatron-SWIFT-Training.md).
 - Sequence Classification Model Training: Refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/seq_cls).
diff --git a/examples/train/multimodal/custom_tuner/custom_plugin.py b/examples/train/multimodal/custom_tuner/custom_plugin.py
@@ -0,0 +1,81 @@
+import os
+from typing import Optional
+
+import safetensors.torch
+import torch
+from transformers import Trainer
+
+from swift.plugin import Tuner, extra_tuners, optimizers_map
+from swift.tuners import LoraConfig, Swift
+
+
+class CustomTuner(Tuner):  # LoRA on modules matching `^model.layers.*`, full training of `model.visual`
+
+    @staticmethod
+    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs) -> torch.nn.Module:
+        model = Swift.from_pretrained(model, model_id, **kwargs)
+        state_dict = safetensors.torch.load_file(os.path.join(model_id, 'vit.safetensors'))
+        model.load_state_dict(state_dict, strict=False)  # only '.visual.' keys are stored, hence strict=False
+        return model
+
+    @staticmethod
+    def save_pretrained(
+        model: torch.nn.Module,
+        save_directory: str,
+        state_dict: Optional[dict] = None,
+        safe_serialization: bool = True,
+        **kwargs,
+    ) -> None:
+        if state_dict is None:  # no dict given: gather the trainable (requires_grad) parameters on CPU
+            state_dict = {}
+            for n, p in model.named_parameters():
+                if p.requires_grad:
+                    state_dict[n] = p.detach().cpu()
+        model.save_pretrained(save_directory, state_dict=state_dict, safe_serialization=safe_serialization, **kwargs)
+        # Also write the '.visual.' entries to vit.safetensors, the file that from_pretrained loads back.
+        state_dict = {k: v for k, v in state_dict.items() if '.visual.' in k}
+        safetensors.torch.save_file(
+            state_dict, os.path.join(save_directory, 'vit.safetensors'), metadata={'format': 'pt'})
+
+    @staticmethod
+    def prepare_model(args: 'TrainArguments', model: torch.nn.Module) -> torch.nn.Module:
+        target_regex = r'^model.layers.*'  # regex handed to LoraConfig as `target_modules`
+        lora_config = LoraConfig(
+            task_type='CAUSAL_LM', r=args.lora_rank, lora_alpha=args.lora_alpha, target_modules=target_regex)
+        model = Swift.prepare_model(model, lora_config)
+        model.visual.requires_grad_(True)  # vit & merger; enabled after wrapping (presumably frozen by it)
+        return model
+
+
+def create_custom_optimizer(args, model, dataset):  # registered in `optimizers_map` under the key 'custom'
+    decay_parameters = set(Trainer.get_decay_parameter_names(None, model))
+    vit_parameters = [(n, p) for n, p in model.named_parameters() if '.visual.' in n and p.requires_grad]
+    llm_parameters = [(n, p) for n, p in model.named_parameters() if '.visual.' not in n and p.requires_grad]
+    optimizer_grouped_parameters = [
+        # vit & merger: trained at 0.1x the base learning rate, split by weight-decay eligibility
+        {
+            'params': [p for n, p in vit_parameters if n in decay_parameters],
+            'weight_decay': args.weight_decay,
+            'lr': 0.1 * args.learning_rate,  # 1e-5 when --learning_rate is 1e-4
+        },
+        {
+            'params': [p for n, p in vit_parameters if n not in decay_parameters],
+            'weight_decay': 0.0,
+            'lr': 0.1 * args.learning_rate,
+        },
+        # llm: no 'lr' key here, so presumably the optimizer's default lr (from optimizer_kwargs) applies
+        {
+            'params': [p for n, p in llm_parameters if n in decay_parameters],
+            'weight_decay': args.weight_decay,
+        },
+        {
+            'params': [p for n, p in llm_parameters if n not in decay_parameters],
+            'weight_decay': 0.0,
+        },
+    ]
+    optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(args, model)
+    return optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs), None  # lr_scheduler slot is None
+
+
+extra_tuners['custom'] = CustomTuner
+optimizers_map['custom'] = create_custom_optimizer
diff --git a/examples/train/multimodal/custom_tuner/infer.sh b/examples/train/multimodal/custom_tuner/infer.sh
@@ -0,0 +1,8 @@
+# If the weights have been merged, please use `--model`.
+CUDA_VISIBLE_DEVICES=0 \
+swift infer \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --stream true \
+    --load_data_args true \
+    --temperature 0 \
+    --max_new_tokens 2048
diff --git a/examples/train/multimodal/custom_tuner/lora_llm_full_vit.sh b/examples/train/multimodal/custom_tuner/lora_llm_full_vit.sh
@@ -0,0 +1,30 @@
+# 4 * 22GiB
+# vit/merger lr 1e-5; llm lora lr 1e-4
+NPROC_PER_NODE=4 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+MAX_PIXELS=1003520 \
+swift sft \
+    --model Qwen/Qwen2.5-VL-7B-Instruct \
+    --dataset 'AI-ModelScope/coco#20000' \
+    --train_type custom \
+    --optimizer custom \
+    --external_plugins 'examples/train/multimodal/custom_tuner/custom_plugin.py' \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --learning_rate 1e-4 \
+    --lora_rank 16 \
+    --lora_alpha 32 \
+    --gradient_accumulation_steps 4 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 5 \
+    --logging_steps 5 \
+    --max_length 8192 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --dataset_num_proc 4 \
+    --deepspeed zero2 \
+    --save_only_model true
diff --git a/examples/train/multimodal/custom_tuner/merge_lora.sh b/examples/train/multimodal/custom_tuner/merge_lora.sh
@@ -0,0 +1,3 @@
+swift export \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --merge_lora true
diff --git a/swift/llm/argument/base_args/base_args.py b/swift/llm/argument/base_args/base_args.py
@@ -131,9 +131,6 @@ def _init_adapters(self):
         self.adapters = [
             safe_snapshot_download(adapter, use_hf=self.use_hf, hub_token=self.hub_token) for adapter in self.adapters
         ]
-        for adapter in self.adapters:
-            assert self._check_is_adapter(adapter), (
-                f'`{adapter}` is not an adapter, please try using `--model` to pass it.')
 
     def __post_init__(self):
         if self.use_hf or use_hf_hub():
@@ -149,6 +146,10 @@ def __post_init__(self):
         self.rank, self.local_rank, self.global_world_size, self.local_world_size = get_dist_setting()
         logger.info(f'rank: {self.rank}, local_rank: {self.local_rank}, '
                     f'world_size: {self.global_world_size}, local_world_size: {self.local_world_size}')
+        if self.train_type not in extra_tuners:
+            for adapter in self.adapters:
+                assert self._check_is_adapter(adapter), (
+                    f'`{adapter}` is not an adapter, please try using `--model` to pass it.')
         ModelArguments.__post_init__(self)
         QuantizeArguments.__post_init__(self)
         TemplateArguments.__post_init__(self)
diff --git a/swift/plugin/__init__.py b/swift/plugin/__init__.py
@@ -10,7 +10,7 @@
     from .metric import InferStats, MeanMetric, Metric, compute_acc, get_metric, compute_rouge_bleu
     from .optimizer import optimizers_map
     from .tools import get_tools_prompt, get_tools_keyword
-    from .tuner import Tuner, extra_tuners
+    from .tuner import Tuner, extra_tuners, PeftTuner
     from .prm import prms, PRM
     from .orm import orms, ORM
 
@@ -22,7 +22,7 @@
         'metric': ['InferStats', 'MeanMetric', 'Metric', 'compute_acc', 'get_metric', 'compute_rouge_bleu'],
         'optimizer': ['optimizers_map'],
         'tools': ['get_tools_prompt', 'get_tools_keyword'],
-        'tuner': ['Tuner', 'extra_tuners'],
+        'tuner': ['Tuner', 'extra_tuners', 'PeftTuner'],
         'prm': ['prms', 'PRM'],
         'orm': ['orms', 'ORM']
     }
diff --git a/swift/plugin/optimizer.py b/swift/plugin/optimizer.py
@@ -22,7 +22,7 @@ def calculate_max_steps(args: 'TrainArguments', dataset) -> int:
     return max_steps
 
 
-def create_galore_optimizers(args, model, dataset):
+def create_galore_optimizer(args, model, dataset):
     training_steps = calculate_max_steps(args, dataset)
     optimizer, lr_scheduler = create_optimizer_and_scheduler(
         model, args, args.galore_config, training_steps, lr=args.learning_rate, weight_decay=args.weight_decay)
@@ -31,7 +31,7 @@ def create_galore_optimizers(args, model, dataset):
     return optimizer, lr_scheduler
 
 
-def create_lorap_optimizers(args, model, dataset):
+def create_lorap_optimizer(args, model, dataset):
     optimizer_grouped_parameters = None
     if hasattr(model, 'create_optimizer_param_groups'):
         # Lora+ parameter groups
@@ -55,7 +55,7 @@ def create_lorap_optimizers(args, model, dataset):
     return optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs), None
 
 
-def create_muon_optimizers(args, model, dataset):
+def create_muon_optimizer(args, model, dataset):
     from swift.llm import git_clone_github, get_model_arch
     if not args.local_repo_path:
         args.local_repo_path = git_clone_github('https://github.com/MoonshotAI/Moonlight.git')
@@ -94,7 +94,7 @@ def create_muon_optimizers(args, model, dataset):
 
 # Add your own optimizers here, use --optimizer xxx to train
 optimizers_map = {
-    'galore': create_galore_optimizers,
-    'lorap': create_lorap_optimizers,
-    'muon': create_muon_optimizers,
+    'galore': create_galore_optimizer,
+    'lorap': create_lorap_optimizer,
+    'muon': create_muon_optimizer,
 }
diff --git a/swift/plugin/tuner.py b/swift/plugin/tuner.py
@@ -1,4 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import Optional
+
 import torch
 from peft import IA3Config, PeftModel, get_peft_model
 
@@ -9,7 +11,7 @@
 class Tuner:
 
     @staticmethod
-    def prepare_model(args: 'TrainArguments', model: torch.nn.Module):
+    def prepare_model(args: 'TrainArguments', model: torch.nn.Module) -> torch.nn.Module:
         """Prepare a new model with a tuner
 
         Args:
@@ -25,9 +27,10 @@ def prepare_model(args: 'TrainArguments', model: torch.nn.Module):
     def save_pretrained(
         model: torch.nn.Module,
         save_directory: str,
+        state_dict: Optional[dict] = None,
         safe_serialization: bool = True,
         **kwargs,
-    ):
+    ) -> None:
         """Save when save_steps reaches
 
         Args:
@@ -38,7 +41,7 @@ def save_pretrained(
         raise NotImplementedError
 
     @staticmethod
-    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs):
+    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs) -> torch.nn.Module:
         """Load the ckpt_dir
 
         Args:
@@ -56,22 +59,22 @@ class PeftTuner(Tuner):
     def save_pretrained(
         model: torch.nn.Module,
         save_directory: str,
+        state_dict: Optional[dict] = None,  # forwarded to `model.save_pretrained`, not dropped
         safe_serialization: bool = True,
         **kwargs,
-    ):
-        model: PeftModel
-        model.save_pretrained(save_directory, safe_serialization=safe_serialization, **kwargs)
+    ) -> None:
+        model.save_pretrained(save_directory, state_dict=state_dict, safe_serialization=safe_serialization, **kwargs)
 
     @staticmethod
-    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs):
+    def from_pretrained(model: torch.nn.Module, model_id: str, **kwargs) -> torch.nn.Module:
         return PeftModel.from_pretrained(model, model_id, **kwargs)
 
 
 # Here gives a simple example of IA3
 class IA3(PeftTuner):
 
     @staticmethod
-    def prepare_model(args: 'TrainArguments', model: torch.nn.Module):
+    def prepare_model(args: 'TrainArguments', model: torch.nn.Module) -> torch.nn.Module:
         model_arch: ModelKeys = MODEL_ARCH_MAPPING[model.model_meta.model_arch]
         ia3_config = IA3Config(
             target_modules=find_all_linears(model), feedforward_modules='.*' + model_arch.mlp.split('{}.')[1] + '.*')
@@ -81,7 +84,7 @@ def prepare_model(args: 'TrainArguments', model: torch.nn.Module):
 class DummyTuner(PeftTuner):
 
     @staticmethod
-    def prepare_model(args: 'TrainArguments', model: torch.nn.Module):
+    def prepare_model(args: 'TrainArguments', model: torch.nn.Module) -> torch.nn.Module:
         return model
 
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+swift export \`
	`2`	`+ --adapters output/vx-xxx/checkpoint-xxx \`
	`3`	`+ --merge_lora true`