diff --git a/autotest/cluster/clusterx.py b/autotest/cluster/clusterx.py
index fc7a3033c..1ff74622f 100644
--- a/autotest/cluster/clusterx.py
+++ b/autotest/cluster/clusterx.py
@@ -50,6 +50,7 @@ def execute_task(self, task_config: Dict[str, Any]):
             num_nodes=resource.get("num_nodes", 1),
             image=resource.get("image", None),
             no_env=resource.get("no_env", True),
+            image_pull_policy=resource.get("image_pull_policy","Always"),
         )
 
         job_schema = self.cluster.run(params)
diff --git a/autotest/config.yaml b/autotest/config.yaml
index 54d14cec9..0a0caf2d8 100644
--- a/autotest/config.yaml
+++ b/autotest/config.yaml
@@ -277,6 +277,149 @@ case:
           runtime_info/text_tokens: 0
       timeout: 1080
 
+  qwen3-sft-cache:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/qwen3_sft_cache.py
+        output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
+      resource:
+        cpus_per_task: 80
+        envs:
+          - QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-30B-A3B
+          - ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
+          - CACHE_DIR=/mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/.cache
+          - XTUNER_DETERMINISTIC=true
+      assert_info:
+        base_metric: qwen3-sft-cache/e968368a/tracker.jsonl
+        check_metrics:
+          grad_norm: 0.000001
+          loss/reduced_llm_loss: 0.000001
+          lr: 0
+          memory/max_memory_GB: 0.2
+          runtime_info/tgs: 0.05
+          runtime_info/text_tokens: 0
+      timeout: 10800
+
+  qwen3-sft-vl-dense:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/qwen3_vl_8B_dense.py
+        output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
+      resource:
+        envs:
+          - MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-VL-8B-Instruct
+          - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
+          - MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
+          - XTUNER_DETERMINISTIC=true
+      assert_info:
+        base_metric: qwen3-sft-vl-dense/812c1021/tracker.jsonl
+        check_metrics:
+          grad_norm: 0.000001
+          loss/reduced_llm_loss: 0.000001
+          lr: 0
+          memory/max_memory_GB: 0.2
+          runtime_info/tgs: 0.05
+          runtime_info/text_tokens: 0
+      timeout: 10800
+
+  qwen3-5-sft-vl-moe:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/qwen3_5_30B_sft_vl.py
+        output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
+      resource:
+        cpus_per_task: 80
+        envs:
+          - MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
+          - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
+          - MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
+          - XTUNER_DETERMINISTIC=true
+      assert_info:
+        base_metric: qwen3-5-sft-vl-moe/e968368a/tracker.jsonl
+        check_metrics:
+          grad_norm: 5
+          loss/reduced_llm_loss: 5
+          lr: 0
+          memory/max_memory_GB: 0.2
+          runtime_info/tgs: 0.05
+          runtime_info/text_tokens: 0
+      timeout: 10800
+
+  qwen3-5-sft-fp8:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/qwen3_5_fp8.py
+        output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
+      resource:
+        num_nodes: 1
+        cpus_per_task: 80
+        envs:
+          - QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
+          - ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
+          - XTUNER_DETERMINISTIC=true
+          - XTUNER_ACTIVATION_OFFLOAD=1
+          - XTUNER_USE_FA3=1
+      assert_info:
+        base_metric: qwen3-5-sft-fp8/625c0018/tracker.jsonl
+        check_metrics:
+          grad_norm: 0.1
+          loss/reduced_llm_loss: 0.000001
+          lr: 0
+          memory/max_memory_GB: 0.2
+          runtime_info/tgs: 0.05
+          runtime_info/text_tokens: 0
+      timeout: 10800
+
+  qwen3-5-sft-recompute:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/qwen3_5_recompute.py
+        output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
+      resource:
+        num_nodes: 2
+        cpus_per_task: 80
+        envs:
+          - QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
+          - ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
+          - XTUNER_DETERMINISTIC=true
+      assert_info:
+        base_metric: qwen3-5-sft-recompute/625c0018/tracker.jsonl
+        check_metrics:
+          grad_norm: 0.000001
+          loss/reduced_llm_loss: 0.000001
+          lr: 0
+          memory/max_memory_GB: 0.2
+          runtime_info/tgs: 0.05
+          runtime_info/text_tokens: 0
+      timeout: 10800
+
+  qwen3-5-sft-tp2:
+    -
+      type: sft
+      parameters:
+        config: autotest/config/qwen3_5_moe_30BA3_tp2.py
+        output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
+      resource:
+        envs:
+          - QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
+          - ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
+          - XTUNER_DETERMINISTIC=true
+      assert_info:
+        base_metric: qwen3-5-sft-tp2/625c0018/tracker.jsonl
+        check_metrics:
+          grad_norm: 0.05
+          loss/reduced_llm_loss: 0.000001
+          lr: 0
+          memory/max_memory_GB: 0.2
+          runtime_info/tgs: 0.05
+          runtime_info/text_tokens: 0
+      timeout: 10800
+
   qwen3-rl-lmdeploy:
     -
       type: rl
diff --git a/autotest/config/gptoss.py b/autotest/config/gptoss.py
index 5a14f9972..334b5d2fd 100644
--- a/autotest/config/gptoss.py
+++ b/autotest/config/gptoss.py
@@ -5,8 +5,8 @@
     FSDPConfig,
     LRConfig,
 )
-from xtuner.v1.datasets import FTDPTokenizeFnConfig
 from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
 from xtuner.v1.loss.ce_loss import CELossConfig
 from xtuner.v1.model.moe.gpt_oss import GptOss21BA3P6Config
 from xtuner.v1.module.rope import RopeScalingConfig
@@ -38,7 +38,7 @@
 dataset_config = [
     {
         "dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
-        "tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
+        "tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template='gpt-oss', max_length=16384),
     },
 ]
 
diff --git a/autotest/config/qwen3_5_30B_sft_vl.py b/autotest/config/qwen3_5_30B_sft_vl.py
new file mode 100644
index 000000000..e59d8bb70
--- /dev/null
+++ b/autotest/config/qwen3_5_30B_sft_vl.py
@@ -0,0 +1,63 @@
+import os
+
+from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
+from xtuner.v1.datasets import Qwen3VLTokenizeFnConfig
+from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
+from xtuner.v1.train import TrainerConfig
+
+
+MEDIA_ROOT = os.environ["MEDIA_ROOT"]
+MODEL_PATH = os.environ["MODEL_PATH"]
+DATA_PATH = os.environ["DATA_PATH"]
+
+
+moe_cfg = Qwen3_5_VLMoE35BA3Config()
+
+optim_cfg = AdamWConfig(lr=6e-05)
+lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
+fsdp_cfg = FSDPConfig(
+    torch_compile=True,
+    cpu_offload=False,
+)
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(
+            name="sft",
+            anno_path=DATA_PATH,
+            class_name="VLMJsonlDataset",
+            media_root=MEDIA_ROOT,
+            sample_ratio=1.0,
+        ),
+        "tokenize_fn": Qwen3VLTokenizeFnConfig(
+            processor_path=MODEL_PATH,
+            max_length=16384,
+            add_vision_id=True,
+        ),
+    },
+]
+
+dataloader_config = DataloaderConfig(
+    dataset_config_list=dataset_config,
+    pack_max_length=16384,
+    collator="qwen3_vl_sft_collator",
+)
+
+loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)
+
+trainer = TrainerConfig(
+    load_from=MODEL_PATH,
+    model_cfg=moe_cfg,
+    optim_cfg=optim_cfg,
+    fsdp_cfg=fsdp_cfg,
+    dataloader_cfg=dataloader_config,
+    lr_cfg=lr_cfg,
+    loss_cfg=loss_cfg,
+    tokenizer_path=MODEL_PATH,
+    global_batch_size=16,
+    total_epoch=1,
+    work_dir=f"{os.environ['WORK_DIR']}",
+    seed=0,
+)
diff --git a/autotest/config/qwen3_5_fp8.py b/autotest/config/qwen3_5_fp8.py
new file mode 100644
index 000000000..2ebdb7fb8
--- /dev/null
+++ b/autotest/config/qwen3_5_fp8.py
@@ -0,0 +1,59 @@
+import os
+
+from xtuner.v1.config import (
+    AdamWConfig,
+    FSDPConfig,
+    LRConfig,
+)
+from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
+from xtuner.v1.float8.config import Float8Config, ScalingGranularity
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
+from xtuner.v1.train import TrainerConfig
+
+
+QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
+ALPACA_PATH = os.environ["ALPACA_PATH"]
+
+float8_cfg = Float8Config(
+    scaling_granularity_gemm=ScalingGranularity.TILEWISE,
+    scaling_granularity_grouped_gemm=ScalingGranularity.TILEWISE,
+)
+
+moe_cfg = Qwen3_5_VLMoE35BA3Config(float8_cfg=float8_cfg)
+optim_cfg = AdamWConfig(lr=6e-05)
+lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
+fsdp_cfg = FSDPConfig(
+    torch_compile=False,
+    cpu_offload=False,
+)
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
+        "tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template='qwen3', max_length=16384),
+    },
+]
+
+dataloader_config = DataloaderConfig(pack_max_length=16384)
+
+loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)
+
+
+trainer = TrainerConfig(
+    load_from=QWEN3_MOE_PATH,
+    model_cfg=moe_cfg,
+    optim_cfg=optim_cfg,
+    fsdp_cfg=fsdp_cfg,
+    dataset_cfg=dataset_config,
+    dataloader_cfg=dataloader_config,
+    lr_cfg=lr_cfg,
+    loss_cfg=loss_cfg,
+    tokenizer_path=QWEN3_MOE_PATH,
+    global_batch_size=16,
+    intra_layer_micro_batch=1,
+    total_epoch=1,
+    work_dir=f"{os.environ['WORK_DIR']}",
+    seed=0,
+)
diff --git a/autotest/config/qwen3_5_moe_30BA3_tp2.py b/autotest/config/qwen3_5_moe_30BA3_tp2.py
new file mode 100644
index 000000000..3658db284
--- /dev/null
+++ b/autotest/config/qwen3_5_moe_30BA3_tp2.py
@@ -0,0 +1,54 @@
+import os
+
+from xtuner.v1.config import (
+    AdamWConfig,
+    FSDPConfig,
+    LRConfig,
+)
+from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
+from xtuner.v1.train import TrainerConfig
+
+
+QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
+ALPACA_PATH = os.environ["ALPACA_PATH"]
+
+
+moe_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=False)
+optim_cfg = AdamWConfig(lr=6e-05)
+lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
+fsdp_cfg = FSDPConfig(
+    torch_compile=True,
+    cpu_offload=False,
+    tp_size=2,
+)
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
+        "tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template='qwen3', max_length=16384),
+    },
+]
+
+dataloader_config = DataloaderConfig(pack_max_length=16384)
+
+loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)
+
+
+trainer = TrainerConfig(
+    load_from=QWEN3_MOE_PATH,
+    model_cfg=moe_cfg,
+    optim_cfg=optim_cfg,
+    fsdp_cfg=fsdp_cfg,
+    dataset_cfg=dataset_config,
+    dataloader_cfg=dataloader_config,
+    lr_cfg=lr_cfg,
+    loss_cfg=loss_cfg,
+    tokenizer_path=QWEN3_MOE_PATH,
+    global_batch_size=16,
+    total_epoch=1,
+    work_dir=f"{os.environ['WORK_DIR']}",
+    seed=0,
+)
diff --git a/autotest/config/qwen3_5_recompute.py b/autotest/config/qwen3_5_recompute.py
new file mode 100644
index 000000000..16a7d4c91
--- /dev/null
+++ b/autotest/config/qwen3_5_recompute.py
@@ -0,0 +1,55 @@
+import os
+import torch
+
+from xtuner.v1.config import (
+    AdamWConfig,
+    FSDPConfig,
+    LRConfig,
+)
+from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
+from xtuner.v1.train import TrainerConfig
+
+
+QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
+ALPACA_PATH = os.environ["ALPACA_PATH"]
+
+
+moe_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=True)
+optim_cfg = AdamWConfig(lr=6e-05)
+lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
+fsdp_cfg = FSDPConfig(
+    torch_compile=True,
+    cpu_offload=True,
+    recompute_ratio=0.25,
+)
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
+        "tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template='qwen3', max_length=16384),
+    },
+]
+
+dataloader_config = DataloaderConfig(pack_max_length=16384)
+
+loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)
+
+
+trainer = TrainerConfig(
+    load_from=QWEN3_MOE_PATH,
+    model_cfg=moe_cfg,
+    optim_cfg=optim_cfg,
+    fsdp_cfg=fsdp_cfg,
+    dataset_cfg=dataset_config,
+    dataloader_cfg=dataloader_config,
+    lr_cfg=lr_cfg,
+    loss_cfg=loss_cfg,
+    tokenizer_path=QWEN3_MOE_PATH,
+    global_batch_size=32,
+    total_epoch=1,
+    work_dir=f"{os.environ['WORK_DIR']}",
+    seed=0,
+)
diff --git a/autotest/config/qwen3_sft_cache.py b/autotest/config/qwen3_sft_cache.py
new file mode 100644
index 000000000..96f38627d
--- /dev/null
+++ b/autotest/config/qwen3_sft_cache.py
@@ -0,0 +1,55 @@
+import os
+
+from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
+from xtuner.v1.datasets import FTDPTokenizeFnConfig
+from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
+from xtuner.v1.loss.ce_loss import CELossConfig
+from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
+from xtuner.v1.train import TrainerConfig
+
+
+QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
+ALPACA_PATH = os.environ["ALPACA_PATH"]
+CACHE_DIR = os.environ["CACHE_DIR"]
+
+
+moe_cfg = Qwen3MoE30BA3Config()
+optim_cfg = AdamWConfig(lr=6e-05)
+lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
+fsdp_cfg = FSDPConfig(
+    torch_compile=False,
+    cpu_offload=False,
+    ep_size=moe_cfg.ep_size,
+)
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(
+            name="alpaca",
+            anno_path=ALPACA_PATH,
+            sample_ratio=1.0,
+            cache_dir=CACHE_DIR,
+        ),
+        "tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
+    },
+]
+
+dataloader_config = DataloaderConfig(pack_max_length=16384)
+
+loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)
+
+trainer = TrainerConfig(
+    load_from=QWEN3_MOE_PATH,
+    model_cfg=moe_cfg,
+    optim_cfg=optim_cfg,
+    fsdp_cfg=fsdp_cfg,
+    dataset_cfg=dataset_config,
+    dataloader_cfg=dataloader_config,
+    lr_cfg=lr_cfg,
+    loss_cfg=loss_cfg,
+    tokenizer_path=QWEN3_MOE_PATH,
+    global_batch_size=16,
+    total_epoch=1,
+    work_dir=f"{os.environ['WORK_DIR']}",
+    seed=0,
+)
diff --git a/autotest/config/qwen3_vl_8B_dense.py b/autotest/config/qwen3_vl_8B_dense.py
new file mode 100644
index 000000000..87366f473
--- /dev/null
+++ b/autotest/config/qwen3_vl_8B_dense.py
@@ -0,0 +1,58 @@
+import os
+
+from xtuner.v1.config import AdamWConfig, LRConfig
+from xtuner.v1.datasets import Qwen3VLTokenizeFnConfig
+from xtuner.v1.datasets.config import DatasetConfig, DataloaderConfig
+from xtuner.v1.loss import CELossConfig
+from xtuner.v1.model import Qwen3VLDense8BConfig
+from xtuner.v1.train import TrainerConfig
+
+MODEL_PATH = os.environ["MODEL_PATH"]
+DATA_PATH = os.environ["DATA_PATH"]
+MEDIA_ROOT = os.environ["MEDIA_ROOT"]
+
+# model config
+model_cfg = Qwen3VLDense8BConfig()
+
+
+dataset_config = [
+    {
+        "dataset": DatasetConfig(
+            name="sft",
+            anno_path=DATA_PATH,
+            class_name="VLMJsonlDataset",
+            media_root=MEDIA_ROOT,
+            sample_ratio=1.0,
+        ),
+        "tokenize_fn": Qwen3VLTokenizeFnConfig(
+            processor_path=MODEL_PATH,
+            max_length=8192,
+            add_vision_id=True,
+        ),
+    },
+]
+
+dataloader_config = DataloaderConfig(
+    dataset_config_list=dataset_config,
+    pack_max_length=8192,
+    collator="qwen3_vl_sft_collator",
+)
+
+# optimizer and lr config
+optim_cfg = AdamWConfig(lr=1e-5, foreach=False)
+lr_cfg = LRConfig(lr_type="cosine", warmup_ratio=0.03, lr_min=1e-6)
+
+# trainer config
+trainer = TrainerConfig(
+    model_cfg=model_cfg,
+    load_from=MODEL_PATH,
+    tokenizer_path=MODEL_PATH,
+    dataloader_cfg=dataloader_config,
+    optim_cfg=optim_cfg,
+    lr_cfg=lr_cfg,
+    loss_cfg=CELossConfig(mode="chunk", chunk_size=1024),
+    global_batch_size=16,
+    total_epoch=1,
+    work_dir=f"{os.environ['WORK_DIR']}",
+    seed=0,
+)