Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions autotest/cluster/clusterx.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def execute_task(self, task_config: Dict[str, Any]):
num_nodes=resource.get("num_nodes", 1),
image=resource.get("image", None),
no_env=resource.get("no_env", True),
image_pull_policy=resource.get("image_pull_policy","Always"),
)

job_schema = self.cluster.run(params)
Expand Down
48 changes: 48 additions & 0 deletions autotest/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,54 @@ case:
runtime_info/text_tokens: 0
timeout: 1080

# CI case: deterministic Qwen3-30B-A3B MoE SFT run that exercises the
# tokenized-dataset cache (CACHE_DIR env var consumed by
# autotest/config/qwen3_sft_cache.py).
# Near-zero check_metrics deltas are presumably exact-match tolerances
# enabled by XTUNER_DETERMINISTIC=true — confirm against the runner.
qwen3-sft-cache:
-
type: sft
parameters:
config: autotest/config/qwen3_sft_cache.py
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
resource:
cpus_per_task: 80
envs:
- QWEN3_MOE_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-30B-A3B
- ALPACA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
- CACHE_DIR=/mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/.cache
- XTUNER_DETERMINISTIC=true
assert_info:
# Baseline metrics recorded under commit/run id e968368a.
base_metric: qwen3-sft-cache/e968368a/tracker.jsonl
check_metrics:
grad_norm: 0.000001
loss/reduced_llm_loss: 0.000001
lr: 0
memory/max_memory_GB: 0.2
runtime_info/tgs: 0.05
runtime_info/text_tokens: 0
timeout: 10800

# CI case: Qwen3.5-35B-A3B vision-language MoE SFT smoke run
# (config: autotest/config/qwen3_5_30B_sft_vl.py).
# Looser grad_norm/loss tolerances (5) than the deterministic cache case —
# presumably because VL training is not bit-reproducible here; confirm.
qwen3-5-sft-vl-moe:
-
type: sft
parameters:
config: autotest/config/qwen3_5_30B_sft_vl.py
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
resource:
cpus_per_task: 80
envs:
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- XTUNER_DETERMINISTIC=true
assert_info:
# Baseline metrics recorded under commit/run id e968368a.
base_metric: qwen3-5-sft-vl-moe/e968368a/tracker.jsonl
check_metrics:
grad_norm: 5
loss/reduced_llm_loss: 5
lr: 0
memory/max_memory_GB: 0.2
runtime_info/tgs: 0.05
runtime_info/text_tokens: 0
timeout: 10800

qwen3-rl-lmdeploy:
-
type: rl
Expand Down
4 changes: 2 additions & 2 deletions autotest/config/gptoss.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
FSDPConfig,
LRConfig,
)
from xtuner.v1.datasets import FTDPTokenizeFnConfig
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
from xtuner.v1.loss.ce_loss import CELossConfig
from xtuner.v1.model.moe.gpt_oss import GptOss21BA3P6Config
from xtuner.v1.module.rope import RopeScalingConfig
Expand Down Expand Up @@ -38,7 +38,7 @@
dataset_config = [
{
"dataset": DatasetConfig(name="alpaca", anno_path=ALPACA_PATH, sample_ratio=1.0),
"tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
"tokenize_fn": OpenaiTokenizeFunctionConfig(chat_template='gpt-oss', max_length=16384),
},
]

Expand Down
63 changes: 63 additions & 0 deletions autotest/config/qwen3_5_30B_sft_vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""CI training config: SFT for the Qwen3.5-35B-A3B vision-language MoE model.

Consumed by the autotest runner (case ``qwen3-5-sft-vl-moe`` in
``autotest/config.yaml``).  Required environment variables:

* ``MODEL_PATH``  -- checkpoint directory, also used as tokenizer/processor path.
* ``DATA_PATH``   -- annotation path for the VLM jsonl dataset.
* ``MEDIA_ROOT``  -- root that media paths in the annotations resolve against.
* ``WORK_DIR``    -- trainer output/working directory.

Importing this module raises ``KeyError`` if any of them is missing.
"""

import os

from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
from xtuner.v1.datasets import Qwen3VLTokenizeFnConfig
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
from xtuner.v1.loss.ce_loss import CELossConfig
from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
from xtuner.v1.train import TrainerConfig


MEDIA_ROOT = os.environ["MEDIA_ROOT"]
MODEL_PATH = os.environ["MODEL_PATH"]
DATA_PATH = os.environ["DATA_PATH"]


moe_cfg = Qwen3_5_VLMoE35BA3Config()

optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(
    torch_compile=True,
    cpu_offload=False,
)

# Single VLM jsonl dataset; media paths in the annotations are resolved
# relative to MEDIA_ROOT.
dataset_config = [
    {
        "dataset": DatasetConfig(
            name="sft",
            anno_path=DATA_PATH,
            class_name="VLMJsonlDataset",
            media_root=MEDIA_ROOT,
            sample_ratio=1.0,
        ),
        "tokenize_fn": Qwen3VLTokenizeFnConfig(
            processor_path=MODEL_PATH,
            max_length=16384,
            add_vision_id=True,
        ),
    },
]

# NOTE(review): the dataset list is threaded through DataloaderConfig here,
# while autotest/config/qwen3_sft_cache.py passes it to TrainerConfig via
# dataset_cfg — confirm both forms are accepted by the trainer.
dataloader_config = DataloaderConfig(
    dataset_config_list=dataset_config,
    pack_max_length=16384,
    collator="qwen3_vl_sft_collator",
)

loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)

trainer = TrainerConfig(
    load_from=MODEL_PATH,
    model_cfg=moe_cfg,
    optim_cfg=optim_cfg,
    fsdp_cfg=fsdp_cfg,
    dataloader_cfg=dataloader_config,
    lr_cfg=lr_cfg,
    loss_cfg=loss_cfg,
    tokenizer_path=MODEL_PATH,
    global_batch_size=16,
    total_epoch=1,
    # was f"{os.environ['WORK_DIR']}" -- the f-string wrapper was redundant
    work_dir=os.environ["WORK_DIR"],
    seed=0,
)
55 changes: 55 additions & 0 deletions autotest/config/qwen3_sft_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""CI training config: deterministic Qwen3-30B-A3B MoE SFT with dataset caching.

Consumed by the autotest runner (case ``qwen3-sft-cache`` in
``autotest/config.yaml``).  Required environment variables:

* ``QWEN3_MOE_PATH`` -- checkpoint directory, also used as tokenizer path.
* ``ALPACA_PATH``    -- alpaca annotation path.
* ``CACHE_DIR``      -- directory for the tokenized-dataset cache this case exercises.
* ``WORK_DIR``       -- trainer output/working directory.

Importing this module raises ``KeyError`` if any of them is missing.
"""

import os

from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
from xtuner.v1.datasets import FTDPTokenizeFnConfig
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
from xtuner.v1.loss.ce_loss import CELossConfig
from xtuner.v1.model.moe.qwen3 import Qwen3MoE30BA3Config
from xtuner.v1.train import TrainerConfig


QWEN3_MOE_PATH = os.environ["QWEN3_MOE_PATH"]
ALPACA_PATH = os.environ["ALPACA_PATH"]
CACHE_DIR = os.environ["CACHE_DIR"]


moe_cfg = Qwen3MoE30BA3Config()
optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(
    # torch.compile disabled -- presumably for the deterministic baseline
    # comparison this CI case does; confirm before enabling.
    torch_compile=False,
    cpu_offload=False,
    ep_size=moe_cfg.ep_size,  # expert-parallel size mirrors the model config
)

# Alpaca SFT dataset; cache_dir routes tokenization caching to shared storage,
# which is the behavior this case exists to exercise.
dataset_config = [
    {
        "dataset": DatasetConfig(
            name="alpaca",
            anno_path=ALPACA_PATH,
            sample_ratio=1.0,
            cache_dir=CACHE_DIR,
        ),
        "tokenize_fn": FTDPTokenizeFnConfig(max_length=16384),
    },
]

dataloader_config = DataloaderConfig(pack_max_length=16384)

loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)

trainer = TrainerConfig(
    load_from=QWEN3_MOE_PATH,
    model_cfg=moe_cfg,
    optim_cfg=optim_cfg,
    fsdp_cfg=fsdp_cfg,
    dataset_cfg=dataset_config,
    dataloader_cfg=dataloader_config,
    lr_cfg=lr_cfg,
    loss_cfg=loss_cfg,
    tokenizer_path=QWEN3_MOE_PATH,
    global_batch_size=16,
    total_epoch=1,
    # was f"{os.environ['WORK_DIR']}" -- the f-string wrapper was redundant
    work_dir=os.environ["WORK_DIR"],
    seed=0,
)
Loading