diff --git a/apps/sft/deepseek_v3.yaml b/apps/sft/deepseek_v3.yaml
new file mode 100644
index 000000000..d20dc0d2b
--- /dev/null
+++ b/apps/sft/deepseek_v3.yaml
@@ -0,0 +1,70 @@
+# >>> python -m apps.sft.main --config apps/sft/deepseek_v3.yaml
+
+
+# TODO: required by torchtitan
+# https://github.com/pytorch/torchtitan/blob/2f1c814da071cc8ad165d00be6f9c1a66f8e1cce/torchtitan/distributed/utils.py#L265
+comm:
+  trace_buf_size: 0
+
+model_name: "deepseek-ai/DeepSeek-V3.1-Base"  # NOTE(review): looks like the full-size V3.1 checkpoint while model.flavor is 16B -- confirm they match
+
+model:
+  name: deepseek_v3
+  flavor: 16B
+  hf_assets_path: hf://${model_name}  # Interpolates the top-level `model_name` key.
+
+processes:
+  procs: 8
+  with_gpus: true
+
+optimizer:
+  name: AdamW
+  lr: 1e-5
+  eps: 1e-8
+
+lr_scheduler:
+  warmup_steps: 200
+
+training:
+  local_batch_size: 1
+  seq_len: 2048
+  max_norm: 1.0
+  steps: 1000
+  compile: false
+  dataset: "c4"
+
+parallelism:
+  data_parallel_replicate_degree: 1
+  data_parallel_shard_degree: -1  # NOTE(review): -1 presumably means "infer from the available ranks" -- confirm against torchtitan
+  tensor_parallel_degree: 1
+  pipeline_parallel_degree: 1
+  context_parallel_degree: 1
+  expert_parallel_degree: 1
+  disable_loss_parallel: false
+
+checkpoint:
+  enable: true
+  folder: ./checkpoint  # The folder to save checkpoints to.
+  initial_load_path: hf://${model_name}  # The path to load the initial checkpoint from. Ignored if `folder` exists.
+  initial_load_in_hf: true  # If true, interpret initial_load_path as a HuggingFace model repo.
+  last_save_in_hf: true  # NOTE(review): presumably saves the final checkpoint in HuggingFace format -- confirm
+  interval: 500
+  async_mode: "disabled"
+
+activation_checkpoint:
+  mode: selective
+  selective_ac_option: op
+
+metric_logging:
+  wandb:
+    project: sft-training
+    group: sft_exp_${oc.env:USER}  # Suffix is read from the USER environment variable.
+    logging_mode: global_reduce  # One of: global_reduce, per_rank_reduce, per_rank_no_reduce
+
+# profiling:
+#   enable_profiling: false
+
+# metrics:
+#   log_freq: 10
+#   enable_tensorboard: true
+#   save_tb_folder: "tb"