Skip to content

Commit c09f9b6

Browse files
krammnic, felipemello1, Mark Obozov, and ebsmothers
authored Feb 11, 2025
Add Phi4 (#2197)
Co-authored-by: Felipe Mello <fmellomascarenhas@gmail.com> Co-authored-by: Mark Obozov <markobozov@MacBook-Pro-Mark.local> Co-authored-by: ebsmothers <ebs@meta.com>
1 parent 386ca8d commit c09f9b6

File tree

20 files changed

+101400
-10
lines changed

20 files changed

+101400
-10
lines changed
 

‎recipes/configs/phi3/evaluation.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ model:
1212
# Checkpointer
1313
checkpointer:
1414
_component_: torchtune.training.FullModelHFCheckpointer
15-
checkpoint_dir: /tmp/Phi-3-mini-4k-instruct
15+
checkpoint_dir: /tmp/phi-3
1616
checkpoint_files: [
1717
model-00001-of-00002.safetensors,
1818
model-00002-of-00002.safetensors
@@ -25,7 +25,7 @@ resume_from_checkpoint: False
2525
# Tokenizer
2626
tokenizer:
2727
_component_: torchtune.models.phi3.phi3_mini_tokenizer
28-
path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model
28+
path: /tmp/phi-3/tokenizer.model
2929
max_seq_len: null
3030

3131
# Environment

‎recipes/configs/phi4/evaluation.yaml

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Config for EleutherEvalRecipe in eleuther_eval.py
2+
#
3+
# To launch, run the following command:
4+
# tune run eleuther_eval --config phi4/evaluation
5+
6+
output_dir: ./ # Not needed
7+
8+
# Model Arguments
9+
model:
10+
_component_: torchtune.models.phi4.phi4_14b
11+
12+
# Checkpointer
13+
checkpointer:
14+
_component_: torchtune.training.FullModelHFCheckpointer
15+
checkpoint_dir: /tmp/phi-4
16+
checkpoint_files: [
17+
model-00001-of-00002.safetensors,
18+
model-00002-of-00002.safetensors
19+
]
20+
recipe_checkpoint: null
21+
output_dir: ${output_dir}
22+
model_type: PHI3_MINI
23+
resume_from_checkpoint: False
24+
25+
# Tokenizer
26+
tokenizer:
27+
_component_: torchtune.models.phi4.phi4_14b_tokenizer
28+
vocab_path: /tmp/phi-4/vocab.json
29+
merges_path: /tmp/phi-4/merges.txt
30+
max_seq_len: null
31+
32+
# Environment
33+
device: cuda
34+
dtype: bf16
35+
seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed
36+
37+
# EleutherAI specific eval args
38+
tasks: ["truthfulqa_mc2"]
39+
limit: null
40+
max_seq_length: 4096
41+
batch_size: 8
42+
enable_kv_cache: True
43+
44+
# Quantization specific args
45+
quantizer: null

‎recipes/configs/phi4/full.yaml

+110
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# Config for multi-device full finetuning in full_finetune_distributed.py
2+
# using a Phi-4 16K Instruct model
3+
#
4+
# This config assumes that you've run the following command before launching
5+
# this run:
6+
# tune download microsoft/phi-4 --output-dir /tmp/phi-4 --hf-token <HF_TOKEN>
7+
#
8+
# Run this config on 4 GPUs using the following:
9+
# tune run --nproc_per_node 4 full_finetune_distributed --config phi4/full
10+
#
11+
# You can add specific overrides through the command line. For example
12+
# to override the checkpointer directory while launching training
13+
# you can run:
14+
# tune run --nproc_per_node 4 full_finetune_distributed --config phi4/full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
15+
#
16+
# This config works best when the model is being fine-tuned on 2+ GPUs.
17+
# Single device full finetuning requires more memory optimizations. It's
18+
# best to use full_low_memory.yaml for those cases
19+
20+
output_dir: /tmp/torchtune/phi-4/full # /tmp may be deleted by your system. Change it to your preference.
21+
22+
# Model arguments
23+
model:
24+
_component_: torchtune.models.phi4.phi4_14b
25+
26+
# Tokenizer
27+
tokenizer:
28+
_component_: torchtune.models.phi4.phi4_14b_tokenizer
29+
vocab_path: /tmp/phi-4/vocab.json
30+
merges_path: /tmp/phi-4/merges.txt
31+
max_seq_len: null
32+
33+
# Checkpointer
34+
checkpointer:
35+
_component_: torchtune.training.FullModelHFCheckpointer
36+
checkpoint_dir: /tmp/phi-4
37+
checkpoint_files: [
38+
model-00001-of-00006.safetensors,
39+
model-00002-of-00006.safetensors,
40+
model-00003-of-00006.safetensors,
41+
model-00004-of-00006.safetensors,
42+
model-00005-of-00006.safetensors,
43+
model-00006-of-00006.safetensors,
44+
]
45+
recipe_checkpoint: null
46+
output_dir: ${output_dir}
47+
model_type: PHI3_MINI
48+
resume_from_checkpoint: False
49+
50+
# Dataset
51+
dataset:
52+
_component_: torchtune.datasets.alpaca_cleaned_dataset
53+
packed: False # True increases speed
54+
seed: null
55+
shuffle: True
56+
57+
# Fine-tuning arguments
58+
epochs: 1
59+
max_steps_per_epoch: null
60+
batch_size: 2
61+
gradient_accumulation_steps: 8 # Use to increase effective batch size
62+
optimizer:
63+
_component_: torch.optim.AdamW
64+
fused: True
65+
lr: 5e-6
66+
loss:
67+
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
68+
compile: False # torch.compile the model + loss, True increases speed + decreases memory
69+
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
70+
71+
# Training env
72+
device: cuda
73+
74+
# Memory management
75+
enable_activation_checkpointing: True # True reduces memory
76+
enable_activation_offloading: False # True reduces memory
77+
dtype: bf16
78+
79+
# Logging
80+
metric_logger:
81+
_component_: torchtune.training.metric_logging.DiskLogger
82+
log_dir: ${output_dir}/logs
83+
log_every_n_steps: 1
84+
log_peak_memory_stats: True
85+
86+
87+
# Profiler (disabled)
88+
profiler:
89+
_component_: torchtune.training.setup_torch_profiler
90+
enabled: False
91+
92+
#Output directory of trace artifacts
93+
output_dir: ${output_dir}/profiling_outputs
94+
95+
#`torch.profiler.ProfilerActivity` types to trace
96+
cpu: True
97+
cuda: True
98+
99+
#trace options passed to `torch.profiler.profile`
100+
profile_memory: False
101+
with_stack: False
102+
record_shapes: True
103+
with_flops: False
104+
105+
# `torch.profiler.schedule` options:
106+
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
107+
wait_steps: 5
108+
warmup_steps: 3
109+
active_steps: 2
110+
num_cycles: 1
+111
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# Config for single device full finetuning in full_finetune_single_device.py
2+
# using a Phi-4 16K Instruct model
3+
#
4+
# This config assumes that you've run the following command before launching
5+
# this run:
6+
# tune download microsoft/phi-4 --output-dir /tmp/phi-4 --hf-token <HF_TOKEN>
7+
#
8+
# The default config uses an optimizer from bitsandbytes. If you do not have it installed,
9+
# you can install it with
10+
# pip install bitsandbytes
11+
#
12+
# To launch on a single device, run the following command from root:
13+
# tune run full_finetune_single_device --config phi4/full_low_memory
14+
#
15+
# You can add specific overrides through the command line. For example
16+
# to override the checkpointer directory while launching training
17+
# you can run:
18+
# tune run full_finetune_single_device --config phi4/full_low_memory checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
19+
#
20+
# This config works only for training on single device.
21+
22+
output_dir: /tmp/torchtune/phi-4/full_low_memory # /tmp may be deleted by your system. Change it to your preference.
23+
24+
# Model arguments
25+
model:
26+
_component_: torchtune.models.phi4.phi4_14b
27+
28+
# Tokenizer
29+
tokenizer:
30+
_component_: torchtune.models.phi4.phi4_14b_tokenizer
31+
vocab_path: /tmp/phi-4/vocab.json
32+
merges_path: /tmp/phi-4/merges.txt
33+
max_seq_len: null
34+
35+
# Checkpointer
36+
checkpointer:
37+
_component_: torchtune.training.FullModelHFCheckpointer
38+
checkpoint_dir: /tmp/phi-4
39+
checkpoint_files: [
40+
model-00001-of-00006.safetensors,
41+
model-00002-of-00006.safetensors,
42+
model-00003-of-00006.safetensors,
43+
model-00004-of-00006.safetensors,
44+
model-00005-of-00006.safetensors,
45+
model-00006-of-00006.safetensors,
46+
]
47+
recipe_checkpoint: null
48+
output_dir: ${output_dir}
49+
model_type: PHI3_MINI
50+
resume_from_checkpoint: False
51+
52+
# Dataset
53+
dataset:
54+
_component_: torchtune.datasets.alpaca_cleaned_dataset
55+
packed: False # True increases speed
56+
seed: null
57+
shuffle: True
58+
59+
# Fine-tuning arguments
60+
epochs: 1
61+
max_steps_per_epoch: null
62+
batch_size: 2
63+
gradient_accumulation_steps: 1 # Use to increase effective batch size
64+
optimizer:
65+
_component_: bitsandbytes.optim.PagedAdamW
66+
lr: 5e-6
67+
optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
68+
loss:
69+
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
70+
compile: False # torch.compile the model + loss, True increases speed + decreases memory
71+
72+
# Training env
73+
device: cuda
74+
75+
# Memory management
76+
enable_activation_checkpointing: True # True reduces memory
77+
enable_activation_offloading: True # True reduces memory
78+
dtype: bf16
79+
80+
# Logging
81+
metric_logger:
82+
_component_: torchtune.training.metric_logging.DiskLogger
83+
log_dir: ${output_dir}/logs
84+
log_every_n_steps: 1
85+
log_peak_memory_stats: True
86+
87+
88+
# Profiler (disabled)
89+
profiler:
90+
_component_: torchtune.training.setup_torch_profiler
91+
enabled: False
92+
93+
#Output directory of trace artifacts
94+
output_dir: ${output_dir}/profiling_outputs
95+
96+
#`torch.profiler.ProfilerActivity` types to trace
97+
cpu: True
98+
cuda: True
99+
100+
#trace options passed to `torch.profiler.profile`
101+
profile_memory: False
102+
with_stack: False
103+
record_shapes: True
104+
with_flops: False
105+
106+
# `torch.profiler.schedule` options:
107+
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
108+
wait_steps: 5
109+
warmup_steps: 3
110+
active_steps: 2
111+
num_cycles: 1

0 commit comments

Comments
 (0)
Please sign in to comment.