
Commit 1a43259: Add Phi4 support
1 parent aa8f365

11 files changed: +1194 -0 lines

recipes/configs/phi4/evaluation.yaml

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
# Config for EleutherEvalRecipe in eleuther_eval.py
#
# To launch, run the following command:
# tune run eleuther_eval --config phi4/evaluation

output_dir: ./ # Not needed

# Model Arguments
model:
  _component_: torchtune.models.phi3.phi3_mini

# Checkpointer
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Phi-3-mini-4k-instruct
  checkpoint_files: [
    model-00001-of-00002.safetensors,
    model-00002-of-00002.safetensors
  ]
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: PHI3_MINI
resume_from_checkpoint: False

# Tokenizer
tokenizer:
  _component_: torchtune.models.phi3.phi3_mini_tokenizer
  path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model
  max_seq_len: null

# Environment
device: cuda
dtype: bf16
seed: 1234 # It is not recommended to change this seed because it matches EleutherAI's default seed

# EleutherAI specific eval args
tasks: ["truthfulqa_mc2"]
limit: null
max_seq_length: 4096
batch_size: 8
enable_kv_cache: True

# Quantization specific args
quantizer: null
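
The comments in this config give the full eval workflow: download the checkpoint it points at, then launch the Eleuther recipe. A minimal sketch, assuming the Phi-3 paths from this config and torchtune's standard key=value command-line overrides; the override values on the last line are illustrative, not part of the commit:

# Fetch the weights and tokenizer referenced by checkpoint_dir and tokenizer.path
tune download microsoft/Phi-3-mini-4k-instruct --output-dir /tmp/Phi-3-mini-4k-instruct --hf-token <HF_TOKEN>

# Run the EleutherAI eval harness with this config
tune run eleuther_eval --config phi4/evaluation

# Any field can be overridden at launch time, e.g. a different task list and batch size
tune run eleuther_eval --config phi4/evaluation tasks='["truthfulqa_mc2","hellaswag"]' batch_size=4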

recipes/configs/phi4/mini_full.yaml

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Phi3 Mini 4K Instruct model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download microsoft/Phi-3-mini-4k-instruct --output-dir /tmp/Phi-3-mini-4k-instruct --hf-token <HF_TOKEN>
#
# Run this config on 4 GPUs using the following:
# tune run --nproc_per_node 4 full_finetune_distributed --config phi4/mini_full
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training
# you can run:
# tune run --nproc_per_node 4 full_finetune_distributed --config phi4/mini_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single device full finetuning requires more memory optimizations. It's
# best to use mini_full_low_memory.yaml for those cases.

output_dir: /tmp/torchtune/phi3_mini/full # /tmp may be deleted by your system. Change it to your preference.

# Model arguments
model:
  _component_: torchtune.models.phi3.phi3_mini

# Tokenizer
tokenizer:
  _component_: torchtune.models.phi3.phi3_mini_tokenizer
  path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model
  max_seq_len: null

# Checkpointer
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Phi-3-mini-4k-instruct
  checkpoint_files: [
    model-00001-of-00002.safetensors,
    model-00002-of-00002.safetensors
  ]
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: PHI3_MINI
resume_from_checkpoint: False

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  packed: False # True increases speed
seed: null
shuffle: True

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 8 # Use to increase effective batch size
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  lr: 5e-6
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False # torch.compile the model + loss; True increases speed + decreases memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1
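
One point worth making explicit from the fine-tuning arguments above: the per-optimizer-step batch is batch_size x gradient_accumulation_steps x number of GPUs, so the defaults give 2 x 8 x 4 = 64 samples per step on the documented 4-GPU launch. A minimal sketch; the second command is an illustrative override, not part of the commit, and assumes the larger per-device batch fits in memory:

# 4-GPU launch as documented in the config header; effective batch size is
# batch_size * gradient_accumulation_steps * num_gpus = 2 * 8 * 4 = 64
tune run --nproc_per_node 4 full_finetune_distributed --config phi4/mini_full

# Illustrative: trade accumulation steps for per-device batch size while
# keeping the same effective batch size of 64
tune run --nproc_per_node 4 full_finetune_distributed --config phi4/mini_full batch_size=4 gradient_accumulation_steps=4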
recipes/configs/phi4/mini_full_low_memory.yaml

Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
# Config for single device full finetuning in full_finetune_single_device.py
# using a Phi3 Mini 4K Instruct model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download microsoft/Phi-3-mini-4k-instruct --output-dir /tmp/Phi-3-mini-4k-instruct --hf-token <HF_TOKEN>
#
# The default config uses an optimizer from bitsandbytes. If you do not have it installed,
# you can install it with
# pip install bitsandbytes
#
# To launch on a single device, run the following command from root:
# tune run full_finetune_single_device --config phi4/mini_full_low_memory
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training
# you can run:
# tune run full_finetune_single_device --config phi4/mini_full_low_memory checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.

output_dir: /tmp/torchtune/phi3_mini/full_low_memory # /tmp may be deleted by your system. Change it to your preference.

# Model arguments
model:
  _component_: torchtune.models.phi3.phi3_mini

# Tokenizer
tokenizer:
  _component_: torchtune.models.phi3.phi3_mini_tokenizer
  path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model
  max_seq_len: null

# Checkpointer
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Phi-3-mini-4k-instruct
  checkpoint_files: [
    model-00001-of-00002.safetensors,
    model-00002-of-00002.safetensors
  ]
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: PHI3_MINI
resume_from_checkpoint: False

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  packed: False # True increases speed
seed: null
shuffle: True

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 1 # Use to increase effective batch size
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW
  lr: 5e-6
optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False # torch.compile the model + loss; True increases speed + decreases memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: True # True reduces memory
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1
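
This recipe stacks four of the memory levers named in its comments: a paged AdamW optimizer from bitsandbytes, stepping the optimizer inside the backward pass (optimizer_in_bwd), activation checkpointing, and activation offloading. A minimal sketch of the documented launch; the last command is an illustrative override, not part of the commit, showing that gradient accumulation only becomes available once fused-backward stepping is off:

# One-time dependency for bitsandbytes.optim.PagedAdamW
pip install bitsandbytes

# Single-device launch as documented in the config header
tune run full_finetune_single_device --config phi4/mini_full_low_memory

# Illustrative: optimizer_in_bwd=True requires gradient_accumulation_steps=1,
# so disabling it is what allows accumulation to be raised again
tune run full_finetune_single_device --config phi4/mini_full_low_memory optimizer_in_bwd=False gradient_accumulation_steps=8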

recipes/configs/phi4/mini_lora.yaml

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Phi3 mini (3.8B) model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download microsoft/Phi-3-mini-4k-instruct --output-dir /tmp/Phi-3-mini-4k-instruct --hf-token <HF_TOKEN>
#
# To launch on 2 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config phi4/mini_lora
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config phi4/mini_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device LoRA finetuning please use mini_lora_single_device.yaml
# or mini_qlora_single_device.yaml.

output_dir: /tmp/torchtune/phi3_mini/lora # /tmp may be deleted by your system. Change it to your preference.

# Model arguments
model:
  _component_: torchtune.models.phi3.lora_phi3_mini
  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
  apply_lora_to_mlp: True
  apply_lora_to_output: False
  lora_rank: 8 # higher increases accuracy and memory
  lora_alpha: 16 # usually alpha=2*rank
  lora_dropout: 0.0

# Tokenizer
tokenizer:
  _component_: torchtune.models.phi3.phi3_mini_tokenizer
  path: /tmp/Phi-3-mini-4k-instruct/tokenizer.model
  max_seq_len: null

# Checkpointer
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Phi-3-mini-4k-instruct
  checkpoint_files: [
    model-00001-of-00002.safetensors,
    model-00002-of-00002.safetensors
  ]
  recipe_checkpoint: null
  output_dir: ${output_dir}
  model_type: PHI3_MINI
resume_from_checkpoint: False
save_adapter_weights_only: False

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  packed: False # True increases speed
seed: null
shuffle: True

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 8 # Use to increase effective batch size
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
  num_warmup_steps: 100
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False # torch.compile the model + loss; True increases speed + decreases memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 3
  active_steps: 2
  num_cycles: 1
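
The adapter hyperparameters in this config move together: its own comments note that alpha is usually 2x the rank, and that a higher rank increases both accuracy and memory. A minimal sketch of the documented 2-device launch plus an illustrative higher-capacity variant; the override values are not part of the commit:

# 2-GPU LoRA launch as documented in the config header
tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config phi4/mini_lora

# Illustrative: double the adapter rank while keeping the alpha = 2 * rank
# convention from the config comments
tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config phi4/mini_lora model.lora_rank=16 model.lora_alpha=32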
