14 files changed: +69 -49 lines changed

@@ -72,7 +72,8 @@ torchtune provides the following finetuning recipes for training on one or more
| DoRA/QDoRA Finetuning | ✅ | ✅ | ❌ | [lora_finetune_single_device](recipes/lora_finetune_single_device.py) <br> [lora_finetune_distributed](recipes/lora_finetune_distributed.py) | [Llama3 8B QDoRA single-device](recipes/configs/llama3/8B_qdora_single_device.yaml) <br> [Llama3 8B DoRA distributed](recipes/configs/llama3/8B_dora.yaml)
| Quantization-Aware Training | ❌ | ✅ | ❌ | [qat_distributed](recipes/qat_distributed.py) | [Llama3 8B QAT](recipes/configs/llama3/8B_qat_full.yaml)
| Quantization-Aware Training and LoRA Finetuning | ❌ | ✅ | ❌ | [qat_lora_finetune_distributed](recipes/qat_lora_finetune_distributed.py) | [Llama3 8B QAT](recipes/configs/llama3/8B_qat_lora.yaml)
- | Direct Preference Optimization | ✅ | ✅ | ❌ | [lora_dpo_single_device](recipes/lora_dpo_single_device.py) <br> [lora_dpo_distributed](recipes/lora_dpo_distributed.py) | [Llama2 7B single-device](recipes/configs/llama2/7B_lora_dpo_single_device.yaml) <br> [Llama2 7B distributed](recipes/configs/llama2/7B_lora_dpo.yaml)
+ | Direct Preference Optimization: Full Finetuning | ❌ | ✅ | ❌ | [full_dpo_distributed](recipes/full_dpo_distributed.py) | [Llama3.1 8B DPO](recipes/configs/llama3_1/8B_full_dpo.yaml)
+ | LoRA Direct Preference Optimization | ✅ | ✅ | ❌ | [lora_dpo_single_device](recipes/lora_dpo_single_device.py) <br> [lora_dpo_distributed](recipes/lora_dpo_distributed.py) | [Llama3.1 8B single-device](recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml) <br> [Llama3.1 8B distributed](recipes/configs/llama3_1/8B_lora_dpo.yaml)
| Proximal Policy Optimization | ✅ | ❌ | ❌ | [ppo_full_finetune_single_device](recipes/ppo_full_finetune_single_device.py) | [Mistral 7B](recipes/configs/mistral/7B_full_ppo_low_memory.yaml)
| LoRA Knowledge Distillation | ✅ | ✅ | ❌ | [knowledge_distillation_single_device](recipes/knowledge_distillation_single_device.py) <br> [knowledge_distillation_distributed](recipes/knowledge_distillation_distributed.py) | [Qwen2 1.5B -> 0.5B single-device](recipes/configs/qwen2/1.5B_to_0.5B_KD_lora_single_device.yaml) <br> [Qwen2 1.5B -> 0.5B distributed](recipes/configs/qwen2/1.5B_to_0.5B_KD_lora_distributed.yaml)
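
The two new DPO rows follow the same `tune run` launch pattern used in the config headers later in this diff. A minimal usage sketch (the process count and the single-device variant chosen here are illustrative, not part of this change):

```bash
# Full-finetune DPO, distributed recipe from the new table row
tune run --nnodes 1 --nproc_per_node 8 full_dpo_distributed --config llama3_1/8B_full_dpo

# LoRA DPO on a single device, using the Llama3.1 config linked in the table
tune run lora_dpo_single_device --config llama3_1/8B_lora_dpo_single_device
```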

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-14B-Instruct --output-dir /tmp/Qwen2_5-14B-Instruct
+ # tune download Qwen/Qwen2.5-14B-Instruct --output-dir /tmp/Qwen2.5-14B-Instruct
#
# To launch on a single device, run the following command from root:
# tune run lora_finetune_single_device --config qwen2_5/14B_lora_single_device

@@ -30,13 +30,13 @@ model:

tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-14B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-14B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-14B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-14B-Instruct/merges.txt
max_seq_len: null

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-14B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-14B-Instruct
checkpoint_files:
filename_format: model-{}-of-{}.safetensors
max_filename: "00008"
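
The two comment lines above give the corrected end-to-end flow for this config; a minimal sketch, assuming the download directory now matches what the tokenizer and checkpointer paths expect:

```bash
# Download to the directory the updated config points at
tune download Qwen/Qwen2.5-14B-Instruct --output-dir /tmp/Qwen2.5-14B-Instruct

# Launch with the config named in the header comment
tune run lora_finetune_single_device --config qwen2_5/14B_lora_single_device
```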

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-32B-Instruct --output-dir /tmp/Qwen2_5-32B-Instruct
+ # tune download Qwen/Qwen2.5-32B-Instruct --output-dir /tmp/Qwen2.5-32B-Instruct
#
# To launch on 8 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/32B_lora

@@ -28,13 +28,13 @@ model:

tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-32B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-32B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-32B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-32B-Instruct/merges.txt
max_seq_len: null

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-32B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-32B-Instruct
checkpoint_files:
filename_format: model-{}-of-{}.safetensors
max_filename: "00017"

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct
+ # tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2.5-3B-Instruct
#
# To launch on 2 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/3B_full

@@ -22,8 +22,8 @@ output_dir: /tmp/torchtune/qwen2_5_3B/full # /tmp may be deleted by your system.
# Tokenizer
tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-3B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-3B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-3B-Instruct/merges.txt
max_seq_len: null

# Dataset
...
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-3B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-3B-Instruct
checkpoint_files: [
model-00001-of-00002.safetensors,
model-00002-of-00002.safetensors,

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct
+ # tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2.5-3B-Instruct
#
# The default config uses an optimizer from bitsandbytes. If you do not have it installed,
# you can install it with

@@ -24,8 +24,8 @@ output_dir: /tmp/torchtune/qwen2_5_3B/full_single_device # /tmp may be deleted b
# Tokenizer
tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-3B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-3B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-3B-Instruct/merges.txt
max_seq_len: null

# Dataset
...
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-3B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-3B-Instruct
checkpoint_files: [
model-00001-of-00002.safetensors,
model-00002-of-00002.safetensors,

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct
+ # tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2.5-3B-Instruct
#
# To launch on 2 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/3B_lora

@@ -30,13 +30,13 @@ model:

tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-3B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-3B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-3B-Instruct/merges.txt
max_seq_len: null

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-3B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-3B-Instruct
checkpoint_files: [
model-00001-of-00002.safetensors,
model-00002-of-00002.safetensors,

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct
+ # tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2.5-3B-Instruct
#
# To launch on a single device, run the following command from root:
# tune run lora_finetune_single_device --config qwen2_5/3B_lora_single_device

@@ -29,13 +29,13 @@ model:

tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-3B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-3B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-3B-Instruct/merges.txt
max_seq_len: null

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-3B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-3B-Instruct
checkpoint_files: [
model-00001-of-00002.safetensors,
model-00002-of-00002.safetensors,

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-72B-Instruct --output-dir /tmp/Qwen2_5-72B-Instruct
+ # tune download Qwen/Qwen2.5-72B-Instruct --output-dir /tmp/Qwen2.5-72B-Instruct
#
# To launch on 8 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/72B_lora

@@ -28,13 +28,13 @@ model:

tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-72B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-72B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-72B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-72B-Instruct/merges.txt
max_seq_len: null

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-72B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-72B-Instruct
checkpoint_files:
filename_format: model-{}-of-{}.safetensors
max_filename: "00037"

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct
+ # tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2.5-7B-Instruct
#
# To launch on 2 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/7B_full

@@ -22,8 +22,8 @@ output_dir: /tmp/torchtune/qwen2_5_7B/full # /tmp may be deleted by your system.
# Tokenizer
tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-7B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-7B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-7B-Instruct/merges.txt
max_seq_len: null

# Dataset
...
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-7B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-7B-Instruct
checkpoint_files: [
model-00001-of-00004.safetensors,
model-00002-of-00004.safetensors,

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct
+ # tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2.5-7B-Instruct
#
# The default config uses an optimizer from bitsandbytes. If you do not have it installed,
# you can install it with

@@ -24,8 +24,8 @@ output_dir: /tmp/torchtune/qwen2_5_7B/full_single_device # /tmp may be deleted b
# Tokenizer
tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-7B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-7B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-7B-Instruct/merges.txt
max_seq_len: null

# Dataset
...
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-7B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-7B-Instruct
checkpoint_files: [
model-00001-of-00004.safetensors,
model-00002-of-00004.safetensors,

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct
+ # tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2.5-7B-Instruct
#
# To launch on 2 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/7B_lora

@@ -31,13 +31,13 @@ model:

tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-7B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-7B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-7B-Instruct/merges.txt
max_seq_len: null

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-7B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-7B-Instruct
checkpoint_files: [
model-00001-of-00004.safetensors,
model-00002-of-00004.safetensors,

#
# This config assumes that you've run the following command before launching
# this run:
- # tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct
+ # tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2.5-7B-Instruct
#
# To launch on a single device, run the following command from root:
# tune run lora_finetune_single_device --config qwen2_5/7B_lora_single_device

@@ -30,13 +30,13 @@ model:

tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-7B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-7B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-7B-Instruct/merges.txt
max_seq_len: null

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-7B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-7B-Instruct
checkpoint_files: [
model-00001-of-00004.safetensors,
model-00002-of-00004.safetensors,

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
- checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
+ checkpoint_dir: /tmp/Qwen2.5-0_5B-Instruct
checkpoint_files: [
model.safetensors,
]

@@ -21,8 +21,8 @@ checkpointer:
# Tokenizer
tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
- path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
- merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
+ path: /tmp/Qwen2.5-0_5B-Instruct/vocab.json
+ merges_file: /tmp/Qwen2.5-0_5B-Instruct/merges.txt
max_seq_len: null

# Environment

flex_attention,
)

- flex_attention_compiled = torch.compile(flex_attention, dynamic=False)
+ def compile_flex_attention():
+     try:
+         return torch.compile(flex_attention, dynamic=False)
+     except Exception as e:
+         # It may fail on some combinations of hardware/versions. Using max-autotune fixes this issue.
+         # Context: https://github.com/pytorch/torchtune/issues/2113
+         _log.info(
+             f"Compiling flex_attention failed with error '{e}'. Retrying with mode='max-autotune'."
+         )
+         try:
+             return torch.compile(flex_attention, dynamic=False, mode="max-autotune")
+         except Exception as e:
+             _log.info(
+                 f"Compiling flex_attention failed with error '{e}'. "
+                 "Updating your pytorch version to nightlies may solve it, or you can set "
+                 "dataset.packed=False in your config to avoid using flex attention."
+             )
+             raise
+
+ flex_attention_compiled = compile_flex_attention()

# We cannot do nested compile, but flex attention only has perf benefits
# when compiled. To insulate it from the compiler, we wrap it with
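
The new fallback logs a message pointing users at `dataset.packed=False` when flex attention cannot be compiled. Assuming the standard `tune run` key=value override mechanism, that setting could also be supplied at launch rather than by editing the YAML; a sketch using one of the configs from this diff (the config choice here is illustrative):

```bash
# Disable packed datasets (and therefore the flex attention path) via a CLI override
tune run lora_finetune_single_device --config qwen2_5/7B_lora_single_device dataset.packed=False
```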