Commit 2271b63

Delete delayed scaling (#812)
Torchao plans to deprecate delayed scaling, so delete it from torchtitan.

Fixes #654.

Here are the logs from runs with `enable_float8_linear = true`:

1. `compile = false`

```
[rank0]:2025-01-31 10:12:50,551 - root - INFO - Float8 training active
[rank0]:2025-01-31 10:12:50,571 - root - INFO - Swapped to Float8Linear layers with enable_fsdp_float8_all_gather=False
[rank0]:2025-01-31 10:12:50,572 - root - INFO - Model llama3 8B size: 8,030,261,248 total parameters
[rank0]:2025-01-31 10:12:50,572 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:2025-01-31 10:12:50,635 - root - INFO - Applied FSDP to the model
[rank0]:2025-01-31 10:12:50,835 - root - INFO - CUDA memory usage for model: 3.77GiB(3.97%)
[rank0]:2025-01-31 10:12:50,835 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to ./outputs/checkpoint
[rank0]:2025-01-31 10:12:50,837 - root - INFO - TensorBoard logging enabled. Logs will be saved at ./outputs/tb/20250131-1012
[rank0]:2025-01-31 10:12:50,837 - root - INFO - Training starts at step 1, with local batch size 1, global batch size 8, sequence length 8192, total steps 5 (warmup 200)
[rank0]:2025-01-31 10:12:50,837 - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace
[rank0]:2025-01-31 10:13:02,460 - root - INFO - step: 1  loss: 12.2581  memory: 74.27GiB(78.18%)  tps: 705  mfu: 4.13%
[rank0]:2025-01-31 10:13:02,460 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:2025-01-31 10:13:04,973 - root - INFO - step: 2  loss: 12.0754  memory: 81.77GiB(86.07%)  tps: 3,262  mfu: 19.10%
[rank0]:2025-01-31 10:13:07,033 - root - INFO - step: 3  loss: 11.7432  memory: 81.77GiB(86.07%)  tps: 3,980  mfu: 23.30%
[rank0]:2025-01-31 10:13:09,089 - root - INFO - step: 4  loss: 11.3079  memory: 81.77GiB(86.07%)  tps: 3,986  mfu: 23.34%
[rank0]:2025-01-31 10:13:11,146 - root - INFO - step: 5  loss: 10.9303  memory: 81.77GiB(86.07%)  tps: 3,985  mfu: 23.33%
[rank0]:2025-01-31 10:13:11,147 - root - INFO - Saving a full checkpoint at last step, step 5.
[rank0]:2025-01-31 10:13:31,549 - root - INFO - Finished saving the checkpoint (or staging if async is enabled) in 20.40 seconds.
[rank0]:2025-01-31 10:13:31,549 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:2025-01-31 10:13:33,551 - root - INFO - Training completed
```

2. `compile = true`

```
[rank0]:2025-01-31 10:18:55,527 - root - INFO - Float8 training active
[rank0]:2025-01-31 10:18:55,547 - root - INFO - Swapped to Float8Linear layers with enable_fsdp_float8_all_gather=False
[rank0]:2025-01-31 10:18:55,548 - root - INFO - Model llama3 8B size: 8,030,261,248 total parameters
[rank0]:2025-01-31 10:18:55,549 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:2025-01-31 10:18:55,591 - root - INFO - Compiling each TransformerBlock with torch.compile
[rank0]:2025-01-31 10:18:55,656 - root - INFO - Applied FSDP to the model
[rank0]:2025-01-31 10:18:56,530 - root - INFO - CUDA memory usage for model: 3.77GiB(3.97%)
[rank0]:2025-01-31 10:18:56,532 - root - INFO - TensorBoard logging enabled. Logs will be saved at ./outputs/tb/20250131-1018
[rank0]:2025-01-31 10:18:56,533 - root - INFO - Training starts at step 1, with local batch size 1, global batch size 8, sequence length 8192, total steps 5 (warmup 200)
[rank0]:2025-01-31 10:18:56,533 - root - INFO - Profiling active. Traces will be saved at ./outputs/profile_trace
[rank0]:W0131 10:19:01.052000 1427728 torch/_logging/_internal.py:1093] [0/0] Detected that context_fn is passed to torch.utils.checkpoint under torch.compile.
[rank0]:W0131 10:19:01.052000 1427728 torch/_logging/_internal.py:1093] [0/0] Please make sure the checkpointed region does not contain in-place ops (e.g. torch.relu_).
[rank0]:/data/users/yifanmao/pytorch/torch/_inductor/lowering.py:1903: UserWarning: Torchinductor does not support code generation for complex operators. Performance may be worse than eager.
[rank0]:  warnings.warn(
[rank0]:2025-01-31 10:19:15,619 - root - INFO - step: 1  loss: 12.2476  memory: 40.21GiB(42.32%)  tps: 429  mfu: 2.51%
[rank0]:2025-01-31 10:19:15,619 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:2025-01-31 10:19:16,747 - root - INFO - step: 2  loss: 12.0860  memory: 47.77GiB(50.28%)  tps: 7,267  mfu: 42.55%
[rank0]:2025-01-31 10:19:17,852 - root - INFO - step: 3  loss: 11.7620  memory: 47.77GiB(50.28%)  tps: 7,420  mfu: 43.45%
[rank0]:2025-01-31 10:19:18,953 - root - INFO - step: 4  loss: 11.3075  memory: 47.77GiB(50.28%)  tps: 7,449  mfu: 43.62%
[rank0]:2025-01-31 10:19:20,054 - root - INFO - step: 5  loss: 10.9359  memory: 47.77GiB(50.28%)  tps: 7,448  mfu: 43.61%
[rank0]:2025-01-31 10:19:20,054 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:2025-01-31 10:19:22,056 - root - INFO - Training completed
```
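For intuition on why deleting delayed scaling removes the per-step sync call: with dynamic scaling the float8 scale is derived from the current tensor on every cast, so there is no amax history to maintain or synchronize across steps. A back-of-the-envelope sketch in plain PyTorch (not torchao internals; the helper name and constant are illustrative):

```python
import torch

E4M3_MAX = 448.0  # largest magnitude representable in torch.float8_e4m3fn

def dynamic_cast_to_float8(t: torch.Tensor):
    # Dynamic scaling: derive the scale from the *current* tensor's amax,
    # so nothing has to be remembered (or synchronized) between steps.
    scale = E4M3_MAX / t.abs().max().clamp(min=1e-12)
    return (t * scale).to(torch.float8_e4m3fn), scale

x = torch.randn(16, 16)
x_fp8, scale = dynamic_cast_to_float8(x)
x_roundtrip = x_fp8.to(torch.float32) / scale
print((x - x_roundtrip).abs().max())  # small quantization error
```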
1 parent d4c86e3 commit 2271b63

File tree

4 files changed: +6, -69 lines

scripts/estimate/estimation.py (+5, -3)
```diff
@@ -116,7 +116,11 @@ def loss_fn(pred, labels):
     model_config.vocab_size = tokenizer.n_words
     model_config.max_seq_len = job_config.training.seq_len
 
-    with FakeTensorMode() if not job_config.memory_estimation.disable_fake_mode else contextlib.nullcontext():
+    with (
+        FakeTensorMode()
+        if not job_config.memory_estimation.disable_fake_mode
+        else contextlib.nullcontext()
+    ):
 
         logger.info(
             f"Building {model_name} {job_config.model.flavor} with {model_config}"
@@ -174,8 +178,6 @@ def loss_fn(pred, labels):
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), job_config.training.max_norm, foreach=True
            )
-           # sync float8 amaxes and scales
-           float8_handler.sync_float8_amax_and_scale_history(model)
            # optimizer step
            optimizers.step()
            lr_schedulers.step()
```
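As a side note, the reformatted `with (...)` statement above relies on a conditional expression that evaluates to one of two context managers. A standalone sketch of the same pattern (stand-in variable name, not estimation.py itself):

```python
import contextlib

import torch
from torch._subclasses.fake_tensor import FakeTensorMode

# Stand-in for job_config.memory_estimation.disable_fake_mode.
disable_fake_mode = False

# The parentheses let the conditional expression span multiple lines;
# whichever context manager the expression yields is the one that is entered.
with (
    FakeTensorMode()
    if not disable_fake_mode
    else contextlib.nullcontext()
):
    # Under FakeTensorMode this allocates no real storage; with the flag set
    # to True it would be an ordinary dense tensor.
    t = torch.empty(1024, 1024)
    print(type(t).__name__)
```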

torchtitan/config_manager.py (-19)
```diff
@@ -548,25 +548,6 @@ def __init__(self):
             action="store_true",
             help="Whether precompute float8 scales dynamically for FSDP",
         )
-        self.parser.add_argument(
-            "--float8.scaling_type_input",
-            type=str,
-            default="dynamic",
-            help="float8 scaling for input, dynamic (default) or delayed",
-            choices=["dynamic", "delayed"],
-        )
-        self.parser.add_argument(
-            "--float8.scaling_type_weight",
-            type=str,
-            default="dynamic",
-            help="float8 scaling for input, dynamic (default) or delayed",
-        )
-        self.parser.add_argument(
-            "--float8.scaling_type_grad_output",
-            type=str,
-            default="dynamic",
-            help="float8 scaling for input, dynamic (default) or delayed",
-        )
 
         # communications library settings
         self.parser.add_argument(
```
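To make the user-facing effect concrete, here is a tiny standalone argparse demo (not torchtitan's parser; only the flag names are taken from the diff above): once the `--float8.scaling_type_*` arguments are gone, passing them is rejected.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--float8.enable_float8_linear", action="store_true")
parser.add_argument("--float8.enable_fsdp_float8_all_gather", action="store_true")
# The three --float8.scaling_type_* arguments deleted above are intentionally absent.

# Still accepted: dynamic scaling is the only (implicit) option left.
args = parser.parse_args(["--float8.enable_float8_linear"])
print(getattr(args, "float8.enable_float8_linear"))  # True

# No longer accepted: argparse exits with "unrecognized arguments".
try:
    parser.parse_args(["--float8.scaling_type_input", "delayed"])
except SystemExit:
    print("--float8.scaling_type_input is no longer a valid flag")
```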

torchtitan/float8.py (+1, -44)
```diff
@@ -41,7 +41,7 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
             )
             return
         try:
-            from torchao.float8 import CastConfig, Float8LinearConfig, ScalingType
+            from torchao.float8 import Float8LinearConfig
         except ImportError as e:
             raise ImportError(
                 "torchao is not installed. Please install it to use float8 linear layers."
@@ -52,14 +52,8 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
             parallel_dims.dp_shard_enabled
             and float8_config.enable_fsdp_float8_all_gather
         )
-        scaling_type_input = ScalingType(float8_config.scaling_type_input)
-        scaling_type_weight = ScalingType(float8_config.scaling_type_weight)
-        scaling_type_grad_output = ScalingType(float8_config.scaling_type_grad_output)
         self.config = Float8LinearConfig(
             enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
-            cast_config_input=CastConfig(scaling_type=scaling_type_input),
-            cast_config_weight=CastConfig(scaling_type=scaling_type_weight),
-            cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output),
         )
 
         self.enabled = True
@@ -70,15 +64,6 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
             and float8_config.precompute_float8_dynamic_scale_for_fsdp
         )
 
-        # for sync_float8_amax_and_scale_history
-        self.delayed_scaling = (
-            scaling_type_input is ScalingType.DELAYED
-            or scaling_type_weight is ScalingType.DELAYED
-            or scaling_type_grad_output is ScalingType.DELAYED
-        )
-        self._sync_float8_amax_and_scale_history = None
-        self.compile = job_config.training.compile
-
         logger.info("Float8 training active")
 
     def convert_to_float8_training(self, model: nn.Module):
@@ -117,31 +102,3 @@ def precompute_float8_dynamic_scale_for_fsdp(
         models = [model] if isinstance(model, nn.Module) else model
         for m in models:
             precompute_float8_dynamic_scale_for_fsdp(m)
-
-    def sync_float8_amax_and_scale_history(
-        self, model: Union[nn.Module, List[nn.Module]]
-    ):
-        if not self.enabled:
-            return
-
-        if not self.delayed_scaling:
-            return
-
-        from torchao.float8 import sync_float8_amax_and_scale_history
-
-        # TODO(vkuzo): see if precalculating the modules to sync over is going to
-        # meaningfully help performance
-
-        if self._sync_float8_amax_and_scale_history is None:
-            if self.compile:
-                self._sync_float8_amax_and_scale_history = torch.compile(
-                    sync_float8_amax_and_scale_history
-                )
-            else:
-                self._sync_float8_amax_and_scale_history = (
-                    sync_float8_amax_and_scale_history
-                )
-
-        models = [model] if isinstance(model, nn.Module) else model
-        for m in models:
-            self._sync_float8_amax_and_scale_history(m)
```
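For context, a minimal sketch of the code path that survives this commit, assuming torchao is installed (illustrative, not the Float8Handler itself): a `Float8LinearConfig` is built without any `CastConfig`/`ScalingType` arguments, and eligible `nn.Linear` layers are swapped for `Float8Linear`.

```python
import torch.nn as nn
from torchao.float8 import Float8LinearConfig, convert_to_float8_training

# Defaults now mean dynamic scaling for input, weight, and grad_output.
config = Float8LinearConfig(enable_fsdp_float8_all_gather=False)

model = nn.Sequential(
    nn.Linear(4096, 4096),
    nn.ReLU(),
    nn.Linear(4096, 4096),
)

# Swaps nn.Linear modules for Float8Linear in place; with dynamic scaling there
# is no amax/scale history, so nothing needs to be synchronized each step.
convert_to_float8_training(model, config=config)
print(model)
```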

train.py (-3)
```diff
@@ -321,9 +321,6 @@ def loss_fn(pred, labels):
                 pp_mesh=pp_mesh if parallel_dims.pp_enabled else None,
             )
 
-            # sync float8 amaxes and scales
-            float8_handler.sync_float8_amax_and_scale_history(model_parts)
-
             # optimizer step
             checkpoint.maybe_wait_for_staging()
             optimizers.step()
```
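And a minimal sketch of a training step after this change (illustrative shapes and optimizer, requires a float8-capable GPU such as H100; this is not train.py itself): the step goes straight from backward and gradient clipping to the optimizer, with no `sync_float8_amax_and_scale_history` call in between.

```python
import torch
import torch.nn as nn
from torchao.float8 import Float8LinearConfig, convert_to_float8_training

model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024))
model = model.to(device="cuda", dtype=torch.bfloat16)
convert_to_float8_training(model, config=Float8LinearConfig())
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

x = torch.randn(32, 1024, device="cuda", dtype=torch.bfloat16)
loss = model(x).float().pow(2).mean()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# With dynamic scaling there is no per-step float8 sync before the optimizer.
optimizer.step()
optimizer.zero_grad()
```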
