Commit 988c5c9

fix tensor parallelism for float8 training with rowwise scaling (#1718)
Summary:

1. add a test for toy model + TP + float8 rowwise scaling training
2. fix underlying issues to make the test pass:
   a. add a fast path for tensor view where the new shape is the same as the old shape, for rowwise scaled float8 (this is needed for DTensor)
   b. modify the fake grad dependency workaround to work when grad is a DTensor

Test Plan:

1. ./test/float8/test_everything.sh (one transient failure: https://www.internalfb.com/phabricator/paste/view/P1733103301)
2. verified that float8 rowwise scaling behaves sanely in torchtitan on LLaMa 3 8B on 8 H100s, with tp 2:

```
// requires pytorch/torchtitan#808

// baseline - bfloat16 + compile + tp 2
> with-proxy CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --training.tensor_parallel_degree 2 --training.compile
[rank0]:2025-02-14 13:41:16,175 - root - INFO - step: 40 loss: 7.4240 memory: 35.56GiB(37.43%) tps: 1,669 mfu: 9.77%

// float8 baseline - float8 tensorwise + compile + tp 2
> with-proxy CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --float8.enable_float8_linear --training.tensor_parallel_degree 2 --training.compile
[rank0]:2025-02-14 13:44:07,806 - root - INFO - step: 40 loss: 7.4993 memory: 35.57GiB(37.44%) tps: 2,141 mfu: 12.54%

// float8 rowwise without zero fake dep (for sanity) + compile + tp 2
> with-proxy CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --float8.enable_float8_linear --training.tensor_parallel_degree 2 --training.compile --float8.recipe_name all_axiswise
[rank0]:2025-02-14 13:47:51,400 - root - INFO - step: 40 loss: 7.3472 memory: 35.55GiB(37.42%) tps: 1,858 mfu: 10.88%

// float8 rowwise + compile + tp 2
> with-proxy CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --float8.enable_float8_linear --training.tensor_parallel_degree 2 --training.compile --float8.recipe_name all_axiswise
[rank0]:2025-02-14 13:51:20,864 - root - INFO - step: 40 loss: 9.4211 memory: 35.55GiB(37.42%) tps: 1,820 mfu: 10.66%
```

Reviewers:

Subscribers:

Tasks:

Tags:
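For readers who want to try the torchao pieces outside of torchtitan, here is a minimal sketch (not part of this commit) of composing rowwise-scaled float8 training with tensor parallelism, using only APIs that appear in the diffs below. The toy `nn.Sequential`, its plan keys (`"0"`, `"2"`), and the torchrun-style launch are illustrative assumptions.

```python
# Hedged sketch: rowwise (axiswise) float8 training + TP on a toy model.
# Assumes launch via torchrun on CUDA GPUs and torchao at or after this commit.
import os

import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

from torchao.float8.config import Float8LinearRecipeName, recipe_name_to_linear_config
from torchao.float8.float8_linear_utils import convert_to_float8_training

# one 1-D mesh over all ranks (torchrun sets WORLD_SIZE)
mesh = init_device_mesh("cuda", (int(os.environ["WORLD_SIZE"]),))

model = nn.Sequential(nn.Linear(256, 512), nn.ReLU(), nn.Linear(512, 256)).cuda()

# rowwise scaling recipe, as exercised by the new test
config = recipe_name_to_linear_config(Float8LinearRecipeName.ALL_AXISWISE)
model = convert_to_float8_training(model, config=config)

# for rowwise scaling the new test keeps the all-gather in high precision,
# so the plain TP styles are used instead of the Float8* ones
model = parallelize_module(
    model, mesh, {"0": ColwiseParallel(), "2": RowwiseParallel()}
)
```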
1 parent 7b37eb0 commit 988c5c9

4 files changed, +113 -40 lines changed

test/float8/test_dtensor.py

Lines changed: 66 additions & 25 deletions
```diff
@@ -23,15 +23,26 @@

 from torch.distributed._tensor import DTensor, Replicate, Shard, distribute_tensor
 from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
-from torch.distributed.tensor.parallel import parallelize_module
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    PrepareModuleInput,
+    RowwiseParallel,
+    parallelize_module,
+)
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     ModelArgs,
     Transformer,
 )
 from tqdm import tqdm

 from torchao.float8 import Float8LinearConfig
-from torchao.float8.config import CastConfig, ScalingType, e4m3_dtype
+from torchao.float8.config import (
+    CastConfig,
+    Float8LinearRecipeName,
+    ScalingType,
+    e4m3_dtype,
+    recipe_name_to_linear_config,
+)
 from torchao.float8.float8_linear_utils import convert_to_float8_training
 from torchao.float8.float8_scaling_utils import NoopFwToFloat8BwDynamic
 from torchao.float8.float8_tensor import (
@@ -49,6 +60,8 @@
 from torchao.float8.fsdp_utils import WeightWithDynamicFloat8CastTensor
 from torchao.testing.float8.dtensor_utils import ToyModel

+torch.set_float32_matmul_precision("high")
+

 def setup_distributed():
     world_size = int(os.environ.get("WORLD_SIZE", -1))
@@ -180,13 +193,17 @@ def _test_dtensor_fp8_autograd(mesh: DeviceMesh, size=16):


 def _test_fp8_mlp_tensor_parallelism_base(
-    mesh: DeviceMesh, size=16, compile: bool = False
+    mesh: DeviceMesh, size=16, compile: bool = False, rowwise: bool = False
 ):
     device = mesh.device_type
-    # For now, only supports dynamic scaling of `x` and `dL_dY`.
-    # TODO(future): add support for float8 all-gather with delayed scaling
-    # for activations and gradients.
-    config = Float8LinearConfig(emulate=True)
+
+    if rowwise:
+        config = recipe_name_to_linear_config(Float8LinearRecipeName.ALL_AXISWISE)
+        # hack around config being frozen
+        # TODO(future PR): we should make this nicer at the config level
+        object.__setattr__(config, "emulate", True)
+    else:
+        config = Float8LinearConfig(emulate=True)

     toy_model = ToyModel().to(device)
     toy_model_fp8 = convert_to_float8_training(toy_model, config=config)
@@ -196,14 +213,28 @@ def _test_fp8_mlp_tensor_parallelism_base(
     sp_model = copy.deepcopy(toy_model)
     sp_model = convert_to_float8_training(sp_model, config=config)

+    # For tensorwise scaling, enable float8 all_gather.
+    # For rowwise scaling, keep high precision all_gather. Motivation for
+    # not doing float8 all-gather for rowwise: tensors need to be scaled both ways,
+    # so for float8 all-gather we'd need to send two float8 copies per tensor,
+    # which is similar # bytes over the wire than just doing bfloat16 all-gather.
+    if rowwise:
+        colwise_parallel_cls = ColwiseParallel
+        rowwise_parallel_cls = RowwiseParallel
+        prepare_input_cls = PrepareModuleInput
+    else:
+        colwise_parallel_cls = Float8ColwiseParallel
+        rowwise_parallel_cls = Float8RowwiseParallel
+        prepare_input_cls = PrepareFloat8ModuleInput
+
     # vanilla TP
     tp_model = parallelize_module(
         tp_model,
         mesh,
         {
-            "ffn.w1": Float8ColwiseParallel(),
-            "ffn.w2": Float8ColwiseParallel(),
-            "ffn.out_proj": Float8RowwiseParallel(),
+            "ffn.w1": colwise_parallel_cls(),
+            "ffn.w2": colwise_parallel_cls(),
+            "ffn.out_proj": rowwise_parallel_cls(),
         },
     )

@@ -212,33 +243,41 @@ def _test_fp8_mlp_tensor_parallelism_base(
         sp_model,
         mesh,
         {
-            "ffn": PrepareFloat8ModuleInput(
+            "ffn": prepare_input_cls(
                 input_layouts=Shard(1), desired_input_layouts=Replicate()
             ),
-            "ffn.w1": Float8ColwiseParallel(),
-            "ffn.w2": Float8ColwiseParallel(),
-            "ffn.out_proj": Float8RowwiseParallel(
+            "ffn.w1": colwise_parallel_cls(),
+            "ffn.w2": colwise_parallel_cls(),
+            "ffn.out_proj": rowwise_parallel_cls(
                 output_layouts=Shard(1), use_local_output=False
             ),
         },
     )

-    # PrepareFloat8ModuleInput with specific submodule fqn
+    # prepare_input_cls with specific submodule fqn
     sp_model2 = copy.deepcopy(toy_model)
     sp_model2 = convert_to_float8_training(sp_model2, config=config)

+    if rowwise:
+        prepare_input = prepare_input_cls(
+            input_layouts=Shard(1),
+            desired_input_layouts=Replicate(),
+        )
+    else:
+        prepare_input = prepare_input_cls(
+            input_layouts=Shard(1),
+            desired_input_layouts=Replicate(),
+            fwd_config_submodule_fqn="w2",
+        )
+
     sp_model2 = parallelize_module(
         sp_model2,
         mesh,
         {
-            "ffn": PrepareFloat8ModuleInput(
-                input_layouts=Shard(1),
-                desired_input_layouts=Replicate(),
-                fwd_config_submodule_fqn="w2",
-            ),
-            "ffn.w1": Float8ColwiseParallel(),
-            "ffn.w2": Float8ColwiseParallel(),
-            "ffn.out_proj": Float8RowwiseParallel(
+            "ffn": prepare_input,
+            "ffn.w1": colwise_parallel_cls(),
+            "ffn.w2": colwise_parallel_cls(),
+            "ffn.out_proj": rowwise_parallel_cls(
                 output_layouts=Shard(1), use_local_output=False
             ),
         },
@@ -278,11 +317,13 @@ def _test_fp8_mlp_tensor_parallelism_base(


 def _test_fp8_mlp_tensor_parallelism_eager(mesh: DeviceMesh, size=16):
-    _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=False)
+    _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=False, rowwise=False)
+    _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=False, rowwise=True)


 def _test_fp8_mlp_tensor_parallelism_compile(mesh: DeviceMesh, size=16):
-    _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=True)
+    _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=True, rowwise=False)
+    _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=True, rowwise=True)


 def _test_distribute_fsdp_tensor_subclass(tp_mesh: DeviceMesh):
```
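One detail in the rowwise branch above: `Float8LinearConfig` is a frozen dataclass, which is why the test reaches for `object.__setattr__` to flip `emulate`. Below is a standalone illustration of that behavior; `FrozenConfig` is a hypothetical stand-in, not torchao code.

```python
# Why the test uses object.__setattr__: frozen dataclasses reject normal
# attribute assignment after construction.
from dataclasses import FrozenInstanceError, dataclass


@dataclass(frozen=True)
class FrozenConfig:  # hypothetical stand-in for Float8LinearConfig
    emulate: bool = False


cfg = FrozenConfig()
try:
    cfg.emulate = True  # raises FrozenInstanceError
except FrozenInstanceError:
    pass

# the workaround used in the test: bypass the frozen check
object.__setattr__(cfg, "emulate", True)
print(cfg.emulate)  # True
```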

torchao/float8/float8_linear.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -168,8 +168,10 @@ def backward(ctx, grad_output):
         ):
             # workaround from https://github.com/pytorch/pytorch/issues/141881
             # to avoid saving float8 weight from forward to backward when
-            # FSDP is on
-            weight_hp_t = weight_hp_t + (grad_output_reshaped[0, 0] * 0)
+            # FSDP is on: add a fake dependency on `grad_output`.
+            g_reshaped = grad_output.reshape(-1, grad_output.shape[-1]) * 0
+            zero = g_reshaped[:1] * 0
+            weight_hp_t = weight_hp_t + zero

             # Note: we need https://github.com/pytorch/pytorch/issues/136267
             # to be solved to have a chance to reuse max(abs(weight, dim=...))
```
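Per the commit summary, the workaround was reworked so it still functions when `grad_output` is a DTensor (the old `grad_output_reshaped[0, 0]` scalar indexing is what got replaced). The trick leaves the weight numerically unchanged while making it data-dependent on the gradient. A plain-tensor sketch with stand-in shapes, not torchao code:

```python
# Minimal illustration of the "fake dependency" pattern from the diff above:
# adding a zeroed slice of grad_output to the (transposed) weight keeps the
# values identical but wires grad_output into the computation graph.
import torch

g = torch.randn(8, 4)     # stands in for grad_output
w_t = torch.randn(16, 4)  # stands in for the high-precision transposed weight

g_reshaped = g.reshape(-1, g.shape[-1]) * 0  # shape [8, 4], all zeros
zero = g_reshaped[:1] * 0                    # shape [1, 4], broadcastable
w_t_with_dep = w_t + zero                    # numerically identical to w_t

assert torch.equal(w_t, w_t_with_dep)
```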

torchao/float8/float8_ops.py

Lines changed: 16 additions & 1 deletion
```diff
@@ -113,11 +113,25 @@ def float8_transpose(aten_op, args, kwargs=None):

 @implements([aten.view.default])
 def float8_view(aten_op, args, kwargs=None):
+    t, new_shape = args[0], args[1]
+
+    # if the new shape is the same as old, return an equivalent tensor
+    # note that we have to create a new wrapper to make PyTorch internals happy
+    if new_shape == list(t._data.shape):
+        new_data = aten_op(args[0]._data, *args[1:], **kwargs)
+        return Float8Tensor(
+            new_data,
+            args[0]._scale,
+            args[0]._orig_dtype,
+            args[0]._linear_mm_config,
+            args[0]._gemm_input_role,
+            args[0]._axiswise_dim,
+        )
+
     if len(args[0]._scale.shape) < 2:
         # tensorwise scaling
         return float8_desugar_op(aten_op, args, kwargs)

-    t, new_shape = args[0], args[1]
     # for now, only support reshaping to [-1, dim] or [dim, -1]
     axiswise_dim = t._axiswise_dim
     if len(new_shape) == 2:
@@ -146,6 +160,7 @@ def float8_view(aten_op, args, kwargs=None):
                 t._gemm_input_role,
                 new_axiswise_dim,
             )
+
     raise AssertionError(
         f"{aten_op} with axiswise scaling and t.shape {t.shape} t._scale.shape {t._scale.shape} t._axiswise_dim {t._axiswise_dim} new_shape {new_shape} is not supported yet."
     )
```
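A hand-rolled sketch of why the same-shape fast path has to re-wrap rather than return the input as-is: under `__torch_dispatch__`, each op call is expected to produce a fresh wrapper around the (possibly viewed) inner data. `WrappedTensor` below is a toy stand-in for illustration, not torchao's `Float8Tensor`.

```python
# Toy wrapper subclass demonstrating the "same-shape view" fast path idea.
import torch


class WrappedTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, data, scale):
        return torch.Tensor._make_wrapper_subclass(
            cls, data.shape, dtype=data.dtype, device=data.device
        )

    def __init__(self, data, scale):
        self._data = data
        self._scale = scale

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if func is torch.ops.aten.view.default:
            t, new_shape = args[0], args[1]
            if list(new_shape) == list(t._data.shape):
                # fast path: shape unchanged, view the inner data and re-wrap
                return WrappedTensor(func(t._data, new_shape), t._scale)
            raise AssertionError("only same-shape views supported in this sketch")
        raise NotImplementedError(f"{func} not supported in this sketch")


x = WrappedTensor(torch.randn(4, 8), torch.ones(4, 1))
y = x.view(4, 8)  # hits the fast path
print(type(y).__name__, tuple(y.shape))  # WrappedTensor (4, 8)
```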

torchao/float8/float8_tensor_parallel.py

Lines changed: 27 additions & 12 deletions
```diff
@@ -36,6 +36,11 @@ def _float8_linear_supports_float8_allgather(m):


 class Float8ColwiseParallel(ColwiseParallel):
+    """
+    Like `ColwiseParallel`, but with all-gather in float8. This
+    currently assumes tensorwise scaling.
+    """
+
     @staticmethod
     def _prepare_input_fn(
         input_layouts, desired_input_layouts, mod, inputs, device_mesh
@@ -96,6 +101,11 @@ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:


 class Float8RowwiseParallel(RowwiseParallel):
+    """
+    Like `RowwiseParallel`, but with all-gather in float8. This
+    currently assumes tensorwise scaling.
+    """
+
     @staticmethod
     def _prepare_input_fn(
         input_layouts, desired_input_layouts, mod, inputs, device_mesh
@@ -154,18 +164,23 @@ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:


 class PrepareFloat8ModuleInput(PrepareModuleInput):
-    # subclass the PrepareModuleInput classes to implement fp8 specific logic, the only difference is that
-    # after we prepare the input DTensor, we cast the input to DTensor(Float8Tensor)
-    # This is to ensure the float8 cast happens before the all-gather (i.e. Shard -> Replicate)
-    # so that if there are multiple float8 users of the input activation, we perform fp8 allgather
-    # only once.
-    # FP8 Args:
-    # float8_dtype (torch.dtype, optional): control what float8 dtype to cast to when prepare the module input,
-    # we currently only support torch.float8_e4m3fn. default: torch.float8_e4m3fn
-    # fwd_config_submodule_fqn (str, optional): the fqn of the submodule that contains the forward config used
-    # for the float8 cast. If not specified, we will search for the Float8Linear in the submodules
-    # and use the forward config from that module, in this case all module's forward config must be
-    # the same.
+    """
+    Like `PrepareModuleInput`, but with all-gather in float8. This
+    currently assumes tensorwise scaling.
+
+    The only difference from `PrepareModuleInput` is that
+    after we prepare the input DTensor, we cast the input to DTensor(Float8Tensor)
+    This is to ensure the float8 cast happens before the all-gather (i.e. Shard -> Replicate)
+    so that if there are multiple float8 users of the input activation, we perform fp8 allgather
+    only once.
+    FP8 Args:
+    float8_dtype (torch.dtype, optional): control what float8 dtype to cast to when prepare the module input,
+    we currently only support torch.float8_e4m3fn. default: torch.float8_e4m3fn
+    fwd_config_submodule_fqn (str, optional): the fqn of the submodule that contains the forward config used
+    for the float8 cast. If not specified, we will search for the Float8Linear in the submodules
+    and use the forward config from that module, in this case all module's forward config must be
+    the same.
+    """

     def __init__(
         self,
```