Commit 262b180

fix autocast handling for float8 training rowwise recipes (#2587)
Summary:

Breakage reported by a customer; fixing and adding a test.

Two unrelated changes:
1. Delete a duplicate autocast test (it tests the same thing as the one I'm changing).
2. Modify the `Float8TrainingTensor` repr to print `lp_dtype` instead of `dtype`, since logically it is printing the dtype of the low precision data.

Test Plan:

```bash
pytest test/float8/test_base.py -k test_autocast_outputs -s -x
```

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent 9f4ee3e commit 262b180
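For context, a minimal sketch of the setup this commit fixes: a module converted with a rowwise float8 recipe and run under `torch.autocast`. The class and function names are taken from the test diff below; the `torchao.float8` import path and exact public API surface are assumptions.

```python
# Hedged sketch of the scenario exercised by the new test: a float8 rowwise
# recipe under torch.autocast. Requires a CUDA device; names follow the diff,
# the import path from torchao.float8 is an assumption.
import copy

import torch
import torch.nn as nn

from torchao.float8 import (
    Float8LinearConfig,
    Float8LinearRecipeName,
    convert_to_float8_training,
)

m_ref = nn.Sequential(
    nn.Linear(32, 32, device="cuda", dtype=torch.float32),
    nn.Linear(32, 32, device="cuda", dtype=torch.float32),
)
config = Float8LinearConfig.from_recipe_name(Float8LinearRecipeName.ROWWISE)
m = convert_to_float8_training(copy.deepcopy(m_ref), config=config)

x = torch.randn(16, 32, device="cuda")
with torch.autocast("cuda", dtype=torch.bfloat16):
    y = m(x)  # previously broke for rowwise recipes; see float8_ops.py below
assert y.dtype == torch.bfloat16
```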

3 files changed: +15 −36 lines changed

test/float8/test_base.py

Lines changed: 13 additions & 34 deletions
```diff
@@ -410,51 +410,30 @@ def test_linear_from_recipe(
     @pytest.mark.parametrize(
         "linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
     )
+    @pytest.mark.parametrize(
+        "recipe_name",
+        [
+            Float8LinearRecipeName.TENSORWISE,
+            Float8LinearRecipeName.ROWWISE,
+            Float8LinearRecipeName.ROWWISE_WITH_GW_HP,
+        ],
+    )
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
     def test_autocast_outputs(
         self,
         emulate: bool,
         linear_dtype: torch.dtype,
+        recipe_name: Float8LinearRecipeName,
     ):
         m_ref = nn.Sequential(
             nn.Linear(32, 32, device="cuda", dtype=linear_dtype),
             nn.Linear(32, 32, device="cuda", dtype=linear_dtype),
         )
-        config = Float8LinearConfig(
-            emulate=emulate,
-        )
-        m = convert_to_float8_training(copy.deepcopy(m_ref), config=config)
-
-        # autocast off
-        x = torch.randn(16, 32, device="cuda", dtype=linear_dtype)
-        y = m(x)
-        assert y.dtype == linear_dtype, f"y.dtype is {y.dtype}, expected {linear_dtype}"
-
-        # autocast on
-        with torch.autocast("cuda"):
-            y = m(x)
-        assert y.dtype == torch.half, f"y.dtype is {y.dtype}, expected {torch.half}"
-
-        with torch.autocast("cuda", dtype=torch.bfloat16):
-            y = m(x)
-        assert y.dtype == torch.bfloat16, (
-            f"y.dtype is {y.dtype}, expected {torch.bfloat16}"
-        )
-
-    @pytest.mark.parametrize(
-        "linear_dtype", [torch.float16, torch.bfloat16, torch.float32]
-    )
-    @pytest.mark.parametrize(
-        "emulate", [True, False] if is_sm_at_least_89() else [True]
-    )
-    @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
-    def test_type_cast(self, linear_dtype: torch.dtype, emulate: bool):
-        m = nn.Linear(32, 16, device="cuda", dtype=linear_dtype)
-        config = Float8LinearConfig(emulate=emulate)
-        m = Float8Linear.from_float(copy.deepcopy(m), config)
+        config = Float8LinearConfig.from_recipe_name(recipe_name)
+        # work around config being frozen
+        object.__setattr__(config, "emulate", emulate)
 
-        # Cast the module to dtype
-        m = m.to(dtype=linear_dtype)
+        m = convert_to_float8_training(copy.deepcopy(m_ref), config=config)
 
         # autocast off
         x = torch.randn(16, 32, device="cuda", dtype=linear_dtype)
```

torchao/float8/float8_ops.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -444,7 +444,6 @@ def autocast_to_copy(aten_op, args, kwargs=None):
     when the input is a Float8TrainingTensor, presenting as a fp32
     tensor.
     """
-    _assert_tensorwise_scale(aten_op, args[0]._scale)
     assert isinstance(args[0], Float8TrainingTensor)
     assert len(kwargs) == 1 and "dtype" in kwargs, (
         "Only support dtype kwarg for autocast"
@@ -459,6 +458,7 @@ def autocast_to_copy(aten_op, args, kwargs=None):
         kwargs["dtype"],
         args[0]._linear_mm_config,
         args[0]._gemm_input_role,
+        args[0]._axiswise_dim,
     )
 
 
```

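The one-line addition above forwards `args[0]._axiswise_dim` into the rebuilt tensor (and drops the tensorwise-only assertion), so rowwise-scaled tensors survive the cast that autocast inserts. Below is a standalone sketch of that failure mode, using a hypothetical wrapper class rather than torchao's real tensor subclass.

```python
# Standalone illustration (hypothetical class, not torchao code) of the bug class
# fixed here: when autocast rebuilds a scaled tensor with a new presented dtype,
# every piece of scaling metadata must be carried over, including the axiswise
# dim that rowwise recipes rely on.
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class ScaledTensorSketch:
    data: torch.Tensor             # low-precision payload
    scale: torch.Tensor            # tensorwise scale or per-row scales
    orig_dtype: torch.dtype        # dtype this tensor presents to autocast
    axiswise_dim: Optional[int] = None  # None => tensorwise, int => rowwise


def autocast_cast(t: ScaledTensorSketch, dtype: torch.dtype) -> ScaledTensorSketch:
    # Mirrors the shape of the patched handler: rebuild the wrapper with the new
    # presented dtype while forwarding all scaling metadata, axiswise_dim included
    # (forgetting it is exactly the kind of breakage fixed above).
    return ScaledTensorSketch(t.data, t.scale, dtype, t.axiswise_dim)


rowwise = ScaledTensorSketch(
    data=torch.randint(0, 255, (4, 8), dtype=torch.uint8),
    scale=torch.rand(4, 1),
    orig_dtype=torch.float32,
    axiswise_dim=-1,
)
casted = autocast_cast(rowwise, torch.bfloat16)
assert casted.axiswise_dim == -1  # rowwise metadata survives the dtype cast
```
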
torchao/float8/float8_training_tensor.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -319,7 +319,7 @@ def __new__(
         return self
 
     def __repr__(self):
-        return f"Float8TrainingTensor(dtype={self._data.dtype}, scale={self._scale}, linear_mm_config={self._linear_mm_config}, axiswise_dim={self._axiswise_dim}\ngemm_input_role={self._gemm_input_role}\nas_orig_prec={self.to_original_precision()}"
+        return f"Float8TrainingTensor(lp_dtype={self._data.dtype}, scale={self._scale}, linear_mm_config={self._linear_mm_config}, axiswise_dim={self._axiswise_dim}\ngemm_input_role={self._gemm_input_role}\nas_orig_prec={self.to_original_precision()}"
 
     def __tensor_flatten__(self):
         ctx = {
```
