
Commit 1052066

Update on "[wip] add axiswise granularity to Float8Tensor"
Summary:

This PR adds the axiswise scaling granularity to `Float8Tensor` and ensures that basic ops like transpose and `torch._scaled_mm` work as expected. A future PR will add integration with `Float8Linear`.

Test Plan: TODO

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
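For context on the terminology: tensorwise scaling keeps a single scale for the whole tensor, while axiswise scaling keeps one scale per slice along a chosen dimension, so the scale tensor retains the data's rank with the scaled dimension collapsed to size 1. Below is a minimal sketch of that shape bookkeeping, assuming the e4m3 max value of 448 and using illustrative helper names that are not part of float8_experimental's API.

```python
import torch

# Illustrative only: these helpers are not float8_experimental's API.
E4M3_MAX = 448.0  # max magnitude representable by torch.float8_e4m3fn

def tensorwise_scale(x: torch.Tensor) -> torch.Tensor:
    # one scale for the whole tensor -> scalar scale
    return E4M3_MAX / x.abs().max().clamp(min=1e-12)

def axiswise_scale(x: torch.Tensor, dim: int) -> torch.Tensor:
    # one scale per slice along `dim` -> that dim collapses to size 1,
    # so the scale broadcasts against the data
    amax = x.abs().amax(dim=dim, keepdim=True)
    return E4M3_MAX / amax.clamp(min=1e-12)

x = torch.randn(3, 5, 7)
print(tensorwise_scale(x).shape)       # torch.Size([])
print(axiswise_scale(x, dim=0).shape)  # torch.Size([1, 5, 7])
print(axiswise_scale(x, dim=2).shape)  # torch.Size([3, 5, 1])
```

These scale shapes match what the updated test below asserts for `axiswise_dim=0` and `axiswise_dim=2`.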
1 parent c4c9ae8 commit 1052066

File tree

3 files changed: +71 -24 lines changed


float8_experimental/float8_ops.py

Lines changed: 30 additions & 10 deletions
@@ -41,7 +41,6 @@ def decorator(func):
 
 @implements(
     [
-        # aten.view.default,
         aten._unsafe_view.default,
         aten.as_strided.default,
         aten.clone.default,

@@ -79,19 +78,40 @@ def float8_desugar_data_and_scale(aten_op, args, kwargs=None):
         args[0]._gemm_input_role,
     )
 
+
 @implements([aten.view.default])
 def float8_view(aten_op, args, kwargs=None):
     if len(args[0]._scale.shape) < 2:
         # tensorwise scaling
-        return float8_desugar_op(aten_op, *args, **kwargs)
-    print('args', args)
-    print('kwargs', kwargs)
-    tensor, new_shape = args[0], args[1]
-
-    # for now, only support reshaping to [-1, *dims] or [*dims, -1]
-    if len(new_shape) >= 2 and (new_shape[0] == -1 or new_shape[-1] == -1):
-        return float8_desugar_data_and_scale(aten_op, *args, **kwargs)
-    raise AssertionError(f"{aten_op} with axiswise scaling and shape {new_shape} is not supported yet.")
+        return float8_desugar_op(aten_op, args, kwargs)
+
+    t, new_shape = args[0], args[1]
+    # for now, only support reshaping to [-1, dim] or [dim, -1]
+    if len(new_shape) == 2:
+        if new_shape == [t.shape[0], -1] and t._scale.shape[0] == 1:
+            new_data = aten_op(t._data, new_shape, **kwargs)
+            new_scale = aten_op(t._scale, [1, -1], **kwargs)
+            return Float8Tensor(
+                new_data,
+                new_scale,
+                t._orig_dtype,
+                t._linear_mm_config,
+                t._gemm_input_role,
+            )
+        elif new_shape == [-1, t.shape[-1]] and t._scale.shape[-1] == 1:
+            new_data = aten_op(t._data, new_shape, **kwargs)
+            new_scale = aten_op(t._scale, [-1, 1], **kwargs)
+            return Float8Tensor(
+                new_data,
+                new_scale,
+                t._orig_dtype,
+                t._linear_mm_config,
+                t._gemm_input_role,
+            )
+    raise AssertionError(
+        f"{aten_op} with axiswise scaling and t.shape {t.shape} t._scale.shape {t._scale.shape} new_shape {new_shape} is not supported yet."
+    )
+
 
 @implements([aten.split.Tensor])
 def float8_split(aten_op, args, kwargs=None):
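A note on why `float8_view` above only allows the `[dim0, -1]` and `[-1, dim_last]` patterns: the axiswise scale has size 1 along the scaled dimension and must be viewable in the same way as the data while staying broadcastable against it. Here is a quick sketch of that bookkeeping with plain tensors (not `Float8Tensor`):

```python
import torch

# Plain-tensor sketch of the shape bookkeeping performed by float8_view above.
data = torch.randn(3, 5, 7)
scale = torch.randn(1, 5, 7)  # axiswise scale across dim 0 (that dim has size 1)

# Supported: collapse the non-scaled dims; the scale follows the same view
# and remains broadcastable against the data.
new_data = data.view(3, -1)    # shape (3, 35)
new_scale = scale.view(1, -1)  # shape (1, 35)
assert (new_data / new_scale).shape == new_data.shape

# Unsupported: data.view(-1, 7) would need the (1, 5, 7) scale to become
# broadcastable against (15, 7), which a simple view cannot do, so the new
# float8_view raises AssertionError for this case (as the test below exercises).
```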

float8_experimental/float8_python_api.py

Lines changed: 0 additions & 1 deletion
@@ -38,7 +38,6 @@ def addmm_float8_unwrapped(
     """
     a_inverse_scale = a_scale.reciprocal()
     b_inverse_scale = b_scale.reciprocal()
-
     if output_dtype == torch.float32 and bias is not None:
         # Bias is not supported by _scaled_mm when output is fp32
         output = torch._scaled_mm(

test/test_base.py

Lines changed: 41 additions & 13 deletions
@@ -171,27 +171,57 @@ def test_axiswise_dynamic_cast(self, shape, dim_name):
         sqnr = compute_error(a, a_dq)
         assert sqnr >= 25.0
 
-    # TODO(next) make this work
     def test_axiswise_reshape(self):
         a = torch.randn(3, 5, 7, dtype=torch.bfloat16, device="cuda")
         linear_mm_config = LinearMMConfig()
 
-        a_fp8 = hp_tensor_to_float8_dynamic(
+        # if we scale across dim0, we can only reshape to [3, -1]
+        a_fp8_d0 = hp_tensor_to_float8_dynamic(
             a,
             e4m3_dtype,
             linear_mm_config,
             scaling_granularity=ScalingGranularity.AXISWISE,
             axiswise_dim=0,
         )
-        # a_fp8._data.shape is (3, 5, 7)
-        # a_fp8._scale.shape is (1, 5, 7)
-        print(a_fp8._scale.shape)
+        assert list(a_fp8_d0._data.shape) == [3, 5, 7]
+        assert list(a_fp8_d0._scale.shape) == [1, 5, 7]
+
+        a_fp8_d0_r = a_fp8_d0.reshape(3, -1)
+        assert list(a_fp8_d0_r.shape) == [3, 5 * 7]
+        assert list(a_fp8_d0_r._scale.shape) == [1, 5 * 7]
+        # verify numerics did not change
+        assert torch.allclose(
+            a_fp8_d0.to_original_precision(),
+            a_fp8_d0_r.to_original_precision().reshape(3, 5, 7),
+            atol=0,
+            rtol=0,
+        )
+        with pytest.raises(AssertionError):
+            a_fp8_d0_r2 = a_fp8_d0.reshape(-1, 7)
 
-        # reshape to (3, 5 * 7)
-        # a_fp8._scale.shape should be (1, 5 * 7)
-        a_fp8_r = a_fp8.reshape(3, -1)
-        print(a_fp8_r._scale.shape)
-
+        # if we scale across dim2, we can only reshape to [-1, 7]
+        a_fp8_d2 = hp_tensor_to_float8_dynamic(
+            a,
+            e4m3_dtype,
+            linear_mm_config,
+            scaling_granularity=ScalingGranularity.AXISWISE,
+            axiswise_dim=2,
+        )
+        assert list(a_fp8_d2._data.shape) == [3, 5, 7]
+        assert list(a_fp8_d2._scale.shape) == [3, 5, 1]
+
+        a_fp8_d2_r = a_fp8_d2.reshape(-1, 7)
+        assert list(a_fp8_d2_r.shape) == [3 * 5, 7]
+        assert list(a_fp8_d2_r._scale.shape) == [3 * 5, 1]
+        # verify numerics did not change
+        assert torch.allclose(
+            a_fp8_d2.to_original_precision(),
+            a_fp8_d2_r.to_original_precision().reshape(3, 5, 7),
+            atol=0,
+            rtol=0,
+        )
+        with pytest.raises(AssertionError):
+            a_fp8_d2_r2 = a_fp8_d2.reshape(3, -1)
 
     def test_axiswise_gemm(self):
         a = torch.randn(16, 32, dtype=torch.bfloat16, device="cuda")

@@ -216,11 +246,9 @@ def test_axiswise_gemm(self):
             axiswise_dim=1,
         )
         c_fp8_compute = torch.mm(a_fp8, b_fp8.t())
-        print(c_fp8_compute)
         c_ref = torch.mm(a, b.t())
         sqnr = compute_error(c_ref, c_fp8_compute)
-        print('sqnr', sqnr)
-        # TODO check numerical accuracy
+        assert sqnr >= 25.0
 
 
 class TestFloat8Linear:
