     AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight,
     AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight,
     AQFloat8WeightOnlyQuantizedLinearWeight,
+    AQGemliteInt4G64WeightOnlyQuantizedLinearWeight,
+    AQInt4G32WeightOnlyQuantizedLinearWeight,
+    AQInt4G128WeightOnlyQuantizedMarlinSparseLinearWeight,
     AQInt8DynamicallyQuantizedLinearWeight,
     AQInt8WeightOnlyQuantizedLinearWeight,
     AQInt8WeightOnlyQuantizedLinearWeight2,
@@ -1751,37 +1754,109 @@ def test_autoquant_min_sqnr(self, device, dtype):
     @unittest.skipIf(
         not TORCH_VERSION_AT_LEAST_2_4, "autoquant float option requires 2.4+."
     )
-    def test_autoquant_float(self):
+    def test_autoquant_hp_float(self):
         device = "cuda"
         dtype = torch.float32
         m, k, n = 128, 128, 128
         example_input = torch.randn(m, k, device=device, dtype=dtype)
-        model = (
-            torch.nn.Sequential(
-                torch.nn.ReLU(),
-                torch.nn.Linear(k, n),
-                torch.nn.ReLU(),
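+        # Each high-precision class is tried on its own so the selected
+        # weight type can be asserted exactly.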
+        for qclass in torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST:
+            model = (
+                torch.nn.Sequential(
+                    torch.nn.ReLU(),
+                    torch.nn.Linear(k, n, bias=True),
+                    torch.nn.ReLU(),
+                )
+                .to(device)
+                .to(dtype)
             )
-            .to(device)
-            .to(dtype)
-        )
-        ref = model(example_input)
-        torchao.autoquant(
-            model,
-            qtensor_class_list=torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST,
-        )
-        out = model(example_input)
-        from torchao.quantization.autoquant import (
-            BFloat16Tensor,
-            Float16Tensor,
-            Float32Tensor,
-        )
+            ref = model(example_input)
+            qtensor_class_list = [qclass]
+            torchao.autoquant(
+                model,
+                qtensor_class_list=qtensor_class_list,
+            )
+            out = model(example_input)
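+            # autoquant is expected to swap the Linear weight to the lone
+            # candidate class supplied above.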
+            self.assertIn(
+                type(model[1].weight),
+                qtensor_class_list,
+            )
+            self.assertGreater(compute_error(out, ref), 40)
 
-        self.assertIn(
-            type(model[1].weight), [Float32Tensor, Float16Tensor, BFloat16Tensor]
-        )
-        print(compute_error(out, ref))
-        self.assertGreater(compute_error(out, ref), 60)
+    @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_5, "autoquant int4 option requires 2.5+."
+    )
+    @unittest.skipIf(not has_gemlite, "gemlite not available")
+    def test_autoquant_int4wo(self, device, dtype):
+        if device == "cpu":
+            self.skipTest(f"int4wo is for cuda, not {device}")
+
+        m, k, n = 128, 128, 128
+        example_input = torch.randn(m, k, device=device, dtype=dtype)
+
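+        # Each int4 weight-only candidate runs in isolation: gemlite (group
+        # size 64), plain int4 (group size 32), and marlin sparse (group
+        # size 128).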
+        for qclass in [
+            AQGemliteInt4G64WeightOnlyQuantizedLinearWeight,
+            AQInt4G32WeightOnlyQuantizedLinearWeight,
+            AQInt4G128WeightOnlyQuantizedMarlinSparseLinearWeight,
+        ]:
+            model = (
+                torch.nn.Sequential(
+                    torch.nn.ReLU(),
+                    torch.nn.Linear(k, n, bias=True),
+                    torch.nn.ReLU(),
+                )
+                .to(device)
+                .to(dtype)
+            )
+            ref = model(example_input)
+            qtensor_class_list = [qclass]
+            torchao.autoquant(
+                model,
+                qtensor_class_list=qtensor_class_list,
+            )
+            out = model(example_input)
+
+            self.assertIn(type(model[1].weight), qtensor_class_list)
+            self.assertGreater(compute_error(ref, out), 20)
+
+    @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @unittest.skipIf(not is_sm_at_least_90(), "Need CUDA arch at least SM90")
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_5, "autoquant float8 option requires 2.5+."
+    )
+    def test_autoquant_float8(self, device, dtype):
+        if device == "cpu":
+            self.skipTest(f"float8 is for cuda, not {device}")
+
+        # note: marlin sparse layout fails when scale_t is 1-dimensional
+        m, k, n = 128, 128, 128
+        example_input = torch.randn(m, k, device=device, dtype=dtype)
+
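+        # Cover all three float8 candidates: per-row dynamic, per-tensor
+        # dynamic, and weight-only.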
+        for qclass in [
+            AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight,
+            AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight,
+            AQFloat8WeightOnlyQuantizedLinearWeight,
+        ]:
+            model = (
+                torch.nn.Sequential(
+                    torch.nn.ReLU(),
+                    torch.nn.Linear(k, n, bias=True),
+                    torch.nn.ReLU(),
+                )
+                .to(device)
+                .to(dtype)
+            )
+            ref = model(example_input)
+            qtensor_class_list = [qclass]
+            torchao.autoquant(
+                model,
+                qtensor_class_list=qtensor_class_list,
+            )
+            out = model(example_input)
+
+            self.assertIn(type(model[1].weight), qtensor_class_list)
+            self.assertGreater(compute_error(ref, out), 20)
 
 
 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "requires 2.5+.")