
Commit 337d264

Fix:
- fixing W4A8 quantization for the CUTLASS kernel in the precision benchmark
- importing triton only if CUDA is available
- setting a less harsh threshold for quant-dequant and for GEMM kernel mm precision

Parent: 8d68d45

3 files changed: +36 −11 lines

benchmarks/benchmark_blockwise_scaled_linear_triton.py

Lines changed: 22 additions & 7 deletions
@@ -1,7 +1,9 @@
 import pandas as pd
 import torch
 from tqdm import tqdm
-from triton.testing import do_bench
+
+if torch.cuda.is_available():
+    from triton.testing import do_bench
 
 from torchao.float8.float8_utils import compute_error
 from torchao.ops import rowwise_scaled_linear_cutlass_s8s4
@@ -10,9 +12,10 @@
     fp8_blockwise_act_quant,
     fp8_blockwise_weight_quant,
 )
+
 from torchao.quantization.quant_api import (
-    int8_dynamic_activation_int4_weight,
-    quantize_,
+    _int8_symm_per_token_reduced_range_quant_cutlass,
+    _int4_symm_per_token_quant_cutlass,
 )
 
 from torchao.utils import is_sm_at_least_89
@@ -38,9 +41,14 @@ def get_blockwise_problem(
     assert (
         n % block_size == 0 and k % block_size == 0
     ), "N and K dims must be divisible by block_size"
-    A = (448.0 * (2 * torch.rand(m, k, device=device) - 1)).to(dtype)
+    assert dtype in [
+        torch.float8_e4m3fn,
+        torch.float8_e5m2,
+    ], f"dtype must be torch.float8_e4m3fn or torch.float8_e5m2"
+    dtype_max = torch.finfo(dtype).max
+    A = (dtype_max * (2 * torch.rand(m, k, device=device) - 1)).to(dtype)
     A_scale = torch.randn((m, k // block_size), dtype=torch.half, device=device)
-    B = (448.0 * (2 * torch.rand(n, k, device=device) - 1)).to(dtype)
+    B = (dtype_max * (2 * torch.rand(n, k, device=device) - 1)).to(dtype)
     B_scale = torch.randn(
         (n // block_size, k // block_size), dtype=torch.half, device=device
     )
@@ -89,8 +97,15 @@ def benchmark_precision(
     W_q, W_s = fp8_blockwise_weight_quant(W, block_size, dtype)
     output_blockwise = blockwise_fp8_gemm(A_q, A_s, W_q, W_s)
 
-    quantize_(lin, int8_dynamic_activation_int4_weight())
-    output_rowwise = lin(A)
+    qact = _int8_symm_per_token_reduced_range_quant_cutlass(A)
+    qweight = _int4_symm_per_token_quant_cutlass(W)
+    output_rowwise = rowwise_scaled_linear_cutlass_s8s4(
+        qact.tensor_impl.int_data,
+        qact.tensor_impl.scale,
+        qweight.tensor_impl.int_data,
+        qweight.tensor_impl.scale,
+        None,
+    )
 
     return {
         "m": m,

test/prototype/test_blockwise_triton.py

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ def test_blockwise_quant_dequant(_, N, K, dtype):
     error = torch.norm(x - x_reconstructed) / torch.norm(x)
     print(f"Relative Error: {error.item():.6f}")
 
-    assert error < 0.05, "Quant-Dequant error is too high"
+    assert error < 0.1, "Quant-Dequant error is too high"
 
 
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -56,4 +56,4 @@ def test_blockwise_fp8_gemm(M, N, K, dtype):
     error = torch.norm(C - C_q) / torch.norm(C)
     print(f"Relative Error: {error.item():.6f}")
 
-    assert error < 0.05, "Quantize gemm error is too high"
+    assert error < 0.1, "Quantize gemm error is too high"
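
The relaxed 0.1 bound leaves headroom for float8_e5m2: with 2 mantissa bits its worst-case per-element relative rounding error is about 2^-3 = 12.5%, versus about 2^-4 = 6.25% for float8_e4m3fn's 3 mantissa bits, so 0.05 was a tight target for e5m2 inputs. A small sketch of that intuition using a plain fp8 round-trip (not the blockwise kernels), assuming a PyTorch build that supports fp8 dtype conversion:

```python
import torch

# Round-trip a tensor through each fp8 dtype and measure the relative error.
# Values from randn stay well inside both fp8 ranges, so only rounding matters.
x = torch.randn(4096, dtype=torch.float32)
for dt in (torch.float8_e4m3fn, torch.float8_e5m2):
    x_rt = x.to(dt).to(torch.float32)
    rel_err = torch.norm(x - x_rt) / torch.norm(x)
    print(f"{dt}: relative error = {rel_err.item():.3f}")
```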

torchao/prototype/blockwise_fp8/blockwise_quantization.py

Lines changed: 12 additions & 2 deletions
@@ -1,8 +1,10 @@
 from typing import Tuple
 
 import torch
-import triton
-import triton.language as tl
+
+if torch.cuda.is_available():
+    import triton
+    import triton.language as tl
 
 
 @triton.jit
@@ -50,6 +52,10 @@ def fp8_blockwise_act_quant(
     assert (
         x.size(-1) % block_size == 0
     ), f"Last dimension size must be divisible by block_size (block_size={block_size})"
+    assert dtype in [
+        torch.float8_e4m3fn,
+        torch.float8_e5m2,
+    ], f"dtype must be torch.float8_e4m3fn or torch.float8_e5m2"
     y = torch.empty_like(x, dtype=dtype)
     s = x.new_empty(*x.size()[:-1], x.size(-1) // block_size, dtype=torch.float32)
     grid = lambda meta: (triton.cdiv(x.numel(), meta["BLOCK_SIZE"]),)
@@ -108,6 +114,10 @@ def fp8_blockwise_weight_quant(
     assert (
         x.size(0) % block_size == 0 and x.size(1) % block_size == 0
     ), f"Both dimensions of x must be divisible by block_size (block_size={block_size})"
+    assert dtype in [
+        torch.float8_e4m3fn,
+        torch.float8_e5m2,
+    ], f"dtype must be torch.float8_e4m3fn or torch.float8_e5m2"
     M, N = x.size()
     y = torch.empty_like(x, dtype=dtype)
     s = x.new_empty(M // block_size, N // block_size, dtype=torch.float32)
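
The module-level triton import is now guarded on torch.cuda.is_available(), so importing this file on a CPU-only install where triton is missing no longer raises an ImportError. A minimal sketch of the same pattern with an explicit fallback flag; the HAS_TRITON name and the else branch are illustrative additions, not part of the commit:

```python
import torch

# Triton typically ships only with CUDA builds of PyTorch, so skip the import
# on CPU-only machines; callers can check the flag before launching kernels.
if torch.cuda.is_available():
    import triton
    import triton.language as tl

    HAS_TRITON = True
else:
    HAS_TRITON = False
```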
