Skip to content

Commit 8d68d45

Browse files
Fix:
- removing triton dependency - cleaning adaptive dtype
1 parent 0aad4c3 commit 8d68d45

File tree

4 files changed

+27
-32
lines changed

4 files changed

+27
-32
lines changed

benchmarks/benchmark_blockwise_scaled_linear_triton.py

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
quantize_,
1616
)
1717

18+
from torchao.utils import is_sm_at_least_89
19+
1820

1921
def benchmark_microseconds(f, *args):
2022
return do_bench(lambda: f(*args), return_mode="median") * 1e3
@@ -101,25 +103,7 @@ def benchmark_precision(
101103
}
102104

103105

104-
def get_device_available_dtypes():
105-
sm = torch.cuda.get_device_capability()
106-
available_dtypes = []
107-
108-
if sm[0] == 8 and sm[1] == 0: # A100
109-
available_dtypes.append(torch.float8_e5m2)
110-
elif sm[0] == 9 and sm[1] == 0: # H100
111-
available_dtypes.append(torch.float8_e5m2)
112-
elif sm[0] == 8 and sm[1] == 9: # L4
113-
available_dtypes.append(torch.float8_e4m3fn)
114-
available_dtypes.append(torch.float8_e5m2)
115-
116-
print(
117-
f"Available data types for device with compute capability {sm}: {available_dtypes}"
118-
)
119-
return available_dtypes
120-
121-
122-
if __name__ == "__main__":
106+
if __name__ == "__main__" and torch.cuda.is_available():
123107
device = torch.device("cuda")
124108
k_vals = (8192, 8192, 8192, 28672)
125109
n_vals = (8192, 10240, 57344, 8192)
@@ -128,7 +112,11 @@ def get_device_available_dtypes():
128112
latency_results = []
129113
precision_results = []
130114

131-
available_dtypes = get_device_available_dtypes()
115+
available_dtypes = (
116+
[torch.float8_e4m3fn, torch.float8_e5m2]
117+
if is_sm_at_least_89()
118+
else [torch.float8_e5m2]
119+
)
132120

133121
for m in tqdm([1 << i for i in range(10)]):
134122
for dtype in available_dtypes:

dev-requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ tabulate # QOL for printing tables to stdout
1717
tiktoken
1818
blobfile
1919
lm_eval
20-
triton
2120
# sam
2221
diskcache
2322
pycocotools

test/prototype/test_blockwise_triton.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
fp8_blockwise_weight_quant,
99
)
1010

11+
from torchao.utils import is_sm_at_least_89
12+
1113
BLOCKWISE_SIZE_MNK = [
1214
(2, 512, 128),
1315
(3, 2048, 2048),
@@ -20,9 +22,15 @@
2022

2123
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
2224
@pytest.mark.parametrize("_, N, K", BLOCKWISE_SIZE_MNK)
23-
def test_blockwise_quant_dequant(_, N, K):
25+
@pytest.mark.parametrize(
26+
"dtype",
27+
[torch.float8_e4m3fn, torch.float8_e5m2]
28+
if is_sm_at_least_89()
29+
else [torch.float8_e5m2],
30+
)
31+
def test_blockwise_quant_dequant(_, N, K, dtype):
2432
x = torch.randn(N, K).cuda()
25-
qx, s = fp8_blockwise_weight_quant(x)
33+
qx, s = fp8_blockwise_weight_quant(x, dtype=dtype)
2634
x_reconstructed = fp8_blockwise_weight_dequant(qx, s)
2735
error = torch.norm(x - x_reconstructed) / torch.norm(x)
2836
print(f"Relative Error: {error.item():.6f}")
@@ -32,17 +40,19 @@ def test_blockwise_quant_dequant(_, N, K):
3240

3341
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
3442
@pytest.mark.parametrize("M, N, K", BLOCKWISE_SIZE_MNK)
35-
def test_blockwise_fp8_gemm(M, N, K):
43+
@pytest.mark.parametrize(
44+
"dtype",
45+
[torch.float8_e4m3fn, torch.float8_e5m2]
46+
if is_sm_at_least_89()
47+
else [torch.float8_e5m2],
48+
)
49+
def test_blockwise_fp8_gemm(M, N, K, dtype):
3650
A = torch.randn(M, K).cuda()
3751
B = torch.randn(N, K).cuda()
38-
3952
C = A @ B.T
40-
41-
A_q, A_s = fp8_blockwise_act_quant(A)
42-
B_q, B_s = fp8_blockwise_weight_quant(B)
43-
53+
A_q, A_s = fp8_blockwise_act_quant(A, dtype=dtype)
54+
B_q, B_s = fp8_blockwise_weight_quant(B, dtype=dtype)
4455
C_q = blockwise_fp8_gemm(A_q, A_s, B_q, B_s)
45-
print(C_q, C)
4656
error = torch.norm(C - C_q) / torch.norm(C)
4757
print(f"Relative Error: {error.item():.6f}")
4858

torchao/prototype/blockwise_fp8/blockwise_linear.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,7 @@ def __init__(
3232
super().__init__()
3333
supported_dtypes = [
3434
torch.float8_e4m3fn,
35-
torch.float8_e4m3fnuz,
3635
torch.float8_e5m2,
37-
torch.float8_e5m2fnuz,
3836
]
3937
assert (
4038
dtype in supported_dtypes

0 commit comments

Comments
 (0)