pytorch · romanwu10 · Aug 12, 2025 · Aug 12, 2025 · Aug 14, 2025
diff --git a/benchmarks/benchmark_blockwise_scaled_linear_triton.py b/benchmarks/benchmark_blockwise_scaled_linear_triton.py
@@ -18,6 +18,18 @@
         fp8_blockwise_act_quant,
         fp8_blockwise_weight_quant,
     )
+    # Import training kernels for comparison
+    from torchao.prototype.blockwise_fp8_training.kernels import (
+        blockwise_fp8_gemm_1x128_128x128,
+        fp8_blockwise_act_quant_lhs,
+        fp8_blockwise_weight_quant_transposed_rhs,
+    )
+    from torchao.prototype.blockwise_fp8_training.scaled_mm_kernels import (
+        blockwise_fp8_gemm_scaled_mm_1x128_128x128,
+    )
+    from torchao.prototype.blockwise_fp8_training.linear import (
+        Float8BlockwiseLinear,
+    )
     from torchao.utils import is_sm_at_least_89
 else:
     raise RuntimeError("This benchmark is only avaible on CUDA hardware")
@@ -74,6 +86,54 @@ def benchmark_latency(
     }
 
 
+def benchmark_training_kernels_latency(
+    m: int, k: int, n: int, block_size: int, dtype: torch.dtype, device
+):
+    """Benchmark training kernels: Triton vs torch._scaled_mm implementations."""
+    # Create reference tensors
+    A_ref = torch.randn((m, k), dtype=torch.bfloat16, device=device)
+    B_ref = torch.randn((k, n), dtype=torch.bfloat16, device=device) 
+    fp16_time = benchmark_microseconds(torch.nn.functional.linear, A_ref, B_ref)
+
+    # Create quantized inputs for training kernels
+    A_fp8, A_scale = fp8_blockwise_act_quant_lhs(A_ref, block_size)
+    B_fp8, B_scale = fp8_blockwise_weight_quant_transposed_rhs(B_ref, block_size)
+
+    # Benchmark Triton training kernel
+    try:
+        triton_time = benchmark_microseconds(
+            blockwise_fp8_gemm_1x128_128x128,
+            A_fp8, 1.0 / A_scale, B_fp8, 1.0 / B_scale
+        )
+    except Exception as e:
+        print(f"Triton kernel failed: {e}")
+        triton_time = float('inf')
+
+    # Benchmark torch._scaled_mm training kernel
+    try:
+        scaled_mm_time = benchmark_microseconds(
+            blockwise_fp8_gemm_scaled_mm_1x128_128x128,
+            A_fp8, 1.0 / A_scale, B_fp8, 1.0 / B_scale, block_size
+        )
+    except Exception as e:
+        print(f"Scaled MM kernel failed: {e}")
+        scaled_mm_time = float('inf')
+
+    return {
+        "m": m,
+        "k": k, 
+        "n": n,
+        "block_size": block_size,
+        "dtype": dtype,
+        "fp16_latency (ms)": fp16_time,
+        "triton_training_latency (ms)": triton_time,
+        "scaled_mm_training_latency (ms)": scaled_mm_time,
+        "triton_training_speedup": fp16_time / triton_time if triton_time != float('inf') else 0,
+        "scaled_mm_training_speedup": fp16_time / scaled_mm_time if scaled_mm_time != float('inf') else 0,
+        "scaled_mm_vs_triton_speedup": triton_time / scaled_mm_time if triton_time != float('inf') and scaled_mm_time != float('inf') else 0,
+    }
+
+
 def benchmark_precision(
     m: int, k: int, n: int, block_size: int, dtype: torch.dtype, device
 ):
@@ -96,20 +156,90 @@ def benchmark_precision(
     }
 
 
+def benchmark_training_kernels_precision(
+    m: int, k: int, n: int, block_size: int, dtype: torch.dtype, device
+):
+    """Benchmark precision of training kernels: Triton vs torch._scaled_mm."""
+    # Create high precision reference
+    A_ref = torch.randn((m, k), dtype=torch.bfloat16, device=device)
+    B_ref = torch.randn((k, n), dtype=torch.bfloat16, device=device)
+    ref_output = torch.nn.functional.linear(A_ref, B_ref)
+
+    # Create quantized inputs
+    A_fp8, A_scale = fp8_blockwise_act_quant_lhs(A_ref, block_size)
+    B_fp8, B_scale = fp8_blockwise_weight_quant_transposed_rhs(B_ref, block_size)
+
+    results = {
+        "m": m, "k": k, "n": n, "block_size": block_size, "dtype": dtype
+    }
+
+    # Test Triton kernel
+    try:
+        triton_output = blockwise_fp8_gemm_1x128_128x128(
+            A_fp8, 1.0 / A_scale, B_fp8, 1.0 / B_scale
+        )
+        results["triton_error_db"] = compute_error(ref_output, triton_output)
+    except Exception as e:
+        print(f"Triton precision test failed: {e}")
+        results["triton_error_db"] = float('inf')
+
+    # Test torch._scaled_mm kernel
+    try:
+        scaled_mm_output = blockwise_fp8_gemm_scaled_mm_1x128_128x128(
+            A_fp8, 1.0 / A_scale, B_fp8, 1.0 / B_scale, block_size
+        )
+        results["scaled_mm_error_db"] = compute_error(ref_output, scaled_mm_output)
+    except Exception as e:
+        print(f"Scaled MM precision test failed: {e}")
+        results["scaled_mm_error_db"] = float('inf')
+
+    # Compare the two implementations
+    if results["triton_error_db"] != float('inf') and results["scaled_mm_error_db"] != float('inf'):
+        try:
+            triton_output = blockwise_fp8_gemm_1x128_128x128(
+                A_fp8, 1.0 / A_scale, B_fp8, 1.0 / B_scale
+            )
+            scaled_mm_output = blockwise_fp8_gemm_scaled_mm_1x128_128x128(
+                A_fp8, 1.0 / A_scale, B_fp8, 1.0 / B_scale, block_size
+            )
+            results["triton_vs_scaled_mm_error_db"] = compute_error(triton_output, scaled_mm_output)
+        except Exception:
+            results["triton_vs_scaled_mm_error_db"] = float('inf')
+    else:
+        results["triton_vs_scaled_mm_error_db"] = float('inf')
+
+    return results
+
+
 if __name__ == "__main__" and torch.cuda.is_available():
     device = torch.device("cuda")
+
+    # Original inference benchmark configurations
     k_vals = (8192, 8192, 8192, 28672)
     n_vals = (8192, 10240, 57344, 8192)
     block_size_vals = (128, 128, 128, 128)
+
+    # Training kernel benchmark configurations (smaller set for faster testing)
+    training_configs = [
+        (1, 4096, 4096),    # Single token
+        (32, 4096, 4096),   # Small batch
+        (8, 4096, 11008),   # MLP up projection  
+        (8, 11008, 4096),   # MLP down projection
+        (1, 4096, 128256),  # Vocab projection (if memory allows)
+    ]
 
     latency_results = []
     precision_results = []
+    training_latency_results = []
+    training_precision_results = []
 
     available_dtypes = (
         [torch.float8_e4m3fn, torch.float8_e5m2]
         if is_sm_at_least_89()
         else [torch.float8_e5m2]
     )
+
+    print("Running original inference benchmarks...")
     for m in tqdm([1 << i for i in range(14)]):
         for dtype in available_dtypes:
             for n, k, block_size in zip(n_vals, k_vals, block_size_vals):
@@ -119,12 +249,42 @@ def benchmark_precision(
                 precision_results.append(
                     benchmark_precision(m, k, n, block_size, dtype, device)
                 )
+
+    print("Running training kernel benchmarks...")
+    for m, k, n in tqdm(training_configs):
+        # Only test on fp8_e4m3fn for training (most common)
+        if k % 128 == 0 and n % 128 == 0:  # Ensure divisibility
+            try:
+                training_latency_results.append(
+                    benchmark_training_kernels_latency(m, k, n, 128, torch.float8_e4m3fn, device)
+                )
+                training_precision_results.append(
+                    benchmark_training_kernels_precision(m, k, n, 128, torch.float8_e4m3fn, device)
+                )
+            except Exception as e:
+                print(f"Skipping training config ({m}, {k}, {n}): {e}")
 
-    df_latency = pd.DataFrame(latency_results)
-    df_precision = pd.DataFrame(precision_results)
-
-    df_latency.to_csv("blockwise_triton_latency_results.csv", index=False)
-    df_precision.to_csv("blockwise_triton_precision_results.csv", index=False)
+    # Save results
+    if latency_results:
+        df_latency = pd.DataFrame(latency_results)
+        df_latency.to_csv("blockwise_triton_inference_latency_results.csv", index=False)
+        print("\nInference Latency Results:")
+        print(df_latency.to_markdown(index=False))
 
-    print(df_latency.to_markdown(index=False))
-    print(df_precision.to_markdown(index=False))
+    if precision_results:
+        df_precision = pd.DataFrame(precision_results)
+        df_precision.to_csv("blockwise_triton_inference_precision_results.csv", index=False)
+        print("\nInference Precision Results:")
+        print(df_precision.to_markdown(index=False))
+
+    if training_latency_results:
+        df_training_latency = pd.DataFrame(training_latency_results)
+        df_training_latency.to_csv("blockwise_training_kernels_latency_results.csv", index=False)
+        print("\nTraining Kernels Latency Results:")
+        print(df_training_latency.to_markdown(index=False))
+
+    if training_precision_results:
+        df_training_precision = pd.DataFrame(training_precision_results)
+        df_training_precision.to_csv("blockwise_training_kernels_precision_results.csv", index=False)
+        print("\nTraining Kernels Precision Results:")
+        print(df_training_precision.to_markdown(index=False))