Enable FP8_E5M2 GEMM (#352)

sanchitintel · aacostadiaz · web-flow · commit f6ca3e7b0d5d · 2025-05-17T07:39:03.000+01:00
### Summary 1. Enable FP8 GEMM for float_e5m2_t dtype. float_e5m2_t -> FP16 conversion code has been adapted/copy-pasted from https://github.com/pytorch/pytorch/blob/dfcfad2112933cc34247421ac0a4d3f19a1806c1/c10/util/Float8_e5m2.h#L30-L43 2. The existing `E4M3 -> FP16` conversion uses many copies, and `E5M2 -> FP16` conversion was also sharing some code with it in the first commit of this PR. Achieved ~85% speedup after eliminating all unnecessary copies for `E5M2 -> FP16` conversion. 3. The FP8 GEMM example is now run for both E5M2 & E4M3. ### Caveat It seems the `E5M2 -> FP16` conversion time can't be estimated by comparing this implementation with an implementation that disables `E5M2 -> FP16` conversion because weirdly, the throughput is higher with the implementation in this PR than in the case of disabling `FP8 -> FP16` conversion, which uses garbage values for `A` & `B` fragments (but FP8 data is still loaded into registers) - ```cpp //convert_FP8_to_FP16(tCrA, tCrA_fp16); //convert_FP8_to_FP16(tCrB, tCrB_fp16); ``` Some ops involved in the compute may be sensitive to NaNs (maybe present/generated in case of garbage `A` & `B` FP16 values), which may be causing a slowdown in that case. ### E5M2 perf comparison with E4M3 | M | N | K | L | Latency with E4M3 | Latency with E5M2 |Speedup| |--|--|--|--|-----|-----|---| |1024|1536|7168|1|2.9335 ms |0.4216 ms | 6.95x | |1024|1536|1536|1|0.6363 ms |0.0950 ms | 6.69x| |1024|576|7168|1|2.9326 ms | 0.4214 ms| 6.95x | |1024|2048|512|1|0.2203 ms |0.0359 ms | 6.14x | |1024|7168|1024|1|0.8571 ms | 0.1301 ms| 6.59x | |1024|256|7168|1| 2.9286 ms| 0.4209 ms| 6.96x | |1024|7168|128|1|0.1256 ms |0.0269 ms | 4.66x | Intel GPU Max 1550 was used #### Build commands (in cutlass directory) ``` export IGC_ExtraOCLOptions="-cl-intel-256-GRF-per-thread" export IGC_VectorAliasBBThreshold=12000000 export IGC_VISAOptions="-perfmodel" rm -rf build; mkdir build; cd build; CC=clang CXX=clang++ cmake .. 
-GNinja -DCUTLASS_ENABLE_EXAMPLES=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCUTLASS_ENABLE_SYCL=ON -DCUTLASS_SYCL_PROFILING_ENABLED=ON -DDPCPP_SYCL_TARGET=intel_gpu_pvc -DCUTLASS_ENABLE_BENCHMARKS=OFF -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0 -fdiagnostics-color=always" ``` cc @pengzhao-intel --------- Co-authored-by: Alejandro Acosta <alejandro.acosta@codeplay.com>
diff --git a/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp b/examples/sycl/08_pvc_gemm_f8/08_pvc_gemm_f8.cpp
@@ -29,13 +29,15 @@
  *
  **************************************************************************************************/
 /*! \file
-    \brief CUTLASS Intel PVC Gemm with float8 (float_e4m3_t) input
+    \brief CUTLASS Intel PVC Gemm with float8 (float_e4m3_t or float_e5m2_t) input
 
-    This example demonstrates GEMM on PVC with float8 input. cutlass::float_e4m3_t is an 8-bit
-    floating point type with 4-bit exponent, 3-bit mantissa and 1 sign bit. The GEMM in this example
-    performs the MMA with fp16 input, first upcasting the float_e4m3_t data for both A and B.
+    This example demonstrates GEMM on PVC with float8 input. The GEMM in this example
+    performs the MMA with fp16 input, first upcasting the fp8 data for both A and B.
+
+    Aside from the input datatypes, this example is identical to 00_pvc_gemm, except that
+    we're currently being forced to load A with VNNI layout, which probably degrades
+    performance. Ref: https://github.com/codeplaysoftware/cutlass-sycl/issues/357
 
-    Aside from the input datatypes, this example is identical to 00_pvc_gemm.
 
     Verification for this example is a standard fp16 GEMM, with input data upcasted on the host.
 
@@ -172,7 +174,7 @@ struct ExampleRunner {
   // Methods
   //
   template <typename SrcT, typename DstT>
-  void convert_e4m3_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
+  void convert_fp8_to_fp16(const SrcT* d_src, DstT* d_dst, size_t size) {
       SrcT* h_src = new SrcT[size];
       syclcompat::memcpy(h_src, d_src, size * sizeof(SrcT));
       syclcompat::wait();
@@ -193,12 +195,12 @@ struct ExampleRunner {
       cutlass::DeviceAllocation<half_t> block_B_fp16(block_B.size());
 
       // fp8 -> fp16
-      convert_e4m3_to_fp16<float_e4m3_t, half_t>(
+      convert_fp8_to_fp16<ElementA, half_t>(
           block_A.get(),
           block_A_fp16.get(),
           block_A.size()
       );
-      convert_e4m3_to_fp16<float_e4m3_t, half_t>(
+      convert_fp8_to_fp16<ElementA, half_t>(
           block_B.get(),
           block_B_fp16.get(),
           block_B.size()
@@ -307,6 +309,11 @@ struct ExampleRunner {
       float cute_time = timer.seconds() / options.iterations;
       double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
       std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+      if constexpr (std::is_same_v<ElementA, float_e4m3_t>) {
+        std::cout << "Datatype: float_e4m3_t"<< std::endl;
+      } else if constexpr (std::is_same_v<ElementA, float_e5m2_t>) {
+        std::cout << "Datatype: float_e5m2_t"<< std::endl;
+      }
       printf("Cutlass GEMM Performance:     [%4.3f]TFlop/s  (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
     }
 
@@ -315,26 +322,9 @@ struct ExampleRunner {
 
 };
 
-int main(int argc, const char** argv)
+template<typename ElementType>
+int launcher(Options& options)
 {
-  //
-  // Parse options
-  //
-
-  Options options;
-
-  options.parse(argc, argv);
-
-  if (options.help) {
-    options.print_usage(std::cout) << std::endl;
-    return 0;
-  }
-
-  if (options.error) {
-    std::cerr << "Aborting execution." << std::endl;
-    return -1;
-  }
-
   //
   // Run examples
   //
@@ -346,10 +336,9 @@ int main(int argc, const char** argv)
   bool passed;
 
   using ElementAccumulator = float;
-  using ElementComputeEpilogue = float; 
-  // TODO: support E5M2
-  using ElementInputA = cutlass::float_e4m3_t; 
-  using ElementInputB = cutlass::float_e4m3_t; 
+  using ElementComputeEpilogue = float;
+  using ElementInputA = ElementType; 
+  using ElementInputB = ElementType; 
   using ElementOutput = float;
 
   using LayoutA = cutlass::layout::RowMajor;
@@ -416,3 +405,26 @@ int main(int argc, const char** argv)
 
   return 0;
 }
+
+// Parses the command line, then runs the FP8 GEMM example for float_e5m2_t and float_e4m3_t.
+// Returns 0 on success (or after printing help), -1 when options.error is set, otherwise the
+// first non-zero status returned by a launcher.
+int main(int argc, const char** argv) {
+  Options options;
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+  // Run both variants even if the first one fails, but do not mask a failure with a 0 exit code.
+  const int status_e5m2 = launcher<cutlass::float_e5m2_t>(options);
+  const int status_e4m3 = launcher<cutlass::float_e4m3_t>(options);
+  return status_e5m2 != 0 ? status_e5m2 : status_e4m3;
+}
diff --git a/include/cutlass/fp8_to_fp16.h b/include/cutlass/fp8_to_fp16.h
@@ -1,117 +1,128 @@
-/***************************************************************************************************
- * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved.
- * SPDX-License-Identifier: BSD-3-Clause
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3. Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- **************************************************************************************************/
- 
- #pragma once
-
-#include <cutlass/half.h> 
-#include <cute/util/sycl_vec.hpp>
-
-using half_t = cutlass::half_t;
-using uchar16 = cute::intel::uchar16;
-using ushort16 = cute::intel::ushort16;
-
-static inline ushort16 convert_ushort16(uchar16 x) {
-    ushort16 result;
-    #pragma unroll
-    for (int i = 0; i < 16; ++i) {
-        result[i] = static_cast<uint16_t>(x[i]);
-    }
-    return result;
-}
-
-static inline ushort16 E4M3_to_FP16_vec16(uchar16 xin) {
-    uchar16 xa = xin & 0x7F;
-    uchar16 sgn_x = xin ^ xa;
-
-    uchar16 zero_mask;
-    #pragma unroll
-    for (int i = 0; i < 16; ++i) {
-        zero_mask[i] = (xa[i] == 0) ? 1 : 0;
-    }
-    uchar16 nan_mask = (0x7E - xa) & 0x80;
-    uchar16 den_mask = ((xa - 8) >> 7) & 0x01;
-
-    xa += (nan_mask >> 1);
-    xa |= (den_mask & 8);
-    den_mask &= 0x48;
-    xa += 0x40 & ~(zero_mask * 0x40);
-
-    ushort16 x16 = convert_ushort16(xa) << 7;
-    ushort16 den_corr = convert_ushort16(den_mask & ~zero_mask) << 7;
-
-    ushort16 result = x16 - den_corr;
-    result &= ~(convert_ushort16(zero_mask) << 7);
-
-    ushort16 sign_ext = convert_ushort16(sgn_x) << 8;
-    result ^= sign_ext;
-
-    return result;
-}
-
-static inline unsigned short E4M3_to_FP16(unsigned char xin) {
-    unsigned char xa, sgn_x, nan_mask, den_mask;
-
-    union {
-        signed short i;
-        _Float16 f;
-    } x16, den_corr;
-
-    xa = xin & 0x7f;
-    sgn_x = xin ^ xa;
-
-    // mask for NaN input
-    nan_mask = (0x7e - xa) & 0x80;
-    // mask for denormal / zero input
-    den_mask = (((signed char)(xa - 8)) >> 7);
-
-    // apply Nan correction
-    xa += (nan_mask >> 1);
-    // first denormal correction
-    xa |= (den_mask & 8);
-    den_mask &= 0x48;
-    // exponent bias correction
-    xa += 0x40;
-
-    // zero-extend to 16 bits
-    x16.i = xa;
-    den_corr.i = den_mask;
-    // FP16 format
-    x16.i <<= 7;
-    den_corr.i <<= 7;
-
-    // apply correction for denormals/zero
-    x16.f -= den_corr.f;
-
-    // finally, apply the sign
-    x16.i ^= (((signed short)sgn_x) << 8);
-
-    return (unsigned short)x16.i;
-}
+/***************************************************************************************************
+ * Copyright (c) 2025 - 2025 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+ #pragma once
+
+#include <cutlass/half.h>
+#include <cute/util/sycl_vec.hpp>
+
+using uchar16 = cute::intel::uchar16;
+using ushort16 = cute::intel::ushort16;
+
+// Widens each of the 16 lanes of `x` to uint16_t, lane by lane, and returns them as a ushort16.
+static inline ushort16 convert_ushort16(uchar16 x) {
+    ushort16 widened;
+    #pragma unroll
+    for (int lane = 0; lane < 16; ++lane)
+        widened[lane] = static_cast<uint16_t>(x[lane]);
+    return widened;
+}
+
+static inline unsigned short E4M3_to_FP16(unsigned char xin) {  // Branch-free: one E4M3 byte in, FP16 bit pattern out.
+  unsigned char xa, sgn_x, nan_mask, den_mask;
+  // Same 16 bits viewed as an integer (.i, for shifts/XOR) and as FP16 (.f, for the subtraction below).
+  union {
+      signed short i;
+      _Float16 f;
+  } x16, den_corr;
+  // xa keeps the low 7 bits of the input; sgn_x keeps only what is left, i.e. bit 7.
+  xa = xin & 0x7f;
+  sgn_x = xin ^ xa;
+
+  // mask for NaN input: 0x80 only when xa == 0x7f (the difference goes negative), otherwise 0
+  nan_mask = (0x7e - xa) & 0x80;
+  // mask for denormal / zero input: 0xff when xa < 8, otherwise 0 (relies on an arithmetic right shift)
+  den_mask = (((signed char)(xa - 8)) >> 7);
+
+  // apply NaN correction: adds 0x40 for a NaN input so that xa ends up as 0xff after the bias step
+  xa += (nan_mask >> 1);
+  // first denormal correction: force bit 3 on when xa < 8
+  xa |= (den_mask & 8);
+  den_mask &= 0x48;  // now 0x48 when xa was < 8 (the value a zero input reaches after the bias step), else 0
+  // exponent bias correction: adds 8 to the field held in bits 3..6
+  xa += 0x40;
+
+  // zero-extend to 16 bits
+  x16.i = xa;
+  den_corr.i = den_mask;
+  // FP16 format: shifting by 7 places the low 3 bits of xa at bits 7..9 and the rest at bits 10..14
+  x16.i <<= 7;
+  den_corr.i <<= 7;
+
+  // apply correction for denormals/zero: an FP16 subtraction; den_corr is 0 for inputs with xa >= 8
+  x16.f -= den_corr.f;
+  // NOTE(review): reading .f after writing .i is union type punning, which ISO C++ leaves undefined — relies on compiler support.
+  // finally, apply the sign: bit 7 of the input moves to bit 15
+  x16.i ^= (((signed short)sgn_x) << 8);
+
+  return (unsigned short)x16.i;
+}
+
+
+
+static inline ushort16 E4M3_to_FP16_chunk16(uchar16 xin) {  // 16-lane variant of the E4M3 -> 16-bit pattern conversion.
+  uchar16 xa = xin & 0x7F;  // low 7 bits of each lane
+  uchar16 sgn_x = xin ^ xa;  // what remains: bit 7 of each lane
+  // zero_mask lane is 1 where xa is exactly 0, else 0.
+  uchar16 zero_mask;
+  #pragma unroll
+  for (int i = 0; i < 16; ++i) {
+      zero_mask[i] = (xa[i] == 0) ? 1 : 0;
+  }
+  uchar16 nan_mask = (0x7E - xa) & 0x80;
+  uchar16 den_mask = ((xa - 8) >> 7) & 0x01;
+  // NOTE(review): den_mask can only be 0 or 1 after `& 0x01`, so `den_mask & 8` and `den_mask &= 0x48` below are always 0; the denormal correction of scalar E4M3_to_FP16 looks like a no-op here — confirm.
+  xa += (nan_mask >> 1);
+  xa |= (den_mask & 8);
+  den_mask &= 0x48;
+  xa += 0x40 & ~(zero_mask * 0x40);  // adds 0x40 except in lanes where zero_mask is 1
+  // Widen to 16 bits per lane and shift by 7, as the scalar version does.
+  ushort16 x16 = convert_ushort16(xa) << 7;
+  ushort16 den_corr = convert_ushort16(den_mask & ~zero_mask) << 7;
+  // NOTE(review): this subtracts ushort16 lanes, whereas the scalar version subtracts _Float16 values — confirm intended.
+  ushort16 result = x16 - den_corr;
+  result &= ~(convert_ushort16(zero_mask) << 7);  // clears bit 7 in lanes where zero_mask is 1
+  // Move bit 7 of each input lane to bit 15 and XOR it into the result.
+  ushort16 sign_ext = convert_ushort16(sgn_x) << 8;
+  result ^= sign_ext;
+
+  return result;
+}
+
+
+// Writes xout[k] = xin[k] << 8 for every k in [0, N): each input byte becomes the upper 8 bits of a 16-bit word.
+template<int N>
+static inline void E5M2_to_FP16(cutlass::Array<uint8_t, N> const &xin, cutlass::Array<uint16_t, N> &xout) {
+  // Adapted from https://github.com/pytorch/pytorch/blob/dfcfad2112933cc34247421ac0a4d3f19a1806c1/c10/util/Float8_e5m2.h#L30-L43
+  CUTLASS_PRAGMA_UNROLL
+  for (int idx = 0; idx < N; ++idx)
+    xout[idx] = static_cast<uint16_t>(xin[idx]) << 8;
+}
diff --git a/include/cutlass/gemm/collective/xe_mma_w8a8.hpp b/include/cutlass/gemm/collective/xe_mma_w8a8.hpp