
Commit 3dbcb13

PeixuanZuo authored and Prathik Rao committed
[ROCm] simplify ck data type Adaptor (#15734)
DataTypeAdaptor is defined many times, once in every file that integrates CK. This PR refactors the code to put DataTypeAdaptor in a shared header file.
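For illustration, here is a minimal sketch of how the shared adaptor is consumed after this change. The `ResolvedCKType` alias and the surrounding snippet are hypothetical and only for illustration; the header path and `CKDataTypeAdaptor` itself come from this PR, and the snippet assumes it is compiled inside the ROCm provider with `USE_COMPOSABLE_KERNEL` defined.

```cpp
// Minimal sketch, assuming USE_COMPOSABLE_KERNEL is defined and this
// translation unit is built as part of the ONNX Runtime ROCm provider.
#include "core/providers/rocm/composable_kernel_common.h"

namespace onnxruntime {
namespace rocm {

// Hypothetical helper, for illustration only: every CK integration header
// now resolves its CK element type through the shared adaptor instead of
// re-declaring a local DataTypeAdaptor.
template <typename T>
using ResolvedCKType = typename CKDataTypeAdaptor<T>::type;

// e.g. ResolvedCKType<half>     -> ck::half_t
//      ResolvedCKType<BFloat16> -> ck::bhalf16_t   (per this PR's header)
//      ResolvedCKType<float>    -> float           (identity primary template)

}  // namespace rocm
}  // namespace onnxruntime
```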
1 parent 3ba10e9 commit 3dbcb13

6 files changed: +55 -80 lines changed


onnxruntime/contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_pipelines.cuh

+2 -17

@@ -67,6 +67,7 @@ are in composable kernels. The scale and add logic is performed via Acc0ElementO
 #include "contrib_ops/rocm/bert/attention_softmax.h"
 #ifdef USE_COMPOSABLE_KERNEL
 #include "contrib_ops/rocm/bert/batched_gemm_softmax_gemm_permute_ck_impl/impl.cuh"
+#include "core/providers/rocm/composable_kernel_common.h"

 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

@@ -451,22 +452,6 @@ class GemmSoftmaxGemmPermuteTunableOp : public tunable::TunableOp<GemmSoftmaxGem
 };

 #ifdef USE_COMPOSABLE_KERNEL
-namespace {
-template <typename T>
-struct DataTypeAdaptor {
-  using type = T;
-};
-
-template <>
-struct DataTypeAdaptor<half> {
-  using type = ck::half_t;
-};
-
-template <>
-struct DataTypeAdaptor<BFloat16> {
-  using type = ck::bhalf16_t;
-};
-}  // namespace

 template <typename T, bool USE_BIAS, bool USE_MASK>
 auto GetCKGemmSoftmaxGemmPermuteTypeStringAndOps() {

@@ -475,7 +460,7 @@ auto GetCKGemmSoftmaxGemmPermuteTypeStringAndOps() {
   using Nop = ck::tensor_operation::element_wise::PassThrough;
   using Acc0ElementOp = internal::PreSoftmaxAttentionScoreOp;

-  using CKDataType = typename DataTypeAdaptor<T>::type;
+  using CKDataType = typename CKDataTypeAdaptor<T>::type;
   using D0DataType = typename ck::detail::tuple_concat<
       std::conditional_t<USE_BIAS, ck::Tuple<CKDataType>, ck::Tuple<>>,
       std::conditional_t<USE_MASK, ck::Tuple<CKDataType>, ck::Tuple<>>>::type;

onnxruntime/contrib_ops/rocm/bert/gemm_fast_gelu_ck.cuh

+5 -16

@@ -8,6 +8,8 @@
 #include <vector>

 #ifdef USE_COMPOSABLE_KERNEL
+#include "core/providers/rocm/composable_kernel_common.h"
+
 #include "ck/ck.hpp"
 #include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp"
 #include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"

@@ -28,20 +30,7 @@ namespace internal {

 #ifdef USE_COMPOSABLE_KERNEL

-template <typename T>
-struct DataTypeAdaptor {
-  using type = T;
-};
-
-template <>
-struct DataTypeAdaptor<half> {
-  using type = ck::half_t;
-};
-
-template <>
-struct DataTypeAdaptor<BFloat16> {
-  using type = ck::bhalf16_t;
-};
+using onnxruntime::rocm::CKDataTypeAdaptor;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

@@ -52,7 +41,7 @@ using FastGelu = ck::tensor_operation::element_wise::FastGelu;

 template <typename T, typename ALayout, typename BLayout>
 auto GetCKGemmAddFastGeluTypeStringAndOps() {
-  using CKDataType = typename DataTypeAdaptor<T>::type;
+  using CKDataType = typename CKDataTypeAdaptor<T>::type;
   using DeviceGemmAddFastGelu = ck::tensor_operation::device::DeviceGemmMultipleD<
       ALayout, BLayout, ck::Tuple<Row>, Row,
       CKDataType, CKDataType, ck::Tuple<CKDataType>, CKDataType,

@@ -89,7 +78,7 @@ auto GetCKGemmAddFastGeluTypeStringAndOps() {

 template <typename T, typename ALayout, typename BLayout>
 auto GetCKGemmFastGeluTypeStringAndOps() {
-  using CKDataType = typename DataTypeAdaptor<T>::type;
+  using CKDataType = typename CKDataTypeAdaptor<T>::type;
   using DeviceGemmFastGelu = ck::tensor_operation::device::DeviceGemmMultipleD<
       ALayout, BLayout, ck::Tuple<>, Row,
       CKDataType, CKDataType, ck::Tuple<>, CKDataType,

onnxruntime/contrib_ops/rocm/diffusion/group_norm_ck.cuh

+6 -12

@@ -8,6 +8,8 @@
 #include <vector>

 #ifdef USE_COMPOSABLE_KERNEL
+#include "core/providers/rocm/composable_kernel_common.h"
+
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

@@ -22,15 +24,7 @@ namespace rocm {

 #ifdef USE_COMPOSABLE_KERNEL

-template <typename T>
-struct DataTypeAdaptor {
-  using type = T;
-};
-
-template <>
-struct DataTypeAdaptor<half> {
-  using type = ck::half_t;
-};
+using onnxruntime::rocm::CKDataTypeAdaptor;

 using Swish = ck::tensor_operation::element_wise::Swish;
 using Pass = ck::tensor_operation::element_wise::PassThrough;

@@ -40,9 +34,9 @@ constexpr int NumReduceDim = 3;

 template <typename T, typename AccT, bool WithSwish>
 auto GetCKGroupNormNHWCTypeStringAndOps() {
-  using InDataType = typename DataTypeAdaptor<T>::type;
-  using OutDataType = typename DataTypeAdaptor<T>::type;
-  using AccDataType = typename DataTypeAdaptor<AccT>::type;
+  using InDataType = typename CKDataTypeAdaptor<T>::type;
+  using OutDataType = typename CKDataTypeAdaptor<T>::type;
+  using AccDataType = typename CKDataTypeAdaptor<AccT>::type;
   using GammaDataType = float;
   using BetaDataType = float;
onnxruntime/core/providers/rocm/composable_kernel_common.h

+33 -0 (new file)

@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#ifdef USE_COMPOSABLE_KERNEL
+#include "ck/utility/data_type.hpp"
+#endif
+
+#include "core/providers/rocm/rocm_common.h"
+
+namespace onnxruntime {
+namespace rocm {
+
+#ifdef USE_COMPOSABLE_KERNEL
+template <typename T>
+struct CKDataTypeAdaptor {
+  using type = T;
+};
+
+template <>
+struct CKDataTypeAdaptor<half> {
+  using type = ck::half_t;
+};
+
+template <>
+struct CKDataTypeAdaptor<BFloat16> {
+  using type = ck::bhalf16_t;
+};
+#endif
+
+}  // namespace rocm
+}  // namespace onnxruntime

onnxruntime/core/providers/rocm/math/softmax_ck.cuh

+5 -18

@@ -8,6 +8,8 @@
 #include <vector>

 #ifdef USE_COMPOSABLE_KERNEL
+#include "core/providers/rocm/composable_kernel_common.h"
+
 #include "ck/ck.hpp"
 #include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
 #include "ck/tensor_operation/gpu/device/device_softmax.hpp"

@@ -22,30 +24,15 @@ namespace rocm {

 #ifdef USE_COMPOSABLE_KERNEL

-template <typename T>
-struct DataTypeAdaptor {
-  using type = T;
-};
-
-template <>
-struct DataTypeAdaptor<half> {
-  using type = ck::half_t;
-};
-
-template <>
-struct DataTypeAdaptor<BFloat16> {
-  using type = ck::bhalf16_t;
-};
-
 using Nop = ck::tensor_operation::element_wise::PassThrough;
 constexpr int Rank = 4;
 constexpr int NumReduceDim = 1;

 template <typename InputT, typename OutputT, typename AccT>
 auto GetCKSoftmaxTypeStringAndOps() {
-  using InDataType = typename DataTypeAdaptor<InputT>::type;
-  using OutDataType = typename DataTypeAdaptor<OutputT>::type;
-  using AccDataType = typename DataTypeAdaptor<AccT>::type;
+  using InDataType = typename CKDataTypeAdaptor<InputT>::type;
+  using OutDataType = typename CKDataTypeAdaptor<OutputT>::type;
+  using AccDataType = typename CKDataTypeAdaptor<AccT>::type;
   using DeviceSoftmax = ck::tensor_operation::device::
       DeviceSoftmax<InDataType, AccDataType, OutDataType, Nop, Nop, Rank>;
   using InstanceFactory = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<DeviceSoftmax>;

onnxruntime/core/providers/rocm/tunable/gemm_ck.cuh

+4 -17

@@ -8,6 +8,8 @@
 #include <vector>

 #ifdef USE_COMPOSABLE_KERNEL
+#include "core/providers/rocm/composable_kernel_common.h"
+
 #include "ck/ck.hpp"
 #include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"
 #include "ck/library/tensor_operation_instance/gpu/gemm.hpp"

@@ -27,29 +29,14 @@ namespace internal {

 #ifdef USE_COMPOSABLE_KERNEL

-template <typename T>
-struct DataTypeAdaptor {
-  using type = T;
-};
-
-template <>
-struct DataTypeAdaptor<half> {
-  using type = ck::half_t;
-};
-
-template <>
-struct DataTypeAdaptor<BFloat16> {
-  using type = ck::bhalf16_t;
-};
-
 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

 using Nop = ck::tensor_operation::element_wise::PassThrough;

 template <typename T, typename ALayout, typename BLayout>
 auto GetCKGemmTypeStringAndOps() {
-  using CKDataType = typename DataTypeAdaptor<T>::type;
+  using CKDataType = typename CKDataTypeAdaptor<T>::type;
   using DeviceGemm = ck::tensor_operation::device::DeviceGemm<
       ALayout, BLayout, Row,
       CKDataType, CKDataType, CKDataType,

@@ -95,7 +82,7 @@ auto GetCKGemmTypeStringAndOps() {

 template <typename T, typename ALayout, typename BLayout>
 auto GetCKStridedBatchedGemmTypeStringAndOps() {
-  using CKDataType = typename DataTypeAdaptor<T>::type;
+  using CKDataType = typename CKDataTypeAdaptor<T>::type;
   using DeviceStridedBatchedGemm = ck::tensor_operation::device::DeviceBatchedGemm<
       ALayout, BLayout, Row,
       CKDataType, CKDataType, CKDataType,
