Commit 685e5b0

NhwcFusedConv: Add before Activation (#15837)
### Description
Fp16 FusedConv and NhwcFusedConv: the fused Add operator should be performed BEFORE the activation operator.

### Motivation and Context
The previous understanding of the fused conv semantics was incorrect.
1 parent 003c7d3 commit 685e5b0
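A minimal scalar sketch of the corrected epilogue ordering, using fp32 and ReLU as a stand-in for the fused activation (the names below are illustrative, not the onnxruntime API):

```cpp
#include <algorithm>
#include <cstddef>

static float Relu(float v) { return std::max(v, 0.0f); }

// Previously the kernels effectively computed y[i] = Relu(conv_out[i]) + sum[i].
// With this change the fused Add is applied first, then the activation:
void FusedConvEpilogue(float* y, const float* conv_out, const float* sum, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    y[i] = Relu(conv_out[i] + sum[i]);  // Add BEFORE activation
  }
}
```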

File tree

4 files changed: +37 -34 lines changed

onnxruntime/core/mlas/inc/mlas.h (+1 -1)

@@ -1439,7 +1439,7 @@ class MLAS_HALF_GEMM_POSTPROCESSOR {
 /**
  * @brief Half precision activation functions, with optional sum tensor.
  *        Supplied sum tensor must be the same layout as the GEMM output tensor.
- *        And the supplied sum tensor will be added to the final result.
+ *        And the supplied sum tensor will be added to the tensor before activation.
  */
 class MLAS_HALF_GEMM_ACTIVATION_PROCESSOR : public MLAS_HALF_GEMM_POSTPROCESSOR
 {
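For context, a hedged usage sketch of the post-processor documented above, pieced together from the constructor and Process calls that appear elsewhere in this commit (fp16_conv.cc and the unit test); treat it as an illustration rather than authoritative API documentation:

```cpp
#include "mlas.h"

// With a non-null sum tensor, Process() now computes, per element,
//   C = Activation(C + SumTensor)
// i.e. the sum is added before the activation runs.
void ApplySumThenActivation(MLAS_FP16* C, const MLAS_FP16* SumTensor,
                            size_t CountM, size_t CountN, size_t ldc) {
  MLAS_ACTIVATION Activation;
  Activation.ActivationKind = MlasReluActivation;  // assumption: any supported kind works here

  MLAS_HALF_GEMM_ACTIVATION_PROCESSOR proc(Activation, SumTensor);
  proc.Process(C, /*StartM=*/0, /*StartN=*/0, CountM, CountN, ldc);
}
```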

onnxruntime/core/mlas/lib/activate_fp16.cpp (+11 -11)

@@ -689,35 +689,35 @@ MlasActivationKernel(
         size_t n = CountN;
 
         while (n >= 8) {
-            MLAS_FLOAT16X8 Vector = MlasLoadFloat16x8(buffer);
             MLAS_FLOAT16X8 AVec = MlasLoadFloat16x8(addsrc);
+            MLAS_FLOAT16X8 Vector = MlasLoadFloat16x8(buffer);
             addsrc += 8;
-            Vector = ActivationFunction.Activate(Vector);
             Vector = MlasAddFloat16x8(Vector, AVec);
+            Vector = ActivationFunction.Activate(Vector);
             MlasStoreFloat16x8(buffer, Vector);
             buffer += 8;
             n -= 8;
         }
 
         if (n >= 4) {
-            MLAS_FLOAT16X4 Vector = MlasLoadFloat16x4(buffer);
             MLAS_FLOAT16X4 AVec = MlasLoadFloat16x4(addsrc);
+            MLAS_FLOAT16X4 Vector = MlasLoadFloat16x4(buffer);
             addsrc += 4;
-            Vector = ActivationFunction.Activate(Vector);
             Vector = MlasAddFloat16x4(Vector, AVec);
+            Vector = ActivationFunction.Activate(Vector);
             MlasStoreFloat16x4(buffer, Vector);
             buffer += 4;
             n -= 4;
         }
 
         if (n > 0) {
-            MLAS_FLOAT16X4 buf;
-            std::memcpy(&buf, buffer, n * sizeof(_mlas_fp16_));
             MLAS_FLOAT16X4 addbuf;
+            MLAS_FLOAT16X4 buf;
             std::memcpy(&addbuf, addsrc, n * sizeof(_mlas_fp16_));
-            MLAS_FLOAT16X4 res = ActivationFunction.Activate(buf);
-            res = MlasAddFloat16x4(res, addbuf);
-            MlasStorePartialFloat16x4(buffer, res, n);
+            std::memcpy(&buf, buffer, n * sizeof(_mlas_fp16_));
+            buf = MlasAddFloat16x4(buf, addbuf);
+            buf = ActivationFunction.Activate(buf);
+            MlasStorePartialFloat16x4(buffer, buf, n);
         }
 
         CRow += ldc;

@@ -858,8 +858,6 @@ MLAS_HALF_GEMM_ACTIVATION_PROCESSOR::Process(
     ) const
 {
     std::vector<float> buffer(CountM*CountN);
-    MLAS_HALF_GEMM_2FLOAT_PROCESSOR proc(this->Activation_, buffer.data(), CountN);
-    proc.Process(C, StartM, StartN, CountM, CountN, ldc);
 
     _mlas_fp16_* Output = reinterpret_cast<_mlas_fp16_*>(C);
     auto* CRow = buffer.data();

@@ -876,6 +874,8 @@ MLAS_HALF_GEMM_ACTIVATION_PROCESSOR::Process(
             }
             CAdd += ldc;
         }
+        MlasActivation(&this->Activation_, CRow, nullptr, 1, CountN, CountN);
+
         CvtFloat2Half(Output, CRow, CountN);
         CRow += CountN;
         Output += ldc;
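The same reordering, shown as a plain fp32 analogue of the kernel above (full blocks of 8 and 4, then a memcpy-staged partial block so the tail also adds before activating); this is an illustrative sketch, not the MLAS NEON code:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstring>

static float Relu(float v) { return std::max(v, 0.0f); }

void AddThenActivateRow(float* buffer, const float* addsrc, size_t n) {
  while (n >= 8) {
    for (int i = 0; i < 8; ++i) buffer[i] = Relu(buffer[i] + addsrc[i]);
    buffer += 8; addsrc += 8; n -= 8;
  }
  if (n >= 4) {
    for (int i = 0; i < 4; ++i) buffer[i] = Relu(buffer[i] + addsrc[i]);
    buffer += 4; addsrc += 4; n -= 4;
  }
  if (n > 0) {
    float buf[4] = {0.0f}, addbuf[4] = {0.0f};
    std::memcpy(buf, buffer, n * sizeof(float));     // partial load of the tail
    std::memcpy(addbuf, addsrc, n * sizeof(float));
    for (size_t i = 0; i < n; ++i) buf[i] = Relu(buf[i] + addbuf[i]);  // Add, then activate
    std::memcpy(buffer, buf, n * sizeof(float));     // partial store back
  }
}
```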

onnxruntime/core/providers/cpu/fp16/fp16_conv.cc (+13 -17)

@@ -32,7 +32,7 @@ using ConvPadVector = ConvAttributes::ConvPadVector;
  * 2. Activation
  * It takes an operator attribute 'activation', which supplies the activation info.
  *
- * Add is performed AFTER activation.
+ * Add is performed BEFORE activation.
  *
  * The implementation supports both NCHW and NHWC. It runs faster with NHWC.
  *

@@ -281,12 +281,10 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
   if (Y->Shape().Size() == 0) {
     return Status::OK();
   }
-  if (Sum) {
-    if (Sum->Shape() != Y->Shape()) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Z shape does not match output shape.",
-                             " Z: ", Sum->Shape().ToString().c_str(),
-                             " Output: ", Y->Shape().ToString().c_str());
-    }
+  if (Sum && Sum->Shape() != Y->Shape()) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Z shape does not match output shape.",
+                           " Z: ", Sum->Shape().ToString().c_str(),
+                           " Output: ", Y->Shape().ToString().c_str());
   }
 
   const int64_t input_image_size = input_shape.Size();

@@ -338,7 +336,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
   const auto* Xdata = X->Data<MLFloat16>();
   const auto* Bdata = B != nullptr ? B->Data<MLFloat16>() : nullptr;
   auto* Ydata = Y->MutableData<MLFloat16>();
-  const auto* SumData = Sum != nullptr ? Sum->Data<MLFloat16>() : nullptr;
+  const auto* sum_data = Sum != nullptr ? Sum->Data<MLFloat16>() : nullptr;
 
   BufferUniquePtr transpose_input_buffer;
   BufferUniquePtr transpose_output_buffer;

@@ -409,7 +407,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
   for (int64_t image_id = 0; image_id < N; ++image_id) {
     const auto* input_data = Xdata;
     auto* output_data = Ydata;
-    const auto* add_src = SumData;
+    const auto* add_src = sum_data;
 
     if (!channels_last_) {
       // Transpose the input from channels first (CHW) to channels last (HWC).

@@ -478,7 +476,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
           static_cast<size_t>(M),
           static_cast<size_t>(output_count),
           static_cast<size_t>(kernel_size),
-          &act);
+          (!channels_last_ && sum_data) ? nullptr : &act);
     } else {
       for (int64_t group_id = 0; group_id < group_count; ++group_id) {
         // Prepare the im2col transformation or use the input buffer directly for

@@ -554,7 +552,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
         gemm_params.C = worker_output + group_id * group_output_channels;
         gemm_params.ldc = static_cast<size_t>(M);
         gemm_params.Bias = Bdata;
-        gemm_params.OutputProcessor = &act;  // process fused activation and add
+        gemm_params.OutputProcessor = (!channels_last_ && sum_data) ? nullptr : &act;  // process fused activation and add
 
         MlasHalfGemmBatch(
             static_cast<size_t>(output_count),

@@ -574,10 +572,8 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
           Ydata,
           static_cast<size_t>(output_image_size),
           static_cast<size_t>(M));
-      if (SumData != nullptr) {
-        MLAS_ACTIVATION activation;
-        activation.ActivationKind = MlasIdentityActivation;
-        MLAS_HALF_GEMM_ACTIVATION_PROCESSOR proc(activation, SumData);
+      if (sum_data != nullptr) {
+        MLAS_HALF_GEMM_ACTIVATION_PROCESSOR proc(activation_, sum_data);
         proc.Process(Ydata, 0, 0, static_cast<size_t>(M),
                      static_cast<size_t>(output_image_size),
                      static_cast<size_t>(output_image_size));

@@ -586,8 +582,8 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
 
     Xdata += X_offset;
     Ydata += Y_offset;
-    if (SumData != nullptr) {
-      SumData += Y_offset;
+    if (sum_data != nullptr) {
+      sum_data += Y_offset;
     }
   }
 
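In the channels-first path the Sum operand apparently cannot be consumed inside the GEMM epilogue (the GEMM output has not been transposed back yet), so the OutputProcessor is disabled and the add+activation runs afterwards over the transposed output. A small self-contained check of the assumption this deferral relies on, namely that the element-wise epilogue gives the same values whether it is fused into the GEMM output processing or applied as a later pass; ReLU and the buffers below are illustrative only:

```cpp
#include <algorithm>
#include <cassert>
#include <cstddef>

static float Relu(float v) { return std::max(v, 0.0f); }

int main() {
  const float gemm_out[6] = {-1.5f, 0.25f, 2.0f, -0.5f, 3.0f, -2.0f};
  const float sum[6]      = { 1.0f, -1.0f, 0.5f,  2.0f, -4.0f,  1.0f};

  // Fused path: the epilogue runs element by element as the GEMM finishes.
  float fused[6];
  for (size_t i = 0; i < 6; ++i) fused[i] = Relu(gemm_out[i] + sum[i]);

  // Deferred path: keep the raw GEMM output, then run the same
  // add-before-activation pass over the whole buffer afterwards.
  float deferred[6];
  for (size_t i = 0; i < 6; ++i) deferred[i] = gemm_out[i];
  for (size_t i = 0; i < 6; ++i) deferred[i] = Relu(deferred[i] + sum[i]);

  for (size_t i = 0; i < 6; ++i) assert(fused[i] == deferred[i]);
  return 0;
}
```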

onnxruntime/test/mlas/unittest/test_fp16_activation.cpp (+12 -5)

@@ -70,6 +70,8 @@ class MlasFp16ActivationTest : public MlasTestBase {
     auto addonData = AddonBuffer.GetBuffer(M * N, true);
     MatrixGuardBuffer<float> FloatBuffer;
     auto* fpBuffer = FloatBuffer.GetBuffer(M * N, true);
+    MatrixGuardBuffer<float> FloatBuffer1;
+    auto* fpAddBuffer = FloatBuffer1.GetBuffer(M * N, true);
 
     size_t o = 3;
     for (size_t i = 0; i < M * N; i++) {

@@ -88,7 +90,6 @@ class MlasFp16ActivationTest : public MlasTestBase {
 
     MLAS_ACTIVATION Activation;
     MLAS_HALF_GEMM_ACTIVATION_PROCESSOR proc(Activation, nullptr);
-    MLAS_HALF_GEMM_2FLOAT_PROCESSOR converter(Activation, fpBuffer, N);
     MLAS_HALF_GEMM_ACTIVATION_PROCESSOR addon(Activation, reinterpret_cast<const MLAS_FP16*>(addonData));
     for (auto kind : acts) {
       Activation.ActivationKind = MLAS_ACTIVATION_KIND(kind);

@@ -111,17 +112,23 @@ class MlasFp16ActivationTest : public MlasTestBase {
         testData1[i] = TestData[i].f;
         testData2[i] = TestData[i].f;
         testData3[i] = TestData[i].f;
+        fpBuffer[i] = TestData[i].f;
+        fpAddBuffer[i] = TestData[i].f + addonData[i].ToFloat();
       }
       size_t offset = 7;
       for (size_t i = _countof(TestData); i < M * N; i++) {
         offset = (offset + 19) % 23;
-        testData1[i] = (MinimumFillValue + offset) / 16.0f;
+        float f = (MinimumFillValue + offset) / 16.0f;
+        testData1[i] = f;
         testData2[i] = testData1[i];
         testData3[i] = testData1[i];
+        fpBuffer[i] = f;
+        fpAddBuffer[i] = f + addonData[i].ToFloat();
       }
 
       proc.Process(reinterpret_cast<MLAS_FP16*>(testData1), 0, 0, M, N, N);
-      converter.Process(reinterpret_cast<MLAS_FP16*>(testData2), 0, 0, M, N, N);
+      MlasActivation(&Activation, fpBuffer, nullptr, M, N, N);
+      MlasActivation(&Activation, fpAddBuffer, nullptr, M, N, N);
       addon.Process(reinterpret_cast<MLAS_FP16*>(testData3), 0, 0, M, N, N);
 
       for (size_t i = 0; i < M * N; i++) {

@@ -131,8 +138,8 @@ class MlasFp16ActivationTest : public MlasTestBase {
             << std::setw(8) << std::setfill('0') << std::hex << actual << ", expecting:"
             << std::setw(8) << std::setfill('0') << std::hex << fpBuffer[i];
 
-        float addonActual = testData3[i].ToFloat() - addonData[i].ToFloat();
-        EXPECT_TRUE(check_equal(addonActual, fpBuffer[i]))
+        float addonActual = testData3[i].ToFloat();
+        EXPECT_TRUE(check_equal(addonActual, fpAddBuffer[i]))
             << ", Vector + Activation Kind:" << (int)kind << ", i=" << i << ", value:"
             << std::setw(8) << std::setfill('0') << std::hex << actual << ", expecting:"
            << std::setw(8) << std::setfill('0') << std::hex << fpBuffer[i];
