Commit 93ee7bf

[QNN EP] MatMul+Add->Gemm fusion when AttentionFusion isn't enabled (#25017)
### Description

MatMul+Add->Gemm fusion when AttentionFusion isn't enabled.

### Motivation and Context

The graph transformation [MatMulAddFusion](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/optimizer/matmul_add_fusion.cc) folds an `ONNX::MatMul` followed by an `ONNX::Add` into an `ONNX::Gemm`; however, it [intentionally skips the portion that belongs to the "Attention Pattern"](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/optimizer/matmul_add_fusion.cc#L21). This results in poor performance on the QNN EP (and on other EPs that do not run the *AttentionFusion transformers) due to unfused MatMul + Add pairs.

![image](https://github.com/user-attachments/assets/cad0b2c6-ab07-4ced-a647-396c04fed365)

With this change, the remaining MatMul + Add pairs are fused into Gemm *after* the *AttentionFusion transformers run.
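For intuition, here is a small standalone C++ sketch (illustrative only, not part of this commit) of why the rewrite is safe: for a 2D input X of shape [M, K], weight W of shape [K, N], and bias B of shape [N], Gemm(X, W, B) with alpha = beta = 1 computes exactly Add(MatMul(X, W), B).

```cpp
// Illustrative sketch only: checks that MatMul followed by a broadcast Add of the
// bias equals Gemm with C = B and alpha = beta = 1, which is what the fusion relies on.
#include <array>
#include <cassert>

int main() {
  constexpr int M = 2, K = 3, N = 2;
  const std::array<float, M * K> X = {1, 2, 3, 4, 5, 6};
  const std::array<float, K * N> W = {1, 0, 0, 1, 1, 1};
  const std::array<float, N> B = {0.5f, -0.5f};

  std::array<float, M * N> mm{};    // MatMul(X, W)
  std::array<float, M * N> gemm{};  // Gemm(X, W, B)
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) acc += X[m * K + k] * W[k * N + n];
      mm[m * N + n] = acc;
      gemm[m * N + n] = acc + B[n];
    }
  }
  // The unfused graph computes Add(MatMul(X, W), B); both paths agree elementwise.
  for (int i = 0; i < M * N; ++i) assert(mm[i] + B[i % N] == gemm[i]);
  return 0;
}
```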
1 parent f3c18ed commit 93ee7bf

File tree: 7 files changed, +64 -13 lines changed


onnxruntime/core/optimizer/graph_transformer_mgr.cc

Lines changed: 5 additions & 1 deletion
```diff
@@ -4,6 +4,9 @@
 #include "core/optimizer/graph_transformer_mgr.h"
 #include "core/optimizer/rule_based_graph_transformer.h"
 
+#include <memory>
+#include <utility>
+
 using namespace onnxruntime;
 using namespace ::onnxruntime::common;
 
@@ -60,7 +63,8 @@ void GraphTransformerManager::ClearGraphModified(void) {
 common::Status GraphTransformerManager::Register(std::unique_ptr<GraphTransformer> transformer,
                                                  TransformerLevel level) {
   const auto& name = transformer->Name();
-  if (transformers_info_.find(name) != transformers_info_.end()) {
+  const auto& registered = level_to_transformer_map_[level];
+  if (std::find(registered.begin(), registered.end(), transformer) != registered.end()) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "This transformer is already registered " + name);
   }
```
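With this change, the duplicate check is scoped to a single level, which is what allows the new test below to register MatMulAddFusion at both Level1 and Level2. A minimal usage sketch mirroring that test (assumes onnxruntime's internal headers and the test suite's ASSERT_STATUS_OK macro):

```cpp
// Same transformer name registered at two different levels: accepted after this
// change. The old name-keyed transformers_info_ lookup rejected the second call.
onnxruntime::GraphTransformerManager mgr{/*steps=*/5};
const InlinedHashSet<std::string_view> empty_list = {};
ASSERT_STATUS_OK(mgr.Register(
    std::make_unique<MatMulAddFusion>(empty_list, /*preserve_attention_pattern=*/true),
    TransformerLevel::Level1));
ASSERT_STATUS_OK(mgr.Register(
    std::make_unique<MatMulAddFusion>(empty_list, /*preserve_attention_pattern=*/false),
    TransformerLevel::Level2));
```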

onnxruntime/core/optimizer/graph_transformer_utils.cc

Lines changed: 4 additions & 5 deletions
```diff
@@ -215,6 +215,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
   const InlinedHashSet<std::string_view> cpu_acl_eps = {onnxruntime::kCpuExecutionProvider,
                                                         onnxruntime::kAclExecutionProvider};
 #endif
+  const InlinedHashSet<std::string_view> no_limit_empty_ep_list = {};
   const InlinedHashSet<std::string_view> dml_ep = {onnxruntime::kDmlExecutionProvider};
   AllocatorPtr cpu_allocator = CPUAllocator::DefaultInstance();
 
@@ -243,7 +244,6 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
       for (const auto& p : session_options.initializers_to_share_map) {
         excluded_initializers.insert(p.first);
       }
-      const InlinedHashSet<std::string_view> no_limit_empty_ep_list = {};
       transformers.emplace_back(std::make_unique<ConstantSharing>(no_limit_empty_ep_list, excluded_initializers));
       transformers.emplace_back(std::make_unique<CommonSubexpressionElimination>());
       transformers.emplace_back(std::make_unique<ConstantFolding>(cpu_execution_provider, !disable_quant_qdq,
@@ -363,14 +363,13 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
       transformers.emplace_back(std::make_unique<EmbedLayerNormFusion>(cpu_acl_cuda_dml_rocm_eps));
       transformers.emplace_back(std::make_unique<GatherSliceToSplitFusion>(cpu_cuda_rocm_eps));
       transformers.emplace_back(std::make_unique<GatherToSliceFusion>(cpu_cuda_rocm_eps));
-
       transformers.emplace_back(std::make_unique<MatmulTransposeFusion>(cpu_cuda_dml_rocm_eps));
       transformers.emplace_back(std::make_unique<BiasGeluFusion>(cpu_acl_cuda_dml_rocm_eps));
-
       transformers.emplace_back(std::make_unique<GroupQueryAttentionFusion>(cuda_eps));
-
+      // Run MatMulAddFusion again after *AttentionFusion transforms with `preserve_attention_pattern = false`,
+      // to cleanup the remaining MatMul-Add that were part of the attention pattern but not detected or fused.
+      transformers.emplace_back(std::make_unique<MatMulAddFusion>(no_limit_empty_ep_list, false));
       transformers.emplace_back(std::make_unique<SkipLayerNormFusion>(cpu_acl_cuda_dml_rocm_eps));
-
       transformers.emplace_back(std::make_unique<FastGeluFusion>(cpu_cuda_dml_rocm_eps));
       transformers.emplace_back(std::make_unique<QuickGeluFusion>(cpu_acl_cuda_dml_rocm_eps));
```

onnxruntime/core/optimizer/matmul_add_fusion.cc

Lines changed: 10 additions & 4 deletions
```diff
@@ -1,11 +1,17 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/common/inlined_containers.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/graph/graph_utils.h"
+#include "core/optimizer/graph_transformer_utils.h"
 #include "core/optimizer/initializer.h"
 #include "core/optimizer/matmul_add_fusion.h"
-#include "core/graph/graph_utils.h"
-#include "core/framework/tensorprotoutils.h"
-#include <deque>
+
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
 
 using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::common;
@@ -128,7 +134,7 @@ Status MatMulAddFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level,
   int64_t m = 0, k = 0, n = 0;
   if (need_reshape) {
     // Only check and skip Attention pattern here because normally input to Attention is 4D.
-    if (attn_pattern_cache.IsAttentionPattern(graph, matmul_node, add_node)) {
+    if (preserve_attention_pattern_ && attn_pattern_cache.IsAttentionPattern(graph, matmul_node, add_node)) {
       continue;
     }
```

onnxruntime/core/optimizer/matmul_add_fusion.h

Lines changed: 7 additions & 2 deletions
```diff
@@ -9,10 +9,15 @@ namespace onnxruntime {
 
 class MatMulAddFusion : public GraphTransformer {
  public:
-  MatMulAddFusion(const InlinedHashSet<std::string_view>& compatible_execution_providers = {}) noexcept
-      : GraphTransformer("MatMulAddFusion", compatible_execution_providers) {}
+  MatMulAddFusion(const InlinedHashSet<std::string_view>& compatible_execution_providers = {},
+                  const bool preserve_attention_pattern = true) noexcept
+      : GraphTransformer("MatMulAddFusion", compatible_execution_providers),
+        preserve_attention_pattern_(preserve_attention_pattern) {}
 
   Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
+
+ private:
+  bool preserve_attention_pattern_;
 };
 
 }  // namespace onnxruntime
```
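Since `preserve_attention_pattern` defaults to true, existing call sites keep their old behavior; only the new post-AttentionFusion cleanup pass opts out. A hedged construction sketch (assumes the internal onnxruntime headers):

```cpp
// Default: attention-pattern MatMul + Add pairs are left alone for *AttentionFusion.
auto default_pass = std::make_unique<MatMulAddFusion>();
// Cleanup pass added in graph_transformer_utils.cc above: fuse whatever remains.
auto cleanup_pass = std::make_unique<MatMulAddFusion>(
    InlinedHashSet<std::string_view>{}, /*preserve_attention_pattern=*/false);
```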

onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc

Lines changed: 1 addition & 1 deletion
```diff
@@ -12,7 +12,7 @@ namespace qnn {
 
 /**
  * An ONNX MatMul can be translated to either a QNN MatMul or a QNN FullyConnected.
- * ONNX's MatMul suports inputs of rank 1, but neither QNN's MatMul nor FullyConnected support two rank 1 inputs.
+ * ONNX's MatMul supports inputs of rank 1, but neither QNN's MatMul nor FullyConnected support two rank 1 inputs.
  * So, we need to add Reshape Ops if necessary.
  * In two cases, FullyConnected (input_1's shape is [n, k]) is used instead of MatMul without extra Transpose Op:
  * 1. input_1 is a rank 2 initializer.
```

onnxruntime/test/optimizer/graph_transform_test.cc

Lines changed: 37 additions & 0 deletions
```diff
@@ -2696,6 +2696,43 @@ TEST_F(GraphTransformationTests, MatMulAddFusion_NeedReshape_3D) {
                                           1, pre_graph_checker, post_graph_checker));
 }
 
+// With attention pattern, but targeting an execution provider that does not perform
+// AttentionFusion, fuse into GEMM should still be happen, rather than skipping them
+TEST_F(GraphTransformationTests, MatMulAddFusion_PreserveAttentionPattern) {
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "matmul_add_fusion/matmul_add_from_attention.onnx";
+
+  std::shared_ptr<Model> p_model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
+
+  // This toy model contains 11 MatMul + Add pairs, 0 GEMMs.
+  // 7 of them are out of "Attention Pattern" (see MatMulAddFusion::IsAttentionPattern)
+  // 4 of them are in "Attention Pattern" conditionally skipped by MatMulAddFusion pass
+  OpCountMap op_count_before = CountOpsInGraph(p_model->MainGraph());
+  const InlinedHashSet<std::string_view> empty_list = {};
+
+  // In attention pattern, 4 MatMul + Add pairs should be preserved
+  Graph& graph = p_model->MainGraph();
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(
+      std::make_unique<MatMulAddFusion>(empty_list, /*preserve_attention_pattern=*/true), TransformerLevel::Level1));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_));
+  OpCountMap op_count_cpu_ep = CountOpsInGraph(graph);
+  constexpr int expected_fusions1 = 11 - 4;
+  ASSERT_EQ(op_count_cpu_ep["MatMul"], op_count_before["MatMul"] - expected_fusions1);
+  ASSERT_EQ(op_count_cpu_ep["Add"], op_count_before["Add"] - expected_fusions1);
+  ASSERT_EQ(op_count_cpu_ep["Gemm"], op_count_before["Gemm"] + expected_fusions1);
+
+  // In attention pattern, 4 MatMul + Add pairs should be fused into Gemm
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(
+      std::make_unique<MatMulAddFusion>(empty_list, /*preserve_attention_pattern=*/false), TransformerLevel::Level2));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
+  OpCountMap op_count_qnn_ep = CountOpsInGraph(graph);
+  constexpr int expected_fusions2 = 11;
+  ASSERT_EQ(op_count_qnn_ep["MatMul"], op_count_before["MatMul"] - expected_fusions2);
+  ASSERT_EQ(op_count_qnn_ep["Add"], op_count_before["Add"] - expected_fusions2);
+  ASSERT_EQ(op_count_qnn_ep["Gemm"], op_count_before["Gemm"] + expected_fusions2);
+}
+
 #ifndef DISABLE_CONTRIB_OPS
 TEST_F(GraphTransformationTests, Gemm_Relu_three_input) {
   constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "matmul_add_fusion/3Input/gemm_relu.onnx";
```
