Commit 7ae606f

Qnn weight sharing improvement (#23945)
### Description

Improve QNN weight sharing so that only the last session in the weight-sharing group (the session with both share_ep_contexts and stop_share_ep_contexts enabled) generates the .bin file. The .bin file name is taken from the first session, and all generated *_ctx.onnx models point to this single .bin, so no post-processing is required.

Previously, every session generated a _ctx.onnx model with its own .bin file. That required post-processing: walking the generated *_ctx.onnx models to find the last generated *_ctx.bin, updating each *_ctx.onnx to point to that same .bin, and removing the unused .bin files.
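Concretely, a weight-sharing group is driven entirely from session options. Below is a minimal sketch of how the new flow might be exercised, assuming the session option keys `ep.context_enable`, `ep.share_ep_contexts`, and `ep.stop_share_ep_contexts` (the `kOrtSessionOption*` constants in `onnxruntime_session_options_config_keys.h`) and placeholder model/backend paths:

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env;

  auto make_options = [](bool last_in_group) {
    Ort::SessionOptions so;
    so.AddConfigEntry("ep.context_enable", "1");     // dump a *_ctx.onnx model
    so.AddConfigEntry("ep.share_ep_contexts", "1");  // join the weight-sharing group
    if (last_in_group) {
      // Only this session writes the single shared .bin (the new behavior).
      so.AddConfigEntry("ep.stop_share_ep_contexts", "1");
    }
    so.AppendExecutionProvider("QNN", {{"backend_path", "QnnHtp.dll"}});
    return so;
  };

  // Both *_ctx.onnx models reference the .bin named after the first session;
  // only the second (last) session actually writes it to disk.
  Ort::Session s1(env, ORT_TSTR("model1.onnx"), make_options(/*last_in_group=*/false));
  Ort::Session s2(env, ORT_TSTR("model2.onnx"), make_options(/*last_in_group=*/true));
}
```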

File tree

10 files changed: +158 −257 lines

include/onnxruntime/core/session/onnxruntime_c_api.h (−3)

```diff
@@ -3674,9 +3674,6 @@ struct OrtApi {
  * Enable the float32 model to be inferenced with fp16 precision. Otherwise, it will be fp32 precision.
  * - "0": With fp32 precision.
  * - "1": Default. With fp16 precision.
- * "enable_htp_weight_sharing": Enable QNN weight sharing feature while compiling multiple graphs into one QNN context.
- * - "0": Default. Disabled.
- * - "1": Enabled.
  * "offload_graph_io_quantization": Offload graph input quantization and graph output dequantization to another
  * execution provider (typically CPU EP).
  * - "0": Disabled. QNN EP will handle quantization and dequantization of graph I/O.
```

onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc (+33 −7)

```diff
@@ -10,6 +10,7 @@
 #include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model.h"
+#include "core/providers/qnn/shared_context.h"
 
 namespace onnxruntime {
 namespace qnn {
@@ -207,7 +208,9 @@ Status CreateEPContextNodes(Model* model,
                             const onnxruntime::PathString& context_model_path,
                             bool qnn_context_embed_mode,
                             uint64_t max_spill_fill_buffer_size,
-                            const logging::Logger& logger) {
+                            const logging::Logger& logger,
+                            bool share_ep_contexts,
+                            bool stop_share_ep_contexts) {
   auto& graph = model->MainGraph();
 
   using namespace ONNX_NAMESPACE;
@@ -241,6 +244,7 @@ Status CreateEPContextNodes(Model* model,
       ep_node.AddAttribute(EP_CACHE_CONTEXT, cache_payload);
     } else {
       onnxruntime::PathString context_bin_path;
+      std::string context_cache_name;
       auto pos = context_model_path.find_last_of(ORT_TSTR("."));
       if (pos != std::string::npos) {
         context_bin_path = context_model_path.substr(0, pos);
@@ -253,14 +257,36 @@ Status CreateEPContextNodes(Model* model,
         graph_name_in_file.replace(name_pos, strlen(kQnnExecutionProvider), "");
       }
       context_bin_path = context_bin_path + ToPathString(graph_name_in_file + ".bin");
-      std::string context_cache_name(std::filesystem::path(context_bin_path).filename().string());
-      std::ofstream of_stream(context_bin_path.c_str(), std::ofstream::binary);
-      if (!of_stream) {
-        LOGS(logger, ERROR) << "Failed to open create context file.";
-        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to open context cache file.");
+      context_cache_name = std::filesystem::path(context_bin_path).filename().string();
+
+      // If generate ctx.onnx with share_ep_context enabled, all ctx.onnx should point to the same ctx.bin
+      if (share_ep_contexts) {
+        auto shared_ctx_bin_name = SharedContext::GetInstance().GetSharedCtxBinFileName();
+        if (shared_ctx_bin_name.empty()) {
+          SharedContext::GetInstance().SetSharedCtxBinFileName(context_cache_name);
+        } else {
+          context_cache_name = shared_ctx_bin_name;
+          auto model_folder_path = std::filesystem::path(context_bin_path).parent_path().string();
+          context_bin_path = ToPathString(model_folder_path + "/" + context_cache_name);
+        }
+      }
+
+      // Write the ctx.bin file for the case: 1. no share_ep_context enabled, write for every session
+      // 2. share_ep_context enabled, only write for the last session which has stop_share_ep_contexts enabled
+      if (!share_ep_contexts || (share_ep_contexts && stop_share_ep_contexts)) {
+        std::ofstream of_stream(context_bin_path.c_str(), std::ofstream::binary);
+        if (!of_stream) {
+          LOGS(logger, ERROR) << "Failed to open create context file.";
+          return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to open context cache file.");
+        }
+        of_stream.write(reinterpret_cast<char*>(buffer), buffer_size);
       }
-      of_stream.write(reinterpret_cast<char*>(buffer), buffer_size);
+
       ep_node.AddAttribute(EP_CACHE_CONTEXT, context_cache_name);
+      if (share_ep_contexts && stop_share_ep_contexts) {
+        SharedContext::GetInstance().ResetSharedCtxBinFileName();
+      }
+
       ep_node.AddAttribute(MAX_SIZE, static_cast<int64_t>(max_spill_fill_buffer_size));
     }
   } else {
```
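Read as a whole, the new branch implements a small "first session names the .bin, last session writes it" protocol. The following self-contained sketch models just that control flow; the global string and `ResolveBinName` helper are illustrative stand-ins for SharedContext, not ORT types:

```cpp
#include <filesystem>
#include <iostream>
#include <string>

static std::string g_shared_bin_name;  // plays the role of SharedContext state

// Returns the .bin file name a session embeds into its _ctx.onnx model, and
// prints whether that session would actually write the .bin to disk.
std::string ResolveBinName(const std::string& model_path, bool share, bool stop) {
  std::string name = std::filesystem::path(model_path).stem().string() + "_ctx.bin";
  if (share) {
    if (g_shared_bin_name.empty()) {
      g_shared_bin_name = name;  // first session in the group fixes the name
    } else {
      name = g_shared_bin_name;  // later sessions reuse it
    }
  }
  const bool writes_bin = !share || stop;  // non-shared: always; shared: only last
  std::cout << name << (writes_bin ? "  [writes .bin]" : "") << "\n";
  if (share && stop) g_shared_bin_name.clear();  // group finished, reset for the next one
  return name;
}

int main() {
  ResolveBinName("model1.onnx", /*share=*/true, /*stop=*/false);  // model1_ctx.bin
  ResolveBinName("model2.onnx", /*share=*/true, /*stop=*/false);  // model1_ctx.bin
  ResolveBinName("model3.onnx", /*share=*/true, /*stop=*/true);   // model1_ctx.bin [writes .bin]
}
```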

onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h (+3 −1)

```diff
@@ -65,6 +65,8 @@ Status CreateEPContextNodes(Model* model,
                             const onnxruntime::PathString& context_model_path,
                             bool qnn_context_embed_mode,
                             uint64_t max_spill_fill_buffer_size,
-                            const logging::Logger& logger);
+                            const logging::Logger& logger,
+                            bool share_ep_contexts,
+                            bool stop_share_ep_contexts);
 }  // namespace qnn
 }  // namespace onnxruntime
```

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc (+14 −4)

```diff
@@ -538,7 +538,7 @@ Status SetQnnContextConfig(ContextPriority context_priority, QnnContext_Config_t
   return Status::OK();
 }
 
-Status QnnBackendManager::CreateContext() {
+Status QnnBackendManager::CreateContext(bool enable_htp_weight_sharing) {
   if (true == context_created_) {
     LOGS_DEFAULT(INFO) << "Context created already.";
     return Status::OK();
@@ -547,7 +547,7 @@ Status QnnBackendManager::CreateContext() {
   QnnContext_Config_t context_config_weight_sharing = QNN_CONTEXT_CONFIG_INIT;
   QnnHtpContext_CustomConfig_t custom_config;
   custom_config.option = QNN_HTP_CONTEXT_CONFIG_OPTION_WEIGHT_SHARING_ENABLED;
-  custom_config.weightSharingEnabled = enable_htp_weight_sharing_;
+  custom_config.weightSharingEnabled = enable_htp_weight_sharing;
   context_config_weight_sharing.option = QNN_CONTEXT_CONFIG_OPTION_CUSTOM;
   context_config_weight_sharing.customConfig = &custom_config;
 
@@ -810,7 +810,8 @@ Status QnnBackendManager::LoadCachedQnnContextFromBuffer(char* buffer, uint64_t
 // or generate Qnn context binary is enabled -- to get the max spill fill buffer size
 Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
                                        bool load_from_cached_context,
-                                       bool need_load_system_lib) {
+                                       bool need_load_system_lib,
+                                       bool share_ep_contexts) {
   std::lock_guard<std::recursive_mutex> lock(logger_recursive_mutex_);
   if (backend_setup_completed_) {
     LOGS(logger, VERBOSE) << "Backend setup already!";
@@ -865,9 +866,18 @@ Status QnnBackendManager::SetupBackend(const logging::Logger& logger,
     LOGS(logger, VERBOSE) << "InitializeProfiling succeed.";
   }
 
+  bool enable_htp_weight_sharing = false;
+  if (share_ep_contexts && !load_from_cached_context) {
+#if defined(__aarch64__) || defined(_M_ARM64)
+    LOGS(logger, WARNING) << "Weight sharing only available with offline generation on x64 platform, not work on real device.";
+#else
+    enable_htp_weight_sharing = true;
+#endif
+  }
+
   if (!load_from_cached_context) {
     if (status.IsOK()) {
-      status = CreateContext();
+      status = CreateContext(enable_htp_weight_sharing);
     }
     if (status.IsOK()) {
       LOGS(logger, VERBOSE) << "CreateContext succeed.";
```

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h (+4 −6)

```diff
@@ -43,7 +43,6 @@ struct QnnBackendManagerConfig {
   uint32_t device_id;
   QnnHtpDevice_Arch_t htp_arch;
   uint32_t soc_model;
-  bool enable_htp_weight_sharing;
 };
 
 class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager> {
@@ -67,8 +66,7 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
         qnn_saver_path_(config.qnn_saver_path),
         device_id_(config.device_id),
         htp_arch_(config.htp_arch),
-        soc_model_(config.soc_model),
-        enable_htp_weight_sharing_(config.enable_htp_weight_sharing) {
+        soc_model_(config.soc_model) {
   }
 
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnBackendManager);
@@ -84,7 +82,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
 
   // Initializes handles to QNN resources (device, logger, etc.).
   // NOTE: This function locks the internal `logger_recursive_mutex_`.
-  Status SetupBackend(const logging::Logger& logger, bool load_from_cached_context, bool need_load_system_lib);
+  Status SetupBackend(const logging::Logger& logger, bool load_from_cached_context,
+                      bool need_load_system_lib, bool share_ep_contexts);
 
   Status CreateHtpPowerCfgId(uint32_t deviceId, uint32_t coreId, uint32_t& htp_power_config_id);
 
@@ -155,7 +154,7 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
 
   Status ReleaseProfilehandle();
 
-  Status CreateContext();
+  Status CreateContext(bool enable_htp_weight_sharing);
 
   Status ReleaseContext();
 
@@ -298,7 +297,6 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
   uint32_t device_id_ = 0;
   QnnHtpDevice_Arch_t htp_arch_ = QNN_HTP_DEVICE_ARCH_NONE;
   uint32_t soc_model_ = QNN_SOC_MODEL_UNKNOWN;
-  bool enable_htp_weight_sharing_ = false;
 };
 
 }  // namespace qnn
```

onnxruntime/core/providers/qnn/qnn_execution_provider.cc (+9 −17)

```diff
@@ -337,19 +337,8 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     LOGS_DEFAULT(VERBOSE) << "User specified enable_htp_fp16_precision: " << enable_HTP_FP16_precision_;
   }
 
-  bool enable_htp_weight_sharing = false;
-  static const std::string QNN_HTP_WEIGHT_SHARING_ENABLED = "enable_htp_weight_sharing";
-  auto htp_weight_sharing_enabled_pos = provider_options_map.find(QNN_HTP_WEIGHT_SHARING_ENABLED);
-  if (htp_weight_sharing_enabled_pos != provider_options_map.end()) {
-    if ("1" == htp_weight_sharing_enabled_pos->second) {
-      enable_htp_weight_sharing = true;
-    } else if ("0" == htp_weight_sharing_enabled_pos->second) {
-      enable_htp_weight_sharing = false;
-    } else {
-      LOGS_DEFAULT(VERBOSE) << "Invalid enable_htp_weight_sharing: " << enable_htp_weight_sharing
-                            << " only 0 or 1 allowed. Set to 0.";
-    }
-    LOGS_DEFAULT(VERBOSE) << "User specified enable_htp_weight_sharing: " << enable_htp_weight_sharing;
+  if (qnn_context_embed_mode_ && share_ep_contexts_) {
+    LOGS_DEFAULT(ERROR) << "[EP context generation:] Weight sharing enabled conflict with EP context embed mode. Inference will not work as expected!";
   }
 
   // Add this option because this feature requires QnnSystem lib and it's no supported for Windows x86_64 platform
@@ -406,8 +395,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
                                            qnn_saver_path,
                                            device_id_,
                                            htp_arch,
-                                           soc_model,
-                                           enable_htp_weight_sharing});
+                                           soc_model});
 }
 
 #if defined(_WIN32)
@@ -701,7 +689,9 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer
   // It will load the QnnSystem lib if is_qnn_ctx_model=true, and
   // delay the Qnn context creation to Compile() using the cached context binary
   // or generate context cache enable, need to use use QnnSystem lib to parse the binary to get the max spill fill buffer size
-  auto rt = qnn_backend_manager_->SetupBackend(logger, is_qnn_ctx_model, context_cache_enabled_ && enable_spill_fill_buffer_);
+  auto rt = qnn_backend_manager_->SetupBackend(logger, is_qnn_ctx_model,
+                                               context_cache_enabled_ && enable_spill_fill_buffer_,
+                                               share_ep_contexts_);
   if (Status::OK() != rt) {
     LOGS(logger, ERROR) << "QNN SetupBackend failed " << rt.ErrorMessage();
     return result;
@@ -1051,7 +1041,9 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
                                        context_model_path,
                                        qnn_context_embed_mode_,
                                        max_spill_fill_buffer_size,
-                                       logger));
+                                       logger,
+                                       share_ep_contexts_,
+                                       stop_share_ep_contexts_));
 
   if (share_ep_contexts_ && !stop_share_ep_contexts_ &&
       nullptr == SharedContext::GetInstance().GetSharedQnnBackendManager()) {
```
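The new constructor check flags a combination that cannot work: embed mode stores the QNN context binary inside each generated _ctx.onnx, while weight sharing requires all _ctx.onnx models to reference one external .bin. A minimal sketch of the configuration being rejected, assuming the `ep.context_embed_mode` key from `onnxruntime_session_options_config_keys.h` and a placeholder backend path:

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  so.AddConfigEntry("ep.context_enable", "1");
  so.AddConfigEntry("ep.context_embed_mode", "1");  // embed QNN context inside each _ctx.onnx
  so.AddConfigEntry("ep.share_ep_contexts", "1");   // conflicts: sharing needs one external .bin,
                                                    // so QNN EP now logs an ERROR for this combo
  so.AppendExecutionProvider("QNN", {{"backend_path", "QnnHtp.dll"}});
}
```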

onnxruntime/core/providers/qnn/shared_context.h (+18)

```diff
@@ -84,6 +84,21 @@ class SharedContext {
     qnn_backend_manager_.reset();
   }
 
+  void SetSharedCtxBinFileName(std::string& shared_ctx_bin_file_name) {
+    const std::lock_guard<std::mutex> lock(mtx_);
+    shared_ctx_bin_file_name_ = shared_ctx_bin_file_name;
+  }
+
+  const std::string& GetSharedCtxBinFileName() {
+    const std::lock_guard<std::mutex> lock(mtx_);
+    return shared_ctx_bin_file_name_;
+  }
+
+  void ResetSharedCtxBinFileName() {
+    const std::lock_guard<std::mutex> lock(mtx_);
+    shared_ctx_bin_file_name_.clear();
+  }
+
  private:
   SharedContext() = default;
   ~SharedContext() = default;
@@ -94,6 +109,9 @@ class SharedContext {
   std::vector<std::unique_ptr<qnn::QnnModel>> shared_qnn_models_;
   // Used for compiling multiple models into same QNN context binary
   std::shared_ptr<qnn::QnnBackendManager> qnn_backend_manager_;
+  // Track the shared ctx binary .bin file name, all _ctx.onnx point to this .bin file
+  // only the last session generate the .bin file since it contains all graphs from all sessions.
+  std::string shared_ctx_bin_file_name_;
   // Producer sessions can be in parallel
   // Consumer sessions have to be after producer sessions initialized
   std::mutex mtx_;
```

onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc (+3 −4)

```diff
@@ -48,7 +48,6 @@ namespace qnnctxgen {
       "\t [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. eg: '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
       "\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
       "\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
-      "\t [QNN only] [enable_htp_weight_sharing]: Allows common weights across graphs to be shared and stored in a single context binary. Defaults to '1' (enabled).\n"
       "\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
       "\t Defaults to '1' (QNN EP handles the graph I/O quantization and dequantization). \n"
       "\t [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary."
@@ -161,8 +160,8 @@ static bool ParseSessionConfigs(const std::string& configs_string,
         std::string str = str_stream.str();
         ORT_THROW("Wrong value for htp_graph_finalization_optimization_mode. select from: " + str);
       }
-    } else if (key == "enable_htp_fp16_precision" || key == "enable_htp_weight_sharing" ||
-               key == "offload_graph_io_quantization" || key == "enable_htp_spill_fill_buffer") {
+    } else if (key == "enable_htp_fp16_precision" || key == "offload_graph_io_quantization" ||
+               key == "enable_htp_spill_fill_buffer") {
       std::unordered_set<std::string> supported_options = {"0", "1"};
       if (supported_options.find(value) == supported_options.end()) {
         std::ostringstream str_stream;
@@ -173,7 +172,7 @@
       }
     } else {
       ORT_THROW(R"(Wrong key type entered. Choose from options: ['backend_path', 'vtcm_mb', 'htp_performance_mode',
-'htp_graph_finalization_optimization_mode', 'soc_model', 'htp_arch', 'enable_htp_fp16_precision', 'enable_htp_weight_sharing',
+'htp_graph_finalization_optimization_mode', 'soc_model', 'htp_arch', 'enable_htp_fp16_precision',
 'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer'])");
     }
```
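With `enable_htp_weight_sharing` removed from the option list, the test tool is driven only by the remaining `-i` config keys. A hypothetical invocation after this change (the `-e`/`-i` flag shapes and the `key|value` syntax are assumptions inferred from the parser above, not verified against the tool's full usage text):

```
./ep_weight_sharing_ctx_gen -e qnn -i "soc_model|60 htp_graph_finalization_optimization_mode|3" ./model1.onnx,./model2.onnx
```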