From 2cb77c1e902b53ccca71746b8860e9fd5cdf9383 Mon Sep 17 00:00:00 2001
From: Winston Kuo <winskuo@qti.qualcomm.com>
Date: Mon, 22 Jun 2026 16:39:27 +0800
Subject: [PATCH] Qualcomm AI Engine Direct - Enable more HF LLM Models

---
 backends/qualcomm/tests/test_qnn_delegate.py          |  8 ++++----
 .../{qwen2_5/qwen2_5.py => hf_causal_lm.py}           |  8 ++++----
 examples/qualcomm/oss_scripts/llm_utils/README.md     |  2 +-
 .../llm_utils/qnn_decoder_model_manager.py            | 11 ++++++++---
 4 files changed, 17 insertions(+), 12 deletions(-)
 rename examples/qualcomm/oss_scripts/{qwen2_5/qwen2_5.py => hf_causal_lm.py} (96%)

diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 17ff3f845ea..20195669220 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -8581,18 +8581,18 @@ def test_attention_sink(self):
                         msg["wiki_ppl"], self.llm_specs[model_name].wikitext_ppl
                     )
 
-    def test_qwen2_5(self):
-        # This is not testing static llm flow.
+    def test_hf_causal_lm(self):
+        # This is the Hugging Face transformers flow, not the static llm flow.
         if not self.required_envs([]):
             self.skipTest("missing required envs")
         prompt = "My favourite condiment is "
         cmds = [
             "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/hf_causal_lm.py",
             "--prompt",
             prompt,
             "--decoder_model",
-            "qwen2.5_0.5B",
+            "qwen2_5-0_5b",
             "--ptq",
             "16a8w",
             "--enable_spinquant_r3",
diff --git a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py b/examples/qualcomm/oss_scripts/hf_causal_lm.py
similarity index 96%
rename from examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py
rename to examples/qualcomm/oss_scripts/hf_causal_lm.py
index 7876a5b54b3..2ed7e49d2a8 100644
--- a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py
+++ b/examples/qualcomm/oss_scripts/hf_causal_lm.py
@@ -33,7 +33,7 @@
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 logging.getLogger().setLevel(logging.INFO)
 
-PTE_FILENAME = "qwen_qnn_q16"
+PTE_FILENAME = "hf_causal_lm_qnn"
 
 
 def compile(args: argparse.Namespace, qnn_config: QnnConfig):  # noqa: C901
@@ -195,7 +195,7 @@ def main(args):
         "-a",
         "--artifact",
         help="path for storing generated artifacts by this example.",
-        default="qwen2_5",
+        default="hf_causal_lm",
         type=str,
     )
 
@@ -216,8 +216,8 @@ def main(args):
 
     parser.add_argument(
         "--decoder_model",
-        choices=["qwen2.5_0.5B", "qwen2.5_0.5B_instruct", "qwen2.5_1.5B_instruct"],
-        help="The Qwen model to export. Current available options are: [qwen2.5_0.5B, qwen2.5_0.5B_instruct, qwen2.5_1.5B_instruct]",
+        choices=list(HUGGING_FACE_REPO_IDS.keys()),
+        help=f"The Hugging Face decoder model to export. Available options are: {list(HUGGING_FACE_REPO_IDS.keys())}",
         required=True,
     )
 
diff --git a/examples/qualcomm/oss_scripts/llm_utils/README.md b/examples/qualcomm/oss_scripts/llm_utils/README.md
index 84c23c294fa..4cf16448485 100644
--- a/examples/qualcomm/oss_scripts/llm_utils/README.md
+++ b/examples/qualcomm/oss_scripts/llm_utils/README.md
@@ -1,7 +1,7 @@
 ## Tutorial to run [eval_decoder_model_qnn.py](./eval_decoder_model_qnn.py)
 This script, [`eval_decoder_model_qnn.py`](./eval_decoder_model_qnn.py), is designed to evaluate large language models (LLMs) from transformers that have been compiled into ExecuTorch Portable Executable (PTE) format for execution on Qualcomm devices. It leverages the `lm-evaluation-harness` library to perform various NLP evaluation tasks.
 
-> ⚠️ **Important:** Note that this script runs PTE files generated specifically for Hugging Face Transformers, such as [qwen2_5.py](../qwen2_5/qwen2_5.py), rather than [the static LLaMA version](../llama/llama.py).
+> ⚠️ **Important:** Note that this script runs PTE files generated specifically for Hugging Face Transformers, such as [hf_causal_lm.py](../hf_causal_lm.py), rather than [the static LLaMA version](../llama/llama.py).
 
 ### Features:
 
diff --git a/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py b/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py
index 89277bcaac8..c9d00e9c463 100644
--- a/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py
+++ b/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py
@@ -44,9 +44,13 @@
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 
 HUGGING_FACE_REPO_IDS = {
-    "qwen2.5_0.5B": "Qwen/Qwen2.5-0.5B",
-    "qwen2.5_1.5B_instruct": "Qwen/Qwen2.5-1.5B-Instruct",
-    "qwen2.5_0.5B_instruct": "Qwen/Qwen2.5-0.5B-Instruct",
+    "llama3_2-1b": "NousResearch/Llama-3.2-1B",
+    "qwen2_5-0_5b": "Qwen/Qwen2.5-0.5B",
+    "qwen2_5-1_5b_instruct": "Qwen/Qwen2.5-1.5B-Instruct",
+    "qwen2_5-0_5b_instruct": "Qwen/Qwen2.5-0.5B-Instruct",
+    "qwen3-0_6b": "Qwen/Qwen3-0.6B",
+    "smollm2_135m": "HuggingFaceTB/SmolLM2-135M",
+    "gemma-2b": "weqweasdas/RM-Gemma-2B",
 }
 
 
@@ -64,6 +68,7 @@ def get_qnn_llm_edge_manager(model_name, max_seq_len=128, enable_spinquant_r3=Tr
     config.ar_len = 1  # kv mode
     config.max_batch_size = batch_size
     config.enable_spinquant_r3 = enable_spinquant_r3
+    config.use_cache = True
 
     # Some config has head_dim provided that is different from equation below(e.g., qwen3)
     if not hasattr(config, "head_dim"):