pytorch · winskuo-quic · Jun 22, 2026
@@ -8581,18 +8581,18 @@ def test_attention_sink(self):
                         msg["wiki_ppl"], self.llm_specs[model_name].wikitext_ppl
                     )
 
-    def test_qwen2_5(self):
-        # This is not testing static llm flow.
+    def test_hf_causal_lm(self):
+        # This is the Hugging Face transformers flow, not the static llm flow.
         if not self.required_envs([]):
             self.skipTest("missing required envs")
         prompt = "My favourite condiment is "
         cmds = [
             "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/hf_causal_lm.py",
             "--prompt",
             prompt,
             "--decoder_model",
-            "qwen2.5_0.5B",
+            "qwen2_5-0_5b",
             "--ptq",
             "16a8w",
             "--enable_spinquant_r3",

@@ -33,7 +33,7 @@
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 logging.getLogger().setLevel(logging.INFO)
 
-PTE_FILENAME = "qwen_qnn_q16"
+PTE_FILENAME = "hf_causal_lm_qnn"
 
 
 def compile(args: argparse.Namespace, qnn_config: QnnConfig):  # noqa: C901
@@ -195,7 +195,7 @@ def main(args):
         "-a",
         "--artifact",
         help="path for storing generated artifacts by this example.",
-        default="qwen2_5",
+        default="hf_causal_lm",
         type=str,
     )
 
@@ -216,8 +216,8 @@ def main(args):
 
     parser.add_argument(
         "--decoder_model",
-        choices=["qwen2.5_0.5B", "qwen2.5_0.5B_instruct", "qwen2.5_1.5B_instruct"],
-        help="The Qwen model to export. Current available options are: [qwen2.5_0.5B, qwen2.5_0.5B_instruct, qwen2.5_1.5B_instruct]",
+        choices=list(HUGGING_FACE_REPO_IDS.keys()),
+        help=f"The Hugging Face decoder model to export. Available options are: {list(HUGGING_FACE_REPO_IDS.keys())}",
         required=True,
     )
 

@@ -1,7 +1,7 @@
 ## Tutorial to run [eval_decoder_model_qnn.py](./eval_decoder_model_qnn.py)
 This script, [`eval_decoder_model_qnn.py`](./eval_decoder_model_qnn.py), is designed to evaluate large language models (LLMs) from transformers that have been compiled into ExecuTorch Portable Executable (PTE) format for execution on Qualcomm devices. It leverages the `lm-evaluation-harness` library to perform various NLP evaluation tasks.
 
-> ⚠️ **Important:** Note that this script runs PTE files generated specifically for Hugging Face Transformers, such as [qwen2_5.py](../qwen2_5/qwen2_5.py), rather than [the static LLaMA version](../llama/llama.py).
+> ⚠️ **Important:** This script runs PTE files exported from Hugging Face Transformers models by scripts such as [hf_causal_lm.py](../hf_causal_lm.py); it does not run PTE files produced by [the static LLaMA flow](../llama/llama.py).
 
 ### Features:
 

@@ -1,3 +1,3 @@
 # Copyright (c) Qualcomm Innovation Center, Inc.
 # All rights reserved
 #
@@ -44,9 +44,13 @@
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 
 HUGGING_FACE_REPO_IDS = {
-    "qwen2.5_0.5B": "Qwen/Qwen2.5-0.5B",
-    "qwen2.5_1.5B_instruct": "Qwen/Qwen2.5-1.5B-Instruct",
-    "qwen2.5_0.5B_instruct": "Qwen/Qwen2.5-0.5B-Instruct",
+    "llama3_2-1b": "NousResearch/Llama-3.2-1B",
+    "qwen2_5-0_5b": "Qwen/Qwen2.5-0.5B",
+    "qwen2_5-1_5b_instruct": "Qwen/Qwen2.5-1.5B-Instruct",
+    "qwen2_5-0_5b_instruct": "Qwen/Qwen2.5-0.5B-Instruct",
+    "qwen3-0_6b": "Qwen/Qwen3-0.6B",
+    "smollm2_135m": "HuggingFaceTB/SmolLM2-135M",
+    "gemma-2b": "weqweasdas/RM-Gemma-2B",
 }
 
 
@@ -64,6 +68,7 @@
     config.ar_len = 1  # kv mode
     config.max_batch_size = batch_size
     config.enable_spinquant_r3 = enable_spinquant_r3
+    config.use_cache = True
 
     # Some config has head_dim provided that is different from equation below(e.g., qwen3)
     if not hasattr(config, "head_dim"):