From 2cb77c1e902b53ccca71746b8860e9fd5cdf9383 Mon Sep 17 00:00:00 2001 From: Winston Kuo Date: Mon, 22 Jun 2026 16:39:27 +0800 Subject: [PATCH] Qualcomm AI Engine Direct - Enable more HF LLM Model --- backends/qualcomm/tests/test_qnn_delegate.py | 8 ++++---- .../{qwen2_5/qwen2_5.py => hf_causal_lm.py} | 8 ++++---- examples/qualcomm/oss_scripts/llm_utils/README.md | 2 +- .../llm_utils/qnn_decoder_model_manager.py | 11 ++++++++--- 4 files changed, 17 insertions(+), 12 deletions(-) rename examples/qualcomm/oss_scripts/{qwen2_5/qwen2_5.py => hf_causal_lm.py} (96%) diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 17ff3f845ea..20195669220 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -8581,18 +8581,18 @@ def test_attention_sink(self): msg["wiki_ppl"], self.llm_specs[model_name].wikitext_ppl ) - def test_qwen2_5(self): - # This is not testing static llm flow. + def test_hf_causal_lm(self): + # This is the Hugging Face transformers flow, not the static llm flow. if not self.required_envs([]): self.skipTest("missing required envs") prompt = "My favourite condiment is " cmds = [ "python", - f"{self.executorch_root}/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py", + f"{self.executorch_root}/examples/qualcomm/oss_scripts/hf_causal_lm.py", "--prompt", prompt, "--decoder_model", - "qwen2.5_0.5B", + "qwen2_5-0_5b", "--ptq", "16a8w", "--enable_spinquant_r3", diff --git a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py b/examples/qualcomm/oss_scripts/hf_causal_lm.py similarity index 96% rename from examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py rename to examples/qualcomm/oss_scripts/hf_causal_lm.py index 7876a5b54b3..2ed7e49d2a8 100644 --- a/examples/qualcomm/oss_scripts/qwen2_5/qwen2_5.py +++ b/examples/qualcomm/oss_scripts/hf_causal_lm.py @@ -33,7 +33,7 @@ logging.basicConfig(level=logging.INFO, format=FORMAT) logging.getLogger().setLevel(logging.INFO) -PTE_FILENAME = "qwen_qnn_q16" +PTE_FILENAME = "hf_causal_lm_qnn" def compile(args: argparse.Namespace, qnn_config: QnnConfig): # noqa: C901 @@ -195,7 +195,7 @@ def main(args): "-a", "--artifact", help="path for storing generated artifacts by this example.", - default="qwen2_5", + default="hf_causal_lm", type=str, ) @@ -216,8 +216,8 @@ def main(args): parser.add_argument( "--decoder_model", - choices=["qwen2.5_0.5B", "qwen2.5_0.5B_instruct", "qwen2.5_1.5B_instruct"], - help="The Qwen model to export. Current available options are: [qwen2.5_0.5B, qwen2.5_0.5B_instruct, qwen2.5_1.5B_instruct]", + choices=list(HUGGING_FACE_REPO_IDS.keys()), + help=f"The Hugging Face decoder model to export. Available options are: {list(HUGGING_FACE_REPO_IDS.keys())}", required=True, ) diff --git a/examples/qualcomm/oss_scripts/llm_utils/README.md b/examples/qualcomm/oss_scripts/llm_utils/README.md index 84c23c294fa..4cf16448485 100644 --- a/examples/qualcomm/oss_scripts/llm_utils/README.md +++ b/examples/qualcomm/oss_scripts/llm_utils/README.md @@ -1,7 +1,7 @@ ## Tutorial to run [eval_decoder_model_qnn.py](./eval_decoder_model_qnn.py) This script, [`eval_decoder_model_qnn.py`](./eval_decoder_model_qnn.py), is designed to evaluate large language models (LLMs) from transformers that have been compiled into ExecuTorch Portable Executable (PTE) format for execution on Qualcomm devices. It leverages the `lm-evaluation-harness` library to perform various NLP evaluation tasks. -> ⚠️ **Important:** Note that this script runs PTE files generated specifically for Hugging Face Transformers, such as [qwen2_5.py](../qwen2_5/qwen2_5.py), rather than [the static LLaMA version](../llama/llama.py). +> ⚠️ **Important:** Note that this script runs PTE files generated specifically for Hugging Face Transformers, such as [hf_causal_lm.py](../hf_causal_lm.py), rather than [the static LLaMA version](../llama/llama.py). ### Features: diff --git a/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py b/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py index 89277bcaac8..c9d00e9c463 100644 --- a/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py +++ b/examples/qualcomm/oss_scripts/llm_utils/qnn_decoder_model_manager.py @@ -44,9 +44,13 @@ logging.basicConfig(level=logging.INFO, format=FORMAT) HUGGING_FACE_REPO_IDS = { - "qwen2.5_0.5B": "Qwen/Qwen2.5-0.5B", - "qwen2.5_1.5B_instruct": "Qwen/Qwen2.5-1.5B-Instruct", - "qwen2.5_0.5B_instruct": "Qwen/Qwen2.5-0.5B-Instruct", + "llama3_2-1b": "NousResearch/Llama-3.2-1B", + "qwen2_5-0_5b": "Qwen/Qwen2.5-0.5B", + "qwen2_5-1_5b_instruct": "Qwen/Qwen2.5-1.5B-Instruct", + "qwen2_5-0_5b_instruct": "Qwen/Qwen2.5-0.5B-Instruct", + "qwen3-0_6b": "Qwen/Qwen3-0.6B", + "smollm2_135m": "HuggingFaceTB/SmolLM2-135M", + "gemma-2b" : "weqweasdas/RM-Gemma-2B", } @@ -64,6 +68,7 @@ def get_qnn_llm_edge_manager(model_name, max_seq_len=128, enable_spinquant_r3=Tr config.ar_len = 1 # kv mode config.max_batch_size = batch_size config.enable_spinquant_r3 = enable_spinquant_r3 + config.use_cache = True # Some config has head_dim provided that is different from equation below(e.g., qwen3) if not hasattr(config, "head_dim"):