
Commit 00149f2

Author: Guang Yang
Commit message: Benchmark optimum-executorch
Parent: 4457cf6

6 files changed: 127 additions, 40 deletions

.ci/scripts/gather_benchmark_configs.py

Lines changed: 7 additions & 5 deletions

@@ -32,7 +32,8 @@
 BENCHMARK_CONFIGS = {
     "xplat": [
         "xnnpack_q8",
-        "hf_xnnpack_fp32",
+        "hf_xnnpack_custom_spda_kv_cache_8da4w",
+        "et_xnnpack_custom_spda_kv_cache_8da4w",
         "llama3_fb16",
         "llama3_spinquant",
         "llama3_qlora",
@@ -129,25 +130,26 @@ def generate_compatible_configs(model_name: str, target_os=None) -> List[str]:
     """
     configs = []
     if is_valid_huggingface_model_id(model_name):
+        configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w")
         if model_name.startswith("meta-llama/"):
-            # LLaMA models
+            # etLLM recipes for Llama
            repo_name = model_name.split("meta-llama/")[1]
            if "qlora" in repo_name.lower():
                configs.append("llama3_qlora")
            elif "spinquant" in repo_name.lower():
                configs.append("llama3_spinquant")
            else:
                configs.append("llama3_fb16")
+                configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
            configs.extend(
                [
                    config
                    for config in BENCHMARK_CONFIGS.get(target_os, [])
                    if config.startswith("llama")
                ]
            )
-        else:
-            # Non-LLaMA models
-            configs.append("hf_xnnpack_fp32")
+        if model_name.startswith("Qwen/Qwen3"):
+            configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
     elif model_name in MODEL_NAME_TO_MODEL:
         # ExecuTorch in-tree non-GenAI models
         configs.append("xnnpack_q8")
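
For orientation, here is a condensed, runnable sketch of the routing this diff produces. The tables and the is_valid_huggingface_model_id stub below are illustrative placeholders, not the script's real implementations; only the branching mirrors the change.

from typing import List

# Illustrative stand-ins; the real tables and validator live in the script above.
BENCHMARK_CONFIGS = {"xplat": ["llama3_fb16", "llama3_spinquant", "llama3_qlora"]}
MODEL_NAME_TO_MODEL = {"mv3": "mobilenet_v3"}

def is_valid_huggingface_model_id(model_name: str) -> bool:
    return "/" in model_name  # simplified stand-in for the real check

def generate_compatible_configs(model_name: str, target_os=None) -> List[str]:
    configs = []
    if is_valid_huggingface_model_id(model_name):
        # Every valid HF repo id now gets the optimum-executorch recipe.
        configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w")
        if model_name.startswith("meta-llama/"):
            # etLLM recipes for Llama
            repo_name = model_name.split("meta-llama/")[1]
            if "qlora" in repo_name.lower():
                configs.append("llama3_qlora")
            elif "spinquant" in repo_name.lower():
                configs.append("llama3_spinquant")
            else:
                configs.append("llama3_fb16")
                configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
            configs.extend(
                c for c in BENCHMARK_CONFIGS.get(target_os, [])
                if c.startswith("llama")
            )
        if model_name.startswith("Qwen/Qwen3"):
            configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
    elif model_name in MODEL_NAME_TO_MODEL:
        configs.append("xnnpack_q8")  # ExecuTorch in-tree non-GenAI models
    return configs

print(generate_compatible_configs("Qwen/Qwen3-0.6B"))
# ['hf_xnnpack_custom_spda_kv_cache_8da4w', 'et_xnnpack_custom_spda_kv_cache_8da4w']
print(generate_compatible_configs("google/gemma-3-1b-it"))
# ['hf_xnnpack_custom_spda_kv_cache_8da4w']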

.github/workflows/android-perf-private-device-experiment.yml

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ on:
       description: Models to be benchmarked
       required: false
       type: string
-      default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+      default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
     devices:
       description: Target devices to run benchmark
       required: false
@@ -34,7 +34,7 @@ on:
       description: Models to be benchmarked
       required: false
       type: string
-      default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+      default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
     devices:
       description: Target devices to run benchmark
       required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
+      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' }}
       devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}
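
The models input above is a single comma-separated string; the fan-out into per-model benchmark jobs presumably happens downstream in gather_benchmark_configs.py (the first file in this commit). A trivial sketch of that split, assuming plain comma parsing:

models = "mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,Qwen/Qwen3-0.6B"
for model in models.split(","):
    print(model.strip())  # each entry becomes one matrix.model value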

.github/workflows/android-perf.yml

Lines changed: 45 additions & 6 deletions

@@ -70,7 +70,7 @@ jobs:
         # Separate default values from the workflow dispatch. To ensure defaults are accessible
         # during scheduled runs and to provide flexibility for different defaults between
         # on-demand and periodic benchmarking.
-        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
+        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'llama' }}
         CRON_DEFAULT_DEVICES: samsung_galaxy_s22
       run: |
         set -eux
@@ -201,8 +201,8 @@
         HF_MODEL_REPO=${{ matrix.model }}
         OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
 
+        # Convert HF checkpoint to ET via etLLM path
         if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
-          # Llama models on Hugging Face
           if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
             # SpinQuant
             # Download prequantized chceckpoint from Hugging Face
@@ -298,12 +298,51 @@
             python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
             ls -lh "${OUT_ET_MODEL_NAME}.pte"
           fi
-        else
-          echo "Unsupported model ${{ matrix.model }}"
-          exit 1
         fi
 
-        zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+        if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+          DOWNLOADED_PATH=$(
+            bash .ci/scripts/download_hf_hub.sh \
+              --model_id "${HF_MODEL_REPO}" \
+              --files "tokenizer.json"
+          )
+          echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
+
+          # Install optimum-executorch
+          git clone https://github.com/huggingface/optimum-executorch
+          pushd optimum-executorch
+          # There is no release yet, for CI stability, always test from the same commit on main
+          git checkout 1c653dc49812fc431a22312c7295d97005d22e12
+          python install_dev.py
+          pip list
+
+          ARGS=(
+            "--model" "${HF_MODEL_REPO}"
+            "--task" "text-generation"
+            "--recipe" "xnnpack"
+            "--use_custom_sdpa"
+            "--qlinear"
+            "--qembedding"
+            "--output_dir" "."
+          )
+
+          # Add conditional arguments based on model
+          case "${HF_MODEL_REPO}" in
+            *"google/gemma-3-1b-it"*)
+              echo "--use_custom_kv_cache can not be used for HybridCache"
+              ;;
+            *)
+              ARGS+=("--use_custom_kv_cache")
+              ;;
+          esac
+
+          optimum-cli export executorch "${ARGS[@]}"
+
+          mv model.pte ${OUT_ET_MODEL_NAME}.pte
+          ls -lh "${OUT_ET_MODEL_NAME}.pte"
+        fi
+
+        zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.*
         ls -lh model.zip
         mkdir -p "${ARTIFACTS_DIR_NAME}"
         mv model.zip "${ARTIFACTS_DIR_NAME}"
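
The case statement above skips --use_custom_kv_cache for gemma-3 because that model uses a HybridCache. A Python rendering of the same argument assembly, with a hypothetical subprocess call standing in for the workflow's optimum-cli invocation:

import subprocess

def build_export_args(hf_model_repo: str) -> list:
    # Mirrors the ARGS array assembled in the workflow step above.
    args = [
        "--model", hf_model_repo,
        "--task", "text-generation",
        "--recipe", "xnnpack",
        "--use_custom_sdpa",
        "--qlinear",
        "--qembedding",
        "--output_dir", ".",
    ]
    if "google/gemma-3-1b-it" in hf_model_repo:
        print("--use_custom_kv_cache can not be used for HybridCache")
    else:
        args.append("--use_custom_kv_cache")
    return args

# Hypothetical equivalent of the workflow's CLI call:
subprocess.run(
    ["optimum-cli", "export", "executorch", *build_export_args("Qwen/Qwen3-0.6B")],
    check=True,
)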

.github/workflows/apple-perf-private-device-experiment.yml

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ on:
       description: Models to be benchmarked
       required: false
       type: string
-      default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+      default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
     devices:
       description: Target devices to run benchmark
       required: false
@@ -34,7 +34,7 @@ on:
       description: Models to be benchmarked
       required: false
       type: string
-      default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+      default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf
     devices:
       description: Target devices to run benchmark
       required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
+      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' }}
       devices: apple_iphone_15_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/apple-perf.yml

Lines changed: 45 additions & 5 deletions

@@ -70,7 +70,7 @@ jobs:
         # Separate default values from the workflow dispatch. To ensure defaults are accessible
         # during scheduled runs and to provide flexibility for different defaults between
         # on-demand and periodic benchmarking.
-        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
+        CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,google/gemma-3-1b-it,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'llama' }}
         CRON_DEFAULT_DEVICES: apple_iphone_15
       run: |
         set -eux
@@ -207,6 +207,7 @@
         HF_MODEL_REPO=${{ matrix.model }}
         OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"
 
+        # Convert HF checkpoint to ET via etLLM path
         if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
           # Llama models on Hugging Face
           if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
@@ -299,12 +300,51 @@
             ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
             ls -lh "${OUT_ET_MODEL_NAME}.pte"
           fi
-        else
-          echo "Unsupported model ${{ matrix.model }}"
-          exit 1
         fi
 
-        zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+        if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+          DOWNLOADED_PATH=$(
+            bash .ci/scripts/download_hf_hub.sh \
+              --model_id "${HF_MODEL_REPO}" \
+              --files "tokenizer.json"
+          )
+          echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
+
+          # Install optimum-executorch
+          git clone https://github.com/huggingface/optimum-executorch
+          pushd optimum-executorch
+          # There is no release yet, for CI stability, always test from the same commit on main
+          git checkout 1c653dc49812fc431a22312c7295d97005d22e12
+          python install_dev.py
+          pip list
+
+          ARGS=(
+            "--model" "${HF_MODEL_REPO}"
+            "--task" "text-generation"
+            "--recipe" "xnnpack"
+            "--use_custom_sdpa"
+            "--qlinear"
+            "--qembedding"
+            "--output_dir" "."
+          )
+
+          # Add conditional arguments based on model
+          case "${HF_MODEL_REPO}" in
+            *"google/gemma-3-1b-it"*)
+              echo "--use_custom_kv_cache can not be used for HybridCache"
+              ;;
+            *)
+              ARGS+=("--use_custom_kv_cache")
+              ;;
+          esac
+
+          optimum-cli export executorch "${ARGS[@]}"
+
+          mv model.pte ${OUT_ET_MODEL_NAME}.pte
+          ls -lh "${OUT_ET_MODEL_NAME}.pte"
+        fi
+
+        zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.*
         ls -lh model.zip
         mkdir -p "${ARTIFACTS_DIR_NAME}"
         mv model.zip "${ARTIFACTS_DIR_NAME}"
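
Both perf workflows derive the output .pte name from the Hugging Face repo id with the same awk/sed/tr pipeline shown above; an equivalent sketch in Python:

def out_et_model_name(hf_model_repo: str, config: str) -> str:
    repo = hf_model_repo.split("/")[1]     # awk -F'/' '{print $2}'
    repo = repo.replace("_", "-").lower()  # sed 's/_/-/g' | tr '[:upper:]' '[:lower:]'
    return f"{repo}_{config}"

print(out_et_model_name("HuggingFaceTB/SmolLM2-135M", "hf_xnnpack_custom_spda_kv_cache_8da4w"))
# smollm2-135m_hf_xnnpack_custom_spda_kv_cache_8da4w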

.github/workflows/trunk.yml

Lines changed: 24 additions & 18 deletions

@@ -570,34 +570,40 @@ jobs:
         git clone https://github.com/huggingface/optimum-executorch
         pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout da80c9e35b3db5c7eea8731b7d660482fb4870a8
+        git checkout 1c653dc49812fc431a22312c7295d97005d22e12
         pip install .[tests]
+        pip install transformers==4.52.4
         popd
-
-        if [ "${{ matrix.hf_model_id }}" == "google/gemma-3-1b-it" ]; then
-          # Fixes for gemma-3 is not available in the released version
-          git clone https://github.com/huggingface/transformers.git
-          pushd transformers
-          git checkout a57274466f7f72efaa2662d1738cdaf28ae8071f
-          pip install -e .
-          popd
-        fi
         pip list
         echo "::endgroup::"
 
         echo "::group::Export to ExecuTorch"
         # Pass matrix variable as environment variable
         export MODEL_ID="${{ matrix.hf_model_id }}"
-        export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_8da4w"
+        export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_kv_cache_8da4w"
         pushd optimum-executorch
 
-        optimum-cli export executorch \
-          --model ${MODEL_ID} \
-          --task text-generation \
-          --recipe xnnpack \
-          --use_custom_sdpa \
-          --output_dir ${OUTPUT_DIR} \
-          --qlinear
+        ARGS=(
+          "--model" "${MODEL_ID}"
+          "--task" "text-generation"
+          "--recipe" "xnnpack"
+          "--use_custom_sdpa"
+          "--qlinear"
+          "--qembedding"
+          "--output_dir" "."
+        )
+
+        # Add conditional arguments based on model
+        case "${MODEL_ID}" in
+          *"google/gemma-3-1b-it"*)
+            echo "--use_custom_kv_cache can not be used for HybridCache"
+            ;;
+          *)
+            ARGS+=("--use_custom_kv_cache")
+            ;;
+        esac
+
+        optimum-cli export executorch "${ARGS[@]}"
 
         ls -FlAGhp ${OUTPUT_DIR}
         popd
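
trunk.yml now pins both an optimum-executorch commit and transformers==4.52.4. A small, hypothetical guard one could run before the export step to confirm the pin took effect (not part of the commit):

from importlib.metadata import version

pinned = "4.52.4"  # the version pinned in trunk.yml above
installed = version("transformers")
assert installed == pinned, f"expected transformers=={pinned}, got {installed}"
print(f"transformers {installed} matches the pin in trunk.yml")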
