Commit 925745b

Add benchmark for Flash Attention Decode (#363)

This PR adds the benchmark for Flash Attention Decode. It is stacked on top of #362.

Co-authored-by: Alejandro Acosta <[email protected]>

Parent: 347dc33
Commit: 925745b

9 files changed: +1166 -2 lines changed

benchmarks/README.md
Lines changed: 3 additions & 2 deletions

@@ -44,8 +44,9 @@ ninja benchmarks_gemm_sycl
 # target = intel_gpu_pvc | intel_gpu_bmg_g21
 cmake .. -GNinja -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=$target -DCUTLASS_ENABLE_BENCHMARKS=ON -DCUTLASS_ENABLE_TESTS=ON

-ninja cutlass_benchmarks_flash_attention_prefill
+ninja cutlass_benchmarks_flash_attention
 ./benchmarks/flash_attention/flash_attention_prefill/cutlass_benchmarks_flash_attention_prefill_xe --config_file=../benchmarks/device/pvc/input_files/input_flash_attention_prefill.in
+./benchmarks/flash_attention/flash_attention_decode/cutlass_benchmarks_flash_attention_decode_xe --config_file=../benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_kvcache.in
 ```

 ## Compiling and Running Flash Attention v2 benchmarks with default configurations with Intel Xe backend

@@ -54,7 +55,7 @@ ninja cutlass_benchmarks_flash_attention_prefill
 # target = intel_gpu_pvc | intel_gpu_bmg_g21
 cmake .. -GNinja -DCUTLASS_ENABLE_SYCL=ON -DDPCPP_SYCL_TARGET=$target -DCUTLASS_ENABLE_BENCHMARKS=ON -DCUTLASS_ENABLE_TESTS=ON

-ninja benchmarks_flash_attention_prefill
+ninja benchmarks_flash_attention
 ```

 ## Compiling and Running all benchmarks with default configurations with Intel Xe backend
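For reference, the same decode executable can also be pointed at the second configuration file added by this commit (the path comes from the decode CMakeLists.txt below). A minimal sketch, assuming the same out-of-tree build directory as the README commands above:

```
# Run the decode benchmark with the no-kv-cache configuration
# (assumes the build-directory layout used by the README commands above).
./benchmarks/flash_attention/flash_attention_decode/cutlass_benchmarks_flash_attention_decode_xe \
    --config_file=../benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_nokvcache.in
```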
benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_kvcache.in (new file)
Lines changed: 31 additions & 0 deletions

# Flash attention decode (with kv-cache)

PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8

PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8

#FP16 benchmarks
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8

PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=1024 --num_heads_kv=8
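Each line in these input files appears to pair a registered benchmark instance name (encoding, by its parts, the input/accumulator data types, the RCR layout, head size 128, causal or non-causal masking, and fixed or variable sequence lengths) with the runtime flags for one problem shape. As an illustration only, a hypothetical extra configuration sweeping a longer context could follow the same pattern (the seq_len_kv=4096 value is an assumption, not part of this commit):

```
# Hypothetical line for illustration only; not part of this commit.
PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=4096 --seq_len_kv_cache=1024 --num_heads_kv=8
```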
benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_nokvcache.in (new file)
Lines changed: 30 additions & 0 deletions

# Flash attention decode (without kv-cache)

PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8

PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_no_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_no_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_no_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_no_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_no_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeBF16BF16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_no_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8

#FP16 benchmarks
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_FixedLen --bm_name=attention_decode_no_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8

PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_no_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_no_kv_cache --batch=1 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=2048 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_no_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_no_kv_cache --batch=8 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_NonCausal_VarLen --bm_name=attention_decode_no_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
PvcFMHADecodeFP16FP16FP32_RCR_h128_Causal_VarLen --bm_name=attention_decode_no_kv_cache --batch=16 --seq_len_qo=1 --num_heads_q=32 --head_size_qk=128 --head_size_vo=128 --seq_len_kv=1024 --seq_len_kv_cache=0 --num_heads_kv=8
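These configurations mirror the kv-cache file above; the only differences are the --bm_name label and --seq_len_kv_cache=0, which by its name selects the decode path without a KV cache. Side by side for one shape (flags elided with ... are identical):

```
... --bm_name=attention_decode_kv_cache    ... --seq_len_kv=2048 --seq_len_kv_cache=1024 ...
... --bm_name=attention_decode_no_kv_cache ... --seq_len_kv=2048 --seq_len_kv_cache=0    ...
```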

benchmarks/flash_attention/CMakeLists.txt
Lines changed: 1 addition & 0 deletions

@@ -28,3 +28,4 @@

 cutlass_benchmark_add_suite(cutlass_benchmarks_flash_attention)
 add_subdirectory(flash_attention_prefill)
+add_subdirectory(flash_attention_decode)
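With the decode subdirectory registered, building the umbrella suite builds both prefill and decode, which is what the README change above switches to. A sketch, assuming the suite names registered in these CMakeLists are themselves build targets (as the README's existing ninja commands suggest):

```
ninja cutlass_benchmarks_flash_attention          # builds the prefill and decode suites together
ninja cutlass_benchmarks_flash_attention_decode   # assumption: builds the decode suite alone
```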
benchmarks/flash_attention/flash_attention_decode/CMakeLists.txt (new file)
Lines changed: 45 additions & 0 deletions

# Copyright (c) 2024 - 2025 Codeplay Software Ltd. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from
#    this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

set(CUTLASS_APPLICATIONS_DIR ${CMAKE_SOURCE_DIR}/applications)

# Pass these configuration files for the CI
set(CONFIG_FILE_KV_CACHE --config_file=${CMAKE_SOURCE_DIR}/benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_kvcache.in)
set(CONFIG_FILE_NO_KV_CACHE --config_file=${CMAKE_SOURCE_DIR}/benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_nokvcache.in)

cutlass_benchmark_add_suite(cutlass_benchmarks_flash_attention_decode
                            SUPERSUITE cutlass_benchmarks_flash_attention)

cutlass_benchmark_add_executable(
  cutlass_benchmarks_flash_attention_decode_xe
  main.cpp
  TEST_COMMAND_OPTIONS
  CONFIG_FILE_NO_KV_CACHE
  CONFIG_FILE_KV_CACHE
  SUITE cutlass_benchmarks_flash_attention_decode
)
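Reading this file: cutlass_benchmark_add_suite registers the new decode suite under the existing flash-attention SUPERSUITE, and TEST_COMMAND_OPTIONS names the two variables whose --config_file values become per-configuration test invocations, so CI presumably runs the executable once per input file, roughly as follows (paths expanded from the set() calls above; <source> stands for CMAKE_SOURCE_DIR):

```
# Sketch of the two CI invocations implied by TEST_COMMAND_OPTIONS; exact wrapping is up to the harness.
cutlass_benchmarks_flash_attention_decode_xe --config_file=<source>/benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_nokvcache.in
cutlass_benchmarks_flash_attention_decode_xe --config_file=<source>/benchmarks/device/bmg/input_files/input_sglang_flash_attention_decode_kvcache.in
```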
