From d08b0e5dc4ecf4af364121933fbe0aa61031cc66 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Tue, 17 Dec 2024 09:39:10 -0800 Subject: [PATCH 1/3] [shortfin] Use custom manylinux dockerfile in build_linux_package.sh. (#709) Package builds started failing last night when the `latest` upstream manylinux dockerfile switched to gcc 14: https://github.com/nod-ai/shark-ai/actions/runs/12371699664/job/34528374484 ``` Running command Building wheel for shortfin (pyproject.toml) -- The C compiler identification is GNU 14.2.1 -- The CXX compiler identification is GNU 14.2.1 ... [325/365] Building CXX object src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o FAILED: src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o /opt/rh/gcc-toolset-14/root/usr/bin/c++ -DCPUINFO_SUPPORTED_PLATFORM=1 -DSPDLOG_COMPILED_LIB -DSPDLOG_FMT_EXTERNAL -DSPDLOG_SHARED_LIB -D_SHORTFIN_BUILDING_DYLIB -Dspdlog_EXPORTS -I/home/runner/work/shark-ai/shark-ai/c/shortfin/src -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/src -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/spdlog-src/include -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/fmt-src/include -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-build -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/runtime/src -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-build/runtime/src -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/third_party/cpuinfo/include -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-build/runtime/src/iree/base/internal/flatcc -isystem 
/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/third_party/flatcc/include -isystem /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-build/runtime/src/iree/schemas -O3 -DNDEBUG -std=gnu++20 -flto=auto -fno-fat-lto-objects -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Werror -pthread -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/third_party/flatcc/include/ -I/home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/shortfin_iree-src/third_party/flatcc/include/flatcc/reflection/ -MD -MT src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o -MF src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o.d -o src/shortfin/local/CMakeFiles/shortfin_local.dylib.objects.dir/device.cc.o -c /home/runner/work/shark-ai/shark-ai/c/shortfin/src/shortfin/local/device.cc In file included from /home/runner/work/shark-ai/shark-ai/c/shortfin/src/shortfin/local/device.cc:10: /home/runner/work/shark-ai/shark-ai/c/shortfin/build/cmake/default/_deps/fmt-src/include/fmt/ranges.h:211:49: error: self-comparison always evaluates to true [-Werror=tautological-compare] 211 | integer_sequence) -> std::true_type; | ~~~^~~~~ cc1plus: all warnings being treated as errors ``` This switches to our own downstream dockerfile, defined here: https://github.com/nod-ai/base-docker-images/blob/main/dockerfiles/manylinux_x86_64.Dockerfile, which is pinned to an older version of the base image (and thus gcc). 
Tested successfully here: https://github.com/ScottTodd/shark-ai/actions/runs/12378199850 --- shortfin/build_tools/build_linux_package.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/shortfin/build_tools/build_linux_package.sh b/shortfin/build_tools/build_linux_package.sh index 91b944e51..db2463987 100755 --- a/shortfin/build_tools/build_linux_package.sh +++ b/shortfin/build_tools/build_linux_package.sh @@ -37,13 +37,18 @@ REPO_ROOT="$(cd "$THIS_DIR"/../../ && pwd)" SCRIPT_NAME="$(basename $0)" ARCH="$(uname -m)" -# Note: we can switch to https://github.com/nod-ai/base-docker-images as needed for extra deps. -MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-quay.io/pypa/manylinux_2_28_${ARCH}:latest}" PYTHON_VERSIONS="${OVERRIDE_PYTHON_VERSIONS:-cp311-cp311 cp312-cp312 cp313-cp313}" OUTPUT_DIR="${OUTPUT_DIR:-${THIS_DIR}/wheelhouse}" CACHE_DIR="${CACHE_DIR:-}" SHORTFIN_ENABLE_TRACING="${SHORTFIN_ENABLE_TRACING:-ON}" +if [[ "${ARCH}" == "x86_64" ]]; then + MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-ghcr.io/nod-ai/manylinux_x86_64@sha256:4acf83343706d1e37252d6001ded3c97a73bc38620580f855b4e65e35ddc5681}" +else + # TODO: publish a multi-platform manylinux image and include more deps in all platforms (rust, ccache, etc.) + MANYLINUX_DOCKER_IMAGE="${MANYLINUX_DOCKER_IMAGE:-quay.io/pypa/manylinux_2_28_${ARCH}:latest}" +fi + function run_on_host() { echo "Running on host" echo "Launching docker image ${MANYLINUX_DOCKER_IMAGE}" From aab71618d6d0926720d68d7b9f2c5810fc1e6f86 Mon Sep 17 00:00:00 2001 From: Marius Brehler Date: Tue, 17 Dec 2024 21:03:26 +0100 Subject: [PATCH 2/3] [shortfin] Bump fmt and spdlog and test with GCC 14 (#711) Bumps libfmt to 11.0.2 to mitigate a build error occurring with GCC 14. Bumping spdlog to 1.15.0 (which bundles libfmt 11.0.2) accordingly to keep the libs in sync. Furthermore expands testing to build with GCC 14. 
--- .github/workflows/ci-libshortfin.yml | 19 ++++++++++++------- shortfin/CMakeLists.txt | 8 ++++++-- shortfin/src/shortfin/array/array.cc | 1 + shortfin/src/shortfin/local/device.cc | 1 + shortfin/src/shortfin/local/fiber.cc | 1 + shortfin/src/shortfin/local/program.cc | 1 + shortfin/src/shortfin/local/system.cc | 1 + shortfin/src/shortfin/local/systems/amdgpu.cc | 2 ++ .../src/shortfin/local/systems/factory.cc | 2 ++ shortfin/src/shortfin/support/config.cc | 1 + 10 files changed, 28 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-libshortfin.yml b/.github/workflows/ci-libshortfin.yml index 0e0982803..543a6abe6 100644 --- a/.github/workflows/ci-libshortfin.yml +++ b/.github/workflows/ci-libshortfin.yml @@ -38,7 +38,7 @@ jobs: strategy: fail-fast: false matrix: - name: ["Ubuntu (Clang)(full)", "Ubuntu (Clang)(host-only)", "Ubuntu (GCC)", "Windows (MSVC)"] + name: ["Ubuntu (Clang)(full)", "Ubuntu (Clang)(host-only)", "Windows (MSVC)"] python-version: ["3.10", "3.11", "3.12"] include: - name: Ubuntu (Clang)(full) @@ -53,16 +53,21 @@ jobs: cmake-options: -DCMAKE_C_COMPILER=clang-18 -DCMAKE_CXX_COMPILER=clang++-18 -DCMAKE_LINKER_TYPE=LLD -DSHORTFIN_HAVE_AMDGPU=OFF -DSHORTFIN_BUILD_STATIC=ON -DSHORTFIN_BUILD_DYNAMIC=ON additional-packages: clang lld - - name: Ubuntu (GCC) + - name: Ubuntu (GCC 13) runs-on: ubuntu-24.04 + # Only test with GCC 13 and Python 3.12 + python-version: "3.12" + cmake-options: + -DCMAKE_C_COMPILER=gcc-13 -DCMAKE_CXX_COMPILER=g++-13 + - name: Ubuntu (GCC 14) + runs-on: ubuntu-24.04 + # Only test with GCC 14 and Python 3.12 + python-version: "3.12" + cmake-options: + -DCMAKE_C_COMPILER=gcc-14 -DCMAKE_CXX_COMPILER=g++-14 - name: Windows (MSVC) runs-on: windows-2022 exclude: - # Only test Python 3.12 with GCC - - name: Ubuntu (GCC) - python-version: "3.10" - - name: Ubuntu (GCC) - python-version: "3.11" # TODO: Include additional Python versions for Windows after build got fixed - name: Windows (MSVC) python-version: "3.10" diff 
--git a/shortfin/CMakeLists.txt b/shortfin/CMakeLists.txt index 2c79d5b41..bd46d84f9 100644 --- a/shortfin/CMakeLists.txt +++ b/shortfin/CMakeLists.txt @@ -39,6 +39,10 @@ if(NOT WIN32) set(CMAKE_POSITION_INDEPENDENT_CODE ON) endif() +# For unicode support Windows libfmt requires compiling with /utf-8. +add_compile_options("$<$:/utf-8>") +add_compile_options("$<$:/utf-8>") + # Pins set(SHORTFIN_IREE_GIT_TAG "iree-3.1.0rc20241204") @@ -140,7 +144,7 @@ if(SHORTFIN_BUNDLE_DEPS) FetchContent_Declare( fmt GIT_REPOSITORY https://github.com/fmtlib/fmt.git - GIT_TAG e69e5f977d458f2650bb346dadf2ad30c5320281 # 10.2.1 (sync with spdlog) + GIT_TAG 0c9fce2ffefecfdce794e1859584e25877b7b592 # 11.0.2 (sync with spdlog) ) ## spdlog @@ -149,7 +153,7 @@ if(SHORTFIN_BUNDLE_DEPS) FetchContent_Declare( spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG 2d4acf8cc321d7783d8f2e22e17a794c6d0e9450 # v1.14.1 + GIT_TAG 8e5613379f5140fefb0b60412fbf1f5406e7c7f8 # v1.15.0 ) ## xtl: required for xtensor diff --git a/shortfin/src/shortfin/array/array.cc b/shortfin/src/shortfin/array/array.cc index 882e4ef39..c0eca52d0 100644 --- a/shortfin/src/shortfin/array/array.cc +++ b/shortfin/src/shortfin/array/array.cc @@ -10,6 +10,7 @@ #include "fmt/core.h" #include "fmt/ranges.h" +#include "fmt/xchar.h" #include "shortfin/array/xtensor_bridge.h" #include "shortfin/support/logging.h" diff --git a/shortfin/src/shortfin/local/device.cc b/shortfin/src/shortfin/local/device.cc index 3afd2b8ad..1bed3a419 100644 --- a/shortfin/src/shortfin/local/device.cc +++ b/shortfin/src/shortfin/local/device.cc @@ -8,6 +8,7 @@ #include #include +#include namespace shortfin::local { diff --git a/shortfin/src/shortfin/local/fiber.cc b/shortfin/src/shortfin/local/fiber.cc index 8ad9f2960..2c03672fd 100644 --- a/shortfin/src/shortfin/local/fiber.cc +++ b/shortfin/src/shortfin/local/fiber.cc @@ -8,6 +8,7 @@ #include #include +#include #include "shortfin/local/system.h" #include "shortfin/support/logging.h" diff 
--git a/shortfin/src/shortfin/local/program.cc b/shortfin/src/shortfin/local/program.cc index 6ab1f47ae..71452da3e 100644 --- a/shortfin/src/shortfin/local/program.cc +++ b/shortfin/src/shortfin/local/program.cc @@ -8,6 +8,7 @@ #include "fmt/core.h" #include "fmt/std.h" +#include "fmt/xchar.h" #include "iree/io/formats/parser_registry.h" #include "iree/modules/hal/module.h" #include "iree/modules/io/parameters/module.h" diff --git a/shortfin/src/shortfin/local/system.cc b/shortfin/src/shortfin/local/system.cc index ef31bb001..00fcf4c65 100644 --- a/shortfin/src/shortfin/local/system.cc +++ b/shortfin/src/shortfin/local/system.cc @@ -7,6 +7,7 @@ #include "shortfin/local/system.h" #include +#include #include "iree/hal/utils/allocators.h" #include "shortfin/local/fiber.h" diff --git a/shortfin/src/shortfin/local/systems/amdgpu.cc b/shortfin/src/shortfin/local/systems/amdgpu.cc index cecedd1a0..262d2ec62 100644 --- a/shortfin/src/shortfin/local/systems/amdgpu.cc +++ b/shortfin/src/shortfin/local/systems/amdgpu.cc @@ -6,6 +6,8 @@ #include "shortfin/local/systems/amdgpu.h" +#include + #include "shortfin/support/logging.h" #include "shortfin/support/sysconfig.h" diff --git a/shortfin/src/shortfin/local/systems/factory.cc b/shortfin/src/shortfin/local/systems/factory.cc index bf5b788dc..c5ee036cd 100644 --- a/shortfin/src/shortfin/local/systems/factory.cc +++ b/shortfin/src/shortfin/local/systems/factory.cc @@ -4,6 +4,8 @@ // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include + #include "shortfin/local/system.h" #include "shortfin/support/logging.h" diff --git a/shortfin/src/shortfin/support/config.cc b/shortfin/src/shortfin/support/config.cc index 7de820d1c..d188ddb16 100644 --- a/shortfin/src/shortfin/support/config.cc +++ b/shortfin/src/shortfin/support/config.cc @@ -12,6 +12,7 @@ #include #include "fmt/format.h" +#include "fmt/xchar.h" #include "shortfin/support/logging.h" namespace shortfin { From c4a592ac8bcb2202a554ab1a4d311fdf5ddf28eb Mon Sep 17 00:00:00 2001 From: Archana Ramalingam <98564406+archana-ramalingam@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:35:11 -0800 Subject: [PATCH 3/3] [sharktank] Update block_seq_stride for perplexity CI tests (#707) - Update `block_seq_stride` for perplexity CI tests - Update default value of `block_seq_stride` from `16` to `32` in `export_paged_llm_v1.py` --- .github/workflows/ci_eval.yaml | 4 +- .github/workflows/ci_eval_short.yaml | 2 +- app_tests/integration_tests/llm/utils.py | 1 + .../sharktank/evaluate/perplexity_iree.py | 47 +++++++++++-------- .../sharktank/examples/export_paged_llm_v1.py | 2 +- .../sharktank/layers/configs/llm_configs.py | 2 +- sharktank/sharktank/utils/export_artifacts.py | 5 +- 7 files changed, 36 insertions(+), 27 deletions(-) diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml index 3b85cb652..a71698774 100644 --- a/.github/workflows/ci_eval.yaml +++ b/.github/workflows/ci_eval.yaml @@ -24,7 +24,7 @@ jobs: test_perplexity_iree: if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }} timeout-minutes: 1000 - name: "Perplexity-IREE" + name: "IREE Perplexity" strategy: matrix: version: [3.11] @@ -83,7 +83,7 @@ jobs: test_perplexity_torch: if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }} timeout-minutes: 1000 - name: "Perplexity-Torch" + name: "Torch Perplexity" strategy: matrix: version: [3.11] diff --git 
a/.github/workflows/ci_eval_short.yaml b/.github/workflows/ci_eval_short.yaml index edaaee966..d5f8f5682 100644 --- a/.github/workflows/ci_eval_short.yaml +++ b/.github/workflows/ci_eval_short.yaml @@ -23,7 +23,7 @@ concurrency: jobs: test_perplexity_iree: - name: "Llama3.1 8B FP16" + name: "IREE Perplexity" strategy: matrix: version: [3.11] diff --git a/app_tests/integration_tests/llm/utils.py b/app_tests/integration_tests/llm/utils.py index 80b5b3c09..dbbdee10d 100644 --- a/app_tests/integration_tests/llm/utils.py +++ b/app_tests/integration_tests/llm/utils.py @@ -90,6 +90,7 @@ def export_paged_llm_v1(mlir_path, config_path, model_path, batch_sizes): "python", "-m", "sharktank.examples.export_paged_llm_v1", + "--block-seq-stride=16", f"--{model_path.suffix.strip('.')}-file={model_path}", f"--output-mlir={mlir_path}", f"--output-config={config_path}", diff --git a/sharktank/sharktank/evaluate/perplexity_iree.py b/sharktank/sharktank/evaluate/perplexity_iree.py index 6060eb91b..c47726f0e 100644 --- a/sharktank/sharktank/evaluate/perplexity_iree.py +++ b/sharktank/sharktank/evaluate/perplexity_iree.py @@ -68,12 +68,14 @@ def __init__( kv_cache_type, tensor_parallelism_size, attention_kernel, + block_seq_stride, ): self.torch_device = torch_device self.iree_device = iree_device self.iree_hip_target = iree_hip_target self.iree_hal_target_backends = iree_hal_target_backends self.kv_cache_type = kv_cache_type + self.block_seq_stride = block_seq_stride self.activation_dtype = torch.float16 self.attention_dtype = torch.float16 self.tensor_parallelism_size = tensor_parallelism_size @@ -136,6 +138,7 @@ def compile_model(self, weight_path_str): iree_hal_target_backends=self.iree_hal_target_backends, attention_kernel=self.attention_kernel, tensor_parallelism_size=self.tensor_parallelism_size, + block_seq_stride=self.block_seq_stride, ) vmfb_path = export_artifacts.get_artifacts() return vmfb_path @@ -145,7 +148,7 @@ def load_model(self, weight_path, tokenizer, vmfb_path): 
self.config = LlamaModelConfig( hp=configs.LlamaHParams.from_gguf_props(weight_path.properties), - block_seq_stride=16, + block_seq_stride=self.block_seq_stride, kv_cache_type=self.kv_cache_type, device=self.torch_device, activation_dtype=self.activation_dtype, @@ -394,6 +397,7 @@ def run_perplexity( tensor_parallelism_size, attention_kernel, num_prompts, + block_seq_stride, ): start = time.time() perplexity = Perplexity( @@ -404,6 +408,7 @@ def run_perplexity( kv_cache_type=kv_cache_type, tensor_parallelism_size=tensor_parallelism_size, attention_kernel=attention_kernel, + block_seq_stride=block_seq_stride, ) perplexity.get_prompts(num_prompts=num_prompts) @@ -425,8 +430,18 @@ def run_perplexity( def main(argv): parser = cli.create_parser() - parser.add_argument("--kv-cache-type", default="paged", help="KV cache type") - parser.add_argument("--torch-device", help="Torch device (or default)") + parser.add_argument( + "--attention-kernel", + type=str, + default="decomposed", + choices=["decomposed", "torch_sdpa"], + ) + parser.add_argument( + "--block-seq-stride", + help="Block sequence stride for paged KV cache, must divide evenly into the context length", + type=int, + default=32, + ) parser.add_argument("--iree-device", help="List an IREE device (e.g., 'hip://0')") parser.add_argument( "--iree-hip-target", @@ -440,11 +455,12 @@ def main(argv): default="rocm", help="Specify the iree-hal target backends (e.g., rocm)", ) + parser.add_argument("--kv-cache-type", default="paged", help="KV cache type") parser.add_argument( - "--attention-kernel", - type=str, - default="decomposed", - choices=["decomposed", "torch_sdpa"], + "--num-prompts", + type=int, + default=100, + help="Number of prompts for perplexity test (1 to 100)", ) parser.add_argument( "--tensor-parallelism-size", @@ -452,36 +468,29 @@ def main(argv): default=1, help="Number of devices for tensor parallel sharding", ) - parser.add_argument( - "--num-prompts", - type=int, - default=100, - help="Number of 
prompts for perplexity test", - ) + parser.add_argument("--torch-device", help="Torch device (or default)") cli.add_tokenizer_options(parser) cli.add_input_dataset_options(parser) args = cli.parse(parser, args=argv) torch_device = torch.device(args.torch_device) if args.torch_device else None - iree_device = args.iree_device - kv_cache_type = args.kv_cache_type weight_path = cli.get_input_dataset(args) tokenizer = cli.get_tokenizer(args) - weight_path_str = str(args.irpa_file) ppl = run_perplexity( weight_path=weight_path, - weight_path_str=weight_path_str, + weight_path_str=str(args.irpa_file), tokenizer=tokenizer, torch_device=torch_device, - iree_device=iree_device, + iree_device=args.iree_device, iree_hip_target=args.iree_hip_target, iree_hal_target_backends=args.iree_hal_target_backends, - kv_cache_type=kv_cache_type, + kv_cache_type=args.kv_cache_type, tensor_parallelism_size=args.tensor_parallelism_size, attention_kernel=args.attention_kernel, num_prompts=args.num_prompts, + block_seq_stride=args.block_seq_stride, ) logger.info(f"\n{json.dumps(ppl, indent=2)}") diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py index ad297bcce..056d8a98e 100644 --- a/sharktank/sharktank/examples/export_paged_llm_v1.py +++ b/sharktank/sharktank/examples/export_paged_llm_v1.py @@ -49,7 +49,7 @@ def main(): "--block-seq-stride", help="Block sequence stride for paged KV cache, must divide evenly into the context length", type=int, - default="16", + default=32, ) parser.add_argument( "--verbose", diff --git a/sharktank/sharktank/layers/configs/llm_configs.py b/sharktank/sharktank/layers/configs/llm_configs.py index 88f5c344c..6cf79402e 100644 --- a/sharktank/sharktank/layers/configs/llm_configs.py +++ b/sharktank/sharktank/layers/configs/llm_configs.py @@ -144,7 +144,7 @@ class LlamaModelConfig: # Block sequence stride for a paged KV cache. This must divide evenly # into the context length. 
- block_seq_stride: int = 16 + block_seq_stride: int = 32 # Either "paged" or "direct". kv_cache_type: str = "paged" diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py index 0bf252525..75cdbab7a 100644 --- a/sharktank/sharktank/utils/export_artifacts.py +++ b/sharktank/sharktank/utils/export_artifacts.py @@ -92,7 +92,7 @@ def __init__( iree_hal_target_backends: str, attention_kernel: str, tensor_parallelism_size: int, - block_seq_stride: Optional[int] = None, + block_seq_stride: int, ): self.sharktank_dir = str( Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent @@ -180,14 +180,13 @@ def export_to_mlir( f"--output-mlir={mlir_path}", f"--output-config={json_path}", f"--bs={str(self.batch_size)}", + f"--block-seq-stride={self.block_seq_stride}", ] if skip_decode: export_args.append("--skip-decode") if self.attention_kernel in ["decomposed", "torch"]: export_args.append("--attention-kernel") export_args.append(self.attention_kernel) - if self.block_seq_stride: - export_args.append(f"--block-seq-stride={self.block_seq_stride}") cwd = self.sharktank_dir cmd = subprocess.list2cmdline(export_args)