Skip to content

Commit 4ebbccf

Browse files
rsuderman and renxida
authored
Make sharktank export prefill/decode batch size separately (#1046)
Prefill and decode have different preferable batch sizes. We should export them separately. --------- Co-authored-by: Xida Ren <[email protected]>
1 parent 9641f3d commit 4ebbccf

File tree

6 files changed

+25
-12
lines changed

6 files changed

+25
-12
lines changed

app_tests/integration_tests/llm/model_management.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,8 @@ def export_model(self, weights_path: Path) -> Tuple[Path, Path]:
327327
f"--{weights_path.suffix.strip('.')}-file={weights_path}",
328328
f"--output-mlir={mlir_path}",
329329
f"--output-config={config_path}",
330-
f"--bs={bs_string}",
330+
f"--bs-prefill={bs_string}",
331+
f"--bs-decode={bs_string}",
331332
],
332333
check=True,
333334
)

docs/shortfin/llm/developer/e2e_llama8b_mi300x.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ python -m sharktank.examples.export_paged_llm_v1 \
6969
--irpa-file=$MODEL_PARAMS_PATH \
7070
--output-mlir=$MLIR_PATH \
7171
--output-config=$OUTPUT_CONFIG_PATH \
72-
--bs=$BS
72+
--bs-prefill=$BS
73+
--bs-decode=$BS
7374
```
7475

7576
## Compiling to `.vmfb`

docs/shortfin/llm/user/llama_serving.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,8 @@ python -m sharktank.examples.export_paged_llm_v1 \
159159
--gguf-file=$MODEL_PARAMS_PATH \
160160
--output-mlir=$MLIR_PATH \
161161
--output-config=$OUTPUT_CONFIG_PATH \
162-
--bs=$EXPORT_BATCH_SIZES
162+
--bs-prefill=$EXPORT_BATCH_SIZES \
163+
--bs-decode=$EXPORT_BATCH_SIZES
163164
```
164165
165166
### Compile using IREE to a `.vmfb` file

sharktank/sharktank/evaluate/README.md

+4-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ For Llama3.1 8B (FP16) model on a MI300 server:
2222
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py -k test_llama3_8B_f16 \
2323
--llama3-8b-f16-model-path=llama3.1_8b_instruct_fp16.irpa \
2424
--llama3-8b-tokenizer-path=tokenizer_config.json \
25-
--bs=4 \
25+
--bs-prefill=4 \
26+
--bs-decode=4 \
2627
--run-nightly-llama-tests
2728
```
2829

@@ -31,7 +32,8 @@ pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py -k test_llam
3132
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py -k test_llama3_8B_f16 \
3233
--llama3-8b-f16-model-path=llama3.1_8b_instruct_fp16.irpa \
3334
--llama3-8b-tokenizer-path=tokenizer_config.json \
34-
--bs=4 \
35+
--bs-prefill=4 \
36+
--bs-decode=4 \
3537
--iree-device=hip://1 \
3638
--iree-hip-target=gfx942 \
3739
--iree-hal-target-device=hip

sharktank/sharktank/examples/export_paged_llm_v1.py

+13-6
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,13 @@ def main():
4040
default="/tmp/batch_llama_v1.json",
4141
)
4242
parser.add_argument(
43-
"--bs",
43+
"--bs-prefill",
44+
help="Comma-separated batch size(s) to generate, e.g. `4` or `2,4`",
45+
type=lambda arg: [int(bs) for bs in arg.split(",")],
46+
default="4",
47+
)
48+
parser.add_argument(
49+
"--bs-decode",
4450
help="Comma-separated batch size(s) to generate, e.g. `4` or `2,4`",
4551
type=lambda arg: [int(bs) for bs in arg.split(",")],
4652
default="4",
@@ -336,13 +342,14 @@ def _(
336342
return logits
337343

338344
bsizes = []
339-
for bs in args.bs:
340-
if not args.skip_prefill:
345+
if not args.skip_prefill:
346+
for bs in args.bs_prefill:
341347
generate_batch_prefill(bs)
342-
if not args.skip_decode:
348+
if not args.skip_decode:
349+
for bs in args.bs_decode:
343350
generate_batch_decode(bs)
344-
bsizes.append(bs)
345-
config = generate_params_json(hp, bsizes, bsizes)
351+
352+
config = generate_params_json(hp, args.bs_prefill, args.bs_decode)
346353
print("GENERATED!")
347354

348355
if args.verbose:

sharktank/sharktank/utils/export_artifacts.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,8 @@ def export_to_mlir(
192192
f"--irpa-file={self.irpa_path}",
193193
f"--output-mlir={mlir_path}",
194194
f"--output-config={json_path}",
195-
f"--bs={str(self.batch_size)}",
195+
f"--bs-prefill={str(self.batch_size)}",
196+
f"--bs-decode={str(self.batch_size)}",
196197
f"--block-seq-stride={self.block_seq_stride}",
197198
f"--attention-dtype={self.attention_dtype}",
198199
f"--activation-dtype={self.activation_dtype}",

0 commit comments

Comments (0)