From 865545198fd16acdde128da4e9a1693a4af68ec1 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Wed, 11 Dec 2024 16:22:10 +0000
Subject: [PATCH 1/3] Update user docs for running `llm server`, Add back
 `sentencepiece` as requirement for `sharktank` to enable
 `export_paged_llm_v1`

---
 docs/shortfin/llm/user/e2e_llama8b_mi300x.md | 81 +++++++-------------
 sharktank/requirements.txt                   |  2 +-
 2 files changed, 28 insertions(+), 55 deletions(-)

diff --git a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md
index 4a8423bc8..94c076980 100644
--- a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md
+++ b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md
@@ -22,30 +22,37 @@ python -m venv --prompt shark-ai .venv
 source .venv/bin/activate
 ```
 
-### Install `shark-ai`
+### Install `shortfin`
 
-You can install either the `latest stable` version of `shark-ai`
-or the `nightly` version:
+You can install either the `latest stable` version of shortfin by installing
+`shark-ai` or the `nightly` version directly:
 
 #### Stable
 
 ```bash
-pip install shark-ai
+pip install shark-ai[apps]
 ```
 
 #### Nightly
 
 ```bash
-pip install sharktank -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels
-pip install shortfin -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels
+pip install shortfin[apps] --pre -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels
 ```
 
-#### Install dataclasses-json
+<!-- TODO: Remove when sharktank added to `shark-ai[apps]` -->
+### Install `sharktank`
 
-<!-- TODO: This should be included in release: -->
+Install the `nightly` version of sharktank:
 
 ```bash
-pip install dataclasses-json
+pip install sharktank --pre -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels
+```
+
+<!-- TODO: Remove once `sentencepiece` added to nightly `sharktank` -->
+### Install `sentencepiece`
+
+```bash
+pip install sentencepiece
 ```
 
 ### Define a directory for export files
@@ -78,8 +85,8 @@ This example uses the `llama8b_f16.gguf` and `tokenizer.json` files
 that were downloaded in the previous step.
 
 ```bash
-export MODEL_PARAMS_PATH=$EXPORT_DIR/llama3.1-8b/llama8b_f16.gguf
-export TOKENIZER_PATH=$EXPORT_DIR/llama3.1-8b/tokenizer.json
+export MODEL_PARAMS_PATH=$EXPORT_DIR/meta-llama-3.1-8b-instruct.f16.gguf
+export TOKENIZER_PATH=$EXPORT_DIR/tokenizer.json
 ```
 
 #### General env vars
@@ -91,8 +98,6 @@ The following env vars can be copy + pasted directly:
 export MLIR_PATH=$EXPORT_DIR/model.mlir
 # Path to export config.json file
 export OUTPUT_CONFIG_PATH=$EXPORT_DIR/config.json
-# Path to export edited_config.json file
-export EDITED_CONFIG_PATH=$EXPORT_DIR/edited_config.json
 # Path to export model.vmfb file
 export VMFB_PATH=$EXPORT_DIR/model.vmfb
 # Batch size for kvcache
@@ -108,7 +113,7 @@ to export our model to `.mlir` format.
 
 ```bash
 python -m sharktank.examples.export_paged_llm_v1 \
-  --irpa-file=$MODEL_PARAMS_PATH \
+  --gguf-file=$MODEL_PARAMS_PATH \
   --output-mlir=$MLIR_PATH \
   --output-config=$OUTPUT_CONFIG_PATH \
   --bs=$BS
@@ -137,37 +142,6 @@ iree-compile $MLIR_PATH \
  -o $VMFB_PATH
 ```
 
-## Write an edited config
-
-We need to write a config for our model with a slightly edited structure
-to run with shortfin. This will work for the example in our docs.
-You may need to modify some of the parameters for a specific model.
-
-### Write edited config
-
-```bash
-cat > $EDITED_CONFIG_PATH << EOF
-{
-    "module_name": "module",
-    "module_abi_version": 1,
-    "max_seq_len": 131072,
-    "attn_head_count": 8,
-    "attn_head_dim": 128,
-    "prefill_batch_sizes": [
-        $BS
-    ],
-    "decode_batch_sizes": [
-        $BS
-    ],
-    "transformer_block_count": 32,
-    "paged_kv_cache": {
-        "block_seq_stride": 16,
-        "device_block_count": 256
-    }
-}
-EOF
-```
-
 ## Running the `shortfin` LLM server
 
 We should now have all of the files that we need to run the shortfin LLM server.
@@ -178,15 +152,14 @@ Verify that you have the following in your specified directory ($EXPORT_DIR):
 ls $EXPORT_DIR
 ```
 
-- edited_config.json
+- config.json
+- meta-llama-3.1-8b-instruct.f16.gguf
+- model.mlir
 - model.vmfb
+- tokenizer_config.json
+- tokenizer.json
 
-### Launch server:
-
-<!-- #### Set the target device
-
-TODO: Add instructions on targeting different devices,
-when `--device=hip://$DEVICE` is supported -->
+### Launch server
 
 #### Run the shortfin server
 
@@ -209,7 +182,7 @@ Run the following command to launch the Shortfin LLM Server in the background:
 ```bash
 python -m shortfin_apps.llm.server \
    --tokenizer_json=$TOKENIZER_PATH \
-   --model_config=$EDITED_CONFIG_PATH \
+   --model_config=$OUTPUT_CONFIG_PATH \
    --vmfb=$VMFB_PATH \
    --parameters=$MODEL_PARAMS_PATH \
    --device=hip > shortfin_llm_server.log 2>&1 &
@@ -252,7 +225,7 @@ port = 8000 # Change if running on a different port
 generate_url = f"http://localhost:{port}/generate"
 
 def generation_request():
-    payload = {"text": "What is the capital of the United States?", "sampling_params": {"max_completion_tokens": 50}}
+    payload = {"text": "Name the capital of the United States.", "sampling_params": {"max_completion_tokens": 50}}
     try:
         resp = requests.post(generate_url, json=payload)
         resp.raise_for_status()  # Raises an HTTPError for bad responses
diff --git a/sharktank/requirements.txt b/sharktank/requirements.txt
index e7181284a..d2879740c 100644
--- a/sharktank/requirements.txt
+++ b/sharktank/requirements.txt
@@ -5,7 +5,7 @@ gguf==0.10.0
 numpy<2.0
 
 # Needed for newer gguf versions (TODO: remove when gguf package includes this)
-# sentencepiece>=0.1.98,<=0.2.0
+sentencepiece>=0.1.98,<=0.2.0
 
 # Model deps.
 huggingface-hub==0.22.2

From e7159d62f020876207f747568dde9c3ef5759b95 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Thu, 12 Dec 2024 17:53:19 +0000
Subject: [PATCH 2/3] Upgrade `gguf` to `0.11.0`

---
 sharktank/requirements.txt | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/sharktank/requirements.txt b/sharktank/requirements.txt
index d2879740c..6ec1ea1d9 100644
--- a/sharktank/requirements.txt
+++ b/sharktank/requirements.txt
@@ -1,12 +1,9 @@
 iree-turbine
 
 # Runtime deps.
-gguf==0.10.0
+gguf==0.11.0
 numpy<2.0
 
-# Needed for newer gguf versions (TODO: remove when gguf package includes this)
-sentencepiece>=0.1.98,<=0.2.0
-
 # Model deps.
 huggingface-hub==0.22.2
 transformers==4.40.0

From 011f2732e4e29f279417709128aec7654ad243f2 Mon Sep 17 00:00:00 2001
From: Stephen Baione <stbaione@amd.com>
Date: Thu, 12 Dec 2024 21:47:04 +0000
Subject: [PATCH 3/3] Remove `pip install sentencepiece`, Update `stable` and
 `nightly` installation instructions, Only set lower-bound for `gguf`

---
 docs/shortfin/llm/user/e2e_llama8b_mi300x.md | 31 +++++++-------------
 sharktank/requirements.txt                   |  2 +-
 2 files changed, 11 insertions(+), 22 deletions(-)

diff --git a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md
index 94c076980..313a8086c 100644
--- a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md
+++ b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md
@@ -22,38 +22,27 @@ python -m venv --prompt shark-ai .venv
 source .venv/bin/activate
 ```
 
-### Install `shortfin`
+## Install stable shark-ai packages
 
-You can install either the `latest stable` version of shortfin by installing
-`shark-ai` or the `nightly` version directly:
-
-#### Stable
+<!-- TODO: Add `sharktank` to `shark-ai` meta package -->
 
 ```bash
-pip install shark-ai[apps]
+pip install "shark-ai[apps]" sharktank
 ```
 
-#### Nightly
-
-```bash
-pip install shortfin[apps] --pre -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels
-```
+### Nightly packages
 
-<!-- TODO: Remove when sharktank added to `shark-ai[apps]` -->
-### Install `sharktank`
+To install nightly packages:
 
-Install the `nightly` version of sharktank:
+<!-- TODO: Add `sharktank` to `shark-ai` meta package -->
 
 ```bash
-pip install sharktank --pre -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels
+pip install "shark-ai[apps]" sharktank \
+    --pre --find-links https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels
 ```
 
-<!-- TODO: Remove once `sentencepiece` added to nightly `sharktank` -->
-### Install `sentencepiece`
-
-```bash
-pip install sentencepiece
-```
+For more details, see the
+[nightly release instructions](https://github.com/nod-ai/shark-ai/blob/main/docs/nightly_releases.md).
 
 ### Define a directory for export files
 
diff --git a/sharktank/requirements.txt b/sharktank/requirements.txt
index 6ec1ea1d9..90cbedded 100644
--- a/sharktank/requirements.txt
+++ b/sharktank/requirements.txt
@@ -1,7 +1,7 @@
 iree-turbine
 
 # Runtime deps.
-gguf==0.11.0
+gguf>=0.11.0
 numpy<2.0
 
 # Model deps.