From 18b4141297b05cd53d25962b88fce67b8aa8f87c Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Mon, 16 Dec 2024 12:44:08 -0600 Subject: [PATCH 01/18] Cleanup sglang doc for user flow, Expand examples for targeting shortfin from sglang --- .../shortfin_with_sglang_frontend_language.md | 223 ++++++++---------- 1 file changed, 95 insertions(+), 128 deletions(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index b63861a56..858fda118 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -24,21 +24,11 @@ For this tutorial, you will need to meet the following prerequisites: - You can check out [pyenv](https://github.com/pyenv/pyenv) as a good tool to be able to manage multiple versions of python on the same system. -- A running `shortfin` LLM server as described [below](#installstart-shortfin-llm-server) +- A running `shortfin` LLM server. Directions on launching the llm server can be found [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. -### Hardware - -- This tutorial is designed to run on an [AMD MI300X GPU](https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html) - -## Install/Start `shortfin` LLM server - -Follow the steps [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) -to export a model with `sharktank` and start a `shortfin` LLM server -with that model. - ## Install sglang ### Install sglang inside of virtual environment @@ -48,6 +38,8 @@ We can use pip to install it in the same virtual environment that we used to start our Shortfin LLM Server. 
```bash +python -m venv --prompt shark-ai .venv +source .venv/bin/activate pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python" ``` @@ -56,8 +48,9 @@ pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python" You can verify the installation/setup through the following examples: - [Multi-Turn Q&A Example](#multi-turn-qa-example) +- [Streaming Example](#streaming-example) - [Fork Example](#fork-example) -- [Benchmark Shortfin](#bench-mark-shortfin-w-sglang-bench_serving-script) +- [Multi-Turn Q&A Batching Example](#multi-turn-qa-batch-example) ## Multi-Turn Q&A example @@ -79,16 +72,16 @@ import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://localhost:8000", ) # Change base_url if running at different address +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80", ) # Change base_url if running at different address sgl.set_default_backend(backend) @sgl.function def multi_turn_question(s, question_1, question_2): s += sgl.user(question_1) - s += sgl.assistant(sgl.gen("answer_1", max_tokens=256)) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=50)) s += sgl.user(question_2) - s += sgl.assistant(sgl.gen("answer_2", max_tokens=256)) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=50)) state = multi_turn_question.run(question_1="Name the capital city of the USA.", question_2="The Smithsonian is in this location.") @@ -96,40 +89,56 @@ for m in state.messages(): print(m["role"], m["content"]) ``` -### Shortfin example output +## Streaming Example -You should see an output similar to this: +We can stream our request for a more responsive feel. Let's invoke a `streaming` Q&A from our server: -```text -========== single ========== +```python +import sglang as sgl +from sglang.lang.chat_template import get_chat_template -user : Name the capital city of the USA -assistant : The capital city of the United States of America is Washington, D.C. (short for District of Columbia). -user : The Smithsonian is in this location. -assistant : The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. -``` +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at a different address -## Fork example +sgl.set_default_backend(backend) -Now that we have sglang installed, we can run an example to show a `fork` -flow with the SGLang [Frontend Language](https://sgl-project.github.io/frontend/frontend.html): +@sgl.function +def multi_turn_question(s, question_1, question_2): + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=50)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=50)) -### Open python interpreter +question_1 = "Name the capital city of the USA." +question_2 = "The Smithsonian is in this location." 
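+
+# Passing `stream=True` to `run` below asks the backend to stream tokens back as
+# they are generated; `state.text_iter()` then yields the completion incrementally.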
-```bash -python +# Run the multi-turn question function with streaming enabled +state = multi_turn_question.run( + question_1=question_1, + question_2=question_2, + stream=True, +) + +# Collect messages from the streamed output +messages = "" + +for chunk in state.text_iter(): + messages += chunk + +print(messages) ``` -### Run example -You can copy and paste the following example into your interpreter: +## Fork example + +We can also send different pieces of the same prompt in parallel using the `fork` +flow with the SGLang [Frontend Language](https://sgl-project.github.io/frontend/frontend.html): ```python import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://localhost:8000") # Change base_url if running at different address +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at different address sgl.set_default_backend(backend) @@ -142,7 +151,7 @@ def tip_suggestion(s): forks = s.fork(2) for i, f in enumerate(forks): f += f"Now, expand tip {i+1} into a paragraph:\n" - f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n") + f += sgl.gen(f"detailed_tip", max_tokens=50, stop="\n\n") s += "Tip 1:" + forks[0]["detailed_tip"] + "\n" s += "Tip 2:" + forks[1]["detailed_tip"] + "\n" s += "In summary" + sgl.gen("summary") @@ -152,103 +161,61 @@ state = tip_suggestion.run() print(state.text()) ``` -### Shortfin example output - -You should see an output similar to this: - -```text -Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise. - -Tip 1:A balanced diet is important for maintaining good health. It should -include a variety of foods from all the major food groups, such as fruits, -vegetables, grains, proteins, and dairy. Eating a balanced diet can help -prevent chronic diseases such as heart disease, diabetes, and obesity. - -Now, expand tip 2 into a paragraph: -Regular exercise is also important for maintaining good health. It can help -improve cardiovascular health, strengthen muscles and bones, and reduce the -risk of chronic diseases. Exercise can also help improve mental health by -reducing stress and anxiety. It is recommended that adults get at least 150 -minutes of moderate-intensity exercise or 75 minutes of vigorous-intensity -exercise per week. - -Now, combine the two paragraphs into a single paragraph: -A balanced diet and regular exercise are both important for maintaining good -health. A balanced diet should include a variety of foods from all the major -food groups, such as fruits, vegetables, grains, proteins, and dairy. -Eating a balanced diet can help prevent chronic diseases such as heart disease, -diabetes, and obesity. Regular exercise is also important for maintaining good -health. It can help improve cardiovascular health, strengthen muscles and bones, -and reduce the risk of chronic diseases. Exercise can also help improve mental -health by reducing stress and anxiety. It is recommended that - -Tip 2:Regular exercise is important for maintaining a healthy body and mind. -It can help improve cardiovascular health, strengthen muscles and bones, -and reduce the risk of chronic diseases such as diabetes and heart disease. -Additionally, exercise has been shown to improve mood, reduce stress, -and increase overall well-being. 
It is recommended that adults engage in -at least 150 minutes of moderate-intensity aerobic activity or 75 minutes of -vigorous-intensity aerobic activity per week, as well as strength training -exercises at least two days per week. - -In summary, a balanced diet and regular exercise are both essential for -maintaining good health. A balanced diet should include a variety of foods from -all the major food groups, while regular exercise can help improve -cardiovascular health, strengthen muscles and bones, reduce the risk of -chronic diseases, and improve mental health. It is recommended that adults -engage in at least 150 minutes of moderate-intensity aerobic activity or -75 minutes of vigorous-intensity aerobic activity per week, -as well as strength training exercises at least two days per week. -``` +## Multi-Turn Q&A Batch Example + +With **Shortfin** + SGLang, we can also easily send requests as a batch. +Let's now invoke a `batched` Q&A flow with the SGLang [Batching](https://sgl-project.github.io/frontend/frontend.html#batching): + +```python +import sglang as sgl +from sglang.lang.chat_template import get_chat_template -## Benchmark shortfin w/ sglang `bench_serving` script +# Initialize the backend with the specified chat template and base URL +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at a different address -We can obtain benchmarking metrics using the `bench_serving` script -provided by SGLang: +# Set the default backend for sglang +sgl.set_default_backend(backend) -**NOTE: Change `--base-url` if running at a different address** +@sgl.function +def multi_turn_question(s, question_1, question_2): + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=50)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=50)) + +# Define the questions for the first and second sets +question_1_1 = "Name the capital city of the USA." +question_1_2 = "The Smithsonian is in this location." +question_2_1 = "Name the largest city in the USA." +question_2_2 = "The Empire State Building is in this location." 
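+
+# Each dictionary below supplies the keyword arguments for one call to
+# `multi_turn_question`; `run_batch` sends the whole list to the server as a single batch.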
+ +# Run the multi-turn question function in batch mode +states = multi_turn_question.run_batch( + [ + { + "question_1": question_1_1, + "question_2": question_1_2, + }, + { + "question_1": question_2_1, + "question_2": question_2_2, + }, + ] +) + +# Extract responses from the states +first_qa = states[0] +second_qa = states[1] + +first_qa_messages = first_qa.messages() +second_qa_messages = second_qa.messages() + +# Print messages from the first QA session +for m in first_qa_messages: + print(m["role"], m["content"]) -```bash -python -m sglang.bench_serving --backend shortfin --num-prompt 10 --base-url http://localhost:8000 --tokenizer /path/to/tokenizer/dir --request-rate 1 -``` +# Print messages from the second QA session +for m in second_qa_messages: + print(m["role"], m["content"]) -There are some more metrics captured, but the most relevant are the following: - -- E2E Latency -- TTFT (Time to First Token) -- TPOT (Time per Output Token) -- ITL (Inter-Token Latency) -- Request Throughput -- Benchmark Duration - -When complete, you should see an output similar to this: - -```text -============ Serving Benchmark Result ============ -Backend: shortfin -Traffic request rate: 1.0 -Successful requests: 10 -Benchmark duration (s): 427.91 -Total input tokens: 1960 -Total generated tokens: 2774 -Total generated tokens (retokenized): 63 -Request throughput (req/s): 0.02 -Input token throughput (tok/s): 4.58 -Output token throughput (tok/s): 6.48 -----------------End-to-End Latency---------------- -Mean E2E Latency (ms): 416268.77 -Median E2E Latency (ms): 417159.14 ----------------Time to First Token---------------- -Mean TTFT (ms): 292404.29 -Median TTFT (ms): 365989.01 -P99 TTFT (ms): 367325.63 ------Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 1359.41 -Median TPOT (ms): 163.96 -P99 TPOT (ms): 6316.12 ----------------Inter-token Latency---------------- -Mean ITL (ms): 2238.99 -Median ITL (ms): 958.75 -P99 ITL (ms): 2719.50 -================================================== ``` From 84259208c0cc1a34dcd5b5ad87ce24c20eab982a Mon Sep 17 00:00:00 2001 From: saienduri Date: Tue, 17 Dec 2024 10:47:43 -0600 Subject: [PATCH 02/18] add k8s instructions --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 42 ++++++++++++ .../shortfin_with_sglang_frontend_language.md | 3 +- .../llm/k8s/llama-app-deployment.yaml | 64 +++++++++++++++++++ 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 docs/shortfin/llm/user/e2e_llama8b_k8s.md create mode 100644 shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md new file mode 100644 index 000000000..99ac3561e --- /dev/null +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -0,0 +1,42 @@ +# LLama 8b GPU instructions on Kubernetes + +## Setup + +We will use an example with `llama_8b_f16` in order to describe the +process of exporting a model and deploying four instances of a shortfin llm server +behind a load balancer on MI300X GPU. + +### Pre-Requisites + +- Kubernetes cluster available to use +- kubectl installed on system and configured for cluster of interest + - To install kubectl, please check out [kubectl install](https://kubernetes.io/docs/tasks/tools/#kubectl) + and make sure to set the `KUBECONFIG` environment variable to point to your kube config file to authorize + connection to the cluster. 
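+
+For example, to confirm that `kubectl` is pointed at the right cluster before moving on, you can run a quick check like the one below (the kubeconfig path is only an illustrative placeholder):
+
+```
+export KUBECONFIG=/path/to/your/kubeconfig
+kubectl cluster-info
+kubectl get nodes
+```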
+ +### Deploy shortfin llama app service + +Please edit the following file to fetch the correct artifacts and serve the intended configuration of the llama3 model for your use case [here](https://github.com/nod-ai/shark-ai/tree/main/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml). + +To deploy llama app: + +``` +kubectl apply -f llama-app-deployment.yaml +``` + +To retrieve external IP for targetting the llama app load balancer: + +``` +kubectl get service shark-llama-app-service +``` + +Now, you can use the external IP for sglang integration or just sending image generation requests. + +### Delete shortfin llama app service + +After done using, make sure to delete: + +``` +kubectl delete deployment shark-llama-app-deployment +kubectl delete service shark-llama-app-service +``` diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index 858fda118..832ec9c3d 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -24,7 +24,8 @@ For this tutorial, you will need to meet the following prerequisites: - You can check out [pyenv](https://github.com/pyenv/pyenv) as a good tool to be able to manage multiple versions of python on the same system. -- A running `shortfin` LLM server. Directions on launching the llm server can be found [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) +- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) and for launching +on a kubernetes cluster, please look [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. diff --git a/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml new file mode 100644 index 000000000..b2d86efd2 --- /dev/null +++ b/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -0,0 +1,64 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + # please change name to include your amd username to reduce conflict + name: shark-llama-app-deployment +spec: + replicas: 4 # number of server instances + selector: + matchLabels: + app: shark-llama-app # please change name to include your amd username + template: + metadata: + labels: + app: shark-llama-app # please change name to include your amd username + spec: + containers: + - name: shark-llama-app-container + image: rocm/dev-ubuntu-22.04:6.3 + command: ["/bin/bash", "-c"] + # update to your artifacts and change cli flags for instantiation of server to match your intended llama configuration + args: + - | + rocminfo && + echo "ROCm is working!" 
&& + sudo apt update && + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash && + sudo apt install git -y && + sudo apt install python3.11 python3.11-dev python3.11-venv -y && + sudo apt-get install wget -y && + python3.11 -m venv shark_venv && source shark_venv/bin/activate && + mkdir shark_artifacts && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/config.json -O shark_artifacts/config.json && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/meta-llama-3.1-8b-instruct.f16.gguf -O shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/model.vmfb -O shark_artifacts/model.vmfb && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/tokenizer_config.json -O shark_artifacts/tokenizer_config.json && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/tokenizer.json -O shark_artifacts/tokenizer.json && + pip install --pre shortfin[apps] -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels && + pip install pandas && + python -m shortfin_apps.llm.server --tokenizer_json=shark_artifacts/tokenizer.json --model_config=shark_artifacts/config.json --vmfb=shark_artifacts/model.vmfb --parameters=shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf --device_ids 0 --device=hip; + resources: + # change number of gpus required here based on your llama configuration + requests: + amd.com/gpu: 1 + limits: + amd.com/gpu: 1 + restartPolicy: Always + +--- + +apiVersion: v1 +kind: Service +metadata: + name: shark-llama-app-service # please change name to include your amd username + # specific to OCI in how we expose the load balancer (only AMD network available currently) + annotations: + service.beta.kubernetes.io/oci-load-balancer-internal: "true" +spec: + selector: + app: shark-llama-app # please change name to include your amd username + ports: + - protocol: TCP + port: 80 # external port + targetPort: 8000 # port the container exposes + type: LoadBalancer From a32a2eb466ab6693e1f1c91d2857b19420455da6 Mon Sep 17 00:00:00 2001 From: saienduri Date: Tue, 17 Dec 2024 18:40:32 -0600 Subject: [PATCH 03/18] update doc wording --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 2 +- shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md index 99ac3561e..58f19c215 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -16,7 +16,7 @@ behind a load balancer on MI300X GPU. ### Deploy shortfin llama app service -Please edit the following file to fetch the correct artifacts and serve the intended configuration of the llama3 model for your use case [here](https://github.com/nod-ai/shark-ai/tree/main/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml). +Save [llama-app-deployment.yaml](https://github.com/nod-ai/shark-ai/tree/main/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. 
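+
+Once you have edited the file, you can optionally validate it before creating any resources with a client-side dry run (this only checks the manifest locally and does not deploy anything):
+
+```
+kubectl apply -f llama-app-deployment.yaml --dry-run=client
+```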
To deploy llama app: diff --git a/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml index b2d86efd2..57a9995a9 100644 --- a/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml +++ b/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -20,8 +20,6 @@ spec: # update to your artifacts and change cli flags for instantiation of server to match your intended llama configuration args: - | - rocminfo && - echo "ROCm is working!" && sudo apt update && curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash && sudo apt install git -y && From bfe1058433af0351834e7e72b8021a2f4434c3a5 Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Wed, 18 Dec 2024 17:57:56 -0600 Subject: [PATCH 04/18] Make shortfin server heading for more visible doc links, Add iree-base-runtime, iree-base-compiler and iree-turbine to nightly install instructions --- docs/shortfin/llm/user/e2e_llama8b_mi300x.md | 5 +++++ .../llm/user/shortfin_with_sglang_frontend_language.md | 3 +++ 2 files changed, 8 insertions(+) diff --git a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md index 313a8086c..87ceb84dd 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md +++ b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md @@ -39,6 +39,11 @@ To install nightly packages: ```bash pip install shark-ai[apps] sharktank \ --pre --find-links https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels +pip install -f https://iree.dev/pip-release-links.html --pre --upgrade \ + iree-base-compiler \ + iree-base-runtime \ + iree-turbine \ + "numpy<2.0" ``` See also the diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index 832ec9c3d..0dbbeb56b 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -24,6 +24,9 @@ For this tutorial, you will need to meet the following prerequisites: - You can check out [pyenv](https://github.com/pyenv/pyenv) as a good tool to be able to manage multiple versions of python on the same system. + +### Shortfin LLM Server + - A running `shortfin` LLM server. Directions on launching the llm server on one system can be found [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) and for launching on a kubernetes cluster, please look [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions From b320fcb026f84adeed0c1639d0c45a8bc55ac51a Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Fri, 20 Dec 2024 07:52:30 -0600 Subject: [PATCH 05/18] Update link to k8 docs --- .../shortfin/llm/user/shortfin_with_sglang_frontend_language.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index e1f1de3f2..fe986ca9b 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -28,7 +28,7 @@ For this tutorial, you will need to meet the following prerequisites: ### Shortfin LLM Server - A running `shortfin` LLM server. 
Directions on launching the llm server on one system can be found [here](./llama_end_to_end.md) and for launching -on a kubernetes cluster, please look [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_k8s.md) +on a kubernetes cluster, please look [here](./e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. From 78dcc9bf38557cc6a9294cbae2b6120e39b5ccff Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 10:22:43 -0600 Subject: [PATCH 06/18] move k8s deployment file --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 2 +- .../shortfin_apps/llm/k8s/llama-app-deployment.yaml | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename shortfin/{python => deployment}/shortfin_apps/llm/k8s/llama-app-deployment.yaml (100%) diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md index 58f19c215..160d9a21e 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -16,7 +16,7 @@ behind a load balancer on MI300X GPU. ### Deploy shortfin llama app service -Save [llama-app-deployment.yaml](https://github.com/nod-ai/shark-ai/tree/main/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. +Save [llama-app-deployment.yaml](../../../../shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. To deploy llama app: diff --git a/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml similarity index 100% rename from shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml rename to shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml From 0a39a523fc4af64937058d2ddff99f03bd24f770 Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:42:40 -0800 Subject: [PATCH 07/18] allow multiple yaml docs in one for check-yaml --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f2be5cf6..e2e8d797f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,6 +7,7 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml + args: ['--allow-multiple-documents'] - id: check-added-large-files - repo: https://github.com/psf/black rev: 22.10.0 From 34c7df46f5140e0dd808c6830408f045f935e942 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 12:28:45 -0600 Subject: [PATCH 08/18] remove amd specific things --- .../shortfin_apps/llm/k8s/llama-app-deployment.yaml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml index 57a9995a9..5ea37267b 100644 --- a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml +++ b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -1,17 +1,16 @@ apiVersion: apps/v1 kind: Deployment metadata: - # please change name to include your amd username to reduce conflict name: shark-llama-app-deployment spec: replicas: 4 # number of server instances selector: matchLabels: - 
app: shark-llama-app # please change name to include your amd username + app: shark-llama-app template: metadata: labels: - app: shark-llama-app # please change name to include your amd username + app: shark-llama-app spec: containers: - name: shark-llama-app-container @@ -48,13 +47,10 @@ spec: apiVersion: v1 kind: Service metadata: - name: shark-llama-app-service # please change name to include your amd username - # specific to OCI in how we expose the load balancer (only AMD network available currently) - annotations: - service.beta.kubernetes.io/oci-load-balancer-internal: "true" + name: shark-llama-app-service spec: selector: - app: shark-llama-app # please change name to include your amd username + app: shark-llama-app ports: - protocol: TCP port: 80 # external port From 52de1abd3dc0c23cccdc11cc653172140c87d194 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 12:32:20 -0600 Subject: [PATCH 09/18] text gen update --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md index 160d9a21e..343001b47 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -30,7 +30,7 @@ To retrieve external IP for targetting the llama app load balancer: kubectl get service shark-llama-app-service ``` -Now, you can use the external IP for sglang integration or just sending image generation requests. +Now, you can use the external IP for sglang integration or just sending text generation requests. ### Delete shortfin llama app service From bd3e5ad5d995ce65e9ea6cc3ffe9fae0c7c300a1 Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 20 Dec 2024 10:34:07 -0800 Subject: [PATCH 10/18] inline document titles for readme links Co-authored-by: Scott Todd --- .../llm/user/shortfin_with_sglang_frontend_language.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index fe986ca9b..53524f373 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -27,8 +27,8 @@ For this tutorial, you will need to meet the following prerequisites: ### Shortfin LLM Server -- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found [here](./llama_end_to_end.md) and for launching -on a kubernetes cluster, please look [here](./e2e_llama8b_k8s.md) +- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found in [Llama end to end serving instructions](./llama_end_to_end.md) and for launching +on a kubernetes cluster, see [LLama 8b GPU instructions on Kubernetes](./e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. 
From 2b31e45b9fb82239aabcbbb395791bb280012733 Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Fri, 20 Dec 2024 12:58:32 -0600 Subject: [PATCH 11/18] Add clarification for BASE_URL, and use environment variable to access BASE_URL in code examples --- .../shortfin_with_sglang_frontend_language.md | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index 53524f373..d2bcbfd12 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -4,15 +4,15 @@ This doc includes basic steps for hooking up sglang with a running Shortfin serv ## Current Support Status -| Feature | Description | Enabled | Reference | -| ----------- | ----------- | ---------- | ------------ | -| `gen` | Generate shortfin completion, given a prompt | ✅ | [Shortfin Implementation](https://github.com/nod-ai/sglang/blob/main/python/sglang/lang/backend/shortfin.py) | -| `streaming` | Stream shortfin completion, given a prompt | ✅ | [Streaming](https://sgl-project.github.io/frontend/frontend.html#streaming) | -| `run_batch` | Run batch of disjoint requests with continous batching | ✅ | [Batching](https://sgl-project.github.io/frontend/frontend.html#batching) | -| `fork` | Generate sections of the same prompt in parallel | ✅ | [Fork Docs](https://sgl-project.github.io/frontend/frontend.html#parallelism) | -| `choices` | Given set of choices, generate response based on best log probs | ❌ | [Choices Methods](https://sgl-project.github.io/frontend/choices_methods.html#choices-methods-in-sglang) | -| `image` | Pass image as part of multi-modal prompt | ❌ | [sgl.image](https://sgl-project.github.io/frontend/frontend.html#multi-modality) | -| `regex` | Specify regular expression as decoding constraint | ❌ | [Regex](https://sgl-project.github.io/frontend/frontend.html#constrained-decoding) | +| Feature | Description | Enabled | Reference | +| ----------- | --------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------ | +| `gen` | Generate shortfin completion, given a prompt | ✅ | [Shortfin Implementation](https://github.com/nod-ai/sglang/blob/main/python/sglang/lang/backend/shortfin.py) | +| `streaming` | Stream shortfin completion, given a prompt | ✅ | [Streaming](https://sgl-project.github.io/frontend/frontend.html#streaming) | +| `run_batch` | Run batch of disjoint requests with continous batching | ✅ | [Batching](https://sgl-project.github.io/frontend/frontend.html#batching) | +| `fork` | Generate sections of the same prompt in parallel | ✅ | [Fork Docs](https://sgl-project.github.io/frontend/frontend.html#parallelism) | +| `choices` | Given set of choices, generate response based on best log probs | ❌ | [Choices Methods](https://sgl-project.github.io/frontend/choices_methods.html#choices-methods-in-sglang) | +| `image` | Pass image as part of multi-modal prompt | ❌ | [sgl.image](https://sgl-project.github.io/frontend/frontend.html#multi-modality) | +| `regex` | Specify regular expression as decoding constraint | ❌ | [Regex](https://sgl-project.github.io/frontend/frontend.html#constrained-decoding) | ## Prerequisites @@ -33,6 +33,14 @@ on a kubernetes cluster, see [LLama 8b GPU instructions on Kubernetes](./e2e_lla from SGLang's `frontend language`. 
In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. +After the `shortfin` LLM Server has started, we must obtain the base_url. +We will store this in our environment in order to send request to `shortfin` + through the `sglang` client examples below. + +```bash +export SHORTFIN_BASE_URL="SHORTFIN_BASE_URL" # example: http://localhost:8000 +``` + ## Install sglang ### Install sglang inside of virtual environment @@ -72,11 +80,15 @@ python You can copy and paste the following example into your interpreter: ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80", ) # Change base_url if running at different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) sgl.set_default_backend(backend) @@ -98,10 +110,14 @@ for m in state.messages(): We can stream our request for a more responsive feel. Let's invoke a `streaming` Q&A from our server: ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at a different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) sgl.set_default_backend(backend) @@ -138,11 +154,15 @@ We can also send different pieces of the same prompt in parallel using the `fork flow with the SGLang [Frontend Language](https://sgl-project.github.io/frontend/frontend.html): ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) sgl.set_default_backend(backend) @@ -171,11 +191,14 @@ With **Shortfin** + SGLang, we can also easily send requests as a batch. 
Let's now invoke a `batched` Q&A flow with the SGLang [Batching](https://sgl-project.github.io/frontend/frontend.html#batching): ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -# Initialize the backend with the specified chat template and base URL -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at a different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) # Set the default backend for sglang sgl.set_default_backend(backend) From b2d2d32c0dcab2278f3844ff0ff22cf24bdc3434 Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 20 Dec 2024 11:25:45 -0800 Subject: [PATCH 12/18] Llama instead of LLama Co-authored-by: Scott Todd --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md index 343001b47..a1fdb5b0b 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -1,4 +1,4 @@ -# LLama 8b GPU instructions on Kubernetes +# Llama 8b GPU instructions on Kubernetes ## Setup From 739324c7e0920afe7ea93c20decea37616a31744 Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 20 Dec 2024 11:26:02 -0800 Subject: [PATCH 13/18] Llama instead of LLama Co-authored-by: Scott Todd --- .../shortfin/llm/user/shortfin_with_sglang_frontend_language.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index d2bcbfd12..9d6cf4493 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -28,7 +28,7 @@ For this tutorial, you will need to meet the following prerequisites: ### Shortfin LLM Server - A running `shortfin` LLM server. Directions on launching the llm server on one system can be found in [Llama end to end serving instructions](./llama_end_to_end.md) and for launching -on a kubernetes cluster, see [LLama 8b GPU instructions on Kubernetes](./e2e_llama8b_k8s.md) +on a kubernetes cluster, see [Llama 8b GPU instructions on Kubernetes](./e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. 
From 6d8be0f1af1f3c691d0b6e9bf496365af62ce541 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 14:06:05 -0600 Subject: [PATCH 14/18] rename file names and update docs --- .../llm/user/{llama_end_to_end.md => llama_serving.md} | 0 .../{e2e_llama8b_k8s.md => llama_serving_on_kubernetes.md} | 1 + .../llm/user/shortfin_with_sglang_frontend_language.md | 4 ++-- 3 files changed, 3 insertions(+), 2 deletions(-) rename docs/shortfin/llm/user/{llama_end_to_end.md => llama_serving.md} (100%) rename docs/shortfin/llm/user/{e2e_llama8b_k8s.md => llama_serving_on_kubernetes.md} (87%) diff --git a/docs/shortfin/llm/user/llama_end_to_end.md b/docs/shortfin/llm/user/llama_serving.md similarity index 100% rename from docs/shortfin/llm/user/llama_end_to_end.md rename to docs/shortfin/llm/user/llama_serving.md diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md similarity index 87% rename from docs/shortfin/llm/user/e2e_llama8b_k8s.md rename to docs/shortfin/llm/user/llama_serving_on_kubernetes.md index a1fdb5b0b..47e69f41a 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md @@ -16,6 +16,7 @@ behind a load balancer on MI300X GPU. ### Deploy shortfin llama app service +To generate the artifacts required for this k8s deployment, please follow [llama_serving.md](./llama_serving.md) until you have have all of the files that we need to run the shortfin LLM server. Save [llama-app-deployment.yaml](../../../../shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. To deploy llama app: diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index 9d6cf4493..ad6307094 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -27,8 +27,8 @@ For this tutorial, you will need to meet the following prerequisites: ### Shortfin LLM Server -- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found in [Llama end to end serving instructions](./llama_end_to_end.md) and for launching -on a kubernetes cluster, see [Llama 8b GPU instructions on Kubernetes](./e2e_llama8b_k8s.md) +- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found in [Llama end to end serving instructions](./llama_serving.md) and for launching +on a kubernetes cluster, see [Llama 8b GPU instructions on Kubernetes](./llama_serving_on_kubernetes.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. 
From b722269f1375695c8dbcb5127a084e50d84a2c33 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 14:11:39 -0600 Subject: [PATCH 15/18] more explicit about artifacts in deployment file --- .../deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml index 5ea37267b..068f8cdb6 100644 --- a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml +++ b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -16,7 +16,8 @@ spec: - name: shark-llama-app-container image: rocm/dev-ubuntu-22.04:6.3 command: ["/bin/bash", "-c"] - # update to your artifacts and change cli flags for instantiation of server to match your intended llama configuration + # update to artifacts you generated form llama_serving.md (this is an example with the base llama3.1 8b tp1 artifacts) + # change cli flags for instantiation of server to match your intended llama configuration args: - | sudo apt update && From 50e473e8110e26a31734fb794ccde2160005e7f3 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 14:14:05 -0600 Subject: [PATCH 16/18] remove device id cli flag --- .../deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml index 068f8cdb6..08a22aa3d 100644 --- a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml +++ b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -34,7 +34,7 @@ spec: wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/tokenizer.json -O shark_artifacts/tokenizer.json && pip install --pre shortfin[apps] -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels && pip install pandas && - python -m shortfin_apps.llm.server --tokenizer_json=shark_artifacts/tokenizer.json --model_config=shark_artifacts/config.json --vmfb=shark_artifacts/model.vmfb --parameters=shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf --device_ids 0 --device=hip; + python -m shortfin_apps.llm.server --tokenizer_json=shark_artifacts/tokenizer.json --model_config=shark_artifacts/config.json --vmfb=shark_artifacts/model.vmfb --parameters=shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf --device=hip; resources: # change number of gpus required here based on your llama configuration requests: From 91d5fb0d84fe855395201ea3a28422e77c08a260 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 14:23:58 -0600 Subject: [PATCH 17/18] more storage instructions for artifacts --- docs/shortfin/llm/user/llama_serving_on_kubernetes.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/llama_serving_on_kubernetes.md b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md index 47e69f41a..f573bd8ae 100644 --- a/docs/shortfin/llm/user/llama_serving_on_kubernetes.md +++ b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md @@ -17,7 +17,8 @@ behind a load balancer on MI300X GPU. ### Deploy shortfin llama app service To generate the artifacts required for this k8s deployment, please follow [llama_serving.md](./llama_serving.md) until you have have all of the files that we need to run the shortfin LLM server. 
-Save [llama-app-deployment.yaml](../../../../shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. +Please upload your artifacts to a storage option that you can pull from in your k8s cluster (NFS, S3, CSP). +Save [llama-app-deployment.yaml](../../../../shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts you just stored and change flags to intended configuration. To deploy llama app: From 0e3944521cf6ef24db5e363584d6897bbf80279a Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Fri, 20 Dec 2024 19:56:15 -0600 Subject: [PATCH 18/18] Add explanation for completion params that can be passed through `SGLang`, Added `Server Options` to `llama_serving.md`, detailing server flags --- docs/shortfin/llm/user/llama_serving.md | 29 +++++++++++++++++++ .../shortfin_with_sglang_frontend_language.md | 14 +++++++++ 2 files changed, 43 insertions(+) diff --git a/docs/shortfin/llm/user/llama_serving.md b/docs/shortfin/llm/user/llama_serving.md index a74851407..cc2c959b4 100644 --- a/docs/shortfin/llm/user/llama_serving.md +++ b/docs/shortfin/llm/user/llama_serving.md @@ -272,3 +272,32 @@ If you want to find the process again: ```bash ps -f | grep shortfin ``` + +## Server Options + +To run the server with different options, you can use the +following command to see the available flags: + +```bash +python -m shortfin_apps.llm.server --help +``` + +### Server Options + +A full list of options can be found below: + +| Argument | Description | +| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--host HOST` | Specify the host to bind the server. | +| `--port PORT` | Specify the port to bind the server. | +| `--root-path ROOT_PATH` | Root path to use for installing behind a path-based proxy. | +| `--timeout-keep-alive TIMEOUT_KEEP_ALIVE` | Keep-alive timeout duration. | +| `--tokenizer_json TOKENIZER_JSON` | Path to a `tokenizer.json` file. | +| `--tokenizer_config_json TOKENIZER_CONFIG_JSON` | Path to a `tokenizer_config.json` file. | +| `--model_config MODEL_CONFIG` | Path to the model config file. | +| `--vmfb VMFB` | Model [VMFB](https://iree.dev/developers/general/developer-tips/#inspecting-vmfb-files) to load. | +| `--parameters [FILE ...]` | Parameter archives to load (supports: `gguf`, `irpa`, `safetensors`). | +| `--device {local-task,hip,amdgpu}` | Device to serve on (e.g., `local-task`, `hip`). Same options as [iree-run-module --list_drivers](https://iree.dev/guides/deployment-configurations/gpu-rocm/#get-the-iree-runtime). | +| `--device_ids [DEVICE_IDS ...]` | Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a device ID like `amdgpu:0:0@0`. | +| `--isolation {none,per_fiber,per_call}` | Concurrency control: How to isolate programs. | +| `--amdgpu_async_allocations` | Enable asynchronous allocations for AMD GPU device contexts. 
|
diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md
index ad6307094..3812b5277 100644
--- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md
+++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md
@@ -64,6 +64,20 @@ You can verify the installation/setup through the following examples:
 - [Fork Example](#fork-example)
 - [Multi-Turn Q&A Batching Example](#multi-turn-qa-batch-example)
 
+In these examples, we will set our `max_tokens` to 50 when generating completions.
+This controls how many tokens the server will generate for each completion.
+
+We can modify the arguments passed to `sgl.gen` to alter the outputs of our
+`shortfin` LLM server. Specifically:
+
+- `max_tokens` - The maximum number of tokens to generate per completion.
+  We may obtain longer responses by increasing this value,
+  and shorter responses by decreasing it.
+- `temperature` - Controls the randomness of the generated completions.
+  A higher value results in more randomness,
+  while a lower value results in more
+  deterministic completions.
+
 ## Multi-Turn Q&A example
 
 Now that we have sglang installed, we can run an example to show a multi-turn