From 18b4141297b05cd53d25962b88fce67b8aa8f87c Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Mon, 16 Dec 2024 12:44:08 -0600 Subject: [PATCH 01/18] Cleanup sglang doc for user flow, Expand examples for targeting shortfin from sglang --- .../shortfin_with_sglang_frontend_language.md | 223 ++++++++---------- 1 file changed, 95 insertions(+), 128 deletions(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index b63861a56..858fda118 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -24,21 +24,11 @@ For this tutorial, you will need to meet the following prerequisites: - You can check out [pyenv](https://github.com/pyenv/pyenv) as a good tool to be able to manage multiple versions of python on the same system. -- A running `shortfin` LLM server as described [below](#installstart-shortfin-llm-server) +- A running `shortfin` LLM server. Directions on launching the llm server can be found [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. -### Hardware - -- This tutorial is designed to run on an [AMD MI300X GPU](https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html) - -## Install/Start `shortfin` LLM server - -Follow the steps [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) -to export a model with `sharktank` and start a `shortfin` LLM server -with that model. - ## Install sglang ### Install sglang inside of virtual environment @@ -48,6 +38,8 @@ We can use pip to install it in the same virtual environment that we used to start our Shortfin LLM Server. 
```bash +python -m venv --prompt shark-ai .venv +source .venv/bin/activate pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python" ``` @@ -56,8 +48,9 @@ pip install "git+https://github.com/nod-ai/sglang.git#subdirectory=python" You can verify the installation/setup through the following examples: - [Multi-Turn Q&A Example](#multi-turn-qa-example) +- [Streaming Example](#streaming-example) - [Fork Example](#fork-example) -- [Benchmark Shortfin](#bench-mark-shortfin-w-sglang-bench_serving-script) +- [Multi-Turn Q&A Batching Example](#multi-turn-qa-batch-example) ## Multi-Turn Q&A example @@ -79,16 +72,16 @@ import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://localhost:8000", ) # Change base_url if running at different address +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80", ) # Change base_url if running at different address sgl.set_default_backend(backend) @sgl.function def multi_turn_question(s, question_1, question_2): s += sgl.user(question_1) - s += sgl.assistant(sgl.gen("answer_1", max_tokens=256)) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=50)) s += sgl.user(question_2) - s += sgl.assistant(sgl.gen("answer_2", max_tokens=256)) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=50)) state = multi_turn_question.run(question_1="Name the capital city of the USA.", question_2="The Smithsonian is in this location.") @@ -96,40 +89,56 @@ for m in state.messages(): print(m["role"], m["content"]) ``` -### Shortfin example output +## Streaming Example -You should see an output similar to this: +We can stream our request for a more responsive feel. Let's invoke a `streaming` Q&A from our server: -```text -========== single ========== +```python +import sglang as sgl +from sglang.lang.chat_template import get_chat_template -user : Name the capital city of the USA -assistant : The capital city of the United States of America is Washington, D.C. (short for District of Columbia). -user : The Smithsonian is in this location. -assistant : The Smithsonian Institution is indeed located in Washington, D.C. and is one of the world's largest and most comprehensive museums and research complexes. -``` +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at a different address -## Fork example +sgl.set_default_backend(backend) -Now that we have sglang installed, we can run an example to show a `fork` -flow with the SGLang [Frontend Language](https://sgl-project.github.io/frontend/frontend.html): +@sgl.function +def multi_turn_question(s, question_1, question_2): + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=50)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=50)) -### Open python interpreter +question_1 = "Name the capital city of the USA." +question_2 = "The Smithsonian is in this location." 
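+
+# Passing `stream=True` to `run` below asks the backend to stream tokens back as
+# they are generated; `state.text_iter()` then yields the completion incrementally.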
-```bash -python +# Run the multi-turn question function with streaming enabled +state = multi_turn_question.run( + question_1=question_1, + question_2=question_2, + stream=True, +) + +# Collect messages from the streamed output +messages = "" + +for chunk in state.text_iter(): + messages += chunk + +print(messages) ``` -### Run example -You can copy and paste the following example into your interpreter: +## Fork example + +We can also send different pieces of the same prompt in parallel using the `fork` +flow with the SGLang [Frontend Language](https://sgl-project.github.io/frontend/frontend.html): ```python import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://localhost:8000") # Change base_url if running at different address +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at different address sgl.set_default_backend(backend) @@ -142,7 +151,7 @@ def tip_suggestion(s): forks = s.fork(2) for i, f in enumerate(forks): f += f"Now, expand tip {i+1} into a paragraph:\n" - f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n") + f += sgl.gen(f"detailed_tip", max_tokens=50, stop="\n\n") s += "Tip 1:" + forks[0]["detailed_tip"] + "\n" s += "Tip 2:" + forks[1]["detailed_tip"] + "\n" s += "In summary" + sgl.gen("summary") @@ -152,103 +161,61 @@ state = tip_suggestion.run() print(state.text()) ``` -### Shortfin example output - -You should see an output similar to this: - -```text -Here are two tips for staying healthy: 1. Balanced Diet. 2. Regular Exercise. - -Tip 1:A balanced diet is important for maintaining good health. It should -include a variety of foods from all the major food groups, such as fruits, -vegetables, grains, proteins, and dairy. Eating a balanced diet can help -prevent chronic diseases such as heart disease, diabetes, and obesity. - -Now, expand tip 2 into a paragraph: -Regular exercise is also important for maintaining good health. It can help -improve cardiovascular health, strengthen muscles and bones, and reduce the -risk of chronic diseases. Exercise can also help improve mental health by -reducing stress and anxiety. It is recommended that adults get at least 150 -minutes of moderate-intensity exercise or 75 minutes of vigorous-intensity -exercise per week. - -Now, combine the two paragraphs into a single paragraph: -A balanced diet and regular exercise are both important for maintaining good -health. A balanced diet should include a variety of foods from all the major -food groups, such as fruits, vegetables, grains, proteins, and dairy. -Eating a balanced diet can help prevent chronic diseases such as heart disease, -diabetes, and obesity. Regular exercise is also important for maintaining good -health. It can help improve cardiovascular health, strengthen muscles and bones, -and reduce the risk of chronic diseases. Exercise can also help improve mental -health by reducing stress and anxiety. It is recommended that - -Tip 2:Regular exercise is important for maintaining a healthy body and mind. -It can help improve cardiovascular health, strengthen muscles and bones, -and reduce the risk of chronic diseases such as diabetes and heart disease. -Additionally, exercise has been shown to improve mood, reduce stress, -and increase overall well-being. 
It is recommended that adults engage in -at least 150 minutes of moderate-intensity aerobic activity or 75 minutes of -vigorous-intensity aerobic activity per week, as well as strength training -exercises at least two days per week. - -In summary, a balanced diet and regular exercise are both essential for -maintaining good health. A balanced diet should include a variety of foods from -all the major food groups, while regular exercise can help improve -cardiovascular health, strengthen muscles and bones, reduce the risk of -chronic diseases, and improve mental health. It is recommended that adults -engage in at least 150 minutes of moderate-intensity aerobic activity or -75 minutes of vigorous-intensity aerobic activity per week, -as well as strength training exercises at least two days per week. -``` +## Multi-Turn Q&A Batch Example + +With **Shortfin** + SGLang, we can also easily send requests as a batch. +Let's now invoke a `batched` Q&A flow with the SGLang [Batching](https://sgl-project.github.io/frontend/frontend.html#batching): + +```python +import sglang as sgl +from sglang.lang.chat_template import get_chat_template -## Benchmark shortfin w/ sglang `bench_serving` script +# Initialize the backend with the specified chat template and base URL +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at a different address -We can obtain benchmarking metrics using the `bench_serving` script -provided by SGLang: +# Set the default backend for sglang +sgl.set_default_backend(backend) -**NOTE: Change `--base-url` if running at a different address** +@sgl.function +def multi_turn_question(s, question_1, question_2): + s += sgl.user(question_1) + s += sgl.assistant(sgl.gen("answer_1", max_tokens=50)) + s += sgl.user(question_2) + s += sgl.assistant(sgl.gen("answer_2", max_tokens=50)) + +# Define the questions for the first and second sets +question_1_1 = "Name the capital city of the USA." +question_1_2 = "The Smithsonian is in this location." +question_2_1 = "Name the largest city in the USA." +question_2_2 = "The Empire State Building is in this location." 
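+
+# Each dictionary below supplies the keyword arguments for one call to
+# `multi_turn_question`; `run_batch` sends the whole list to the server as a single batch.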
+ +# Run the multi-turn question function in batch mode +states = multi_turn_question.run_batch( + [ + { + "question_1": question_1_1, + "question_2": question_1_2, + }, + { + "question_1": question_2_1, + "question_2": question_2_2, + }, + ] +) + +# Extract responses from the states +first_qa = states[0] +second_qa = states[1] + +first_qa_messages = first_qa.messages() +second_qa_messages = second_qa.messages() + +# Print messages from the first QA session +for m in first_qa_messages: + print(m["role"], m["content"]) -```bash -python -m sglang.bench_serving --backend shortfin --num-prompt 10 --base-url http://localhost:8000 --tokenizer /path/to/tokenizer/dir --request-rate 1 -``` +# Print messages from the second QA session +for m in second_qa_messages: + print(m["role"], m["content"]) -There are some more metrics captured, but the most relevant are the following: - -- E2E Latency -- TTFT (Time to First Token) -- TPOT (Time per Output Token) -- ITL (Inter-Token Latency) -- Request Throughput -- Benchmark Duration - -When complete, you should see an output similar to this: - -```text -============ Serving Benchmark Result ============ -Backend: shortfin -Traffic request rate: 1.0 -Successful requests: 10 -Benchmark duration (s): 427.91 -Total input tokens: 1960 -Total generated tokens: 2774 -Total generated tokens (retokenized): 63 -Request throughput (req/s): 0.02 -Input token throughput (tok/s): 4.58 -Output token throughput (tok/s): 6.48 -----------------End-to-End Latency---------------- -Mean E2E Latency (ms): 416268.77 -Median E2E Latency (ms): 417159.14 ----------------Time to First Token---------------- -Mean TTFT (ms): 292404.29 -Median TTFT (ms): 365989.01 -P99 TTFT (ms): 367325.63 ------Time per Output Token (excl. 1st token)------ -Mean TPOT (ms): 1359.41 -Median TPOT (ms): 163.96 -P99 TPOT (ms): 6316.12 ----------------Inter-token Latency---------------- -Mean ITL (ms): 2238.99 -Median ITL (ms): 958.75 -P99 ITL (ms): 2719.50 -================================================== ``` From 84259208c0cc1a34dcd5b5ad87ce24c20eab982a Mon Sep 17 00:00:00 2001 From: saienduri Date: Tue, 17 Dec 2024 10:47:43 -0600 Subject: [PATCH 02/18] add k8s instructions --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 42 ++++++++++++ .../shortfin_with_sglang_frontend_language.md | 3 +- .../llm/k8s/llama-app-deployment.yaml | 64 +++++++++++++++++++ 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 docs/shortfin/llm/user/e2e_llama8b_k8s.md create mode 100644 shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md new file mode 100644 index 000000000..99ac3561e --- /dev/null +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -0,0 +1,42 @@ +# LLama 8b GPU instructions on Kubernetes + +## Setup + +We will use an example with `llama_8b_f16` in order to describe the +process of exporting a model and deploying four instances of a shortfin llm server +behind a load balancer on MI300X GPU. + +### Pre-Requisites + +- Kubernetes cluster available to use +- kubectl installed on system and configured for cluster of interest + - To install kubectl, please check out [kubectl install](https://kubernetes.io/docs/tasks/tools/#kubectl) + and make sure to set the `KUBECONFIG` environment variable to point to your kube config file to authorize + connection to the cluster. 
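+
+For example, to confirm that `kubectl` is pointed at the right cluster before moving on, you can run a quick check like the one below (the kubeconfig path is only an illustrative placeholder):
+
+```
+export KUBECONFIG=/path/to/your/kubeconfig
+kubectl cluster-info
+kubectl get nodes
+```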
+ +### Deploy shortfin llama app service + +Please edit the following file to fetch the correct artifacts and serve the intended configuration of the llama3 model for your use case [here](https://github.com/nod-ai/shark-ai/tree/main/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml). + +To deploy llama app: + +``` +kubectl apply -f llama-app-deployment.yaml +``` + +To retrieve external IP for targetting the llama app load balancer: + +``` +kubectl get service shark-llama-app-service +``` + +Now, you can use the external IP for sglang integration or just sending image generation requests. + +### Delete shortfin llama app service + +After done using, make sure to delete: + +``` +kubectl delete deployment shark-llama-app-deployment +kubectl delete service shark-llama-app-service +``` diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index 858fda118..832ec9c3d 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -24,7 +24,8 @@ For this tutorial, you will need to meet the following prerequisites: - You can check out [pyenv](https://github.com/pyenv/pyenv) as a good tool to be able to manage multiple versions of python on the same system. -- A running `shortfin` LLM server. Directions on launching the llm server can be found [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) +- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) and for launching +on a kubernetes cluster, please look [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. diff --git a/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml new file mode 100644 index 000000000..b2d86efd2 --- /dev/null +++ b/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -0,0 +1,64 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + # please change name to include your amd username to reduce conflict + name: shark-llama-app-deployment +spec: + replicas: 4 # number of server instances + selector: + matchLabels: + app: shark-llama-app # please change name to include your amd username + template: + metadata: + labels: + app: shark-llama-app # please change name to include your amd username + spec: + containers: + - name: shark-llama-app-container + image: rocm/dev-ubuntu-22.04:6.3 + command: ["/bin/bash", "-c"] + # update to your artifacts and change cli flags for instantiation of server to match your intended llama configuration + args: + - | + rocminfo && + echo "ROCm is working!" 
&& + sudo apt update && + curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash && + sudo apt install git -y && + sudo apt install python3.11 python3.11-dev python3.11-venv -y && + sudo apt-get install wget -y && + python3.11 -m venv shark_venv && source shark_venv/bin/activate && + mkdir shark_artifacts && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/config.json -O shark_artifacts/config.json && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/meta-llama-3.1-8b-instruct.f16.gguf -O shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/model.vmfb -O shark_artifacts/model.vmfb && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/tokenizer_config.json -O shark_artifacts/tokenizer_config.json && + wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/tokenizer.json -O shark_artifacts/tokenizer.json && + pip install --pre shortfin[apps] -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels && + pip install pandas && + python -m shortfin_apps.llm.server --tokenizer_json=shark_artifacts/tokenizer.json --model_config=shark_artifacts/config.json --vmfb=shark_artifacts/model.vmfb --parameters=shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf --device_ids 0 --device=hip; + resources: + # change number of gpus required here based on your llama configuration + requests: + amd.com/gpu: 1 + limits: + amd.com/gpu: 1 + restartPolicy: Always + +--- + +apiVersion: v1 +kind: Service +metadata: + name: shark-llama-app-service # please change name to include your amd username + # specific to OCI in how we expose the load balancer (only AMD network available currently) + annotations: + service.beta.kubernetes.io/oci-load-balancer-internal: "true" +spec: + selector: + app: shark-llama-app # please change name to include your amd username + ports: + - protocol: TCP + port: 80 # external port + targetPort: 8000 # port the container exposes + type: LoadBalancer From a32a2eb466ab6693e1f1c91d2857b19420455da6 Mon Sep 17 00:00:00 2001 From: saienduri Date: Tue, 17 Dec 2024 18:40:32 -0600 Subject: [PATCH 03/18] update doc wording --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 2 +- shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md index 99ac3561e..58f19c215 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -16,7 +16,7 @@ behind a load balancer on MI300X GPU. ### Deploy shortfin llama app service -Please edit the following file to fetch the correct artifacts and serve the intended configuration of the llama3 model for your use case [here](https://github.com/nod-ai/shark-ai/tree/main/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml). +Save [llama-app-deployment.yaml](https://github.com/nod-ai/shark-ai/tree/main/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. 
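+
+Once you have edited the file, you can optionally validate it before creating any resources with a client-side dry run (this only checks the manifest locally and does not deploy anything):
+
+```
+kubectl apply -f llama-app-deployment.yaml --dry-run=client
+```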
To deploy llama app: diff --git a/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml index b2d86efd2..57a9995a9 100644 --- a/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml +++ b/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -20,8 +20,6 @@ spec: # update to your artifacts and change cli flags for instantiation of server to match your intended llama configuration args: - | - rocminfo && - echo "ROCm is working!" && sudo apt update && curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash && sudo apt install git -y && From bfe1058433af0351834e7e72b8021a2f4434c3a5 Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Wed, 18 Dec 2024 17:57:56 -0600 Subject: [PATCH 04/18] Make shortfin server heading for more visible doc links, Add iree-base-runtime, iree-base-compiler and iree-turbine to nightly install instructions --- docs/shortfin/llm/user/e2e_llama8b_mi300x.md | 5 +++++ .../llm/user/shortfin_with_sglang_frontend_language.md | 3 +++ 2 files changed, 8 insertions(+) diff --git a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md index 313a8086c..87ceb84dd 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md +++ b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md @@ -39,6 +39,11 @@ To install nightly packages: ```bash pip install shark-ai[apps] sharktank \ --pre --find-links https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels +pip install -f https://iree.dev/pip-release-links.html --pre --upgrade \ + iree-base-compiler \ + iree-base-runtime \ + iree-turbine \ + "numpy<2.0" ``` See also the diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index 832ec9c3d..0dbbeb56b 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -24,6 +24,9 @@ For this tutorial, you will need to meet the following prerequisites: - You can check out [pyenv](https://github.com/pyenv/pyenv) as a good tool to be able to manage multiple versions of python on the same system. + +### Shortfin LLM Server + - A running `shortfin` LLM server. Directions on launching the llm server on one system can be found [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_mi300x.md) and for launching on a kubernetes cluster, please look [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions From b320fcb026f84adeed0c1639d0c45a8bc55ac51a Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Fri, 20 Dec 2024 07:52:30 -0600 Subject: [PATCH 05/18] Update link to k8 docs --- .../shortfin/llm/user/shortfin_with_sglang_frontend_language.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index e1f1de3f2..fe986ca9b 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -28,7 +28,7 @@ For this tutorial, you will need to meet the following prerequisites: ### Shortfin LLM Server - A running `shortfin` LLM server. 
Directions on launching the llm server on one system can be found [here](./llama_end_to_end.md) and for launching -on a kubernetes cluster, please look [here](https://github.com/nod-ai/shark-ai/blob/main/docs/shortfin/llm/user/e2e_llama8b_k8s.md) +on a kubernetes cluster, please look [here](./e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. From 78dcc9bf38557cc6a9294cbae2b6120e39b5ccff Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 10:22:43 -0600 Subject: [PATCH 06/18] move k8s deployment file --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 2 +- .../shortfin_apps/llm/k8s/llama-app-deployment.yaml | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename shortfin/{python => deployment}/shortfin_apps/llm/k8s/llama-app-deployment.yaml (100%) diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md index 58f19c215..160d9a21e 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -16,7 +16,7 @@ behind a load balancer on MI300X GPU. ### Deploy shortfin llama app service -Save [llama-app-deployment.yaml](https://github.com/nod-ai/shark-ai/tree/main/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. +Save [llama-app-deployment.yaml](../../../../shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. To deploy llama app: diff --git a/shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml similarity index 100% rename from shortfin/python/shortfin_apps/llm/k8s/llama-app-deployment.yaml rename to shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml From 0a39a523fc4af64937058d2ddff99f03bd24f770 Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 20 Dec 2024 08:42:40 -0800 Subject: [PATCH 07/18] allow multiple yaml docs in one for check-yaml --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f2be5cf6..e2e8d797f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,6 +7,7 @@ repos: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml + args: ['--allow-multiple-documents'] - id: check-added-large-files - repo: https://github.com/psf/black rev: 22.10.0 From 34c7df46f5140e0dd808c6830408f045f935e942 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 12:28:45 -0600 Subject: [PATCH 08/18] remove amd specific things --- .../shortfin_apps/llm/k8s/llama-app-deployment.yaml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml index 57a9995a9..5ea37267b 100644 --- a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml +++ b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -1,17 +1,16 @@ apiVersion: apps/v1 kind: Deployment metadata: - # please change name to include your amd username to reduce conflict name: shark-llama-app-deployment spec: replicas: 4 # number of server instances selector: matchLabels: - 
app: shark-llama-app # please change name to include your amd username + app: shark-llama-app template: metadata: labels: - app: shark-llama-app # please change name to include your amd username + app: shark-llama-app spec: containers: - name: shark-llama-app-container @@ -48,13 +47,10 @@ spec: apiVersion: v1 kind: Service metadata: - name: shark-llama-app-service # please change name to include your amd username - # specific to OCI in how we expose the load balancer (only AMD network available currently) - annotations: - service.beta.kubernetes.io/oci-load-balancer-internal: "true" + name: shark-llama-app-service spec: selector: - app: shark-llama-app # please change name to include your amd username + app: shark-llama-app ports: - protocol: TCP port: 80 # external port From 52de1abd3dc0c23cccdc11cc653172140c87d194 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 12:32:20 -0600 Subject: [PATCH 09/18] text gen update --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md index 160d9a21e..343001b47 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -30,7 +30,7 @@ To retrieve external IP for targetting the llama app load balancer: kubectl get service shark-llama-app-service ``` -Now, you can use the external IP for sglang integration or just sending image generation requests. +Now, you can use the external IP for sglang integration or just sending text generation requests. ### Delete shortfin llama app service From bd3e5ad5d995ce65e9ea6cc3ffe9fae0c7c300a1 Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 20 Dec 2024 10:34:07 -0800 Subject: [PATCH 10/18] inline document titles for readme links Co-authored-by: Scott Todd --- .../llm/user/shortfin_with_sglang_frontend_language.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index fe986ca9b..53524f373 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -27,8 +27,8 @@ For this tutorial, you will need to meet the following prerequisites: ### Shortfin LLM Server -- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found [here](./llama_end_to_end.md) and for launching -on a kubernetes cluster, please look [here](./e2e_llama8b_k8s.md) +- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found in [Llama end to end serving instructions](./llama_end_to_end.md) and for launching +on a kubernetes cluster, see [LLama 8b GPU instructions on Kubernetes](./e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. 
From 2b31e45b9fb82239aabcbbb395791bb280012733 Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Fri, 20 Dec 2024 12:58:32 -0600 Subject: [PATCH 11/18] Add clarification for BASE_URL, and use environment variable to access BASE_URL in code examples --- .../shortfin_with_sglang_frontend_language.md | 51 ++++++++++++++----- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index 53524f373..d2bcbfd12 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -4,15 +4,15 @@ This doc includes basic steps for hooking up sglang with a running Shortfin serv ## Current Support Status -| Feature | Description | Enabled | Reference | -| ----------- | ----------- | ---------- | ------------ | -| `gen` | Generate shortfin completion, given a prompt | ✅ | [Shortfin Implementation](https://github.com/nod-ai/sglang/blob/main/python/sglang/lang/backend/shortfin.py) | -| `streaming` | Stream shortfin completion, given a prompt | ✅ | [Streaming](https://sgl-project.github.io/frontend/frontend.html#streaming) | -| `run_batch` | Run batch of disjoint requests with continous batching | ✅ | [Batching](https://sgl-project.github.io/frontend/frontend.html#batching) | -| `fork` | Generate sections of the same prompt in parallel | ✅ | [Fork Docs](https://sgl-project.github.io/frontend/frontend.html#parallelism) | -| `choices` | Given set of choices, generate response based on best log probs | ❌ | [Choices Methods](https://sgl-project.github.io/frontend/choices_methods.html#choices-methods-in-sglang) | -| `image` | Pass image as part of multi-modal prompt | ❌ | [sgl.image](https://sgl-project.github.io/frontend/frontend.html#multi-modality) | -| `regex` | Specify regular expression as decoding constraint | ❌ | [Regex](https://sgl-project.github.io/frontend/frontend.html#constrained-decoding) | +| Feature | Description | Enabled | Reference | +| ----------- | --------------------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------ | +| `gen` | Generate shortfin completion, given a prompt | ✅ | [Shortfin Implementation](https://github.com/nod-ai/sglang/blob/main/python/sglang/lang/backend/shortfin.py) | +| `streaming` | Stream shortfin completion, given a prompt | ✅ | [Streaming](https://sgl-project.github.io/frontend/frontend.html#streaming) | +| `run_batch` | Run batch of disjoint requests with continous batching | ✅ | [Batching](https://sgl-project.github.io/frontend/frontend.html#batching) | +| `fork` | Generate sections of the same prompt in parallel | ✅ | [Fork Docs](https://sgl-project.github.io/frontend/frontend.html#parallelism) | +| `choices` | Given set of choices, generate response based on best log probs | ❌ | [Choices Methods](https://sgl-project.github.io/frontend/choices_methods.html#choices-methods-in-sglang) | +| `image` | Pass image as part of multi-modal prompt | ❌ | [sgl.image](https://sgl-project.github.io/frontend/frontend.html#multi-modality) | +| `regex` | Specify regular expression as decoding constraint | ❌ | [Regex](https://sgl-project.github.io/frontend/frontend.html#constrained-decoding) | ## Prerequisites @@ -33,6 +33,14 @@ on a kubernetes cluster, see [LLama 8b GPU instructions on Kubernetes](./e2e_lla from SGLang's `frontend language`. 
In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. +After the `shortfin` LLM Server has started, we must obtain the base_url. +We will store this in our environment in order to send request to `shortfin` + through the `sglang` client examples below. + +```bash +export SHORTFIN_BASE_URL="SHORTFIN_BASE_URL" # example: http://localhost:8000 +``` + ## Install sglang ### Install sglang inside of virtual environment @@ -72,11 +80,15 @@ python You can copy and paste the following example into your interpreter: ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80", ) # Change base_url if running at different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) sgl.set_default_backend(backend) @@ -98,10 +110,14 @@ for m in state.messages(): We can stream our request for a more responsive feel. Let's invoke a `streaming` Q&A from our server: ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at a different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) sgl.set_default_backend(backend) @@ -138,11 +154,15 @@ We can also send different pieces of the same prompt in parallel using the `fork flow with the SGLang [Frontend Language](https://sgl-project.github.io/frontend/frontend.html): ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) sgl.set_default_backend(backend) @@ -171,11 +191,14 @@ With **Shortfin** + SGLang, we can also easily send requests as a batch. 
Let's now invoke a `batched` Q&A flow with the SGLang [Batching](https://sgl-project.github.io/frontend/frontend.html#batching): ```python +import os + import sglang as sgl from sglang.lang.chat_template import get_chat_template -# Initialize the backend with the specified chat template and base URL -backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url="http://10.158.231.134:80") # Change base_url if running at a different address +SHORTFIN_BASE_URL = os.environ["SHORTFIN_BASE_URL"] + +backend = sgl.Shortfin(chat_template=get_chat_template("llama-3-instruct"), base_url=SHORTFIN_BASE_URL) # Set the default backend for sglang sgl.set_default_backend(backend) From b2d2d32c0dcab2278f3844ff0ff22cf24bdc3434 Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 20 Dec 2024 11:25:45 -0800 Subject: [PATCH 12/18] Llama instead of LLama Co-authored-by: Scott Todd --- docs/shortfin/llm/user/e2e_llama8b_k8s.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/e2e_llama8b_k8s.md index 343001b47..a1fdb5b0b 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/e2e_llama8b_k8s.md @@ -1,4 +1,4 @@ -# LLama 8b GPU instructions on Kubernetes +# Llama 8b GPU instructions on Kubernetes ## Setup From 739324c7e0920afe7ea93c20decea37616a31744 Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 20 Dec 2024 11:26:02 -0800 Subject: [PATCH 13/18] Llama instead of LLama Co-authored-by: Scott Todd --- .../shortfin/llm/user/shortfin_with_sglang_frontend_language.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index d2bcbfd12..9d6cf4493 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -28,7 +28,7 @@ For this tutorial, you will need to meet the following prerequisites: ### Shortfin LLM Server - A running `shortfin` LLM server. Directions on launching the llm server on one system can be found in [Llama end to end serving instructions](./llama_end_to_end.md) and for launching -on a kubernetes cluster, see [LLama 8b GPU instructions on Kubernetes](./e2e_llama8b_k8s.md) +on a kubernetes cluster, see [Llama 8b GPU instructions on Kubernetes](./e2e_llama8b_k8s.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. 
From 6d8be0f1af1f3c691d0b6e9bf496365af62ce541 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 14:06:05 -0600 Subject: [PATCH 14/18] rename file names and update docs --- .../llm/user/{llama_end_to_end.md => llama_serving.md} | 0 .../{e2e_llama8b_k8s.md => llama_serving_on_kubernetes.md} | 1 + .../llm/user/shortfin_with_sglang_frontend_language.md | 4 ++-- 3 files changed, 3 insertions(+), 2 deletions(-) rename docs/shortfin/llm/user/{llama_end_to_end.md => llama_serving.md} (100%) rename docs/shortfin/llm/user/{e2e_llama8b_k8s.md => llama_serving_on_kubernetes.md} (87%) diff --git a/docs/shortfin/llm/user/llama_end_to_end.md b/docs/shortfin/llm/user/llama_serving.md similarity index 100% rename from docs/shortfin/llm/user/llama_end_to_end.md rename to docs/shortfin/llm/user/llama_serving.md diff --git a/docs/shortfin/llm/user/e2e_llama8b_k8s.md b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md similarity index 87% rename from docs/shortfin/llm/user/e2e_llama8b_k8s.md rename to docs/shortfin/llm/user/llama_serving_on_kubernetes.md index a1fdb5b0b..47e69f41a 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_k8s.md +++ b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md @@ -16,6 +16,7 @@ behind a load balancer on MI300X GPU. ### Deploy shortfin llama app service +To generate the artifacts required for this k8s deployment, please follow [llama_serving.md](./llama_serving.md) until you have have all of the files that we need to run the shortfin LLM server. Save [llama-app-deployment.yaml](../../../../shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. To deploy llama app: diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md index 9d6cf4493..ad6307094 100644 --- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md +++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md @@ -27,8 +27,8 @@ For this tutorial, you will need to meet the following prerequisites: ### Shortfin LLM Server -- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found in [Llama end to end serving instructions](./llama_end_to_end.md) and for launching -on a kubernetes cluster, see [Llama 8b GPU instructions on Kubernetes](./e2e_llama8b_k8s.md) +- A running `shortfin` LLM server. Directions on launching the llm server on one system can be found in [Llama end to end serving instructions](./llama_serving.md) and for launching +on a kubernetes cluster, see [Llama 8b GPU instructions on Kubernetes](./llama_serving_on_kubernetes.md) - We will use the shortfin server as the `backend` to generate completions from SGLang's `frontend language`. In this tutorial, you can think of `sglang` as the client and `shortfin` as the server. 
From b722269f1375695c8dbcb5127a084e50d84a2c33 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 14:11:39 -0600 Subject: [PATCH 15/18] more explicit about artifacts in deployment file --- .../deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml index 5ea37267b..068f8cdb6 100644 --- a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml +++ b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -16,7 +16,8 @@ spec: - name: shark-llama-app-container image: rocm/dev-ubuntu-22.04:6.3 command: ["/bin/bash", "-c"] - # update to your artifacts and change cli flags for instantiation of server to match your intended llama configuration + # update to artifacts you generated form llama_serving.md (this is an example with the base llama3.1 8b tp1 artifacts) + # change cli flags for instantiation of server to match your intended llama configuration args: - | sudo apt update && From 50e473e8110e26a31734fb794ccde2160005e7f3 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 14:14:05 -0600 Subject: [PATCH 16/18] remove device id cli flag --- .../deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml index 068f8cdb6..08a22aa3d 100644 --- a/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml +++ b/shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml @@ -34,7 +34,7 @@ spec: wget https://sharkpublic.blob.core.windows.net/sharkpublic/stephen/llama3.1_8b/tokenizer.json -O shark_artifacts/tokenizer.json && pip install --pre shortfin[apps] -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels && pip install pandas && - python -m shortfin_apps.llm.server --tokenizer_json=shark_artifacts/tokenizer.json --model_config=shark_artifacts/config.json --vmfb=shark_artifacts/model.vmfb --parameters=shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf --device_ids 0 --device=hip; + python -m shortfin_apps.llm.server --tokenizer_json=shark_artifacts/tokenizer.json --model_config=shark_artifacts/config.json --vmfb=shark_artifacts/model.vmfb --parameters=shark_artifacts/meta-llama-3.1-8b-instruct.f16.gguf --device=hip; resources: # change number of gpus required here based on your llama configuration requests: From 91d5fb0d84fe855395201ea3a28422e77c08a260 Mon Sep 17 00:00:00 2001 From: saienduri Date: Fri, 20 Dec 2024 14:23:58 -0600 Subject: [PATCH 17/18] more storage instructions for artifacts --- docs/shortfin/llm/user/llama_serving_on_kubernetes.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/shortfin/llm/user/llama_serving_on_kubernetes.md b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md index 47e69f41a..f573bd8ae 100644 --- a/docs/shortfin/llm/user/llama_serving_on_kubernetes.md +++ b/docs/shortfin/llm/user/llama_serving_on_kubernetes.md @@ -17,7 +17,8 @@ behind a load balancer on MI300X GPU. ### Deploy shortfin llama app service To generate the artifacts required for this k8s deployment, please follow [llama_serving.md](./llama_serving.md) until you have have all of the files that we need to run the shortfin LLM server. 
-Save [llama-app-deployment.yaml](../../../../shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts and intended configuration. +Please upload your artifacts to a storage option that you can pull from in your k8s cluster (NFS, S3, CSP). +Save [llama-app-deployment.yaml](../../../../shortfin/deployment/shortfin_apps/llm/k8s/llama-app-deployment.yaml) locally and edit it to include your artifacts you just stored and change flags to intended configuration. To deploy llama app: From 0e3944521cf6ef24db5e363584d6897bbf80279a Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Fri, 20 Dec 2024 19:56:15 -0600 Subject: [PATCH 18/18] Add explanation for completion params that can be passed through `SGLang`, Added `Server Options` to `llama_serving.md`, detailing server flags --- docs/shortfin/llm/user/llama_serving.md | 29 +++++++++++++++++++ .../shortfin_with_sglang_frontend_language.md | 14 +++++++++ 2 files changed, 43 insertions(+) diff --git a/docs/shortfin/llm/user/llama_serving.md b/docs/shortfin/llm/user/llama_serving.md index a74851407..cc2c959b4 100644 --- a/docs/shortfin/llm/user/llama_serving.md +++ b/docs/shortfin/llm/user/llama_serving.md @@ -272,3 +272,32 @@ If you want to find the process again: ```bash ps -f | grep shortfin ``` + +## Server Options + +To run the server with different options, you can use the +following command to see the available flags: + +```bash +python -m shortfin_apps.llm.server --help +``` + +### Server Options + +A full list of options can be found below: + +| Argument | Description | +| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `--host HOST` | Specify the host to bind the server. | +| `--port PORT` | Specify the port to bind the server. | +| `--root-path ROOT_PATH` | Root path to use for installing behind a path-based proxy. | +| `--timeout-keep-alive TIMEOUT_KEEP_ALIVE` | Keep-alive timeout duration. | +| `--tokenizer_json TOKENIZER_JSON` | Path to a `tokenizer.json` file. | +| `--tokenizer_config_json TOKENIZER_CONFIG_JSON` | Path to a `tokenizer_config.json` file. | +| `--model_config MODEL_CONFIG` | Path to the model config file. | +| `--vmfb VMFB` | Model [VMFB](https://iree.dev/developers/general/developer-tips/#inspecting-vmfb-files) to load. | +| `--parameters [FILE ...]` | Parameter archives to load (supports: `gguf`, `irpa`, `safetensors`). | +| `--device {local-task,hip,amdgpu}` | Device to serve on (e.g., `local-task`, `hip`). Same options as [iree-run-module --list_drivers](https://iree.dev/guides/deployment-configurations/gpu-rocm/#get-the-iree-runtime). | +| `--device_ids [DEVICE_IDS ...]` | Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a device ID like `amdgpu:0:0@0`. | +| `--isolation {none,per_fiber,per_call}` | Concurrency control: How to isolate programs. | +| `--amdgpu_async_allocations` | Enable asynchronous allocations for AMD GPU device contexts. 
|
diff --git a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md
index ad6307094..3812b5277 100644
--- a/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md
+++ b/docs/shortfin/llm/user/shortfin_with_sglang_frontend_language.md
@@ -64,6 +64,20 @@ You can verify the installation/setup through the following examples:
 - [Fork Example](#fork-example)
 - [Multi-Turn Q&A Batching Example](#multi-turn-qa-batch-example)
 
+In these examples, we will set our `max_tokens` to 50 when generating completions.
+This controls how many tokens the server will generate for each completion.
+
+We can modify the arguments passed to `sgl.gen` to alter the outputs of our
+`shortfin` LLM server. Specifically:
+
+- `max_tokens` - The maximum number of tokens to generate per completion.
+  We may obtain longer responses by increasing this value,
+  and shorter responses by decreasing it.
+- `temperature` - Controls the randomness of the generated completions.
+  A higher value results in more randomness,
+  while a lower value results in more
+  deterministic completions.
+
 ## Multi-Turn Q&A example
 
 Now that we have sglang installed, we can run an example to show a multi-turn