From 865545198fd16acdde128da4e9a1693a4af68ec1 Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Wed, 11 Dec 2024 16:22:10 +0000 Subject: [PATCH 1/3] Update user docs for running `llm server`, Add back `sentencepiece` as requirement for `sharktank` to enable `export_paged_llm_v1` --- docs/shortfin/llm/user/e2e_llama8b_mi300x.md | 81 +++++++------------- sharktank/requirements.txt | 2 +- 2 files changed, 28 insertions(+), 55 deletions(-) diff --git a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md index 4a8423bc8..94c076980 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md +++ b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md @@ -22,30 +22,37 @@ python -m venv --prompt shark-ai .venv source .venv/bin/activate ``` -### Install `shark-ai` +### Install `shortfin` -You can install either the `latest stable` version of `shark-ai` -or the `nightly` version: +You can install either the `latest stable` version of shortfin by installing +`shark-ai` or the `nightly` version directly: #### Stable ```bash -pip install shark-ai +pip install shark-ai[apps] ``` #### Nightly ```bash -pip install sharktank -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels -pip install shortfin -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels +pip install shortfin[apps] --pre -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels ``` -#### Install dataclasses-json + +### Install `sharktank` - +Install the `nightly` version of sharktank: ```bash -pip install dataclasses-json +pip install sharktank --pre -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels +``` + + +### Install `sentencepiece` + +```bash +pip install sentencepiece ``` ### Define a directory for export files @@ -78,8 +85,8 @@ This example uses the `llama8b_f16.gguf` and `tokenizer.json` files that were downloaded in the previous step. 
```bash -export MODEL_PARAMS_PATH=$EXPORT_DIR/llama3.1-8b/llama8b_f16.gguf -export TOKENIZER_PATH=$EXPORT_DIR/llama3.1-8b/tokenizer.json +export MODEL_PARAMS_PATH=$EXPORT_DIR/meta-llama-3.1-8b-instruct.f16.gguf +export TOKENIZER_PATH=$EXPORT_DIR/tokenizer.json ``` #### General env vars @@ -91,8 +98,6 @@ The following env vars can be copy + pasted directly: export MLIR_PATH=$EXPORT_DIR/model.mlir # Path to export config.json file export OUTPUT_CONFIG_PATH=$EXPORT_DIR/config.json -# Path to export edited_config.json file -export EDITED_CONFIG_PATH=$EXPORT_DIR/edited_config.json # Path to export model.vmfb file export VMFB_PATH=$EXPORT_DIR/model.vmfb # Batch size for kvcache @@ -108,7 +113,7 @@ to export our model to `.mlir` format. ```bash python -m sharktank.examples.export_paged_llm_v1 \ - --irpa-file=$MODEL_PARAMS_PATH \ + --gguf-file=$MODEL_PARAMS_PATH \ --output-mlir=$MLIR_PATH \ --output-config=$OUTPUT_CONFIG_PATH \ --bs=$BS @@ -137,37 +142,6 @@ iree-compile $MLIR_PATH \ -o $VMFB_PATH ``` -## Write an edited config - -We need to write a config for our model with a slightly edited structure -to run with shortfin. This will work for the example in our docs. -You may need to modify some of the parameters for a specific model. - -### Write edited config - -```bash -cat > $EDITED_CONFIG_PATH << EOF -{ - "module_name": "module", - "module_abi_version": 1, - "max_seq_len": 131072, - "attn_head_count": 8, - "attn_head_dim": 128, - "prefill_batch_sizes": [ - $BS - ], - "decode_batch_sizes": [ - $BS - ], - "transformer_block_count": 32, - "paged_kv_cache": { - "block_seq_stride": 16, - "device_block_count": 256 - } -} -EOF -``` - ## Running the `shortfin` LLM server We should now have all of the files that we need to run the shortfin LLM server. 
@@ -178,15 +152,14 @@ Verify that you have the following in your specified directory ($EXPORT_DIR): ls $EXPORT_DIR ``` -- edited_config.json +- config.json +- meta-llama-3.1-8b-instruct.f16.gguf +- model.mlir - model.vmfb +- tokenizer_config.json +- tokenizer.json -### Launch server: - - +### Launch server #### Run the shortfin server @@ -209,7 +182,7 @@ Run the following command to launch the Shortfin LLM Server in the background: ```bash python -m shortfin_apps.llm.server \ --tokenizer_json=$TOKENIZER_PATH \ - --model_config=$EDITED_CONFIG_PATH \ + --model_config=$OUTPUT_CONFIG_PATH \ --vmfb=$VMFB_PATH \ --parameters=$MODEL_PARAMS_PATH \ --device=hip > shortfin_llm_server.log 2>&1 & @@ -252,7 +225,7 @@ port = 8000 # Change if running on a different port generate_url = f"http://localhost:{port}/generate" def generation_request(): - payload = {"text": "What is the capital of the United States?", "sampling_params": {"max_completion_tokens": 50}} + payload = {"text": "Name the capital of the United States.", "sampling_params": {"max_completion_tokens": 50}} try: resp = requests.post(generate_url, json=payload) resp.raise_for_status() # Raises an HTTPError for bad responses diff --git a/sharktank/requirements.txt b/sharktank/requirements.txt index e7181284a..d2879740c 100644 --- a/sharktank/requirements.txt +++ b/sharktank/requirements.txt @@ -5,7 +5,7 @@ gguf==0.10.0 numpy<2.0 # Needed for newer gguf versions (TODO: remove when gguf package includes this) -# sentencepiece>=0.1.98,<=0.2.0 +sentencepiece>=0.1.98,<=0.2.0 # Model deps. 
huggingface-hub==0.22.2 From e7159d62f020876207f747568dde9c3ef5759b95 Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Thu, 12 Dec 2024 17:53:19 +0000 Subject: [PATCH 2/3] Upgrade `gguf` to `0.11.0` --- sharktank/requirements.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sharktank/requirements.txt b/sharktank/requirements.txt index d2879740c..6ec1ea1d9 100644 --- a/sharktank/requirements.txt +++ b/sharktank/requirements.txt @@ -1,12 +1,9 @@ iree-turbine # Runtime deps. -gguf==0.10.0 +gguf==0.11.0 numpy<2.0 -# Needed for newer gguf versions (TODO: remove when gguf package includes this) -sentencepiece>=0.1.98,<=0.2.0 - # Model deps. huggingface-hub==0.22.2 transformers==4.40.0 From 011f2732e4e29f279417709128aec7654ad243f2 Mon Sep 17 00:00:00 2001 From: Stephen Baione Date: Thu, 12 Dec 2024 21:47:04 +0000 Subject: [PATCH 3/3] Remove `pip install sentencepiece`, Update `stable` and `nightly` installation instructs, Only set lower-bound for `gguf` --- docs/shortfin/llm/user/e2e_llama8b_mi300x.md | 31 +++++++------------- sharktank/requirements.txt | 2 +- 2 files changed, 11 insertions(+), 22 deletions(-) diff --git a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md index 94c076980..313a8086c 100644 --- a/docs/shortfin/llm/user/e2e_llama8b_mi300x.md +++ b/docs/shortfin/llm/user/e2e_llama8b_mi300x.md @@ -22,38 +22,27 @@ python -m venv --prompt shark-ai .venv source .venv/bin/activate ``` -### Install `shortfin` +## Install stable shark-ai packages -You can install either the `latest stable` version of shortfin by installing -`shark-ai` or the `nightly` version directly: - -#### Stable + ```bash -pip install shark-ai[apps] +pip install shark-ai[apps] sharktank ``` -#### Nightly - -```bash -pip install shortfin[apps] --pre -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels -``` +### Nightly packages - -### Install `sharktank` +To install nightly packages: -Install the `nightly` 
version of sharktank: + ```bash -pip install sharktank --pre -f https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels +pip install shark-ai[apps] sharktank \ + --pre --find-links https://github.com/nod-ai/shark-ai/releases/expanded_assets/dev-wheels ``` - -### Install `sentencepiece` - -```bash -pip install sentencepiece -``` +See also the +[nightly release instructions](https://github.com/nod-ai/shark-ai/blob/main/docs/nightly_releases.md). ### Define a directory for export files diff --git a/sharktank/requirements.txt b/sharktank/requirements.txt index 6ec1ea1d9..90cbedded 100644 --- a/sharktank/requirements.txt +++ b/sharktank/requirements.txt @@ -1,7 +1,7 @@ iree-turbine # Runtime deps. -gguf==0.11.0 +gguf>=0.11.0 numpy<2.0 # Model deps.