
Commit ee84439

Release TKML v4.0.6 (#244)
Co-authored-by: amd-pworfolk <[email protected]>
1 parent d263bba commit ee84439

File tree: 16 files changed, +475 -150 lines changed

New file (the composite server-testing action invoked below by test_lemonade.yml as `./.github/actions/server-testing`)

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
name: "Test Lemonade Server"
description: Launch Lemonade Server and test the endpoints
inputs:
  conda_env:
    required: true
  load_command:
    required: true
  amd_oga:
    required: false
    default: ""
    description: "Location of the OGA for RyzenAI NPU install directory on disk"
runs:
  using: "composite"
  steps:
    - name: Ensure the Lemonade server works properly
      shell: PowerShell
      run: |
        $Env:AMD_OGA = "${{ inputs.amd_oga }}"

        $outputFile = "output.log"
        $errorFile = "error.log"
        $serverProcess = Start-Process -FilePath "conda" -ArgumentList "run ${{ inputs.conda_env }} lemonade -d .\ci-cache ${{ inputs.load_command }} serve --max-new-tokens 10" -RedirectStandardOutput $outputFile -RedirectStandardError $errorFile -PassThru -NoNewWindow

        Write-Host "Wait for 30 seconds to let the server come up"
        Start-Sleep -Seconds 30

        Write-Host "Check if server process successfully launched"
        $serverRunning = Get-Process -Id $serverProcess.Id -ErrorAction SilentlyContinue
        if (-not $serverRunning) {
            Write-Host "Error: Server process isn't running, even though we just tried to start it!"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        } else {
            Write-Host "Server process is alive."
        }

        Write-Host "Wait for the server port to come up"
        while ($true) {

            $llmPortCheck = Test-NetConnection -ComputerName 127.0.0.1 -Port 8000
            if (-not $llmPortCheck.TcpTestSucceeded) {
                Write-Host "LLM server is not yet running on port 8000!"
                Write-Host "Standard Output:"
                Get-Content $outputFile

                Write-Host "Standard Error:"
                Get-Content $errorFile
            } else {
                Write-Host "LLM server is running on port 8000."
                break
            }

            Start-Sleep -Seconds 30
        }

        Write-Host "Checking the /health endpoint"
        $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/health -UseBasicParsing

        if ($response.StatusCode -eq 200) {
            Write-Output "Good: /health status code is 200"
        } else {
            Write-Output "Error: /health status code is not 200"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        $jsonContent = $response.Content | ConvertFrom-Json
        if ($jsonContent) {
            Write-Output "Good: /health JSON content is not empty: $jsonContent"
        } else {
            Write-Output "Error: /health JSON content is empty"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        Write-Host "Checking the /ws (streaming generation) endpoint"

        # Define the WebSocket URI
        $uri = [System.Uri]::new("ws://127.0.0.1:8000/ws")

        # Create a new ClientWebSocket instance
        $webSocket = [System.Net.WebSockets.ClientWebSocket]::new()

        # Connect to the WebSocket server
        $webSocket.ConnectAsync($uri, [System.Threading.CancellationToken]::None).Wait()

        # Define the message to send
        $message = "Hello, WebSocket!"
        $buffer = [System.Text.Encoding]::UTF8.GetBytes($message)
        $segment = [System.ArraySegment[byte]]::new($buffer)

        # Send the message
        $webSocket.SendAsync($segment, [System.Net.WebSockets.WebSocketMessageType]::Text, $true, [System.Threading.CancellationToken]::None).Wait()

        # Buffer to store the response
        $responseBuffer = New-Object byte[] 1024
        $responseSegment = [System.ArraySegment[byte]]::new($responseBuffer)

        # Variable to store the complete response
        $response = ""

        # Receive the streaming response
        do {
            $result = $webSocket.ReceiveAsync($responseSegment, [System.Threading.CancellationToken]::None).Result
            $response += [System.Text.Encoding]::UTF8.GetString($responseBuffer, 0, $result.Count)
        } while ($response -notlike "*</s>*")

        # Close the WebSocket connection
        $webSocket.CloseAsync([System.Net.WebSockets.WebSocketCloseStatus]::NormalClosure, "Closing", [System.Threading.CancellationToken]::None).Wait()

        # Check if the response is not empty
        if ($response -and $response -notlike "</s>") {
            Write-Output "Response is not empty: $response"
        } else {
            Write-Output "Response is empty or only contains the end marker: $response"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        Write-Host "Checking the /stats endpoint"
        $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/stats -UseBasicParsing
        if ($response.StatusCode -eq 200) {
            Write-Output "Good: /stats status code is 200"
        } else {
            Write-Output "Error: /stats status code is not 200"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        $jsonContent = $response.Content | ConvertFrom-Json
        if ($jsonContent) {
            Write-Output "Good: /stats JSON content is not empty: $jsonContent"
        } else {
            Write-Output "Error: /stats JSON content is empty"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        Write-Host "Close the server process"
        Stop-Process -Id $serverProcess.Id
.github/workflows/test_lemonade.yml

Lines changed: 12 additions & 2 deletions
@@ -16,7 +16,10 @@ jobs:
   make-lemonade:
     env:
       LEMONADE_CI_MODE: "True"
-    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v3
       - name: Set up Miniconda with 64-bit Python
@@ -25,6 +28,7 @@ jobs:
           miniconda-version: "latest"
           activate-environment: lemon
           python-version: "3.10"
+          run-post: "false"
       - name: Install dependencies
         shell: bash -el {0}
         run: |
@@ -41,11 +45,17 @@
         shell: bash -el {0}
         run: |
           pylint src/turnkeyml/llm --rcfile .pylintrc --disable E0401
+      - name: Test HF+CPU server
+        if: runner.os == 'Windows'
+        timeout-minutes: 10
+        uses: ./.github/actions/server-testing
+        with:
+          conda_env: -n lemon
+          load_command: -i facebook/opt-125m huggingface-load
       - name: Run lemonade tests
         shell: bash -el {0}
         run: |
           lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "hi" --max-new-tokens 10
-
           python test/llm_api.py
 

.github/workflows/test_turnkey.yml

Lines changed: 3 additions & 1 deletion
@@ -36,8 +36,10 @@ jobs:
           conda install pylint=3.2.7
           pip install pytest
           pip install -e plugins/devices
-          pip install transformers timm
           pip install -e . # Required to test current tkml package instead of pypi version
+          # tokenizers 0.20.4 seems to have an install bug, which we must avoid by limiting
+          # the transformers version
+          pip install "transformers<4.46.3" "tokenizers<0.20.4" timm
           python -m pip check
       - name: Lint with PyLint
         shell: bash -el {0}

src/turnkeyml/llm/docs/llamacpp.md renamed to docs/llamacpp.md

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@ This flow has been verified with a generic Llama.cpp model.
 
 These instructions are only for linux or Windows with wsl. It may be necessary to be running WSL in an Administrator command prompt.
 
-These instructions also assume that TurnkeyML's llm extensions have been installed (for example with "pip install -e .[llm]")
+These instructions also assume that lemonade has been installed.
 
 
 ### Set up Environment (Assumes TurnkeyML is already installed)
@@ -45,4 +45,4 @@ lemonade --input ~/llama.cpp/models/dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp
 On Windows, the llama.cpp binary might be in a different location (such as llama.cpp\build\bin\Release\), in which case the command might be something like:
 ```bash
 lemonade --input ~\llama.cpp\models\dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp --executable ~\llama.cpp\build\bin\Release\llama-cli accuracy-mmlu --ntrain 5
-```
+```

docs/ort_genai_igpu.md

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
# OnnxRuntime GenAI (OGA) for iGPU and CPU

onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ONNX LLMs: https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file

## Installation

To install:

1. `conda create -n oga-igpu python=3.9`
1. `conda activate oga-igpu`
1. `pip install -e .[llm-oga-igpu]`
   - Note: don't forget the `[llm-oga-igpu]` at the end, this is what installs ort-genai
1. Get models:
   - The oga-load tool can download models from Hugging Face and build ONNX files using oga model_builder. Models can be quantized and optimized for both igpu and cpu.
   - Download and build ONNX model files:
     - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4`
     - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4`
   - The ONNX model files will be stored in the respective subfolder of the lemonade cache folder and will be reused in future oga-load calls:
     - `oga_models\microsoft_phi-3-mini-4k-instruct\dml-int4`
     - `oga_models\microsoft_phi-3-mini-4k-instruct\cpu-int4`
   - The ONNX model build process can be forced to run again, overwriting the above cache, by using the --force flag:
     `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 --force`
   - Transformer model architectures supported by the model_builder tool include many popular state-of-the-art models:
     - Gemma
     - LLaMa
     - Mistral
     - Phi
     - Qwen
     - Nemotron
   - For the full list of supported models, please see the
     [model_builder documentation](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md).
   - The following quantizations are supported for automatically building ONNXRuntime GenAI model files from the Hugging Face repository:
     - cpu: fp32, int4
     - igpu: fp16, int4
1. Directory structure:
   - The model_builder tool caches Hugging Face files and temporary ONNX external data files in `<LEMONADE CACHE>\model_builder`
   - The output from model_builder is stored in `<LEMONADE_CACHE>\oga_models\<MODELNAME>\<SUBFOLDER>`
     - `MODELNAME` is the Hugging Face checkpoint name where any '/' is mapped to an '_' and everything is lower case
     - `SUBFOLDER` is `<EP>-<DTYPE>`, where `EP` is the execution provider (`dml` for igpu, `cpu` for cpu, and `npu` for npu) and `DTYPE` is the datatype
     - If the --int4-block-size flag is used then `SUBFOLDER` is `<EP>-<DTYPE>-block-<SIZE>` where `SIZE` is the specified block size
   - Other ONNX models in the format required by onnxruntime-genai can be loaded in lemonade if placed in the `<LEMONADE_CACHE>\oga_models` folder.
     Use the -i and --subfolder flags to specify the folder and subfolder:
     `lemonade -i my_model_name --subfolder my_subfolder --device igpu --dtype int4 oga-load`
     Lemonade will expect the ONNX model files to be located in `<LEMONADE_CACHE>\oga_models\my_model_name\my_subfolder`

## Usage

Prompt: `lemonade -i meta-llama/Llama-3.2-1B-Instruct oga-load --device igpu --dtype int4 llm-prompt -p "My thoughts are" --max-new-tokens 50`

Serving: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --dtype int4 --device igpu serve --max-new-tokens 100`
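
The directory-structure rules above (lower-cased checkpoint with `/` mapped to `_`, an `<EP>-<DTYPE>` subfolder, and an optional `-block-<SIZE>` suffix) can be sketched as a small Python helper. The function name and the device-to-EP mapping table here are hypothetical conveniences for illustration, not lemonade APIs:

```python
import os

def oga_cache_subfolder(checkpoint, device, dtype, int4_block_size=None):
    """Build the oga_models cache path described above (illustrative only)."""
    # MODELNAME: '/' mapped to '_' and everything lower case
    model_name = checkpoint.replace("/", "_").lower()
    # EP: dml for igpu, cpu for cpu, npu for npu
    ep = {"igpu": "dml", "cpu": "cpu", "npu": "npu"}[device]
    subfolder = f"{ep}-{dtype}"
    if int4_block_size is not None:
        subfolder += f"-block-{int4_block_size}"
    return os.path.join("oga_models", model_name, subfolder)

# Expected to match the cached path shown above:
# oga_models/microsoft_phi-3-mini-4k-instruct/dml-int4
print(oga_cache_subfolder("microsoft/Phi-3-mini-4k-instruct", "igpu", "int4"))
```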

src/turnkeyml/llm/docs/ort_genai_npu.md renamed to docs/ort_genai_npu.md

Lines changed: 5 additions & 10 deletions
@@ -6,7 +6,6 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running
 
 ### Warnings
 
-- Users have experienced inconsistent results across models and machines. If one model isn't working well on your laptop, try one of the other models.
 - The OGA wheels need to be installed in a specific order or you will end up with the wrong packages in your environment. If you see pip dependency errors, please delete your conda env and start over with a fresh environment.
 
 ### Installation
@@ -18,20 +17,16 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running
 1. `cd REPO_ROOT`
 1. `pip install -e .[oga-npu]`
 1. Download required OGA packages
-   1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 Preview Release`.
+   1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 EA Release`.
    1. Unzip `amd_oga_Oct4_2024.zip`
 1. Setup your folder structure:
-   1. Copy all of the content inside `amd_oga` to lemonade's `REPO_ROOT\src\lemonade\tools\ort_genai\models\`
-   1. Move all dlls from `REPO_ROOT\src\lemonade\tools\ort_genai\models\libs` to `REPO_ROOT\src\lemonade\tools\ort_genai\models\`
+   1. Copy the `amd_oga` folder from the above zip file, if desired
+   1. Create the system environment variable `AMD_OGA` and set it to the path to the `amd_oga` folder
 1. Install the wheels:
-   1. `cd amd_oga\wheels`
+   1. `cd %AMD_OGA%\wheels`
    1. `pip install onnxruntime_genai-0.5.0.dev0-cp310-cp310-win_amd64.whl`
    1. `pip install onnxruntime_vitisai-1.20.0-cp310-cp310-win_amd64.whl`
    1. `pip install voe-1.2.0-cp310-cp310-win_amd64.whl`
-1. Ensure you have access to the models on Hungging Face:
-   1. Ensure you can access the models under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902) on Hugging Face. Models are gated and you may have to request access.
-   1. Create a Hugging Face Access Token [here](https://huggingface.co/settings/tokens). Ensure you select `Read access to contents of all public gated repos you can access` if creating a finegrained token.
-   1. Set your Hugging Face token as an environment variable: `set HF_TOKEN=<your token>`
 1. Install driver
    1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `Win24AIDriver.zip` from `Ryzen AI 1.3 Preview Release`.
    1. Unzip `Win24AIDriver.zip`
@@ -40,7 +35,7 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running
 
 ### Runtime
 
-To test basic functionality, point lemonade to any of the models under under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902):
+To test basic functionality, point lemonade to any of the models under [quark-quantized-onnx-llms-for-ryzen-ai-1.3-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902):
 
 ```
 lemonade -i amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix oga-load --device npu --dtype int4 llm-prompt -p "hello whats your name?" --max-new-tokens 15
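
Because the new setup flow keys everything off the `AMD_OGA` environment variable instead of copying files into the repo, a small pre-flight check along the following lines can catch a missing or misconfigured variable before attempting the wheel installs. This snippet is an illustrative sketch, not part of lemonade or this commit:

```python
import glob
import os

# Assumes AMD_OGA points at the unzipped amd_oga folder described above,
# which contains a "wheels" subdirectory with the .whl files to install.
amd_oga = os.environ.get("AMD_OGA")
if not amd_oga:
    raise SystemExit("AMD_OGA is not set; point it at the unzipped amd_oga folder")

wheels_dir = os.path.join(amd_oga, "wheels")
wheels = glob.glob(os.path.join(wheels_dir, "*.whl"))
if not wheels:
    raise SystemExit(f"No .whl files found under {wheels_dir}")

print(f"Found {len(wheels)} wheels to install from {wheels_dir}")
```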

setup.py

Lines changed: 2 additions & 2 deletions
@@ -62,8 +62,8 @@
         "llm-oga-dml": [
             "onnxruntime-genai-directml==0.4.0",
             "tqdm",
-            "torch>=2.0.0",
-            "transformers",
+            "torch>=2.0.0,<2.4",
+            "transformers<4.45.0",
             "accelerate",
             "py-cpuinfo",
             "sentencepiece",

src/turnkeyml/llm/README.md

Lines changed: 10 additions & 16 deletions
@@ -5,6 +5,8 @@ Contents:
 
 1. [Getting Started](#getting-started)
 1. [Install Specialized Tools](#install-specialized-tools)
+   - [OnnxRuntime GenAI](#install-onnxruntime-genai)
+   - [RyzenAI NPU for PyTorch](#install-ryzenai-npu-for-pytorch)
 1. [Code Organization](#code-organization)
 1. [Contributing](#contributing)
 
@@ -85,29 +87,21 @@ Lemonade supports specialized tools that each require their own setup steps. **N
 
 ## Install OnnxRuntime-GenAI
 
-To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai) (e.g., the `oga-load` Tool), use `pip install -e .[llm-oga-dml]` instead of the default installation command.
+To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai), use `pip install -e .[llm-oga-dml]` instead of the default installation command.
 
-Next, you need to get an OGA model. Per the OGA instructions, we suggest Phi-3-Mini. Use the following command to download it from Hugging Face, and make sure to set your `--local-dir` to the `REPO_ROOT/src/turnkeyml/llm/ort_genai/models` directory.
+You can then load supported OGA models onto CPU or iGPU with the `oga-load` tool, for example:
 
-`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-4k-instruct`
+`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
 
-You can try it out with: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
+You can also launch a server process with:
 
-You can also try Phi-3-Mini-128k-Instruct with the following commands:
+`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 serve`
 
-`huggingface-cli download microsoft/Phi-3-mini-128k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct`
+You can learn more about the CPU and iGPU support in our [OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md).
 
-`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
+> Note: early access to AMD's RyzenAI NPU is also available. See the [RyzenAI NPU OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_npu.md) for more information.
 
-You can also try out the CPU with:
-
-`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct`
-
-`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device cpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
-
-> Note: no other models or devices are officially supported by `lemonade` on OGA at this time. Contributions appreciated! It only takes a few minutes to add a new model, we just need to add a path to the downloaded model folder to the supported models dictionary in [oga.py](https://github.com/onnx/turnkeyml/blob/v4.0.2/src/turnkeyml/llm/tools/ort_genai/oga.py).
-
-## Install RyzenAI NPU
+## Install RyzenAI NPU for PyTorch
 
 To run your LLMs on RyzenAI NPU, first install and set up the `ryzenai-transformers` conda environment (see instructions [here](https://github.com/amd/RyzenAI-SW/blob/main/example/transformers/models/llm/docs/README.md)). Then, install `lemonade` into `ryzenai-transformers`. The `ryzenai-npu-load` Tool will become available in that environment.
 

src/turnkeyml/llm/cache.py

Lines changed: 1 addition & 0 deletions
@@ -30,3 +30,4 @@ class Keys:
     PROMPT_TOKENS = "prompt_tokens"
     CACHE_DIR = "cache_dir"
     DEVICE = "device"
+    OGA_MODELS_SUBFOLDER = "oga_models_subfolder"

src/turnkeyml/llm/cli.py

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ def main():
     first_tool_args.append(global_args["input"])
 
     state = State(
-        cache_dir=global_args["cache_dir"],
+        cache_dir=os.path.abspath(global_args["cache_dir"]),
         build_name=global_args["input"].replace("/", "_"),
         sequence_info=sequence.info,
     )
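
The cli.py change resolves the user-supplied cache directory to an absolute path once, at startup. A minimal sketch of the effect, assuming a relative path such as the `-d .\ci-cache` used by the CI action above:

```python
import os

# Hypothetical illustration: a relative cache dir, as passed on the command line
cache_dir = "ci-cache"

# Resolving it up front pins it to the invocation directory, so anything that
# later changes the working directory still refers to the same cache location.
cache_dir = os.path.abspath(cache_dir)
print(cache_dir)
```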
