
Commit ee84439

Release TKML v4.0.6 (#244)
Co-authored-by: amd-pworfolk <[email protected]>
1 parent d263bba commit ee84439

File tree: 16 files changed, +475 -150 lines changed

New file (the composite server-testing action invoked below by test_lemonade.yml as `./.github/actions/server-testing`)

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
name: "Test Lemonade Server"
description: Launch Lemonade Server and test the endpoints
inputs:
  conda_env:
    required: true
  load_command:
    required: true
  amd_oga:
    required: false
    default: ""
    description: "Location of the OGA for RyzenAI NPU install directory on disk"
runs:
  using: "composite"
  steps:
    - name: Ensure the Lemonade server works properly
      shell: PowerShell
      run: |
        $Env:AMD_OGA = "${{ inputs.amd_oga }}"

        $outputFile = "output.log"
        $errorFile = "error.log"
        $serverProcess = Start-Process -FilePath "conda" -ArgumentList "run ${{ inputs.conda_env }} lemonade -d .\ci-cache ${{ inputs.load_command }} serve --max-new-tokens 10" -RedirectStandardOutput $outputFile -RedirectStandardError $errorFile -PassThru -NoNewWindow

        Write-Host "Wait for 30 seconds to let the server come up"
        Start-Sleep -Seconds 30

        Write-Host "Check if server process successfully launched"
        $serverRunning = Get-Process -Id $serverProcess.Id -ErrorAction SilentlyContinue
        if (-not $serverRunning) {
            Write-Host "Error: Server process isn't running, even though we just tried to start it!"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        } else {
            Write-Host "Server process is alive."
        }

        Write-Host "Wait for the server port to come up"
        while ($true) {

            $llmPortCheck = Test-NetConnection -ComputerName 127.0.0.1 -Port 8000
            if (-not $llmPortCheck.TcpTestSucceeded) {
                Write-Host "LLM server is not yet running on port 8000!"
                Write-Host "Standard Output:"
                Get-Content $outputFile

                Write-Host "Standard Error:"
                Get-Content $errorFile
            } else {
                Write-Host "LLM server is running on port 8000."
                break
            }

            Start-Sleep -Seconds 30
        }

        Write-Host "Checking the /health endpoint"
        $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/health -UseBasicParsing

        if ($response.StatusCode -eq 200) {
            Write-Output "Good: /health status code is 200"
        } else {
            Write-Output "Error: /health status code is not 200"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        $jsonContent = $response.Content | ConvertFrom-Json
        if ($jsonContent) {
            Write-Output "Good: /health JSON content is not empty: $jsonContent"
        } else {
            Write-Output "Error: /health JSON content is empty"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        Write-Host "Checking the /ws (streaming generation) endpoint"

        # Define the WebSocket URI
        $uri = [System.Uri]::new("ws://127.0.0.1:8000/ws")

        # Create a new ClientWebSocket instance
        $webSocket = [System.Net.WebSockets.ClientWebSocket]::new()

        # Connect to the WebSocket server
        $webSocket.ConnectAsync($uri, [System.Threading.CancellationToken]::None).Wait()

        # Define the message to send
        $message = "Hello, WebSocket!"
        $buffer = [System.Text.Encoding]::UTF8.GetBytes($message)
        $segment = [System.ArraySegment[byte]]::new($buffer)

        # Send the message
        $webSocket.SendAsync($segment, [System.Net.WebSockets.WebSocketMessageType]::Text, $true, [System.Threading.CancellationToken]::None).Wait()

        # Buffer to store the response
        $responseBuffer = New-Object byte[] 1024
        $responseSegment = [System.ArraySegment[byte]]::new($responseBuffer)

        # Variable to store the complete response
        $response = ""

        # Receive the streaming response
        do {
            $result = $webSocket.ReceiveAsync($responseSegment, [System.Threading.CancellationToken]::None).Result
            $response += [System.Text.Encoding]::UTF8.GetString($responseBuffer, 0, $result.Count)
        } while ($response -notlike "*</s>*")

        # Close the WebSocket connection
        $webSocket.CloseAsync([System.Net.WebSockets.WebSocketCloseStatus]::NormalClosure, "Closing", [System.Threading.CancellationToken]::None).Wait()

        # Check if the response is not empty
        if ($response -and $response -notlike "</s>") {
            Write-Output "Response is not empty: $response"
        } else {
            Write-Output "Response is empty or only contains the end marker: $response"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        Write-Host "Checking the /stats endpoint"
        $response = Invoke-WebRequest -Uri http://127.0.0.1:8000/stats -UseBasicParsing
        if ($response.StatusCode -eq 200) {
            Write-Output "Good: /stats status code is 200"
        } else {
            Write-Output "Error: /stats status code is not 200"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        $jsonContent = $response.Content | ConvertFrom-Json
        if ($jsonContent) {
            Write-Output "Good: /stats JSON content is not empty: $jsonContent"
        } else {
            Write-Output "Error: /stats JSON content is empty"
            Write-Host "Standard Output:"
            Get-Content $outputFile

            Write-Host "Standard Error:"
            Get-Content $errorFile
            exit 1
        }

        Write-Host "Close the server process"
        Stop-Process -Id $serverProcess.Id
.github/workflows/test_lemonade.yml

Lines changed: 12 additions & 2 deletions
@@ -16,7 +16,10 @@ jobs:
   make-lemonade:
     env:
       LEMONADE_CI_MODE: "True"
-    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v3
       - name: Set up Miniconda with 64-bit Python
@@ -25,6 +28,7 @@ jobs:
           miniconda-version: "latest"
           activate-environment: lemon
           python-version: "3.10"
+          run-post: "false"
       - name: Install dependencies
         shell: bash -el {0}
         run: |
@@ -41,11 +45,17 @@
         shell: bash -el {0}
         run: |
           pylint src/turnkeyml/llm --rcfile .pylintrc --disable E0401
+      - name: Test HF+CPU server
+        if: runner.os == 'Windows'
+        timeout-minutes: 10
+        uses: ./.github/actions/server-testing
+        with:
+          conda_env: -n lemon
+          load_command: -i facebook/opt-125m huggingface-load
       - name: Run lemonade tests
         shell: bash -el {0}
         run: |
           lemonade -i facebook/opt-125m huggingface-load llm-prompt -p "hi" --max-new-tokens 10
-
           python test/llm_api.py
 

.github/workflows/test_turnkey.yml

Lines changed: 3 additions & 1 deletion
@@ -36,8 +36,10 @@ jobs:
           conda install pylint=3.2.7
           pip install pytest
           pip install -e plugins/devices
-          pip install transformers timm
           pip install -e . # Required to test current tkml package instead of pypi version
+          # tokenizers 0.20.4 seems to have an install bug, which we must avoid by limiting
+          # the transformers version
+          pip install "transformers<4.46.3" "tokenizers<0.20.4" timm
           python -m pip check
       - name: Lint with PyLint
         shell: bash -el {0}

src/turnkeyml/llm/docs/llamacpp.md renamed to docs/llamacpp.md

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@ This flow has been verified with a generic Llama.cpp model.
 
 These instructions are only for linux or Windows with wsl. It may be necessary to be running WSL in an Administrator command prompt.
 
-These instructions also assume that TurnkeyML's llm extensions have been installed (for example with "pip install -e .[llm]")
+These instructions also assume that lemonade has been installed.
 
 
 ### Set up Environment (Assumes TurnkeyML is already installed)
@@ -45,4 +45,4 @@ lemonade --input ~/llama.cpp/models/dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp
 On Windows, the llama.cpp binary might be in a different location (such as llama.cpp\build\bin\Release\), in which case the command might be something like:
 ```bash
 lemonade --input ~\llama.cpp\models\dolphin-llama2-7b.Q5_K_M.gguf load-llama-cpp --executable ~\llama.cpp\build\bin\Release\llama-cli accuracy-mmlu --ntrain 5
-```
+```

docs/ort_genai_igpu.md

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
# OnnxRuntime GenAI (OGA) for iGPU and CPU

onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running ONNX LLMs: https://github.com/microsoft/onnxruntime-genai/tree/main?tab=readme-ov-file

## Installation

To install:

1. `conda create -n oga-igpu python=3.9`
1. `conda activate oga-igpu`
1. `pip install -e .[llm-oga-igpu]`
   - Note: don't forget the `[llm-oga-igpu]` at the end, this is what installs ort-genai
1. Get models:
   - The oga-load tool can download models from Hugging Face and build ONNX files using oga model_builder. Models can be quantized and optimized for both igpu and cpu.
   - Download and build ONNX model files:
     - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4`
     - `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device cpu --dtype int4`
   - The ONNX model files will be stored in the respective subfolder of the lemonade cache folder and will be reused in future oga-load calls:
     - `oga_models\microsoft_phi-3-mini-4k-instruct\dml-int4`
     - `oga_models\microsoft_phi-3-mini-4k-instruct\cpu-int4`
   - The ONNX model build process can be forced to run again, overwriting the above cache, by using the --force flag:
     `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 --force`
   - Transformer model architectures supported by the model_builder tool include many popular state-of-the-art models:
     - Gemma
     - LLaMa
     - Mistral
     - Phi
     - Qwen
     - Nemotron
   - For the full list of supported models, please see the
     [model_builder documentation](https://github.com/microsoft/onnxruntime-genai/blob/main/src/python/py/models/README.md).
   - The following quantizations are supported for automatically building ONNXRuntime GenAI model files from the Hugging Face repository:
     - cpu: fp32, int4
     - igpu: fp16, int4
1. Directory structure:
   - The model_builder tool caches Hugging Face files and temporary ONNX external data files in `<LEMONADE CACHE>\model_builder`
   - The output from model_builder is stored in `<LEMONADE_CACHE>\oga_models\<MODELNAME>\<SUBFOLDER>`
     - `MODELNAME` is the Hugging Face checkpoint name where any '/' is mapped to an '_' and everything is lower case
     - `SUBFOLDER` is `<EP>-<DTYPE>`, where `EP` is the execution provider (`dml` for igpu, `cpu` for cpu, and `npu` for npu) and `DTYPE` is the datatype
     - If the --int4-block-size flag is used then `SUBFOLDER` is `<EP>-<DTYPE>-block-<SIZE>` where `SIZE` is the specified block size
   - Other ONNX models in the format required by onnxruntime-genai can be loaded in lemonade if placed in the `<LEMONADE_CACHE>\oga_models` folder.
     Use the -i and --subfolder flags to specify the folder and subfolder:
     `lemonade -i my_model_name --subfolder my_subfolder --device igpu --dtype int4 oga-load`
     Lemonade will expect the ONNX model files to be located in `<LEMONADE_CACHE>\oga_models\my_model_name\my_subfolder`

## Usage

Prompt: `lemonade -i meta-llama/Llama-3.2-1B-Instruct oga-load --device igpu --dtype int4 llm-prompt -p "My thoughts are" --max-new-tokens 50`

Serving: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --dtype int4 --device igpu serve --max-new-tokens 100`
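
The directory-structure rules above (lower-cased checkpoint with `/` mapped to `_`, an `<EP>-<DTYPE>` subfolder, and an optional `-block-<SIZE>` suffix) can be sketched as a small Python helper. The function name and the device-to-EP mapping table here are hypothetical conveniences for illustration, not lemonade APIs:

```python
import os

def oga_cache_subfolder(checkpoint, device, dtype, int4_block_size=None):
    """Build the oga_models cache path described above (illustrative only)."""
    # MODELNAME: '/' mapped to '_' and everything lower case
    model_name = checkpoint.replace("/", "_").lower()
    # EP: dml for igpu, cpu for cpu, npu for npu
    ep = {"igpu": "dml", "cpu": "cpu", "npu": "npu"}[device]
    subfolder = f"{ep}-{dtype}"
    if int4_block_size is not None:
        subfolder += f"-block-{int4_block_size}"
    return os.path.join("oga_models", model_name, subfolder)

# Expected to match the cached path shown above:
# oga_models/microsoft_phi-3-mini-4k-instruct/dml-int4
print(oga_cache_subfolder("microsoft/Phi-3-mini-4k-instruct", "igpu", "int4"))
```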

src/turnkeyml/llm/docs/ort_genai_npu.md renamed to docs/ort_genai_npu.md

Lines changed: 5 additions & 10 deletions
@@ -6,7 +6,6 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running
 
 ### Warnings
 
-- Users have experienced inconsistent results across models and machines. If one model isn't working well on your laptop, try one of the other models.
 - The OGA wheels need to be installed in a specific order or you will end up with the wrong packages in your environment. If you see pip dependency errors, please delete your conda env and start over with a fresh environment.
 
 ### Installation
@@ -18,20 +17,16 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running
 1. `cd REPO_ROOT`
 1. `pip install -e .[oga-npu]`
 1. Download required OGA packages
-   1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 Preview Release`.
+   1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `amd_oga_Oct4_2024.zip` from `Ryzen AI 1.3 EA Release`.
    1. Unzip `amd_oga_Oct4_2024.zip`
 1. Setup your folder structure:
-   1. Copy all of the content inside `amd_oga` to lemonade's `REPO_ROOT\src\lemonade\tools\ort_genai\models\`
-   1. Move all dlls from `REPO_ROOT\src\lemonade\tools\ort_genai\models\libs` to `REPO_ROOT\src\lemonade\tools\ort_genai\models\`
+   1. Copy the `amd_oga` folder from the above zip file, if desired
+   1. Create the system environment variable `AMD_OGA` and set it to the path to the `amd_oga` folder
 1. Install the wheels:
-   1. `cd amd_oga\wheels`
+   1. `cd %AMD_OGA%\wheels`
    1. `pip install onnxruntime_genai-0.5.0.dev0-cp310-cp310-win_amd64.whl`
    1. `pip install onnxruntime_vitisai-1.20.0-cp310-cp310-win_amd64.whl`
    1. `pip install voe-1.2.0-cp310-cp310-win_amd64.whl`
-1. Ensure you have access to the models on Hungging Face:
-   1. Ensure you can access the models under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902) on Hugging Face. Models are gated and you may have to request access.
-   1. Create a Hugging Face Access Token [here](https://huggingface.co/settings/tokens). Ensure you select `Read access to contents of all public gated repos you can access` if creating a finegrained token.
-   1. Set your Hugging Face token as an environment variable: `set HF_TOKEN=<your token>`
 1. Install driver
    1. Access the [AMD RyzenAI EA Lounge](https://account.amd.com/en/member/ryzenai-sw-ea.html#tabs-a5e122f973-item-4757898120-tab) and download `Win24AIDriver.zip` from `Ryzen AI 1.3 Preview Release`.
    1. Unzip `Win24AIDriver.zip`
@@ -40,7 +35,7 @@ onnxruntime-genai (aka OGA) is a new framework created by Microsoft for running
 
 ### Runtime
 
-To test basic functionality, point lemonade to any of the models under under [quark-quantized-onnx-llms-for-ryzen-ai-13-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902):
+To test basic functionality, point lemonade to any of the models under [quark-quantized-onnx-llms-for-ryzen-ai-1.3-ea](https://huggingface.co/collections/amd/quark-quantized-onnx-llms-for-ryzen-ai-13-ea-66fc8e24927ec45504381902):
 
 ```
 lemonade -i amd/Llama-2-7b-hf-awq-g128-int4-asym-fp32-onnx-ryzen-strix oga-load --device npu --dtype int4 llm-prompt -p "hello whats your name?" --max-new-tokens 15
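
Because the new setup flow keys everything off the `AMD_OGA` environment variable instead of copying files into the repo, a small pre-flight check along the following lines can catch a missing or misconfigured variable before attempting the wheel installs. This snippet is an illustrative sketch, not part of lemonade or this commit:

```python
import glob
import os

# Assumes AMD_OGA points at the unzipped amd_oga folder described above,
# which contains a "wheels" subdirectory with the .whl files to install.
amd_oga = os.environ.get("AMD_OGA")
if not amd_oga:
    raise SystemExit("AMD_OGA is not set; point it at the unzipped amd_oga folder")

wheels_dir = os.path.join(amd_oga, "wheels")
wheels = glob.glob(os.path.join(wheels_dir, "*.whl"))
if not wheels:
    raise SystemExit(f"No .whl files found under {wheels_dir}")

print(f"Found {len(wheels)} wheels to install from {wheels_dir}")
```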

setup.py

Lines changed: 2 additions & 2 deletions
@@ -62,8 +62,8 @@
         "llm-oga-dml": [
             "onnxruntime-genai-directml==0.4.0",
             "tqdm",
-            "torch>=2.0.0",
-            "transformers",
+            "torch>=2.0.0,<2.4",
+            "transformers<4.45.0",
             "accelerate",
             "py-cpuinfo",
             "sentencepiece",

src/turnkeyml/llm/README.md

Lines changed: 10 additions & 16 deletions
@@ -5,6 +5,8 @@ Contents:
 
 1. [Getting Started](#getting-started)
 1. [Install Specialized Tools](#install-specialized-tools)
+   - [OnnxRuntime GenAI](#install-onnxruntime-genai)
+   - [RyzenAI NPU for PyTorch](#install-ryzenai-npu-for-pytorch)
 1. [Code Organization](#code-organization)
 1. [Contributing](#contributing)
 
@@ -85,29 +87,21 @@ Lemonade supports specialized tools that each require their own setup steps. **N
 
 ## Install OnnxRuntime-GenAI
 
-To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai) (e.g., the `oga-load` Tool), use `pip install -e .[llm-oga-dml]` instead of the default installation command.
+To install support for [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai), use `pip install -e .[llm-oga-dml]` instead of the default installation command.
 
-Next, you need to get an OGA model. Per the OGA instructions, we suggest Phi-3-Mini. Use the following command to download it from Hugging Face, and make sure to set your `--local-dir` to the `REPO_ROOT/src/turnkeyml/llm/ort_genai/models` directory.
+You can then load supported OGA models onto CPU or iGPU with the `oga-load` tool, for example:
 
-`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-4k-instruct`
+`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
 
-You can try it out with: `lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
+You can also launch a server process with:
 
-You can also try Phi-3-Mini-128k-Instruct with the following commands:
+`lemonade -i microsoft/Phi-3-mini-4k-instruct oga-load --device igpu --dtype int4 serve`
 
-`huggingface-cli download microsoft/Phi-3-mini-128k-instruct-onnx --include directml/directml-int4-awq-block-128* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct`
+You can learn more about the CPU and iGPU support in our [OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_igpu.md).
 
-`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device igpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
+> Note: early access to AMD's RyzenAI NPU is also available. See the [RyzenAI NPU OGA documentation](https://github.com/onnx/turnkeyml/blob/main/docs/ort_genai_npu.md) for more information.
 
-You can also try out the CPU with:
-
-`huggingface-cli download microsoft/Phi-3-mini-4k-instruct-onnx --include cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/* --local-dir REPO_ROOT/src/turnkeyml/llm/tools/ort_genai/models/phi-3-mini-128k-instruct`
-
-`lemonade -i microsoft/Phi-3-mini-128k-instruct oga-load --device cpu --dtype int4 llm-prompt -p "Hello, my thoughts are"`
-
-> Note: no other models or devices are officially supported by `lemonade` on OGA at this time. Contributions appreciated! It only takes a few minutes to add a new model, we just need to add a path to the downloaded model folder to the supported models dictionary in [oga.py](https://github.com/onnx/turnkeyml/blob/v4.0.2/src/turnkeyml/llm/tools/ort_genai/oga.py).
-
-## Install RyzenAI NPU
+## Install RyzenAI NPU for PyTorch
 
 To run your LLMs on RyzenAI NPU, first install and set up the `ryzenai-transformers` conda environment (see instructions [here](https://github.com/amd/RyzenAI-SW/blob/main/example/transformers/models/llm/docs/README.md)). Then, install `lemonade` into `ryzenai-transformers`. The `ryzenai-npu-load` Tool will become available in that environment.
 

src/turnkeyml/llm/cache.py

Lines changed: 1 addition & 0 deletions
@@ -30,3 +30,4 @@ class Keys:
     PROMPT_TOKENS = "prompt_tokens"
     CACHE_DIR = "cache_dir"
     DEVICE = "device"
+    OGA_MODELS_SUBFOLDER = "oga_models_subfolder"

src/turnkeyml/llm/cli.py

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ def main():
     first_tool_args.append(global_args["input"])
 
     state = State(
-        cache_dir=global_args["cache_dir"],
+        cache_dir=os.path.abspath(global_args["cache_dir"]),
         build_name=global_args["input"].replace("/", "_"),
         sequence_info=sequence.info,
     )
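
The cli.py change resolves the user-supplied cache directory to an absolute path once, at startup. A minimal sketch of the effect, assuming a relative path such as the `-d .\ci-cache` used by the CI action above:

```python
import os

# Hypothetical illustration: a relative cache dir, as passed on the command line
cache_dir = "ci-cache"

# Resolving it up front pins it to the invocation directory, so anything that
# later changes the working directory still refers to the same cache location.
cache_dir = os.path.abspath(cache_dir)
print(cache_dir)
```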
