5.1.0 Update. Avoid build collisions in the lemonade cache. Improve memory tracking and OGA testing. (#288)

- Avoid lemonade build cache collisions (@jeremyfowers).
  - All builds are now placed under `<cache_dir>/builds/<build_name>` instead of `<cache_dir>/<build_name>`
    - This creates a more hierarchical cache structure, where builds are peers of models and data.
  - All build names now include a timestamp
    - This ensures that build stats and logs will not collide with each other if we build the same model in the same cache, but with different parameters.
  - Revs the minor version number because all previous caches are invalidated.
- Enable ONNX model download for cpu and igpu in oga-load (@jeremyfowers)
- Improvements to memory tracking (@amd-pworfolk)
- Improve OGA testing (@jeremyfowers).
  - Run the server test last, since it is the most complex and has the worst telemetry
  - Stop deleting the entire cache directory between every test, since that deletes the model builder cache. Instead, just delete the cache/builds directory.
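The builds-only cleanup described in the last bullet can be sketched as a small helper. This is a minimal illustration of the idea, not lemonade API; `clear_builds` and the demo paths are hypothetical, assuming only the `<cache_dir>/builds` layout described above:

```python
import os
import shutil
import tempfile

def clear_builds(cache_dir: str) -> None:
    """Delete only <cache_dir>/builds, leaving cached models and data intact."""
    builds = os.path.join(cache_dir, "builds")
    if os.path.isdir(builds):
        shutil.rmtree(builds)

# Demonstrate against a throwaway cache laid out like the new hierarchy
cache = tempfile.mkdtemp()
os.makedirs(os.path.join(cache, "builds", "some_build"))
os.makedirs(os.path.join(cache, "models"))
clear_builds(cache)
```

After `clear_builds`, the `models` directory survives while every build directory is gone, which is exactly why the hierarchical layout makes per-test cleanup cheap.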
jeremyfowers authored Feb 4, 2025
1 parent 638c20f commit 0989a7d
Showing 22 changed files with 327 additions and 176 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/test_lemonade_oga_cpu.yml
@@ -42,14 +42,6 @@ jobs:
shell: bash -el {0}
run: |
pylint src/lemonade --rcfile .pylintrc --disable E0401
- name: Test OGA+CPU server
if: runner.os == 'Windows'
timeout-minutes: 10
uses: ./.github/actions/server-testing
with:
conda_env: -n lemon
load_command: -i TinyPixel/small-llama2 oga-load --device cpu --dtype int4
hf_token: "${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}" # Required by OGA model_builder in OGA 0.4.0 but not future versions
- name: Run lemonade tests
shell: bash -el {0}
env:
@@ -64,4 +56,12 @@ jobs:
# Test high-level LEAP APIs
python examples/lemonade/leap_oga_cpu.py
python examples/lemonade/leap_oga_cpu_streaming.py
- name: Test OGA+CPU server
if: runner.os == 'Windows'
timeout-minutes: 10
uses: ./.github/actions/server-testing
with:
conda_env: -n lemon
load_command: -i TinyPixel/small-llama2 oga-load --device cpu --dtype int4
hf_token: "${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}" # Required by OGA model_builder in OGA 0.4.0 but not future versions

2 changes: 1 addition & 1 deletion docs/lemonade/getting_started.md
@@ -52,7 +52,7 @@ To measure the accuracy of an LLM using MMLU, try this:

That command will run just the management test from MMLU on your LLM and save the score to the lemonade cache at `~/.cache/lemonade`.

You can run the full suite of MMLU subjects by omitting the `--test` argument. You can learn more about this with `lemonade accuracy-mmlu -h.
You can run the full suite of MMLU subjects by omitting the `--test` argument. You can learn more about this with `lemonade accuracy-mmlu -h`.

## Benchmarking

4 changes: 2 additions & 2 deletions docs/turnkey/tools_user_guide.md
@@ -153,9 +153,9 @@ Each build directory contains:
- The stats file, `turnkey_stats.yaml`, which collects all of the statistics collected by the tools.
- This is what forms the content of the CSV reports generated by the `turnkey report` tool.
- One log file per tool that was executed, which may contain additional information about what happened during the tool run.
- For example, `cache_dir/build_dir/log_discover.txt`.
- For example, `cache_dir/builds/build_dir/log_discover.txt`.
- All of the artifacts produced by the tools.
- For example, `cache_dir/build_dir/onnx/my_model.onnx`.
- For example, `cache_dir/builds/build_dir/onnx/my_model.onnx`.

The `--lean-cache` global argument ensures that all build artifacts are removed at the end of the sequence. This is useful for saving disk space when gathering statistics over a large amount of models. Log files (.txt), json files (.json), and yaml files (.yaml, such as state.yaml and stats.yaml) are not removed.

4 changes: 2 additions & 2 deletions examples/turnkey/api/loading_a_build.py
@@ -11,6 +11,7 @@
import onnxruntime as ort
from turnkeyml.common.filesystem import get_available_builds, DEFAULT_CACHE_DIR
from turnkeyml.state import State
from turnkeyml.common.build import output_dir
from turnkeyml.tools.load_build import LoadBuild
from turnkeyml.tools.onnx import ConvertOnnxToFp16

@@ -21,8 +22,7 @@ def main():

# We use the _state.yaml file in the build directory when loading a build
prior_state_file = os.path.join(
DEFAULT_CACHE_DIR,
prerequisite_build,
output_dir(DEFAULT_CACHE_DIR, prerequisite_build),
f"{prerequisite_build}_state.yaml",
)

2 changes: 1 addition & 1 deletion plugins/devices/test/benchmark.py
@@ -497,7 +497,7 @@ def test_010_cli_cache_benchmark(self):
"--cache-dir",
cache_dir,
"-i",
os.path.join(cache_dir, "*", "*_state.yaml"),
os.path.join(cache_dir, "builds", "*", "*_state.yaml"),
"load-build",
"benchmark",
]
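The updated glob pattern in this test can be checked against the new layout. A sketch using a throwaway cache; the `my_build` name is illustrative, while the `builds` subdirectory and `*_state.yaml` convention come from the diff above:

```python
import glob
import os
import tempfile

# Build a throwaway cache that follows the new <cache_dir>/builds layout
cache_dir = tempfile.mkdtemp()
build_dir = os.path.join(cache_dir, "builds", "my_build")
os.makedirs(build_dir)
open(os.path.join(build_dir, "my_build_state.yaml"), "w").close()

# The updated pattern: one _state.yaml per build directory under builds/
matches = glob.glob(os.path.join(cache_dir, "builds", "*", "*_state.yaml"))
```

The old pattern (`<cache_dir>/*/*_state.yaml`) would find nothing here, which is why the test had to change along with the cache layout.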
1 change: 1 addition & 0 deletions setup.py
@@ -49,6 +49,7 @@
"pytz",
"tqdm",
"matplotlib",
"tabulate",
# Conditional dependencies for ONNXRuntime backends
"onnxruntime >=1.10.1;platform_system=='Linux' and extra != 'llm-oga-cuda'",
"onnxruntime-directml >=1.19.0;platform_system=='Windows' and extra != 'llm-oga-cuda'",
27 changes: 26 additions & 1 deletion src/lemonade/cache.py
@@ -1,11 +1,12 @@
import os
from datetime import datetime, timezone

# Allow an environment variable to override the default
# location for the build cache
if os.environ.get("LEMONADE_CACHE_DIR"):
DEFAULT_CACHE_DIR = os.path.expanduser(os.environ.get("LEMONADE_CACHE_DIR"))
else:
DEFAULT_CACHE_DIR = os.path.expanduser("~/.cache/lemonade")
DEFAULT_CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "lemonade")


def checkpoint_to_model_name(checkpoint_name: str) -> str:
@@ -16,6 +17,29 @@ def checkpoint_to_model_name(checkpoint_name: str) -> str:
return checkpoint_name.split("/")[1]


def build_name(input_name):
"""
Name the lemonade build by concatenating these two factors:
1. Sanitize the input name (typically a model checkpoint name) by
replacing any `/` characters with `_`.
    2. Timestamp in the format:
       <year>y_<month>m_<day>d_<hour>h_<minute>m_<second>s
This timestamp ensures that builds in the same cache will not
collide in the same build directory.
"""

# Sanitize the input name
input_name_sanitized = input_name.replace("/", "_")

# Get the current time in GMT
current_time = datetime.now(timezone.utc)

# Format the timestamp string
timestamp = current_time.strftime("%Yy_%mm_%dd_%Hh_%Mm_%Ss")

return f"{input_name_sanitized}_{timestamp}"


class Keys:
MODEL = "model"
PER_ITERATION_LATENCY = "per_iteration_latency"
@@ -37,3 +61,4 @@ class Keys:
DEVICE = "device"
OGA_MODELS_SUBFOLDER = "oga_models_subfolder"
MEMORY_USAGE_PLOT = "memory_usage_plot"
MAX_MEMORY_USED_GB = "max_memory_used_GB"
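The timestamped naming scheme added in `src/lemonade/cache.py` can be exercised as a standalone sketch mirroring the function shown in the diff (the checkpoint name is just an example):

```python
from datetime import datetime, timezone

def build_name(input_name: str) -> str:
    # Replace '/' so checkpoint names are filesystem-safe, then append a
    # UTC timestamp so repeat builds never share a build directory.
    sanitized = input_name.replace("/", "_")
    timestamp = datetime.now(timezone.utc).strftime("%Yy_%mm_%dd_%Hh_%Mm_%Ss")
    return f"{sanitized}_{timestamp}"

name = build_name("TinyPixel/small-llama2")
```

Two calls in the same second would still collide, but for builds launched by hand or by CI runs this granularity is enough to keep stats and logs separate.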
2 changes: 1 addition & 1 deletion src/lemonade/cli.py
@@ -128,7 +128,7 @@ def main():

state = State(
cache_dir=os.path.abspath(global_args["cache_dir"]),
build_name=global_args["input"].replace("/", "_"),
build_name=cache.build_name(global_args["input"]),
sequence_info=sequence.info,
)
sequence.launch(
19 changes: 10 additions & 9 deletions src/lemonade/tools/chat.py
@@ -25,6 +25,8 @@
}

DEFAULT_SERVER_PORT = 8000
DEFAULT_MAX_NEW_TOKENS = 512
DEFAULT_N_TRIALS = 1

END_OF_STREAM = "</s>"

@@ -95,17 +97,19 @@ def parser(add_help: bool = True) -> argparse.ArgumentParser:
parser.add_argument(
"--max-new-tokens",
"-m",
default=512,
default=DEFAULT_MAX_NEW_TOKENS,
type=int,
help="Maximum number of new tokens in the response",
help=f"Maximum number of new tokens in the response "
f"(default is {DEFAULT_MAX_NEW_TOKENS})",
)

parser.add_argument(
"--n-trials",
"-n",
default=1,
default=DEFAULT_N_TRIALS,
type=positive_int,
help="Number of responses the LLM will generate for the prompt (useful for testing)",
help=f"Number of responses the LLM will generate for the prompt "
f"(useful for testing, default is {DEFAULT_N_TRIALS})",
)

return parser
@@ -126,17 +130,14 @@ def parse(self, state: State, args, known_only=True) -> argparse.Namespace:
# No change to the prompt
pass

if parsed_args.n_trials < 1:
raise ValueError("N_TRIALS should be a positive number")

return parsed_args

def run(
self,
state: State,
prompt: str = "Hello",
max_new_tokens: int = 512,
n_trials: int = 1,
max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
n_trials: int = DEFAULT_N_TRIALS,
) -> State:

model: ModelAdapter = state.model
16 changes: 12 additions & 4 deletions src/lemonade/tools/ort_genai/oga.py
@@ -16,7 +16,7 @@
from fnmatch import fnmatch
from queue import Queue
from packaging.version import Version
from huggingface_hub import snapshot_download
from huggingface_hub import snapshot_download, list_repo_files
import onnxruntime_genai as og
import onnxruntime_genai.models.builder as model_builder
from turnkeyml.state import State
@@ -245,7 +245,7 @@ class OgaLoad(FirstTool):
Models on Hugging Face that follow the "amd/**-onnx-ryzen-strix" pattern
Local models for cpu, igpu, or npu:
The specified checkpoint is converted to a local path, via mapping to lower case
and replacing '/' with '_'. If this model already exists in the 'models' folderr
and replacing '/' with '_'. If this model already exists in the 'models' folder
of the lemonade cache and if it has a subfolder <device>-<dtype>, then this model
will be used. If the --force flag is used and the model is built with model_builder,
then it will be rebuilt.
@@ -398,8 +398,16 @@ def run(
+ "."
)

# Check whether the model is a safetensors checkpoint or a pre-exported
# ONNX model
# Note: This approach only supports ONNX models where the ONNX files are in the
# Huggingface repo root. This does not support the case where the ONNX files
# are in a nested directory within the repo.
model_files = list_repo_files(repo_id=checkpoint)
onnx_model = any([filename.endswith(".onnx") for filename in model_files])

# Download the model from HF
if device == "npu" or device == "hybrid":
if onnx_model:

# NPU models on HF are ready to go and HF does its own caching
full_model_path = snapshot_download(
@@ -474,7 +482,7 @@ def run(
os.makedirs(os.path.dirname(dst_dll), exist_ok=True)
shutil.copy2(src_dll, dst_dll)
else:
# device is 'cpu' or 'igpu'
# checkpoint is safetensors, so we need to run it through model_builder

# Use model_builder to download model and convert to ONNX
printing.log_info(f"Building {checkpoint} for {device} using {dtype}")
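The safetensors-vs-ONNX decision added in `oga.py` reduces to scanning the repo's root file listing. A minimal sketch of just that check; the static listings below are hypothetical stand-ins for what `list_repo_files(repo_id=...)` would return, since the real call needs network access:

```python
from typing import List

def is_onnx_repo(model_files: List[str]) -> bool:
    # A pre-exported ONNX repo has at least one .onnx file in its root;
    # otherwise the checkpoint is routed through model_builder.
    # (As the diff notes, .onnx files nested in subdirectories are not
    # detected by this check.)
    return any(filename.endswith(".onnx") for filename in model_files)

# Hypothetical listings standing in for list_repo_files(repo_id=...)
onnx_listing = ["model.onnx", "model.onnx.data", "genai_config.json"]
safetensors_listing = ["model.safetensors", "config.json", "tokenizer.json"]
```

This is what lets cpu and igpu targets skip `model_builder` entirely when the Hugging Face repo already ships ONNX files.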
16 changes: 15 additions & 1 deletion src/turnkeyml/common/build.py
@@ -39,8 +39,22 @@ def load_yaml(file_path) -> Dict:
)


def builds_dir(cache_dir):
"""
Each build stores stats, logs, and other files in a build directory.
All build directories are located at:
<cache_dir>/builds
"""
return os.path.join(cache_dir, "builds")


def output_dir(cache_dir, build_name):
path = os.path.join(cache_dir, build_name)
"""
Each build stores stats, logs, and other files in an output directory at:
    <builds_dir>/<build_name>
"""
path = os.path.join(builds_dir(cache_dir), build_name)
return path


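The two helpers above compose into the new hierarchical layout. A quick sketch of the resulting paths (the cache and build names are illustrative):

```python
import os

def builds_dir(cache_dir: str) -> str:
    # All build directories are grouped under <cache_dir>/builds
    return os.path.join(cache_dir, "builds")

def output_dir(cache_dir: str, build_name: str) -> str:
    # An individual build lands at <cache_dir>/builds/<build_name>
    return os.path.join(builds_dir(cache_dir), build_name)

path = output_dir("my_cache", "my_build")
```

Centralizing the join in `builds_dir` is what lets the rest of the codebase (filesystem helpers, tests, examples) switch to the `builds/` subdirectory with one-line changes.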
14 changes: 8 additions & 6 deletions src/turnkeyml/common/filesystem.py
@@ -183,7 +183,7 @@ def clean_output_dir(cache_dir: str, build_name: str) -> None:
"""
Delete all elements of the output directory that are not human readable
"""
output_dir = os.path.join(cache_dir, build_name)
output_dir = build.output_dir(cache_dir, build_name)
if os.path.isdir(output_dir) and is_build_dir(cache_dir, build_name):
output_dir = os.path.expanduser(output_dir)
else:
@@ -244,10 +244,10 @@ def get_available_builds(cache_dir):
check_cache_dir(cache_dir)

builds = [
pathlib.PurePath(build).name
for build in os.listdir(os.path.abspath(cache_dir))
if os.path.isdir(os.path.join(cache_dir, build))
and is_build_dir(cache_dir, build)
pathlib.PurePath(build_name).name
for build_name in os.listdir(os.path.abspath(build.builds_dir(cache_dir)))
if os.path.isdir(build.output_dir(cache_dir, build_name))
and is_build_dir(cache_dir, build_name)
]
builds.sort()

@@ -517,7 +517,9 @@ def rebase_cache_dir(input_path: str, build_name: str, new_cache_dir: str):
"""

relative_input_path = input_path.split(build_name, 1)[1][1:]
return os.path.join(new_cache_dir, build_name, relative_input_path)
return os.path.join(
build.output_dir(new_cache_dir, build_name), relative_input_path
)


def check_extension(choices, file_name, error_func):
Expand Down