From 6713ecb69b578c87cde68da5df195453c040cdb4 Mon Sep 17 00:00:00 2001 From: Xida Date: Wed, 8 Jan 2025 01:01:20 +0000 Subject: [PATCH 01/27] initial commit using scotts code --- .github/workflows/pkgci.yml | 37 +++ .github/workflows/pkgci_shark_ai.yml | 71 +++++ build_tools/setup_venv.py | 378 +++++++++++++++++++++++++++ 3 files changed, 486 insertions(+) create mode 100644 .github/workflows/pkgci.yml create mode 100644 .github/workflows/pkgci_shark_ai.yml create mode 100644 build_tools/setup_venv.py diff --git a/.github/workflows/pkgci.yml b/.github/workflows/pkgci.yml new file mode 100644 index 000000000..2c655d1c1 --- /dev/null +++ b/.github/workflows/pkgci.yml @@ -0,0 +1,37 @@ +# Copyright 2024 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +name: PkgCI + +on: + workflow_dispatch: + pull_request: + push: + branches: + - main + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + build_packages: + name: Build Packages + uses: ./.github/workflows/build_packages.yml + with: + build_type: "dev" + + test_shark_ai: + name: Test shark-ai + needs: [build_packages] + uses: ./.github/workflows/pkgci_shark_ai.yml diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml new file mode 100644 index 000000000..1ef8c9bf8 --- /dev/null +++ b/.github/workflows/pkgci_shark_ai.yml @@ -0,0 +1,71 @@ +# Copyright 2024 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +name: PkgCI - shark-ai + +on: + workflow_call: + inputs: + artifact_run_id: + type: string + default: "" + workflow_dispatch: + inputs: + artifact_run_id: + type: string + description: "Id for a workflow run that produced dev packages" + default: "" + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + test_shortfin_llm_server: + name: "Integration Tests - Shortfin LLM Server" + strategy: + matrix: + version: [3.11] + fail-fast: false + runs-on: nodai-amdgpu-mi250-x86-64 + defaults: + run: + shell: bash + env: + PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages + VENV_DIR: ${{ github.workspace }}/.venv + steps: + - name: "Checkout Code" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: "Setting up Python" + id: setup_python + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{matrix.version}} + + - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: linux_x86_64_dev_packages + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --artifact-path=${PACKAGE_DOWNLOAD_DIR} \ + --fetch-gh-workflow=${{ inputs.artifact_run_id }} + + - name: Install nightly IREE packages + run: | + source ${VENV_DIR}/bin/activate + uv pip install -r requirements-iree-pinned.txt + + - name: Run LLM Integration Tests + run: | + source ${VENV_DIR}/bin/activate + pytest -v -s app_tests/integration_tests/llm/shortfin --log-cli-level=INFO diff --git a/build_tools/setup_venv.py b/build_tools/setup_venv.py new file mode 100644 index 000000000..9c4caa041 --- /dev/null +++ b/build_tools/setup_venv.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +# Copyright 2024 Advanced Micro Devices, Inc. +# Copyright 2023 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +"""Sets up a Python venv with shark-ai packages from a workflow run. + +There are several modes in which to use this script: + +* Within a workflow triggered by `workflow_call`, an artifact action will + typically be used to fetch relevant package artifacts. Specify the fetched + location with `--artifact-path=`: + + ```yml + - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: linux_x86_64_release_packages + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --artifact-path=${PACKAGE_DOWNLOAD_DIR} + ``` + +* Within a workflow triggered by `workflow_dispatch`, pass `artifact_run_id` as + an input that developers must specify when running the workflow: + + ```yml + on: + workflow_dispatch: + inputs: + artifact_run_id: + type: string + default: "" + + ... + steps: + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --fetch-gh-workflow=${{ inputs.artifact_run_id }} + ``` + + (Note that these two modes are often combined to allow for workflow testing) + +* Locally, the `--fetch-gh-workflow=WORKFLOW_ID` can be used to download and + setup the venv from a specific workflow run in one step: + + + ```bash + python3.11 ./build_tools/pkgci/setup_venv.py /tmp/.venv --fetch-gh-workflow=12056182052 + ``` + +* Locally, the `--fetch-git-ref=GIT_REF` can be used to download and setup the + venv from the latest workflow run for a given ref (commit) in one step: + + ```bash + python3.11 ./build_tools/pkgci/setup_venv.py /tmp/.venv --fetch-git-ref=main + ``` + +You must have the `gh` command line tool installed and authenticated if you +will be fetching artifacts. +""" + +from glob import glob +from pathlib import Path +from typing import Optional, Dict, Tuple + +import argparse +import functools +import json +import os +import platform +import subprocess +import sys +import tempfile +import zipfile + +THIS_DIR = Path(__file__).parent.resolve() +REPO_ROOT = THIS_DIR.parent.parent + + +def parse_arguments(argv=None): + parser = argparse.ArgumentParser(description="Setup venv") + parser.add_argument( + "venv_dir", type=Path, help="Directory in which to create the venv" + ) + parser.add_argument("--artifact-path", help="Path in which to find/fetch artifacts") + parser.add_argument( + "--packages", + help="Comma-delimited list of packages to install, in order", + default="shark-ai,shortfin,sharktank", + ) + parser.add_argument( + "--install-using-index", + help="The default mode installs with `--no-index` to be sure that only " + "our packages are installed. Setting this flag removes that option, " + "more closely matching the behavior that users will see when they " + "install published packages.", + action="store_true", + ) + + fetch_group = parser.add_mutually_exclusive_group() + fetch_group.add_argument( + "--fetch-gh-workflow", help="Fetch artifacts from a GitHub workflow" + ) + fetch_group.add_argument("--fetch-git-ref", help="Fetch artifacts for a git ref") + + args = parser.parse_args(argv) + return args + + +def get_latest_workflow_run_id_for_ref(ref: str) -> int: + print(f"Normalizing ref: {ref}") + normalized_ref = ( + subprocess.check_output(["git", "rev-parse", ref], cwd=REPO_ROOT) + .decode() + .strip() + ) + + print(f"Fetching artifacts for normalized ref: {normalized_ref}") + base_path = f"/repos/nod-ai/shark-ai" + workflow_run_args = [ + "gh", + "api", + "-H", + "Accept: application/vnd.github+json", + "-H", + "X-GitHub-Api-Version: 2022-11-28", + f"{base_path}/actions/workflows/pkgci.yml/runs?head_sha={normalized_ref}", + ] + print(f"Running command to list workflow runs:\n {' '.join(workflow_run_args)}") + workflow_run_output = subprocess.check_output(workflow_run_args) + workflow_run_json_output = json.loads(workflow_run_output) + if workflow_run_json_output["total_count"] == 0: + raise RuntimeError("Workflow did not run at this commit") + + latest_run = workflow_run_json_output["workflow_runs"][-1] + print(f"Found workflow run: {latest_run['html_url']}") + return latest_run["id"] + + +@functools.lru_cache +def list_gh_artifacts(run_id: str) -> Dict[str, str]: + print(f"Fetching artifacts for workflow run {run_id}") + base_path = f"/repos/nod-ai/shark-ai" + output = subprocess.check_output( + [ + "gh", + "api", + "-H", + "Accept: application/vnd.github+json", + "-H", + "X-GitHub-Api-Version: 2022-11-28", + f"{base_path}/actions/runs/{run_id}/artifacts", + ] + ) + data = json.loads(output) + # Uncomment to debug: + # print(json.dumps(data, indent=2)) + artifacts = { + rec["name"]: f"{base_path}/actions/artifacts/{rec['id']}/zip" + for rec in data["artifacts"] + } + print("Found artifacts:") + for k, v in artifacts.items(): + print(f" {k}: {v}") + return artifacts + + +def fetch_gh_artifact(api_path: str, file: Path): + print(f"Downloading artifact {api_path}") + contents = subprocess.check_output( + [ + "gh", + "api", + "-H", + "Accept: application/vnd.github+json", + "-H", + "X-GitHub-Api-Version: 2022-11-28", + api_path, + ] + ) + file.write_bytes(contents) + + +def find_venv_python(venv_path: Path) -> Optional[Path]: + paths = [venv_path / "bin" / "python", venv_path / "Scripts" / "python.exe"] + for p in paths: + if p.exists(): + return p + return None + + +def install_with_index(python_exe, wheels): + # Install each of the built wheels, allowing dependencies and an index. + # Note that --pre pulls in prerelease versions of dependencies too, like + # numpy. We could try a solution like https://stackoverflow.com/a/76124424. + for artifact_path, package_name in wheels: + cmd = [ + "uv", + "pip", + "install", + "--pre", + "-f", + str(artifact_path), + package_name, + "--python", + str(python_exe), + ] + print(f"\nRunning command: {' '.join([str(c) for c in cmd])}") + subprocess.check_call(cmd) + + +def install_without_index(python_exe, packages, wheels): + # Install each of the built wheels without deps or consulting an index. + # This is because we absolutely don't want this falling back to anything + # but what we said. + for artifact_path, package_name in wheels: + cmd = [ + "uv", + "pip", + "install", + "--no-deps", + "--no-index", + "-f", + str(artifact_path), + "--force-reinstall", + package_name, + "--python", + str(python_exe), + ] + print(f"\nRunning command: {' '.join([str(c) for c in cmd])}") + subprocess.check_call(cmd) + + # Install requirements for the requested packages. + # Note that not all of these are included in the package dependencies, but + # developers usually want the test requirements too. + requirements_files = [] + if "sharktank" in packages: + requirements_files.append("sharktank/requirements.txt") + requirements_files.append("sharktank/requirements-tests.txt") + if "shortfin" in packages: + requirements_files.append("shortfin/requirements-tests.txt") + + for requirements_file in requirements_files: + cmd = [ + "uv", + "pip", + "install", + "-r", + str(REPO_ROOT / requirements_file), + "--python", + str(python_exe), + ] + print(f"\nRunning command: {' '.join([str(c) for c in cmd])}") + subprocess.check_call(cmd) + + +def find_wheel(args, artifact_prefix: str, package_name: str) -> Tuple[Path, str]: + artifact_path = Path(args.artifact_path) + + def has_package(): + norm_package_name = package_name.replace("-", "_") + pattern = str(artifact_path / f"{norm_package_name}-*.whl") + files = glob(pattern) + return bool(files) + + if has_package(): + return (artifact_path, package_name) + + if not args.fetch_gh_workflow: + raise RuntimeError( + f"Could not find package {package_name} to install from {artifact_path}" + ) + + # Fetch. + artifact_path.mkdir(parents=True, exist_ok=True) + artifact_name = f"{artifact_prefix}_dev_packages" + artifact_file = artifact_path / f"{artifact_name}.zip" + if not artifact_file.exists(): + print(f"Package {package_name} not found. Fetching from {artifact_name}...") + artifacts = list_gh_artifacts(args.fetch_gh_workflow) + if artifact_name not in artifacts: + raise RuntimeError( + f"Could not find required artifact {artifact_name} in run {args.fetch_gh_workflow}" + ) + fetch_gh_artifact(artifacts[artifact_name], artifact_file) + print(f"Extracting {artifact_file}") + with zipfile.ZipFile(artifact_file) as zip_ref: + zip_ref.extractall(artifact_path) + + # Try again. + if not has_package(): + raise RuntimeError(f"Could not find {package_name} in {artifact_path}") + return (artifact_path, package_name) + + +def main(args): + # Look up the workflow run for a ref. + if args.fetch_git_ref: + latest_gh_workflow = get_latest_workflow_run_id_for_ref(args.fetch_git_ref) + args.fetch_git_ref = "" + args.fetch_gh_workflow = str(latest_gh_workflow) + return main(args) + + # Make sure we have an artifact path if fetching. + if not args.artifact_path and args.fetch_gh_workflow: + with tempfile.TemporaryDirectory() as td: + args.artifact_path = td + return main(args) + + # Parse command-delimited list of packages from args. + packages = args.packages.split(",") + print("Installing packages:", packages) + + artifact_prefix = f"{platform.system().lower()}_{platform.machine()}" + wheels = [] + for package_name in packages: + wheels.append(find_wheel(args, artifact_prefix, package_name)) + print("Installing wheels:", wheels) + + # Set up venv using 'uv' (https://docs.astral.sh/uv/). + # We could use 'pip', but 'uv' is much faster at installing packages. + venv_path = args.venv_dir + python_exe = find_venv_python(venv_path) + + if not python_exe: + print(f"Creating venv at {str(venv_path)}") + + subprocess.check_call([sys.executable, "-m", "pip", "install", "uv"]) + subprocess.check_call(["uv", "venv", str(venv_path), "--python", "3.11"]) + python_exe = find_venv_python(venv_path) + if not python_exe: + raise RuntimeError("Error creating venv") + + # Install the PyTorch CPU wheels first to save multiple minutes and a lot of bandwidth. + cmd = [ + "uv", + "pip", + "install", + "-r", + str(REPO_ROOT / "pytorch-cpu-requirements.txt"), + "--python", + str(python_exe), + ] + print(f"\nRunning command: {' '.join([str(c) for c in cmd])}") + subprocess.check_call(cmd) + + if args.install_using_index: + install_with_index(python_exe, wheels) + else: + install_without_index(python_exe, packages, wheels) + + # Log which packages are installed. + print("") + print(f"Checking packages with 'uv pip freeze':") + subprocess.check_call( + [ + "uv", + "pip", + "freeze", + "--python", + str(python_exe), + ] + ) + + print("") + print(f"venv setup using uv, activate with:\n source {venv_path}/bin/activate") + + return 0 + + +if __name__ == "__main__": + sys.exit(main(parse_arguments())) From 015b6ef727495e7207d6df598befafbace990467 Mon Sep 17 00:00:00 2001 From: Xida Date: Wed, 8 Jan 2025 17:58:26 +0000 Subject: [PATCH 02/27] give build_packages call write permission --- .github/workflows/pkgci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pkgci.yml b/.github/workflows/pkgci.yml index 2c655d1c1..9b1b50033 100644 --- a/.github/workflows/pkgci.yml +++ b/.github/workflows/pkgci.yml @@ -28,6 +28,8 @@ jobs: build_packages: name: Build Packages uses: ./.github/workflows/build_packages.yml + permissions: + contents: write with: build_type: "dev" From 9d4f2b672413c481103a1b431b25fab79da57754 Mon Sep 17 00:00:00 2001 From: Xida Date: Wed, 8 Jan 2025 18:23:52 +0000 Subject: [PATCH 03/27] make artifact versions match --- .github/workflows/pkgci_shark_ai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index 1ef8c9bf8..3339f344c 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -52,7 +52,7 @@ jobs: - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: - name: linux_x86_64_dev_packages + name: snapshot-shortfin-linux-x86_64-cp${{ replace(matrix.version, '.', '') }}-cp${{ replace(matrix.version, '.', '') }} path: ${{ env.PACKAGE_DOWNLOAD_DIR }} - name: Setup venv run: | From afa818f4097ac559d66add2e281f498488292664 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 10:56:19 -0800 Subject: [PATCH 04/27] use bash to do version string substitution instead --- .github/workflows/pkgci_shark_ai.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index 3339f344c..eb3ad7f52 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -50,10 +50,22 @@ jobs: with: python-version: ${{matrix.version}} - - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + - name: Set Python version without dot + run: | + echo "PY_VERSION_NO_DOT=$(echo ${{ matrix.version }} | tr -d '.')" >> $GITHUB_ENV + + - name: Download sharktank artifacts + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: snapshot-sharktank-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }} + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + + - name: Download shortfin artifacts + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: - name: snapshot-shortfin-linux-x86_64-cp${{ replace(matrix.version, '.', '') }}-cp${{ replace(matrix.version, '.', '') }} + name: snapshot-shortfin-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }} path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Setup venv run: | ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ From 6b52effa1d8ebf2532e1133f5396d584cc240818 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 11:03:12 -0800 Subject: [PATCH 05/27] move setup_venv.py to proper location --- build_tools/{ => pkgci}/setup_venv.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename build_tools/{ => pkgci}/setup_venv.py (100%) diff --git a/build_tools/setup_venv.py b/build_tools/pkgci/setup_venv.py similarity index 100% rename from build_tools/setup_venv.py rename to build_tools/pkgci/setup_venv.py From bfaf0b85518dbc32b46bf1f003a20e6728d0b04d Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 11:04:19 -0800 Subject: [PATCH 06/27] run on default runners instead --- .github/workflows/pkgci_shark_ai.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index eb3ad7f52..3d7a55edb 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -34,7 +34,8 @@ jobs: matrix: version: [3.11] fail-fast: false - runs-on: nodai-amdgpu-mi250-x86-64 + # runs-on: nodai-amdgpu-mi250-x86-64 + runs-on: ubuntu-latest defaults: run: shell: bash From efb182e3cca139b3f1f1718e117db7b1ff0306ce Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 11:25:12 -0800 Subject: [PATCH 07/27] enable exec on setup_venv.py --- build_tools/pkgci/setup_venv.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 build_tools/pkgci/setup_venv.py diff --git a/build_tools/pkgci/setup_venv.py b/build_tools/pkgci/setup_venv.py old mode 100644 new mode 100755 From 5dde682762d4b434edd07a83b9e7a2e17942ef30 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 11:34:51 -0800 Subject: [PATCH 08/27] remove concurrency settings from callee workflow --- .github/workflows/pkgci_shark_ai.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index 3d7a55edb..70a8be437 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -19,14 +19,6 @@ on: description: "Id for a workflow run that produced dev packages" default: "" -concurrency: - # A PR number if a pull request and otherwise the commit hash. This cancels - # queued and in-progress runs for the same PR (presubmit) or commit - # (postsubmit). The workflow name is prepended to avoid conflicts between - # different workflows. - group: ${{ github.workflow }}-${{ github.event.number || github.sha }} - cancel-in-progress: true - jobs: test_shortfin_llm_server: name: "Integration Tests - Shortfin LLM Server" From 63ef83031ddb2e1a54db6e8c98536ba7b97f4ca3 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 11:41:01 -0800 Subject: [PATCH 09/27] add back shark-ai package build --- .github/workflows/pkgci_shark_ai.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index 70a8be437..fb59f1523 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -19,6 +19,14 @@ on: description: "Id for a workflow run that produced dev packages" default: "" +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. + group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + jobs: test_shortfin_llm_server: name: "Integration Tests - Shortfin LLM Server" @@ -59,6 +67,12 @@ jobs: name: snapshot-shortfin-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }} path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Download shark-ai artifacts + uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 + with: + name: snapshot-shark-ai-linux-x86_64-cp${{ env.PY_VERSION_NO_DOT }}-cp${{ env.PY_VERSION_NO_DOT }} + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Setup venv run: | ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ From d7f5f4c3bcde2c6ea7bda53d3ea49552259e647c Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 11:46:45 -0800 Subject: [PATCH 10/27] remove accidentally added-back concurrency settings --- .github/workflows/pkgci_shark_ai.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index fb59f1523..0059e96e9 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -19,14 +19,6 @@ on: description: "Id for a workflow run that produced dev packages" default: "" -concurrency: - # A PR number if a pull request and otherwise the commit hash. This cancels - # queued and in-progress runs for the same PR (presubmit) or commit - # (postsubmit). The workflow name is prepended to avoid conflicts between - # different workflows. - group: ${{ github.workflow }}-${{ github.event.number || github.sha }} - cancel-in-progress: true - jobs: test_shortfin_llm_server: name: "Integration Tests - Shortfin LLM Server" From f651dadfcacf310f81a93dd4b1965bc06a12a5e8 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 13:04:59 -0800 Subject: [PATCH 11/27] match cpu pytorch requirements.txt to iree-turbine's --- pytorch-cpu-requirements.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pytorch-cpu-requirements.txt b/pytorch-cpu-requirements.txt index aae0297db..da41e2243 100644 --- a/pytorch-cpu-requirements.txt +++ b/pytorch-cpu-requirements.txt @@ -1,3 +1,4 @@ ---pre ---index-url https://download.pytorch.org/whl/test/cpu -torch==2.3.0 +--index-url https://download.pytorch.org/whl/cpu +torch>=2.3.0 +torchaudio +torchvision \ No newline at end of file From a1f10d69e7bc175a3573e4f9c1191311583bcf2e Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 13:05:46 -0800 Subject: [PATCH 12/27] match rocm pytorch requirements.txt to iree-turbine's --- pytorch-rocm-requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch-rocm-requirements.txt b/pytorch-rocm-requirements.txt index 85116cfbb..b34305695 100644 --- a/pytorch-rocm-requirements.txt +++ b/pytorch-rocm-requirements.txt @@ -1,3 +1,4 @@ ---pre ---index-url https://download.pytorch.org/whl/nightly/rocm6.0 +--index-url https://download.pytorch.org/whl/rocm6.2 torch>=2.3.0 +torchaudio +torchvision \ No newline at end of file From 6254c9fc89ecda91905c9f84ba7cc938018b300d Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 13:12:44 -0800 Subject: [PATCH 13/27] remove --pre from pinned iree reqs --- requirements-iree-pinned.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements-iree-pinned.txt b/requirements-iree-pinned.txt index fa7b1fe05..a721d7ebb 100644 --- a/requirements-iree-pinned.txt +++ b/requirements-iree-pinned.txt @@ -1,7 +1,6 @@ # Pinned versions of IREE dependencies. # Keep these versions synced with SHORTFIN_IREE_GIT_TAG in shortfin/CMakeLists.txt ---pre --find-links https://iree.dev/pip-release-links.html iree-base-compiler==3.2.0rc20250109 iree-base-runtime==3.2.0rc20250109 From cf81dfa1bebfdbb81b059713991594e58d48abc2 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 13:47:06 -0800 Subject: [PATCH 14/27] run on mi250 again --- .github/workflows/pkgci_shark_ai.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index 0059e96e9..c9e0ddfea 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -26,8 +26,9 @@ jobs: matrix: version: [3.11] fail-fast: false - # runs-on: nodai-amdgpu-mi250-x86-64 - runs-on: ubuntu-latest + runs-on: nodai-amdgpu-mi250-x86-64 + # runs-on: ubuntu-latest # everything else works but this errs during model loading + # TODO: make this / a copy of this using tiny llamas run on the standard runners defaults: run: shell: bash From f7b99f633a2ea9a11f5c275a8c7b57f580c8c789 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 13:50:03 -0800 Subject: [PATCH 15/27] remove old shark-ai ci file --- .github/workflows/ci-shark-ai.yml | 66 ------------------------------- 1 file changed, 66 deletions(-) delete mode 100644 .github/workflows/ci-shark-ai.yml diff --git a/.github/workflows/ci-shark-ai.yml b/.github/workflows/ci-shark-ai.yml deleted file mode 100644 index 3957b6d11..000000000 --- a/.github/workflows/ci-shark-ai.yml +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright 2024 Advanced Micro Devices, Inc. -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -name: CI - shark-ai - -on: - workflow_dispatch: - pull_request: - push: - branches: - - main - -concurrency: - # A PR number if a pull request and otherwise the commit hash. This cancels - # queued and in-progress runs for the same PR (presubmit) or commit - # (postsubmit). The workflow name is prepended to avoid conflicts between - # different workflows. - group: ${{ github.workflow }}-${{ github.event.number || github.sha }} - cancel-in-progress: true - -jobs: - test_shortfin_llm_server: - name: "Integration Tests - Shortfin LLM Server" - strategy: - matrix: - version: [3.11] - fail-fast: false - runs-on: nodai-amdgpu-mi250-x86-64 - defaults: - run: - shell: bash - env: - VENV_DIR: ${{ github.workspace }}/.venv - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - - name: "Setting up Python" - id: setup_python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{matrix.version}} - - name: Create Python venv - run: python -m venv ${VENV_DIR} - - - name: Install pip deps - run: | - source ${VENV_DIR}/bin/activate - python -m pip install --no-compile --upgrade pip - - # Note: We install in three steps in order to satisfy requirements - # from non default locations first. - pip install --no-compile -r pytorch-cpu-requirements.txt - pip install -r requirements-iree-pinned.txt - pip install --no-compile \ - -r requirements.txt \ - -e sharktank/ shortfin/ - - pip freeze - - - name: Run LLM Integration Tests - run: | - source ${VENV_DIR}/bin/activate - pytest -v -s app_tests/integration_tests/llm/shortfin --log-cli-level=INFO From 35eb2d2f0ddccfd847f883b6daa78a69c03a3ad6 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 8 Jan 2025 14:10:58 -0800 Subject: [PATCH 16/27] missed newline before EOF --- pytorch-cpu-requirements.txt | 2 +- pytorch-rocm-requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch-cpu-requirements.txt b/pytorch-cpu-requirements.txt index da41e2243..0bb2a823d 100644 --- a/pytorch-cpu-requirements.txt +++ b/pytorch-cpu-requirements.txt @@ -1,4 +1,4 @@ --index-url https://download.pytorch.org/whl/cpu torch>=2.3.0 torchaudio -torchvision \ No newline at end of file +torchvision diff --git a/pytorch-rocm-requirements.txt b/pytorch-rocm-requirements.txt index b34305695..ddacee45b 100644 --- a/pytorch-rocm-requirements.txt +++ b/pytorch-rocm-requirements.txt @@ -1,4 +1,4 @@ --index-url https://download.pytorch.org/whl/rocm6.2 torch>=2.3.0 torchaudio -torchvision \ No newline at end of file +torchvision From e4f0e4879af4bc8be95e57b892f9c754448b84e1 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 10:54:19 -0800 Subject: [PATCH 17/27] remove hardcoded py3.11 --- build_tools/pkgci/setup_venv.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build_tools/pkgci/setup_venv.py b/build_tools/pkgci/setup_venv.py index 9c4caa041..19bc96505 100755 --- a/build_tools/pkgci/setup_venv.py +++ b/build_tools/pkgci/setup_venv.py @@ -332,7 +332,9 @@ def main(args): print(f"Creating venv at {str(venv_path)}") subprocess.check_call([sys.executable, "-m", "pip", "install", "uv"]) - subprocess.check_call(["uv", "venv", str(venv_path), "--python", "3.11"]) + subprocess.check_call( + ["uv", "venv", str(venv_path), "--python", sys.executable] + ) python_exe = find_venv_python(venv_path) if not python_exe: raise RuntimeError("Error creating venv") From a970f529853ace3e02e0f15882cbc9a6cbb4a6b7 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 10:57:33 -0800 Subject: [PATCH 18/27] ci job name: nightly -> pinned to match filename change --- .github/workflows/pkgci_shark_ai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index c9e0ddfea..49601508d 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -72,7 +72,7 @@ jobs: --artifact-path=${PACKAGE_DOWNLOAD_DIR} \ --fetch-gh-workflow=${{ inputs.artifact_run_id }} - - name: Install nightly IREE packages + - name: Install pinned IREE packages run: | source ${VENV_DIR}/bin/activate uv pip install -r requirements-iree-pinned.txt From 6a336a36344622644565f8b83819e96f1401a3a0 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 11:48:42 -0800 Subject: [PATCH 19/27] pytorch-rocm-requirements match iree-turbine but no torch audio and vision --- pytorch-rocm-requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/pytorch-rocm-requirements.txt b/pytorch-rocm-requirements.txt index ddacee45b..0b1d480f5 100644 --- a/pytorch-rocm-requirements.txt +++ b/pytorch-rocm-requirements.txt @@ -1,4 +1,2 @@ --index-url https://download.pytorch.org/whl/rocm6.2 torch>=2.3.0 -torchaudio -torchvision From 92aa1a5bb87ad4a73adb04cd9193c5c78fb896c7 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 13:03:31 -0800 Subject: [PATCH 20/27] cache uv in same filesystem as workspace --- .github/workflows/pkgci_shark_ai.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index 49601508d..946a51541 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -48,6 +48,18 @@ jobs: run: | echo "PY_VERSION_NO_DOT=$(echo ${{ matrix.version }} | tr -d '.')" >> $GITHUB_ENV + - name: Setup UV caching + run: | + CACHE_DIR="${GITHUB_WORKSPACE}/.uv-cache" + echo "UV_CACHE_DIR=${CACHE_DIR}" >> $GITHUB_ENV + mkdir -p "${CACHE_DIR}" + + - name: Cache UV packages + uses: actions/cache@v3 + with: + path: .uv-cache + key: ${{ runner.os }}-uv-py${{ matrix.version }}-${{ hashFiles('requirements-iree-pinned.txt') }} + - name: Download sharktank artifacts uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 with: From b1fa99af1be9306d95c6959008aade9654dacb72 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 13:19:51 -0800 Subject: [PATCH 21/27] test on regular ubuntu runners for a bit --- .github/workflows/pkgci_shark_ai.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index 946a51541..abfbd6a3a 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -26,8 +26,8 @@ jobs: matrix: version: [3.11] fail-fast: false - runs-on: nodai-amdgpu-mi250-x86-64 - # runs-on: ubuntu-latest # everything else works but this errs during model loading + # runs-on: nodai-amdgpu-mi250-x86-64 + runs-on: ubuntu-latest # everything else works but this errs during model loading # TODO: make this / a copy of this using tiny llamas run on the standard runners defaults: run: @@ -93,3 +93,7 @@ jobs: run: | source ${VENV_DIR}/bin/activate pytest -v -s app_tests/integration_tests/llm/shortfin --log-cli-level=INFO + + - name: Clean up repo to make next checkout faster + run: | + git clean -ffdx From 2d13751a9f9f9cbbac0a8b919f4b00cb6a107745 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 14:56:13 -0800 Subject: [PATCH 22/27] back on mi250 --- .github/workflows/pkgci_shark_ai.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index abfbd6a3a..0174a811a 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -26,8 +26,8 @@ jobs: matrix: version: [3.11] fail-fast: false - # runs-on: nodai-amdgpu-mi250-x86-64 - runs-on: ubuntu-latest # everything else works but this errs during model loading + runs-on: nodai-amdgpu-mi250-x86-64 + # runs-on: ubuntu-latest # everything else works but this errs during model loading # TODO: make this / a copy of this using tiny llamas run on the standard runners defaults: run: From 80ba23367ea03472d54e1a52af62634ad9c80a55 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 15:37:54 -0800 Subject: [PATCH 23/27] comments --- .github/workflows/pkgci_shark_ai.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index 0174a811a..b86906297 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -27,8 +27,8 @@ jobs: version: [3.11] fail-fast: false runs-on: nodai-amdgpu-mi250-x86-64 - # runs-on: ubuntu-latest # everything else works but this errs during model loading - # TODO: make this / a copy of this using tiny llamas run on the standard runners + # runs-on: ubuntu-latest # everything else works but this throws an "out of resources" during model loading + # TODO: make a copy of this that runs on standard runners with tiny llama instead of a 8b model defaults: run: shell: bash From 9e5fbe08526434a5d7dc255fd2ea4edbb5cec178 Mon Sep 17 00:00:00 2001 From: "Xida Ren (Cedar)" Date: Thu, 9 Jan 2025 15:55:10 -0800 Subject: [PATCH 24/27] Pin caching to commit used in rest of repo Co-authored-by: Scott Todd --- .github/workflows/pkgci_shark_ai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index b86906297..d16d01242 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -55,7 +55,7 @@ jobs: mkdir -p "${CACHE_DIR}" - name: Cache UV packages - uses: actions/cache@v3 + uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: .uv-cache key: ${{ runner.os }}-uv-py${{ matrix.version }}-${{ hashFiles('requirements-iree-pinned.txt') }} From 1397879531eee7a8c0882ebcf09bc50705d22ac6 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 16:00:09 -0800 Subject: [PATCH 25/27] add some files to hash with uv caching --- .github/workflows/pkgci_shark_ai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index d16d01242..51d3c3bd0 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -58,7 +58,7 @@ jobs: uses: actions/cache@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: path: .uv-cache - key: ${{ runner.os }}-uv-py${{ matrix.version }}-${{ hashFiles('requirements-iree-pinned.txt') }} + key: ${{ runner.os }}-uv-py${{ matrix.version }}-${{ hashFiles('requirements-iree-pinned.txt', 'pytorch-cpu-requirements.txt', 'sharktank/requirements.txt', 'sharktank/requirements-tests.txt', 'shortfin/requirements-tests.txt') }} - name: Download sharktank artifacts uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 From 0820813b26d9edacd194c21274f36d17c0f1d3bd Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 16:17:46 -0800 Subject: [PATCH 26/27] clean up the clean up step --- .github/workflows/pkgci_shark_ai.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index 51d3c3bd0..a842b7795 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -93,7 +93,3 @@ jobs: run: | source ${VENV_DIR}/bin/activate pytest -v -s app_tests/integration_tests/llm/shortfin --log-cli-level=INFO - - - name: Clean up repo to make next checkout faster - run: | - git clean -ffdx From 42827309805fb61efb888170b0837814812ed463 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 9 Jan 2025 16:44:34 -0800 Subject: [PATCH 27/27] try running on mi300x-4 --- .github/workflows/pkgci_shark_ai.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index a842b7795..d7c040c42 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -26,7 +26,7 @@ jobs: matrix: version: [3.11] fail-fast: false - runs-on: nodai-amdgpu-mi250-x86-64 + runs-on: mi300x-4 # runs-on: ubuntu-latest # everything else works but this throws an "out of resources" during model loading # TODO: make a copy of this that runs on standard runners with tiny llama instead of a 8b model defaults: