From cb1331d10836a7222304311c087a0361c5ebd66b Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 2 Oct 2024 18:05:22 -0700 Subject: [PATCH 1/4] Enable ROCM in CI --- .github/workflows/regression_test.yml | 18 +++++++++++++----- torchao/utils.py | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 74b39d2ef2..52a7688f2b 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -17,6 +17,10 @@ concurrency: env: HF_TOKEN: ${{ secrets.HF_TOKEN }} +permissions: + id-token: write + contents: read + jobs: test-nightly: strategy: @@ -33,10 +37,17 @@ jobs: torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cpu' gpu-arch-type: "cpu" gpu-arch-version: "" - - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + - name: ROCM Nightly + runs-on: linux.rocm.gpu + torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3' + gpu-arch-type: "rocm" + gpu-arch-version: "6.3" + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@rocm_experiment with: timeout: 120 + no-sudo: ${{ matrix.gpu-arch-type == 'rocm' }} + continue-on-error: ${{ matrix.gpu-arch-type == 'rocm' }} + test-infra-ref: rocm_experiment runner: ${{ matrix.runs-on }} gpu-arch-type: ${{ matrix.gpu-arch-type }} gpu-arch-version: ${{ matrix.gpu-arch-version }} @@ -71,7 +82,6 @@ jobs: torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121' gpu-arch-type: "cuda" gpu-arch-version: "12.1" - - name: CPU 2.3 runs-on: linux.4xlarge torch-spec: 'torch==2.3.0 --index-url https://download.pytorch.org/whl/cpu' @@ -99,8 +109,6 @@ jobs: conda create -n venv python=3.9 -y conda activate venv echo "::group::Install newer objcopy that supports --set-section-alignment" - yum install -y devtoolset-10-binutils - export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH python -m pip install --upgrade pip pip install ${{ matrix.torch-spec }} pip install -r dev-requirements.txt diff --git a/torchao/utils.py b/torchao/utils.py index 7a17c1b104..4729675a14 100644 --- a/torchao/utils.py +++ b/torchao/utils.py @@ -607,7 +607,7 @@ def _torch_version_at_least(min_version): def is_MI300(): if torch.cuda.is_available() and torch.version.hip: mxArchName = ["gfx940", "gfx941", "gfx942"] - archName = torch.cuda.get_device_properties().gcnArchName + archName = torch.cuda.get_device_properties(0).gcnArchName for arch in mxArchName: if arch in archName: return True From e54e15469886cc5bfe474cb5b95e824e9066c262 Mon Sep 17 00:00:00 2001 From: amdfaa <107946068+amdfaa@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:57:33 -0600 Subject: [PATCH 2/4] Update regression_test.yml --- .github/workflows/regression_test.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 52a7688f2b..aeaabfeea5 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -42,12 +42,11 @@ jobs: torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3' gpu-arch-type: "rocm" gpu-arch-version: "6.3" - uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@rocm_experiment + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: timeout: 120 no-sudo: ${{ matrix.gpu-arch-type == 'rocm' }} - continue-on-error: ${{ matrix.gpu-arch-type == 'rocm' }} - test-infra-ref: rocm_experiment + test-infra-ref: main runner: ${{ matrix.runs-on }} gpu-arch-type: ${{ matrix.gpu-arch-type }} gpu-arch-version: ${{ matrix.gpu-arch-version }} From 7ca8cdf64cb704001e09f20021212ff54216abce Mon Sep 17 00:00:00 2001 From: amdfaa <107946068+amdfaa@users.noreply.github.com> Date: Thu, 16 Jan 2025 16:51:18 -0600 Subject: [PATCH 3/4] Update regression_test.yml --- .github/workflows/regression_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index aeaabfeea5..817b4f4ae4 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -46,7 +46,6 @@ jobs: with: timeout: 120 no-sudo: ${{ matrix.gpu-arch-type == 'rocm' }} - test-infra-ref: main runner: ${{ matrix.runs-on }} gpu-arch-type: ${{ matrix.gpu-arch-type }} gpu-arch-version: ${{ matrix.gpu-arch-version }} From 593fb78608b967b283313f3532f5af0321b58ea1 Mon Sep 17 00:00:00 2001 From: amdfaa <107946068+amdfaa@users.noreply.github.com> Date: Thu, 16 Jan 2025 18:17:40 -0600 Subject: [PATCH 4/4] Update regression_test.yml --- .github/workflows/regression_test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 817b4f4ae4..eaf2e3cbbb 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -38,10 +38,11 @@ jobs: gpu-arch-type: "cpu" gpu-arch-version: "" - name: ROCM Nightly - runs-on: linux.rocm.gpu + runs-on: linux.rocm.gpu.2 torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.3' gpu-arch-type: "rocm" gpu-arch-version: "6.3" + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: timeout: 120