Skip to content

Commit 48c55a6

Browse files
amdfaa authored and pytorchmergebot committed
[ROCm] Move ROCm unstable MI300 jobs back to stable (pytorch#146675)
Fixes pytorch#145790. Needs pytorch#145504 to be merged first to resolve an artifact-uploading issue with the MI300 runners.
This PR moves the ROCm MI300 jobs from unstable back to stable. They were moved to unstable in this [PR](pytorch#145790) because the MI300 runners were failing with a [docker daemon](https://github.com/pytorch/pytorch/actions/runs/13015957622/job/36306779536) issue, which has since been resolved.
Pull Request resolved: pytorch#146675
Approved by: https://github.com/jithunnair-amd, https://github.com/jeffdaily
1 parent 6778084 commit 48c55a6

File tree

3 files changed

+78
-70
lines changed

3 files changed

+78
-70
lines changed

.github/workflows/inductor-rocm-mi300.yml

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,14 @@ permissions:
1515
contents: read
1616

1717
jobs:
18+
target-determination:
19+
if: github.repository_owner == 'pytorch'
20+
name: before-test
21+
uses: ./.github/workflows/target_determination.yml
22+
permissions:
23+
id-token: write
24+
contents: read
25+
1826
get-label-type:
1927
name: get-label-type
2028
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
@@ -25,4 +33,30 @@ jobs:
2533
curr_branch: ${{ github.head_ref || github.ref_name }}
2634
curr_ref_type: ${{ github.ref_type }}
2735

28-
# Build and test jobs moved to unstable.yml to make CI green. Jobs will be moved back when MI300 runners are better isolated from each other.
36+
linux-focal-rocm6_3-py3_10-inductor-build:
37+
name: rocm6.3-py3.10-inductor
38+
uses: ./.github/workflows/_linux-build.yml
39+
needs: get-label-type
40+
with:
41+
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
42+
build-environment: linux-focal-rocm6.3-py3.10
43+
docker-image-name: pytorch-linux-focal-rocm-n-py3
44+
test-matrix: |
45+
{ include: [
46+
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
47+
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
48+
]}
49+
secrets: inherit
50+
51+
linux-focal-rocm6_3-py3_10-inductor-test:
52+
permissions:
53+
id-token: write
54+
contents: read
55+
name: rocm6.3-py3.10-inductor
56+
uses: ./.github/workflows/_rocm-test.yml
57+
needs: linux-focal-rocm6_3-py3_10-inductor-build
58+
with:
59+
build-environment: linux-focal-rocm6.3-py3.10
60+
docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.docker-image }}
61+
test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.test-matrix }}
62+
secrets: inherit

.github/workflows/rocm-mi300.yml

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,14 @@ concurrency:
1515
permissions: read-all
1616

1717
jobs:
18+
target-determination:
19+
if: github.repository_owner == 'pytorch'
20+
name: before-test
21+
uses: ./.github/workflows/target_determination.yml
22+
permissions:
23+
id-token: write
24+
contents: read
25+
1826
get-label-type:
1927
name: get-label-type
2028
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
@@ -25,4 +33,38 @@ jobs:
2533
curr_branch: ${{ github.head_ref || github.ref_name }}
2634
curr_ref_type: ${{ github.ref_type }}
2735

28-
# Build and test jobs moved to unstable.yml to make CI green. Jobs will be moved back when MI300 runners are better isolated from each other.
36+
linux-focal-rocm6_3-py3_10-build:
37+
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
38+
name: linux-focal-rocm6.3-py3.10
39+
uses: ./.github/workflows/_linux-build.yml
40+
needs: get-label-type
41+
with:
42+
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
43+
build-environment: linux-focal-rocm6.3-py3.10
44+
docker-image-name: pytorch-linux-focal-rocm-n-py3
45+
sync-tag: rocm-build
46+
test-matrix: |
47+
{ include: [
48+
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
49+
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
50+
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
51+
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
52+
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
53+
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
54+
]}
55+
secrets: inherit
56+
57+
linux-focal-rocm6_3-py3_10-test:
58+
permissions:
59+
id-token: write
60+
contents: read
61+
name: linux-focal-rocm6.3-py3.10
62+
uses: ./.github/workflows/_rocm-test.yml
63+
needs:
64+
- linux-focal-rocm6_3-py3_10-build
65+
- target-determination
66+
with:
67+
build-environment: linux-focal-rocm6.3-py3.10
68+
docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }}
69+
test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }}
70+
secrets: inherit

.github/workflows/unstable.yml

Lines changed: 0 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -51,71 +51,3 @@ jobs:
5151
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
5252
curr_branch: ${{ github.head_ref || github.ref_name }}
5353
curr_ref_type: ${{ github.ref_type }}
54-
55-
# Moved from rocm-mi300.yml
56-
57-
linux-focal-rocm6_3-py3_10-build:
58-
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
59-
name: linux-focal-rocm6.3-py3.10
60-
uses: ./.github/workflows/_linux-build.yml
61-
needs: get-label-type
62-
with:
63-
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
64-
build-environment: linux-focal-rocm6.3-py3.10
65-
docker-image-name: pytorch-linux-focal-rocm-n-py3
66-
sync-tag: rocm-build
67-
test-matrix: |
68-
{ include: [
69-
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
70-
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
71-
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
72-
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
73-
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
74-
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi300.2" },
75-
]}
76-
secrets: inherit
77-
78-
linux-focal-rocm6_3-py3_10-test:
79-
permissions:
80-
id-token: write
81-
contents: read
82-
name: linux-focal-rocm6.3-py3.10
83-
uses: ./.github/workflows/_rocm-test.yml
84-
needs:
85-
- linux-focal-rocm6_3-py3_10-build
86-
- target-determination
87-
with:
88-
build-environment: linux-focal-rocm6.3-py3.10
89-
docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.docker-image }}
90-
test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-build.outputs.test-matrix }}
91-
secrets: inherit
92-
93-
# Moved from inductor-rocm-mi300.yml
94-
95-
linux-focal-rocm6_3-py3_10-inductor-build:
96-
name: rocm6.3-py3.10-inductor
97-
uses: ./.github/workflows/_linux-build.yml
98-
needs: get-label-type
99-
with:
100-
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
101-
build-environment: linux-focal-rocm6.3-py3.10
102-
docker-image-name: pytorch-linux-focal-rocm-n-py3
103-
test-matrix: |
104-
{ include: [
105-
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
106-
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.mi300.2" },
107-
]}
108-
secrets: inherit
109-
110-
linux-focal-rocm6_3-py3_10-inductor-test:
111-
permissions:
112-
id-token: write
113-
contents: read
114-
name: rocm6.3-py3.10-inductor
115-
uses: ./.github/workflows/_rocm-test.yml
116-
needs: linux-focal-rocm6_3-py3_10-inductor-build
117-
with:
118-
build-environment: linux-focal-rocm6.3-py3.10
119-
docker-image: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.docker-image }}
120-
test-matrix: ${{ needs.linux-focal-rocm6_3-py3_10-inductor-build.outputs.test-matrix }}
121-
secrets: inherit

0 commit comments

Comments
 (0)