@@ -1,5 +1,5 @@
 name: e2e-tests
-on:
+on:
   pull_request:
     branches:
       - "develop"
@@ -73,68 +73,65 @@ jobs:
   training_8GPU_4DP2TP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2TP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2TP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2TPSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2TPSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2TPSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2PP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2PP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP_ZB:
     runs-on: [t_cluster]
@@ -157,107 +154,82 @@ jobs:
   training_16GPU_4DP2TP2PP_MTP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_MTP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_MTP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_MSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_MSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_MSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_FSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_FSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_FSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_llama2:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 20
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_llama2_910B
+      - name: training_llama2_T
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
-
-  training_internlm2:
-    strategy:
-      matrix:
-        runner: [910B]
-    runs-on: ${{ matrix.runner }}
-    timeout-minutes: 20
-    steps:
-      - name: mask env
-        run: |
-          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
-          echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
-      - uses: actions/checkout@v3
-      - name: training_internlm2_910B
-        run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
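
Every step added in this diff follows the same Slurm submission pattern: activate the shared conda environment, build a unique job name from the GitHub run metadata, launch pytest through srun with one GPU per task, capture the exit code, and pass it to check_slurm_cancled.sh, presumably so a cancelled Slurm job can be distinguished from a genuine test failure. Below is a minimal sketch of that shared body; MARKER and NTASKS are hypothetical placeholders, not variables used by the workflow, and each job inlines its own pytest marker and task count (8 or 16) instead.

# Sketch of the step body shared by the t_cluster test jobs above.
# MARKER and NTASKS are illustrative placeholders only.
source activate ${evo_env_torch21_flash2}
jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname \
     -n${NTASKS} --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 \
     pytest -s -v --color=yes -m "${MARKER}" ./tests/test_training/test_loss.py
exit_code=$?
sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname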