 name: e2e-tests
-on:
+on:
   pull_request:
     branches:
       - "develop"
@@ -73,68 +73,65 @@ jobs:
   training_8GPU_4DP2TP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
-        if [[ ${{ matrix.runner }} == 910B ]];then
-          sudo git clean -ffdx
-        fi
     - uses: actions/checkout@v3
-    - name: training_8GPU_4DP2TP_910B
-      if: ${{ matrix.runner == '910B' }}
+    - name: training_8GPU_4DP2TP_T
+      if: ${{ matrix.runner == 't_cluster' }}
       run: |
-        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
-        bash ../910B_sco.sh $jobname "$start_command"
+        source activate ${evo_env_torch21_flash2}
+        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
+        exit_code=$?
+        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2TPSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
-        if [[ ${{ matrix.runner }} == 910B ]];then
-          sudo git clean -ffdx
-        fi
     - uses: actions/checkout@v3
-    - name: training_8GPU_4DP2TPSP_910B
-      if: ${{ matrix.runner == '910B' }}
+    - name: training_8GPU_4DP2TPSP_T
+      if: ${{ matrix.runner == 't_cluster' }}
       run: |
-        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
-        bash ../910B_sco.sh $jobname "$start_command"
+        source activate ${evo_env_torch21_flash2}
+        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
+        exit_code=$?
+        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
-        if [[ ${{ matrix.runner }} == 910B ]];then
-          sudo git clean -ffdx
-        fi
     - uses: actions/checkout@v3
-    - name: training_8GPU_4DP2PP_910B
-      if: ${{ matrix.runner == '910B' }}
+    - name: training_8GPU_4DP2PP_T
+      if: ${{ matrix.runner == 't_cluster' }}
       run: |
-        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
-        bash ../910B_sco.sh $jobname "$start_command"
+        source activate ${evo_env_torch21_flash2}
+        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
+        exit_code=$?
+        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP_ZB:
     runs-on: [t_cluster]
@@ -157,107 +154,82 @@ jobs:
   training_16GPU_4DP2TP2PP_MTP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
-        if [[ ${{ matrix.runner }} == 910B ]];then
-          sudo git clean -ffdx
-        fi
     - uses: actions/checkout@v3
-    - name: training_16GPU_4DP2TP2PP_MTP_910B
-      if: ${{ matrix.runner == '910B' }}
+    - name: training_16GPU_4DP2TP2PP_MTP_T
+      if: ${{ matrix.runner == 't_cluster' }}
       run: |
-        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
-        bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+        source activate ${evo_env_torch21_flash2}
+        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
+        exit_code=$?
+        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_MSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
-        if [[ ${{ matrix.runner }} == 910B ]];then
-          sudo git clean -ffdx
-        fi
     - uses: actions/checkout@v3
-    - name: training_16GPU_4DP2TP2PP_MSP_910B
-      if: ${{ matrix.runner == '910B' }}
+    - name: training_16GPU_4DP2TP2PP_MSP_T
+      if: ${{ matrix.runner == 't_cluster' }}
       run: |
-        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
-        bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+        source activate ${evo_env_torch21_flash2}
+        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
+        exit_code=$?
+        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_FSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
-        if [[ ${{ matrix.runner }} == 910B ]];then
-          sudo git clean -ffdx
-        fi
     - uses: actions/checkout@v3
-    - name: training_16GPU_4DP2TP2PP_FSP_910B
-      if: ${{ matrix.runner == '910B' }}
+    - name: training_16GPU_4DP2TP2PP_FSP_T
+      if: ${{ matrix.runner == 't_cluster' }}
       run: |
-        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
-        bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+        source activate ${evo_env_torch21_flash2}
+        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
+        exit_code=$?
+        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_llama2:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 20
     steps:
     - name: mask env
       run: |
         echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
         echo "::add-mask::$path_prefix"
-        if [[ ${{ matrix.runner }} == 910B ]];then
-          sudo git clean -ffdx
-        fi
     - uses: actions/checkout@v3
-    - name: training_llama2_910B
+    - name: training_llama2_T
       run: |
-        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
-        bash ../910B_sco.sh $jobname "$start_command"
-
-  training_internlm2:
-    strategy:
-      matrix:
-        runner: [910B]
-    runs-on: ${{ matrix.runner }}
-    timeout-minutes: 20
-    steps:
-    - name: mask env
-      run: |
-        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
-        echo "::add-mask::$path_prefix"
-        if [[ ${{ matrix.runner }} == 910B ]];then
-          sudo git clean -ffdx
-        fi
-    - uses: actions/checkout@v3
-    - name: training_internlm2_910B
-      run: |
-        jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-        start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
-        bash ../910B_sco.sh $jobname "$start_command"
+        source activate ${evo_env_torch21_flash2}
+        jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+        srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
+        exit_code=$?
+        sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname