
Commit d89de09

Merge branch 'develop' into feat/early-release-isp-rs-memory
2 parents: 48f1b94 + e3f5001


54 files changed: +2396 additions, −1023 deletions

.github/workflows/e2e_test.yaml

Lines changed: 56 additions & 84 deletions
@@ -1,5 +1,5 @@
 name: e2e-tests
-on:
+on:
   pull_request:
     branches:
       - "develop"
@@ -73,68 +73,65 @@ jobs:
   training_8GPU_4DP2TP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2TP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2TP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2TPSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2TPSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2TPSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2PP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2PP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP_ZB:
     runs-on: [t_cluster]
@@ -157,107 +154,82 @@ jobs:
   training_16GPU_4DP2TP2PP_MTP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_MTP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_MTP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_MSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_MSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_MSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_FSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_FSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_FSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_llama2:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 20
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_llama2_910B
+      - name: training_llama2_T
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
-
-  training_internlm2:
-    strategy:
-      matrix:
-        runner: [910B]
-    runs-on: ${{ matrix.runner }}
-    timeout-minutes: 20
-    steps:
-      - name: mask env
-        run: |
-          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
-          echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
-      - uses: actions/checkout@v3
-      - name: training_internlm2_910B
-        run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
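
The srun commands above select cases from ./tests/test_training/test_loss.py by pytest marker (the `-m "training_8GPU_4DP2TP"` style flags). As a hypothetical illustration only, not the repository's actual test code, such a marker is declared like this:

```python
import pytest

# Hypothetical sketch -- not the real contents of tests/test_training/test_loss.py.
# A test picked up by `pytest -m "training_8GPU_4DP2TP"` simply carries that marker.
@pytest.mark.training_8GPU_4DP2TP
def test_loss_4dp2tp():
    # The real test would launch the 8-GPU, 4DP x 2TP training run and compare
    # the resulting loss curve against a stored reference.
    assert True
```

Custom markers like these are normally registered under the `markers` option in pytest.ini or setup.cfg so that `-m` filtering does not warn about unknown marks.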

configs/57B_qwen2_MoE.py

Lines changed: 2 additions & 4 deletions
@@ -190,7 +190,6 @@
     weight parallel (dict):
         1. size: int, the size of weight parallel.
         2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
-        3. memory_pool: bool, enable/disable memory pool, defaults to False.
     expert parallel (dict):
         1. size: int
             * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size
@@ -201,15 +200,14 @@
     expert weight parallel (dict):
         1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size.
         2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
-        3. memory_pool: bool, enable/disable memory pool, defaults to False.
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
-    weight=dict(size=1, overlap=True, memory_pool=True),
+    weight=dict(size=1, overlap=True),
     expert=dict(size=-1, no_tp=False),
-    expert_weight=dict(size=1, overlap=True, memory_pool=True),
+    expert_weight=dict(size=1, overlap=True),
 )

 cudnn_deterministic = False

configs/7B_MoE4_sft.py

Lines changed: 24 additions & 2 deletions
@@ -103,6 +103,20 @@
     clip_grad_norm=1.0,
 )

+
+# loss config (dict):
+#     1. label_smoothing
+#     2. op_type: cross_entropy operator type, we support five types for loss computing,
+#         including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"]
+#         default is "py_vocab_parallel".
+#         "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss
+#         "apex_naive": cross_entropy from apex
+#         "py_naive": self-implemented cross_entropy
+#         "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn
+#         "py_vocab_parallel": self-implemented vocab parallel cross_entropy
+# * op_types that ends with "naive" only support parallel_output=False;
+# * if in no-GPU env, only "torch_naive" and "py_vocab_parallel" are supported.
+
 loss = dict(
     label_smoothing=0,
     moe_loss_coeff=0.1,
@@ -183,6 +197,10 @@
     weight parallel (dict):
         1. size: int, the size of weight parallel.
         2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
+        3. launch_allgather_before: str, before which module to launch the all gather communication to
+            prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'.
+            Must be used with forward_overlap_per 'layer'.
+        4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
     expert parallel (dict):
         1. size: int
             * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size
@@ -193,14 +211,18 @@
     expert weight parallel (dict):
         1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size.
         2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
+        3. launch_allgather_before: str, before which module to launch the all gather communication to
+            prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'.
+            Must be used with forward_overlap_per 'layer'.
+        4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
-    weight=dict(size=1, overlap=True),
+    weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
     expert=dict(size=-1, no_tp=False),
-    expert_weight=dict(size=1, overlap=True),
+    expert_weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
 )

 cudnn_deterministic = False
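
The newly documented `launch_allgather_before` and `forward_overlap_per` options admit combinations other than the defaults written into this config. A minimal, untested sketch of one such variant (an assumption for illustration, not a recommendation from the commit), prefetching before the attention projection instead of before `wo`:

```python
# Illustrative variant only; option values are taken from the lists documented in
# the config comments above. Per those comments, launch_allgather_before must be
# combined with forward_overlap_per="layer".
parallel = dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=1, mode="mtp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    weight=dict(size=1, overlap=True, launch_allgather_before="wqkv", forward_overlap_per="layer"),
    expert=dict(size=-1, no_tp=False),
    expert_weight=dict(size=1, overlap=True, launch_allgather_before="wqkv", forward_overlap_per="layer"),
)
```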

configs/7B_internlm2.py

Lines changed: 15 additions & 3 deletions
@@ -98,9 +98,21 @@
     clip_grad_norm=1.0,
 )

-loss = dict(
-    label_smoothing=0,
-)
+
+# loss config (dict):
+#     1. label_smoothing
+#     2. op_type: cross_entropy operator type, we support five types for loss computing,
+#         including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"]
+#         default is "py_vocab_parallel".
+#         "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss
+#         "apex_naive": cross_entropy from apex
+#         "py_naive": self-implemented cross_entropy
+#         "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn
+#         "py_vocab_parallel": self-implemented vocab parallel cross_entropy
+
+# * op_types that ends with "naive" only support parallel_output=False;
+# * if in no-GPU env, only "torch_naive" and "py_vocab_parallel" are supported.
+loss = dict(label_smoothing=0, op_type="py_vocab_parallel")

 adam = dict(
     lr=1e-4,
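
The loss comments constrain which `op_type` values are usable: the `*_naive` types require `parallel_output=False`, and without a GPU only `"torch_naive"` and `"py_vocab_parallel"` work. A hypothetical helper (not part of the repository, which just takes the string directly) that encodes those rules when building the config:

```python
import torch

def pick_loss_op_type(parallel_output: bool = True) -> str:
    """Choose a cross_entropy op_type consistent with the comments above.

    Hypothetical illustration of the documented constraints, not project code.
    """
    if not torch.cuda.is_available():
        # No-GPU environments only support these two op_types.
        return "py_vocab_parallel" if parallel_output else "torch_naive"
    # "py_vocab_parallel" is the documented default; "*_naive" op_types
    # require parallel_output=False.
    return "py_vocab_parallel" if parallel_output else "py_naive"

loss = dict(label_smoothing=0, op_type=pick_loss_op_type())
```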
