
Commit d89de09

Merge branch 'develop' into feat/early-release-isp-rs-memory
2 parents: 48f1b94 + e3f5001


54 files changed: +2396 additions, −1023 deletions

.github/workflows/e2e_test.yaml

Lines changed: 56 additions & 84 deletions
@@ -1,5 +1,5 @@
 name: e2e-tests
-on:
+on:
   pull_request:
     branches:
       - "develop"
@@ -73,68 +73,65 @@ jobs:
   training_8GPU_4DP2TP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2TP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2TP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2TPSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2TPSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2TPSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2TPSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_8GPU_4DP2PP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_8GPU_4DP2PP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU_4DP2PP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_8GPU_4DP2PP_ZB:
     runs-on: [t_cluster]
@@ -157,107 +154,82 @@ jobs:
   training_16GPU_4DP2TP2PP_MTP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_MTP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_MTP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MTP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_MSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_MSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_MSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_MSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_16GPU_4DP2TP2PP_FSP:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 15
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_16GPU_4DP2TP2PP_FSP_910B
-        if: ${{ matrix.runner == '910B' }}
+      - name: training_16GPU_4DP2TP2PP_FSP_T
+        if: ${{ matrix.runner == 't_cluster' }}
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --nproc_per_node=8 --nnodes=2 --node_rank=$RANK -m pytest -p no:cacheprovider -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command" 2 "AllReduce"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_4DP2TP2PP_FSP" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname

   training_llama2:
     strategy:
       matrix:
-        runner: [910B]
+        runner: [t_cluster]
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 20
     steps:
       - name: mask env
         run: |
           echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
           echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
       - uses: actions/checkout@v3
-      - name: training_llama2_910B
+      - name: training_llama2_T
         run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
-
-  training_internlm2:
-    strategy:
-      matrix:
-        runner: [910B]
-    runs-on: ${{ matrix.runner }}
-    timeout-minutes: 20
-    steps:
-      - name: mask env
-        run: |
-          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
-          echo "::add-mask::$path_prefix"
-          if [[ ${{ matrix.runner }} == 910B ]];then
-            sudo git clean -ffdx
-          fi
-      - uses: actions/checkout@v3
-      - name: training_internlm2_910B
-        run: |
-          jobname=EB910-${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
-          start_command='torchrun --nproc_per_node=8 --nnodes=1 -m pytest -p no:cacheprovider -v --color=yes -m "training_internlm2" ./tests/test_training/test_loss.py'
-          bash ../910B_sco.sh $jobname "$start_command"
+          source activate ${evo_env_torch21_flash2}
+          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
+          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_llama2" ./tests/test_training/test_loss.py
+          exit_code=$?
+          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname
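
The srun commands above select cases from ./tests/test_training/test_loss.py by pytest marker (the `-m "training_8GPU_4DP2TP"` style flags). As a hypothetical illustration only, not the repository's actual test code, such a marker is declared like this:

```python
import pytest

# Hypothetical sketch -- not the real contents of tests/test_training/test_loss.py.
# A test picked up by `pytest -m "training_8GPU_4DP2TP"` simply carries that marker.
@pytest.mark.training_8GPU_4DP2TP
def test_loss_4dp2tp():
    # The real test would launch the 8-GPU, 4DP x 2TP training run and compare
    # the resulting loss curve against a stored reference.
    assert True
```

Custom markers like these are normally registered under the `markers` option in pytest.ini or setup.cfg so that `-m` filtering does not warn about unknown marks.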

configs/57B_qwen2_MoE.py

Lines changed: 2 additions & 4 deletions
@@ -190,7 +190,6 @@
     weight parallel (dict):
         1. size: int, the size of weight parallel.
         2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
-        3. memory_pool: bool, enable/disable memory pool, defaults to False.
     expert parallel (dict):
         1. size: int
             * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size
@@ -201,15 +200,14 @@
     expert weight parallel (dict):
         1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size.
         2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
-        3. memory_pool: bool, enable/disable memory pool, defaults to False.
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
-    weight=dict(size=1, overlap=True, memory_pool=True),
+    weight=dict(size=1, overlap=True),
     expert=dict(size=-1, no_tp=False),
-    expert_weight=dict(size=1, overlap=True, memory_pool=True),
+    expert_weight=dict(size=1, overlap=True),
 )

 cudnn_deterministic = False

configs/7B_MoE4_sft.py

Lines changed: 24 additions & 2 deletions
@@ -103,6 +103,20 @@
     clip_grad_norm=1.0,
 )

+
+# loss config (dict):
+#     1. label_smoothing
+#     2. op_type: cross_entropy operator type, we support five types for loss computing,
+#         including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"]
+#         default is "py_vocab_parallel".
+#         "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss
+#         "apex_naive": cross_entropy from apex
+#         "py_naive": self-implemented cross_entropy
+#         "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn
+#         "py_vocab_parallel": self-implemented vocab parallel cross_entropy
+# * op_types that ends with "naive" only support parallel_output=False;
+# * if in no-GPU env, only "torch_naive" and "py_vocab_parallel" are supported.
+
 loss = dict(
     label_smoothing=0,
     moe_loss_coeff=0.1,
@@ -183,6 +197,10 @@
     weight parallel (dict):
         1. size: int, the size of weight parallel.
         2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
+        3. launch_allgather_before: str, before which module to launch the all gather communication to
+            prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'.
+            Must be used with forward_overlap_per 'layer'.
+        4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
     expert parallel (dict):
         1. size: int
             * if size <= 0, ep size equals to dp size, but if the number of experts is smaller than dp size, set ep size
@@ -193,14 +211,18 @@
     expert weight parallel (dict):
         1. size: int, the size of weight parallel for expert module, distinct with global weight parallel size.
         2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False.
+        3. launch_allgather_before: str, before which module to launch the all gather communication to
+            prefetch next layer's weight, should be in ['wqkv', 'attn', 'wo', 'w1'], defaults to 'wo'.
+            Must be used with forward_overlap_per 'layer'.
+        4. forward_overlap_per: str, all gather prefetch granularity, per 'module' or per 'layer', defaults to 'layer'.
 """
 parallel = dict(
     zero1=dict(size=-1, fsdp=False),
     tensor=dict(size=1, mode="mtp"),
     pipeline=dict(size=1, interleaved_overlap=True),
-    weight=dict(size=1, overlap=True),
+    weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
     expert=dict(size=-1, no_tp=False),
-    expert_weight=dict(size=1, overlap=True),
+    expert_weight=dict(size=1, overlap=True, launch_allgather_before="wo", forward_overlap_per="layer"),
 )

 cudnn_deterministic = False
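
The newly documented `launch_allgather_before` and `forward_overlap_per` options admit combinations other than the defaults written into this config. A minimal, untested sketch of one such variant (an assumption for illustration, not a recommendation from the commit), prefetching before the attention projection instead of before `wo`:

```python
# Illustrative variant only; option values are taken from the lists documented in
# the config comments above. Per those comments, launch_allgather_before must be
# combined with forward_overlap_per="layer".
parallel = dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=1, mode="mtp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    weight=dict(size=1, overlap=True, launch_allgather_before="wqkv", forward_overlap_per="layer"),
    expert=dict(size=-1, no_tp=False),
    expert_weight=dict(size=1, overlap=True, launch_allgather_before="wqkv", forward_overlap_per="layer"),
)
```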

configs/7B_internlm2.py

Lines changed: 15 additions & 3 deletions
@@ -98,9 +98,21 @@
     clip_grad_norm=1.0,
 )

-loss = dict(
-    label_smoothing=0,
-)
+
+# loss config (dict):
+#     1. label_smoothing
+#     2. op_type: cross_entropy operator type, we support five types for loss computing,
+#         including ["torch_naive", "apex_naive", "py_naive", "flash_vocab_parallel", "py_vocab_parallel"]
+#         default is "py_vocab_parallel".
+#         "torch_naive": cross_entropy imported from torch, i.e. torch.nn.CrossEntropyLoss
+#         "apex_naive": cross_entropy from apex
+#         "py_naive": self-implemented cross_entropy
+#         "flash_vocab_parallel": vocab parallel cross_entropy imported from flash_attn
+#         "py_vocab_parallel": self-implemented vocab parallel cross_entropy
+
+# * op_types that ends with "naive" only support parallel_output=False;
+# * if in no-GPU env, only "torch_naive" and "py_vocab_parallel" are supported.
+loss = dict(label_smoothing=0, op_type="py_vocab_parallel")

 adam = dict(
     lr=1e-4,
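
The loss comments constrain which `op_type` values are usable: the `*_naive` types require `parallel_output=False`, and without a GPU only `"torch_naive"` and `"py_vocab_parallel"` work. A hypothetical helper (not part of the repository, which just takes the string directly) that encodes those rules when building the config:

```python
import torch

def pick_loss_op_type(parallel_output: bool = True) -> str:
    """Choose a cross_entropy op_type consistent with the comments above.

    Hypothetical illustration of the documented constraints, not project code.
    """
    if not torch.cuda.is_available():
        # No-GPU environments only support these two op_types.
        return "py_vocab_parallel" if parallel_output else "torch_naive"
    # "py_vocab_parallel" is the documented default; "*_naive" op_types
    # require parallel_output=False.
    return "py_vocab_parallel" if parallel_output else "py_naive"

loss = dict(label_smoothing=0, op_type=pick_loss_op_type())
```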
