
Commit 156aa3e

Author: 白永斌 (committed)
Merge branch 'main' of https://github.com/vllm-project/vllm-ascend into eplb_refactor
* 'main' of https://github.com/vllm-project/vllm-ascend: (142 commits)
  [CI] Align multi-node nightly test parameter with corresponding tutorials document (vllm-project#5756)
  [Feature] Support for cross-attention and whisper model (vllm-project#5592)
  [Perf] Supports compute-communication overlap in the forward of sfa_v1 in the Sharded-CP feature. (vllm-project#5701)
  [main][bugfix] Fix fullgraph padding bug in mtp eagle refactor (vllm-project#5692)
  [P/D] layerwise connector supports DeepSeek-V3.2 sparse attention && Distribute transfer tasks to redundant kv_head cards (vllm-project#5722)
  adapt to minimax_m2 (vllm-project#5624)
  [Feat] flashcomm2+oshard Generalized (vllm-project#4723)
  [P/D][bugfix] Fix the PCP port mapping error issue (vllm-project#5706)
  [bugfix] Fixing KV Pool Memory Retention and Performance Degradation Issues (vllm-project#5751)
  [Doc] Add GLM4.5 GLM4.6 doc (vllm-project#5740)
  support mxfp8 quantization (qwen dense) (vllm-project#5723)
  [CI] Add Disaggregated PD Nightly Test for Qwen3-235B and Qwen3-VL-235B (vllm-project#5502)
  [Feature] GLM4.6 support mtp with fullgraph (vllm-project#5460)
  [Bugfix] Fix matmul allreduce precision issue by using original weight (vllm-project#4939)
  [Refactor] Replace the implementations of o_proj, q_b_proj, and kv_b_proj with custom_op for sharded CP (vllm-project#5698)
  [Bugfix] Fix the error when using Ascend custom operators with rank=128 (vllm-project#5394)
  [CI] Avoid lint and ut for PR push (vllm-project#5762)
  [CustomOp] support TensorList for dispatchFFNCombine (vllm-project#5665)
  [BugFix] Xlite: Bypass the padding of the graph mode in non-MTP cases to obtain the correct decode num. (vllm-project#5711)
  [CI] Accuracy issue of qwen3-next-w8a8 nightly test fix. (vllm-project#5746)
  ...
2 parents: fcb83a3 + 297f6de; commit 156aa3e

303 files changed: +18916, -7473 lines


.github/actionlint.yaml

Lines changed: 2 additions & 0 deletions
@@ -19,3 +19,5 @@ self-hosted-runner:
   - linux-amd64-cpu-8
   - linux-amd64-cpu-16
   - linux-aarch64-a3-0
+  - linux-amd64-cpu-8-hk
+  - linux-amd64-cpu-16-hk
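For context, this allow-list is how actionlint knows which custom `runs-on:` labels are valid, so the two new `-hk` labels must be registered here before workflows can target them without lint failures. A minimal sketch of a job using one of the new labels (the job name and steps are hypothetical, not part of this commit):

jobs:
  example-job:                        # hypothetical job name
    runs-on: linux-amd64-cpu-8-hk     # one of the labels registered above
    steps:
      - uses: actions/checkout@v4
      - run: echo "running on the new runner label"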

.github/workflows/_e2e_nightly_multi_node.yaml

Lines changed: 1 addition & 1 deletion
@@ -286,7 +286,7 @@ jobs:

       - name: Upload logs
         if: always()
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v6
         with:
           name: ${{ inputs.config_file_path }}-pod-logs
           path: /tmp/vllm*_logs.txt

.github/workflows/_e2e_nightly_single_node.yaml

Lines changed: 5 additions & 10 deletions
@@ -123,15 +123,14 @@ jobs:
         pip install custom_ops-1.0-cp311-cp311-linux_aarch64.whl
         . /usr/local/Ascend/ascend-toolkit/set_env.sh

-      - name: Install triton-ascend
-        if: ${{ inputs.name == 'test_custom_op' }}
+      - name: Install Ascend toolkit & triton_ascend
         shell: bash -l {0}
         run: |
-          BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20251225.run"
+          BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20260105.run"
           BISHENG_URL="https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/${BISHENG_NAME}"
           wget -O "${BISHENG_NAME}" "${BISHENG_URL}" && chmod a+x "${BISHENG_NAME}" && "./${BISHENG_NAME}" --install && rm "${BISHENG_NAME}"
-          source /usr/local/Ascend/8.5.0/bisheng_toolkit/set_env.sh
-          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20251229-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
+          export PATH=/usr/local/Ascend/tools/bishengir/bin:$PATH
+          python3 -m pip install -i https://test.pypi.org/simple/ triton-ascend==3.2.0.dev20260105

       - name: Run vllm-project/vllm-ascend test
         env:
@@ -143,10 +142,6 @@ jobs:
         run: |
           # ignore test_dispatch_ffn_combine until the test is fixed
           pytest -sv ${{ inputs.tests }} \
-            --ignore=tests/e2e/nightly/ops/test_dispatch_ffn_combine.py \
-            --ignore=tests/e2e/nightly/ops/test_fused_moe.py \
-            --ignore=tests/e2e/nightly/ops/test_rotary_embedding.py \
-            --ignore=tests/e2e/nightly/ops/test_matmul_allreduce_add_rmsnorm.py
-
+            --ignore=tests/e2e/nightly/single_node/ops/singlecard_ops/test_fused_moe.py
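The new install flow no longer sources a versioned set_env.sh; it prepends the BiSheng compiler directory to PATH and pulls triton-ascend from test PyPI instead of a pinned OBS wheel. A minimal local sanity check might look like the following (a sketch, assuming the BiSheng .run installer has already been executed and that the triton-ascend wheel exposes the usual `triton` module):

# mirror the workflow: put the BiSheng compiler tools on PATH
export PATH=/usr/local/Ascend/tools/bishengir/bin:$PATH
# confirm the installer actually populated the directory
ls /usr/local/Ascend/tools/bishengir/bin
# confirm the test-PyPI package installs and imports cleanly
python3 -m pip install -i https://test.pypi.org/simple/ triton-ascend==3.2.0.dev20260105
python3 -c "import triton; print(triton.__version__)"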

.github/workflows/_e2e_nightly_single_node_models.yaml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -104,15 +104,14 @@ jobs:
104104
pip install -r requirements-dev.txt
105105
pip install -v -e .
106106
107-
- name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
108-
if: ${{ inputs.runner == 'linux-aarch64-a2-4' && contains(inputs.model_list, 'Qwen3-Next-80B-A3B-Instruct') }}
107+
- name: Install Ascend toolkit & triton_ascend
109108
shell: bash -l {0}
110109
run: |
111-
BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20251225.run"
110+
BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20260105.run"
112111
BISHENG_URL="https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/${BISHENG_NAME}"
113112
wget -O "${BISHENG_NAME}" "${BISHENG_URL}" && chmod a+x "${BISHENG_NAME}" && "./${BISHENG_NAME}" --install && rm "${BISHENG_NAME}"
114-
source /usr/local/Ascend/8.5.0/bisheng_toolkit/set_env.sh
115-
python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20251229-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
113+
export PATH=/usr/local/Ascend/tools/bishengir/bin:$PATH
114+
python3 -m pip install -i https://test.pypi.org/simple/ triton-ascend==3.2.0.dev20260105
116115
117116
- name: Install tensorflow (for Molmo-7B-D-0924)
118117
if: ${{ inputs.runner == 'linux-aarch64-a2-1' && contains(inputs.model_list, 'Molmo-7B-D-0924') }}

.github/workflows/_e2e_test.yaml

Lines changed: 71 additions & 64 deletions
@@ -68,34 +68,23 @@ jobs:
         pip install -r requirements-dev.txt
         pip install -v -e .

-      - name: Run vllm-project/vllm-ascend test (non triton)
-        env:
-          VLLM_WORKER_MULTIPROC_METHOD: spawn
-          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
-        if: ${{ inputs.type == 'full' }}
-        run: |
-          pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_mem.py
-          pytest -sv --durations=0 tests/e2e/singlecard/test_camem.py
-
       - name: Install Ascend toolkit & triton_ascend
         shell: bash -l {0}
         run: |
-          BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20251225.run"
+          BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20260105.run"
           BISHENG_URL="https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/${BISHENG_NAME}"
           wget -O "${BISHENG_NAME}" "${BISHENG_URL}" && chmod a+x "${BISHENG_NAME}" && "./${BISHENG_NAME}" --install && rm "${BISHENG_NAME}"
-          source /usr/local/Ascend/8.5.0/bisheng_toolkit/set_env.sh
-          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20251229-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
+          export PATH=/usr/local/Ascend/tools/bishengir/bin:$PATH
+          python3 -m pip install -i https://test.pypi.org/simple/ triton-ascend==3.2.0.dev20260105

       - name: Run vllm-project/vllm-ascend test
         env:
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         if: ${{ inputs.type == 'light' }}
         run: |
-          # pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
-          # pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
-          pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
-          pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_qwen_pooling_classify_correctness
+          pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py::test_piecewise_res_consistency
+          pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py::test_qwen3_w8a8_quant

       - name: Run e2e test
         env:
@@ -105,30 +94,40 @@ jobs:
         run: |
           # We found that if running aclgraph tests in batch, it will cause AclmdlRICaptureBegin error. So we run
           # the test separately.
-
-          pytest -sv --durations=0 tests/e2e/nightly/ops/triton/
-          pytest -sv --durations=0 tests/e2e/singlecard/test_completion_with_prompt_embeds.py
+          # basic
           pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py
+          pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_mem.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_async_scheduling.py
-          pytest -sv --durations=0 tests/e2e/singlecard/test_guided_decoding.py
+          pytest -sv --durations=0 tests/e2e/singlecard/test_batch_invariant.py
+          pytest -sv --durations=0 tests/e2e/singlecard/test_camem.py
+          pytest -sv --durations=0 tests/e2e/singlecard/test_completion_with_prompt_embeds.py
+          pytest -sv --durations=0 tests/e2e/singlecard/test_cpu_offloading.py
+          # xgrammar has parameter mismatching bug, please follows: https://github.com/vllm-project/vllm-ascend/issues/5524
+          # pytest -sv --durations=0 tests/e2e/singlecard/test_guided_decoding.py
           # torch 2.8 doesn't work with lora, fix me
-          #pytest -sv --durations=0 tests/e2e/singlecard/test_ilama_lora.py
+          pytest -sv --durations=0 tests/e2e/singlecard/test_ilama_lora.py
+          pytest -sv --durations=0 tests/e2e/singlecard/test_models.py
+          pytest -sv --durations=0 tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_profile_execute_duration.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_sampler.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py
           pytest -sv --durations=0 tests/e2e/singlecard/test_xlite.py
-          pytest -sv --durations=0 tests/e2e/singlecard/test_models.py
-          pytest -sv --durations=0 tests/e2e/singlecard/pooling/
+
+          # compile
           pytest -sv --durations=0 tests/e2e/singlecard/compile/test_norm_quant_fusion.py
-          pytest -sv --durations=0 tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
-          pytest -sv --durations=0 tests/e2e/singlecard/test_cpu_offloading.py
+
+          # model_runner_v2
+          pytest -sv --durations=0 tests/e2e/singlecard/model_runner_v2/test_basic.py

-          # ------------------------------------ v1 spec decode test ------------------------------------ #
-          pytest -sv --durations=0 tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
-          pytest -sv --durations=0 tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
+          # pooling
+          pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py
+          pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_embedding.py
+          pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_scoring.py

-          pytest -sv --durations=0 tests/e2e/singlecard/model_runner_v2/test_basic.py
+          # spec_decode
+          pytest -sv --durations=0 tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
+          pytest -sv --durations=0 tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py

     e2e-2-cards:
       name: multicard-2
@@ -189,52 +188,55 @@ jobs:
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
-          pytest -sv --durations=0 tests/e2e/multicard/test_aclgraph_capture_replay.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py

       - name: Install Ascend toolkit & triton_ascend
         shell: bash -l {0}
         run: |
-          BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20251225.run"
+          BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20260105.run"
           BISHENG_URL="https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/${BISHENG_NAME}"
           wget -O "${BISHENG_NAME}" "${BISHENG_URL}" && chmod a+x "${BISHENG_NAME}" && "./${BISHENG_NAME}" --install && rm "${BISHENG_NAME}"
-          source /usr/local/Ascend/8.5.0/bisheng_toolkit/set_env.sh
-          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20251229-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
+          export PATH=/usr/local/Ascend/tools/bishengir/bin:$PATH
+          python3 -m pip install -i https://test.pypi.org/simple/ triton-ascend==3.2.0.dev20260105

       - name: Run vllm-project/vllm-ascend test (light)
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         if: ${{ inputs.type == 'light' }}
         run: |
-          pytest -sv --durations=0 tests/e2e/multicard/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep

       - name: Run vllm-project/vllm-ascend test (full)
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         if: ${{ inputs.type == 'full' }}
         run: |
-          pytest -sv --durations=0 tests/e2e/multicard/test_quantization.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_full_graph_mode.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_expert_parallel.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_external_launcher.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_single_request_aclgraph.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_data_parallel.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_expert_parallel.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_external_launcher.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_full_graph_mode.py
           # torch 2.8 doesn't work with lora, fix me
-          #pytest -sv --durations=0 tests/e2e/multicard/test_ilama_lora_tp2.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
+
           # To avoid oom, we need to run the test in a single process.
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_w4a8_dynamic_tp2
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_moe_sp_tp2
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_moe_fc2_tp2
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy_tp2
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2
-
-          pytest -sv --durations=0 tests/e2e/multicard/test_prefix_caching.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_pipeline_parallel.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_qwen3_moe.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_weight_load.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_w4a8_dynamic_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_moe_sp_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_moe_fc2_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
+
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_weight_load.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_pipeline_parallel.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_prefix_caching.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_quantization.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_qwen3_moe.py
+          # This test is broken, fix me
+          #pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_shared_expert_dp.py
+          pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_single_request_aclgraph.py

     e2e-4-cards:
       name: multicard-4
@@ -294,21 +296,26 @@ jobs:
       - name: Install Ascend toolkit & triton_ascend
         shell: bash -l {0}
         run: |
-          BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20251225.run"
+          BISHENG_NAME="Ascend-BiSheng-toolkit_aarch64_20260105.run"
           BISHENG_URL="https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/${BISHENG_NAME}"
           wget -O "${BISHENG_NAME}" "${BISHENG_URL}" && chmod a+x "${BISHENG_NAME}" && "./${BISHENG_NAME}" --install && rm "${BISHENG_NAME}"
-          source /usr/local/Ascend/8.5.0/bisheng_toolkit/set_env.sh
-          python3 -m pip install "https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20251229-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl"
+          export PATH=/usr/local/Ascend/tools/bishengir/bin:$PATH
+          python3 -m pip install -i https://test.pypi.org/simple/ triton-ascend==3.2.0.dev20260105

       - name: Run vllm-project/vllm-ascend test for V1 Engine
         working-directory: ./vllm-ascend
         env:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
         run: |
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
-          pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_kimi_k2_thinking_w4a16_tp4
-          pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel_tp2.py
-          pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_basic.py
-          pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_accuracy.py
-          pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_mtp.py
-          pytest -sv --durations=0 tests/e2e/multicard/test_qwen3_next.py
+          pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_data_parallel_tp2.py
+          pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_kimi_k2.py
+          pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_qwen3_next.py
+
+          # long_sequence
+          pytest -sv --durations=0 tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py
+          pytest -sv --durations=0 tests/e2e/multicard/4-cards/long_sequence/test_basic.py
+          pytest -sv --durations=0 tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill.py
+          pytest -sv --durations=0 tests/e2e/multicard/4-cards/long_sequence/test_mtp.py
+
+          # spec_decode
+          pytest -sv --durations=0 tests/e2e/multicard/4-cards/spec_decode/test_mtp_qwen3_next.py
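Taken together, these hunks move the multicard suites under 2-cards/ and 4-cards/ directories and regroup the singlecard run by area (basic, compile, model_runner_v2, pooling, spec_decode). To reproduce one of the CI selections outside the workflow, a sketch (assuming a checkout of vllm-ascend installed with its dev requirements on a machine with Ascend NPUs, as the steps above set up):

# mirror the workflow environment
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
# run the same light-type selection the 2-cards job uses
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep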
