
Commit d109134

LiuYi-Up authored and zjks98 committed
[Bugfix] bugfix for the order of dummy run pad and sync (vllm-project#5777)
### What this PR does / why we need it?

This PR addresses an issue in piecewise graph mode when Multi-Token Prediction (MTP) is enabled. Specifically, the original dummy run performs the following steps in order:

1. Sync DP (input length = 1 + k)
2. Dispatch (input length = 1 + k, with padding == graph size)

However, the model execution phase uses a different sequence:

1. Padding (input length = 1, with padding)
2. Sync DP (input length = 1 + k)
3. Dispatch (input length 1 + k != graph size 1 + k, with padding)

This discrepancy means the input sizes used during model execution no longer match those expected by the dispatch graph, producing an inconsistent graph size. This PR reorders the operations during model execution to match the dummy run sequence, so the dispatch graph size stays aligned and the mismatch is resolved.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

- vLLM version: v0.13.0
- vLLM main: vllm-project/vllm@2f4e654

Signed-off-by: LiuYi-UP <[email protected]>
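For illustration only, here is a minimal, self-contained toy sketch of the corrected ordering described above. The `GRAPH_SIZES`, `dispatch`, `sync_dp`, and `run_step` names are made up for this example and are not the actual vLLM Ascend implementation:

```python
# Toy illustration of the ordering fix (hypothetical helpers, not vLLM code):
# "dispatch" rounds a token count up to the nearest captured graph size, and
# "sync_dp" pads every data-parallel rank up to the largest count in the group.

GRAPH_SIZES = [4, 8, 16, 32]  # assumed captured cudagraph batch sizes


def dispatch(num_tokens: int) -> int:
    """Pick the smallest captured graph size that fits num_tokens."""
    return min(s for s in GRAPH_SIZES if s >= num_tokens)


def sync_dp(num_tokens: int, other_ranks: list[int]) -> int:
    """DP sync: every rank pads to the largest token count in the group."""
    return max([num_tokens] + other_ranks)


def run_step(local_tokens: int, other_ranks: list[int]) -> int:
    """Fixed ordering: dispatch first, sync DP on the padded size, and
    re-dispatch only if the sync raised the token count."""
    padded = dispatch(local_tokens)        # graph size for the local batch
    synced = sync_dp(padded, other_ranks)  # agree on a size across DP ranks
    if synced != padded:                   # another rank needed more padding
        padded = dispatch(synced)          # re-dispatch to a matching graph
    return padded


if __name__ == "__main__":
    # Local rank runs 1 + k = 5 tokens with MTP; a peer rank already padded to 16.
    print(run_step(5, other_ranks=[16]))  # -> 16: dispatch and DP sync agree
```

The point of the guard is that a second dispatch happens only when the DP sync enlarges the local token count, so the selected graph size and the synced padding always agree.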
1 parent a23dfce commit d109134

File tree

1 file changed: +11 -5 lines changed


vllm_ascend/worker/model_runner_v1.py

Lines changed: 11 additions & 5 deletions
@@ -2072,10 +2072,14 @@ def _dummy_run(
         if self.is_kv_producer and not self.is_kv_consumer:
             with_prefill = True

+        has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
+        _ag_mode, batch_descriptor = \
+            self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
+
         # Padding for DP
         (num_tokens, num_tokens_across_dp,
-         with_prefill) = self._sync_metadata_across_dp(num_tokens,
-                                                        with_prefill)
+         with_prefill) = self._sync_metadata_across_dp(
+             batch_descriptor.num_tokens, with_prefill)

         # If cudagraph_mode.decode_mode() == FULL and
         # cudagraph_mode.seperate_routine(). This means that we are using
@@ -2122,9 +2126,11 @@ def _dummy_run(
         if not is_profile and self.dynamic_eplb:
             self.eplb_updator.forward_before()

-        has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
-        _ag_mode, batch_descriptor = \
-            self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
+        if num_tokens != batch_descriptor.num_tokens:
+            _ag_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
+                num_tokens=num_tokens,
+                uniform_decode=uniform_decode,
+                has_lora=has_lora)

         num_tokens_padded = batch_descriptor.num_tokens
         num_reqs_padded = (batch_descriptor.num_reqs if
