@@ -2062,10 +2062,14 @@ def _dummy_run(
         if self.is_kv_producer and not self.is_kv_consumer:
             with_prefill = True
 
+        has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
+        _ag_mode, batch_descriptor = \
+            self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
+
         # Padding for DP
         (num_tokens, num_tokens_across_dp,
-         with_prefill) = self._sync_metadata_across_dp(num_tokens,
-                                                       with_prefill)
+         with_prefill) = self._sync_metadata_across_dp(
+             batch_descriptor.num_tokens, with_prefill)
 
         # If cudagraph_mode.decode_mode() == FULL and
         # cudagraph_mode.seperate_routine(). This means that we are using
@@ -2112,9 +2116,11 @@ def _dummy_run(
         if not is_profile and self.dynamic_eplb:
             self.eplb_updator.forward_before()
 
-        has_lora = True if self.lora_config and self.compilation_config.cudagraph_specialize_lora else False
-        _ag_mode, batch_descriptor = \
-            self.cudagraph_dispatcher.dispatch(num_tokens=num_tokens, uniform_decode=uniform_decode, has_lora=has_lora)
+        if num_tokens != batch_descriptor.num_tokens:
+            _ag_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
+                num_tokens=num_tokens,
+                uniform_decode=uniform_decode,
+                has_lora=has_lora)
 
         num_tokens_padded = batch_descriptor.num_tokens
         num_reqs_padded = (batch_descriptor.num_reqs if
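Taken together, the two hunks move the cudagraph dispatch ahead of the DP metadata sync, so `_sync_metadata_across_dp` starts from the cudagraph-padded token count, and they re-dispatch only when the DP sync grows that count. A minimal, self-contained sketch of that control flow, assuming stand-in padding and sync rules (the `dispatch` and `sync_across_dp` helpers below are illustrative placeholders, not the real dispatcher or DP sync):

from dataclasses import dataclass

@dataclass
class BatchDescriptor:
    num_tokens: int

def dispatch(num_tokens: int) -> BatchDescriptor:
    # Stand-in for the cudagraph dispatcher: pad to the next multiple
    # of 8, as a cudagraph size bucket might.
    return BatchDescriptor(num_tokens=(num_tokens + 7) // 8 * 8)

def sync_across_dp(num_tokens: int, other_rank_tokens: list[int]) -> int:
    # Stand-in for the DP sync: all ranks agree on the max token count.
    return max([num_tokens, *other_rank_tokens])

def dummy_run(num_tokens: int, other_rank_tokens: list[int]) -> BatchDescriptor:
    # 1. Dispatch first, so DP padding sees the cudagraph-padded count.
    batch_descriptor = dispatch(num_tokens)
    # 2. The DP sync may raise num_tokens to the max across ranks.
    num_tokens = sync_across_dp(batch_descriptor.num_tokens,
                                other_rank_tokens)
    # 3. Re-dispatch only if the sync changed the count, keeping the
    #    descriptor consistent with the final padded size.
    if num_tokens != batch_descriptor.num_tokens:
        batch_descriptor = dispatch(num_tokens)
    return batch_descriptor

# e.g. 13 tokens pad to 16; a peer rank at 40 forces a re-dispatch to 40.
assert dummy_run(13, [40]).num_tokens == 40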