@@ -13,7 +13,7 @@
 from torch.nn.parameter import UninitializedParameter
 
 import vllm.envs as envs
-from vllm.config import get_current_vllm_config, ParallelConfig
+from vllm.config import ParallelConfig, get_current_vllm_config
 from vllm.distributed import (get_dp_group, get_ep_group,
                               get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
@@ -322,6 +322,7 @@ def __init__(self, moe: MoEConfig):
         super().__init__()
         self.fused_experts = fused_experts
         self.moe = moe
+
         self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
         if self.rocm_aiter_moe_enabled:
             from .rocm_aiter_fused_moe import rocm_aiter_fused_experts
@@ -501,6 +502,8 @@ def forward_cuda(
             indices_type=torch.uint32 if self.moe.use_pplx_kernels else None)
 
         if self.rocm_aiter_moe_enabled:
+            assert not apply_router_weight_on_input
+            assert expert_map is None
             return self.rocm_aiter_fused_experts(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -510,8 +513,8 @@ def forward_cuda(
                 activation=activation,
                 apply_router_weight_on_input=apply_router_weight_on_input)
         else:
-            return fused_experts(
-                a1=x,
+            return self.fused_experts(
+                hidden_states=x,
                 w1=layer.w13_weight,
                 w2=layer.w2_weight,
                 topk_weights=topk_weights,
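
Taken together, the __init__ and forward_cuda hunks above route everything through a callable stored on the layer: the non-AITER branch now calls self.fused_experts with a hidden_states keyword instead of the module-level fused_experts(a1=...). Below is a minimal, self-contained sketch of that dispatch pattern; MoEMethodSketch, fused_experts_ref, and their argument lists are illustrative stand-ins, not vLLM's actual classes or kernels.

import torch


def fused_experts_ref(hidden_states: torch.Tensor, w1: torch.Tensor,
                      w2: torch.Tensor, topk_weights: torch.Tensor,
                      topk_ids: torch.Tensor) -> torch.Tensor:
    # Stand-in for the fused MoE kernel: only the calling convention matters
    # here, so just return a tensor of the right shape.
    return torch.empty_like(hidden_states)


class MoEMethodSketch:

    def __init__(self, rocm_aiter_moe_enabled: bool = False):
        # As in the __init__ hunk: keep the expert kernel as an attribute so
        # the forward path can call self.fused_experts(...) regardless of
        # which implementation was injected.
        self.fused_experts = fused_experts_ref
        self.rocm_aiter_moe_enabled = rocm_aiter_moe_enabled

    def forward(self, x, w13, w2, topk_weights, topk_ids):
        if self.rocm_aiter_moe_enabled:
            raise NotImplementedError("AITER path not sketched here")
        # New calling convention from the diff: hidden_states=, not a1=.
        return self.fused_experts(hidden_states=x, w1=w13, w2=w2,
                                  topk_weights=topk_weights,
                                  topk_ids=topk_ids)
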
@@ -1191,8 +1194,7 @@ def select_experts(hidden_states: torch.Tensor,
                        scoring_func: str = "softmax",
                        e_score_correction_bias: Optional[torch.Tensor] = None,
                        indices_type: Optional[torch.dtype] = None):
-        from vllm.model_executor.layers.fused_moe.fused_moe import (
-            fused_topk, grouped_topk)
+        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 
         # DeekSeekv2 uses grouped_top_k
         if use_grouped_topk:
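
The hunk above narrows this local import to fused_topk only. For orientation, here is a rough plain-torch reference of the softmax top-k routing that fused_topk computes with a fused kernel; topk_softmax_ref is a stand-in written for this note, and the softmax scoring with optional renormalization is an assumption taken from the surrounding select_experts parameters.

import torch


def topk_softmax_ref(router_logits: torch.Tensor, top_k: int,
                     renormalize: bool):
    # Score each expert per token, keep the top_k experts, and optionally
    # renormalize the kept weights so they sum to 1 per token.
    scores = torch.softmax(router_logits, dim=-1)
    topk_weights, topk_ids = torch.topk(scores, top_k, dim=-1)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids
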
@@ -1228,24 +1230,6 @@ def select_experts(hidden_states: torch.Tensor,
 
         return topk_weights, topk_ids
 
-    def naive_multicast(self, x: torch.Tensor,
-                        cu_tokens_across_dp_cpu: torch.Tensor):
-        assert (len(x.shape) == 2)
-        buffer = torch.empty((cu_tokens_across_dp_cpu[-1], x.size(1)),
-                             device=x.device,
-                             dtype=x.dtype)
-
-        start = 0 if self.dp_rank == 0 else cu_tokens_across_dp_cpu[
-            self.dp_rank - 1]
-        end = cu_tokens_across_dp_cpu[self.dp_rank]
-        buffer[start:end, :].copy_(x)
-        for idx in range(get_dp_group().world_size):
-            start = 0 if idx == 0 else cu_tokens_across_dp_cpu[idx - 1]
-            end = cu_tokens_across_dp_cpu[idx]
-            get_dp_group().broadcast(buffer[start:end, :], idx)
-
-        return buffer
-
     def must_reduce_shared_expert_outputs(self) -> bool:
         """
         The shared_experts are typically computed using the RowParallelLinear
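
The last hunk removes the naive_multicast helper from this file. As a reference for what it did, here is a minimal single-process illustration of its cumulative-token indexing; the broadcast itself is only described in comments. cu_tokens_across_dp_cpu holds cumulative token counts across data-parallel ranks, so rank r owns rows [cu[r-1], cu[r]) of the gathered buffer; the concrete counts below are made up for the example.

import torch

# Example cumulative token counts for 3 DP ranks: rank 0 has 3 tokens,
# rank 1 has 2, rank 2 has 4, so the gathered buffer has 9 rows.
cu_tokens_across_dp_cpu = torch.tensor([3, 5, 9])
hidden_size = 4
buffer = torch.zeros((int(cu_tokens_across_dp_cpu[-1]), hidden_size))

for rank in range(len(cu_tokens_across_dp_cpu)):
    start = 0 if rank == 0 else int(cu_tokens_across_dp_cpu[rank - 1])
    end = int(cu_tokens_across_dp_cpu[rank])
    # In the removed helper, each rank copied its local activations into its
    # own slice and then broadcast that slice to the rest of the DP group.
    buffer[start:end, :] = rank

print(buffer[:, 0])  # rows 0-2 -> 0.0, rows 3-4 -> 1.0, rows 5-8 -> 2.0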