
Commit 7e7dd6d

Feng-xiaosuozjks98 authored and committed
adapt to minimax_m2 (vllm-project#5624)
### What this PR does / why we need it?

This PR fixes Minimax model loading in the vLLM Ascend backend by:

- Adding a model type check for "minimax" and "minimax_m2" that replaces the "mlp" prefix with "block_sparse_moe"
- Implementing special handling for Minimax expert-layer naming conventions
- Adding a Minimax configuration to `packed_modules_model_mapping` for proper `qkv_proj` and `experts` module handling

Without these changes, Minimax models fail to load on Ascend devices due to incompatible layer naming and module packing.

### Does this PR introduce _any_ user-facing change?

Yes. Users can now load and run Minimax models on Ascend hardware with vLLM, enabling inference for this model family on Ascend devices.

### How was this patch tested?

Local testing:

- Verified model loading for minimax-xxx and minimax_m2-xxx model variants on Atlas 800I A2 hardware
- Tested inference with sample prompts using vLLM's OpenAI-compatible API server

Benchmark validation:

- Compared throughput and latency metrics against a GPU baseline
- Verified memory usage stays within expected limits for different batch sizes
- Tested multi-card inference scenarios with tensor parallelism

- vLLM version: v0.13.0
- vLLM main: vllm-project/vllm@8be6432

---------

Signed-off-by: Feng-xiaosuo <[email protected]>
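As a rough sketch of the prefix handling described above (a standalone illustration, not the exact vLLM Ascend code; the sample layer names below are hypothetical), the rewrite maps a Minimax MoE layer prefix onto the naming the quantization config expects:

```python
# Standalone sketch of the Minimax prefix normalization described in this PR.
# The sample prefixes below are hypothetical; real names come from the model.

def normalize_minimax_prefix(prefix: str) -> str:
    """Rename "mlp" to "block_sparse_moe" and drop per-expert indices."""
    prefix = prefix.replace("mlp", "block_sparse_moe")
    parts = prefix.split(".")
    if "experts" in parts and len(parts) > 2:
        exp_idx = parts.index("experts")
        # Cut off a numeric expert index, e.g. "experts.3.w1" -> "experts"
        if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
            prefix = ".".join(parts[:exp_idx + 1])
    return prefix

print(normalize_minimax_prefix("model.layers.0.mlp.experts.3.w1"))
# model.layers.0.block_sparse_moe.experts
print(normalize_minimax_prefix("model.layers.0.self_attn.qkv_proj"))
# model.layers.0.self_attn.qkv_proj  (unchanged)
```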
1 parent 47c2823 commit 7e7dd6d

File tree

1 file changed: +20 −0 lines changed


vllm_ascend/quantization/quant_config.py

Lines changed: 20 additions & 0 deletions
@@ -117,6 +117,18 @@ def get_quant_method(self, layer: torch.nn.Module,
                          prefix: str) -> Optional["QuantizeMethodBase"]:
         vllm_config = get_current_vllm_config()
         model_type = vllm_config.model_config.hf_text_config.model_type
+
+        if model_type in ["minimax", "minimax_m2"]:
+            prefix = prefix.replace("mlp", "block_sparse_moe")
+
+            # To adapt to minimax, modify the prefix of the model layer name
+            parts = prefix.split('.')
+            if "experts" in parts and len(parts) > 2:
+                exp_idx = parts.index("experts")
+                if exp_idx + 1 < len(parts) and parts[exp_idx + 1].isdigit():
+                    parts = parts[:exp_idx + 1]
+                    prefix = ".".join(parts)
+
         if model_type in packed_modules_model_mapping:
             self.packed_modules_mapping = packed_modules_model_mapping[
                 model_type]
@@ -312,6 +324,14 @@ def get_scaled_act_names(self) -> List[str]:
         ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
         "fused_qkv_a_proj": ["q_a_proj", "kv_a_proj_with_mqa"]
     },
+    "minimax_m2": {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "experts": ["experts.0.w1", "experts.0.w2", "experts.0.w3"]
+    }
 }