
Commit 073cfa1

GLM4.6: support MTP with full graph
Signed-off-by: 1092626063 <[email protected]>
1 parent a539ae7 commit 073cfa1

File tree

3 files changed: +21 −7 lines changed


tests/e2e/nightly/single_node/models/test_glm4_5.py

Lines changed: 9 additions & 5 deletions
@@ -29,6 +29,7 @@
 
 TENSOR_PARALLELS = [8]
 DATA_PARALLELS = [2]
+FULL_GRAPH = [True, False]
 
 prompts = [
     "San Francisco is a",
@@ -65,11 +66,9 @@
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("dp_size", DATA_PARALLELS)
-async def test_models(
-    model: str,
-    tp_size: int,
-    dp_size: int,
-) -> None:
+@pytest.mark.parametrize("full_graph", FULL_GRAPH)
+async def test_models(model: str, tp_size: int, dp_size: int,
+                      full_graph: bool) -> None:
     port = get_open_port()
     env_dict = {"HCCL_BUFFSIZE": "1024"}
     server_args = [
@@ -91,6 +90,11 @@ async def test_models(
         "--gpu-memory-utilization",
         "0.9",
     ]
+    if full_graph:
+        server_args += [
+            "--compilation-config",
+            '{"cudagraph_capture_sizes": [1,2,4,8,16], "cudagraph_mode": "FULL_DECODE_ONLY"}'
+        ]
     request_keyword_args: dict[str, Any] = {
         **api_keyword_args,
     }
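
For reference, the --compilation-config payload above can also be assembled and passed to a hand-launched server, as in the minimal sketch below; the checkpoint path and tensor-parallel size are placeholders, not taken from this commit.

# Sketch only: launch a server with the same full-graph options as the test.
# "<glm-4.6-checkpoint>" is a placeholder model path.
import json
import subprocess

compilation_config = {
    "cudagraph_capture_sizes": [1, 2, 4, 8, 16],  # decode batch sizes to capture
    "cudagraph_mode": "FULL_DECODE_ONLY",  # full-graph capture for decode steps only
}

subprocess.run([
    "vllm", "serve", "<glm-4.6-checkpoint>",
    "--tensor-parallel-size", "8",
    "--compilation-config", json.dumps(compilation_config),
])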

vllm_ascend/quantization/quant_config.py

Lines changed: 9 additions & 1 deletion
@@ -173,7 +173,15 @@ def is_layer_skipped_ascend(
                 "are quantized. All shards of fused layers "
                 "to have the same precision.")
         else:
-            is_skipped = self.quant_description[prefix + '.weight'] == "FLOAT"
+            # NOTE: In GLM4.6, the MTP draft model shares its LM head weights
+            # with the main model. Therefore, before `load_weights()` runs, some
+            # parameter names may not include the expected prefix and may appear
+            # only with the ".head" suffix. This can trigger a load-time error,
+            # so we fall back to the "lm_head.weight" key here.
+            key = prefix + '.weight'
+            if key not in self.quant_description and ".head" in prefix:
+                key = 'lm_head.weight'
+            is_skipped = self.quant_description[key] == "FLOAT"
 
         assert is_skipped is not None
         return is_skipped
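
To see what the fallback does, here is a toy illustration; the dictionary contents and the helper name resolve_quant_key are invented for this example and are not part of the commit.

# Toy example: before load_weights() runs, the draft model's LM head may be
# visible only under a ".head" prefix, so the lookup falls back to the shared
# "lm_head.weight" entry instead of raising a KeyError.
quant_description = {
    "lm_head.weight": "FLOAT",
    "model.layers.0.self_attn.q_proj.weight": "W8A8",
}

def resolve_quant_key(prefix: str) -> str:
    key = prefix + ".weight"
    if key not in quant_description and ".head" in prefix:
        key = "lm_head.weight"  # MTP draft model shares the main model's LM head
    return key

assert resolve_quant_key("model.mtp.head") == "lm_head.weight"
assert quant_description[resolve_quant_key("model.mtp.head")] == "FLOAT"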

vllm_ascend/spec_decode/mtp_proposer.py

Lines changed: 3 additions & 1 deletion
@@ -48,7 +48,9 @@
     "DeepseekV32ForCausalLM":
     ("vllm.model_executor.models.deepseek_mtp", "DeepSeekMTP"),
     "Qwen3NextForCausalLM":
-    ("vllm.model_executor.models.qwen3_next_mtp", "Qwen3NextMTP")
+    ("vllm.model_executor.models.qwen3_next_mtp", "Qwen3NextMTP"),
+    "Glm4MoeForCausalLM": ("vllm.model_executor.models.glm4_moe_mtp",
+                           "Glm4MoeMTP")
 }
 
 
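The new entry extends the architecture-to-draft-model registry. Below is a sketch of how such a (module, class-name) pair can be resolved lazily at runtime; the function name resolve_mtp_class is invented for illustration and does not appear in the commit.

# Sketch: resolve an MTP draft-model class from a (module, class-name) pair,
# importing the module only when the architecture is actually requested.
import importlib

_MTP_REGISTRY = {
    "Glm4MoeForCausalLM":
    ("vllm.model_executor.models.glm4_moe_mtp", "Glm4MoeMTP"),
}

def resolve_mtp_class(architecture: str):
    module_name, class_name = _MTP_REGISTRY[architecture]
    module = importlib.import_module(module_name)
    return getattr(module, class_name)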
