
Commit 5b1f13c

convert : proper tensor name mapping for llama4 (ggml-org#12870)
* Llama-4 mapping
* remove hacky renaming

Co-authored-by: Daniel Han <[email protected]>
1 parent 8b91d53 · commit 5b1f13c

2 files changed: +19 -4 lines changed

convert_hf_to_gguf.py

Lines changed: 0 additions & 4 deletions
@@ -1806,10 +1806,6 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        name = name.replace("language_model.", "")
-        name = name.replace("feed_forward.", "mlp.") # a bit hacky for now
-        name = name.replace(".router.weight", ".gate.weight") # a bit hacky for now
-
         # split the gate_up into gate and up
         if "gate_up_proj" in name:
             name_up = name.replace("gate_up_proj", "up_proj.weight")
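
With the ad-hoc renaming gone, the converter is expected to resolve the original Hugging Face tensor names through gguf-py's shared TensorNameMap, which the second file below extends with the "language_model." prefixes. A minimal usage sketch of that resolution path; it assumes gguf-py from this revision, that MODEL_ARCH.LLAMA4 is registered, and the block count of 48 is purely an illustrative value:

# Minimal sketch (not part of this commit): resolve a llama4 HF tensor name
# through gguf-py's TensorNameMap instead of manual str.replace() renaming.
# Assumptions: gguf-py from this revision; MODEL_ARCH.LLAMA4 exists; the block
# count (48) is only a placeholder for illustration.
import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA4, 48)

hf_name = "language_model.model.layers.0.self_attn.q_proj.weight"
new_name = tmap.get_name(hf_name, try_suffixes=(".weight", ".bias"))
print(new_name)  # expected: "blk.0.attn_q.weight"

The try_suffixes tuple lets the map match the base name and re-append ".weight" or ".bias", which mirrors the lookup the converter's map_tensor_name() helper performs.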

gguf-py/gguf/tensor_mapping.py

Lines changed: 19 additions & 0 deletions
@@ -30,6 +30,7 @@ class TensorNameMap:
             "rwkv.embeddings", # rwkv6
             "model.embeddings", # rwkv7
             "model.word_embeddings", # bailingmoe
+            "language_model.model.embed_tokens", # llama4
         ),

         # Token type embeddings
@@ -67,6 +68,7 @@ class TensorNameMap:
             "output_layer", # chatglm
             "head", # rwkv
             "head.out", # wavtokenizer
+            "language_model.lm_head", # llama4
         ),

         # Output norm
@@ -89,6 +91,7 @@ class TensorNameMap:
             "rwkv.ln_out", # rwkv6
             "model.ln_out", # rwkv7
             "backbone.final_layer_norm", # wavtokenizer
+            "language_model.model.norm", # llama4
         ),

         # Rope frequencies
@@ -130,6 +133,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.attn_norm", # openelm
             "rwkv.blocks.{bid}.ln1", # rwkv6
             "model.layers.{bid}.ln1", # rwkv7
+            "language_model.model.layers.{bid}.input_layernorm", # llama4
         ),

         # Attention norm 2
@@ -169,6 +173,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wq", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
             "transformer.h.{bid}.attn.attention.q_proj", # exaone
+            "language_model.model.layers.{bid}.self_attn.q_proj", # llama4
         ),

         # Attention key
@@ -183,6 +188,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wk", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
             "transformer.h.{bid}.attn.attention.k_proj", # exaone
+            "language_model.model.layers.{bid}.self_attn.k_proj", # llama4
         ),

         # Attention value
@@ -196,6 +202,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wv", # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
             "transformer.h.{bid}.attn.attention.v_proj", # exaone
+            "language_model.model.layers.{bid}.self_attn.v_proj", # llama4
         ),

         # Attention output
@@ -222,6 +229,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.self_attention.dense", # chatglm
             "transformer.layers.{bid}.attn.out_proj", # openelm
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
+            "language_model.model.layers.{bid}.self_attn.o_proj", # llama4
         ),

         # Attention output norm
@@ -259,6 +267,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
             "encoder.layers.{bid}.post_attention_layernorm", # chatglm
             "transformer.layers.{bid}.ffn_norm", # openelm
+            "language_model.model.layers.{bid}.post_attention_layernorm", # llama4
         ),

         # Post feed-forward norm
@@ -278,6 +287,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.router", # Grok
             "transformer.blocks.{bid}.ffn.router.layer", # dbrx
             "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
+            "language_model.model.layers.{bid}.feed_forward.router", # llama4
         ),

         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -315,6 +325,7 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w3", # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm
             "transformer.h.{bid}.mlp.c_fc_1", # exaone
+            "language_model.model.layers.{bid}.feed_forward.up_proj", # llama4
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -323,11 +334,13 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
             "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
+            "language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4
         ),

         MODEL_TENSOR.FFN_UP_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
+            "language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
         ),

         # AWQ-activation gate
@@ -348,6 +361,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.linear_1", # refact
             "model.layers.{bid}.residual_mlp.w1", # arctic
             "transformer.h.{bid}.mlp.c_fc_0", # exaone
+            "language_model.model.layers.{bid}.feed_forward.gate_proj", # llama4
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -356,11 +370,13 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
             "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
+            "language_model.model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
         ),

         MODEL_TENSOR.FFN_GATE_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
+            "language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
         ),

         # Feed-forward down
@@ -389,6 +405,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
             "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm
             "model.layers.h.{bid}.mlp.c_proj", # exaone
+            "language_model.model.layers.{bid}.feed_forward.down_proj", # llama4
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -398,11 +415,13 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
             "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
+            "language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4
         ),

         MODEL_TENSOR.FFN_DOWN_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
+            "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
         ),

         MODEL_TENSOR.ATTN_Q_NORM: (
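
Every new per-layer entry above uses the "{bid}" placeholder, so a single mapping line covers all blocks. A simplified, self-contained sketch of that expansion (not the actual TensorNameMap code; the GGUF base names on the right follow gguf-py's tensor naming, and build_llama4_block_map is a hypothetical helper name):

# Simplified illustration of how "{bid}" templates expand into per-block entries.
# Not the gguf-py implementation; target names (attn_q, ffn_gate_inp, ffn_up_exps)
# follow gguf-py's GGUF tensor naming, and the helper name is made up.
LLAMA4_BLOCK_TEMPLATES: dict[str, str] = {
    "language_model.model.layers.{bid}.self_attn.q_proj":             "blk.{bid}.attn_q",
    "language_model.model.layers.{bid}.feed_forward.router":          "blk.{bid}.ffn_gate_inp",
    "language_model.model.layers.{bid}.feed_forward.experts.up_proj": "blk.{bid}.ffn_up_exps",
}

def build_llama4_block_map(n_blocks: int) -> dict[str, str]:
    # Expand each template once per block index: one entry per layer tensor.
    mapping: dict[str, str] = {}
    for hf_template, gguf_template in LLAMA4_BLOCK_TEMPLATES.items():
        for bid in range(n_blocks):
            mapping[hf_template.format(bid=bid)] = gguf_template.format(bid=bid)
    return mapping

print(build_llama4_block_map(2)["language_model.model.layers.1.feed_forward.router"])
# -> blk.1.ffn_gate_inp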
