@@ -30,6 +30,7 @@ class TensorNameMap:
             "rwkv.embeddings",                    # rwkv6
             "model.embeddings",                   # rwkv7
             "model.word_embeddings",              # bailingmoe
+            "language_model.model.embed_tokens",  # llama4
         ),
 
         # Token type embeddings
@@ -67,6 +68,7 @@ class TensorNameMap:
             "output_layer",            # chatglm
             "head",                    # rwkv
             "head.out",                # wavtokenizer
+            "language_model.lm_head",  # llama4
         ),
 
         # Output norm
@@ -89,6 +91,7 @@ class TensorNameMap:
             "rwkv.ln_out",                  # rwkv6
             "model.ln_out",                 # rwkv7
             "backbone.final_layer_norm",    # wavtokenizer
+            "language_model.model.norm",    # llama4
         ),
 
         # Rope frequencies
@@ -130,6 +133,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.attn_norm",                 # openelm
             "rwkv.blocks.{bid}.ln1",                              # rwkv6
             "model.layers.{bid}.ln1",                             # rwkv7
+            "language_model.model.layers.{bid}.input_layernorm",  # llama4
         ),
 
         # Attention norm 2
@@ -169,6 +173,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wq",                             # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.query",  # Grok
             "transformer.h.{bid}.attn.attention.q_proj",                   # exaone
+            "language_model.model.layers.{bid}.self_attn.q_proj",          # llama4
         ),
 
         # Attention key
@@ -183,6 +188,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wk",                           # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.key",  # Grok
             "transformer.h.{bid}.attn.attention.k_proj",                 # exaone
+            "language_model.model.layers.{bid}.self_attn.k_proj",        # llama4
         ),
 
         # Attention value
@@ -196,6 +202,7 @@ class TensorNameMap:
             "model.layers.{bid}.attention.wv",                             # internlm2
             "transformer.decoder_layer.{bid}.multi_head_attention.value",  # Grok
             "transformer.h.{bid}.attn.attention.v_proj",                   # exaone
+            "language_model.model.layers.{bid}.self_attn.v_proj",          # llama4
         ),
 
         # Attention output
@@ -222,6 +229,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.self_attention.dense",           # chatglm
             "transformer.layers.{bid}.attn.out_proj",              # openelm
             "transformer.h.{bid}.attn.attention.out_proj",         # exaone
+            "language_model.model.layers.{bid}.self_attn.o_proj",  # llama4
         ),
 
         # Attention output norm
@@ -259,6 +267,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2",                  # Grok
             "encoder.layers.{bid}.post_attention_layernorm",               # chatglm
             "transformer.layers.{bid}.ffn_norm",                           # openelm
+            "language_model.model.layers.{bid}.post_attention_layernorm",  # llama4
         ),
 
         # Post feed-forward norm
@@ -278,6 +287,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.router",                 # Grok
             "transformer.blocks.{bid}.ffn.router.layer",              # dbrx
             "model.layers.{bid}.block_sparse_moe.router.layer",       # granitemoe
+            "language_model.model.layers.{bid}.feed_forward.router",  # llama4
         ),
 
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -315,6 +325,7 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w3",                      # arctic
             "encoder.layers.{bid}.mlp.dense_h_to_4h",                  # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",                          # exaone
+            "language_model.model.layers.{bid}.feed_forward.up_proj",  # llama4
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
@@ -323,11 +334,13 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.v1",                     # dbrx
             "model.layers.{bid}.mlp.experts.up_proj",                          # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w3",                  # phimoe (merged)
+            "language_model.model.layers.{bid}.feed_forward.experts.up_proj",  # llama4
         ),
 
         MODEL_TENSOR.FFN_UP_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.up_proj",                          # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.up_proj",                         # deepseek deepseek2
+            "language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj",  # llama4
         ),
 
         # AWQ-activation gate
@@ -348,6 +361,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.linear_1",                          # refact
             "model.layers.{bid}.residual_mlp.w1",                        # arctic
             "transformer.h.{bid}.mlp.c_fc_0",                            # exaone
+            "language_model.model.layers.{bid}.feed_forward.gate_proj",  # llama4
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -356,11 +370,13 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.experts.mlp.w1",                       # dbrx
             "model.layers.{bid}.mlp.experts.gate_proj",                          # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.experts.w1",                    # phimoe (merged)
+            "language_model.model.layers.{bid}.feed_forward.experts.gate_proj",  # llama4
         ),
 
         MODEL_TENSOR.FFN_GATE_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.gate_proj",                          # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.gate_proj",                         # deepseek deepseek2
+            "language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj",  # llama4
         ),
 
         # Feed-forward down
@@ -389,6 +405,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.down_layer",                        # jina-bert-v2
             "encoder.layers.{bid}.mlp.dense_4h_to_h",                    # chatglm
             "model.layers.h.{bid}.mlp.c_proj",                           # exaone
+            "language_model.model.layers.{bid}.feed_forward.down_proj",  # llama4
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -398,11 +415,13 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.down_proj",                          # qwen2moe olmoe (merged)
             "model.layers.{bid}.block_sparse_moe.output_linear",                 # granitemoe
             "model.layers.{bid}.block_sparse_moe.experts.w2",                    # phimoe (merged)
+            "language_model.model.layers.{bid}.feed_forward.experts.down_proj",  # llama4
         ),
 
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.down_proj",                          # qwen2moe
             "model.layers.{bid}.mlp.shared_experts.down_proj",                         # deepseek deepseek2
+            "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj",  # llama4
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
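
For context, the tables patched above are what gguf-py's TensorNameMap consults during HF-to-GGUF conversion: each checkpoint tensor name is looked up with its .weight/.bias suffix stripped and rewritten to the canonical GGUF name. The snippet below is a minimal usage sketch, not part of this change; it assumes gguf-py exposes get_tensor_name_map() and a MODEL_ARCH.LLAMA4 value, and the block count of 48 is only an illustrative number, so verify these against the gguf package version that includes this diff.

import gguf

# Build the llama4 name map; the {bid} placeholders in the tables above are
# expanded for every block index below n_blocks.
# (MODEL_ARCH.LLAMA4 and n_blocks=48 are illustrative assumptions.)
tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA4, n_blocks=48)

# One of the checkpoint tensor names this diff adds a mapping for:
hf_name = "language_model.model.layers.0.self_attn.q_proj.weight"

# get_name() strips a matching suffix, resolves the base name through the
# tables above, and re-appends the suffix to the canonical GGUF name
# (expected here: "blk.0.attn_q.weight").
print(tmap.get_name(hf_name, try_suffixes=(".weight", ".bias")))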