@@ -2991,7 +2991,7 @@ static void llm_load_tensors(
                 } break;
             case LLM_ARCH_STABLELM:
                 {
-                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
                     // output
                     {
@@ -3002,12 +3002,12 @@ static void llm_load_tensors(
                     // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                     // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                    backend_norm = llama_backend_offload;
+                    backend_norm = GGML_BACKEND_GPU;
 #else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
+                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : GGML_BACKEND_GPU;
 #endif // _WIN32
 
-                    backend_output = llama_backend_offload_split;
+                    backend_output = GGML_BACKEND_GPU_SPLIT;
                 } else {
                     backend_norm   = GGML_BACKEND_CPU;
                     backend_output = GGML_BACKEND_CPU;
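
Context for this hunk: GGML_BACKEND_CPU, GGML_BACKEND_GPU, and GGML_BACKEND_GPU_SPLIT are ggml_backend_type values; GPU keeps a tensor whole on the main device, while GPU_SPLIT splits its rows across all devices. A minimal sketch of the norm-offload rule being restored here, with pick_norm_backend as a hypothetical helper and the enum values an assumption based on ggml.h of this era:

    // Hypothetical sketch; the enum values are an assumption, not quoted from this commit.
    enum ggml_backend_type {
        GGML_BACKEND_CPU       = 0,
        GGML_BACKEND_GPU       = 10, // whole tensor on the main GPU
        GGML_BACKEND_GPU_SPLIT = 20, // rows split across all GPUs
    };

    static enum ggml_backend_type pick_norm_backend(int n_gpu_layers, int n_layer) {
    #ifndef _WIN32
        return GGML_BACKEND_GPU;   // keeping the norm in VRAM avoids extra copies
    #else
        // on Windows, offload the norm only when (almost) everything is on the GPU
        return n_gpu_layers <= n_layer + 2 ? GGML_BACKEND_CPU : GGML_BACKEND_GPU;
    #endif
    }
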
@@ -3035,8 +3035,8 @@ static void llm_load_tensors(
                     /*
                     llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
                     */
-                    const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+                    const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_GPU; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_GPU_SPLIT; // NOLINT
 
                     auto & layer = model.layers[i];
 
@@ -3051,15 +3051,15 @@ static void llm_load_tensors(
                     layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
                     layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);
 
-                    layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
-                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
-                    layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+                    layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
 
                     if (backend == GGML_BACKEND_GPU) {
                         vram_weights +=
                             ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                             ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                            ggml_nbytes(layer.ffn_gate)  + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                            ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
                     }
                 }
             } break;
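
For reference, the per-layer placement above keys off i_gpu_start; in llama.cpp of this era it is derived from n_layer - n_gpu_layers (stated here as an assumption), so only the last n_gpu_layers layers are offloaded. A hypothetical sketch of the rule, with layer_backend as an invented helper:

    #include <algorithm>

    // Hypothetical sketch; the i_gpu_start derivation is an assumption.
    static enum ggml_backend_type layer_backend(int i, int n_layer, int n_gpu_layers, bool split) {
        const int i_gpu_start = std::max(n_layer - n_gpu_layers, 0);
        if (i < i_gpu_start) {
            return GGML_BACKEND_CPU; // early layers stay on the CPU
        }
        // offloaded layers: large matmul weights can be row-split across devices
        return split ? GGML_BACKEND_GPU_SPLIT : GGML_BACKEND_GPU;
    }
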
@@ -5943,7 +5943,7 @@ struct ggml_cgraph * build_stablelm() {
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
-        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embeddings, cb);
         cb(inpL, "inp_embd", -1);
 
         // inp_pos - contains the positions
@@ -6076,9 +6076,9 @@ struct ggml_cgraph * build_stablelm() {
             cb(cur, "ffn_norm", il);
 
             cur = llm_build_ffn(ctx0, cur,
-                    model.layers[il].ffn_up,   NULL,
-                    model.layers[il].ffn_gate, NULL,
-                    model.layers[il].ffn_down, NULL,
+                    model.layers[il].w3, NULL,
+                    model.layers[il].w1, NULL,
+                    model.layers[il].w2, NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
             cb(cur, "ffn_out", il);
         }
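
The llm_build_ffn call above with LLM_FFN_SILU and LLM_FFN_PAR builds the LLaMA-style gated feed-forward, down(silu(gate(x)) * up(x)); after the rename, w1 is the gate, w3 the up projection, and w2 the down projection. A minimal sketch of the graph it expands to, with ffn_silu_par as a hypothetical helper rather than the actual llm_build_ffn body:

    #include "ggml.h"

    // Hypothetical helper: cur = w2 * (silu(w1 * cur) * (w3 * cur))
    static struct ggml_tensor * ffn_silu_par(
            struct ggml_context * ctx, struct ggml_tensor * cur,
            struct ggml_tensor * w1,   // gate: {n_embd, n_ff}
            struct ggml_tensor * w2,   // down: {n_ff, n_embd}
            struct ggml_tensor * w3) { // up:   {n_embd, n_ff}
        struct ggml_tensor * gate = ggml_silu(ctx, ggml_mul_mat(ctx, w1, cur)); // {n_ff, n_tokens}
        struct ggml_tensor * up   = ggml_mul_mat(ctx, w3, cur);                 // parallel branch
        cur = ggml_mul(ctx, gate, up);                                          // elementwise gating
        return ggml_mul_mat(ctx, w2, cur);                                      // back to {n_embd, n_tokens}
    }
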