Update llama-quant.cpp llama_tensor_get_type with DeepSeek friendly modifications #12727

Open

bartowski1182 wants to merge 9 commits into master
ggml/src/ggml-common.h: 2 changes (1 addition & 1 deletion)

@@ -368,8 +368,8 @@ typedef struct {
} block_iq3_xxs;
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");

-// 3.4375 bpw
#define IQ3S_N_SCALE QK_K/64
+// 3.4375 bpw
Comment from the Contributor Author:

Was bothering me that my IDE couldn't see the BPW from the docstring.

typedef struct {
ggml_half d;
uint8_t qs[QK_K/4];
src/llama-quant.cpp: 186 changes (177 additions & 9 deletions)

@@ -28,10 +28,22 @@ struct quantize_state_impl {
int n_ffn_down = 0;
int n_ffn_gate = 0;
int n_ffn_up = 0;
int n_ffn_down_exp = 0;
int n_ffn_gate_exp = 0;
int n_ffn_up_exp = 0;
int n_ffn_down_shexp = 0;
int n_ffn_gate_shexp = 0;
int n_ffn_up_shexp = 0;
int i_attention_wv = 0;
int i_ffn_down = 0;
int i_ffn_gate = 0;
int i_ffn_up = 0;
int i_ffn_down_exp = 0;
int i_ffn_gate_exp = 0;
int i_ffn_up_exp = 0;
int i_ffn_down_shexp = 0;
int i_ffn_gate_shexp = 0;
int i_ffn_up_shexp = 0;

int n_k_quantized = 0;
int n_fallback = 0;
@@ -119,6 +131,23 @@ static void llama_tensor_dequantize_impl(
workers.clear();
}

// Check if ftype is specifically IQ2_S or IQ2_M
static inline bool is_iq2s_or_iq2m(llama_ftype ftype) {
Comment from @bartowski1182 (Contributor Author), Apr 3, 2025:

Since this is used all over the place, made it an inline helper, happy to change it back if changes like these are unwanted (same below with is_iq1_group and get_expert_exps_type).

return ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M;
}

// Check if ftype belongs to the IQ1 group
static inline bool is_iq1_group(llama_ftype ftype) {
return ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M;
}

// Returns the appropriate type for expert _exps tensors based on ftype
static inline ggml_type get_expert_exps_type(llama_ftype ftype) {
if (is_iq1_group(ftype)) return GGML_TYPE_IQ2_XXS;
if (is_iq2s_or_iq2m(ftype)) return GGML_TYPE_IQ3_XXS;
/* otherwise */ return GGML_TYPE_IQ2_XS;
}
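
To illustrate the comment above, here is a minimal sketch (not part of the diff) of how a call site reads once the helpers exist; all identifiers mirror ones used later in this change.

// Before the refactor, the ftype check is spelled out at every call site:
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
// With the helper, the same decision reads as:
new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
// Expert (_exps) tensors pick their low-bit type from a single place:
if (qs.i_ffn_down_exp < qs.n_ffn_down_exp/8) {
new_type = get_expert_exps_type(ftype);
}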

static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
const std::string name = ggml_get_name(tensor);

@@ -175,7 +204,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
new_type = GGML_TYPE_Q2_K;
}
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+else if (is_iq2s_or_iq2m(ftype)) {
new_type = GGML_TYPE_IQ3_S;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
@@ -189,24 +218,105 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
if (name.find("attn_v.weight") != std::string::npos) {
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+else new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
++qs.i_attention_wv;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
new_type = GGML_TYPE_Q4_K;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
new_type = GGML_TYPE_Q4_K;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) {
if (qs.i_attention_wv < qs.n_attention_wv/8) {
new_type = GGML_TYPE_Q4_K;
}
else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
++qs.i_attention_wv;
}
-else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
new_type = GGML_TYPE_Q4_K;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down.weight") != std::string::npos) {
if (qs.i_ffn_down < qs.n_ffn_down/16) {
new_type = GGML_TYPE_Q4_K;
}
else if (qs.i_ffn_down < qs.n_ffn_down/8) {
new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
++qs.i_ffn_down;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate.weight") != std::string::npos) {
if (qs.i_ffn_gate < qs.n_ffn_gate/16) {
new_type = GGML_TYPE_Q4_K;
}
else if (qs.i_ffn_gate < qs.n_ffn_gate/8) {
new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
++qs.i_ffn_gate;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up.weight") != std::string::npos) {
if (qs.i_ffn_up < qs.n_ffn_up/16) {
new_type = GGML_TYPE_Q4_K;
}
else if (qs.i_ffn_up < qs.n_ffn_up/8) {
new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
++qs.i_ffn_up;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
if (qs.i_ffn_down_exp < qs.n_ffn_down_exp/8) {
new_type = get_expert_exps_type(ftype);
}
++qs.i_ffn_down_exp;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) {
if (qs.i_ffn_gate_exp < qs.n_ffn_gate_exp/8) {
new_type = get_expert_exps_type(ftype);
}
++qs.i_ffn_gate_exp;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) {
if (qs.i_ffn_up_exp < qs.n_ffn_up_exp/8) {
new_type = get_expert_exps_type(ftype);
}
++qs.i_ffn_up_exp;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
new_type = GGML_TYPE_Q4_K;
}
++qs.i_ffn_down_shexp;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
new_type = GGML_TYPE_Q4_K;
}
++qs.i_ffn_gate_shexp;
}
else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
new_type = GGML_TYPE_Q4_K;
}
++qs.i_ffn_up_shexp;
}
else if (name.find("ffn_down") != std::string::npos) {
if (qs.i_ffn_down < qs.n_ffn_down/8) {
-new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
++qs.i_ffn_down;
}
else if (name.find("attn_output.weight") != std::string::npos) {
-if (qs.model.hparams.n_expert == 8) {
-new_type = GGML_TYPE_Q5_K;
+if (qs.model.hparams.n_expert >= 8) {
+new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
-if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
-else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+if (is_iq1_group(ftype)) new_type = GGML_TYPE_IQ2_XXS;
+else if (is_iq2s_or_iq2m(ftype)) new_type = GGML_TYPE_IQ3_S;
}
}
} else if (name.find("attn_v.weight") != std::string::npos) {
@@ -266,6 +376,30 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ2_S;
}
} else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
new_type = GGML_TYPE_Q5_K;
if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
++qs.i_ffn_down_shexp;
} else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
new_type = GGML_TYPE_Q5_K;
if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
++qs.i_ffn_gate_shexp;
} else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
new_type = GGML_TYPE_Q5_K;
if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
new_type = GGML_TYPE_Q8_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
++qs.i_ffn_up_shexp;
} else if (name.find("ffn_down") != std::string::npos) {
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;
@@ -313,7 +447,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
++qs.i_ffn_down;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) {
-if (qs.model.hparams.n_expert == 8) {
+if (qs.model.hparams.n_expert >= 8) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -353,6 +487,28 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
new_type = GGML_TYPE_IQ3_XXS;
}
++qs.i_ffn_up;
} else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
new_type = GGML_TYPE_Q8_0;
} else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) {
new_type = GGML_TYPE_Q4_K;
if (qs.i_attention_wv < qs.n_attention_wv/16) {
new_type = GGML_TYPE_Q8_0;
} else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
new_type = GGML_TYPE_Q6_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
++qs.i_attention_wv;
} else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
} else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
new_type = GGML_TYPE_Q5_K;
if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
}

// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -618,6 +774,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
++qs.n_attention_wv;
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
} else if (name.find("ffn_gate_exps.weight") != std::string::npos) {
++qs.n_ffn_gate_exp;
} else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
++qs.n_ffn_gate_shexp;
} else if (name.find("ffn_down_exps.weight") != std::string::npos) {
++qs.n_ffn_down_exp;
} else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
++qs.n_ffn_down_shexp;
} else if (name.find("ffn_up_exps.weight") != std::string::npos) {
++qs.n_ffn_up_exp;
} else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
++qs.n_ffn_up_shexp;
}
}
