Skip to content

Commit f57fadc

Browse files
ikawrakow and Kawrakow authored
Slight quantization improvement for Q4_K and Q5_K (#5361)
* Q4_K: slightly better quantization
* Q5_K: slightly better quantization

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 2e9c0bd commit f57fadc

File tree

1 file changed

+33
-42
lines changed

1 file changed

+33
-42
lines changed

ggml-quants.c

+33 -42
Original file line number | Diff line number | Diff line change
@@ -2381,45 +2381,38 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
23812381

23822382
uint8_t L[QK_K];
23832383
uint8_t Laux[32];
2384+
uint8_t Ls[QK_K/32];
2385+
uint8_t Lm[QK_K/32];
23842386
float weights[32];
2385-
float mins[QK_K/32];
2386-
float scales[QK_K/32];
2387+
float sw[QK_K/32];
2388+
float mins[QK_K/32];
2389+
float scales[QK_K/32];
23872390

23882391
for (int i = 0; i < nb; i++) {
23892392

23902393
float sum_x2 = 0;
23912394
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2392-
float sigma2 = sum_x2/QK_K;
2395+
float sigma2 = 2*sum_x2/QK_K;
23932396
float av_x = sqrtf(sigma2);
23942397

2395-
float max_scale = 0; // as we are deducting the min, scales are always positive
2396-
float max_min = 0;
23972398
for (int j = 0; j < QK_K/32; ++j) {
23982399
if (quant_weights) {
23992400
const float * qw = quant_weights + QK_K*i + 32*j;
24002401
for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
24012402
} else {
24022403
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
24032404
}
2405+
float sumw = 0;
2406+
for (int l = 0; l < 32; ++l) sumw += weights[l];
2407+
sw[j] = sumw;
24042408
scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2405-
//scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
2406-
float scale = scales[j];
2407-
if (scale > max_scale) {
2408-
max_scale = scale;
2409-
}
2410-
float min = mins[j];
2411-
if (min > max_min) {
2412-
max_min = min;
2413-
}
24142409
}
24152410

2416-
float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2417-
float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2411+
float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
2412+
float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
24182413
for (int j = 0; j < QK_K/32; ++j) {
2419-
uint8_t ls = nearest_int(inv_scale*scales[j]);
2420-
uint8_t lm = nearest_int(inv_min*mins[j]);
2421-
ls = MIN(63, ls);
2422-
lm = MIN(63, lm);
2414+
uint8_t ls = Ls[j];
2415+
uint8_t lm = Lm[j];
24232416
if (j < 4) {
24242417
y[i].scales[j] = ls;
24252418
y[i].scales[j+4] = lm;
@@ -2429,8 +2422,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
24292422
y[i].scales[j-0] |= ((lm >> 4) << 6);
24302423
}
24312424
}
2432-
y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2433-
y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2425+
y[i].d = GGML_FP32_TO_FP16(d_block);
2426+
y[i].dmin = GGML_FP32_TO_FP16(m_block);
24342427

24352428
uint8_t sc, m;
24362429
for (int j = 0; j < QK_K/32; ++j) {
@@ -2688,43 +2681,41 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
26882681
const int nb = n_per_row / QK_K;
26892682

26902683
uint8_t L[QK_K];
2691-
float mins[QK_K/32];
2692-
float scales[QK_K/32];
2693-
float weights[32];
26942684
uint8_t Laux[32];
2685+
uint8_t Ls[QK_K/32];
2686+
uint8_t Lm[QK_K/32];
2687+
float mins[QK_K/32];
2688+
float scales[QK_K/32];
2689+
float sw[QK_K/32];
2690+
float weights[32];
26952691

26962692
for (int i = 0; i < nb; i++) {
26972693

26982694
float sum_x2 = 0;
26992695
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
2700-
float sigma2 = sum_x2/QK_K;
2696+
float sigma2 = 2*sum_x2/QK_K;
27012697
float av_x = sqrtf(sigma2);
27022698

2703-
float max_scale = 0; // as we are deducting the min, scales are always positive
2704-
float max_min = 0;
27052699
for (int j = 0; j < QK_K/32; ++j) {
27062700
if (quant_weights) {
27072701
const float * qw = quant_weights + QK_K*i + 32*j;
27082702
for (int l = 0; l < 32; ++l) weights[l] = qw[l] * sqrtf(sigma2 + x[32*j + l]*x[32*j + l]);
27092703
} else {
27102704
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
27112705
}
2706+
float sumw = 0;
2707+
for (int l = 0; l < 32; ++l) sumw += weights[l];
2708+
sw[j] = sumw;
2709+
27122710
scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
2713-
float scale = scales[j];
2714-
if (scale > max_scale) {
2715-
max_scale = scale;
2716-
}
2717-
float min = mins[j];
2718-
if (min > max_min) {
2719-
max_min = min;
2720-
}
27212711
}
27222712

2723-
float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
2724-
float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
2713+
float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
2714+
float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
2715+
27252716
for (int j = 0; j < QK_K/32; ++j) {
2726-
uint8_t ls = nearest_int(inv_scale*scales[j]);
2727-
uint8_t lm = nearest_int(inv_min*mins[j]);
2717+
uint8_t ls = Ls[j];
2718+
uint8_t lm = Lm[j];
27282719
ls = MIN(63, ls);
27292720
lm = MIN(63, lm);
27302721
if (j < 4) {
@@ -2736,8 +2727,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
27362727
y[i].scales[j-0] |= ((lm >> 4) << 6);
27372728
}
27382729
}
2739-
y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
2740-
y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
2730+
y[i].d = GGML_FP32_TO_FP16(d_block);
2731+
y[i].dmin = GGML_FP32_TO_FP16(m_block);
27412732

27422733
uint8_t sc, m;
27432734
for (int j = 0; j < QK_K/32; ++j) {

0 commit comments

Comments
 (0)