@@ -2381,45 +2381,38 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2381
2381
2382
2382
uint8_t L [QK_K ];
2383
2383
uint8_t Laux [32 ];
2384
+ uint8_t Ls [QK_K /32 ];
2385
+ uint8_t Lm [QK_K /32 ];
2384
2386
float weights [32 ];
2385
- float mins [QK_K /32 ];
2386
- float scales [QK_K /32 ];
2387
+ float sw [QK_K /32 ];
2388
+ float mins [QK_K /32 ];
2389
+ float scales [QK_K /32 ];
2387
2390
2388
2391
for (int i = 0 ; i < nb ; i ++ ) {
2389
2392
2390
2393
float sum_x2 = 0 ;
2391
2394
for (int l = 0 ; l < QK_K ; ++ l ) sum_x2 += x [l ] * x [l ];
2392
- float sigma2 = sum_x2 /QK_K ;
2395
+ float sigma2 = 2 * sum_x2 /QK_K ;
2393
2396
float av_x = sqrtf (sigma2 );
2394
2397
2395
- float max_scale = 0 ; // as we are deducting the min, scales are always positive
2396
- float max_min = 0 ;
2397
2398
for (int j = 0 ; j < QK_K /32 ; ++ j ) {
2398
2399
if (quant_weights ) {
2399
2400
const float * qw = quant_weights + QK_K * i + 32 * j ;
2400
2401
for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = qw [l ] * sqrtf (sigma2 + x [32 * j + l ]* x [32 * j + l ]);
2401
2402
} else {
2402
2403
for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = av_x + fabsf (x [32 * j + l ]);
2403
2404
}
2405
+ float sumw = 0 ;
2406
+ for (int l = 0 ; l < 32 ; ++ l ) sumw += weights [l ];
2407
+ sw [j ] = sumw ;
2404
2408
scales [j ] = make_qkx3_quants (32 , 15 , x + 32 * j , weights , L + 32 * j , & mins [j ], Laux , -0.9f , 0.05f , 36 , false);
2405
- //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
2406
- float scale = scales [j ];
2407
- if (scale > max_scale ) {
2408
- max_scale = scale ;
2409
- }
2410
- float min = mins [j ];
2411
- if (min > max_min ) {
2412
- max_min = min ;
2413
- }
2414
2409
}
2415
2410
2416
- float inv_scale = max_scale > 0 ? 63.f / max_scale : 0.f ;
2417
- float inv_min = max_min > 0 ? 63.f / max_min : 0.f ;
2411
+ float d_block = make_qp_quants ( QK_K / 32 , 63 , scales , Ls , sw ) ;
2412
+ float m_block = make_qp_quants ( QK_K / 32 , 63 , mins , Lm , sw ) ;
2418
2413
for (int j = 0 ; j < QK_K /32 ; ++ j ) {
2419
- uint8_t ls = nearest_int (inv_scale * scales [j ]);
2420
- uint8_t lm = nearest_int (inv_min * mins [j ]);
2421
- ls = MIN (63 , ls );
2422
- lm = MIN (63 , lm );
2414
+ uint8_t ls = Ls [j ];
2415
+ uint8_t lm = Lm [j ];
2423
2416
if (j < 4 ) {
2424
2417
y [i ].scales [j ] = ls ;
2425
2418
y [i ].scales [j + 4 ] = lm ;
@@ -2429,8 +2422,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2429
2422
y [i ].scales [j - 0 ] |= ((lm >> 4 ) << 6 );
2430
2423
}
2431
2424
}
2432
- y [i ].d = GGML_FP32_TO_FP16 (max_scale / 63.f );
2433
- y [i ].dmin = GGML_FP32_TO_FP16 (max_min / 63.f );
2425
+ y [i ].d = GGML_FP32_TO_FP16 (d_block );
2426
+ y [i ].dmin = GGML_FP32_TO_FP16 (m_block );
2434
2427
2435
2428
uint8_t sc , m ;
2436
2429
for (int j = 0 ; j < QK_K /32 ; ++ j ) {
@@ -2688,43 +2681,41 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2688
2681
const int nb = n_per_row / QK_K ;
2689
2682
2690
2683
uint8_t L [QK_K ];
2691
- float mins [QK_K /32 ];
2692
- float scales [QK_K /32 ];
2693
- float weights [32 ];
2694
2684
uint8_t Laux [32 ];
2685
+ uint8_t Ls [QK_K /32 ];
2686
+ uint8_t Lm [QK_K /32 ];
2687
+ float mins [QK_K /32 ];
2688
+ float scales [QK_K /32 ];
2689
+ float sw [QK_K /32 ];
2690
+ float weights [32 ];
2695
2691
2696
2692
for (int i = 0 ; i < nb ; i ++ ) {
2697
2693
2698
2694
float sum_x2 = 0 ;
2699
2695
for (int l = 0 ; l < QK_K ; ++ l ) sum_x2 += x [l ] * x [l ];
2700
- float sigma2 = sum_x2 /QK_K ;
2696
+ float sigma2 = 2 * sum_x2 /QK_K ;
2701
2697
float av_x = sqrtf (sigma2 );
2702
2698
2703
- float max_scale = 0 ; // as we are deducting the min, scales are always positive
2704
- float max_min = 0 ;
2705
2699
for (int j = 0 ; j < QK_K /32 ; ++ j ) {
2706
2700
if (quant_weights ) {
2707
2701
const float * qw = quant_weights + QK_K * i + 32 * j ;
2708
2702
for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = qw [l ] * sqrtf (sigma2 + x [32 * j + l ]* x [32 * j + l ]);
2709
2703
} else {
2710
2704
for (int l = 0 ; l < 32 ; ++ l ) weights [l ] = av_x + fabsf (x [32 * j + l ]);
2711
2705
}
2706
+ float sumw = 0 ;
2707
+ for (int l = 0 ; l < 32 ; ++ l ) sumw += weights [l ];
2708
+ sw [j ] = sumw ;
2709
+
2712
2710
scales [j ] = make_qkx3_quants (32 , 31 , x + 32 * j , weights , L + 32 * j , & mins [j ], Laux , -0.9f , 0.05f , 36 , false);
2713
- float scale = scales [j ];
2714
- if (scale > max_scale ) {
2715
- max_scale = scale ;
2716
- }
2717
- float min = mins [j ];
2718
- if (min > max_min ) {
2719
- max_min = min ;
2720
- }
2721
2711
}
2722
2712
2723
- float inv_scale = max_scale > 0 ? 63.f /max_scale : 0.f ;
2724
- float inv_min = max_min > 0 ? 63.f /max_min : 0.f ;
2713
+ float d_block = make_qp_quants (QK_K /32 , 63 , scales , Ls , sw );
2714
+ float m_block = make_qp_quants (QK_K /32 , 63 , mins , Lm , sw );
2715
+
2725
2716
for (int j = 0 ; j < QK_K /32 ; ++ j ) {
2726
- uint8_t ls = nearest_int ( inv_scale * scales [j ]) ;
2727
- uint8_t lm = nearest_int ( inv_min * mins [j ]) ;
2717
+ uint8_t ls = Ls [j ];
2718
+ uint8_t lm = Lm [j ];
2728
2719
ls = MIN (63 , ls );
2729
2720
lm = MIN (63 , lm );
2730
2721
if (j < 4 ) {
@@ -2736,8 +2727,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2736
2727
y [i ].scales [j - 0 ] |= ((lm >> 4 ) << 6 );
2737
2728
}
2738
2729
}
2739
- y [i ].d = GGML_FP32_TO_FP16 (max_scale / 63.f );
2740
- y [i ].dmin = GGML_FP32_TO_FP16 (max_min / 63.f );
2730
+ y [i ].d = GGML_FP32_TO_FP16 (d_block );
2731
+ y [i ].dmin = GGML_FP32_TO_FP16 (m_block );
2741
2732
2742
2733
uint8_t sc , m ;
2743
2734
for (int j = 0 ; j < QK_K /32 ; ++ j ) {
0 commit comments