@@ -3387,19 +3387,42 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
3387
3387
}
3388
3388
3389
3389
// FIXME: this code is duplicated from ggml-aarch64.c
3390
- static block_q4_0x4 make_block_q4_0x4 (block_q4_0 * in , unsigned int blck_size_interleave , unsigned int xor_mask ) {
3390
+ static block_q4_0x4 make_block_q4_0x4 (block_q4_0 * in , unsigned int blck_size_interleave ) {
3391
3391
block_q4_0x4 out ;
3392
3392
3393
3393
for (int i = 0 ; i < 4 ; i ++ ) {
3394
3394
out .d [i ] = in [i ].d ;
3395
3395
}
3396
3396
3397
- for (int i = 0 ; i < QK4_0 * 2 ; i ++ ) {
3398
- int src_offset = (i / (4 * blck_size_interleave )) * blck_size_interleave ;
3399
- int src_id = (i % (4 * blck_size_interleave )) / blck_size_interleave ;
3400
- src_offset += (i % blck_size_interleave );
3397
+ const int end = QK4_0 * 2 / blck_size_interleave ;
3401
3398
3402
- out .qs [i ] = in [src_id ].qs [src_offset ] ^ xor_mask ;
3399
+ if (blck_size_interleave == 8 ) {
3400
+ const uint64_t xor_mask = 0x8888888888888888ULL ;
3401
+ for (int i = 0 ; i < end ; ++ i ) {
3402
+ int src_id = i % 4 ;
3403
+ int src_offset = (i / 4 ) * blck_size_interleave ;
3404
+ int dst_offset = i * blck_size_interleave ;
3405
+
3406
+ uint64_t elems ;
3407
+ // Using memcpy to avoid unaligned memory accesses
3408
+ memcpy (& elems , & in [src_id ].qs [src_offset ], sizeof (uint64_t ));
3409
+ elems ^= xor_mask ;
3410
+ memcpy (& out .qs [dst_offset ], & elems , sizeof (uint64_t ));
3411
+ }
3412
+ } else if (blck_size_interleave == 4 ) {
3413
+ const uint32_t xor_mask = 0x88888888 ;
3414
+ for (int i = 0 ; i < end ; ++ i ) {
3415
+ int src_id = i % 4 ;
3416
+ int src_offset = (i / 4 ) * blck_size_interleave ;
3417
+ int dst_offset = i * blck_size_interleave ;
3418
+
3419
+ uint32_t elems ;
3420
+ memcpy (& elems , & in [src_id ].qs [src_offset ], sizeof (uint32_t ));
3421
+ elems ^= xor_mask ;
3422
+ memcpy (& out .qs [dst_offset ], & elems , sizeof (uint32_t ));
3423
+ }
3424
+ } else {
3425
+ GGML_ASSERT (false);
3403
3426
}
3404
3427
3405
3428
return out ;
@@ -3409,19 +3432,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
3409
3432
// returns an interleaved block_q4_0x8
3410
3433
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
3411
3434
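// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
// NOTE(review): the loop in this function copies quants in fixed sizeof(uint64_t) (8-byte) chunks,
// so blck_size_interleave is presumably expected to be 8 here; confirm against callers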
3412
- static block_q4_0x8 make_block_q4_0x8 (block_q4_0 * in , unsigned int blck_size_interleave , unsigned int xor_mask ) {
3435
+ static block_q4_0x8 make_block_q4_0x8 (block_q4_0 * in , unsigned int blck_size_interleave ) {
3413
3436
block_q4_0x8 out ;
3414
3437
3415
3438
for (int i = 0 ; i < 8 ; i ++ ) {
3416
3439
out .d [i ] = in [i ].d ;
3417
3440
}
3418
3441
3419
- for (int i = 0 ; i < QK4_0 * 4 ; i ++ ) {
3420
- int src_offset = (i / (8 * blck_size_interleave )) * blck_size_interleave ;
3421
- int src_id = (i % (8 * blck_size_interleave )) / blck_size_interleave ;
3422
- src_offset += (i % blck_size_interleave );
3442
+ const int end = QK4_0 * 4 / blck_size_interleave ;
3443
+ const uint64_t xor_mask = 0x8888888888888888ULL ;
3444
+
3445
+ for (int i = 0 ; i < end ; ++ i ) {
3446
+ int src_id = i % 8 ;
3447
+ int src_offset = (i / 8 ) * blck_size_interleave ;
3448
+ int dst_offset = i * blck_size_interleave ;
3423
3449
3424
- out .qs [i ] = in [src_id ].qs [src_offset ] ^ xor_mask ;
3450
+ uint64_t elems ;
3451
+ memcpy (& elems , & in [src_id ].qs [src_offset ], sizeof (uint64_t ));
3452
+ elems ^= xor_mask ;
3453
+ memcpy (& out .qs [dst_offset ], & elems , sizeof (uint64_t ));
3425
3454
}
3426
3455
3427
3456
return out ;
@@ -3449,7 +3478,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
3449
3478
for (int i = 0 ; i < nrows_interleaved ; i ++ ) {
3450
3479
dst_tmp [i ] = src [x + i * nblocks ];
3451
3480
}
3452
- * dst ++ = make_block_q4_0x4 (dst_tmp , interleave_block , 0x88 );
3481
+ * dst ++ = make_block_q4_0x4 (dst_tmp , interleave_block );
3453
3482
}
3454
3483
src += nrows_interleaved * nblocks ;
3455
3484
}
@@ -3480,7 +3509,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
3480
3509
for (int i = 0 ; i < nrows_interleaved ; i ++ ) {
3481
3510
dst_tmp [i ] = src [x + i * nblocks ];
3482
3511
}
3483
- * dst ++ = make_block_q4_0x8 (dst_tmp , interleave_block , 0x88 );
3512
+ * dst ++ = make_block_q4_0x8 (dst_tmp , interleave_block );
3484
3513
}
3485
3514
src += nrows_interleaved * nblocks ;
3486
3515
}
0 commit comments