Skip to content

Commit 1e58ee1

Browse files
authored
ggml : optimize Q4_0 into Q4_0_X_Y repack (#10324)
1 parent 89e4caa commit 1e58ee1

File tree

2 files changed

+86
-28
lines changed

2 files changed

+86
-28
lines changed

ggml/src/ggml-aarch64.c

+43 -14
Original file line number | Diff line number | Diff line change
@@ -8,19 +8,42 @@
88

99
#define UNUSED GGML_UNUSED
1010

11-
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
11+
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
1212
block_q4_0x4 out;
1313

1414
for (int i = 0; i < 4; i++) {
1515
out.d[i] = in[i].d;
1616
}
1717

18-
for (int i = 0; i < QK4_0 * 2; i++) {
19-
int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
20-
int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
21-
src_offset += (i % blck_size_interleave);
18+
const int end = QK4_0 * 2 / blck_size_interleave;
2219

23-
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
20+
if (blck_size_interleave == 8) {
21+
const uint64_t xor_mask = 0x8888888888888888ULL;
22+
for (int i = 0; i < end; ++i) {
23+
int src_id = i % 4;
24+
int src_offset = (i / 4) * blck_size_interleave;
25+
int dst_offset = i * blck_size_interleave;
26+
27+
uint64_t elems;
28+
// Using memcpy to avoid unaligned memory accesses
29+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
30+
elems ^= xor_mask;
31+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
32+
}
33+
} else if (blck_size_interleave == 4) {
34+
const uint32_t xor_mask = 0x88888888;
35+
for (int i = 0; i < end; ++i) {
36+
int src_id = i % 4;
37+
int src_offset = (i / 4) * blck_size_interleave;
38+
int dst_offset = i * blck_size_interleave;
39+
40+
uint32_t elems;
41+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
42+
elems ^= xor_mask;
43+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
44+
}
45+
} else {
46+
GGML_ASSERT(false);
2447
}
2548

2649
return out;
@@ -30,19 +53,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
3053
// returns an interleaved block_q4_0x8
3154
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
3255
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
33-
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
56+
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
3457
block_q4_0x8 out;
3558

3659
for (int i = 0; i < 8; i++) {
3760
out.d[i] = in[i].d;
3861
}
3962

40-
for (int i = 0; i < QK4_0 * 4; i++) {
41-
int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
42-
int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
43-
src_offset += (i % blck_size_interleave);
63+
const int end = QK4_0 * 4 / blck_size_interleave;
64+
const uint64_t xor_mask = 0x8888888888888888ULL;
65+
66+
for (int i = 0; i < end; ++i) {
67+
int src_id = i % 8;
68+
int src_offset = (i / 8) * blck_size_interleave;
69+
int dst_offset = i * blck_size_interleave;
4470

45-
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
71+
uint64_t elems;
72+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
73+
elems ^= xor_mask;
74+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
4675
}
4776

4877
return out;
@@ -71,11 +100,11 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
71100
}
72101

73102
if (nrows_interleaved == 8) {
74-
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave, 0x88);
103+
*(block_q4_0x8 *) out_ptr = make_block_q4_0x8(dst_tmp, blck_size_interleave);
75104
out_ptr = (block_q4_0x8 *) out_ptr + 1;
76105
}
77106
else if (nrows_interleaved == 4) {
78-
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
107+
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave);
79108
out_ptr = (block_q4_0x4 *) out_ptr + 1;
80109
}
81110
}

ggml/src/ggml-cpu/ggml-cpu-aarch64.c

+43 -14
Original file line number | Diff line number | Diff line change
@@ -3387,19 +3387,42 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
33873387
}
33883388

33893389
// FIXME: this code is duplicated from ggml-aarch64.c
3390-
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
3390+
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
33913391
block_q4_0x4 out;
33923392

33933393
for (int i = 0; i < 4; i++) {
33943394
out.d[i] = in[i].d;
33953395
}
33963396

3397-
for (int i = 0; i < QK4_0 * 2; i++) {
3398-
int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
3399-
int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
3400-
src_offset += (i % blck_size_interleave);
3397+
const int end = QK4_0 * 2 / blck_size_interleave;
34013398

3402-
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
3399+
if (blck_size_interleave == 8) {
3400+
const uint64_t xor_mask = 0x8888888888888888ULL;
3401+
for (int i = 0; i < end; ++i) {
3402+
int src_id = i % 4;
3403+
int src_offset = (i / 4) * blck_size_interleave;
3404+
int dst_offset = i * blck_size_interleave;
3405+
3406+
uint64_t elems;
3407+
// Using memcpy to avoid unaligned memory accesses
3408+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
3409+
elems ^= xor_mask;
3410+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
3411+
}
3412+
} else if (blck_size_interleave == 4) {
3413+
const uint32_t xor_mask = 0x88888888;
3414+
for (int i = 0; i < end; ++i) {
3415+
int src_id = i % 4;
3416+
int src_offset = (i / 4) * blck_size_interleave;
3417+
int dst_offset = i * blck_size_interleave;
3418+
3419+
uint32_t elems;
3420+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
3421+
elems ^= xor_mask;
3422+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
3423+
}
3424+
} else {
3425+
GGML_ASSERT(false);
34033426
}
34043427

34053428
return out;
@@ -3409,19 +3432,25 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
34093432
// returns an interleaved block_q4_0x8
34103433
// in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
34113434
// first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
3412-
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
3435+
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
34133436
block_q4_0x8 out;
34143437

34153438
for (int i = 0; i < 8; i++) {
34163439
out.d[i] = in[i].d;
34173440
}
34183441

3419-
for (int i = 0; i < QK4_0 * 4; i++) {
3420-
int src_offset = (i / (8 * blck_size_interleave)) * blck_size_interleave;
3421-
int src_id = (i % (8 * blck_size_interleave)) / blck_size_interleave;
3422-
src_offset += (i % blck_size_interleave);
3442+
const int end = QK4_0 * 4 / blck_size_interleave;
3443+
const uint64_t xor_mask = 0x8888888888888888ULL;
3444+
3445+
for (int i = 0; i < end; ++i) {
3446+
int src_id = i % 8;
3447+
int src_offset = (i / 8) * blck_size_interleave;
3448+
int dst_offset = i * blck_size_interleave;
34233449

3424-
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
3450+
uint64_t elems;
3451+
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
3452+
elems ^= xor_mask;
3453+
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
34253454
}
34263455

34273456
return out;
@@ -3449,7 +3478,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
34493478
for (int i = 0; i < nrows_interleaved; i++) {
34503479
dst_tmp[i] = src[x + i * nblocks];
34513480
}
3452-
*dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
3481+
*dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
34533482
}
34543483
src += nrows_interleaved * nblocks;
34553484
}
@@ -3480,7 +3509,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block,
34803509
for (int i = 0; i < nrows_interleaved; i++ ) {
34813510
dst_tmp[i] = src[x + i * nblocks];
34823511
}
3483-
*dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
3512+
*dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
34843513
}
34853514
src += nrows_interleaved * nblocks;
34863515
}

0 commit comments

Comments
 (0)