Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 26 additions & 11 deletions src/ggml-cpu/ggml-cpu-aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3592,13 +3592,16 @@ static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs,
}

static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
block_q4_0x4 out;
// Zero initialize the output structure
block_q4_0x4 out = {};

// Copy d values
for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}

const int end = QK4_0 * 2 / blck_size_interleave;
const size_t qs_size = sizeof(out.qs);

if (blck_size_interleave == 8) {
const uint64_t xor_mask = 0x8888888888888888ULL;
Expand All @@ -3607,11 +3610,17 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;

uint64_t elems;
// Using memcpy to avoid unaligned memory accesses
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
// Bounds checking
if (dst_offset + sizeof(uint64_t) <= qs_size &&
src_offset + sizeof(uint64_t) <= sizeof(in[src_id].qs)) {
uint64_t elems;
// Using memcpy to avoid unaligned memory accesses
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
} else {
GGML_ASSERT(false && "buffer overflow prevented in make_block_q4_0x4");
}
}
} else if (blck_size_interleave == 4) {
const uint32_t xor_mask = 0x88888888;
Expand All @@ -3620,13 +3629,19 @@ static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_in
int src_offset = (i / 4) * blck_size_interleave;
int dst_offset = i * blck_size_interleave;

uint32_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
// Bounds checking
if (dst_offset + sizeof(uint32_t) <= qs_size &&
src_offset + sizeof(uint32_t) <= sizeof(in[src_id].qs)) {
uint32_t elems;
memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
elems ^= xor_mask;
memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
} else {
GGML_ASSERT(false && "buffer overflow prevented in make_block_q4_0x4");
}
}
} else {
GGML_ASSERT(false);
GGML_ASSERT(false && "invalid block size interleave value");
}

return out;
Expand Down