
Commit 1c63021

gguf-py: implement byteswapping for Q4_0
This is needed to byteswap Mistral models. Also restore the original tensor shapes after byteswapping; this is not needed at the moment, but is done in case the shapes are used in the future.
1 parent 1d343b4 commit 1c63021
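
For context, a minimal stand-alone sketch of the byte-swapping rule the new Q4_0 branch below relies on: within an 18-byte block_q4_0, only the 2-byte f16 delta is a multi-byte field, so it is the only part that has to be byte-swapped; the 16 bytes of packed 4-bit quants are single bytes and endian-agnostic. Everything in this snippet (the fake block and its values) is illustrative and not taken from the commit.

import numpy as np

# Build one fake block_q4_0: 2-byte f16 delta + 16 bytes of packed 4-bit quants = 18 bytes.
block = np.zeros(18, dtype=np.uint8)
block[0:2] = np.frombuffer(np.float16(0.25).tobytes(), dtype=np.uint8)  # f16 delta
block[2:] = np.arange(16, dtype=np.uint8)                               # dummy packed quants

# Only the multi-byte f16 delta needs its bytes swapped; the packed 4-bit quants
# occupy single bytes and are unaffected by endianness.
block[0:2].view(np.uint16).byteswap(inplace=True)

print(block[:4])  # delta bytes reversed, quant bytes unchanged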

File tree

1 file changed: +47 −1 lines


gguf-py/gguf/scripts/gguf_convert_endian.py

+47 −1
@@ -35,6 +35,7 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
         if tensor.tensor_type not in (
             gguf.GGMLQuantizationType.F32,
             gguf.GGMLQuantizationType.F16,
+            gguf.GGMLQuantizationType.Q4_0,
             gguf.GGMLQuantizationType.Q8_0,
             gguf.GGMLQuantizationType.Q4_K,
             gguf.GGMLQuantizationType.Q6_K,
@@ -72,11 +73,48 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
             part.byteswap(inplace=True)

         # Byte-swap tensor data if necessary
-        if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
+        if tensor.tensor_type == gguf.GGMLQuantizationType.Q4_0:
+            # Handle Q4_0 tensor blocks (block_q4_0)
+            # Specific handling of block_q4_0 is required.
+            # Each block_q4_0 consists of an f16 delta (scaling factor) followed by 16 bytes of packed 4-bit quants (two per byte).
+
+            # first flatten structure
+            oldshape = tensor.data.shape
+            newshape = 1
+            for i in tensor.data.shape:
+                newshape *= i
+
+            tensor.data.resize(newshape)
+
+            block_size = 18 # 18 bytes = <f16 delta scaling factor> + 16 * <byte holding two 4-bit quants>
+
+            n_blocks = len(tensor.data) // block_size
+            for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
+                block_offs = block_num * block_size
+
+                # Byte-Swap f16 sized delta field
+                delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
+                delta.byteswap(inplace=True)
+
+                # The packed 4-bit quants are single bytes, so they need no byte-swapping
+                if block_num % 100000 == 0:
+                    inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
+
+            # restore old shape in case it's ever used
+            tensor.data.resize(oldshape)
+        elif tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
             # Handle Q8_0 tensor blocks (block_q8_0)
             # Specific handling of block_q8_0 is required.
             # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.

+            # first flatten structure
+            oldshape = tensor.data.shape
+            newshape = 1
+            for i in tensor.data.shape:
+                newshape *= i
+
+            tensor.data.resize(newshape)
+
             block_size = 34 # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>

             n_blocks = len(tensor.data) // block_size
@@ -91,12 +129,15 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
                 if block_num % 100000 == 0:
                     inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")

+            # restore old shape in case it's ever used
+            tensor.data.resize(oldshape)
         elif tensor.tensor_type == gguf.GGMLQuantizationType.Q4_K:
             # Handle Q4_K tensor blocks (block_q4_k)
             # Specific handling of block_q4_k is required.
             # Each block_q4_k consists of 2 f16 values followed by 140 int8 values.

             # first flatten structure
+            oldshape = tensor.data.shape
             newshape = 1
             for i in tensor.data.shape:
                 newshape *= i
@@ -119,12 +160,15 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
                 if block_num % 100000 == 0:
                     inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")

+            # restore old shape in case it's ever used
+            tensor.data.resize(oldshape)
         elif tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K:
             # Handle Q6_K tensor blocks (block_q6_k)
             # Specific handling of block_q6_k is required.
             # Each block_q6_k consists of 208 int8 values followed by 1 f16 value.

             # first flatten structure
+            oldshape = tensor.data.shape
             newshape = 1
             for i in tensor.data.shape:
                 newshape *= i
@@ -144,6 +188,8 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
                 if block_num % 100000 == 0:
                     inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")

+            # restore old shape in case it's ever used
+            tensor.data.resize(oldshape)
         else:
             # Handle other tensor types
             tensor.data.byteswap(inplace=True)
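
The other half of the commit message, restoring the original tensor shape after the block-wise pass, follows the flatten-then-restore pattern that each branch above now uses. Below is an illustrative sketch with a dummy numpy array; the names and values are not taken from the commit.

import numpy as np

# Dummy 2-D "tensor" that owns its buffer (illustrative only).
data = np.zeros((3, 4), dtype=np.uint8)

# first flatten structure
oldshape = data.shape
newshape = 1
for i in data.shape:
    newshape *= i

data.resize(newshape)   # now 1-D with shape (12,), same underlying bytes
# ... per-block byte-swapping would run here ...

# restore old shape in case it's ever used
data.resize(oldshape)   # back to (3, 4)
print(data.shape)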
