Commit ef10f34

Add gguf q4_k quantization (#2001)
* Add gguf q4_k_s quantization

  Summary: Didn't implement the algorithm to choose_qparams from gguf, since it's
  complicated, e.g.
  https://github.com/ggml-org/llama.cpp/blob/f423981ac806bf031d83784bcb47d2721bc70f97/ggml/src/ggml-quants.c#L744
  and
  https://github.com/ggml-org/llama.cpp/blob/f423981ac806bf031d83784bcb47d2721bc70f97/ggml/src/ggml-quants.c#L827C14-L827C28,
  but implemented a simple choose_qparams that can fit the gguf format:

      Q4_K: w = q * block_scale(6-bit) + block_min(6-bit)

  Test Plan: python test/prototype/test_gguf_quant.py

* fix
* test with phi4
* pre-commit run
* update
* run precommit
* format
1 parent 5802d2d commit ef10f34
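
The formula in the commit message describes the Q4_K layout: each 256-element superblock is split into blocks that each carry a 6-bit scale and a 6-bit min, plus two per-superblock float scales used to dequantize those 6-bit values. A minimal sketch of the dequantization this implies (illustrative only; the function and argument names here are assumptions, not torchao's implementation):

import torch

def dequantize_q4_k_sketch(q, block_scale_q, block_min_q, scale_scale, min_scale):
    # q: (num_blocks, 32) integer codes in [0, 15]
    # block_scale_q, block_min_q: (num_blocks,) 6-bit quantized per-block qparams
    # scale_scale, min_scale: per-superblock float scales for those 6-bit values
    block_scale = block_scale_q.float() * scale_scale  # recover per-block scale
    block_min = block_min_q.float() * min_scale        # recover per-block min
    # w = q * block_scale(6-bit) + block_min(6-bit), per the commit message
    return q.float() * block_scale.unsqueeze(-1) + block_min.unsqueeze(-1)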

File tree

7 files changed: +615 −1 lines changed

test/prototype/test_gguf_quant.py

+59 (new file)
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import torch

from torchao.prototype.quantization.gguf import (
    GGUFQuantizedTensor,
    GGUFWeightOnlyConfig,
)
from torchao.quantization import quantize_
from torchao.quantization.quant_primitives import choose_qparams_gguf
from torchao.quantization.utils import compute_error


class TestGGUFQuantization(unittest.TestCase):
    def setUp(self):
        torch.manual_seed(123)
        self.input = torch.randn(2, 256, dtype=torch.float32)
        self.n_blocks_per_superblock = 8
        self.block_size = (1, 32)
        self.dtype = torch.uint4

    def test_choose_qparams_gguf(self):
        (
            super_block_scale_scale,
            super_block_min_scale,
            quantized_block_scale,
            quantized_block_min,
        ) = choose_qparams_gguf(self.input, self.block_size, self.dtype)

        self.assertEqual(super_block_scale_scale.shape, (2, 8))
        self.assertEqual(super_block_min_scale.shape, (2, 8))
        self.assertEqual(quantized_block_scale.shape, (2, 32))

    def test_gguf_quantized_tensor_from_float(self):
        gqt = GGUFQuantizedTensor.from_float(
            self.input,
            self.n_blocks_per_superblock,
            self.dtype,
        )

        dequant = gqt.dequantize()

        sqnr = compute_error(dequant, self.input)
        self.assertGreater(sqnr, 30)

    def test_quantize_api(self):
        m = torch.nn.Sequential(torch.nn.Linear(256, 64))
        quantize_(m, GGUFWeightOnlyConfig())
        assert type(m[0].weight) == GGUFQuantizedTensor


if __name__ == "__main__":
    unittest.main()
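
The 30 dB threshold above is a signal-to-quantization-noise check. As a point of reference, compute_error conventionally reports SQNR in decibels; a rough sketch of that metric (an assumption about its definition, not a copy of torchao's helper):

import torch

def sqnr_db(signal: torch.Tensor, reconstruction: torch.Tensor) -> torch.Tensor:
    # SQNR in dB: log-ratio of signal magnitude to reconstruction-error magnitude.
    return 20 * torch.log10(
        torch.linalg.norm(signal) / torch.linalg.norm(signal - reconstruction)
    )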

torchao/core/config.py

+5 −1

@@ -171,7 +171,11 @@ def config_to_dict(config: AOBaseConfig) -> Dict[str, Any]:
     return json.loads(json.dumps(config, cls=ConfigJSONEncoder))


-ALLOWED_AO_MODULES = {"torchao.quantization", "torchao.sparsity.sparse_api"}
+ALLOWED_AO_MODULES = {
+    "torchao.quantization",
+    "torchao.sparsity.sparse_api",
+    "torchao.prototype.quantization",
+}


 def config_from_dict(data: Dict[str, Any]) -> AOBaseConfig:
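
Allow-listing torchao.prototype.quantization is what lets a serialized GGUFWeightOnlyConfig be reconstructed by config_from_dict. A sketch of the round trip this enables (assuming the JSON encoder shown above handles the config's fields):

from torchao.core.config import config_from_dict, config_to_dict
from torchao.prototype.quantization import GGUFWeightOnlyConfig

config = GGUFWeightOnlyConfig(n_blocks_per_superblock=8)
data = config_to_dict(config)      # serialize to a plain dict
restored = config_from_dict(data)  # allowed now that the module is allow-listed
assert isinstance(restored, GGUFWeightOnlyConfig)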

torchao/prototype/quantization/__init__.py

+5 (new file)
from .gguf import GGUFWeightOnlyConfig

__all__ = [
    "GGUFWeightOnlyConfig",
]

torchao/prototype/quantization/gguf/__init__.py

+9 (new file)
from .api import GGUFWeightOnlyConfig
from .gguf_quantized_tensor import (
    GGUFQuantizedTensor,
)

__all__ = [
    "GGUFQuantizedTensor",
    "GGUFWeightOnlyConfig",
]

torchao/prototype/quantization/gguf/api.py

+52 (new file)
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass

import torch

from torchao.core.config import AOBaseConfig
from torchao.quantization.transform_module import register_quantize_module_handler

from .gguf_quantized_tensor import GGUFQuantizedTensor

__all__ = [
    "GGUFWeightOnlyConfig",
]


@dataclass
class GGUFWeightOnlyConfig(AOBaseConfig):
    dtype: torch.dtype = torch.uint4
    n_blocks_per_superblock: int = 8


@register_quantize_module_handler(GGUFWeightOnlyConfig)
def _gguf_weight_only_transform(
    module: torch.nn.Module,
    config: GGUFWeightOnlyConfig,
):
    """
    Applies gguf weight-only quantization to linear layers.

    Args:
        dtype: torch.uint1 through torch.uint8 and torch.int32 are supported.
        n_blocks_per_superblock: the number of blocks in a 256-element gguf
            superblock, e.g. 8 means the superblock is split into 8 blocks
            of 32 elements each.
    Returns:
        The module with its weight replaced by a GGUFQuantizedTensor; modules
        with incompatible weight shapes are returned unchanged.
    """
    weight = module.weight
    # Only quantize 2D weights whose last dimension fills whole superblocks.
    if (weight.ndim != 2) or (weight.shape[-1] % 256 != 0):
        return module

    quantized_weight = GGUFQuantizedTensor.from_float(
        weight,
        n_blocks_per_superblock=config.n_blocks_per_superblock,
        target_dtype=config.dtype,
    )
    module.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
    return module
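
End to end, the config plugs into the standard quantize_ entry point, mirroring test_quantize_api above:

import torch

from torchao.prototype.quantization import GGUFWeightOnlyConfig
from torchao.quantization import quantize_

model = torch.nn.Sequential(torch.nn.Linear(256, 64))
quantize_(model, GGUFWeightOnlyConfig())
# Weights whose last dimension is not a multiple of 256 are left unquantized,
# per the shape guard in _gguf_weight_only_transform.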
