Commit 24114ce

Update
[ghstack-poisoned]
1 parent 32d9b0b commit 24114ce

7 files changed: +249 -114 lines changed

Diff for: test/quantization/test_qat.py

+1 -1

@@ -1185,7 +1185,7 @@ def test_qat_prototype_bc(self):
     @unittest.skipIf(
         not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
     )
-    def test_quantize_api(self):
+    def test_quantize_api_standalone(self):
         """
         Test that the following:


Diff for: test/quantization/test_quant_api.py

+26

@@ -40,6 +40,7 @@
     Int4WeightOnlyQuantizedLinearWeight,
     Int8WeightOnlyQuantizedLinearWeight,
 )
+from torchao.quantization.utils import compute_error
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_3,
     TORCH_VERSION_AT_LEAST_2_4,
@@ -761,6 +762,31 @@ def reset_memory():
             assert param.is_cuda
         self.assertLess(memory_streaming, memory_baseline)

+    def test_int4_weight_only_numerics(self):
+        """
+        Simple test of e2e int4_weight_only workflow, comparing numerics
+        to a bfloat16 baseline.
+        """
+        # TODO(before land) skip on cpu-only
+        # TODO(before land) support other inference techniques?
+
+        # set up inputs
+        x = torch.randn(128, 128, device="cuda", dtype=torch.bfloat16)
+        # TODO: model in float32 leads to error: https://gist.github.com/vkuzo/63b3bcd7818393021a6e3fb4ccf3c469
+        # is that expected?
+        m_ref = torch.nn.Sequential(torch.nn.Linear(128, 128)).cuda().bfloat16()
+        m_int4_wo = copy.deepcopy(m_ref)
+
+        # quantize
+        quantize_(m_int4_wo, int4_weight_only())
+
+        with torch.no_grad():
+            y_ref = m_ref(x)
+            y_int4_wo = m_int4_wo(x)
+
+        sqnr = compute_error(y_ref, y_int4_wo)
+        assert sqnr >= 20, f"SQNR {sqnr} is too low"
+

 class TestMultiTensorFlow(TestCase):
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
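
The new test gates on SQNR (signal-to-quantized-noise ratio) between the bfloat16 baseline and the int4 weight-only output, via compute_error from torchao.quantization.utils. As a rough sketch of what that threshold measures (illustrative only; torchao's own compute_error may differ in detail), SQNR in dB can be computed from the two outputs like this:

import torch

def sqnr_db(reference: torch.Tensor, candidate: torch.Tensor) -> float:
    # Illustrative helper, not the torchao implementation:
    # SQNR = 20 * log10(||reference|| / ||reference - candidate||), in dB.
    signal = torch.linalg.norm(reference.float())
    noise = torch.linalg.norm((reference - candidate).float())
    return (20 * torch.log10(signal / noise)).item()

A threshold of 20 dB, as asserted above, allows the quantization error norm to be at most one tenth of the signal norm.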

Diff for: torchao/core/__init__.py

Whitespace-only changes.

Diff for: torchao/core/config.py

+13

@@ -0,0 +1,13 @@
+import abc
+
+
+# directory location for this might need more polish
+class AOBaseWorkflowConfig(abc.ABC):
+    """
+    If a workflow config inherits from this then `quantize_` knows
+    what to do with it.
+
+    TODO write a better docblock.
+    """
+
+    pass
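
AOBaseWorkflowConfig is the marker base class that the rest of this stack dispatches on: `quantize_` recognizes any config that inherits from it. A minimal sketch of what a workflow config might look like (the class name and field below are hypothetical, not part of this commit):

from dataclasses import dataclass

from torchao.core.config import AOBaseWorkflowConfig

# Hypothetical workflow config: a plain dataclass carrying the knobs for one
# quantization workflow, marked by inheriting from AOBaseWorkflowConfig.
@dataclass
class MyInt8WeightOnlyConfig(AOBaseWorkflowConfig):
    group_size: int = 128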

Diff for: torchao/quantization/_transform_module.py

+17

@@ -0,0 +1,17 @@
+from typing import Callable, Dict
+
+import torch
+
+from torchao.core.config import AOBaseWorkflowConfig
+
+_QUANTIZE_CONFIG_HANDLER: Dict[
+    AOBaseWorkflowConfig,
+    Callable[[torch.nn.Module, AOBaseWorkflowConfig], torch.nn.Module],
+] = {}
+
+
+def register_quantize_module_handler(config_type):
+    def decorator(func):
+        _QUANTIZE_CONFIG_HANDLER[config_type] = func
+
+    return decorator
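
register_quantize_module_handler associates a config type with a module-level transform by recording it in _QUANTIZE_CONFIG_HANDLER, so `quantize_` can look up the handler for type(config) and apply it to each module. Note that, as committed here, the inner decorator does not return func, so the decorated name itself binds to None; dispatch goes through the registry dict. A minimal sketch of the intended registration and dispatch (the config and handler names below are hypothetical):

import torch

from torchao.core.config import AOBaseWorkflowConfig
from torchao.quantization._transform_module import (
    _QUANTIZE_CONFIG_HANDLER,
    register_quantize_module_handler,
)

# Hypothetical workflow config, for illustration only.
class MyWorkflowConfig(AOBaseWorkflowConfig):
    pass

# Registering a handler: the decorator records the function keyed by the
# config type in _QUANTIZE_CONFIG_HANDLER.
@register_quantize_module_handler(MyWorkflowConfig)
def _my_workflow_transform(
    module: torch.nn.Module, config: MyWorkflowConfig
) -> torch.nn.Module:
    # A real handler would swap or rewrite the module here; this one is a no-op.
    return module

# Dispatch, roughly as quantize_ would do it: look the handler up by the
# config's type and apply it to a module.
config = MyWorkflowConfig()
handler = _QUANTIZE_CONFIG_HANDLER[type(config)]
quantized = handler(torch.nn.Linear(16, 16), config)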

Diff for: torchao/quantization/qat/api.py

+68 -46

@@ -5,10 +5,14 @@
 # LICENSE file in the root directory of this source tree.

 from dataclasses import dataclass
-from typing import Any, Callable, List, Optional, Union
+from typing import Any, List, Optional, Union

 import torch

+from torchao.core.config import AOBaseWorkflowConfig
+from torchao.quantization._transform_module import (
+    register_quantize_module_handler,
+)
 from torchao.quantization.granularity import (
     Granularity,
     PerAxis,
@@ -239,12 +243,26 @@ def __setattr__(self, name: str, value: Any):
         super().__setattr__(name, value)


-def intx_quantization_aware_training(
-    activation_config: Optional[FakeQuantizeConfig] = None,
-    weight_config: Optional[FakeQuantizeConfig] = None,
-) -> Callable:
+@dataclass
+class IntXQuantizationAwareTrainingConfig(AOBaseWorkflowConfig):
+    activation_config: Optional[FakeQuantizeConfig] = None
+    weight_config: Optional[FakeQuantizeConfig] = None
+
+
+# for BC
+intx_quantization_aware_training = IntXQuantizationAwareTrainingConfig
+
+
+@register_quantize_module_handler(IntXQuantizationAwareTrainingConfig)
+def _intx_quantization_aware_training_transform(
+    module: torch.nn.Module,
+    config: IntXQuantizationAwareTrainingConfig,
+) -> torch.nn.Module:
     """
-    Return a function that applies fake quantization to a `torch.nn.Module`.
+    THIS IS NOT A PUBLIC API - any usage of this outside of torchao
+    can break at any time.
+
+    Apply fake quantization to a `torch.nn.Module`.
     to be used with :func:`~torchao.quantization.quant_api.quantize_`.

     Example usage::
@@ -267,37 +285,32 @@ def intx_quantization_aware_training(
     `torch.nn.Embedding` with an activation config, then we will raise
     ValueError as these are not supported.
     """
-
-    def _insert_fake_quantize(mod: torch.nn.Module):
-        """
-        Swap the given module with its corresponding fake quantized version.
-        """
-        from .embedding import FakeQuantizedEmbedding
-        from .linear import FakeQuantizedLinear
-
-        if isinstance(mod, torch.nn.Linear):
-            return FakeQuantizedLinear.from_linear(
-                mod,
-                activation_config,
-                weight_config,
-            )
-        elif isinstance(mod, torch.nn.Embedding):
-            if activation_config is not None:
-                raise ValueError(
-                    "Activation fake quantization is not supported for embedding"
-                )
-            return FakeQuantizedEmbedding.from_embedding(mod, weight_config)
-        else:
+    from .embedding import FakeQuantizedEmbedding
+    from .linear import FakeQuantizedLinear
+
+    mod = module
+    activation_config = config.activation_config
+    weight_config = config.weight_config
+
+    if isinstance(mod, torch.nn.Linear):
+        return FakeQuantizedLinear.from_linear(
+            mod,
+            activation_config,
+            weight_config,
+        )
+    elif isinstance(mod, torch.nn.Embedding):
+        if activation_config is not None:
             raise ValueError(
-                "Module of type '%s' does not have QAT support" % type(mod)
+                "Activation fake quantization is not supported for embedding"
             )
+        return FakeQuantizedEmbedding.from_embedding(mod, weight_config)
+    else:
+        raise ValueError("Module of type '%s' does not have QAT support" % type(mod))

-    return _insert_fake_quantize

-
-def from_intx_quantization_aware_training() -> Callable:
+class FromIntXQuantizationAwareTrainingConfig(AOBaseWorkflowConfig):
     """
-    Return a function that converts a model with fake quantized modules,
+    Object that knows how to convert a model with fake quantized modules,
     such as :func:`~torchao.quantization.qat.linear.FakeQuantizedLinear`
     and :func:`~torchao.quantization.qat.linear.FakeQuantizedEmbedding`,
     back to model with the original, corresponding modules without
@@ -313,22 +326,31 @@ def from_intx_quantization_aware_training() -> Callable:
     )
     """

-    def _remove_fake_quantize(mod: torch.nn.Module):
-        """
-        If the given module is a fake quantized module, return the original
-        corresponding version of the module without fake quantization.
-        """
-        from .embedding import FakeQuantizedEmbedding
-        from .linear import FakeQuantizedLinear
+    pass
+
+
+# for BC
+from_intx_quantization_aware_training = FromIntXQuantizationAwareTrainingConfig

-        if isinstance(mod, FakeQuantizedLinear):
-            return mod.to_linear()
-        elif isinstance(mod, FakeQuantizedEmbedding):
-            return mod.to_embedding()
-        else:
-            return mod

-    return _remove_fake_quantize
+@register_quantize_module_handler(FromIntXQuantizationAwareTrainingConfig)
+def _from_intx_quantization_aware_training_transform(
+    mod: torch.nn.Module,
+    config: FromIntXQuantizationAwareTrainingConfig,
+) -> torch.nn.Module:
+    """
+    If the given module is a fake quantized module, return the original
+    corresponding version of the module without fake quantization.
+    """
+    from .embedding import FakeQuantizedEmbedding
+    from .linear import FakeQuantizedLinear
+
+    if isinstance(mod, FakeQuantizedLinear):
+        return mod.to_linear()
+    elif isinstance(mod, FakeQuantizedEmbedding):
+        return mod.to_embedding()
+    else:
+        return mod


 class ComposableQATQuantizer(TwoStepQuantizer):
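
With this change the QAT prepare and convert entry points become config objects, and the old function names are kept as aliases to the config classes for backward compatibility, so existing quantize_ call sites keep working. A minimal usage sketch, under the assumption that these classes are importable from torchao.quantization.qat and that quantize_ in this stack dispatches on AOBaseWorkflowConfig instances:

import torch

from torchao.quantization import quantize_
from torchao.quantization.qat import (
    FakeQuantizeConfig,
    FromIntXQuantizationAwareTrainingConfig,
    IntXQuantizationAwareTrainingConfig,
)

model = torch.nn.Sequential(torch.nn.Linear(32, 32))

# fake-quantize settings: int8 per-token activations, int4 grouped weights
activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int4, group_size=32)

# prepare: swap nn.Linear / nn.Embedding for their fake-quantized versions
quantize_(model, IntXQuantizationAwareTrainingConfig(activation_config, weight_config))

# ... QAT fine-tuning happens here ...

# convert: swap the fake-quantized modules back to the original module types
quantize_(model, FromIntXQuantizationAwareTrainingConfig())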
