@@ -60,7 +60,8 @@ def get_quantization_functions(
             )
         )
 
-    if do_sparse:
+    # TODO(before land): revert this back, added due to lack of cuSparseLt in my env
+    if do_sparse and False:
         base_functions.append(
             int8_dynamic_activation_int8_weight(layout=SemiSparseLayout())
         )
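For context: `get_quantization_functions` is the helper that builds the list of quantization APIs the parametrized tests below iterate over, and the `and False` above short-circuits its semi-sparse entry on machines without cuSparseLt. A minimal sketch of the helper under that reading — the signature and the non-sparse entries are assumptions reconstructed from this diff, not the verbatim source:

# Hypothetical reconstruction of the test helper this hunk modifies;
# the base entries and signature are assumptions, only the sparse
# branch is taken from the diff above.
from torchao.dtypes import SemiSparseLayout
from torchao.quantization import (
    int4_weight_only,
    int8_dynamic_activation_int8_weight,
    int8_weight_only,
)


def get_quantization_functions(do_sparse: bool, do_int4: bool):
    base_functions = [int8_weight_only(), int8_dynamic_activation_int8_weight()]
    if do_int4:
        base_functions.append(int4_weight_only(group_size=32))
    # `and False` disables this branch until cuSparseLt is available again
    if do_sparse and False:
        base_functions.append(
            int8_dynamic_activation_int8_weight(layout=SemiSparseLayout())
        )
    return base_functions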
@@ -78,7 +79,8 @@ def test_tensor_core_layout_transpose(self):
         t = linear.weight
         shape = t.shape
         apply_int4_weight_only_quant = int4_weight_only(group_size=32)
-        ql = apply_int4_weight_only_quant(linear)
+        quantize_(linear, apply_int4_weight_only_quant)
+        ql = linear
         aqt = ql.weight
         aqt_shape = aqt.shape
         self.assertEqual(aqt_shape, shape)
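The substantive change in this hunk is the move from callable-style quantizers, which took a module and returned it quantized, to the in-place `quantize_(module, config)` API. A minimal usage sketch of the new calling convention, assuming a CUDA machine with torchao installed:

import torch

from torchao.quantization import int4_weight_only, quantize_

linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")

# old style (removed above): ql = int4_weight_only(group_size=32)(linear)
# new style: quantize_ mutates the module in place and returns None,
# so the quantized module is the same object that was passed in.
quantize_(linear, int4_weight_only(group_size=32))
ql = linear
assert ql is linear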
@@ -97,7 +99,11 @@ def test_tensor_core_layout_transpose(self):
     )
     def test_weights_only(self, apply_quant):
         linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
-        ql = apply_quant(linear)
+        if isinstance(apply_quant, AOBaseWorkflowConfig):
+            quantize_(linear, apply_quant)
+            ql = linear
+        else:
+            ql = apply_quant(linear)
         with tempfile.NamedTemporaryFile() as f:
             torch.save(ql.state_dict(), f)
             f.seek(0)
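This `isinstance` branch is repeated verbatim in the last hunk below: during the migration, `get_quantization_functions` yields a mix of new-style `AOBaseWorkflowConfig` instances and legacy callables, so the tests must dispatch on which one they got. A hypothetical helper — not part of this PR — that would collapse the duplicated branch, assuming `quantize_` and `AOBaseWorkflowConfig` are imported at the top of the test file as the hunks imply:

def _quantize_for_test(linear, apply_quant):
    # Hypothetical dedup helper, not in the PR: new-style configs go
    # through the in-place quantize_ API, legacy callables are applied
    # directly and return the quantized module.
    if isinstance(apply_quant, AOBaseWorkflowConfig):
        quantize_(linear, apply_quant)
        return linear
    return apply_quant(linear)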
@@ -173,8 +179,13 @@ def apply_uint6_weight_only_quant(linear):
     @common_utils.parametrize("apply_quant", get_quantization_functions(True, True))
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_print_quantized_module(self, apply_quant):
+        print(apply_quant)
         linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")
-        ql = apply_quant(linear)
+        if isinstance(apply_quant, AOBaseWorkflowConfig):
+            quantize_(linear, apply_quant)
+            ql = linear
+        else:
+            ql = apply_quant(linear)
         assert "AffineQuantizedTensor" in str(ql)
 
 