 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import Any, Optional, Tuple
+from typing import Any, Optional

 import torch
 import torch.nn.functional as F
@@ -196,40 +196,15 @@ def convert(
         """
         self._convert_helper(model)
         return model
-
-    @staticmethod
-    def quantize_weights(
-        weight: torch.Tensor,
-        bit_width: int,
-        group_size: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Helper function to quantize weights
-        """
-        (qmin, qmax) = _get_qmin_qmax(bit_width)
-        (s, zp) = get_group_qparams_symmetric(
-            weight, bit_width, group_size
-        )
-        from torchao._executorch_ops import (
-            _quantized_decomposed_quantize_per_channel_group_wrapper,
-        )
-        q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
-            weight,
-            s,
-            zp,
-            qmin,
-            qmax,
-            torch.int8,
-            group_size,
-        )
-        return (q_weight, s, zp)
-

     def _convert_helper(self, module: torch.nn.Module):
         """
         Helper function to recursively swap `Int4WeightOnlyQATEmbedding`
         modules with `Int4WeightOnlyEmbedding`
         """
+        from torchao._executorch_ops import (
+            _quantized_decomposed_quantize_per_channel_group_wrapper,
+        )

         for name, child in module.named_children():
             if isinstance(child, Int4WeightOnlyQATEmbedding):
@@ -255,8 +230,20 @@ def _convert_helper(self, module: torch.nn.Module):
                 )
                 setattr(module, name, quantized_embedding)

-                q_weight, s, zp = self.quantize_weights(child.weight, self.bit_width, group_size)
                 # Load weights and qparams into quantized embedding
+                (qmin, qmax) = _get_qmin_qmax(self.bit_width)
+                (s, zp) = get_group_qparams_symmetric(
+                    child.weight, self.bit_width, group_size
+                )
+                q_weight = _quantized_decomposed_quantize_per_channel_group_wrapper(
+                    child.weight,
+                    s,
+                    zp,
+                    qmin,
+                    qmax,
+                    torch.int8,
+                    group_size,
+                )
                 quantized_embedding.weight = q_weight
                 quantized_embedding.scale = s.to(scale_precision)
                 quantized_embedding.zero_point = zp.to(zero_point_precision)
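For context on what the inlined calls compute: `_get_qmin_qmax` presumably yields the signed int4 range [-8, 7], `get_group_qparams_symmetric` derives one scale per group of `group_size` weights with a zero point of 0, and the wrapper rounds each group into that range, storing the result in an int8 container. The sketch below is a minimal, self-contained approximation of this per-channel-group symmetric quantization, not the torchao wrapper itself; `quantize_per_group_reference` is a hypothetical name, and torchao's exact scale formula may differ slightly (e.g. dividing by (qmax - qmin) / 2 rather than qmax).

```python
import torch


def quantize_per_group_reference(
    weight: torch.Tensor, bit_width: int = 4, group_size: int = 32
):
    # Symmetric signed range, e.g. [-8, 7] for 4 bits
    qmin, qmax = -(2 ** (bit_width - 1)), 2 ** (bit_width - 1) - 1
    # View each embedding row as groups of `group_size` values
    grouped = weight.reshape(weight.shape[0], -1, group_size)
    # One scale per group from its max absolute value; zero point stays 0
    scale = (grouped.abs().amax(dim=-1, keepdim=True) / qmax).clamp(min=1e-9)
    zero_point = torch.zeros_like(scale, dtype=torch.int32)
    # Round into the quantized range and store in an int8 container
    q = torch.clamp(torch.round(grouped / scale), qmin, qmax).to(torch.int8)
    return q.reshape(weight.shape), scale.squeeze(-1), zero_point.squeeze(-1)


w = torch.randn(8, 64)
q_w, s, zp = quantize_per_group_reference(w, bit_width=4, group_size=32)
# Round-trip check: dequantize and measure the worst-case error
deq = q_w.reshape(8, -1, 32).float() * s.unsqueeze(-1)
print((w - deq.reshape(w.shape)).abs().max())
```

Because the scheme is symmetric, the zero points are all zeros, which is why only the scales matter when the converted embedding dequantizes its weight.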