from typing import Tuple

import torch


class ToyModel(torch.nn.Module):
    def __init__(self, m: int, n: int, k: int):
        super().__init__()
        self.linear1 = torch.nn.Linear(m, n, bias=False)
        self.linear2 = torch.nn.Linear(n, k, bias=False)

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        return x


class QuantizedLinear(torch.nn.Linear):
    """
    Linear module that performs dynamic, symmetric, weight-only int8
    quantization: the fp32 weight is requantized on every forward call.
    Assumes bias=False, since forward() never adds a bias term.
    """
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w_int8, scale = int8_symmetric_quantize(self.weight)
        # Matmul in the input dtype, then rescale each output channel.
        return torch.mm(x, w_int8.t().to(x.dtype)) * scale.t()

    @classmethod
    def from_float(cls, mod: torch.nn.Linear):
        # nn.Linear's third argument is the bool `bias` flag, not the bias
        # tensor itself, so convert before passing it through.
        new_linear = cls(mod.in_features, mod.out_features, mod.bias is not None)
        new_linear.weight = mod.weight
        return new_linear


def int8_symmetric_quantize(
    fp32_tensor: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Symmetrically quantize a torch.float32 tensor into torch.int8, with
    one scale per row (i.e. per output channel of a linear weight).
    Return a 2-tuple of (quantized values, scales).
    """
    quant_min = -128
    quant_max = 127
    # Per-row min and max of the tensor.
    min_val = torch.amin(fp32_tensor, dim=[1], keepdim=False)
    max_val = torch.amax(fp32_tensor, dim=[1], keepdim=False)
    # Symmetric quantization centers the range around zero, so the scale
    # is driven by the largest magnitude in each row.
    min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
    max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
    max_val_pos = torch.max(-min_val_neg, max_val_pos)
    # scale = max|row| / 127.5, the same convention PyTorch's symmetric
    # observers use.
    scale = max_val_pos / (float(quant_max - quant_min) / 2)
    scale = scale.view(fp32_tensor.shape[0], -1)
    out = torch.round(fp32_tensor * (1.0 / scale))
    out = torch.clamp(out, quant_min, quant_max).to(torch.int8)
    return out, scale
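

# A minimal sanity-check sketch (a hypothetical helper, not part of the
# original script): quantization rounds each element to the nearest
# multiple of its row's scale, so dequantizing with `w_int8 * scale`
# should reproduce the fp32 weight to within about half a scale step.
def _max_round_trip_error(w: torch.Tensor) -> torch.Tensor:
    w_int8, scale = int8_symmetric_quantize(w)
    # Dequantize (the per-row scales broadcast over columns) and compare.
    return (w_int8.to(w.dtype) * scale - w).abs().max()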


if __name__ == "__main__":
    model = ToyModel(64, 128, 32).cuda()
    example_inputs = torch.randn((1, 64), dtype=torch.float32, device="cuda")

    # Swap torch.nn.Linear with QuantizedLinear. Check the exact type rather
    # than isinstance, so already-swapped QuantizedLinear modules (which
    # subclass nn.Linear) are not wrapped twice.
    for name, child in model.named_children():
        if type(child) is torch.nn.Linear:
            new_linear = QuantizedLinear.from_float(child)
            setattr(model, name, new_linear)

    print("quantized model: ", model)
    print("output: ", model(example_inputs))
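
    # A hedged verification sketch (an addition, assuming the fp32 weights
    # shared via from_float above): the quantized output should closely
    # track the float matmul path computed from the same weights.
    with torch.no_grad():
        float_ref = example_inputs @ model.linear1.weight.t() @ model.linear2.weight.t()
        quant_out = model(example_inputs)
        print("max abs error vs. float: ", (quant_out - float_ref).abs().max().item())
        print("max weight round-trip error: ", _max_round_trip_error(model.linear1.weight).item())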