
Commit 9ecdb3b

Move fpx to tensor subclass
1 parent 5d1444b commit 9ecdb3b

4 files changed: +80 -67 lines

torchao/dtypes/__init__.py (+1 -1)

@@ -4,12 +4,12 @@
     to_affine_quantized_floatx,
     to_affine_quantized_floatx_static,
     # experimental, will be merged into floatx in the future
-    to_affine_quantized_fpx,
     to_affine_quantized_intx,
     to_affine_quantized_intx_static,
 )
 from .floatx import (
     Float8Layout,
+    to_affine_quantized_fpx,
 )
 from .nf4tensor import NF4Tensor, to_nf4
 from .uintx import (
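For downstream code the public constructor keeps its top-level name; it is now simply re-exported from the floatx submodule. A minimal sanity check of the two import paths, assuming a build that includes this commit:

from torchao.dtypes import to_affine_quantized_fpx as fpx_top
from torchao.dtypes.floatx import to_affine_quantized_fpx as fpx_sub

# Both names should resolve to the same constructor, since the top-level
# torchao/dtypes/__init__.py now re-exports it from .floatx.
assert fpx_top is fpx_sub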

torchao/dtypes/affine_quantized_tensor.py (+21 -66)

@@ -14,12 +14,9 @@
     MappingType,
     ZeroPointDomain,
     choose_qparams_affine,
-    choose_qparams_affine_floatx,
     choose_qparams_and_quantize_affine_hqq,
     dequantize_affine,
-    dequantize_affine_floatx,
     quantize_affine,
-    quantize_affine_floatx,
 )
 from torchao.utils import (
     TORCH_VERSION_AT_LEAST_2_5,
@@ -36,7 +33,6 @@
     "to_affine_quantized_floatx",
     "to_affine_quantized_intx_static",
     "to_affine_quantized_floatx_static",
-    "to_affine_quantized_fpx",
 ]


@@ -126,40 +122,28 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor
         if output_dtype is None:
             output_dtype = self.dtype

-        from torchao.dtypes.floatx import FloatxTensorCoreLayout
-
-        if isinstance(self._layout, FloatxTensorCoreLayout):
-            int_data, scale = self.tensor_impl.get_plain()
-            return dequantize_affine_floatx(
-                int_data,
-                scale,
-                self._layout.ebits,
-                self._layout.mbits,
-                output_dtype=output_dtype,
-            )
-        else:
-            data, scale, zero_point = self.tensor_impl.get_plain()
-            dq = dequantize_affine(
-                data,
-                self.block_size,
-                scale,
-                zero_point,
-                data.dtype,
-                self.quant_min,
-                self.quant_max,
-                self.zero_point_domain,
-                output_dtype=output_dtype,
-            )
-            from torchao.dtypes.uintx import TensorCoreTiledLayout
+        data, scale, zero_point = self.tensor_impl.get_plain()
+        dq = dequantize_affine(
+            data,
+            self.block_size,
+            scale,
+            zero_point,
+            data.dtype,
+            self.quant_min,
+            self.quant_max,
+            self.zero_point_domain,
+            output_dtype=output_dtype,
+        )
+        from torchao.dtypes.uintx import TensorCoreTiledLayout

-            if isinstance(self._layout, TensorCoreTiledLayout):
-                # need to return to original shape if tensor was padded
-                # in preprocessing
-                # TODO: we could add an API for this if there are more use cases
-                # (e.g. dequant_post_process) in TensorImpl or Layout
-                for dim, dim_size in enumerate(self.shape):
-                    dq = dq.narrow(dim, 0, dim_size)
-            return dq
+        if isinstance(self._layout, TensorCoreTiledLayout):
+            # need to return to original shape if tensor was padded
+            # in preprocessing
+            # TODO: we could add an API for this if there are more use cases
+            # (e.g. dequant_post_process) in TensorImpl or Layout
+            for dim, dim_size in enumerate(self.shape):
+                dq = dq.narrow(dim, 0, dim_size)
+        return dq

     def __tensor_flatten__(self):
         return ["tensor_impl"], [
@@ -395,33 +379,6 @@ def from_hp_to_floatx_static(
                 f"Unsupported dtype {target_dtype} for from_hp_to_floatx_static"
             )

-    @classmethod
-    def from_hp_to_fpx(
-        cls,
-        input_float: torch.Tensor,
-        _layout: Layout,
-    ):
-        from torchao.dtypes.floatx import FloatxTensorCoreLayout
-
-        assert isinstance(
-            _layout, FloatxTensorCoreLayout
-        ), f"Only FloatxTensorCoreLayout is supported for floatx, got {_layout}"
-        original_shape = input_float.shape
-        input_float = _layout.pre_process(input_float)
-        # per axis quantization, where axis = 1
-        block_size = list(input_float.shape)
-        block_size[1] = 1
-
-        ebits, mbits = _layout.ebits, _layout.mbits
-        # Note: these ops are hardcoded to have per axis quantization (axis=1) right now
-        scale = choose_qparams_affine_floatx(input_float, ebits, mbits)
-        floatx_unpacked = quantize_affine_floatx(input_float, scale, ebits, mbits)
-        floatx_packed = _layout.post_process(floatx_unpacked)
-
-        tensor_impl_ctr = get_tensor_impl_constructor(type(_layout))
-        tensor_impl = tensor_impl_ctr(floatx_packed, scale, None, _layout)
-        return cls(tensor_impl, block_size, original_shape, dtype=input_float.dtype)
-
     @property
     def _layout(self) -> Layout:
         return self.tensor_impl._layout
@@ -477,8 +434,6 @@ def _apply_fn_to_data(self, fn):
 to_affine_quantized_intx_static = AffineQuantizedTensor.from_hp_to_intx_static
 to_affine_quantized_floatx = AffineQuantizedTensor.from_hp_to_floatx
 to_affine_quantized_floatx_static = AffineQuantizedTensor.from_hp_to_floatx_static
-# experimental will be merged in to floatx
-to_affine_quantized_fpx = AffineQuantizedTensor.from_hp_to_fpx

 if TORCH_VERSION_AT_LEAST_2_5:
     # Allow a model with AffineQuantizedTensor weights to be loaded with `weights_only=True`
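The design change in this file is that the base AffineQuantizedTensor no longer inspects the layout type inside dequantize; the floatx behavior is selected by ordinary method overriding on the new subclass instead. A minimal sketch of that pattern, with simplified names that are not the actual torchao classes:

class QuantizedBase:
    def dequantize(self):
        # generic affine path; no isinstance(self._layout, ...) branching needed
        return "generic affine dequant"

class FloatxQuantized(QuantizedBase):
    def dequantize(self):
        # the floatx-specific path lives on the subclass, mirroring
        # FloatxTensor.dequantize in floatx_tensor_core_layout.py
        return "floatx dequant"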

torchao/dtypes/floatx/__init__.py (+2)

@@ -2,6 +2,7 @@
 from .floatx_tensor_core_layout import (
     FloatxTensorCoreLayout,
     from_scaled_tc_floatx,
+    to_affine_quantized_fpx,
     to_scaled_tc_floatx,
 )

@@ -10,4 +11,5 @@
     "to_scaled_tc_floatx",
     "from_scaled_tc_floatx",
     "Float8Layout",
+    "to_affine_quantized_fpx",
 ]

torchao/dtypes/floatx/floatx_tensor_core_layout.py (+56)

@@ -11,6 +11,7 @@

 from torchao.dtypes.affine_quantized_tensor import (
     AffineQuantizedTensor,
+    get_tensor_impl_constructor,
     register_layout,
 )
 from torchao.dtypes.utils import (
@@ -22,6 +23,11 @@
     _floatx_unpacked_to_f32,
     _n_ones,
 )
+from torchao.quantization.quant_primitives import (
+    choose_qparams_affine_floatx,
+    dequantize_affine_floatx,
+    quantize_affine_floatx,
+)

 aten = torch.ops.aten
 _ONES_TABLE = [_n_ones(i) for i in range(8)]
@@ -456,6 +462,53 @@ class FloatxTensorCoreLayout(Layout):
     mbits: int


+class FloatxTensor(AffineQuantizedTensor):
+    """
+    Floatx quantized tensor subclass which inherits AffineQuantizedTensor class.
+
+    To see what happens during choose_qparams_and_quantize_affine_fpx, quantization and dequantization for floatx quantization,
+    please checkout https://github.com/pytorch/ao/blob/main/torchao/quantization/quant_primitives.py
+    and check the two quant primitive ops: choose_qparams_affine_floatx, quantize_affine_floatx and dequantize_affine_floatx.
+    """
+
+    def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
+        if output_dtype is None:
+            output_dtype = self.dtype
+        int_data, scale = self.tensor_impl.get_plain()
+        return dequantize_affine_floatx(
+            int_data,
+            scale,
+            self._layout.ebits,
+            self._layout.mbits,
+            output_dtype=output_dtype,
+        )
+
+    @classmethod
+    def from_hp_to_floatx(
+        cls,
+        input_float: torch.Tensor,
+        _layout: Layout,
+    ):
+        assert isinstance(
+            _layout, FloatxTensorCoreLayout
+        ), f"Only FloatxTensorCoreLayout is supported for floatx, got {_layout}"
+        original_shape = input_float.shape
+        input_float = _layout.pre_process(input_float)
+        # per axis quantization, where axis = 1
+        block_size = list(input_float.shape)
+        block_size[1] = 1
+
+        ebits, mbits = _layout.ebits, _layout.mbits
+        # Note: these ops are hardcoded to have per axis quantization (axis=1) right now
+        scale = choose_qparams_affine_floatx(input_float, ebits, mbits)
+        floatx_unpacked = quantize_affine_floatx(input_float, scale, ebits, mbits)
+        floatx_packed = _layout.post_process(floatx_unpacked)
+
+        tensor_impl_ctr = get_tensor_impl_constructor(type(_layout))
+        tensor_impl = tensor_impl_ctr(floatx_packed, scale, None, _layout)
+        return cls(tensor_impl, block_size, original_shape, dtype=input_float.dtype)
+
+
 @register_layout(FloatxTensorCoreLayout)
 class FloatxTensorCoreAQTTensorImpl(AQTTensorImpl):
     """FloatxTensorCoreAQTTensorImpl represents a Tensor with dtype floatx(ebits=a, mbits=b),
@@ -657,3 +710,6 @@ def _linear_f16_bf16_act_floatx_weight_impl(input_tensor, weight_tensor, bias):
         out += bias

     return out.view(*act.shape[:-1], out_dim).to(act.dtype)
+
+
+to_affine_quantized_fpx = FloatxTensor.from_hp_to_floatx
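With this move, to_affine_quantized_fpx is an alias for FloatxTensor.from_hp_to_floatx. A rough usage sketch, assuming a CUDA build where the floatx tensor-core kernels are available and using fp6 (ebits=3, mbits=2) as the example format:

import torch
from torchao.dtypes.floatx import FloatxTensorCoreLayout, to_affine_quantized_fpx

# fp6_e3m2: 1 sign bit, 3 exponent bits, 2 mantissa bits
weight = torch.randn(256, 512, dtype=torch.half, device="cuda")
fpx_weight = to_affine_quantized_fpx(weight, FloatxTensorCoreLayout(ebits=3, mbits=2))

# FloatxTensor.dequantize unpacks the packed data with dequantize_affine_floatx,
# using the per-axis (axis=1) scale chosen during quantization
restored = fpx_weight.dequantize(output_dtype=torch.half)
print(restored.shape)  # torch.Size([256, 512])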
