
Commit 9606219

replace to_affine_quantized_floatx with to_affine_quantized_float8 in quantization APIs
ghstack-source-id: cba5e1c
ghstack-comment-id: 2608105249
Pull Request resolved: #1599
1 parent fae690c commit 9606219

File tree

6 files changed (+30 -63 lines changed):

docs/source/api_ref_dtypes.rst
torchao/dtypes/__init__.py
torchao/dtypes/affine_quantized_tensor.py
torchao/prototype/quantization/autoquant_v2.py
torchao/quantization/autoquant.py
torchao/quantization/quant_api.py

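For callers outside this diff, the change amounts to renaming the constructor and dropping the scale_dtype argument, which the float8 path no longer takes at these call sites. A minimal before/after sketch (the tensor shape, fp8 dtype, and layout values are illustrative, mirroring the float8_weight_only call site later in this commit):

import torch
from torchao.dtypes import Float8Layout, to_affine_quantized_float8

weight = torch.randn(128, 256, dtype=torch.bfloat16)
block_size = (1, weight.shape[1])  # one scale per output row, as in float8_weight_only

# Before this commit (constructor now removed from the quantization APIs):
# quantized = to_affine_quantized_floatx(
#     input_float=weight,
#     block_size=block_size,
#     target_dtype=torch.float8_e4m3fn,
#     scale_dtype=torch.float32,
#     _layout=Float8Layout(mm_config=None),
# )

# After this commit:
quantized = to_affine_quantized_float8(
    input_float=weight,
    block_size=block_size,
    target_dtype=torch.float8_e4m3fn,
    _layout=Float8Layout(mm_config=None),
)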

docs/source/api_ref_dtypes.rst

+1 -1

@@ -13,7 +13,7 @@ torchao.dtypes
     to_nf4
     to_affine_quantized_intx
     to_affine_quantized_intx_static
-    to_affine_quantized_floatx
+    to_affine_quantized_float8
     to_affine_quantized_floatx_static
     to_affine_quantized_fpx
     NF4Tensor

torchao/dtypes/__init__.py

+4 -9

@@ -1,16 +1,14 @@
 from . import affine_quantized_tensor_ops
 from .affine_quantized_tensor import (
     AffineQuantizedTensor,
-    to_affine_quantized_floatx,
+    to_affine_quantized_float8,
     to_affine_quantized_floatx_static,
     # experimental, will be merged into floatx in the future
     to_affine_quantized_fpx,
     to_affine_quantized_intx,
     to_affine_quantized_intx_static,
 )
-from .floatx import (
-    Float8Layout,
-)
+from .floatx import Float8Layout
 from .nf4tensor import NF4Tensor, to_nf4
 from .uintx import (
     BlockSparseLayout,
@@ -24,10 +22,7 @@
     UintxLayout,
     to_marlinqqq_quantized_intx,
 )
-from .utils import (
-    Layout,
-    PlainLayout,
-)
+from .utils import Layout, PlainLayout

 __all__ = [
     "NF4Tensor",
@@ -36,8 +31,8 @@
     "to_affine_quantized_intx",
     "to_affine_quantized_intx_static",
     "to_affine_quantized_fpx",
-    "to_affine_quantized_floatx",
     "to_affine_quantized_floatx_static",
+    "to_affine_quantized_float8",
     "to_marlinqqq_quantized_intx",
     "Layout",
     "PlainLayout",

torchao/dtypes/affine_quantized_tensor.py

+1 -2

@@ -28,7 +28,7 @@
     "AffineQuantizedTensor",
     "register_layout",
     "to_affine_quantized_intx",
-    "to_affine_quantized_floatx",
+    "to_affine_quantized_float8",
     "to_affine_quantized_intx_static",
     "to_affine_quantized_floatx_static",
     "to_affine_quantized_fpx",
@@ -430,7 +430,6 @@ def from_hp_to_float8(
         scale = choose_qparams_affine_float8(
             input_float,
             target_dtype,
-            target_dtype,
         )
         fp8_data = quantize_affine_float8(
             input_float,
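
The second hunk also drops a duplicated target_dtype argument in from_hp_to_float8's call to choose_qparams_affine_float8. The corrected flow, roughly, is sketched below (the import path for the primitive is an assumption; it is not part of this diff):

import torch
# Assumed location of the helper used by from_hp_to_float8; not shown in this diff.
from torchao.quantization.quant_primitives import choose_qparams_affine_float8

input_float = torch.randn(16, 32)
target_dtype = torch.float8_e4m3fn

# target_dtype is passed exactly once; the duplicated positional argument is gone.
scale = choose_qparams_affine_float8(input_float, target_dtype)
# scale then feeds quantize_affine_float8(input_float, ...) as in the hunk above.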

torchao/prototype/quantization/autoquant_v2.py

+6 -14

@@ -27,14 +27,8 @@
 from torchao.quantization.autoquant import (
     AutoQuantizableLinearWeight as AutoQuantizableLinearWeightV1,
 )
-from torchao.quantization.granularity import (
-    PerRow,
-    PerTensor,
-)
-from torchao.quantization.quant_primitives import (
-    MappingType,
-    ZeroPointDomain,
-)
+from torchao.quantization.granularity import PerRow, PerTensor
+from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain
 from torchao.quantization.subclass import (  # noqa
     Int8DynamicallyQuantizedLinearWeight,
     Int8WeightOnlyQuantizedLinearWeight,
@@ -991,7 +985,7 @@ class AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight(
     @classmethod
     def from_float(cls, weight):
         # avoid circular dep
-        from torchao.dtypes import to_affine_quantized_floatx
+        from torchao.dtypes import to_affine_quantized_float8
         from torchao.quantization.quant_api import _input_activation_quant_func_fp8

         # weight settings
@@ -1015,12 +1009,11 @@ def get_per_token_block_size(x):
             activation_dtype=input_target_dtype,
         )
         block_size = get_weight_block_size(weight)
-        weight = to_affine_quantized_floatx(
+        weight = to_affine_quantized_float8(
             input_float=weight,
             block_size=block_size,
             target_dtype=target_dtype,
             _layout=_layout,
-            scale_dtype=torch.float32,
         )
         weight = super(
             AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight, cls
@@ -1040,7 +1033,7 @@ class AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight(
     @classmethod
     def from_float(cls, weight):
         # avoid circular dep
-        from torchao.dtypes import to_affine_quantized_floatx
+        from torchao.dtypes import to_affine_quantized_float8
         from torchao.quantization.quant_api import _input_activation_quant_func_fp8

         # weight settings
@@ -1058,12 +1051,11 @@ def get_weight_block_size(x):
             activation_dtype=input_target_dtype,
         )
         block_size = get_weight_block_size(weight)
-        weight = to_affine_quantized_floatx(
+        weight = to_affine_quantized_float8(
             input_float=weight,
             block_size=block_size,
             target_dtype=target_dtype,
             _layout=_layout,
-            scale_dtype=torch.float32,
         )
         weight = super(
             AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight, cls

torchao/quantization/autoquant.py

+6 -14

@@ -18,10 +18,7 @@
     LinearActivationQuantizedTensor,
     to_linear_activation_quantized,
 )
-from torchao.quantization.quant_primitives import (
-    MappingType,
-    ZeroPointDomain,
-)
+from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain
 from torchao.quantization.utils import (
     compute_error,
     quantize_activation_per_token_absmax,
@@ -34,10 +31,7 @@
     is_sm_at_least_90,
 )

-from .granularity import (
-    PerRow,
-    PerTensor,
-)
+from .granularity import PerRow, PerTensor
 from .subclass import (  # noqa
     Int8DynamicallyQuantizedLinearWeight,
     Int8WeightOnlyQuantizedLinearWeight,
@@ -969,7 +963,7 @@ class AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight(AQMixin, BFloat16Ten
     @classmethod
     def from_float(cls, weight):
         # avoid circular dep
-        from torchao.dtypes import to_affine_quantized_floatx
+        from torchao.dtypes import to_affine_quantized_float8
         from torchao.quantization.quant_api import _input_activation_quant_func_fp8

         # weight settings
@@ -995,12 +989,11 @@ def get_per_token_block_size(x):
         }
         block_size = get_weight_block_size(weight)

-        weight = to_affine_quantized_floatx(
+        weight = to_affine_quantized_float8(
             input_float=weight,
             block_size=block_size,
             target_dtype=target_dtype,
             _layout=_layout,
-            scale_dtype=torch.float32,
         )
         weight = to_linear_activation_quantized(
             weight, input_quant_func, quant_kwargs=input_quant_kwargs
@@ -1025,7 +1018,7 @@ class AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight(
     @classmethod
     def from_float(cls, weight):
         # avoid circular dep
-        from torchao.dtypes import to_affine_quantized_floatx
+        from torchao.dtypes import to_affine_quantized_float8
         from torchao.quantization.quant_api import _input_activation_quant_func_fp8

         # weight settings
@@ -1043,12 +1036,11 @@ def get_weight_block_size(x):
             "activation_dtype": input_target_dtype,
         }
         block_size = get_weight_block_size(weight)
-        weight = to_affine_quantized_floatx(
+        weight = to_affine_quantized_float8(
             input_float=weight,
             block_size=block_size,
             target_dtype=target_dtype,
             _layout=_layout,
-            scale_dtype=torch.float32,
         )
         weight = super(
             AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight, cls

torchao/quantization/quant_api.py

+12 -23

@@ -36,7 +36,7 @@
     SemiSparseLayout,
     TensorCoreTiledLayout,
     UintxLayout,
-    to_affine_quantized_floatx,
+    to_affine_quantized_float8,
     to_affine_quantized_floatx_static,
     to_affine_quantized_intx,
     to_marlinqqq_quantized_intx,
@@ -66,21 +66,13 @@
     Int8DynActInt4WeightGPTQQuantizer,
     Int8DynActInt4WeightQuantizer,
 )
-from .granularity import (
-    PerRow,
-    PerTensor,
-)
+from .granularity import PerRow, PerTensor
 from .linear_activation_quantized_tensor import (
     LinearActivationQuantizedTensor,
     to_linear_activation_quantized,
 )
-from .qat import (
-    intx_quantization_aware_training,
-)
-from .quant_primitives import (
-    MappingType,
-    ZeroPointDomain,
-)
+from .qat import intx_quantization_aware_training
+from .quant_primitives import MappingType, ZeroPointDomain
 from .subclass import (
     Int4WeightOnlyQuantizedLinearWeight,
     Int8DynamicallyQuantizedLinearWeight,
@@ -915,10 +907,12 @@ def int8_dynamic_activation_int8_semi_sparse_weight():
     Applies int8 dnynamic symmetric per-token activation and int8 per-channel weight
     quantization + 2:4 sparsity to linear layers.
     """
-    warnings.warn("""int8_dyanmic_activation_int8_semi_sparse_weight() will be deprecated at a later release. Please use the layout kwarg in int8_dynamic_activation_int8_weight instead.
+    warnings.warn(
+        """int8_dyanmic_activation_int8_semi_sparse_weight() will be deprecated at a later release. Please use the layout kwarg in int8_dynamic_activation_int8_weight instead.

     from torchao.dtypes import SemiSparseLayout
-    int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()""")
+    int8_dynamic_activation_int8_weight(layout=SemiSparseLayout()"""
+    )

     return int8_dynamic_activation_int8_weight(layout=SemiSparseLayout())

@@ -934,15 +928,13 @@ def float8_weight_only(weight_dtype: torch.dtype = torch.float8_e4m3fn):
     The actual matmul will be computed in original precision of the weight tensor.

     """
-    from torchao.dtypes import to_affine_quantized_floatx

     def apply_float8wo_quant(weight):
         block_size = (1, weight.shape[1])
-        return to_affine_quantized_floatx(
+        return to_affine_quantized_float8(
             input_float=weight,
             block_size=block_size,
             target_dtype=weight_dtype,
-            scale_dtype=None,
             _layout=Float8Layout(mm_config=None),
         )

@@ -1016,11 +1008,10 @@ def _input_activation_quant_func_fp8(

     block_size = get_block_size(x.shape, activation_granularity)
     if scale is None:
-        activation = to_affine_quantized_floatx(
+        activation = to_affine_quantized_float8(
             input_float=x,
             block_size=block_size,
             target_dtype=activation_dtype,
-            scale_dtype=torch.float32,
             _layout=Float8Layout(mm_config=None),  # Config is stored on weight
         )
     else:
@@ -1102,11 +1093,10 @@ def apply_float8_dynamic_activation_quant(weight: torch.Tensor):
            ), "PerRow quantization only works for bfloat16 precision input weight"

        block_size = get_block_size(weight.shape, weight_granularity)
-        quantized_weight = to_affine_quantized_floatx(
+        quantized_weight = to_affine_quantized_float8(
            input_float=weight,
            block_size=block_size,
            target_dtype=weight_dtype,
-            scale_dtype=torch.float32,
            _layout=Float8Layout(mm_config=mm_config),
        )

@@ -1157,11 +1147,10 @@ def apply_float8_static_activation_quant(weight: torch.Tensor):
        if not _fp8_mm_compat(weight):
            return weight
        block_size = get_block_size(weight.shape, weight_granularity)
-        quantized_weight = to_affine_quantized_floatx(
+        quantized_weight = to_affine_quantized_float8(
            input_float=weight,
            block_size=block_size,
            target_dtype=weight_dtype,
-            scale_dtype=torch.float32,
            _layout=Float8Layout(mm_config=mm_config),
        )
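
The public configuration entry points keep their signatures in this commit; the float8 workflows are still driven through quantize_. A usage sketch, assuming float8_weight_only and float8_dynamic_activation_float8_weight are exported from torchao.quantization and a float8-capable GPU is available:

import torch
from torchao.quantization import (
    float8_dynamic_activation_float8_weight,
    float8_weight_only,
    quantize_,
)
from torchao.quantization.granularity import PerRow

# Weight-only float8: weights are stored via to_affine_quantized_float8,
# matmuls still run in the weight's original precision.
m_wo = torch.nn.Sequential(torch.nn.Linear(1024, 1024, dtype=torch.bfloat16, device="cuda"))
quantize_(m_wo, float8_weight_only(weight_dtype=torch.float8_e4m3fn))

# Dynamic activation + weight float8 with per-row scales; bfloat16 weights are
# required for PerRow, per the assertion in apply_float8_dynamic_activation_quant.
m_dyn = torch.nn.Sequential(torch.nn.Linear(1024, 1024, dtype=torch.bfloat16, device="cuda"))
quantize_(m_dyn, float8_dynamic_activation_float8_weight(granularity=PerRow()))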
