Commit 54241a9

jerryzh168 authored and facebook-github-bot committed
[quant][fx] Add support for fused modules in _convert_do_not_use (pytorch#67245)
Summary:
Pull Request resolved: pytorch#67245

Add support for fused modules in the new convert path, including linear-relu, conv{1-3}d-relu, and their QAT versions; also tested with TRT (conv2d-relu and linear-relu).

Test Plan:
```
python test/fx2trt/test_quantize_fx.py TestQuantizeFxTRTOps.test_linear_relu_module
python test/fx2trt/test_quantize_fx.py TestQuantizeFxTRTOps.test_conv_relu_module
```

Imported from OSS

Reviewed By: vkuzo

Differential Revision: D31919724

fbshipit-source-id: 7e5c96eba30706f7989da680aa3443159847bdfd
1 parent 91971df commit 54241a9
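
For context, the "fused modules" this commit teaches the new convert path to handle are the `torch.nn.intrinsic` wrapper types produced by fusion. A minimal sketch (not part of the commit; the toy module `M` is invented for illustration) of how such a module arises via eager-mode fusion:

```python
import torch
import torch.nn.intrinsic as nni
from torch.ao.quantization import fuse_modules

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(5, 10)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        return self.relu(self.linear(x))

# fuse_modules folds linear + relu into a single nni.LinearReLU, one of
# the fused module types the new convert path (and the TensorRT backend
# config below) now recognizes.
m = fuse_modules(M().eval(), [["linear", "relu"]])
assert isinstance(m.linear, nni.LinearReLU)
```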

File tree: 3 files changed, +193 −45 lines

test/fx2trt/test_quant_trt.py

Lines changed: 62 additions & 32 deletions
```diff
@@ -20,10 +20,13 @@
 from torch.testing._internal.common_quantization import (
     QuantizationTestCase,
 )
+import torch.nn.functional as F
+
 from torch.testing._internal.common_cuda import TEST_CUDA
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.common_quantization import NodeSpec as ns
 import unittest
+import itertools
 
 def lower_to_trt(model, inputs, shape_ranges):
     """ Lower a quantized model to TensorRT
@@ -92,36 +95,60 @@ def _test_module(
         trt_mod(*inputs_cuda)
 
 
-    def test_conv(self):
-        class Conv2dModule(torch.nn.Module):
-            def __init__(self):
+    def test_conv_relu_module(self):
+        conv_module = {1 : torch.nn.Conv1d, 2 : torch.nn.Conv2d, 3 : torch.nn.Conv3d}
+
+        conv1d_input = torch.rand(1, 3, 10)
+        conv2d_input = torch.rand(1, 3, 10, 10)
+        conv3d_input = torch.rand(1, 3, 10, 10, 10)
+        conv_input = {1: conv1d_input, 2: conv2d_input, 3: conv3d_input}
+
+        class ConvNdModule(torch.nn.Module):
+            def __init__(self, dim, has_relu=False, f_relu=False):
                 super().__init__()
-                self.conv = torch.nn.Conv2d(3, 3, 3)
+                self.conv = conv_module[dim](3, 3, 3).float()
+                if has_relu:
+                    if f_relu:
+                        self.relu = F.relu
+                    else:
+                        self.relu = torch.nn.ReLU()
+                else:
+                    self.relu = torch.nn.Identity()
 
             def forward(self, x):
-                return self.conv(x)
-
-        conv2d_input = torch.rand(1, 3, 224, 224)
-        no_convert = {
-            ns.call_function(torch.quantize_per_tensor): 2,
-            ns.call_method("dequantize"): 2
-        }
-        self._test_module(
-            Conv2dModule(),
-            [conv2d_input],
-            [((1, 3, 224, 224),
-              (5, 3, 224, 224),
-              (10, 3, 224, 224))],
-            no_convert=no_convert)
-
-    def test_linear(self):
+                return self.relu(self.conv(x))
+
+        # just testing conv2d since conv1d and conv3d are not supported in fx2trt
+        for dim, has_relu, f_relu in itertools.product([2], [True, False], [True, False]):
+            # when has_relu=False, we have torch.nn.Identity, which would introduce
+            # extra quant-dequant pair
+            no_convert = {
+                ns.call_function(torch.quantize_per_tensor): 2 + int(not has_relu),
+                ns.call_method("dequantize"): 2 + int(not has_relu),
+            }
+            self._test_module(
+                ConvNdModule(dim, has_relu, f_relu),
+                [conv_input[dim]],
+                [((1, *conv_input[dim].shape[1:]),
+                  (5, *conv_input[dim].shape[1:]),
+                  (10, *conv_input[dim].shape[1:]))],
+                no_convert=no_convert)
+
+    def test_linear_relu_module(self):
         class LinearModule(torch.nn.Module):
-            def __init__(self):
+            def __init__(self, has_relu=False, f_relu=False):
                 super().__init__()
-                self.linear = torch.nn.Linear(5, 10)
+                self.linear = torch.nn.Linear(5, 10).float()
+                if has_relu:
+                    if f_relu:
+                        self.relu = F.relu
+                    else:
+                        self.relu = torch.nn.ReLU()
+                else:
+                    self.relu = torch.nn.Identity()
 
             def forward(self, x):
-                return self.linear(x)
+                return self.relu(self.linear(x))
 
         linear_input = torch.rand(8, 5)
 
@@ -130,15 +157,18 @@ def forward(self, x):
              (5, 5),
             (10, 5))
         ]
-        no_convert = {
-            ns.call_function(torch.quantize_per_tensor): 2,
-            ns.call_method("dequantize"): 2,
-        }
-        self._test_module(
-            LinearModule(),
-            [linear_input],
-            shape_ranges,
-            no_convert=no_convert)
+        for has_relu, f_relu in itertools.product([True, False], [True, False]):
+            # when has_relu=False, we have torch.nn.Identity, which would introduce
+            # extra quant-dequant pair
+            no_convert = {
+                ns.call_function(torch.quantize_per_tensor): 2 + int(not has_relu),
+                ns.call_method("dequantize"): 2 + int(not has_relu),
+            }
+            self._test_module(
+                LinearModule(has_relu, f_relu),
+                [linear_input],
+                shape_ranges,
+                no_convert=no_convert)
 
     def test_ops(self):
         class M(torch.nn.Module):
```
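
The `2 + int(not has_relu)` counts above are the point of these tests: when `has_relu=False` the model ends in `torch.nn.Identity`, which only shares its input observer, so one extra quantize/dequantize pair survives convert. A standalone sketch of the expected counts (illustration only, not part of the test file):

```python
import itertools

# Same variant grid as the loops above: two quantize_per_tensor/dequantize
# pairs around the weighted op, plus one more pair when the trailing
# torch.nn.Identity is present (has_relu=False).
for has_relu, f_relu in itertools.product([True, False], [True, False]):
    expected = 2 + int(not has_relu)
    print(f"has_relu={has_relu}, f_relu={f_relu}: "
          f"{expected} quantize_per_tensor / {expected} dequantize nodes")
```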

torch/ao/quantization/fx/_convert_do_not_use.py

Lines changed: 71 additions & 13 deletions
```diff
@@ -35,6 +35,34 @@
 
 from .convert import restore_state
 
+# these are tuples so that they can work with isinstance(module, tuple_of_classes)
+WEIGHTED_MODULE_CLASSES = (
+    torch.nn.Linear,
+    torch.nn.Conv1d,
+    torch.nn.Conv2d,
+    torch.nn.Conv3d
+)
+
+FUSED_MODULE_CLASSES = (
+    torch.nn.intrinsic.LinearReLU,
+    torch.nn.intrinsic.ConvReLU1d,
+    torch.nn.intrinsic.ConvReLU2d,
+    torch.nn.intrinsic.ConvReLU3d,
+)
+
+QAT_MODULE_CLASSES = (
+    torch.nn.qat.Linear,
+    torch.nn.qat.Conv2d,
+    torch.nn.qat.Conv3d,
+    torch.nn.intrinsic.qat.LinearReLU,
+    torch.nn.intrinsic.qat.ConvBn2d,
+    torch.nn.intrinsic.qat.ConvBnReLU2d,
+    torch.nn.intrinsic.qat.ConvReLU2d,
+    torch.nn.intrinsic.qat.ConvBn3d,
+    torch.nn.intrinsic.qat.ConvBnReLU3d,
+    torch.nn.intrinsic.qat.ConvReLU3d
+)
+
 def _convert_do_not_use(
     model: GraphModule, is_reference: bool = False,
     convert_custom_config_dict: Dict[str, Any] = None,
@@ -64,7 +92,7 @@ def _convert_do_not_use(
     patterns, node_name_to_scope, prepare_custom_config_dict, observed_node_names = restore_state(model)
     qconfig_map: Dict[str, QConfigAny] = model._qconfig_map  # type: ignore[assignment]
 
-    assert is_reference, "convert2 only supports reference option"
+    assert is_reference, "_convert_do_not_use only supports reference option"
 
     # mapping from fully qualified module name to module instance
     # for example,
@@ -167,24 +195,54 @@ def replace_observer_with_quantize_dequantize_node(graph: Graph, node: Node, mod
         elif node.op == "call_module":
             if is_activation_post_process(modules[node.target]):
                 replace_observer_with_quantize_dequantize_node(model.graph, node, modules)
-            elif type(modules[node.target]) in [
-                    torch.nn.Linear,
-                    torch.nn.Conv1d,
-                    torch.nn.Conv2d,
-                    torch.nn.Conv3d]:
-                fmodule = modules[node.target]
-                qconfig = fmodule.qconfig
+            elif type(modules[node.target]) in set(
+                    WEIGHTED_MODULE_CLASSES).union(QAT_MODULE_CLASSES).union(FUSED_MODULE_CLASSES):
+                # TODO: refactor this part to a function
+                original_module = modules[node.target]
+                qconfig = original_module.qconfig
 
                 is_observed = node.name in observed_node_names
                 is_weight_quantized = weight_is_statically_quantized(qconfig)
                 # TODO: rename weight_is_statically_quantized to weight_is_int8_quantized
-                if qconfig is not None and is_observed and is_weight_quantized:
+                if qconfig is None or not is_observed or not is_weight_quantized:
+                    continue
+
+                float_module = original_module
+                fused_module = None
+                if isinstance(
+                        original_module,
+                        QAT_MODULE_CLASSES):
+                    # case 1. converting qat module to
+                    # a float module, we need to attach
+                    # weight fake_quant to the module,
+                    # weight fake_quant is assumed to be run during
+                    # QAT so we don't need to run it again here
+                    float_module = original_module.to_float()  # type: ignore[operator]
+                    # change qat conv to conv
+                    parent_name, name = _parent_name(node.target)
+                    setattr(modules[parent_name], name, float_module)
+                    if isinstance(float_module, torch.nn.intrinsic._FusedModule):
+                        fused_module = float_module
+                        float_module = fused_module[0]
+                    weight_post_process = original_module.weight_fake_quant
+                else:
+                    # case 2. converting a float module/fused float module
+                    # to float module, we need to attach
+                    # weight observer to the conv module and run it
+                    # with conv weight
+                    if isinstance(original_module, torch.nn.intrinsic._FusedModule):
+                        fused_module = original_module
+                        float_module = fused_module[0]  # type: ignore[index]
+                    assert qconfig is not None
                     weight_post_process = qconfig.weight()
                     # run weight observer
-                    weight_post_process(fmodule.weight)  # type: ignore[operator]
-                    weight_qparams = get_qparam_dict(weight_post_process)
-                    ref_qmodule_cls = get_static_quant_module_class(type(fmodule), is_reference=True)
-                    ref_qmodule = ref_qmodule_cls.from_float(fmodule, weight_qparams)
+                    weight_post_process(float_module.weight)  # type: ignore[operator]
+                weight_qparams = get_qparam_dict(weight_post_process)
+                ref_qmodule_cls = get_static_quant_module_class(type(float_module), is_reference=True)
+                ref_qmodule = ref_qmodule_cls.from_float(float_module, weight_qparams)
+                if fused_module is not None:
+                    fused_module[0] = ref_qmodule
+                else:
                     parent_name, name = _parent_name(node.target)
                     setattr(modules[parent_name], name, ref_qmodule)
```

torch/ao/quantization/fx/backend_config_dict/tensorrt.py

Lines changed: 60 additions & 0 deletions
```diff
@@ -33,26 +33,86 @@ def get_tensorrt_backend_config_dict():
             weighted_op_qint8_dtype_config,
         ]
     }
+    # TODO: maybe make "pattern" to be a list of patterns
+    # TODO: current patterns are the ones after fusion, we will want to expose fusion
+    # here as well in the future, maybe we need to
+    # linear_relu_mm_config = {
+    #     "pattern": (torch.nn.ReLU, torch.nn.Linear),
+    #     "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+    #     "dtype_configs": [
+    #         weighted_op_qint8_dtype_config,
+    #     ]
+    # }
+    # linear_relu_mf_config = {
+    #     "pattern": (torch.nn.functional.relu, torch.nn.Linear),
+    #     "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+    #     "dtype_configs": [
+    #         weighted_op_qint8_dtype_config,
+    #     ]
+    # }
+
+    linear_relu_fused_config = {
+        "pattern": torch.nn.intrinsic.LinearReLU,
+        "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+        "dtype_configs": [
+            weighted_op_qint8_dtype_config,
+        ]
+    }
     conv_module_config = {
         "pattern": torch.nn.Conv2d,
         "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
         "dtype_configs": [
             weighted_op_qint8_dtype_config,
         ]
     }
+    conv_relu_1d_fused_config = {
+        "pattern": torch.nn.intrinsic.ConvReLU1d,
+        "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+        "dtype_configs": [
+            weighted_op_qint8_dtype_config,
+        ]
+    }
+    conv_relu_2d_fused_config = {
+        "pattern": torch.nn.intrinsic.ConvReLU2d,
+        "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+        "dtype_configs": [
+            weighted_op_qint8_dtype_config,
+        ]
+    }
+    conv_relu_3d_fused_config = {
+        "pattern": torch.nn.intrinsic.ConvReLU3d,
+        "observation_type": ObservationType.OUTPUT_USE_DIFFERENT_OBSERVER_AS_INPUT,
+        "dtype_configs": [
+            weighted_op_qint8_dtype_config,
+        ]
+    }
     cat_config = {
         "pattern": torch.cat,
         "observation_type": ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT,
         "dtype_configs": [
             non_weighted_op_qint8_dtype_config,
         ]
     }
+    identity_config = {
+        "pattern": torch.nn.Identity,
+        "observation_type": ObservationType.OUTPUT_SHARE_OBSERVER_WITH_INPUT,
+        "dtype_configs": [
+            non_weighted_op_qint8_dtype_config,
+        ]
+    }
     return {
         # optional
         "name": "tensorrt",
         "configs": [
             linear_module_config,
+            linear_relu_fused_config,
             conv_module_config,
+            # conv1d is not supported in fx2trt
+            # conv_relu_1d_fused_config,
+            conv_relu_2d_fused_config,
+            # conv3d is not supported in fx2trt
+            # conv_relu_3d_fused_config,
             cat_config,
+            identity_config,
         ]
     }
```
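
A small usage sketch for the extended backend config (the import path mirrors the file location above; how prepare/convert consume the dict is outside this diff):

```python
from torch.ao.quantization.fx.backend_config_dict.tensorrt import (
    get_tensorrt_backend_config_dict,
)

# Print which patterns the TensorRT backend now recognizes and how each
# is observed; the LinearReLU, ConvReLU2d, and Identity entries are the
# ones added in this commit.
cfg = get_tensorrt_backend_config_dict()
for entry in cfg["configs"]:
    print(entry["pattern"], "->", entry["observation_type"])
```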
