Qualcomm AI Engine Direct - OSS models breakage fix (#10191)

DannyYuyang-quic · web-flow · commit e6bdeda554fe · 2025-04-16T10:08:42.000-07:00
### Summary
- Fastvit breakage fix
- ConvFormer breakage fix
- Changed dataset for Edsr due to unavailable dataset link
- Add test case for ConvertSquareToPow pass

### Test plan
```bash
python ./examples/qualcomm/oss_scripts/fastvit.py -m ${soc} -b build-android -H ${host_id} -s ${device_id} --oss_repo ${Path_to_oss_repo} --pretrained_weight ${Path_to_pretrained_weight} -d ${Path_to_dataset_dir}
```
```bash
python ./examples/qualcomm/oss_scripts/conv_former.py -m ${soc} -b build-android -H ${host_id} -s ${device_id} -d ${Path_to_dataset_dir}
```

cc @cccclai @cbilgin
diff --git a/backends/qualcomm/_passes/__init__.py b/backends/qualcomm/_passes/__init__.py
@@ -9,6 +9,7 @@
 from .annotate_unbind import AnnotateUnbind
 from .convert_bmm_to_matmul import ConvertBmmToMatmul
 from .convert_conv1d_to_conv2d import ConvertConv1dToConv2d
+from .convert_square_to_pow import ConvertSquareToPow
 from .convert_upsample_bicubic2d import ConvertUpsampleBicubicWithBilinear
 from .decompose_any import DecomposeAny
 from .decompose_cdist import DecomposeCDist
@@ -42,6 +43,7 @@
     AnnotateUnbind,
     ConvertBmmToMatmul,
     ConvertConv1dToConv2d,
+    ConvertSquareToPow,
     ConvertUpsampleBicubicWithBilinear,
     DecomposeAny,
     DecomposeCDist,
diff --git a/backends/qualcomm/_passes/convert_square_to_pow.py b/backends/qualcomm/_passes/convert_square_to_pow.py
@@ -0,0 +1,38 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import torch
+from executorch.exir.pass_base import ExportPass, PassResult
+
+from .utils import copy_meta
+
+
+class ConvertSquareToPow(ExportPass):
+    """
+    Convert aten.square.default nodes to aten.pow.Tensor_Scalar with exponent 2.
+    This allows LiftConstantScalarOperands to lift the scalar exponent into a tensor.
+    Otherwise, the square op will be converted to pow.tensor_scalar after to_edge.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if node.target == torch.ops.aten.square.default:
+                input_node = node.args[0]  # the tensor being squared
+                with graph_module.graph.inserting_after(input_node):  # new node goes right after its operand
+                    pow_op = torch.ops.aten.pow.Tensor_Scalar
+                    pow_node = graph.create_node(
+                        "call_function", pow_op, (input_node, 2)
+                    )
+                    pow_node.meta = copy_meta(node.meta)  # meta is derived from the square node's meta
+                for user in node.users.copy():  # iterate over a copy while the users are being rewired
+                    user.replace_input_with(node, pow_node)
+
+        graph.eliminate_dead_code()  # square nodes have no users left after the rewiring above
+        graph_module.recompile()
+        return PassResult(graph_module, True)  # NOTE(review): reports "modified" even if no square node was found
diff --git a/backends/qualcomm/_passes/qnn_pass_manager.py b/backends/qualcomm/_passes/qnn_pass_manager.py
@@ -14,6 +14,7 @@
     AnnotateUnbind,
     ConvertBmmToMatmul,
     ConvertConv1dToConv2d,
+    ConvertSquareToPow,
     ConvertUpsampleBicubicWithBilinear,
     DecomposeAny,
     DecomposeCDist,
@@ -199,6 +200,7 @@ def transform_for_export_pipeline(self, exported_program: ExportedProgram):
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeLinalgVectorNorm(quantization_capture=True))
         self.add_pass(DecomposeExpM1())
+        self.add_pass(ConvertSquareToPow())
         self.add_pass(LiftConstantScalarOperands())
         self._transform(exported_program.graph_module)
         ep = lift_constant_tensor_pass(exported_program)
diff --git a/backends/qualcomm/tests/models.py b/backends/qualcomm/tests/models.py
@@ -1436,6 +1436,15 @@ def forward(self, x):
         return x / torch.sqrt(torch.tensor([64.0]))
 
 
+class SquaredReLU(torch.nn.Module):  # test model: torch.square applied to the ReLU output
+    def __init__(self, inplace=False):
+        super().__init__()
+        self.relu = torch.nn.ReLU(inplace=inplace)  # inplace is forwarded unchanged to ReLU
+
+    def forward(self, x):
+        return torch.square(self.relu(x))
+
+
 class Squeeze(torch.nn.Module):
     def __init__(self):
         super().__init__()
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -843,6 +843,11 @@ def test_qnn_backend_softmax(self):
         sample_input = (torch.randn([1, 4, 8, 8]),)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_squared_relu(self):  # lowers SquaredReLU directly, with no get_qdq_module step
+        module = SquaredReLU()  # noqa: F405
+        sample_input = (torch.randn([2, 5, 1, 3]),)  # one random tensor of shape [2, 5, 1, 3]
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_squeeze(self):
         module = Squeeze()  # noqa: F405
         sample_input = (torch.randn([1, 3, 3]),)
@@ -2001,6 +2006,12 @@ def test_qnn_backend_softmax(self):
         module = self.get_qdq_module(module, sample_input)
         self.lower_module_and_test_output(module, sample_input)
 
+    def test_qnn_backend_squared_relu(self):  # same model, passed through get_qdq_module before lowering
+        module = SquaredReLU()  # noqa: F405
+        sample_input = (torch.randn([2, 5, 1, 3]),)  # one random tensor of shape [2, 5, 1, 3]
+        module = self.get_qdq_module(module, sample_input)  # presumably the quantized (QDQ) variant - confirm
+        self.lower_module_and_test_output(module, sample_input)
+
     def test_qnn_backend_squeeze(self):
         module = Squeeze()  # noqa: F405
         sample_input = (torch.randn([1, 3, 3]),)
@@ -3642,7 +3653,7 @@ def test_efficientSAM(self):
             self.skipTest("missing required envs")
         cmds = [
             "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/efficientSAM.py",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/efficientSAM/efficientSAM.py",
             "--dataset",
             self.image_dataset,
             "--artifact",
diff --git a/examples/qualcomm/oss_scripts/conv_former.py b/examples/qualcomm/oss_scripts/conv_former.py
@@ -12,10 +12,14 @@
 import numpy as np
 import timm
 import torch
-from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
-from executorch.backends.qualcomm.utils.constants import (
-    QCOM_PASS_EXPAND_BROADCAST_SHAPE,
+from executorch.backends.qualcomm._passes.expand_broadcast_tensor_shape import (
+    ExpandBroadcastTensorShape,
+)
+from executorch.backends.qualcomm._passes.qnn_pass_manager import (
+    get_capture_program_passes,
 )
+from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+from executorch.backends.qualcomm.utils.constants import QCOM_PASS_ACTIVATE_KEY
 from executorch.examples.qualcomm.utils import (
     build_executorch_binary,
     get_imagenet_dataset,
@@ -55,14 +59,17 @@ def main(args):
 
     model = model.eval()
 
+    # lower to QNN
+    passes_job = get_capture_program_passes()
+    passes_job[ExpandBroadcastTensorShape][QCOM_PASS_ACTIVATE_KEY] = True
     build_executorch_binary(
         model,
         inputs[0],
         args.model,
         f"{args.artifact}/{pte_filename}",
         inputs,
         quant_dtype=QuantDtype.use_8a8w,
-        custom_pass_config={QCOM_PASS_EXPAND_BROADCAST_SHAPE},
+        passes_job=passes_job,
     )
 
     if args.compile_only:
diff --git a/examples/qualcomm/oss_scripts/fastvit.py b/examples/qualcomm/oss_scripts/fastvit.py
@@ -101,16 +101,16 @@ def main(args):
         ),
     )
     # rewrite default per-channel ptq config
-    quantizer.per_channel_quant_config = QuantizationConfig(
+    quantizer.default_quant_config.per_channel_quant_config = QuantizationConfig(
         input_activation=act_qspec,
         output_activation=act_qspec,
         weight=weight_qspec,
         bias=_derived_bias_quant_spec,
     )
 
     # rewrite default ptq config
-    q_config = quantizer.quant_config
-    quantizer.quant_config = QuantizationConfig(
+    q_config = quantizer.default_quant_config.quant_config
+    quantizer.default_quant_config.quant_config = QuantizationConfig(
         input_activation=act_qspec,
         output_activation=act_qspec,
         weight=q_config.weight,
diff --git a/examples/qualcomm/scripts/edsr.py b/examples/qualcomm/scripts/edsr.py
@@ -24,7 +24,7 @@
 
 from PIL import Image
 from torch.utils.data import Dataset
-from torchsr.datasets import B100
+from torchsr.datasets import B100, Div2K
 from torchvision.transforms.functional import to_pil_image, to_tensor
 
 
@@ -75,6 +75,16 @@ def get_b100(
     return SrDataset(hr_dir, lr_dir)
 
 
+def get_Div2K(  # returns an SrDataset over the DIV2K_valid_HR and X2 bicubic low-res folders
+    dataset_dir: str,
+):
+    hr_dir = f"{dataset_dir}/sr_bm_dataset/DIV2K/DIV2K_valid_HR"
+    lr_dir = f"{dataset_dir}/sr_bm_dataset/DIV2K/DIV2K_valid_LR_bicubic/X2"
+    if not os.path.exists(hr_dir) or not os.path.exists(lr_dir):  # download only when either folder is missing
+        Div2K(root=f"{dataset_dir}/sr_bm_dataset", scale=2, download=True)  # NOTE(review): no split argument is passed - confirm this populates the *_valid_* folders
+    return SrDataset(hr_dir, lr_dir)
+
+
 def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str):
     if not (lr_dir and hr_dir) and not default_dataset:
         raise RuntimeError(
@@ -85,7 +95,7 @@ def get_dataset(hr_dir: str, lr_dir: str, default_dataset: str, dataset_dir: str
         raise RuntimeError("Either use custom dataset, or use default dataset.")
 
     if default_dataset:
-        return get_b100(dataset_dir)
+        return get_Div2K(dataset_dir)
 
     return SrDataset(hr_dir, lr_dir)