
Commit a9b80ee

Author: Ivan Garcia
Add support for transposed grouped convolution in torch to linalg lowering

1 parent: 6ea4d11

4 files changed: +178 −31 lines

lib/Conversion/TorchToLinalg/Linear.cpp

Lines changed: 72 additions & 31 deletions

@@ -955,7 +955,50 @@ class ConvertAtenConvolutionOp : public OpConversionPattern<AtenConvolutionOp> {
       if (isa<mlir::IntegerType>(inputDTy))
         pad = rewriter.create<arith::TruncIOp>(op.getLoc(), inputDTy, pad);
     }
+
+    // This code was moved earlier because in the grouped transposed
+    // convolution case we need to expand before doing the dimension
+    // permutation. For the grouped non-transposed convolution we don't need
+    // any filter/channel dimension flipping; we can expand the group from the
+    // filter in place, so that the group dimension ends up in front:
+    //   expand F,C,H,W -> G,F/G,C,H,W
+    //
+    // For the grouped transposed convolution we first expand the input
+    // channel:
+    //   expand C,F,H,W -> G,C/G,F,H,W
+    //
+    // and then swap the output filters with the input channel to make it
+    // linalg compatible:
+    //   permute G,C/G,F,H,W -> G,F,C/G,H,W
+    //
+    // Notice that if the flipping happened first, we could not move the group
+    // dimension to the front, as the linalg convolution operation requires.
+    auto expandWeight = [&](Value tensor) {
+      auto inType = cast<RankedTensorType>(tensor.getType());
+      auto inShape = makeShapeTorchCompatible(inType.getShape());
+
+      SmallVector<int64_t> outShape{numGroups,
+                                    (inShape[0] == kUnknownSize
+                                         ? kUnknownSize
+                                         : (inShape[0] / numGroups)),
+                                    inShape[1]};
+      outShape.append(inShape.begin() + 2, inShape.end());
+
+      SmallVector<ReassociationIndices> indices{};
+      int currIndex = 0;
+      indices.push_back({0, 1});
+      currIndex += 2;
+      for (int i = currIndex; i <= (long)inShape.size(); i++)
+        indices.push_back({i});
+
+      auto retType = inType.clone(makeShapeLLVMCompatible(outShape));
+      return rewriter.create<tensor::ExpandShapeOp>(loc, retType, tensor,
+                                                    indices);
+    };
+
     if (transposed) {
+      bool isGroupedConv = numGroups > 1;
+      weight = isGroupedConv ? expandWeight(weight) : weight;
+
       Value c0 =
           rewriter.create<arith::ConstantOp>(loc, rewriter.getIndexAttr(0));
       Value c1 =
@@ -965,25 +1008,40 @@ class ConvertAtenConvolutionOp : public OpConversionPattern<AtenConvolutionOp> {

     // Transpose and flip weight
     SmallVector<Value> weightInitDims = getTensorSizes(rewriter, loc, weight);
-    std::iter_swap(weightInitDims.begin(), weightInitDims.begin() + 1);
-    outDims[1] = weightInitDims[0];
+    if (isGroupedConv) {
+      // We need to skip the first (group) dimension in this case; the output
+      // channel dimension also needs to account for the number of groups.
+      std::iter_swap(weightInitDims.begin() + 1, weightInitDims.begin() + 2);
+      auto numGroupsVal =
+          rewriter.create<mlir::arith::ConstantIndexOp>(loc, numGroups);
+      outDims[1] = rewriter.createOrFold<mlir::arith::MulIOp>(
+          loc, weightInitDims[1], numGroupsVal);
+    } else {
+      std::iter_swap(weightInitDims.begin(), weightInitDims.begin() + 1);
+      outDims[1] = weightInitDims[0];
+    }
+    auto weightRank = weightInitDims.size();
     Value weightInitTensor =
         createZeroInitTensor(rewriter, loc, weightInitDims, weightDTy);
     SmallVector<utils::IteratorType> iteratorTypes(
-        inRank, utils::IteratorType::parallel);
+        weightRank, utils::IteratorType::parallel);
     SmallVector<AffineMap> indexingMaps{
-        AffineMap::getMultiDimIdentityMap(inRank, context)};
+        AffineMap::getMultiDimIdentityMap(weightRank, context)};
     weight = rewriter
                  .create<linalg::GenericOp>(
                      loc, weightInitTensor.getType(), ValueRange{},
                      weightInitTensor, indexingMaps, iteratorTypes,
                      [&](OpBuilder &b, Location loc, ValueRange args) {
                        SmallVector<Value> indices;
-                       for (size_t i = 0; i < inRank; i++)
+                       for (size_t i = 0; i < weightRank; i++)
                          indices.push_back(b.create<linalg::IndexOp>(loc, i));
-                       std::iter_swap(indices.begin(), indices.begin() + 1);
-                       // Flip only the spatial dimensions (from 2 to inRank)
-                       for (size_t flipDim = 2; flipDim < inRank; flipDim++) {
+                       auto fcIdxSwapOffset = isGroupedConv ? 1 : 0;
+                       std::iter_swap(indices.begin() + fcIdxSwapOffset,
+                                      indices.begin() + fcIdxSwapOffset + 1);
+                       // Flip only the spatial dimensions (from
+                       // fcIdxSwapOffset + 2 to weightRank).
+                       for (size_t flipDim = fcIdxSwapOffset + 2;
+                            flipDim < weightRank; flipDim++) {
                          indices[flipDim] = b.create<arith::SubIOp>(
                              loc,
                              b.create<arith::SubIOp>(
@@ -1373,43 +1431,26 @@ class ConvertAtenConvolutionOp : public OpConversionPattern<AtenConvolutionOp> {
                                                     indices);
     };

-    // expand F,C,H,W -> G,F/G,C,H,W
-    auto expandWeight = [&](Value tensor) {
-      auto inType = cast<RankedTensorType>(tensor.getType());
-      auto inShape = makeShapeTorchCompatible(inType.getShape());
-
-      SmallVector<int64_t> outShape{
-          numGroups,
-          (inShape[0] == kUnknownSize ? kUnknownSize : inShape[0] / numGroups)};
-      outShape.append(inShape.begin() + 1, inShape.end());
-
-      SmallVector<ReassociationIndices> indices{{0, 1}};
-      for (auto i = 2; i <= (long)inShape.size(); i++)
-        indices.push_back({i});
-
-      auto retType = inType.clone(makeShapeLLVMCompatible(outShape));
-      return rewriter.create<tensor::ExpandShapeOp>(loc, retType, tensor,
-                                                    indices);
-    };
-
     Value paddedInputExpanded = expandGroups(paddedInput, 1);
-    Value weightExpanded = expandWeight(weight);
+    // For a transposed convolution the weight was already expanded before the
+    // dimension permutation above; see the comments on the expandWeight
+    // lambda definition for details.
+    weight = transposed ? weight : expandWeight(weight);
     auto expandOutputTensor = expandGroups(outputTensor, 1);

     // TODO: add 1D and 3D case
     if (!inputZp) {
       conv = rewriter
                  .create<linalg::Conv2DNgchwGfchwOp>(
                      loc, expandOutputTensor.getResultType(),
-                     ValueRange{paddedInputExpanded, weightExpanded},
+                     ValueRange{paddedInputExpanded, weight},
                      expandOutputTensor.getResult(), stridesAttr, dilationAttr)
                  .getResult(0);
     } else {
       conv = rewriter
                  .create<linalg::Conv2DNgchwGfchwQOp>(
                      loc, expandOutputTensor.getResultType(),
-                     ValueRange{paddedInputExpanded, weightExpanded, inputZp,
-                                weightZp},
+                     ValueRange{paddedInputExpanded, weight, inputZp, weightZp},
                      expandOutputTensor.getResult(), stridesAttr, dilationAttr)
                  .getResult(0);
     }
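
Aside: the expand-then-permute order that the new expandWeight lambda sets up can be sanity-checked outside the lowering. The following standalone PyTorch sketch (not part of this commit; shapes taken from the new test) mirrors the C,F,H,W -> G,C/G,F,H,W expansion and the G,C/G,F,H,W -> G,F,C/G,H,W permutation on a transposed-convolution weight; the spatial flip the lowering also performs is omitted for brevity:

```python
import torch

G = 2                        # number of groups
w = torch.randn(2, 2, 3, 3)  # transposed-conv weight: (C_in, C_out/G, kH, kW)

# expand C,F,H,W -> G,C/G,F,H,W (mirrors the tensor.expand_shape)
w_grouped = w.reshape(G, w.shape[0] // G, w.shape[1], *w.shape[2:])

# permute G,C/G,F,H,W -> G,F,C/G,H,W (mirrors the linalg.generic transpose)
w_gfchw = w_grouped.permute(0, 2, 1, 3, 4)

print(w_gfchw.shape)  # torch.Size([2, 2, 1, 3, 3]), the g,f,c,h,w filter layout
```

Doing the permutation first would interleave the group and channel dimensions, so the group dimension could no longer be split off as the outermost axis, which is exactly the ordering constraint the comment block above describes.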

projects/pt1/e2e_testing/xfail_sets.py

Lines changed: 2 additions & 0 deletions

@@ -3523,6 +3523,7 @@
     "ConvolutionModule2DTransposeStridedStatic_basic",
     "ConvolutionModule2DTransposeStrided_basic",
     "ConvolutionModule2DTranspose_basic",
+    "ConvolutionModule2DGroupedTranspose_basic",
     "CumsumInputDtypeInt32Module_basic",
     "CumsumModule_basic",
     "CumsumStaticModule_basic",
@@ -4099,6 +4100,7 @@
     "ConvolutionModule2DTransposeStridedStatic_basic",
     "ConvolutionModule2DTransposeStrided_basic",
     "ConvolutionModule2DTranspose_basic",
+    "ConvolutionModule2DGroupedTranspose_basic",
     "CopyModule_basic",
     "CopyWithDifferentDTypesAndSizesModule_basic",
     "CopyWithDifferentDTypesModule_basic",

projects/pt1/python/torch_mlir_e2e_test/test_suite/conv.py

Lines changed: 32 additions & 0 deletions

@@ -1725,3 +1725,35 @@ def DeformConv2D_basic(module, tu: TestUtils):
     offset = tu.rand(N, offset_dim1, Hout, Wout)
     weight = tu.rand(Cout, Cin, Hker, Wker)
     module.forward(input, offset, weight)
+
+
+class ConvolutionModule2DGroupedTranspose(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    @export
+    @annotate_args(
+        [
+            None,
+            ([1, 2, 5, 7], torch.float32, True),
+            ([2, 2, 3, 3], torch.float32, True),
+            ([4], torch.float32, True),
+        ]
+    )
+    def forward(self, inputVec, weight, bias):
+        return torch.ops.aten.convolution(
+            inputVec,
+            weight,
+            bias=bias,
+            stride=[2, 2],
+            padding=[1, 1],
+            dilation=[1, 1],
+            transposed=True,
+            output_padding=[0, 0],
+            groups=2,
+        )
+
+
+@register_test_case(module_factory=lambda: ConvolutionModule2DGroupedTranspose())
+def ConvolutionModule2DGroupedTranspose_basic(module, tu: TestUtils):
+    module.forward(tu.rand(1, 2, 5, 7), tu.rand(2, 2, 3, 3), tu.rand(4))
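
As a shape sanity check on the new test (a standalone sketch, not part of the commit): PyTorch computes the transposed-convolution output size as H_out = (H_in - 1)*stride - 2*padding + dilation*(kernel - 1) + output_padding + 1, so the module above should return a 1x4x9x13 tensor:

```python
import torch

x = torch.rand(1, 2, 5, 7)
w = torch.rand(2, 2, 3, 3)  # (C_in, C_out/groups, kH, kW)
b = torch.rand(4)

y = torch.ops.aten.convolution(
    x, w, bias=b, stride=[2, 2], padding=[1, 1], dilation=[1, 1],
    transposed=True, output_padding=[0, 0], groups=2,
)
# H: (5-1)*2 - 2*1 + 1*(3-1) + 0 + 1 = 9
# W: (7-1)*2 - 2*1 + 1*(3-1) + 0 + 1 = 13
# C_out: 2 (per-group filters) * 2 (groups) = 4
print(y.shape)  # torch.Size([1, 4, 9, 13])
```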

test/Conversion/TorchToLinalg/convolution.mlir

Lines changed: 72 additions & 0 deletions

@@ -76,3 +76,75 @@ func.func @conv_broadcast(%arg0: !torch.vtensor<[1,80,3000],f32>, %arg1: !torch.
   %2 = torch.aten.convolution %arg0, %arg1, %arg2, %0, %0, %0, %false, %1, %int1 : !torch.vtensor<[1,80,3000],f32>, !torch.vtensor<[1024,80,3],f32>, !torch.vtensor<[1024],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,1024,3000],f32>
   return %2 : !torch.vtensor<[1,1024,3000],f32>
 }
+
+// CHECK-LABEL: func.func @transposedConv2D(
+// CHECK-SAME: %[[arg0:.*]]: !torch.vtensor<[1,2,5,7],f32>) -> !torch.vtensor<[1,4,10,14],f32>
+// CHECK: = linalg.generic
+// CHECK-SAME: outs(%[[VAR1:.*]] : tensor<4x2x3x3xf32>) {
+// CHECK: %[[VAR2:.*]] = tensor.extract
+// CHECK-SAME: : tensor<2x4x3x3xf32>
+// CHECK-NEXT: linalg.yield %[[VAR3:.*]] : f32
+// CHECK-NEXT: } -> tensor<4x2x3x3xf32>
+// CHECK: %[[VAR4:.*]] = linalg.broadcast ins(%[[VAR5:.*]] : tensor<4xf32>) outs(%[[VAR6:.*]] : tensor<1x4x11x15xf32>) dimensions = [0, 2, 3]
+// CHECK: %[[VAR7:.*]] = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
+// CHECK-SAME: ins(%[[VAR8:.*]], %[[VAR9:.*]] : tensor<1x2x13x17xf32>, tensor<4x2x3x3xf32>) outs(%[[VAR10:.*]] : tensor<1x4x11x15xf32>) -> tensor<1x4x11x15xf32>
+// CHECK-NEXT: %[[VAR11:.*]] = tensor.cast %[[VAR12:.*]] : tensor<1x4x11x15xf32> to tensor<1x4x?x?xf32>
+func.func @transposedConv2D(%arg0: !torch.vtensor<[1,2,5,7],f32>) -> !torch.vtensor<[1,4,10,14],f32> attributes {torch.assume_strict_symbolic_shapes} {
+  %int0 = torch.constant.int 0
+  %true = torch.constant.bool true
+  %int1 = torch.constant.int 1
+  %int2 = torch.constant.int 2
+  %0 = torch.vtensor.literal(dense_resource<torch_tensor_2_4_3_3_torch.float32> : tensor<2x4x3x3xf32>) : !torch.vtensor<[2,4,3,3],f32>
+  %1 = torch.vtensor.literal(dense_resource<torch_tensor_4_torch.float32> : tensor<4xf32>) : !torch.vtensor<[4],f32>
+  %2 = torch.prim.ListConstruct %int2, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
+  %3 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %4 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %5 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %6 = torch.aten.convolution %arg0, %0, %1, %2, %3, %4, %true, %5, %int1 : !torch.vtensor<[1,2,5,7],f32>, !torch.vtensor<[2,4,3,3],f32>, !torch.vtensor<[4],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,4,10,14],f32>
+  return %6 : !torch.vtensor<[1,4,10,14],f32>
+}
+
+// CHECK-LABEL: func.func @groupedConvolution2D(
+// CHECK-SAME: %[[arg0:.*]]: !torch.vtensor<[1,4,5,7],f32>) -> !torch.vtensor<[1,4,5,7],f32>
+// CHECK: %[[VAR1:.*]] = linalg.broadcast ins(%[[VAR2:.*]] : tensor<4xf32>) outs(%[[VAR3:.*]] : tensor<1x4x5x7xf32>) dimensions = [0, 2, 3]
+// CHECK: %[[VAR4:.*]] = linalg.conv_2d_ngchw_gfchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
+// CHECK-SAME: ins(%[[VAR5:.*]], %[[VAR6:.*]] : tensor<1x2x2x7x9xf32>, tensor<2x2x2x3x3xf32>) outs(%[[VAR7:.*]] : tensor<1x2x2x5x7xf32>) -> tensor<1x2x2x5x7xf32>
+// CHECK-NEXT: %[[VAR8:.*]] = tensor.collapse_shape
+// CHECK-SAME: tensor<1x2x2x5x7xf32> into tensor<1x4x5x7xf32>
+func.func @groupedConvolution2D(%arg0: !torch.vtensor<[1,4,5,7],f32>) -> !torch.vtensor<[1,4,5,7],f32> attributes {torch.assume_strict_symbolic_shapes} {
+  %int0 = torch.constant.int 0
+  %false = torch.constant.bool false
+  %int1 = torch.constant.int 1
+  %int2 = torch.constant.int 2
+  %0 = torch.vtensor.literal(dense_resource<torch_tensor_4_2_3_3_torch.float32> : tensor<4x2x3x3xf32>) : !torch.vtensor<[4,2,3,3],f32>
+  %1 = torch.vtensor.literal(dense_resource<torch_tensor_4_torch.float32> : tensor<4xf32>) : !torch.vtensor<[4],f32>
+  %2 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %3 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %4 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %5 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %6 = torch.aten.convolution %arg0, %0, %1, %2, %3, %4, %false, %5, %int2 : !torch.vtensor<[1,4,5,7],f32>, !torch.vtensor<[4,2,3,3],f32>, !torch.vtensor<[4],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,4,5,7],f32>
+  return %6 : !torch.vtensor<[1,4,5,7],f32>
+}
+
+// CHECK-LABEL: func.func @transposedGroupedConvolution2D(
+// CHECK-SAME: %[[arg0:.*]]: !torch.vtensor<[1,2,5,7],f32>) -> !torch.vtensor<[1,4,10,14],f32>
+// CHECK: %[[VAR1:.*]] = linalg.broadcast ins(%[[VAR2:.*]] : tensor<4xf32>) outs(%[[VAR3:.*]] : tensor<1x4x11x15xf32>) dimensions = [0, 2, 3]
+// CHECK: %[[VAR4:.*]] = linalg.conv_2d_ngchw_gfchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>}
+// CHECK-SAME: ins(%[[VAR5:.*]], %[[VAR6:.*]] : tensor<1x2x1x13x17xf32>, tensor<2x2x1x3x3xf32>) outs(%[[VAR7:.*]] : tensor<1x2x2x11x15xf32>) -> tensor<1x2x2x11x15xf32>
+// CHECK-NEXT: %[[VAR8:.*]] = tensor.collapse_shape
+// CHECK-SAME: tensor<1x2x2x11x15xf32> into tensor<1x4x11x15xf32>
+func.func @transposedGroupedConvolution2D(%arg0: !torch.vtensor<[1,2,5,7],f32>) -> !torch.vtensor<[1,4,10,14],f32> attributes {torch.assume_strict_symbolic_shapes} {
+  %int0 = torch.constant.int 0
+  %true = torch.constant.bool true
+  %int1 = torch.constant.int 1
+  %int2 = torch.constant.int 2
+  %0 = torch.vtensor.literal(dense_resource<torch_tensor_2_2_3_3_torch.float32> : tensor<2x2x3x3xf32>) : !torch.vtensor<[2,2,3,3],f32>
+  %1 = torch.vtensor.literal(dense_resource<torch_tensor_4_torch.float32> : tensor<4xf32>) : !torch.vtensor<[4],f32>
+  %2 = torch.prim.ListConstruct %int2, %int2 : (!torch.int, !torch.int) -> !torch.list<int>
+  %3 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %4 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>
+  %5 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>
+  %6 = torch.aten.convolution %arg0, %0, %1, %2, %3, %4, %true, %5, %int2 : !torch.vtensor<[1,2,5,7],f32>, !torch.vtensor<[2,2,3,3],f32>, !torch.vtensor<[4],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,4,10,14],f32>
+  return %6 : !torch.vtensor<[1,4,10,14],f32>
+}
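
The linalg.conv_2d_ngchw_gfchw op matched by these CHECK lines computes one independent convolution per group, with the group dimension outermost in both operands. The grouped-equals-per-group identity it implements can be illustrated with a small standalone PyTorch sketch (illustrative only, not part of this commit):

```python
import torch
import torch.nn.functional as F

G = 2
x = torch.randn(1, 2, 5, 7)
w = torch.randn(2, 2, 3, 3)  # (C_in, C_out/G, kH, kW)

# Grouped transposed convolution in one call.
grouped = F.conv_transpose2d(x, w, stride=2, padding=1, groups=G)

# The same result, computed one group at a time over channel slices.
xs = x.chunk(G, dim=1)  # split input channels into G groups
ws = w.chunk(G, dim=0)  # split the weight along C_in
per_group = torch.cat(
    [F.conv_transpose2d(xg, wg, stride=2, padding=1) for xg, wg in zip(xs, ws)],
    dim=1,
)

print(torch.allclose(grouped, per_group, atol=1e-6))  # True
```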
