[mlir][affine]make affine-loop-unroll a FunctionOpInterface pass. #126475

linuxlonelyeagle · 2025-02-10T07:09:14Z

Make affine-loop-unroll a FunctionOpInterface pass. Now unrolling can also be done on gpu.func.

llvmbot · 2025-02-10T07:09:48Z

@llvm/pr-subscribers-mlir-scf

@llvm/pr-subscribers-mlir-affine

Author: lonely eagle (linuxlonelyeagle)

Changes

Make affine-loop-unroll a FunctionOpInterface pass. Now unrolling can also be done on gpu.func.

Full diff: https://github.com/llvm/llvm-project/pull/126475.diff

4 Files Affected:

(modified) mlir/include/mlir/Dialect/Affine/Passes.h (+2-1)
(modified) mlir/include/mlir/Dialect/Affine/Passes.td (+1-1)
(modified) mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp (+40-35)
(modified) mlir/test/Dialect/Affine/unroll.mlir (+100)

diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
index bc29d04287ac462..37147b079e5d992 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -19,6 +19,7 @@
 
 namespace mlir {
 
+class ModuleOp;
 namespace func {
 class FuncOp;
 } // namespace func
@@ -93,7 +94,7 @@ std::unique_ptr<OperationPass<func::FuncOp>> createLoopTilingPass();
 /// factors supplied through other means. If -1 is passed as the unrollFactor
 /// and no callback is provided, anything passed from the command-line (if at
 /// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor).
-std::unique_ptr<OperationPass<func::FuncOp>> createLoopUnrollPass(
+std::unique_ptr<OperationPass<mlir::ModuleOp>> createLoopUnrollPass(
     int unrollFactor = -1, bool unrollUpToFactor = false,
     bool unrollFull = false,
     const std::function<unsigned(AffineForOp)> &getUnrollFactor = nullptr);
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index d7c7897c6573016..d96b50c3e81043c 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -199,7 +199,7 @@ def AffineLoopTiling : Pass<"affine-loop-tile", "func::FuncOp"> {
   ];
 }
 
-def AffineLoopUnroll : Pass<"affine-loop-unroll", "func::FuncOp"> {
+def AffineLoopUnroll : Pass<"affine-loop-unroll", "ModuleOp"> {
   let summary = "Unroll affine loops";
   let constructor = "mlir::affine::createLoopUnrollPass()";
   let options = [
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
index 57df7ada91654c0..4dc9809574115eb 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
@@ -19,6 +19,7 @@
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -82,7 +83,7 @@ static bool isInnermostAffineForOp(AffineForOp op) {
 }
 
 /// Gathers loops that have no affine.for's nested within.
-static void gatherInnermostLoops(func::FuncOp f,
+static void gatherInnermostLoops(FunctionOpInterface f,
                                  SmallVectorImpl<AffineForOp> &loops) {
   f.walk([&](AffineForOp forOp) {
     if (isInnermostAffineForOp(forOp))
@@ -91,40 +92,44 @@ static void gatherInnermostLoops(func::FuncOp f,
 }
 
 void LoopUnroll::runOnOperation() {
-  func::FuncOp func = getOperation();
-  if (func.isExternal())
-    return;
-
-  if (unrollFull && unrollFullThreshold.hasValue()) {
-    // Store short loops as we walk.
+  mlir::ModuleOp module = getOperation();
+  SmallVector<FunctionOpInterface> funcOps;
+  module.walk([&](FunctionOpInterface func) { funcOps.push_back(func); });
+  for (auto func : funcOps) {
+    if (func.isExternal())
+      return;
+
+    if (unrollFull && unrollFullThreshold.hasValue()) {
+      // Store short loops as we walk.
+      SmallVector<AffineForOp, 4> loops;
+
+      // Gathers all loops with trip count <= minTripCount. Do a post order walk
+      // so that loops are gathered from innermost to outermost (or else
+      // unrolling an outer one may delete gathered inner ones).
+      getOperation().walk([&](AffineForOp forOp) {
+        std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
+        if (tripCount && *tripCount <= unrollFullThreshold)
+          loops.push_back(forOp);
+      });
+      for (auto forOp : loops)
+        (void)loopUnrollFull(forOp);
+      return;
+    }
+
+    // If the call back is provided, we will recurse until no loops are found.
     SmallVector<AffineForOp, 4> loops;
-
-    // Gathers all loops with trip count <= minTripCount. Do a post order walk
-    // so that loops are gathered from innermost to outermost (or else unrolling
-    // an outer one may delete gathered inner ones).
-    getOperation().walk([&](AffineForOp forOp) {
-      std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-      if (tripCount && *tripCount <= unrollFullThreshold)
-        loops.push_back(forOp);
-    });
-    for (auto forOp : loops)
-      (void)loopUnrollFull(forOp);
-    return;
-  }
-
-  // If the call back is provided, we will recurse until no loops are found.
-  SmallVector<AffineForOp, 4> loops;
-  for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) {
-    loops.clear();
-    gatherInnermostLoops(func, loops);
-    if (loops.empty())
-      break;
-    bool unrolled = false;
-    for (auto forOp : loops)
-      unrolled |= succeeded(runOnAffineForOp(forOp));
-    if (!unrolled)
-      // Break out if nothing was unrolled.
-      break;
+    for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) {
+      loops.clear();
+      gatherInnermostLoops(func, loops);
+      if (loops.empty())
+        break;
+      bool unrolled = false;
+      for (auto forOp : loops)
+        unrolled |= succeeded(runOnAffineForOp(forOp));
+      if (!unrolled)
+        // Break out if nothing was unrolled.
+        break;
+    }
   }
 }
 
@@ -145,7 +150,7 @@ LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) {
                             cleanUpUnroll);
 }
 
-std::unique_ptr<OperationPass<func::FuncOp>> mlir::affine::createLoopUnrollPass(
+std::unique_ptr<OperationPass<ModuleOp>> mlir::affine::createLoopUnrollPass(
     int unrollFactor, bool unrollUpToFactor, bool unrollFull,
     const std::function<unsigned(AffineForOp)> &getUnrollFactor) {
   return std::make_unique<LoopUnroll>(
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index e398c3fe2011dd8..43485ca56deeba5 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -240,6 +240,23 @@ func.func @loop_nest_unroll_full() {
   return
 } // UNROLL-FULL }
 
+gpu.module @unroll_full {
+  // UNROLL-FULL-LABEL: func @gpu_loop_nest_simplest() {
+  gpu.func @gpu_loop_nest_simplest() {
+    // UNROLL-FULL: affine.for %arg0 = 0 to 100 step 2 {
+    affine.for %i = 0 to 100 step 2 {
+      // UNROLL-FULL: %c1_i32 = arith.constant 1 : i32
+      // UNROLL-FULL-NEXT: %c1_i32_0 = arith.constant 1 : i32
+      // UNROLL-FULL-NEXT: %c1_i32_1 = arith.constant 1 : i32
+      // UNROLL-FULL-NEXT: %c1_i32_2 = arith.constant 1 : i32
+      affine.for %j = 0 to 4 {
+        %x = arith.constant 1 : i32
+      }
+    }       // UNROLL-FULL:  }
+    gpu.return  // UNROLL-FULL:  return
+  }
+}
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT:      affine.for %arg0 = 0 to 4 {
@@ -260,6 +277,28 @@ func.func @loop_nest_outer_unroll() {
   return  // SHORT:  return
 }         // SHORT }
 
+gpu.module @short {
+  // SHORT-LABEL: func @gpu_loop_nest_outer_unroll() {
+  gpu.func @gpu_loop_nest_outer_unroll() {
+    // SHORT:      affine.for %arg0 = 0 to 4 {
+    // SHORT-NEXT:   %0 = affine.apply [[$MAP0]](%arg0)
+    // SHORT-NEXT:   %1 = "addi32"(%0, %0) : (index, index) -> index
+    // SHORT-NEXT: }
+    // SHORT-NEXT: affine.for %arg0 = 0 to 4 {
+    // SHORT-NEXT:   %0 = affine.apply [[$MAP0]](%arg0)
+    // SHORT-NEXT:   %1 = "addi32"(%0, %0) : (index, index) -> index
+    // SHORT-NEXT: }
+    affine.for %i = 0 to 2 {
+      affine.for %j = 0 to 4 {
+        %x = "affine.apply" (%j) { map = affine_map<(d0) -> (d0 + 1)> } :
+          (index) -> (index)
+        %y = "addi32"(%x, %x) : (index, index) -> index
+      }
+    }
+    gpu.return  // SHORT:  gpu.return
+  }             // SHORT }
+}
+
 // We are doing a minimal FileCheck here. We just need this test case to
 // successfully run. Both %x and %y will get unrolled here as the min trip
 // count threshold set to 2.
@@ -345,6 +384,37 @@ func.func @unroll_unit_stride_no_cleanup() {
   return
 }
 
+gpu.module @unroll_by_4{
+  // UNROLL-BY-4-LABEL: func @gpu_unroll_unit_stride_no_cleanup() {
+  gpu.func @gpu_unroll_unit_stride_no_cleanup() {
+    // UNROLL-BY-4: affine.for %arg0 = 0 to 100 {
+    affine.for %i = 0 to 100 {
+      // UNROLL-BY-4: for [[L1:%arg[0-9]+]] = 0 to 8 step 4 {
+      // UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
+      // UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32
+      // UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]*}}([[L1]])
+      // UNROLL-BY-4-NEXT: %3 = "addi32"(%2, %2) : (index, index) -> i32
+      // UNROLL-BY-4-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32
+      // UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]*}}([[L1]])
+      // UNROLL-BY-4-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
+      // UNROLL-BY-4-NEXT: %7 = "addi32"(%6, %6) : (i32, i32) -> i32
+      // UNROLL-BY-4-NEXT: %8 = affine.apply #map{{[0-9]*}}([[L1]])
+      // UNROLL-BY-4-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> i32
+      // UNROLL-BY-4-NEXT: %10 = "addi32"(%9, %9) : (i32, i32) -> i32
+      // UNROLL-BY-4-NEXT: }
+      affine.for %j = 0 to 8 {
+        %x = "addi32"(%j, %j) : (index, index) -> i32
+        %y = "addi32"(%x, %x) : (i32, i32) -> i32
+      }
+      // empty loop
+      // UNROLL-BY-4: affine.for %arg1 = 0 to 8 {
+      affine.for %k = 0 to 8 {
+      }
+    }
+    gpu.return
+  }
+}
+
 // UNROLL-BY-4-LABEL: func @unroll_unit_stride_cleanup() {
 func.func @unroll_unit_stride_cleanup() {
   // UNROLL-BY-4: affine.for %arg0 = 0 to 100 {
@@ -632,6 +702,19 @@ func.func @unroll_by_one_should_promote_single_iteration_loop() {
 // UNROLL-BY-1-NEXT: return
 }
 
+gpu.module @unroll_by_1 {
+  // UNROLL-BY-1-LABEL: func @gpu_unroll_by_one_should_promote_single_iteration_loop()
+  gpu.func @gpu_unroll_by_one_should_promote_single_iteration_loop() {
+    affine.for %i = 0 to 1 {
+      %x = "foo"(%i) : (index) -> i32
+    }
+    gpu.return
+    // UNROLL-BY-1-NEXT: %c0 = arith.constant 0 : index
+    // UNROLL-BY-1-NEXT: %0 = "foo"(%c0) : (index) -> i32
+    // UNROLL-BY-1-NEXT: gpu.return
+  }
+}
+
 // Test unrolling with affine.for iter_args.
 
 // UNROLL-BY-4-LABEL: loop_unroll_with_iter_args_and_cleanup
@@ -706,6 +789,23 @@ func.func @unroll_cleanup_loop_with_larger_unroll_factor() {
 // UNROLL-CLEANUP-LOOP-NEXT: return
 }
 
+gpu.module @unroll_cleanup_loop {
+  // UNROLL-CLEANUP-LOOP-LABEL: func @gpu_unroll_cleanup_loop_with_larger_unroll_factor()
+  gpu.func @gpu_unroll_cleanup_loop_with_larger_unroll_factor() {
+    affine.for %i = 0 to 3 {
+      %x = "foo"(%i) : (index) -> i32
+    }
+    gpu.return
+    // UNROLL-CLEANUP-LOOP-NEXT: %[[C0:.*]] = arith.constant 0 : index
+    // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[C0]]) : (index) -> i32
+    // UNROLL-CLEANUP-LOOP-NEXT: %[[V1:.*]] = affine.apply {{.*}}
+    // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V1]]) : (index) -> i32
+    // UNROLL-CLEANUP-LOOP-NEXT: %[[V2:.*]] = affine.apply {{.*}}
+    // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V2]]) : (index) -> i32
+    // UNROLL-CLEANUP-LOOP-NEXT: gpu.return
+  }
+}
+
 // UNROLL-CLEANUP-LOOP-LABEL: func @unroll_cleanup_loop_with_smaller_unroll_factor()
 func.func @unroll_cleanup_loop_with_smaller_unroll_factor() {
   affine.for %i = 0 to 7 {

llvmbot · 2025-02-10T07:09:50Z

@llvm/pr-subscribers-mlir

Author: lonely eagle (linuxlonelyeagle)

Changes

Make affine-loop-unroll a FunctionOpInterface pass. Now unrolling can also be done on gpu.func.

Full diff: https://github.com/llvm/llvm-project/pull/126475.diff

4 Files Affected:

(modified) mlir/include/mlir/Dialect/Affine/Passes.h (+2-1)
(modified) mlir/include/mlir/Dialect/Affine/Passes.td (+1-1)
(modified) mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp (+40-35)
(modified) mlir/test/Dialect/Affine/unroll.mlir (+100)

diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h
index bc29d04287ac462..37147b079e5d992 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.h
+++ b/mlir/include/mlir/Dialect/Affine/Passes.h
@@ -19,6 +19,7 @@
 
 namespace mlir {
 
+class ModuleOp;
 namespace func {
 class FuncOp;
 } // namespace func
@@ -93,7 +94,7 @@ std::unique_ptr<OperationPass<func::FuncOp>> createLoopTilingPass();
 /// factors supplied through other means. If -1 is passed as the unrollFactor
 /// and no callback is provided, anything passed from the command-line (if at
 /// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor).
-std::unique_ptr<OperationPass<func::FuncOp>> createLoopUnrollPass(
+std::unique_ptr<OperationPass<mlir::ModuleOp>> createLoopUnrollPass(
     int unrollFactor = -1, bool unrollUpToFactor = false,
     bool unrollFull = false,
     const std::function<unsigned(AffineForOp)> &getUnrollFactor = nullptr);
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index d7c7897c6573016..d96b50c3e81043c 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -199,7 +199,7 @@ def AffineLoopTiling : Pass<"affine-loop-tile", "func::FuncOp"> {
   ];
 }
 
-def AffineLoopUnroll : Pass<"affine-loop-unroll", "func::FuncOp"> {
+def AffineLoopUnroll : Pass<"affine-loop-unroll", "ModuleOp"> {
   let summary = "Unroll affine loops";
   let constructor = "mlir::affine::createLoopUnrollPass()";
   let options = [
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
index 57df7ada91654c0..4dc9809574115eb 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp
@@ -19,6 +19,7 @@
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -82,7 +83,7 @@ static bool isInnermostAffineForOp(AffineForOp op) {
 }
 
 /// Gathers loops that have no affine.for's nested within.
-static void gatherInnermostLoops(func::FuncOp f,
+static void gatherInnermostLoops(FunctionOpInterface f,
                                  SmallVectorImpl<AffineForOp> &loops) {
   f.walk([&](AffineForOp forOp) {
     if (isInnermostAffineForOp(forOp))
@@ -91,40 +92,44 @@ static void gatherInnermostLoops(func::FuncOp f,
 }
 
 void LoopUnroll::runOnOperation() {
-  func::FuncOp func = getOperation();
-  if (func.isExternal())
-    return;
-
-  if (unrollFull && unrollFullThreshold.hasValue()) {
-    // Store short loops as we walk.
+  mlir::ModuleOp module = getOperation();
+  SmallVector<FunctionOpInterface> funcOps;
+  module.walk([&](FunctionOpInterface func) { funcOps.push_back(func); });
+  for (auto func : funcOps) {
+    if (func.isExternal())
+      return;
+
+    if (unrollFull && unrollFullThreshold.hasValue()) {
+      // Store short loops as we walk.
+      SmallVector<AffineForOp, 4> loops;
+
+      // Gathers all loops with trip count <= minTripCount. Do a post order walk
+      // so that loops are gathered from innermost to outermost (or else
+      // unrolling an outer one may delete gathered inner ones).
+      getOperation().walk([&](AffineForOp forOp) {
+        std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
+        if (tripCount && *tripCount <= unrollFullThreshold)
+          loops.push_back(forOp);
+      });
+      for (auto forOp : loops)
+        (void)loopUnrollFull(forOp);
+      return;
+    }
+
+    // If the call back is provided, we will recurse until no loops are found.
     SmallVector<AffineForOp, 4> loops;
-
-    // Gathers all loops with trip count <= minTripCount. Do a post order walk
-    // so that loops are gathered from innermost to outermost (or else unrolling
-    // an outer one may delete gathered inner ones).
-    getOperation().walk([&](AffineForOp forOp) {
-      std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-      if (tripCount && *tripCount <= unrollFullThreshold)
-        loops.push_back(forOp);
-    });
-    for (auto forOp : loops)
-      (void)loopUnrollFull(forOp);
-    return;
-  }
-
-  // If the call back is provided, we will recurse until no loops are found.
-  SmallVector<AffineForOp, 4> loops;
-  for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) {
-    loops.clear();
-    gatherInnermostLoops(func, loops);
-    if (loops.empty())
-      break;
-    bool unrolled = false;
-    for (auto forOp : loops)
-      unrolled |= succeeded(runOnAffineForOp(forOp));
-    if (!unrolled)
-      // Break out if nothing was unrolled.
-      break;
+    for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) {
+      loops.clear();
+      gatherInnermostLoops(func, loops);
+      if (loops.empty())
+        break;
+      bool unrolled = false;
+      for (auto forOp : loops)
+        unrolled |= succeeded(runOnAffineForOp(forOp));
+      if (!unrolled)
+        // Break out if nothing was unrolled.
+        break;
+    }
   }
 }
 
@@ -145,7 +150,7 @@ LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) {
                             cleanUpUnroll);
 }
 
-std::unique_ptr<OperationPass<func::FuncOp>> mlir::affine::createLoopUnrollPass(
+std::unique_ptr<OperationPass<ModuleOp>> mlir::affine::createLoopUnrollPass(
     int unrollFactor, bool unrollUpToFactor, bool unrollFull,
     const std::function<unsigned(AffineForOp)> &getUnrollFactor) {
   return std::make_unique<LoopUnroll>(
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index e398c3fe2011dd8..43485ca56deeba5 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -240,6 +240,23 @@ func.func @loop_nest_unroll_full() {
   return
 } // UNROLL-FULL }
 
+gpu.module @unroll_full {
+  // UNROLL-FULL-LABEL: func @gpu_loop_nest_simplest() {
+  gpu.func @gpu_loop_nest_simplest() {
+    // UNROLL-FULL: affine.for %arg0 = 0 to 100 step 2 {
+    affine.for %i = 0 to 100 step 2 {
+      // UNROLL-FULL: %c1_i32 = arith.constant 1 : i32
+      // UNROLL-FULL-NEXT: %c1_i32_0 = arith.constant 1 : i32
+      // UNROLL-FULL-NEXT: %c1_i32_1 = arith.constant 1 : i32
+      // UNROLL-FULL-NEXT: %c1_i32_2 = arith.constant 1 : i32
+      affine.for %j = 0 to 4 {
+        %x = arith.constant 1 : i32
+      }
+    }       // UNROLL-FULL:  }
+    gpu.return  // UNROLL-FULL:  return
+  }
+}
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT:      affine.for %arg0 = 0 to 4 {
@@ -260,6 +277,28 @@ func.func @loop_nest_outer_unroll() {
   return  // SHORT:  return
 }         // SHORT }
 
+gpu.module @short {
+  // SHORT-LABEL: func @gpu_loop_nest_outer_unroll() {
+  gpu.func @gpu_loop_nest_outer_unroll() {
+    // SHORT:      affine.for %arg0 = 0 to 4 {
+    // SHORT-NEXT:   %0 = affine.apply [[$MAP0]](%arg0)
+    // SHORT-NEXT:   %1 = "addi32"(%0, %0) : (index, index) -> index
+    // SHORT-NEXT: }
+    // SHORT-NEXT: affine.for %arg0 = 0 to 4 {
+    // SHORT-NEXT:   %0 = affine.apply [[$MAP0]](%arg0)
+    // SHORT-NEXT:   %1 = "addi32"(%0, %0) : (index, index) -> index
+    // SHORT-NEXT: }
+    affine.for %i = 0 to 2 {
+      affine.for %j = 0 to 4 {
+        %x = "affine.apply" (%j) { map = affine_map<(d0) -> (d0 + 1)> } :
+          (index) -> (index)
+        %y = "addi32"(%x, %x) : (index, index) -> index
+      }
+    }
+    gpu.return  // SHORT:  gpu.return
+  }             // SHORT }
+}
+
 // We are doing a minimal FileCheck here. We just need this test case to
 // successfully run. Both %x and %y will get unrolled here as the min trip
 // count threshold set to 2.
@@ -345,6 +384,37 @@ func.func @unroll_unit_stride_no_cleanup() {
   return
 }
 
+gpu.module @unroll_by_4{
+  // UNROLL-BY-4-LABEL: func @gpu_unroll_unit_stride_no_cleanup() {
+  gpu.func @gpu_unroll_unit_stride_no_cleanup() {
+    // UNROLL-BY-4: affine.for %arg0 = 0 to 100 {
+    affine.for %i = 0 to 100 {
+      // UNROLL-BY-4: for [[L1:%arg[0-9]+]] = 0 to 8 step 4 {
+      // UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
+      // UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32
+      // UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]*}}([[L1]])
+      // UNROLL-BY-4-NEXT: %3 = "addi32"(%2, %2) : (index, index) -> i32
+      // UNROLL-BY-4-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32
+      // UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]*}}([[L1]])
+      // UNROLL-BY-4-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
+      // UNROLL-BY-4-NEXT: %7 = "addi32"(%6, %6) : (i32, i32) -> i32
+      // UNROLL-BY-4-NEXT: %8 = affine.apply #map{{[0-9]*}}([[L1]])
+      // UNROLL-BY-4-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> i32
+      // UNROLL-BY-4-NEXT: %10 = "addi32"(%9, %9) : (i32, i32) -> i32
+      // UNROLL-BY-4-NEXT: }
+      affine.for %j = 0 to 8 {
+        %x = "addi32"(%j, %j) : (index, index) -> i32
+        %y = "addi32"(%x, %x) : (i32, i32) -> i32
+      }
+      // empty loop
+      // UNROLL-BY-4: affine.for %arg1 = 0 to 8 {
+      affine.for %k = 0 to 8 {
+      }
+    }
+    gpu.return
+  }
+}
+
 // UNROLL-BY-4-LABEL: func @unroll_unit_stride_cleanup() {
 func.func @unroll_unit_stride_cleanup() {
   // UNROLL-BY-4: affine.for %arg0 = 0 to 100 {
@@ -632,6 +702,19 @@ func.func @unroll_by_one_should_promote_single_iteration_loop() {
 // UNROLL-BY-1-NEXT: return
 }
 
+gpu.module @unroll_by_1 {
+  // UNROLL-BY-1-LABEL: func @gpu_unroll_by_one_should_promote_single_iteration_loop()
+  gpu.func @gpu_unroll_by_one_should_promote_single_iteration_loop() {
+    affine.for %i = 0 to 1 {
+      %x = "foo"(%i) : (index) -> i32
+    }
+    gpu.return
+    // UNROLL-BY-1-NEXT: %c0 = arith.constant 0 : index
+    // UNROLL-BY-1-NEXT: %0 = "foo"(%c0) : (index) -> i32
+    // UNROLL-BY-1-NEXT: gpu.return
+  }
+}
+
 // Test unrolling with affine.for iter_args.
 
 // UNROLL-BY-4-LABEL: loop_unroll_with_iter_args_and_cleanup
@@ -706,6 +789,23 @@ func.func @unroll_cleanup_loop_with_larger_unroll_factor() {
 // UNROLL-CLEANUP-LOOP-NEXT: return
 }
 
+gpu.module @unroll_cleanup_loop {
+  // UNROLL-CLEANUP-LOOP-LABEL: func @gpu_unroll_cleanup_loop_with_larger_unroll_factor()
+  gpu.func @gpu_unroll_cleanup_loop_with_larger_unroll_factor() {
+    affine.for %i = 0 to 3 {
+      %x = "foo"(%i) : (index) -> i32
+    }
+    gpu.return
+    // UNROLL-CLEANUP-LOOP-NEXT: %[[C0:.*]] = arith.constant 0 : index
+    // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[C0]]) : (index) -> i32
+    // UNROLL-CLEANUP-LOOP-NEXT: %[[V1:.*]] = affine.apply {{.*}}
+    // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V1]]) : (index) -> i32
+    // UNROLL-CLEANUP-LOOP-NEXT: %[[V2:.*]] = affine.apply {{.*}}
+    // UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V2]]) : (index) -> i32
+    // UNROLL-CLEANUP-LOOP-NEXT: gpu.return
+  }
+}
+
 // UNROLL-CLEANUP-LOOP-LABEL: func @unroll_cleanup_loop_with_smaller_unroll_factor()
 func.func @unroll_cleanup_loop_with_smaller_unroll_factor() {
   affine.for %i = 0 to 7 {

linuxlonelyeagle · 2025-02-10T07:13:43Z

I added far fewer tests for gpu.func than already exist for func.func, and I don’t know whether that is enough.

mlir/include/mlir/Dialect/Affine/Passes.td

mlir/include/mlir/Dialect/Affine/Passes.h

mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp

mlir/test/Dialect/Affine/unroll.mlir

krzysz00

Seems fine to me, approved

…vm#126475) [mlir][affine] make affine-loop-unroll a FunctionOpInterface pass. Make `affine-loop-unroll` a `FunctionOpInterface` pass. Now unrolling can also be done on gpu.func.

make affine-loop-unroll a FunctionOpInterface pass.

824b650

linuxlonelyeagle requested a review from ftynse February 10, 2025 07:09

llvmbot added mlir:affine mlir labels Feb 10, 2025

linuxlonelyeagle requested review from bondhugula, krzysz00, grypp and Groverkss February 10, 2025 07:10

krzysz00 requested changes Feb 10, 2025

View reviewed changes

mlir/include/mlir/Dialect/Affine/Passes.td Outdated Show resolved Hide resolved

llvmbot added the mlir:scf label Feb 11, 2025

Use InterfacePass to implement it.

0861464

linuxlonelyeagle force-pushed the unroll-on-function-interface branch from 7074eaa to 0861464 Compare February 11, 2025 03:04

Groverkss reviewed Feb 13, 2025

View reviewed changes

Update the C++ implementation and update the test.

4383806

krzysz00 approved these changes Feb 13, 2025

View reviewed changes

linuxlonelyeagle merged commit a472147 into llvm:main Feb 13, 2025
8 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[mlir][affine]make affine-loop-unroll a FunctionOpInterface pass. #126475

[mlir][affine]make affine-loop-unroll a FunctionOpInterface pass. #126475

linuxlonelyeagle commented Feb 10, 2025

llvmbot commented Feb 10, 2025 •

edited

Loading

llvmbot commented Feb 10, 2025

linuxlonelyeagle commented Feb 10, 2025

krzysz00 left a comment

[mlir][affine]make affine-loop-unroll a FunctionOpInterface pass. #126475

[mlir][affine]make affine-loop-unroll a FunctionOpInterface pass. #126475

Conversation

linuxlonelyeagle commented Feb 10, 2025

llvmbot commented Feb 10, 2025 • edited Loading

llvmbot commented Feb 10, 2025

linuxlonelyeagle commented Feb 10, 2025

krzysz00 left a comment

Choose a reason for hiding this comment

llvmbot commented Feb 10, 2025 •

edited

Loading