Skip to content

Commit 824b650

Browse files
Make affine-loop-unroll a ModuleOp pass that unrolls loops in every FunctionOpInterface op.
1 parent 2e3729b commit 824b650

File tree

4 files changed

+143
-37
lines changed

4 files changed

+143
-37
lines changed

mlir/include/mlir/Dialect/Affine/Passes.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
namespace mlir {
2121

22+
class ModuleOp;
2223
namespace func {
2324
class FuncOp;
2425
} // namespace func
@@ -93,7 +94,7 @@ std::unique_ptr<OperationPass<func::FuncOp>> createLoopTilingPass();
9394
/// factors supplied through other means. If -1 is passed as the unrollFactor
9495
/// and no callback is provided, anything passed from the command-line (if at
9596
/// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor).
96-
std::unique_ptr<OperationPass<func::FuncOp>> createLoopUnrollPass(
97+
std::unique_ptr<OperationPass<mlir::ModuleOp>> createLoopUnrollPass(
9798
int unrollFactor = -1, bool unrollUpToFactor = false,
9899
bool unrollFull = false,
99100
const std::function<unsigned(AffineForOp)> &getUnrollFactor = nullptr);

mlir/include/mlir/Dialect/Affine/Passes.td

+1-1
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def AffineLoopTiling : Pass<"affine-loop-tile", "func::FuncOp"> {
199199
];
200200
}
201201

202-
def AffineLoopUnroll : Pass<"affine-loop-unroll", "func::FuncOp"> {
202+
def AffineLoopUnroll : Pass<"affine-loop-unroll", "ModuleOp"> {
203203
let summary = "Unroll affine loops";
204204
let constructor = "mlir::affine::createLoopUnrollPass()";
205205
let options = [

mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp

+40-35
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "mlir/IR/AffineExpr.h"
2020
#include "mlir/IR/AffineMap.h"
2121
#include "mlir/IR/Builders.h"
22+
#include "mlir/IR/BuiltinOps.h"
2223
#include "llvm/ADT/DenseMap.h"
2324
#include "llvm/Support/CommandLine.h"
2425
#include "llvm/Support/Debug.h"
@@ -82,7 +83,7 @@ static bool isInnermostAffineForOp(AffineForOp op) {
8283
}
8384

8485
/// Gathers loops that have no affine.for's nested within.
85-
static void gatherInnermostLoops(func::FuncOp f,
86+
static void gatherInnermostLoops(FunctionOpInterface f,
8687
SmallVectorImpl<AffineForOp> &loops) {
8788
f.walk([&](AffineForOp forOp) {
8889
if (isInnermostAffineForOp(forOp))
@@ -91,40 +92,44 @@ static void gatherInnermostLoops(func::FuncOp f,
9192
}
9293

9394
void LoopUnroll::runOnOperation() {
94-
func::FuncOp func = getOperation();
95-
if (func.isExternal())
96-
return;
97-
98-
if (unrollFull && unrollFullThreshold.hasValue()) {
99-
// Store short loops as we walk.
95+
mlir::ModuleOp module = getOperation();
96+
SmallVector<FunctionOpInterface> funcOps;
97+
module.walk([&](FunctionOpInterface func) { funcOps.push_back(func); });
98+
for (auto func : funcOps) {
99+
if (func.isExternal())
100+
return;
101+
102+
if (unrollFull && unrollFullThreshold.hasValue()) {
103+
// Store short loops as we walk.
104+
SmallVector<AffineForOp, 4> loops;
105+
106+
// Gathers all loops with trip count <= unrollFullThreshold. Do a post order walk
107+
// so that loops are gathered from innermost to outermost (or else
108+
// unrolling an outer one may delete gathered inner ones).
109+
getOperation().walk([&](AffineForOp forOp) {
110+
std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
111+
if (tripCount && *tripCount <= unrollFullThreshold)
112+
loops.push_back(forOp);
113+
});
114+
for (auto forOp : loops)
115+
(void)loopUnrollFull(forOp);
116+
return;
117+
}
118+
119+
// If the callback is provided, we will repeat until no loops are found.
100120
SmallVector<AffineForOp, 4> loops;
101-
102-
// Gathers all loops with trip count <= minTripCount. Do a post order walk
103-
// so that loops are gathered from innermost to outermost (or else unrolling
104-
// an outer one may delete gathered inner ones).
105-
getOperation().walk([&](AffineForOp forOp) {
106-
std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
107-
if (tripCount && *tripCount <= unrollFullThreshold)
108-
loops.push_back(forOp);
109-
});
110-
for (auto forOp : loops)
111-
(void)loopUnrollFull(forOp);
112-
return;
113-
}
114-
115-
// If the call back is provided, we will recurse until no loops are found.
116-
SmallVector<AffineForOp, 4> loops;
117-
for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) {
118-
loops.clear();
119-
gatherInnermostLoops(func, loops);
120-
if (loops.empty())
121-
break;
122-
bool unrolled = false;
123-
for (auto forOp : loops)
124-
unrolled |= succeeded(runOnAffineForOp(forOp));
125-
if (!unrolled)
126-
// Break out if nothing was unrolled.
127-
break;
121+
for (unsigned i = 0; i < numRepetitions || getUnrollFactor; i++) {
122+
loops.clear();
123+
gatherInnermostLoops(func, loops);
124+
if (loops.empty())
125+
break;
126+
bool unrolled = false;
127+
for (auto forOp : loops)
128+
unrolled |= succeeded(runOnAffineForOp(forOp));
129+
if (!unrolled)
130+
// Break out if nothing was unrolled.
131+
break;
132+
}
128133
}
129134
}
130135

@@ -145,7 +150,7 @@ LogicalResult LoopUnroll::runOnAffineForOp(AffineForOp forOp) {
145150
cleanUpUnroll);
146151
}
147152

148-
std::unique_ptr<OperationPass<func::FuncOp>> mlir::affine::createLoopUnrollPass(
153+
std::unique_ptr<OperationPass<ModuleOp>> mlir::affine::createLoopUnrollPass(
149154
int unrollFactor, bool unrollUpToFactor, bool unrollFull,
150155
const std::function<unsigned(AffineForOp)> &getUnrollFactor) {
151156
return std::make_unique<LoopUnroll>(

mlir/test/Dialect/Affine/unroll.mlir

+100
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,23 @@ func.func @loop_nest_unroll_full() {
240240
return
241241
} // UNROLL-FULL }
242242

243+
gpu.module @unroll_full {
244+
// UNROLL-FULL-LABEL: func @gpu_loop_nest_simplest() {
245+
gpu.func @gpu_loop_nest_simplest() {
246+
// UNROLL-FULL: affine.for %arg0 = 0 to 100 step 2 {
247+
affine.for %i = 0 to 100 step 2 {
248+
// UNROLL-FULL: %c1_i32 = arith.constant 1 : i32
249+
// UNROLL-FULL-NEXT: %c1_i32_0 = arith.constant 1 : i32
250+
// UNROLL-FULL-NEXT: %c1_i32_1 = arith.constant 1 : i32
251+
// UNROLL-FULL-NEXT: %c1_i32_2 = arith.constant 1 : i32
252+
affine.for %j = 0 to 4 {
253+
%x = arith.constant 1 : i32
254+
}
255+
} // UNROLL-FULL: }
256+
gpu.return // UNROLL-FULL: return
257+
}
258+
}
259+
243260
// SHORT-LABEL: func @loop_nest_outer_unroll() {
244261
func.func @loop_nest_outer_unroll() {
245262
// SHORT: affine.for %arg0 = 0 to 4 {
@@ -260,6 +277,28 @@ func.func @loop_nest_outer_unroll() {
260277
return // SHORT: return
261278
} // SHORT }
262279

280+
gpu.module @short {
281+
// SHORT-LABEL: func @gpu_loop_nest_outer_unroll() {
282+
gpu.func @gpu_loop_nest_outer_unroll() {
283+
// SHORT: affine.for %arg0 = 0 to 4 {
284+
// SHORT-NEXT: %0 = affine.apply [[$MAP0]](%arg0)
285+
// SHORT-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
286+
// SHORT-NEXT: }
287+
// SHORT-NEXT: affine.for %arg0 = 0 to 4 {
288+
// SHORT-NEXT: %0 = affine.apply [[$MAP0]](%arg0)
289+
// SHORT-NEXT: %1 = "addi32"(%0, %0) : (index, index) -> index
290+
// SHORT-NEXT: }
291+
affine.for %i = 0 to 2 {
292+
affine.for %j = 0 to 4 {
293+
%x = "affine.apply" (%j) { map = affine_map<(d0) -> (d0 + 1)> } :
294+
(index) -> (index)
295+
%y = "addi32"(%x, %x) : (index, index) -> index
296+
}
297+
}
298+
gpu.return // SHORT: gpu.return
299+
} // SHORT }
300+
}
301+
263302
// We are doing a minimal FileCheck here. We just need this test case to
264303
// successfully run. Both %x and %y will get unrolled here as the min trip
265304
// count threshold set to 2.
@@ -345,6 +384,37 @@ func.func @unroll_unit_stride_no_cleanup() {
345384
return
346385
}
347386

387+
gpu.module @unroll_by_4{
388+
// UNROLL-BY-4-LABEL: func @gpu_unroll_unit_stride_no_cleanup() {
389+
gpu.func @gpu_unroll_unit_stride_no_cleanup() {
390+
// UNROLL-BY-4: affine.for %arg0 = 0 to 100 {
391+
affine.for %i = 0 to 100 {
392+
// UNROLL-BY-4: for [[L1:%arg[0-9]+]] = 0 to 8 step 4 {
393+
// UNROLL-BY-4-NEXT: %0 = "addi32"([[L1]], [[L1]]) : (index, index) -> i32
394+
// UNROLL-BY-4-NEXT: %1 = "addi32"(%0, %0) : (i32, i32) -> i32
395+
// UNROLL-BY-4-NEXT: %2 = affine.apply #map{{[0-9]*}}([[L1]])
396+
// UNROLL-BY-4-NEXT: %3 = "addi32"(%2, %2) : (index, index) -> i32
397+
// UNROLL-BY-4-NEXT: %4 = "addi32"(%3, %3) : (i32, i32) -> i32
398+
// UNROLL-BY-4-NEXT: %5 = affine.apply #map{{[0-9]*}}([[L1]])
399+
// UNROLL-BY-4-NEXT: %6 = "addi32"(%5, %5) : (index, index) -> i32
400+
// UNROLL-BY-4-NEXT: %7 = "addi32"(%6, %6) : (i32, i32) -> i32
401+
// UNROLL-BY-4-NEXT: %8 = affine.apply #map{{[0-9]*}}([[L1]])
402+
// UNROLL-BY-4-NEXT: %9 = "addi32"(%8, %8) : (index, index) -> i32
403+
// UNROLL-BY-4-NEXT: %10 = "addi32"(%9, %9) : (i32, i32) -> i32
404+
// UNROLL-BY-4-NEXT: }
405+
affine.for %j = 0 to 8 {
406+
%x = "addi32"(%j, %j) : (index, index) -> i32
407+
%y = "addi32"(%x, %x) : (i32, i32) -> i32
408+
}
409+
// empty loop
410+
// UNROLL-BY-4: affine.for %arg1 = 0 to 8 {
411+
affine.for %k = 0 to 8 {
412+
}
413+
}
414+
gpu.return
415+
}
416+
}
417+
348418
// UNROLL-BY-4-LABEL: func @unroll_unit_stride_cleanup() {
349419
func.func @unroll_unit_stride_cleanup() {
350420
// UNROLL-BY-4: affine.for %arg0 = 0 to 100 {
@@ -632,6 +702,19 @@ func.func @unroll_by_one_should_promote_single_iteration_loop() {
632702
// UNROLL-BY-1-NEXT: return
633703
}
634704

705+
gpu.module @unroll_by_1 {
706+
// UNROLL-BY-1-LABEL: func @gpu_unroll_by_one_should_promote_single_iteration_loop()
707+
gpu.func @gpu_unroll_by_one_should_promote_single_iteration_loop() {
708+
affine.for %i = 0 to 1 {
709+
%x = "foo"(%i) : (index) -> i32
710+
}
711+
gpu.return
712+
// UNROLL-BY-1-NEXT: %c0 = arith.constant 0 : index
713+
// UNROLL-BY-1-NEXT: %0 = "foo"(%c0) : (index) -> i32
714+
// UNROLL-BY-1-NEXT: gpu.return
715+
}
716+
}
717+
635718
// Test unrolling with affine.for iter_args.
636719

637720
// UNROLL-BY-4-LABEL: loop_unroll_with_iter_args_and_cleanup
@@ -706,6 +789,23 @@ func.func @unroll_cleanup_loop_with_larger_unroll_factor() {
706789
// UNROLL-CLEANUP-LOOP-NEXT: return
707790
}
708791

792+
gpu.module @unroll_cleanup_loop {
793+
// UNROLL-CLEANUP-LOOP-LABEL: func @gpu_unroll_cleanup_loop_with_larger_unroll_factor()
794+
gpu.func @gpu_unroll_cleanup_loop_with_larger_unroll_factor() {
795+
affine.for %i = 0 to 3 {
796+
%x = "foo"(%i) : (index) -> i32
797+
}
798+
gpu.return
799+
// UNROLL-CLEANUP-LOOP-NEXT: %[[C0:.*]] = arith.constant 0 : index
800+
// UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[C0]]) : (index) -> i32
801+
// UNROLL-CLEANUP-LOOP-NEXT: %[[V1:.*]] = affine.apply {{.*}}
802+
// UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V1]]) : (index) -> i32
803+
// UNROLL-CLEANUP-LOOP-NEXT: %[[V2:.*]] = affine.apply {{.*}}
804+
// UNROLL-CLEANUP-LOOP-NEXT: {{.*}} = "foo"(%[[V2]]) : (index) -> i32
805+
// UNROLL-CLEANUP-LOOP-NEXT: gpu.return
806+
}
807+
}
808+
709809
// UNROLL-CLEANUP-LOOP-LABEL: func @unroll_cleanup_loop_with_smaller_unroll_factor()
710810
func.func @unroll_cleanup_loop_with_smaller_unroll_factor() {
711811
affine.for %i = 0 to 7 {

0 commit comments

Comments
 (0)