From 022bdf813d38d9888893519af1c991ce9c2d3dd3 Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Thu, 12 Dec 2024 21:50:46 +0000 Subject: [PATCH 1/8] first commit --- .../AMDAIEControlCodeToTransaction.cpp | 28 +++- .../Transforms/AMDAIEFoldDmaWaits.cpp | 155 +++++++++++++++++- .../test/controlcode_to_transaction.mlir | 53 ++++++ .../Transforms/test/fold_dma_waits.mlir | 90 ++++++++++ 4 files changed, 316 insertions(+), 10 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp index 665ea08a8..b427036b3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -200,16 +200,34 @@ LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op, } LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { - for (Value token : op.getAsyncTokens()) { - auto pushToQueueOp = - dyn_cast_if_present(token.getDefiningOp()); + // Batch DMA operations with the same row, channel, and direction into a + // single TCT sync operation, as long as they have consecutive columns. + SmallVector> columnBatches; + for (Value asyncToken : op.getAsyncTokens()) { + auto pushToQueueOp = dyn_cast_if_present( + asyncToken.getDefiningOp()); if (!pushToQueueOp) { return op.emitOpError() - << "should operate on an `amdaie.push_to_queue` op"; + << "should operate on an `amdaie.push_to_queue` op async token"; } + if (!columnBatches.empty()) { + auto &[lastPushOp, lastColNum] = columnBatches.back(); + if (lastPushOp.getRow() == pushToQueueOp.getRow() && + lastPushOp.getCol() + lastColNum == pushToQueueOp.getCol() && + lastPushOp.getDirection() == pushToQueueOp.getDirection() && + lastPushOp.getChannel() == pushToQueueOp.getChannel()) { + ++lastColNum; + continue; + } + } + columnBatches.push_back({pushToQueueOp, 1}); + } + + // Convert to TCT sync ops. + for (auto &[pushToQueueOp, colNum] : columnBatches) { if (failed(builder.appendTCTSync( pushToQueueOp.getCol(), pushToQueueOp.getRow(), - static_cast(pushToQueueOp.getDirection()), 1, 1, + static_cast(pushToQueueOp.getDirection()), 1, colNum, pushToQueueOp.getChannel()))) { return failure(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 2f0c6030d..e578203a4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -18,7 +18,7 @@ namespace { /// Utility function to determine whether a DMA wait op can be folded based on /// its half DMA copy operation. -FailureOr canFoldBasedOnHalfDmaCpy( +FailureOr canFoldByConnection( const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, DenseMap, @@ -101,8 +101,9 @@ FailureOr canFoldBasedOnHalfDmaCpy( /// Reverse traversal simplifies handling duplicate BD IDs, preventing /// the need to revisit and modify earlier operations after processing later /// ones. 
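// --- Editorial sketch, not part of the patch ---
// The keep/fold decision described above can be pictured independently of
// MLIR. Walking the waits in reverse, a wait is kept whenever the per-queue
// BD-ID list must restart. `keepWait` and its parameters are hypothetical
// stand-ins for `canFoldByConnection`; it is simplified in that the real pass
// checks BD-ID duplicates across all queues on the same tile, not just one:
//
//   #include <algorithm>
//   #include <cstdint>
//   #include <vector>
//
//   bool keepWait(std::vector<uint32_t> &bdIds, uint32_t bdId,
//                 size_t maxQueueSize, bool isPacketFlow) {
//     bool isDuplicate =
//         std::find(bdIds.begin(), bdIds.end(), bdId) != bdIds.end();
//     bool keep = isDuplicate || isPacketFlow ||
//                 bdIds.size() >= maxQueueSize || bdIds.empty();
//     if (keep) bdIds.clear();  // This wait stays; start a fresh queue.
//     bdIds.push_back(bdId);
//     return keep;  // false: the wait can be folded away.
//   }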
-LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, - AMDAIE::ControlCodeOp controlCodeOp) { +LogicalResult foldDmaWaitsByConnection( + const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); std::vector waitOpsToErase; DenseMap, @@ -116,7 +117,7 @@ LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = canFoldBasedOnHalfDmaCpy( + FailureOr result = canFoldByConnection( deviceModel, npuHalfDmaCpyNdOp, tileConnectToBdIdQueue); if (failed(result)) return WalkResult::interrupt(); toErase &= *result; @@ -152,6 +153,147 @@ LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, return success(); } +struct DmaColumnBatch { + uint32_t row; + uint32_t channel; + AMDAIE::DMAChannelDir direction; + + // Sorted by column. + std::map colWaitOpMap; +}; + +/// Updates a batch of asynchronous DMA wait operations by combining their +/// async tokens into a single NpuDmaWaitOp. +void updateColumnBatchTokens( + IRRewriter &rewriter, + std::map &colWaitOpMap) { + if (colWaitOpMap.size() < 2) return; + + // Check if there is any discontinuity in the columns, and if so, split into + // separate batches. + SmallVector> waitOpsList; + uint32_t prevCol = 0; + for (auto &entry : colWaitOpMap) { + uint32_t col = entry.first; + AMDAIE::NpuDmaWaitOp waitOp = entry.second; + if (waitOpsList.empty() || col != prevCol + 1) { + waitOpsList.push_back({}); + } + waitOpsList.back().push_back(waitOp); + prevCol = col; + } + + for (SmallVector &waitOps : waitOpsList) { + // For each batch, combine the async tokens into a single NpuDmaWaitOp. + SmallVector asyncTokens; + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + asyncTokens.append(waitOp.getAsyncTokens().begin(), + waitOp.getAsyncTokens().end()); + } + rewriter.setInsertionPointAfter(waitOps.back()); + rewriter.create(waitOps.back().getLoc(), asyncTokens); + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + rewriter.eraseOp(waitOp); + } + } +} + +/// Utility function to determine if a DMA wait operation can be folded. +/// This is achieved by verifying whether it shares the same row, channel, +/// and direction with preceding wait operations. +LogicalResult foldByColumn(IRRewriter &rewriter, DmaColumnBatch &dmaBatch, + AMDAIE::NpuHalfDmaCpyNdOp dmaOp, + AMDAIE::NpuDmaWaitOp waitOp) { + // Get the row and column. + std::optional maybeBdIdOp = dmaOp.getBdIdOp(); + if (!maybeBdIdOp) return dmaOp.emitOpError() << "must have a BD ID op"; + AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + AMDAIE::TileOp tileOp = + dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); + if (!tileOp) + return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); + uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); + + // Get the channel. 
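  // (Editorial note: the channel op below supplies both the DMA direction
  // (MM2S vs. S2MM) and the channel index; a wait joins the current column
  // batch only if row, channel, and direction all match.)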
+ std::optional maybeChannelOp = dmaOp.getChannelOp(); + if (!maybeChannelOp) + return dmaOp.emitOpError() << "found non-`amdaie.channel` channel"; + AMDAIE::ChannelOp channelOp = maybeChannelOp.value(); + std::optional maybeDirection = + channelOp.getDirection(); + std::optional maybeChannel = channelOp.getValue(); + if (!maybeDirection || !maybeChannel) + return channelOp.emitOpError() << "direction and channel needed"; + AMDAIE::DMAChannelDir direction = maybeDirection.value(); + uint32_t channel = maybeChannel.value(); + + if (dmaBatch.colWaitOpMap.empty() || row != dmaBatch.row || + channel != dmaBatch.channel || direction != dmaBatch.direction) { + updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); + dmaBatch = {row, channel, direction, {}}; + } + dmaBatch.colWaitOpMap[col] = waitOp; + return success(); +} + +/// Traverses the control code forward, ensuring that only one DMA wait op is +/// retained for all the columns. +/// +/// Example Input: +/// %0 = dma_cpy_nd(col=0) +/// %1 = dma_cpy_nd(col=1) +/// %2 = dma_cpy_nd(col=2) +/// %3 = dma_cpy_nd(col=3) +/// dma_wait(%0) +/// dma_wait(%1) +/// dma_wait(%2) +/// dma_wait(%3) +/// Example Output: +/// %0 = dma_cpy_nd(col=0) +/// %1 = dma_cpy_nd(col=1) +/// %2 = dma_cpy_nd(col=2) +/// %3 = dma_cpy_nd(col=3) +/// dma_wait(%0, %1, %2, %3) +LogicalResult foldDmaWaitsByColumn(const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { + IRRewriter rewriter(controlCodeOp->getContext()); + DmaColumnBatch dmaBatch = {}; + + WalkResult res = controlCodeOp->walk([&](Operation *op) { + auto waitOp = dyn_cast(op); + // Skip if not a DMA wait op or if it already has multiple async tokens. + if (!waitOp || waitOp.getAsyncTokens().size() != 1) { + updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); + dmaBatch.colWaitOpMap.clear(); + return WalkResult::advance(); + } + + // Get the half DMA copy operation. + Value token = waitOp.getAsyncTokens().front(); + auto npuHalfDmaCpyNdOp = + dyn_cast_if_present(token.getDefiningOp()); + if (!npuHalfDmaCpyNdOp) { + waitOp.emitOpError() << "expected to operate on an " + "`amdaie.npu.half_dma_cpy_nd`"; + return WalkResult::interrupt(); + } + + // Check if the DMA wait op can be folded into the column batch. + if (succeeded( + foldByColumn(rewriter, dmaBatch, npuHalfDmaCpyNdOp, waitOp))) { + return WalkResult::advance(); + } else { + return WalkResult::interrupt(); + } + }); + + // Process the remaining wait ops. 
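  // (Editorial note: the walk only flushes a batch when it encounters an op
  // that cannot be folded, so whatever is still collected in `dmaBatch` must
  // be combined here, after the walk completes.)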
+ updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); + if (res.wasInterrupted()) return failure(); + return success(); +} + class AMDAIEFoldDmaWaitsPass : public impl::AMDAIEFoldDmaWaitsBase { public: @@ -181,7 +323,10 @@ void AMDAIEFoldDmaWaitsPass::runOnOperation() { WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); - if (failed(foldDmaWaits(deviceModel, controlCodeOp))) { + if (failed(foldDmaWaitsByConnection(deviceModel, controlCodeOp))) { + return WalkResult::interrupt(); + } + if (failed(foldDmaWaitsByColumn(deviceModel, controlCodeOp))) { return WalkResult::interrupt(); } return WalkResult::advance(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index fa83b2028..a75546cff 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -153,6 +153,59 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000005 +// CHECK: 0x00000080 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0201D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0401D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0601D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00000001 +// CHECK: 0x00040100 +// CHECK-LABEL: @async_push_to_queue_and_wait_col_num +// CHECK: npu_instructions = dense_resource : tensor<32xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @async_push_to_queue_and_wait_col_num() { + amdaie.workgroup { + amdaie.controlcode { + %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + // CHECK: 0x06030100 // CHECK: 0x00000105 // CHECK: 0x00000001 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir index 4032221cc..a0034a971 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -220,3 +220,93 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// The first two DMA operations are expected to be batched into a single DMA wait, as they share the same row, +// channel, and direction, with consecutive columns (0 and 1). The third DMA operation is not batched because +// its column (3) is not consecutive with the previous operations. +// CHECK-LABEL: @fold_dma_waits_column_batch +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK: %[[TILE_3_0:.+]] = amdaie.tile(%[[C3]], %[[C0]]) +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_0]] +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]]) +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_1]] +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_3_0]], %[[C0]]) +// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]] +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]], %[[TOKEN_1]] : !amdaie.async_token, !amdaie.async_token) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_2]] : !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_column_batch() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_1 = amdaie.tile(%c1, %c1) + %tile_1_0 = amdaie.tile(%c1, %c0) + %tile_3_1 = amdaie.tile(%c3, %c1) + %tile_3_0 = amdaie.tile(%c3, %c0) + %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> + %buffer_4 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(4), 4) + %lock_5 = amdaie.lock(%tile_0_1(5), 0) + %lock_6 = amdaie.lock(%tile_1_1(4), 4) + %lock_7 = amdaie.lock(%tile_1_1(5), 0) + %lock_8 = amdaie.lock(%tile_3_1(4), 4) + %lock_9 = amdaie.lock(%tile_3_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_6}, {%lock_7}) : memref<2048xi32, 1 : i32>, 
memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %8 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_10 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_11 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_12 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) + %channel_13 = amdaie.channel(%tile_3_0, 0, port_type = DMA, direction = MM2S) + %channel_14 = amdaie.channel(%tile_3_1, 0, port_type = DMA, direction = S2MM) + %9 = amdaie.flow({%channel} -> {%channel_10}) {is_packet_flow = false} + %10 = amdaie.connection(%0 {%channel_10}, %2 {%channel}, flow = %9) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %11 = amdaie.flow({%channel_11} -> {%channel_12}) {is_packet_flow = false} + %12 = amdaie.connection(%3 {%channel_12}, %5 {%channel_11}, flow = %11) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %13 = amdaie.flow({%channel_13} -> {%channel_14}) {is_packet_flow = false} + %14 = amdaie.connection(%6 {%channel_14}, %8 {%channel_13}, flow = %13) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %15 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %16 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %17 = amdaie.logicalobjectfifo.from_memref %7, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %7, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0_0, %c0) + %18 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + %bd_id_15 = amdaie.bd_id(%tile_1_0, %c0) + %19 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_15 channel = %channel_11) : !amdaie.logicalobjectfifo> + %bd_id_16 = amdaie.bd_id(%tile_3_0, %c0) + %20 = amdaie.npu.half_dma_cpy_nd async %14(%17 [] [] [] bd_id = %bd_id_16 channel = %channel_13) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%18 : !amdaie.async_token) + amdaie.npu.dma_wait(%19 : !amdaie.async_token) + amdaie.npu.dma_wait(%20 : !amdaie.async_token) + amdaie.end + } + } + return + } +} From 344b7963afa0da8708c562e0c80fa4dc883c02be Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Mon, 16 Dec 2024 16:50:11 +0000 Subject: [PATCH 2/8] resolve comments --- .../AMDAIEControlCodeToTransaction.cpp | 22 +- .../Transforms/AMDAIEFoldDmaWaits.cpp | 255 ++++++++--------- .../test/controlcode_to_transaction.mlir | 77 +++++- .../Transforms/test/fold_dma_waits.mlir | 258 +++++++++++------- 4 files changed, 365 insertions(+), 247 deletions(-) diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp index b427036b3..0c1cf7ef9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -200,9 +200,8 @@ LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op, } LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { - // Batch DMA operations with the same row, channel, and direction into a - // single TCT sync operation, as long as they have consecutive columns. - SmallVector> columnBatches; + // Collect all half DMA ops from the async tokens. + SmallVector pushToQueueOps; for (Value asyncToken : op.getAsyncTokens()) { auto pushToQueueOp = dyn_cast_if_present( asyncToken.getDefiningOp()); @@ -210,6 +209,20 @@ LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { return op.emitOpError() << "should operate on an `amdaie.push_to_queue` op async token"; } + pushToQueueOps.push_back(pushToQueueOp); + } + // Sort the half DMA ops by channel, direction, row, and column. + std::sort(pushToQueueOps.begin(), pushToQueueOps.end(), + [](AMDAIE::NpuPushToQueueOp a, AMDAIE::NpuPushToQueueOp b) { + return std::make_tuple(a.getChannel(), a.getDirection(), + a.getRow(), a.getCol()) < + std::make_tuple(b.getChannel(), b.getDirection(), + b.getRow(), b.getCol()); + }); + // Batch DMA operations with the same row, channel, and direction into a + // single TCT sync operation, as long as they have consecutive columns. + llvm::MapVector columnBatches; + for (auto pushToQueueOp : pushToQueueOps) { if (!columnBatches.empty()) { auto &[lastPushOp, lastColNum] = columnBatches.back(); if (lastPushOp.getRow() == pushToQueueOp.getRow() && @@ -220,9 +233,8 @@ LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { continue; } } - columnBatches.push_back({pushToQueueOp, 1}); + columnBatches.insert({pushToQueueOp, 1}); } - // Convert to TCT sync ops. for (auto &[pushToQueueOp, colNum] : columnBatches) { if (failed(builder.appendTCTSync( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index e578203a4..b6bf6c877 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -16,13 +16,14 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -/// Utility function to determine whether a DMA wait op can be folded based on -/// its half DMA copy operation. -FailureOr canFoldByConnection( +using DmaQueue = std::pair; + +/// Utility function to determine whether a DMA wait op can be folded into a +/// queue based on its half DMA copy operation. +FailureOr canFoldByQueue( const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, - DenseMap, - SmallVector> &tileConnectToBdIdQueue) { + DenseMap> &dmaQueueToBdIds) { // Retrieve the connection op. std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -35,7 +36,7 @@ FailureOr canFoldByConnection( // Retrieve the flow op. 
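  // (Editorial note: the flow determines `isPacketFlow` further down; waits
  // on packet flows are never folded into a queue, only circuit flows are.)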
std::optional maybeFlowOp = connectionOp.getFlowOp(); if (!maybeFlowOp) { - return connectionOp->emitOpError() + return connectionOp.emitOpError() << "expected to operate on an `amdaie.flow`"; } AMDAIE::FlowOp flowOp = maybeFlowOp.value(); @@ -66,20 +67,18 @@ FailureOr canFoldByConnection( // duplicate BD ID in the same tile, or packet flow, or the queue is // empty uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); - bool isDuplicateBdId = - llvm::any_of(tileConnectToBdIdQueue, [&](const auto &entry) { - return entry.first.first == tileOp && - llvm::is_contained(entry.second, bdId); - }); - SmallVector &bdIdQueue = - tileConnectToBdIdQueue[{tileOp, connectionOp}]; + bool isDuplicateBdId = llvm::any_of(dmaQueueToBdIds, [&](const auto &entry) { + return entry.first.first == tileOp && + llvm::is_contained(entry.second, bdId); + }); + SmallVector &bdIds = dmaQueueToBdIds[{tileOp, connectionOp}]; bool canFold = true; - if (isDuplicateBdId || isPacketFlow || bdIdQueue.size() >= maxQueueSize || - bdIdQueue.empty()) { - bdIdQueue.clear(); + if (isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || + bdIds.empty()) { + bdIds.clear(); canFold = false; } - bdIdQueue.push_back(bdId); + bdIds.push_back(bdId); return canFold; } @@ -87,13 +86,13 @@ FailureOr canFoldByConnection( /// only one DMA wait op is retained for every maximum queue size. /// /// Example Output: assuming a maximum queue size of 4. -/// dma_cpy_nd -/// %0 = dma_cpy_nd +/// dma_cpy_nd(connection=0, bd_id=0) +/// %0 = dma_cpy_nd(connection=0, bd_id=1) /// dma_wait(%0) -/// dma_cpy_nd -/// dma_cpy_nd -/// dma_cpy_nd -/// %1 = dma_cpy_nd +/// dma_cpy_nd(connection=0, bd_id=2) +/// dma_cpy_nd(connection=0, bd_id=3) +/// dma_cpy_nd(connection=0, bd_id=4) +/// %1 = dma_cpy_nd(connection=0, bd_id=5) /// dma_wait(%1) /// From the bottom up, for every four DMA copy operations, only one DMA wait /// operation is retained. @@ -101,14 +100,11 @@ FailureOr canFoldByConnection( /// Reverse traversal simplifies handling duplicate BD IDs, preventing /// the need to revisit and modify earlier operations after processing later /// ones. -LogicalResult foldDmaWaitsByConnection( - const AMDAIE::AMDAIEDeviceModel &deviceModel, - AMDAIE::ControlCodeOp controlCodeOp) { +LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); std::vector waitOpsToErase; - DenseMap, - SmallVector> - tileConnectToBdIdQueue; + DenseMap> dmaQueueToBdIds; // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { @@ -117,8 +113,8 @@ LogicalResult foldDmaWaitsByConnection( if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = canFoldByConnection( - deviceModel, npuHalfDmaCpyNdOp, tileConnectToBdIdQueue); + FailureOr result = + canFoldByQueue(deviceModel, npuHalfDmaCpyNdOp, dmaQueueToBdIds); if (failed(result)) return WalkResult::interrupt(); toErase &= *result; } @@ -153,144 +149,113 @@ LogicalResult foldDmaWaitsByConnection( return success(); } -struct DmaColumnBatch { - uint32_t row; - uint32_t channel; - AMDAIE::DMAChannelDir direction; - - // Sorted by column. - std::map colWaitOpMap; -}; - -/// Updates a batch of asynchronous DMA wait operations by combining their -/// async tokens into a single NpuDmaWaitOp. 
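// (Editorial note: patch 2 replaces the column-based folding below with
// connection-based batching (`updateBatchTokens` / `canFoldByBatch`);
// merging of consecutive columns moves into the TCT sync lowering in
// AMDAIEControlCodeToTransaction.cpp.)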
-void updateColumnBatchTokens( - IRRewriter &rewriter, - std::map &colWaitOpMap) { - if (colWaitOpMap.size() < 2) return; - - // Check if there is any discontinuity in the columns, and if so, split into - // separate batches. - SmallVector> waitOpsList; - uint32_t prevCol = 0; - for (auto &entry : colWaitOpMap) { - uint32_t col = entry.first; - AMDAIE::NpuDmaWaitOp waitOp = entry.second; - if (waitOpsList.empty() || col != prevCol + 1) { - waitOpsList.push_back({}); +/// For each batch, combine the async tokens into a single NpuDmaWaitOp. +LogicalResult updateBatchTokens(IRRewriter &rewriter, + SmallVector &waitOps) { + // Skip if there are less than two DMA wait operations. + if (waitOps.size() < 2) return success(); + + SmallVector asyncTokens; + Operation *parentOp = waitOps[0]->getParentOp(); + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + if (waitOp->getParentOp() != parentOp) { + return waitOp.emitError( + "DMA operations to be batched must belong to the same scope"); } - waitOpsList.back().push_back(waitOp); - prevCol = col; + asyncTokens.append(waitOp.getAsyncTokens().begin(), + waitOp.getAsyncTokens().end()); } - for (SmallVector &waitOps : waitOpsList) { - // For each batch, combine the async tokens into a single NpuDmaWaitOp. - SmallVector asyncTokens; - for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { - asyncTokens.append(waitOp.getAsyncTokens().begin(), - waitOp.getAsyncTokens().end()); - } - rewriter.setInsertionPointAfter(waitOps.back()); - rewriter.create(waitOps.back().getLoc(), asyncTokens); - for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { - rewriter.eraseOp(waitOp); - } + rewriter.setInsertionPointAfter(waitOps.back()); + rewriter.create(waitOps.back().getLoc(), asyncTokens); + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + rewriter.eraseOp(waitOp); } + return success(); } -/// Utility function to determine if a DMA wait operation can be folded. -/// This is achieved by verifying whether it shares the same row, channel, -/// and direction with preceding wait operations. -LogicalResult foldByColumn(IRRewriter &rewriter, DmaColumnBatch &dmaBatch, - AMDAIE::NpuHalfDmaCpyNdOp dmaOp, - AMDAIE::NpuDmaWaitOp waitOp) { - // Get the row and column. - std::optional maybeBdIdOp = dmaOp.getBdIdOp(); - if (!maybeBdIdOp) return dmaOp.emitOpError() << "must have a BD ID op"; - AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); - AMDAIE::TileOp tileOp = - dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); - if (!tileOp) - return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; - uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); - uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); +/// Utility function to determine if a DMA wait operation can be folded into a +/// a batch based on its half DMA copy operation. +FailureOr canFoldByBatch( + AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, + SmallVector &connectionOps) { + // Retrieve the connection op. + std::optional maybeConnectionOp = + npuHalfDmaCpyNdOp.getConnectionOp(); + if (!maybeConnectionOp) { + return npuHalfDmaCpyNdOp.emitOpError() + << "expected to operate on an `amdaie.connection`"; + } + AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); - // Get the channel. 
- std::optional maybeChannelOp = dmaOp.getChannelOp(); - if (!maybeChannelOp) - return dmaOp.emitOpError() << "found non-`amdaie.channel` channel"; - AMDAIE::ChannelOp channelOp = maybeChannelOp.value(); - std::optional maybeDirection = - channelOp.getDirection(); - std::optional maybeChannel = channelOp.getValue(); - if (!maybeDirection || !maybeChannel) - return channelOp.emitOpError() << "direction and channel needed"; - AMDAIE::DMAChannelDir direction = maybeDirection.value(); - uint32_t channel = maybeChannel.value(); + // Retrieve the flow op. + std::optional maybeFlowOp = connectionOp.getFlowOp(); + if (!maybeFlowOp) { + return connectionOp.emitOpError() + << "expected to operate on an `amdaie.flow`"; + } + AMDAIE::FlowOp flowOp = maybeFlowOp.value(); + bool isPacketFlow = flowOp.getIsPacketFlow(); - if (dmaBatch.colWaitOpMap.empty() || row != dmaBatch.row || - channel != dmaBatch.channel || direction != dmaBatch.direction) { - updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); - dmaBatch = {row, channel, direction, {}}; + bool canFold = true; + // Can't fold if the current connection op already occurs in the batch, or + // if the current operation is a packet flow, or if the batch is empty. + if (llvm::is_contained(connectionOps, connectionOp) || isPacketFlow || + connectionOps.empty()) { + connectionOps.clear(); + canFold = false; } - dmaBatch.colWaitOpMap[col] = waitOp; - return success(); + connectionOps.push_back(connectionOp); + return canFold; } /// Traverses the control code forward, ensuring that only one DMA wait op is -/// retained for all the columns. +/// retained for every batch of DMA copy operations. /// /// Example Input: -/// %0 = dma_cpy_nd(col=0) -/// %1 = dma_cpy_nd(col=1) -/// %2 = dma_cpy_nd(col=2) -/// %3 = dma_cpy_nd(col=3) +/// %0 = dma_cpy_nd(connection0) /// dma_wait(%0) +/// %1 = dma_cpy_nd(connection1) +/// %2 = dma_cpy_nd(connection2) +/// %3 = dma_cpy_nd(connection3) /// dma_wait(%1) /// dma_wait(%2) /// dma_wait(%3) /// Example Output: -/// %0 = dma_cpy_nd(col=0) -/// %1 = dma_cpy_nd(col=1) -/// %2 = dma_cpy_nd(col=2) -/// %3 = dma_cpy_nd(col=3) +/// %0 = dma_cpy_nd(connection0) +/// %1 = dma_cpy_nd(connection1) +/// %2 = dma_cpy_nd(connection2) +/// %3 = dma_cpy_nd(connection3) /// dma_wait(%0, %1, %2, %3) -LogicalResult foldDmaWaitsByColumn(const AMDAIE::AMDAIEDeviceModel &deviceModel, - AMDAIE::ControlCodeOp controlCodeOp) { +LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); - DmaColumnBatch dmaBatch = {}; - - WalkResult res = controlCodeOp->walk([&](Operation *op) { - auto waitOp = dyn_cast(op); - // Skip if not a DMA wait op or if it already has multiple async tokens. - if (!waitOp || waitOp.getAsyncTokens().size() != 1) { - updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); - dmaBatch.colWaitOpMap.clear(); - return WalkResult::advance(); + SmallVector waitOps; + SmallVector connectionOps; + WalkResult res = controlCodeOp->walk([&](AMDAIE::NpuDmaWaitOp waitOp) { + bool toBatch = true; + for (Value token : waitOp.getAsyncTokens()) { + if (auto npuHalfDmaCpyNdOp = + dyn_cast_if_present( + token.getDefiningOp())) { + FailureOr result = + canFoldByBatch(npuHalfDmaCpyNdOp, connectionOps); + if (failed(result)) return WalkResult::interrupt(); + toBatch &= *result; + } } - - // Get the half DMA copy operation. 
- Value token = waitOp.getAsyncTokens().front(); - auto npuHalfDmaCpyNdOp = - dyn_cast_if_present(token.getDefiningOp()); - if (!npuHalfDmaCpyNdOp) { - waitOp.emitOpError() << "expected to operate on an " - "`amdaie.npu.half_dma_cpy_nd`"; - return WalkResult::interrupt(); - } - - // Check if the DMA wait op can be folded into the column batch. - if (succeeded( - foldByColumn(rewriter, dmaBatch, npuHalfDmaCpyNdOp, waitOp))) { - return WalkResult::advance(); - } else { - return WalkResult::interrupt(); + // Process the previous batch of wait ops, and start a new batch. + if (!toBatch) { + if (failed(updateBatchTokens(rewriter, waitOps))) + return WalkResult::interrupt(); + waitOps.clear(); } + waitOps.push_back(waitOp); + return WalkResult::advance(); }); - // Process the remaining wait ops. - updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); if (res.wasInterrupted()) return failure(); + // Process the remaining wait ops. + if (failed(updateBatchTokens(rewriter, waitOps))) return failure(); return success(); } @@ -323,10 +288,10 @@ void AMDAIEFoldDmaWaitsPass::runOnOperation() { WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); - if (failed(foldDmaWaitsByConnection(deviceModel, controlCodeOp))) { + if (failed(foldDmaWaitsByQueue(deviceModel, controlCodeOp))) { return WalkResult::interrupt(); } - if (failed(foldDmaWaitsByColumn(deviceModel, controlCodeOp))) { + if (failed(foldDmaWaitsByBatch(controlCodeOp))) { return WalkResult::interrupt(); } return WalkResult::advance(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index a75546cff..f36ad7fa2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -153,6 +153,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Same channel, direction, and row, but different col. +// Expect one TCT sync operation (0x00000080), with col_num = 4. 
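// (Editorial note: the queue-write words below keep the program order of the
// push ops (cols 0, 3, 2, 1), while the wait's tokens are first sorted,
// roughly as in this hypothetical sketch mirroring the lowering, where
// `Push` is a stand-in for `NpuPushToQueueOp`:
//   std::sort(ops.begin(), ops.end(), [](const Push &a, const Push &b) {
//     return std::tie(a.channel, a.direction, a.row, a.col) <
//            std::tie(b.channel, b.direction, b.row, b.col);
//   });
// so the four consecutive columns still collapse into one TCT sync with
// col_num = 4.)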
// CHECK: 0x06030100 // CHECK: 0x00000105 // CHECK: 0x00000005 @@ -165,7 +167,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000018 // CHECK: 0x00000000 // CHECK: 0x00000000 -// CHECK: 0x0201D214 +// CHECK: 0x0601D214 // CHECK: 0x00000000 // CHECK: 0x80000000 // CHECK: 0x00000018 @@ -177,7 +179,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000018 // CHECK: 0x00000000 // CHECK: 0x00000000 -// CHECK: 0x0601D214 +// CHECK: 0x0201D214 // CHECK: 0x00000000 // CHECK: 0x80000000 // CHECK: 0x00000018 @@ -193,9 +195,76 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { amdaie.controlcode { %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// Completely different channels, directions, rows, and cols. +// Expect four TCT sync operations (0x00000080). 
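// (Editorial note: no two tokens here share channel, direction, and row, so
// nothing can be merged; each of the four tokens lowers to its own TCT sync
// record.)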
+// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000008 +// CHECK: 0x000000B0 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0201D21C +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0401D204 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0601D20C +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00020000 +// CHECK: 0x00010100 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00000001 +// CHECK: 0x00010100 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00030000 +// CHECK: 0x01010100 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00010001 +// CHECK: 0x01010100 +// CHECK-LABEL: @wait_different_row_col_channel_direction +// CHECK: npu_instructions = dense_resource : tensor<44xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @wait_different_row_col_channel_direction() { + amdaie.workgroup { + amdaie.controlcode { + %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 3 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) amdaie.end } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir index a0034a971..954f86687 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -29,7 +29,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// Expect no DMA waits to be folded, since the same BD ID is used. +// Expect no DMA waits to be folded, since the same BD ID is used on the same connection. // CHECK-LABEL: @fold_dma_waits_same_bd_id // CHECK-COUNT-2: amdaie.npu.dma_wait // CHECK-NOT: amdaie.npu.dma_wait @@ -70,9 +70,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// DMA queue has a maximum size of 4. To optimize, starting from -// the end of the control code, retain every 4th DMA wait operation -// while folding the others. +// Same connection, but different BD IDs are used. Expect the DMA waits to be folded. +// DMA queue has a maximum size of 4. To optimize, starting from the end of the control code, +// retain every 4th DMA wait operation, while folding the others and removing their tokens. 
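// (Editorial note: `maxQueueSize` comes from the device model; with a depth
// of four, keeping one wait per four pushes on the same connection is,
// presumably, what prevents the hardware task queue from overflowing.)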
// CHECK-LABEL: @fold_dma_waits_max_queue_size // CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers // CHECK: %[[CHANNEL_0:.+]] = amdaie.channel @@ -141,92 +141,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// Two circuit connections are used, corresponding to two separate channels. -// Each channel operates with its own independent queue. -// CHECK-LABEL: @fold_dma_waits_two_connections -// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers -// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_buffers -// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel -// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection -// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection -// CHECK: %[[OBJECT_FIFO_2:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[OBJECT_FIFO_3:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> -// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> -// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id -// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) -// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id -// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @fold_dma_waits_two_connections() { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c3 = arith.constant 3 : index - amdaie.workgroup { - %tile = amdaie.tile(%c0, %c1) - %tile_0 = amdaie.tile(%c0, %c0) - %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> - %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> - %buffer_2 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> - %buffer_3 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> - %lock = amdaie.lock(%tile(4), 4) - %lock_4 = amdaie.lock(%tile(5), 0) - %lock_5 = amdaie.lock(%tile(6), 4) - %lock_6 = amdaie.lock(%tile(7), 0) - %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_4}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> - %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_5}, {%lock_6}) : memref<2048xi32, 1 : i32>, 
memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> - %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> - %5 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> - %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) - %channel_7 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S) - %channel_8 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) - %channel_9 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM) - %6 = amdaie.flow({%channel} -> {%channel_7}) {is_packet_flow = false} - %7 = amdaie.flow({%channel_8} -> {%channel_9}) {is_packet_flow = false} - %8 = amdaie.connection(%0 {%channel_7}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %9 = amdaie.connection(%3 {%channel_9}, %5 {%channel_8}, flow = %7) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - amdaie.controlcode { - %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> - memref.assume_alignment %1, 64 : memref<64x32xi32> - %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> - memref.assume_alignment %4, 64 : memref<64x32xi32> - %bd_id = amdaie.bd_id(%tile_0, %c0) - %12 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12 : !amdaie.async_token) - %bd_id_1 = amdaie.bd_id(%tile_0, %c1) - %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_1 channel = %channel_8) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13 : !amdaie.async_token) - %bd_id_2 = amdaie.bd_id(%tile_0, %c2) - %14 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14 : !amdaie.async_token) - %bd_id_3 = amdaie.bd_id(%tile_0, %c3) - %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_3 channel = %channel_8) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15 : !amdaie.async_token) - amdaie.end - } - } - return - } -} - -// ----- - -// The first two DMA operations are expected to be batched into a single DMA wait, as they share the same row, -// channel, and direction, with consecutive columns (0 and 1). The third DMA operation is not batched because -// its column (3) is not consecutive with the previous operations. -// CHECK-LABEL: @fold_dma_waits_column_batch +// The three DMA operations are accessed through different connections. +// They are expected to be batched into a single DMA wait. 
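// (Editorial note: batching succeeds here because each token comes from a
// distinct connection; `canFoldByBatch` starts a new batch as soon as a
// connection repeats or a packet flow is seen.)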
+// CHECK-LABEL: @fold_dma_waits_batching // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index @@ -239,12 +156,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_1]] // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_3_0]], %[[C0]]) // CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]] -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]], %[[TOKEN_1]] : !amdaie.async_token, !amdaie.async_token) -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_2]] : !amdaie.async_token) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]], %[[TOKEN_1]], %[[TOKEN_2]] : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @fold_dma_waits_column_batch() { + func.func @fold_dma_waits_batching() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index @@ -310,3 +226,159 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// The three DMA are operating on two different connections. +// Expect the last two DMA operations to be batched into a single DMA wait, +// while the first DMA operation is retained standalone, as each connection can only be accessed once per batch. +// CHECK-LABEL: @fold_dma_waits_batching +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_0]] +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_1]] +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]]) +// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]] +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]], %[[TOKEN_2]] : !amdaie.async_token, !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_batching() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_1 = amdaie.tile(%c1, %c1) + %tile_1_0 = amdaie.tile(%c1, %c0) + %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_1_1) : 
memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(4), 4) + %lock_3 = amdaie.lock(%tile_0_1(5), 0) + %lock_4 = amdaie.lock(%tile_1_1(4), 4) + %lock_5 = amdaie.lock(%tile_1_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false} + %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false} + %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0_0, %c0) + %12 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%12 : !amdaie.async_token) + %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0) + %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo> + %bd_id_10 = amdaie.bd_id(%tile_1_0, %c0) + %14 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_10 channel = %channel_7) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13 : !amdaie.async_token) + amdaie.npu.dma_wait(%14 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// Four DMA operations interleaved on two connections. +// DMA operations on the same connection are expected to be folded using the DMA task queue. +// DMA operations on different connections are expected to be folded using DMA batching. +// With both optimizations, a single DMA wait is retained. 
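// (Editorial note: queue folding first removes the waits on the repeated
// connections, leaving one async token per connection; batch folding then
// merges those two remaining waits into the single `dma_wait` checked below.)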
+// CHECK-LABEL: @fold_dma_waits_two_connections +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel +// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection +// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection +// CHECK: %[[OBJECT_FIFO_2:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[OBJECT_FIFO_3:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]], %[[TOKEN_1]] : !amdaie.async_token, !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_two_connections() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_4 = amdaie.lock(%tile(5), 0) + %lock_5 = amdaie.lock(%tile(6), 4) + %lock_6 = amdaie.lock(%tile(7), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_4}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_5}, {%lock_6}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_7 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S) + 
%channel_8 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %channel_9 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel} -> {%channel_7}) {is_packet_flow = false} + %7 = amdaie.flow({%channel_8} -> {%channel_9}) {is_packet_flow = false} + %8 = amdaie.connection(%0 {%channel_7}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %9 = amdaie.connection(%3 {%channel_9}, %5 {%channel_8}, flow = %7) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %12 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%12 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_1 channel = %channel_8) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13 : !amdaie.async_token) + %bd_id_2 = amdaie.bd_id(%tile_0, %c2) + %14 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%14 : !amdaie.async_token) + %bd_id_3 = amdaie.bd_id(%tile_0, %c3) + %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_3 channel = %channel_8) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15 : !amdaie.async_token) + amdaie.end + } + } + return + } +} From fb6d4d28130f48120d73fe535daf54def6542acc Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Tue, 17 Dec 2024 12:20:50 +0000 Subject: [PATCH 3/8] resolve comments --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 80 +++++----- .../Transforms/AMDAIEInsertDmaBdChain.cpp | 17 ++- .../Transforms/test/fold_dma_waits.mlir | 142 ++++++++++++++---- 3 files changed, 168 insertions(+), 71 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index b6bf6c877..7bdd8d0d8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -16,14 +16,14 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -using DmaQueue = std::pair; +using DmaQueueKey = std::pair; /// Utility function to determine whether a DMA wait op can be folded into a /// queue based on its half DMA copy operation. FailureOr canFoldByQueue( const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, - DenseMap> &dmaQueueToBdIds) { + DenseMap> &dmaQueueToBdIds) { // Retrieve the connection op. std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -104,7 +104,7 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); std::vector waitOpsToErase; - DenseMap> dmaQueueToBdIds; + DenseMap> dmaQueueToBdIds; // Traverse the control code in reverse. 
WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { @@ -168,17 +168,15 @@ LogicalResult updateBatchTokens(IRRewriter &rewriter, rewriter.setInsertionPointAfter(waitOps.back()); rewriter.create(waitOps.back().getLoc(), asyncTokens); - for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { - rewriter.eraseOp(waitOp); - } + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) rewriter.eraseOp(waitOp); return success(); } /// Utility function to determine if a DMA wait operation can be folded into a /// a batch based on its half DMA copy operation. -FailureOr canFoldByBatch( - AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, - SmallVector &connectionOps) { +FailureOr canFoldByBatch(Operation *batchParentOp, + AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, + DenseSet &connectionOps) { // Retrieve the connection op. std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -199,17 +197,19 @@ FailureOr canFoldByBatch( bool canFold = true; // Can't fold if the current connection op already occurs in the batch, or - // if the current operation is a packet flow, or if the batch is empty. - if (llvm::is_contained(connectionOps, connectionOp) || isPacketFlow || - connectionOps.empty()) { + // if the current operation is a packet flow, or if the batch is empty, or + // if the current operation is not in the same scope as the batch. + if (connectionOps.contains(connectionOp) || isPacketFlow || + connectionOps.empty() || + (batchParentOp != npuHalfDmaCpyNdOp->getParentOp())) { connectionOps.clear(); canFold = false; } - connectionOps.push_back(connectionOp); + connectionOps.insert(connectionOp); return canFold; } -/// Traverses the control code forward, ensuring that only one DMA wait op is +/// Traverses the control code in reverse, ensuring that only one DMA wait op is /// retained for every batch of DMA copy operations. /// /// Example Input: @@ -227,34 +227,42 @@ FailureOr canFoldByBatch( /// %2 = dma_cpy_nd(connection2) /// %3 = dma_cpy_nd(connection3) /// dma_wait(%0, %1, %2, %3) +/// Reverse traversal simplifies handling duplicate connections, preventing +/// the need to revisit and modify earlier operations after processing later +/// ones. LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); SmallVector waitOps; - SmallVector connectionOps; - WalkResult res = controlCodeOp->walk([&](AMDAIE::NpuDmaWaitOp waitOp) { - bool toBatch = true; - for (Value token : waitOp.getAsyncTokens()) { - if (auto npuHalfDmaCpyNdOp = - dyn_cast_if_present( - token.getDefiningOp())) { - FailureOr result = - canFoldByBatch(npuHalfDmaCpyNdOp, connectionOps); - if (failed(result)) return WalkResult::interrupt(); - toBatch &= *result; - } - } - // Process the previous batch of wait ops, and start a new batch. - if (!toBatch) { - if (failed(updateBatchTokens(rewriter, waitOps))) - return WalkResult::interrupt(); - waitOps.clear(); - } - waitOps.push_back(waitOp); - return WalkResult::advance(); - }); + DenseSet connectionOps; + WalkResult res = controlCodeOp->walk( + [&](AMDAIE::NpuDmaWaitOp waitOp) { + bool toBatch = true; + Operation *batchParentOp = + waitOps.empty() ? 
waitOp->getParentOp() : waitOps[0]->getParentOp(); + for (Value token : waitOp.getAsyncTokens()) { + if (auto npuHalfDmaCpyNdOp = + dyn_cast_if_present( + token.getDefiningOp())) { + FailureOr result = + canFoldByBatch(batchParentOp, npuHalfDmaCpyNdOp, connectionOps); + if (failed(result)) return WalkResult::interrupt(); + toBatch &= *result; + } + } + // Process the previous batch of wait ops, and start a new batch. + if (!toBatch) { + std::reverse(waitOps.begin(), waitOps.end()); + if (failed(updateBatchTokens(rewriter, waitOps))) + return WalkResult::interrupt(); + waitOps.clear(); + } + waitOps.push_back(waitOp); + return WalkResult::advance(); + }); if (res.wasInterrupted()) return failure(); // Process the remaining wait ops. + std::reverse(waitOps.begin(), waitOps.end()); if (failed(updateBatchTokens(rewriter, waitOps))) return failure(); return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp index b21ceb025..352c8e500 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp @@ -17,7 +17,7 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -using DmaChain = std::pair; +using DmaChainKey = std::pair; /// Utility function to update `next_bd` and `start_bd` operands. LogicalResult updateChainOperands( @@ -83,9 +83,9 @@ LogicalResult updateChainOperands( /// - Chain X: [0] (the newly added BD ID). /// - Chain Y: [] (emptied after breaking). void checkForChainsToBeBroken( - uint32_t currBdId, const DmaChain &currDmaChain, - const DenseMap> &dmaChainToBdIds, - SmallVector &chainsToBreak) { + uint32_t currBdId, const DmaChainKey &currDmaChain, + const DenseMap> &dmaChainToBdIds, + SmallVector &chainsToBreak) { for (auto &[entry, bdIds] : dmaChainToBdIds) { if (entry.first == currDmaChain.first && bdIds.contains(currBdId)) { // Break the chain that contains the duplicate BD ID. @@ -120,9 +120,10 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, } // BD IDs that have been assigned in each tile. - DenseMap> dmaChainToBdIds; + DenseMap> dmaChainToBdIds; // Buffers the DMA ops that will be chained. - DenseMap> dmaChainToDmaOps; + DenseMap> + dmaChainToDmaOps; res = controlCodeOp->walk([&](Operation *op) { @@ -185,8 +186,8 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, // Any duplicate BD ID from the same tile indicates that the chain // cannot grow further and requires breaking to release the // conflicting BD ID. - SmallVector chainsToBreak; - DmaChain currDmaChain = {tileOp, connectionOp}; + SmallVector chainsToBreak; + DmaChainKey currDmaChain = {tileOp, connectionOp}; checkForChainsToBeBroken(bdId, currDmaChain, dmaChainToBdIds, chainsToBreak); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir index 954f86687..f74b8bad6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -70,6 +70,66 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Expect no DMA waits to be folded, since they are operating on different scopes. 
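+// An illustrative shape of the input (hypothetical, simplified):
+//   scf.for ... { %0 = cpy(c0); wait(%0) }   // wait inside the loop body
+//   %1 = cpy(c1); wait(%1)                   // wait in the outer scope
+// The two waits have different parent ops, so neither queue folding nor
+// batching may combine them.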
+// CHECK-LABEL: @fold_dma_waits_loop +// CHECK-COUNT-2: amdaie.npu.dma_wait +// CHECK-NOT: amdaie.npu.dma_wait +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_loop() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + amdaie.workgroup { + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_1 = amdaie.tile(%c1, %c1) + %tile_1_0 = amdaie.tile(%c1, %c0) + %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(4), 4) + %lock_3 = amdaie.lock(%tile_0_1(5), 0) + %lock_4 = amdaie.lock(%tile_1_1(4), 4) + %lock_5 = amdaie.lock(%tile_1_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false} + %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false} + %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + scf.for %arg0 = %c0 to %c1 step %c8 { + %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0) + %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13 : !amdaie.async_token) + } + %bd_id = amdaie.bd_id(%tile_1_0, %c0) + %12 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id channel = %channel_7) : !amdaie.logicalobjectfifo> + 
amdaie.npu.dma_wait(%12 : !amdaie.async_token)
+ amdaie.end
+ }
+ }
+ return
+ }
+}
+
+// -----
+
 // Same connection, but different BD IDs are used. Expect the DMA waits to be folded.
 // DMA queue has a maximum size of 4. To optimize, starting from the end of the control code,
 // retain every 4th DMA wait operation, while folding the others and removing their tokens.
@@ -229,14 +289,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}

 // -----

-// The three DMA are operating on two different connections.
-// Expect the last two DMA operations to be batched into a single DMA wait,
-// while the first DMA operation is retained standalone, as each connection can only be accessed once per batch.
-// CHECK-LABEL: @fold_dma_waits_batching
+// The five DMA operations are operating on three different connections.
+// Expect the first DMA operation to be retained standalone, while the rest are batched into two DMA waits.
+// This is because each connection can only be accessed once per batch.
+// CHECK-LABEL: @fold_dma_waits_multi_batching
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
 // CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]])
+// CHECK: %[[TILE_3_0:.+]] = amdaie.tile(%[[C3]], %[[C0]])
 // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_0]]
 // CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
@@ -245,10 +307,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
 // CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]]
 // CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]], %[[TOKEN_2]] : !amdaie.async_token, !amdaie.async_token)
+// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id(%[[TILE_3_0]], %[[C0]])
+// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_3]]
+// CHECK: %[[BD_ID_4:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
+// CHECK: %[[TOKEN_4:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_4]]
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_3]], %[[TOKEN_4]] : !amdaie.async_token, !amdaie.async_token)
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @fold_dma_waits_batching() {
+  func.func @fold_dma_waits_multi_batching() {
 %c0 = arith.constant 0 : index
 %c1 = arith.constant 1 : index
 %c3 = arith.constant 3 : index
@@ -257,42 +324,63 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 %tile_0_0 = amdaie.tile(%c0, %c0)
 %tile_1_1 = amdaie.tile(%c1, %c1)
 %tile_1_0 = amdaie.tile(%c1, %c0)
+ %tile_3_1 = amdaie.tile(%c3, %c1)
+ %tile_3_0 = amdaie.tile(%c3, %c0)
 %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
 %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
 %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
 %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+ %buffer_3 = amdaie.buffer(%tile_3_1) :
memref<2048xi32, 1 : i32> + %buffer_4 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> %lock = amdaie.lock(%tile_0_1(4), 4) - %lock_3 = amdaie.lock(%tile_0_1(5), 0) - %lock_4 = amdaie.lock(%tile_1_1(4), 4) - %lock_5 = amdaie.lock(%tile_1_1(5), 0) - %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %lock_5 = amdaie.lock(%tile_0_1(5), 0) + %lock_6 = amdaie.lock(%tile_1_1(4), 4) + %lock_7 = amdaie.lock(%tile_1_1(5), 0) + %lock_8 = amdaie.lock(%tile_3_1(4), 4) + %lock_9 = amdaie.lock(%tile_3_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_6}, {%lock_7}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %8 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo> %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) - %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) - %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) - %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) - %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false} - %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false} - %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %channel_10 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_11 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_12 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) + %channel_13 = amdaie.channel(%tile_3_0, 0, port_type = DMA, direction = MM2S) + %channel_14 = amdaie.channel(%tile_3_1, 0, port_type = DMA, direction = S2MM) + %9 = amdaie.flow({%channel} -> {%channel_10}) {is_packet_flow = false} + %10 = amdaie.connection(%0 {%channel_10}, %2 {%channel}, flow = %9) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %11 = amdaie.flow({%channel_11} -> {%channel_12}) {is_packet_flow = false} + %12 = amdaie.connection(%3 
{%channel_12}, %5 {%channel_11}, flow = %11) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %13 = amdaie.flow({%channel_13} -> {%channel_14}) {is_packet_flow = false} + %14 = amdaie.connection(%6 {%channel_14}, %8 {%channel_13}, flow = %13) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + %15 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> memref.assume_alignment %1, 64 : memref<64x32xi32> - %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + %16 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> memref.assume_alignment %4, 64 : memref<64x32xi32> + %17 = amdaie.logicalobjectfifo.from_memref %7, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %7, 64 : memref<64x32xi32> %bd_id = amdaie.bd_id(%tile_0_0, %c0) - %12 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12 : !amdaie.async_token) - %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0) - %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo> - %bd_id_10 = amdaie.bd_id(%tile_1_0, %c0) - %14 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_10 channel = %channel_7) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13 : !amdaie.async_token) - amdaie.npu.dma_wait(%14 : !amdaie.async_token) + %18 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%18 : !amdaie.async_token) + %bd_id_15 = amdaie.bd_id(%tile_0_0, %c0) + %19 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id_15 channel = %channel) : !amdaie.logicalobjectfifo> + %bd_id_16 = amdaie.bd_id(%tile_1_0, %c0) + %20 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_16 channel = %channel_11) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%19 : !amdaie.async_token) + amdaie.npu.dma_wait(%20 : !amdaie.async_token) + %bd_id_17 = amdaie.bd_id(%tile_3_0, %c0) + %21 = amdaie.npu.half_dma_cpy_nd async %14(%17 [] [] [] bd_id = %bd_id_17 channel = %channel_13) : !amdaie.logicalobjectfifo> + %bd_id_18 = amdaie.bd_id(%tile_1_0, %c0) + %22 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_18 channel = %channel_11) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%21 : !amdaie.async_token) + amdaie.npu.dma_wait(%22 : !amdaie.async_token) amdaie.end } } From b13c577440f469ed31a07ab776a2e5ba6dbf745b Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Tue, 17 Dec 2024 16:53:12 +0000 Subject: [PATCH 4/8] fix test and refactor --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 183 +++++++++++++----- 1 file changed, 131 insertions(+), 52 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 7bdd8d0d8..b622c7146 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -16,14 +16,58 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -using 
DmaQueueKey = std::pair; +using DmaBdIdKey = std::pair; + +/// Utility function to erase the DMA wait operations in the queue, except for +/// the last one. +LogicalResult eraseQueueOperations(IRRewriter &rewriter, + SmallVector &waitOps) { + // Skip if there are less than two DMA wait operations in the queue. + if (waitOps.size() < 2) return success(); + + Operation *parentOp = waitOps.back()->getParentOp(); + // Do not modify the last wait op, it will be kept. + waitOps.pop_back(); + + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + if (waitOp->getParentOp() != parentOp) { + return waitOp.emitError( + "DMA operations to be queued must belong to the same scope"); + } + // Erase the wait op. + SmallVector asyncTokens(waitOp.getAsyncTokens()); + rewriter.eraseOp(waitOp); + for (Value token : asyncTokens) { + auto dmaOp = + dyn_cast_if_present(token.getDefiningOp()); + if (!dmaOp) + waitOp.emitError("expected to operate on an `amdaie.half_dma_cpy_nd`"); + if (dmaOp.use_empty()) { + rewriter.setInsertionPoint(dmaOp); + TypeRange resultTypeRange = TypeRange{}; + // Nullify the result to avoid issuing a token. + rewriter.create( + dmaOp.getLoc(), resultTypeRange, dmaOp.getConnection(), + dmaOp.getInput(), dmaOp.getMixedOffsets(), dmaOp.getMixedSizes(), + dmaOp.getMixedStrides(), dmaOp.getBdId(), dmaOp.getChannel(), + dmaOp.getNextBd(), dmaOp.getStartBd()); + rewriter.eraseOp(dmaOp); + } + } + } + return success(); +} /// Utility function to determine whether a DMA wait op can be folded into a /// queue based on its half DMA copy operation. FailureOr canFoldByQueue( - const AMDAIE::AMDAIEDeviceModel &deviceModel, + const AMDAIE::AMDAIEDeviceModel &deviceModel, Operation *queueParentOp, AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, - DenseMap> &dmaQueueToBdIds) { + DenseMap> &dmaBdIdsMap) { + // Check if the current operation is in the same scope as the rest of the + // queue. + bool isSameScope = npuHalfDmaCpyNdOp->getParentOp() == queueParentOp; + // Retrieve the connection op. std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -63,22 +107,24 @@ FailureOr canFoldByQueue( uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); - // Keep wait op if, either reaches the maximum queue size, or a - // duplicate BD ID in the same tile, or packet flow, or the queue is - // empty uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); - bool isDuplicateBdId = llvm::any_of(dmaQueueToBdIds, [&](const auto &entry) { - return entry.first.first == tileOp && - llvm::is_contained(entry.second, bdId); + bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { + return entry.first.first == tileOp && entry.second.contains(bdId); }); - SmallVector &bdIds = dmaQueueToBdIds[{tileOp, connectionOp}]; + DenseSet &bdIds = dmaBdIdsMap[{tileOp, connectionOp}]; bool canFold = true; + // Can't fold wait op if: + // (1) the current BD ID on the same tile already occurs in the queue, or + // (2) the current operation is a packet flow, or + // (3) reaches the maximum queue size, or + // (4) the queue is empty, or + // (5) the current operation is not in the same scope as the queue. 
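+  // For example (hypothetical trace): with maxQueueSize == 4 and BD IDs
+  // {0, 1, 2, 3} already queued for this {tile, connection} pair, the next
+  // wait op trips the size check, so it is kept and a new queue is started.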
if (isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || - bdIds.empty()) { + bdIds.empty() || !isSameScope) { bdIds.clear(); canFold = false; } - bdIds.push_back(bdId); + bdIds.insert(bdId); return canFold; } @@ -103,49 +149,43 @@ FailureOr canFoldByQueue( LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); - std::vector waitOpsToErase; - DenseMap> dmaQueueToBdIds; + SmallVector> waitOpQueues; + DenseMap> dmaBdIdsMap; // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { - bool toErase = true; + bool toFold = true; + Operation *queueParentOp = + waitOpQueues.empty() ? waitOp->getParentOp() + : waitOpQueues.back().front()->getParentOp(); for (Value token : waitOp.getAsyncTokens()) { if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = - canFoldByQueue(deviceModel, npuHalfDmaCpyNdOp, dmaQueueToBdIds); + FailureOr result = canFoldByQueue( + deviceModel, queueParentOp, npuHalfDmaCpyNdOp, dmaBdIdsMap); if (failed(result)) return WalkResult::interrupt(); - toErase &= *result; + toFold &= *result; } } - // Erase later to avoid invalidating the iterator. - if (toErase) waitOpsToErase.push_back(waitOp); + // Store all the queues, and modify later to avoid invalidating the + // iterator. + if (toFold) { + // Append the wait op to the last queue if it can be folded. + waitOpQueues.back().push_back(waitOp); + } else { + // Create a new queue if the wait op cannot be folded. + waitOpQueues.push_back(SmallVector{waitOp}); + } return WalkResult::advance(); }); if (res.wasInterrupted()) return failure(); - - for (AMDAIE::NpuDmaWaitOp waitOp : waitOpsToErase) { - SmallVector asyncTokens(waitOp.getAsyncTokens()); - // Erase the wait op. - rewriter.eraseOp(waitOp); - for (Value token : asyncTokens) { - if (auto op = dyn_cast_if_present( - token.getDefiningOp())) { - if (op.use_empty()) { - rewriter.setInsertionPoint(op); - TypeRange resultTypeRange = TypeRange{}; - // Nullify the result to avoid issuing a token. - rewriter.create( - op.getLoc(), resultTypeRange, op.getConnection(), op.getInput(), - op.getMixedOffsets(), op.getMixedSizes(), op.getMixedStrides(), - op.getBdId(), op.getChannel(), op.getNextBd(), op.getStartBd()); - rewriter.eraseOp(op); - } - } - } + for (SmallVector &waitOps : waitOpQueues) { + // Since the controlcode is traversed in reverse order, we need to + // restore the original order of the DMA operations. + std::reverse(waitOps.begin(), waitOps.end()); + if (failed(eraseQueueOperations(rewriter, waitOps))) return failure(); } - return success(); } @@ -174,9 +214,14 @@ LogicalResult updateBatchTokens(IRRewriter &rewriter, /// Utility function to determine if a DMA wait operation can be folded into a /// a batch based on its half DMA copy operation. -FailureOr canFoldByBatch(Operation *batchParentOp, - AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, - DenseSet &connectionOps) { +FailureOr canFoldByBatch( + Operation *batchParentOp, AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, + DenseSet &connectionOps, + DenseMap> &dmaBdIdsMap) { + // Check if the current operation is in the same scope as the rest of the + // batch. + bool isSameScope = npuHalfDmaCpyNdOp->getParentOp() == batchParentOp; + // Retrieve the connection op. 
std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -195,17 +240,48 @@ FailureOr canFoldByBatch(Operation *batchParentOp, AMDAIE::FlowOp flowOp = maybeFlowOp.value(); bool isPacketFlow = flowOp.getIsPacketFlow(); + // Retrieve the BD ID op. + std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); + if (!maybeBdIdOp) { + return npuHalfDmaCpyNdOp.emitOpError() + << "must have a BD ID op to lower to " + "`amdaie.npu.write_bd`"; + } + AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + + // Retrieve the tile op. + AMDAIE::TileOp tileOp = + dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); + if (!tileOp) { + return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + } + + bool isDuplicateConnection = connectionOps.contains(connectionOp); + uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); + bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { + return entry.first.first == tileOp && entry.second.contains(bdId); + }); + bool canFold = true; - // Can't fold if the current connection op already occurs in the batch, or - // if the current operation is a packet flow, or if the batch is empty, or - // if the current operation is not in the same scope as the batch. - if (connectionOps.contains(connectionOp) || isPacketFlow || - connectionOps.empty() || - (batchParentOp != npuHalfDmaCpyNdOp->getParentOp())) { + // Can't fold wait op if: + // (1) the current connection op already occurs in the batch, or + // (2) the current BD ID on the same tile already occurs in the batch, or + // (3) the current operation is a packet flow, or + // (4) the batch is empty, or + // (5) the current operation is not in the same scope as the batch. + if (isDuplicateConnection || isDuplicateBdId || isPacketFlow || + connectionOps.empty() || !isSameScope) { + // Clear the BD IDs for all the connections in the batch. + for (auto &entry : dmaBdIdsMap) { + ConnectionOp connectionOp = entry.first.second; + DenseSet &bdIds = entry.second; + if (connectionOps.contains(connectionOp)) bdIds.clear(); + } connectionOps.clear(); canFold = false; } connectionOps.insert(connectionOp); + dmaBdIdsMap[{tileOp, connectionOp}].insert(bdId); return canFold; } @@ -234,6 +310,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); SmallVector waitOps; DenseSet connectionOps; + DenseMap> dmaBdIdsMap; WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { bool toBatch = true; @@ -243,14 +320,16 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = - canFoldByBatch(batchParentOp, npuHalfDmaCpyNdOp, connectionOps); + FailureOr result = canFoldByBatch( + batchParentOp, npuHalfDmaCpyNdOp, connectionOps, dmaBdIdsMap); if (failed(result)) return WalkResult::interrupt(); toBatch &= *result; } } // Process the previous batch of wait ops, and start a new batch. if (!toBatch) { + // Since the controlcode is traversed in reverse order, we need to + // restore the original order of the DMA operations. 
std::reverse(waitOps.begin(), waitOps.end()); if (failed(updateBatchTokens(rewriter, waitOps))) return WalkResult::interrupt(); From a10aedd5b56184d850f1619bf18668998e041c4b Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Tue, 17 Dec 2024 21:30:12 +0000 Subject: [PATCH 5/8] separate canFold decisions with update --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 134 +++++++++++------- 1 file changed, 81 insertions(+), 53 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index b622c7146..21b0af3f5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -61,18 +61,20 @@ LogicalResult eraseQueueOperations(IRRewriter &rewriter, /// Utility function to determine whether a DMA wait op can be folded into a /// queue based on its half DMA copy operation. FailureOr canFoldByQueue( - const AMDAIE::AMDAIEDeviceModel &deviceModel, Operation *queueParentOp, - AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, - DenseMap> &dmaBdIdsMap) { + const AMDAIE::AMDAIEDeviceModel &deviceModel, + const Operation *queueParentOp, + const DenseMap> &dmaBdIdsMap, + DmaBdIdKey &currBdIdKey, uint32_t &currBdIdVal, + AMDAIE::NpuHalfDmaCpyNdOp &currHalfDmaCpyNdOp) { // Check if the current operation is in the same scope as the rest of the // queue. - bool isSameScope = npuHalfDmaCpyNdOp->getParentOp() == queueParentOp; + bool isSameScope = currHalfDmaCpyNdOp->getParentOp() == queueParentOp; // Retrieve the connection op. std::optional maybeConnectionOp = - npuHalfDmaCpyNdOp.getConnectionOp(); + currHalfDmaCpyNdOp.getConnectionOp(); if (!maybeConnectionOp) { - return npuHalfDmaCpyNdOp.emitOpError() + return currHalfDmaCpyNdOp.emitOpError() << "expected to operate on an `amdaie.connection`"; } AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); @@ -87,13 +89,14 @@ FailureOr canFoldByQueue( bool isPacketFlow = flowOp.getIsPacketFlow(); // Retrieve the BD ID op. - std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); + std::optional maybeBdIdOp = currHalfDmaCpyNdOp.getBdIdOp(); if (!maybeBdIdOp) { - return npuHalfDmaCpyNdOp.emitOpError() + return currHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op to lower to " "`amdaie.npu.write_bd`"; } AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue()); // Retrieve the tile op. AMDAIE::TileOp tileOp = @@ -101,31 +104,26 @@ FailureOr canFoldByQueue( if (!tileOp) { return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; } + currBdIdKey = {tileOp, connectionOp}; // Get the maximum queue size. 
uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); - uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { - return entry.first.first == tileOp && entry.second.contains(bdId); + return entry.first.first == tileOp && entry.second.contains(currBdIdVal); }); - DenseSet &bdIds = dmaBdIdsMap[{tileOp, connectionOp}]; - bool canFold = true; + const DenseSet &bdIds = dmaBdIdsMap.lookup(currBdIdKey); + // Can't fold wait op if: // (1) the current BD ID on the same tile already occurs in the queue, or // (2) the current operation is a packet flow, or // (3) reaches the maximum queue size, or // (4) the queue is empty, or // (5) the current operation is not in the same scope as the queue. - if (isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || - bdIds.empty() || !isSameScope) { - bdIds.clear(); - canFold = false; - } - bdIds.insert(bdId); - return canFold; + return !(isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || + bdIds.empty() || !isSameScope); } /// Traverses the control code in reverse, ensuring that for each connection, @@ -151,6 +149,16 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, IRRewriter rewriter(controlCodeOp->getContext()); SmallVector> waitOpQueues; DenseMap> dmaBdIdsMap; + + auto updateWithCurrBdId = + [&](bool canFold, DenseMap> &dmaBdIdsMap, + DmaBdIdKey &currBdIdKey, uint32_t currBdIdVal) { + assert(currBdIdKey.first && "TileOp must not be null"); + assert(currBdIdKey.second && "ConnectionOp must not be null"); + if (!canFold) dmaBdIdsMap[currBdIdKey].clear(); + dmaBdIdsMap[currBdIdKey].insert(currBdIdVal); + }; + // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { @@ -162,10 +170,14 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = canFoldByQueue( - deviceModel, queueParentOp, npuHalfDmaCpyNdOp, dmaBdIdsMap); + DmaBdIdKey currBdIdKey = {nullptr, nullptr}; + uint32_t currBdIdVal = 0; + FailureOr result = + canFoldByQueue(deviceModel, queueParentOp, dmaBdIdsMap, + currBdIdKey, currBdIdVal, npuHalfDmaCpyNdOp); if (failed(result)) return WalkResult::interrupt(); toFold &= *result; + updateWithCurrBdId(*result, dmaBdIdsMap, currBdIdKey, currBdIdVal); } } // Store all the queues, and modify later to avoid invalidating the @@ -190,8 +202,8 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, } /// For each batch, combine the async tokens into a single NpuDmaWaitOp. -LogicalResult updateBatchTokens(IRRewriter &rewriter, - SmallVector &waitOps) { +LogicalResult eraseBatchOperations(IRRewriter &rewriter, + SmallVector &waitOps) { // Skip if there are less than two DMA wait operations. if (waitOps.size() < 2) return success(); @@ -215,21 +227,24 @@ LogicalResult updateBatchTokens(IRRewriter &rewriter, /// Utility function to determine if a DMA wait operation can be folded into a /// a batch based on its half DMA copy operation. 
FailureOr canFoldByBatch( - Operation *batchParentOp, AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, - DenseSet &connectionOps, - DenseMap> &dmaBdIdsMap) { + const Operation *batchParentOp, + const DenseSet &connectionOps, + const DenseMap> &dmaBdIdsMap, + DmaBdIdKey &currBdIdKey, uint32_t &currBdIdVal, + AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp) { // Check if the current operation is in the same scope as the rest of the // batch. - bool isSameScope = npuHalfDmaCpyNdOp->getParentOp() == batchParentOp; + bool isSameScope = currHalfDmaCpyNdOp->getParentOp() == batchParentOp; // Retrieve the connection op. std::optional maybeConnectionOp = - npuHalfDmaCpyNdOp.getConnectionOp(); + currHalfDmaCpyNdOp.getConnectionOp(); if (!maybeConnectionOp) { - return npuHalfDmaCpyNdOp.emitOpError() + return currHalfDmaCpyNdOp.emitOpError() << "expected to operate on an `amdaie.connection`"; } AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); + bool isDuplicateConnection = connectionOps.contains(connectionOp); // Retrieve the flow op. std::optional maybeFlowOp = connectionOp.getFlowOp(); @@ -241,13 +256,14 @@ FailureOr canFoldByBatch( bool isPacketFlow = flowOp.getIsPacketFlow(); // Retrieve the BD ID op. - std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); + std::optional maybeBdIdOp = currHalfDmaCpyNdOp.getBdIdOp(); if (!maybeBdIdOp) { - return npuHalfDmaCpyNdOp.emitOpError() + return currHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op to lower to " "`amdaie.npu.write_bd`"; } AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue()); // Retrieve the tile op. AMDAIE::TileOp tileOp = @@ -255,34 +271,20 @@ FailureOr canFoldByBatch( if (!tileOp) { return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; } + currBdIdKey = {tileOp, connectionOp}; - bool isDuplicateConnection = connectionOps.contains(connectionOp); - uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { - return entry.first.first == tileOp && entry.second.contains(bdId); + return entry.first.first == tileOp && entry.second.contains(currBdIdVal); }); - bool canFold = true; // Can't fold wait op if: // (1) the current connection op already occurs in the batch, or // (2) the current BD ID on the same tile already occurs in the batch, or // (3) the current operation is a packet flow, or // (4) the batch is empty, or // (5) the current operation is not in the same scope as the batch. - if (isDuplicateConnection || isDuplicateBdId || isPacketFlow || - connectionOps.empty() || !isSameScope) { - // Clear the BD IDs for all the connections in the batch. 
- for (auto &entry : dmaBdIdsMap) { - ConnectionOp connectionOp = entry.first.second; - DenseSet &bdIds = entry.second; - if (connectionOps.contains(connectionOp)) bdIds.clear(); - } - connectionOps.clear(); - canFold = false; - } - connectionOps.insert(connectionOp); - dmaBdIdsMap[{tileOp, connectionOp}].insert(bdId); - return canFold; + return !(isDuplicateConnection || isDuplicateBdId || isPacketFlow || + connectionOps.empty() || !isSameScope); } /// Traverses the control code in reverse, ensuring that only one DMA wait op is @@ -311,6 +313,27 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { SmallVector waitOps; DenseSet connectionOps; DenseMap> dmaBdIdsMap; + + auto updateWithCurrBdId = + [&](bool canFold, DenseSet &connectionOps, + DenseMap> &dmaBdIdsMap, + DmaBdIdKey &currBdIdKey, uint32_t currBdIdVal) { + assert(currBdIdKey.first && "TileOp must not be null"); + assert(currBdIdKey.second && "ConnectionOp must not be null"); + if (!canFold) { + // Clear the BD IDs for all the connections in the batch. + for (auto &entry : dmaBdIdsMap) { + ConnectionOp connectionOp = entry.first.second; + DenseSet &bdIds = entry.second; + if (connectionOps.contains(connectionOp)) bdIds.clear(); + } + connectionOps.clear(); + } + connectionOps.insert(currBdIdKey.second); + dmaBdIdsMap[currBdIdKey].insert(currBdIdVal); + }; + + // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { bool toBatch = true; @@ -320,10 +343,15 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = canFoldByBatch( - batchParentOp, npuHalfDmaCpyNdOp, connectionOps, dmaBdIdsMap); + DmaBdIdKey currBdIdKey = {nullptr, nullptr}; + uint32_t currBdIdVal = 0; + FailureOr result = + canFoldByBatch(batchParentOp, connectionOps, dmaBdIdsMap, + currBdIdKey, currBdIdVal, npuHalfDmaCpyNdOp); if (failed(result)) return WalkResult::interrupt(); toBatch &= *result; + updateWithCurrBdId(*result, connectionOps, dmaBdIdsMap, currBdIdKey, + currBdIdVal); } } // Process the previous batch of wait ops, and start a new batch. @@ -331,7 +359,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { // Since the controlcode is traversed in reverse order, we need to // restore the original order of the DMA operations. std::reverse(waitOps.begin(), waitOps.end()); - if (failed(updateBatchTokens(rewriter, waitOps))) + if (failed(eraseBatchOperations(rewriter, waitOps))) return WalkResult::interrupt(); waitOps.clear(); } @@ -342,7 +370,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { if (res.wasInterrupted()) return failure(); // Process the remaining wait ops. 
std::reverse(waitOps.begin(), waitOps.end()); - if (failed(updateBatchTokens(rewriter, waitOps))) return failure(); + if (failed(eraseBatchOperations(rewriter, waitOps))) return failure(); return success(); } From cc1ae9e027fc2a580421325de63725b90896cdfd Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Tue, 17 Dec 2024 21:39:10 +0000 Subject: [PATCH 6/8] separate refactor --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 176 +++++++----------- .../Transforms/AMDAIEInsertDmaBdChain.cpp | 17 +- 2 files changed, 71 insertions(+), 122 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 21b0af3f5..973d30449 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -18,63 +18,17 @@ namespace { using DmaBdIdKey = std::pair; -/// Utility function to erase the DMA wait operations in the queue, except for -/// the last one. -LogicalResult eraseQueueOperations(IRRewriter &rewriter, - SmallVector &waitOps) { - // Skip if there are less than two DMA wait operations in the queue. - if (waitOps.size() < 2) return success(); - - Operation *parentOp = waitOps.back()->getParentOp(); - // Do not modify the last wait op, it will be kept. - waitOps.pop_back(); - - for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { - if (waitOp->getParentOp() != parentOp) { - return waitOp.emitError( - "DMA operations to be queued must belong to the same scope"); - } - // Erase the wait op. - SmallVector asyncTokens(waitOp.getAsyncTokens()); - rewriter.eraseOp(waitOp); - for (Value token : asyncTokens) { - auto dmaOp = - dyn_cast_if_present(token.getDefiningOp()); - if (!dmaOp) - waitOp.emitError("expected to operate on an `amdaie.half_dma_cpy_nd`"); - if (dmaOp.use_empty()) { - rewriter.setInsertionPoint(dmaOp); - TypeRange resultTypeRange = TypeRange{}; - // Nullify the result to avoid issuing a token. - rewriter.create( - dmaOp.getLoc(), resultTypeRange, dmaOp.getConnection(), - dmaOp.getInput(), dmaOp.getMixedOffsets(), dmaOp.getMixedSizes(), - dmaOp.getMixedStrides(), dmaOp.getBdId(), dmaOp.getChannel(), - dmaOp.getNextBd(), dmaOp.getStartBd()); - rewriter.eraseOp(dmaOp); - } - } - } - return success(); -} - -/// Utility function to determine whether a DMA wait op can be folded into a -/// queue based on its half DMA copy operation. +/// Utility function to determine whether a DMA wait op can be folded based on +/// its half DMA copy operation. FailureOr canFoldByQueue( const AMDAIE::AMDAIEDeviceModel &deviceModel, - const Operation *queueParentOp, - const DenseMap> &dmaBdIdsMap, - DmaBdIdKey &currBdIdKey, uint32_t &currBdIdVal, - AMDAIE::NpuHalfDmaCpyNdOp &currHalfDmaCpyNdOp) { - // Check if the current operation is in the same scope as the rest of the - // queue. - bool isSameScope = currHalfDmaCpyNdOp->getParentOp() == queueParentOp; - + AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, + DenseMap> &tileConnectToBdIdQueue) { // Retrieve the connection op. std::optional maybeConnectionOp = - currHalfDmaCpyNdOp.getConnectionOp(); + npuHalfDmaCpyNdOp.getConnectionOp(); if (!maybeConnectionOp) { - return currHalfDmaCpyNdOp.emitOpError() + return npuHalfDmaCpyNdOp.emitOpError() << "expected to operate on an `amdaie.connection`"; } AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); @@ -82,21 +36,20 @@ FailureOr canFoldByQueue( // Retrieve the flow op. 
std::optional maybeFlowOp = connectionOp.getFlowOp(); if (!maybeFlowOp) { - return connectionOp.emitOpError() + return connectionOp->emitOpError() << "expected to operate on an `amdaie.flow`"; } AMDAIE::FlowOp flowOp = maybeFlowOp.value(); bool isPacketFlow = flowOp.getIsPacketFlow(); // Retrieve the BD ID op. - std::optional maybeBdIdOp = currHalfDmaCpyNdOp.getBdIdOp(); + std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); if (!maybeBdIdOp) { - return currHalfDmaCpyNdOp.emitOpError() + return npuHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op to lower to " "`amdaie.npu.write_bd`"; } AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); - currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue()); // Retrieve the tile op. AMDAIE::TileOp tileOp = @@ -104,39 +57,44 @@ FailureOr canFoldByQueue( if (!tileOp) { return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; } - currBdIdKey = {tileOp, connectionOp}; // Get the maximum queue size. uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); - bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { - return entry.first.first == tileOp && entry.second.contains(currBdIdVal); - }); - const DenseSet &bdIds = dmaBdIdsMap.lookup(currBdIdKey); - - // Can't fold wait op if: - // (1) the current BD ID on the same tile already occurs in the queue, or - // (2) the current operation is a packet flow, or - // (3) reaches the maximum queue size, or - // (4) the queue is empty, or - // (5) the current operation is not in the same scope as the queue. - return !(isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || - bdIds.empty() || !isSameScope); + // Keep wait op if, either reaches the maximum queue size, or a + // duplicate BD ID in the same tile, or packet flow, or the queue is + // empty + uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); + bool isDuplicateBdId = + llvm::any_of(tileConnectToBdIdQueue, [&](const auto &entry) { + return entry.first.first == tileOp && + llvm::is_contained(entry.second, bdId); + }); + SmallVector &bdIdQueue = + tileConnectToBdIdQueue[{tileOp, connectionOp}]; + bool canFold = true; + if (isDuplicateBdId || isPacketFlow || bdIdQueue.size() >= maxQueueSize || + bdIdQueue.empty()) { + bdIdQueue.clear(); + canFold = false; + } + bdIdQueue.push_back(bdId); + return canFold; } /// Traverses the control code in reverse, ensuring that for each connection, /// only one DMA wait op is retained for every maximum queue size. /// /// Example Output: assuming a maximum queue size of 4. -/// dma_cpy_nd(connection=0, bd_id=0) -/// %0 = dma_cpy_nd(connection=0, bd_id=1) +/// dma_cpy_nd +/// %0 = dma_cpy_nd /// dma_wait(%0) -/// dma_cpy_nd(connection=0, bd_id=2) -/// dma_cpy_nd(connection=0, bd_id=3) -/// dma_cpy_nd(connection=0, bd_id=4) -/// %1 = dma_cpy_nd(connection=0, bd_id=5) +/// dma_cpy_nd +/// dma_cpy_nd +/// dma_cpy_nd +/// %1 = dma_cpy_nd /// dma_wait(%1) /// From the bottom up, for every four DMA copy operations, only one DMA wait /// operation is retained. 
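+/// For instance (hypothetical trace, queue size 4): walking up from
+/// dma_wait(%1), the waits of the three copies directly above it are folded
+/// into %1's queue; a fifth copy would overflow the queue, so dma_wait(%0)
+/// is retained and starts a new queue.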
@@ -147,57 +105,49 @@ FailureOr canFoldByQueue( LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); - SmallVector> waitOpQueues; - DenseMap> dmaBdIdsMap; - - auto updateWithCurrBdId = - [&](bool canFold, DenseMap> &dmaBdIdsMap, - DmaBdIdKey &currBdIdKey, uint32_t currBdIdVal) { - assert(currBdIdKey.first && "TileOp must not be null"); - assert(currBdIdKey.second && "ConnectionOp must not be null"); - if (!canFold) dmaBdIdsMap[currBdIdKey].clear(); - dmaBdIdsMap[currBdIdKey].insert(currBdIdVal); - }; - + std::vector waitOpsToErase; + DenseMap> tileConnectToBdIdQueue; // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { - bool toFold = true; - Operation *queueParentOp = - waitOpQueues.empty() ? waitOp->getParentOp() - : waitOpQueues.back().front()->getParentOp(); + bool toErase = true; for (Value token : waitOp.getAsyncTokens()) { if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - DmaBdIdKey currBdIdKey = {nullptr, nullptr}; - uint32_t currBdIdVal = 0; - FailureOr result = - canFoldByQueue(deviceModel, queueParentOp, dmaBdIdsMap, - currBdIdKey, currBdIdVal, npuHalfDmaCpyNdOp); + FailureOr result = canFoldByQueue( + deviceModel, npuHalfDmaCpyNdOp, tileConnectToBdIdQueue); if (failed(result)) return WalkResult::interrupt(); - toFold &= *result; - updateWithCurrBdId(*result, dmaBdIdsMap, currBdIdKey, currBdIdVal); + toErase &= *result; } } - // Store all the queues, and modify later to avoid invalidating the - // iterator. - if (toFold) { - // Append the wait op to the last queue if it can be folded. - waitOpQueues.back().push_back(waitOp); - } else { - // Create a new queue if the wait op cannot be folded. - waitOpQueues.push_back(SmallVector{waitOp}); - } + // Erase later to avoid invalidating the iterator. + if (toErase) waitOpsToErase.push_back(waitOp); return WalkResult::advance(); }); if (res.wasInterrupted()) return failure(); - for (SmallVector &waitOps : waitOpQueues) { - // Since the controlcode is traversed in reverse order, we need to - // restore the original order of the DMA operations. - std::reverse(waitOps.begin(), waitOps.end()); - if (failed(eraseQueueOperations(rewriter, waitOps))) return failure(); + + for (AMDAIE::NpuDmaWaitOp waitOp : waitOpsToErase) { + SmallVector asyncTokens(waitOp.getAsyncTokens()); + // Erase the wait op. + rewriter.eraseOp(waitOp); + for (Value token : asyncTokens) { + if (auto op = dyn_cast_if_present( + token.getDefiningOp())) { + if (op.use_empty()) { + rewriter.setInsertionPoint(op); + TypeRange resultTypeRange = TypeRange{}; + // Nullify the result to avoid issuing a token. 
+ rewriter.create( + op.getLoc(), resultTypeRange, op.getConnection(), op.getInput(), + op.getMixedOffsets(), op.getMixedSizes(), op.getMixedStrides(), + op.getBdId(), op.getChannel(), op.getNextBd(), op.getStartBd()); + rewriter.eraseOp(op); + } + } + } } + return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp index 352c8e500..b21ceb025 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp @@ -17,7 +17,7 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -using DmaChainKey = std::pair; +using DmaChain = std::pair; /// Utility function to update `next_bd` and `start_bd` operands. LogicalResult updateChainOperands( @@ -83,9 +83,9 @@ LogicalResult updateChainOperands( /// - Chain X: [0] (the newly added BD ID). /// - Chain Y: [] (emptied after breaking). void checkForChainsToBeBroken( - uint32_t currBdId, const DmaChainKey &currDmaChain, - const DenseMap> &dmaChainToBdIds, - SmallVector &chainsToBreak) { + uint32_t currBdId, const DmaChain &currDmaChain, + const DenseMap> &dmaChainToBdIds, + SmallVector &chainsToBreak) { for (auto &[entry, bdIds] : dmaChainToBdIds) { if (entry.first == currDmaChain.first && bdIds.contains(currBdId)) { // Break the chain that contains the duplicate BD ID. @@ -120,10 +120,9 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, } // BD IDs that have been assigned in each tile. - DenseMap> dmaChainToBdIds; + DenseMap> dmaChainToBdIds; // Buffers the DMA ops that will be chained. - DenseMap> - dmaChainToDmaOps; + DenseMap> dmaChainToDmaOps; res = controlCodeOp->walk([&](Operation *op) { @@ -186,8 +185,8 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, // Any duplicate BD ID from the same tile indicates that the chain // cannot grow further and requires breaking to release the // conflicting BD ID. - SmallVector chainsToBreak; - DmaChainKey currDmaChain = {tileOp, connectionOp}; + SmallVector chainsToBreak; + DmaChain currDmaChain = {tileOp, connectionOp}; checkForChainsToBeBroken(bdId, currDmaChain, dmaChainToBdIds, chainsToBreak); From e47e0f315870692eb55ac5fb1def653d78977a93 Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Wed, 18 Dec 2024 09:53:13 +0000 Subject: [PATCH 7/8] retrive current BD ID key value in a separate function --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 126 ++++++++++-------- 1 file changed, 71 insertions(+), 55 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 973d30449..1446c55e3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -17,6 +17,39 @@ namespace mlir::iree_compiler::AMDAIE { namespace { using DmaBdIdKey = std::pair; +using DmaBdIdPair = std::pair; + +FailureOr retriveDmaBdIdPair( + AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp) { + // Retrieve the connection op. 
+  std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
+      npuHalfDmaCpyNdOp.getConnectionOp();
+  if (!maybeConnectionOp) {
+    return npuHalfDmaCpyNdOp.emitOpError()
+           << "expected to operate on an `amdaie.connection`";
+  }
+  AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value();
+
+  // Retrieve the BD ID op.
+  std::optional<AMDAIE::BdIdOp> maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp();
+  if (!maybeBdIdOp) {
+    return npuHalfDmaCpyNdOp.emitOpError()
+           << "must have a BD ID op to lower to "
+              "`amdaie.npu.write_bd`";
+  }
+  AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value();
+  uint32_t currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue());
+
+  // Retrieve the tile op.
+  AMDAIE::TileOp tileOp =
+      dyn_cast_if_present<AMDAIE::TileOp>(bdIdOp.getTile().getDefiningOp());
+  if (!tileOp) {
+    return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`";
+  }
+
+  DmaBdIdKey currBdIdKey = {tileOp, connectionOp};
+  return DmaBdIdPair{currBdIdKey, currBdIdVal};
+}
 
 /// Utility function to determine whether a DMA wait op can be folded based on
 /// its half DMA copy operation.
@@ -176,65 +209,44 @@ LogicalResult eraseBatchOperations(IRRewriter &rewriter,
 
 /// Utility function to determine if a DMA wait operation can be folded into
 /// a batch based on its half DMA copy operation.
+/// Can't fold wait op if:
+/// (1) the current operation is not in the same scope as the batch, or
+/// (2) the current connection op already occurs in the batch, or
+/// (3) the batch is empty, or
+/// (4) the current operation is a packet flow, or
+/// (5) the current BD ID on the same tile already occurs in the batch.
 FailureOr<bool> canFoldByBatch(
     const Operation *batchParentOp,
     const DenseSet<AMDAIE::ConnectionOp> &connectionOps,
     const DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap,
-    DmaBdIdKey &currBdIdKey, uint32_t &currBdIdVal,
-    AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp) {
-  // Check if the current operation is in the same scope as the rest of the
-  // batch.
-  bool isSameScope = currHalfDmaCpyNdOp->getParentOp() == batchParentOp;
+    AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp, DmaBdIdPair &currBdIdPair) {
+  // Not in the same scope? Can't fold.
+  if (currHalfDmaCpyNdOp->getParentOp() != batchParentOp) return false;
 
-  // Retrieve the connection op.
-  std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
-      currHalfDmaCpyNdOp.getConnectionOp();
-  if (!maybeConnectionOp) {
-    return currHalfDmaCpyNdOp.emitOpError()
-           << "expected to operate on an `amdaie.connection`";
-  }
-  AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value();
-  bool isDuplicateConnection = connectionOps.contains(connectionOp);
+  // Connection op already in the batch, or an empty batch? Can't fold.
+  AMDAIE::ConnectionOp connectionOp = currBdIdPair.first.second;
+  if (connectionOps.contains(connectionOp) || connectionOps.empty())
+    return false;
 
-  // Retrieve the flow op.
+  // Packet flow? Can't fold.
   std::optional<AMDAIE::FlowOp> maybeFlowOp = connectionOp.getFlowOp();
   if (!maybeFlowOp) {
     return connectionOp.emitOpError()
            << "expected to operate on an `amdaie.flow`";
   }
   AMDAIE::FlowOp flowOp = maybeFlowOp.value();
-  bool isPacketFlow = flowOp.getIsPacketFlow();
-
-  // Retrieve the BD ID op.
-  std::optional<AMDAIE::BdIdOp> maybeBdIdOp = currHalfDmaCpyNdOp.getBdIdOp();
-  if (!maybeBdIdOp) {
-    return currHalfDmaCpyNdOp.emitOpError()
-           << "must have a BD ID op to lower to "
-              "`amdaie.npu.write_bd`";
-  }
-  AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value();
-  currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue());
-
-  // Retrieve the tile op.
-  AMDAIE::TileOp tileOp =
-      dyn_cast_if_present<AMDAIE::TileOp>(bdIdOp.getTile().getDefiningOp());
-  if (!tileOp) {
-    return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`";
-  }
-  currBdIdKey = {tileOp, connectionOp};
+  if (flowOp.getIsPacketFlow()) return false;
 
+  // Duplicate BD ID on the same tile? Can't fold.
+  AMDAIE::TileOp tileOp = currBdIdPair.first.first;
+  uint32_t currBdIdVal = currBdIdPair.second;
   bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) {
     return entry.first.first == tileOp && entry.second.contains(currBdIdVal);
   });
+  if (isDuplicateBdId) return false;
 
-  // Can't fold wait op if:
-  // (1) the current connection op already occurs in the batch, or
-  // (2) the current BD ID on the same tile already occurs in the batch, or
-  // (3) the current operation is a packet flow, or
-  // (4) the batch is empty, or
-  // (5) the current operation is not in the same scope as the batch.
-  return !(isDuplicateConnection || isDuplicateBdId || isPacketFlow ||
-           connectionOps.empty() || !isSameScope);
+  // Can fold.
+  return true;
 }
 
 /// Traverses the control code in reverse, ensuring that only one DMA wait op is
@@ -265,11 +277,11 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
   DenseMap<DmaBdIdKey, DenseSet<uint32_t>> dmaBdIdsMap;
 
   auto updateWithCurrBdId =
-      [&](bool canFold, DenseSet<AMDAIE::ConnectionOp> &connectionOps,
-          DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap,
-          DmaBdIdKey &currBdIdKey, uint32_t currBdIdVal) {
-        assert(currBdIdKey.first && "TileOp must not be null");
-        assert(currBdIdKey.second && "ConnectionOp must not be null");
+      [&](bool canFold, DmaBdIdPair &currBdIdPair,
+          DenseSet<AMDAIE::ConnectionOp> &connectionOps,
+          DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap) {
+        DmaBdIdKey currBdIdKey = currBdIdPair.first;
+        uint32_t currBdIdVal = currBdIdPair.second;
         if (!canFold) {
           // Clear the BD IDs for all the connections in the batch.
           for (auto &entry : dmaBdIdsMap) {
@@ -293,15 +305,19 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
           if (auto npuHalfDmaCpyNdOp =
                   dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
                       token.getDefiningOp())) {
-            DmaBdIdKey currBdIdKey = {nullptr, nullptr};
-            uint32_t currBdIdVal = 0;
-            FailureOr<bool> result =
+            // Retrieve the TileOp, ConnectionOp, and BD ID.
+            FailureOr<DmaBdIdPair> currBdIdPair =
+                retriveDmaBdIdPair(npuHalfDmaCpyNdOp);
+            if (failed(currBdIdPair)) return WalkResult::interrupt();
+            // Check if the current DMA wait op can be folded into the batch.
+            FailureOr<bool> canFold =
                 canFoldByBatch(batchParentOp, connectionOps, dmaBdIdsMap,
-                               currBdIdKey, currBdIdVal, npuHalfDmaCpyNdOp);
-            if (failed(result)) return WalkResult::interrupt();
-            toBatch &= *result;
-            updateWithCurrBdId(*result, connectionOps, dmaBdIdsMap, currBdIdKey,
-                               currBdIdVal);
+                               npuHalfDmaCpyNdOp, *currBdIdPair);
+            if (failed(canFold)) return WalkResult::interrupt();
+            // Update the `connectionOps` and `dmaBdIdsMap`.
+            updateWithCurrBdId(*canFold, *currBdIdPair, connectionOps,
+                               dmaBdIdsMap);
+            toBatch &= *canFold;
           }
         }
        // Process the previous batch of wait ops, and start a new batch.
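
To make the five folding rules above concrete, the following standalone sketch reimplements just that predicate outside the pass. It is illustrative only: DmaRecord and the integer scope/connection/tile IDs are hypothetical stand-ins for the Operation *, AMDAIE::ConnectionOp, and AMDAIE::TileOp handles used by the real canFoldByBatch, and none of these names exist in the pass.

// sketch_can_fold_by_batch.cpp -- illustrative only; not the pass itself.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>

// Hypothetical stand-in for one half DMA copy op: integer IDs replace the
// MLIR op handles used by the real pass.
struct DmaRecord {
  int scope;          // parent op of the half DMA copy
  int connection;     // connection the copy runs over
  int tile;           // tile owning the BD ID
  uint32_t bdId;      // BD ID value
  bool isPacketFlow;  // packet-switched flow?
};

// Mirrors the five "can't fold" rules from the PATCH 7 doc comment.
bool canFoldByBatch(int batchScope, const std::set<int> &connectionsInBatch,
                    const std::map<int, std::set<uint32_t>> &bdIdsPerTile,
                    const DmaRecord &curr) {
  if (curr.scope != batchScope) return false;                      // rule (1)
  if (connectionsInBatch.count(curr.connection) > 0) return false; // rule (2)
  if (connectionsInBatch.empty()) return false;                    // rule (3)
  if (curr.isPacketFlow) return false;                             // rule (4)
  auto it = bdIdsPerTile.find(curr.tile);                          // rule (5)
  if (it != bdIdsPerTile.end() && it->second.count(curr.bdId) > 0)
    return false;
  return true;
}

int main() {
  std::set<int> connections = {7};                       // batch holds connection 7
  std::map<int, std::set<uint32_t>> bdIds = {{0, {2}}};  // tile 0 already uses BD 2
  DmaRecord next{/*scope=*/0, /*connection=*/8, /*tile=*/0, /*bdId=*/3, false};
  std::cout << std::boolalpha << canFoldByBatch(0, connections, bdIds, next)
            << "\n";  // true: new connection, fresh BD ID
  next.bdId = 2;      // duplicate BD ID on tile 0
  std::cout << canFoldByBatch(0, connections, bdIds, next) << "\n";  // false
  return 0;
}

In the pass itself, a wait op whose tokens all satisfy these rules is folded into the batch, so each batch is eventually covered by a single remaining DMA wait op.
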
From f4e5f07def39fa43839701d6159c77e839d42877 Mon Sep 17 00:00:00 2001
From: Yu-Zhewen
Date: Wed, 18 Dec 2024 11:08:36 +0000
Subject: [PATCH 8/8] resolve comments

---
 .../iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
index 1446c55e3..8699ecf25 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
@@ -19,7 +19,9 @@ namespace {
 using DmaBdIdKey = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
 using DmaBdIdPair = std::pair<DmaBdIdKey, uint32_t>;
 
-FailureOr<DmaBdIdPair> retriveDmaBdIdPair(
+/// Utility function to retrieve TileOp, ConnectionOp, and BD ID from a given
+/// half DMA copy operation.
+FailureOr<DmaBdIdPair> retrieveDmaBdIdPair(
     AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp) {
   // Retrieve the connection op.
   std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
@@ -219,7 +221,7 @@ FailureOr<bool> canFoldByBatch(
     const Operation *batchParentOp,
     const DenseSet<AMDAIE::ConnectionOp> &connectionOps,
     const DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap,
-    AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp, DmaBdIdPair &currBdIdPair) {
+    AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp, DmaBdIdPair currBdIdPair) {
   // Not in the same scope? Can't fold.
   if (currHalfDmaCpyNdOp->getParentOp() != batchParentOp) return false;
 
@@ -277,7 +279,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
   DenseMap<DmaBdIdKey, DenseSet<uint32_t>> dmaBdIdsMap;
 
   auto updateWithCurrBdId =
-      [&](bool canFold, DmaBdIdPair &currBdIdPair,
+      [&](bool canFold, DmaBdIdPair currBdIdPair,
          DenseSet<AMDAIE::ConnectionOp> &connectionOps,
          DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap) {
        DmaBdIdKey currBdIdKey = currBdIdPair.first;
@@ -307,7 +309,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
                      token.getDefiningOp())) {
            // Retrieve the TileOp, ConnectionOp, and BD ID.
            FailureOr<DmaBdIdPair> currBdIdPair =
-                retriveDmaBdIdPair(npuHalfDmaCpyNdOp);
+                retrieveDmaBdIdPair(npuHalfDmaCpyNdOp);
            if (failed(currBdIdPair)) return WalkResult::interrupt();
            // Check if the current DMA wait op can be folded into the batch.
            FailureOr<bool> canFold =
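
Besides the rename, PATCH 8 switches canFoldByBatch and updateWithCurrBdId from taking DmaBdIdPair by non-const reference to taking it by value. The sketch below illustrates a plausible rationale under stated assumptions: the placeholder pointer pair stands in for the real TileOp/ConnectionOp key, and bdIdValue is a hypothetical consumer, not a function from the pass.

// sketch_pair_by_value.cpp -- illustrative only.
#include <cstdint>
#include <utility>

// Placeholder key: the real DmaBdIdKey pairs TileOp with ConnectionOp.
using DmaBdIdKey = std::pair<const void *, const void *>;
using DmaBdIdPair = std::pair<DmaBdIdKey, uint32_t>;

// A read-only consumer can take the pair by value: copying two pointers and
// a uint32_t costs about the same as passing a reference, and the signature
// now documents that the callee never mutates the caller's pair (the
// by-reference version bound a mutable reference it never wrote through).
uint32_t bdIdValue(DmaBdIdPair currBdIdPair) { return currBdIdPair.second; }

int main() {
  DmaBdIdPair p{{nullptr, nullptr}, 4};
  return bdIdValue(p) == 4 ? 0 : 1;  // exits 0
}
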