From 022bdf813d38d9888893519af1c991ce9c2d3dd3 Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Thu, 12 Dec 2024 21:50:46 +0000 Subject: [PATCH 1/8] first commit --- .../AMDAIEControlCodeToTransaction.cpp | 28 +++- .../Transforms/AMDAIEFoldDmaWaits.cpp | 155 +++++++++++++++++- .../test/controlcode_to_transaction.mlir | 53 ++++++ .../Transforms/test/fold_dma_waits.mlir | 90 ++++++++++ 4 files changed, 316 insertions(+), 10 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp index 665ea08a8..b427036b3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -200,16 +200,34 @@ LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op, } LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { - for (Value token : op.getAsyncTokens()) { - auto pushToQueueOp = - dyn_cast_if_present(token.getDefiningOp()); + // Batch DMA operations with the same row, channel, and direction into a + // single TCT sync operation, as long as they have consecutive columns. + SmallVector> columnBatches; + for (Value asyncToken : op.getAsyncTokens()) { + auto pushToQueueOp = dyn_cast_if_present( + asyncToken.getDefiningOp()); if (!pushToQueueOp) { return op.emitOpError() - << "should operate on an `amdaie.push_to_queue` op"; + << "should operate on an `amdaie.push_to_queue` op async token"; } + if (!columnBatches.empty()) { + auto &[lastPushOp, lastColNum] = columnBatches.back(); + if (lastPushOp.getRow() == pushToQueueOp.getRow() && + lastPushOp.getCol() + lastColNum == pushToQueueOp.getCol() && + lastPushOp.getDirection() == pushToQueueOp.getDirection() && + lastPushOp.getChannel() == pushToQueueOp.getChannel()) { + ++lastColNum; + continue; + } + } + columnBatches.push_back({pushToQueueOp, 1}); + } + + // Convert to TCT sync ops. + for (auto &[pushToQueueOp, colNum] : columnBatches) { if (failed(builder.appendTCTSync( pushToQueueOp.getCol(), pushToQueueOp.getRow(), - static_cast(pushToQueueOp.getDirection()), 1, 1, + static_cast(pushToQueueOp.getDirection()), 1, colNum, pushToQueueOp.getChannel()))) { return failure(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 2f0c6030d..e578203a4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -18,7 +18,7 @@ namespace { /// Utility function to determine whether a DMA wait op can be folded based on /// its half DMA copy operation. -FailureOr canFoldBasedOnHalfDmaCpy( +FailureOr canFoldByConnection( const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, DenseMap, @@ -101,8 +101,9 @@ FailureOr canFoldBasedOnHalfDmaCpy( /// Reverse traversal simplifies handling duplicate BD IDs, preventing /// the need to revisit and modify earlier operations after processing later /// ones. 
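// --- Editorial sketch, not part of the patch ---
// The keep/fold decision described above can be pictured independently of
// MLIR. Walking the waits in reverse, a wait is kept whenever the per-queue
// BD-ID list must restart. `keepWait` and its parameters are hypothetical
// stand-ins for `canFoldByConnection`; it is simplified in that the real pass
// checks BD-ID duplicates across all queues on the same tile, not just one:
//
//   #include <algorithm>
//   #include <cstdint>
//   #include <vector>
//
//   bool keepWait(std::vector<uint32_t> &bdIds, uint32_t bdId,
//                 size_t maxQueueSize, bool isPacketFlow) {
//     bool isDuplicate =
//         std::find(bdIds.begin(), bdIds.end(), bdId) != bdIds.end();
//     bool keep = isDuplicate || isPacketFlow ||
//                 bdIds.size() >= maxQueueSize || bdIds.empty();
//     if (keep) bdIds.clear();  // This wait stays; start a fresh queue.
//     bdIds.push_back(bdId);
//     return keep;  // false: the wait can be folded away.
//   }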
-LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, - AMDAIE::ControlCodeOp controlCodeOp) { +LogicalResult foldDmaWaitsByConnection( + const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); std::vector waitOpsToErase; DenseMap, @@ -116,7 +117,7 @@ LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = canFoldBasedOnHalfDmaCpy( + FailureOr result = canFoldByConnection( deviceModel, npuHalfDmaCpyNdOp, tileConnectToBdIdQueue); if (failed(result)) return WalkResult::interrupt(); toErase &= *result; @@ -152,6 +153,147 @@ LogicalResult foldDmaWaits(const AMDAIE::AMDAIEDeviceModel &deviceModel, return success(); } +struct DmaColumnBatch { + uint32_t row; + uint32_t channel; + AMDAIE::DMAChannelDir direction; + + // Sorted by column. + std::map colWaitOpMap; +}; + +/// Updates a batch of asynchronous DMA wait operations by combining their +/// async tokens into a single NpuDmaWaitOp. +void updateColumnBatchTokens( + IRRewriter &rewriter, + std::map &colWaitOpMap) { + if (colWaitOpMap.size() < 2) return; + + // Check if there is any discontinuity in the columns, and if so, split into + // separate batches. + SmallVector> waitOpsList; + uint32_t prevCol = 0; + for (auto &entry : colWaitOpMap) { + uint32_t col = entry.first; + AMDAIE::NpuDmaWaitOp waitOp = entry.second; + if (waitOpsList.empty() || col != prevCol + 1) { + waitOpsList.push_back({}); + } + waitOpsList.back().push_back(waitOp); + prevCol = col; + } + + for (SmallVector &waitOps : waitOpsList) { + // For each batch, combine the async tokens into a single NpuDmaWaitOp. + SmallVector asyncTokens; + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + asyncTokens.append(waitOp.getAsyncTokens().begin(), + waitOp.getAsyncTokens().end()); + } + rewriter.setInsertionPointAfter(waitOps.back()); + rewriter.create(waitOps.back().getLoc(), asyncTokens); + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + rewriter.eraseOp(waitOp); + } + } +} + +/// Utility function to determine if a DMA wait operation can be folded. +/// This is achieved by verifying whether it shares the same row, channel, +/// and direction with preceding wait operations. +LogicalResult foldByColumn(IRRewriter &rewriter, DmaColumnBatch &dmaBatch, + AMDAIE::NpuHalfDmaCpyNdOp dmaOp, + AMDAIE::NpuDmaWaitOp waitOp) { + // Get the row and column. + std::optional maybeBdIdOp = dmaOp.getBdIdOp(); + if (!maybeBdIdOp) return dmaOp.emitOpError() << "must have a BD ID op"; + AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + AMDAIE::TileOp tileOp = + dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); + if (!tileOp) + return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); + uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); + + // Get the channel. 
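  // (Editorial note: the channel op below supplies both the DMA direction
  // (MM2S vs. S2MM) and the channel index; a wait joins the current column
  // batch only if row, channel, and direction all match.)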
+ std::optional maybeChannelOp = dmaOp.getChannelOp(); + if (!maybeChannelOp) + return dmaOp.emitOpError() << "found non-`amdaie.channel` channel"; + AMDAIE::ChannelOp channelOp = maybeChannelOp.value(); + std::optional maybeDirection = + channelOp.getDirection(); + std::optional maybeChannel = channelOp.getValue(); + if (!maybeDirection || !maybeChannel) + return channelOp.emitOpError() << "direction and channel needed"; + AMDAIE::DMAChannelDir direction = maybeDirection.value(); + uint32_t channel = maybeChannel.value(); + + if (dmaBatch.colWaitOpMap.empty() || row != dmaBatch.row || + channel != dmaBatch.channel || direction != dmaBatch.direction) { + updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); + dmaBatch = {row, channel, direction, {}}; + } + dmaBatch.colWaitOpMap[col] = waitOp; + return success(); +} + +/// Traverses the control code forward, ensuring that only one DMA wait op is +/// retained for all the columns. +/// +/// Example Input: +/// %0 = dma_cpy_nd(col=0) +/// %1 = dma_cpy_nd(col=1) +/// %2 = dma_cpy_nd(col=2) +/// %3 = dma_cpy_nd(col=3) +/// dma_wait(%0) +/// dma_wait(%1) +/// dma_wait(%2) +/// dma_wait(%3) +/// Example Output: +/// %0 = dma_cpy_nd(col=0) +/// %1 = dma_cpy_nd(col=1) +/// %2 = dma_cpy_nd(col=2) +/// %3 = dma_cpy_nd(col=3) +/// dma_wait(%0, %1, %2, %3) +LogicalResult foldDmaWaitsByColumn(const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { + IRRewriter rewriter(controlCodeOp->getContext()); + DmaColumnBatch dmaBatch = {}; + + WalkResult res = controlCodeOp->walk([&](Operation *op) { + auto waitOp = dyn_cast(op); + // Skip if not a DMA wait op or if it already has multiple async tokens. + if (!waitOp || waitOp.getAsyncTokens().size() != 1) { + updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); + dmaBatch.colWaitOpMap.clear(); + return WalkResult::advance(); + } + + // Get the half DMA copy operation. + Value token = waitOp.getAsyncTokens().front(); + auto npuHalfDmaCpyNdOp = + dyn_cast_if_present(token.getDefiningOp()); + if (!npuHalfDmaCpyNdOp) { + waitOp.emitOpError() << "expected to operate on an " + "`amdaie.npu.half_dma_cpy_nd`"; + return WalkResult::interrupt(); + } + + // Check if the DMA wait op can be folded into the column batch. + if (succeeded( + foldByColumn(rewriter, dmaBatch, npuHalfDmaCpyNdOp, waitOp))) { + return WalkResult::advance(); + } else { + return WalkResult::interrupt(); + } + }); + + // Process the remaining wait ops. 
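  // (Editorial note: the walk only flushes a batch when it encounters an op
  // that cannot be folded, so whatever is still collected in `dmaBatch` must
  // be combined here, after the walk completes.)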
+ updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); + if (res.wasInterrupted()) return failure(); + return success(); +} + class AMDAIEFoldDmaWaitsPass : public impl::AMDAIEFoldDmaWaitsBase { public: @@ -181,7 +323,10 @@ void AMDAIEFoldDmaWaitsPass::runOnOperation() { WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); - if (failed(foldDmaWaits(deviceModel, controlCodeOp))) { + if (failed(foldDmaWaitsByConnection(deviceModel, controlCodeOp))) { + return WalkResult::interrupt(); + } + if (failed(foldDmaWaitsByColumn(deviceModel, controlCodeOp))) { return WalkResult::interrupt(); } return WalkResult::advance(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index fa83b2028..a75546cff 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -153,6 +153,59 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000005 +// CHECK: 0x00000080 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0201D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0401D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0601D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00000001 +// CHECK: 0x00040100 +// CHECK-LABEL: @async_push_to_queue_and_wait_col_num +// CHECK: npu_instructions = dense_resource : tensor<32xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @async_push_to_queue_and_wait_col_num() { + amdaie.workgroup { + amdaie.controlcode { + %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + // CHECK: 0x06030100 // CHECK: 0x00000105 // CHECK: 0x00000001 diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir index 4032221cc..a0034a971 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -220,3 +220,93 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// The first two DMA operations are expected to be batched into a single DMA wait, as they share the same row, +// channel, and direction, with consecutive columns (0 and 1). The third DMA operation is not batched because +// its column (3) is not consecutive with the previous operations. +// CHECK-LABEL: @fold_dma_waits_column_batch +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK: %[[TILE_3_0:.+]] = amdaie.tile(%[[C3]], %[[C0]]) +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_0]] +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]]) +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_1]] +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_3_0]], %[[C0]]) +// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]] +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]], %[[TOKEN_1]] : !amdaie.async_token, !amdaie.async_token) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_2]] : !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_column_batch() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_1 = amdaie.tile(%c1, %c1) + %tile_1_0 = amdaie.tile(%c1, %c0) + %tile_3_1 = amdaie.tile(%c3, %c1) + %tile_3_0 = amdaie.tile(%c3, %c0) + %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> + %buffer_4 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(4), 4) + %lock_5 = amdaie.lock(%tile_0_1(5), 0) + %lock_6 = amdaie.lock(%tile_1_1(4), 4) + %lock_7 = amdaie.lock(%tile_1_1(5), 0) + %lock_8 = amdaie.lock(%tile_3_1(4), 4) + %lock_9 = amdaie.lock(%tile_3_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_6}, {%lock_7}) : memref<2048xi32, 1 : i32>, 
memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %8 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_10 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_11 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_12 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) + %channel_13 = amdaie.channel(%tile_3_0, 0, port_type = DMA, direction = MM2S) + %channel_14 = amdaie.channel(%tile_3_1, 0, port_type = DMA, direction = S2MM) + %9 = amdaie.flow({%channel} -> {%channel_10}) {is_packet_flow = false} + %10 = amdaie.connection(%0 {%channel_10}, %2 {%channel}, flow = %9) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %11 = amdaie.flow({%channel_11} -> {%channel_12}) {is_packet_flow = false} + %12 = amdaie.connection(%3 {%channel_12}, %5 {%channel_11}, flow = %11) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %13 = amdaie.flow({%channel_13} -> {%channel_14}) {is_packet_flow = false} + %14 = amdaie.connection(%6 {%channel_14}, %8 {%channel_13}, flow = %13) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %15 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %16 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %17 = amdaie.logicalobjectfifo.from_memref %7, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %7, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0_0, %c0) + %18 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + %bd_id_15 = amdaie.bd_id(%tile_1_0, %c0) + %19 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_15 channel = %channel_11) : !amdaie.logicalobjectfifo> + %bd_id_16 = amdaie.bd_id(%tile_3_0, %c0) + %20 = amdaie.npu.half_dma_cpy_nd async %14(%17 [] [] [] bd_id = %bd_id_16 channel = %channel_13) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%18 : !amdaie.async_token) + amdaie.npu.dma_wait(%19 : !amdaie.async_token) + amdaie.npu.dma_wait(%20 : !amdaie.async_token) + amdaie.end + } + } + return + } +} From 344b7963afa0da8708c562e0c80fa4dc883c02be Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Mon, 16 Dec 2024 16:50:11 +0000 Subject: [PATCH 2/8] resolve comments --- .../AMDAIEControlCodeToTransaction.cpp | 22 +- .../Transforms/AMDAIEFoldDmaWaits.cpp | 255 ++++++++--------- .../test/controlcode_to_transaction.mlir | 77 +++++- .../Transforms/test/fold_dma_waits.mlir | 258 +++++++++++------- 4 files changed, 365 insertions(+), 247 deletions(-) diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp index b427036b3..0c1cf7ef9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -200,9 +200,8 @@ LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op, } LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { - // Batch DMA operations with the same row, channel, and direction into a - // single TCT sync operation, as long as they have consecutive columns. - SmallVector> columnBatches; + // Collect all half DMA ops from the async tokens. + SmallVector pushToQueueOps; for (Value asyncToken : op.getAsyncTokens()) { auto pushToQueueOp = dyn_cast_if_present( asyncToken.getDefiningOp()); @@ -210,6 +209,20 @@ LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { return op.emitOpError() << "should operate on an `amdaie.push_to_queue` op async token"; } + pushToQueueOps.push_back(pushToQueueOp); + } + // Sort the half DMA ops by channel, direction, row, and column. + std::sort(pushToQueueOps.begin(), pushToQueueOps.end(), + [](AMDAIE::NpuPushToQueueOp a, AMDAIE::NpuPushToQueueOp b) { + return std::make_tuple(a.getChannel(), a.getDirection(), + a.getRow(), a.getCol()) < + std::make_tuple(b.getChannel(), b.getDirection(), + b.getRow(), b.getCol()); + }); + // Batch DMA operations with the same row, channel, and direction into a + // single TCT sync operation, as long as they have consecutive columns. + llvm::MapVector columnBatches; + for (auto pushToQueueOp : pushToQueueOps) { if (!columnBatches.empty()) { auto &[lastPushOp, lastColNum] = columnBatches.back(); if (lastPushOp.getRow() == pushToQueueOp.getRow() && @@ -220,9 +233,8 @@ LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { continue; } } - columnBatches.push_back({pushToQueueOp, 1}); + columnBatches.insert({pushToQueueOp, 1}); } - // Convert to TCT sync ops. for (auto &[pushToQueueOp, colNum] : columnBatches) { if (failed(builder.appendTCTSync( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index e578203a4..b6bf6c877 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -16,13 +16,14 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -/// Utility function to determine whether a DMA wait op can be folded based on -/// its half DMA copy operation. -FailureOr canFoldByConnection( +using DmaQueue = std::pair; + +/// Utility function to determine whether a DMA wait op can be folded into a +/// queue based on its half DMA copy operation. +FailureOr canFoldByQueue( const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, - DenseMap, - SmallVector> &tileConnectToBdIdQueue) { + DenseMap> &dmaQueueToBdIds) { // Retrieve the connection op. std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -35,7 +36,7 @@ FailureOr canFoldByConnection( // Retrieve the flow op. 
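  // (Editorial note: the flow determines `isPacketFlow` further down; waits
  // on packet flows are never folded into a queue, only circuit flows are.)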
std::optional maybeFlowOp = connectionOp.getFlowOp(); if (!maybeFlowOp) { - return connectionOp->emitOpError() + return connectionOp.emitOpError() << "expected to operate on an `amdaie.flow`"; } AMDAIE::FlowOp flowOp = maybeFlowOp.value(); @@ -66,20 +67,18 @@ FailureOr canFoldByConnection( // duplicate BD ID in the same tile, or packet flow, or the queue is // empty uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); - bool isDuplicateBdId = - llvm::any_of(tileConnectToBdIdQueue, [&](const auto &entry) { - return entry.first.first == tileOp && - llvm::is_contained(entry.second, bdId); - }); - SmallVector &bdIdQueue = - tileConnectToBdIdQueue[{tileOp, connectionOp}]; + bool isDuplicateBdId = llvm::any_of(dmaQueueToBdIds, [&](const auto &entry) { + return entry.first.first == tileOp && + llvm::is_contained(entry.second, bdId); + }); + SmallVector &bdIds = dmaQueueToBdIds[{tileOp, connectionOp}]; bool canFold = true; - if (isDuplicateBdId || isPacketFlow || bdIdQueue.size() >= maxQueueSize || - bdIdQueue.empty()) { - bdIdQueue.clear(); + if (isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || + bdIds.empty()) { + bdIds.clear(); canFold = false; } - bdIdQueue.push_back(bdId); + bdIds.push_back(bdId); return canFold; } @@ -87,13 +86,13 @@ FailureOr canFoldByConnection( /// only one DMA wait op is retained for every maximum queue size. /// /// Example Output: assuming a maximum queue size of 4. -/// dma_cpy_nd -/// %0 = dma_cpy_nd +/// dma_cpy_nd(connection=0, bd_id=0) +/// %0 = dma_cpy_nd(connection=0, bd_id=1) /// dma_wait(%0) -/// dma_cpy_nd -/// dma_cpy_nd -/// dma_cpy_nd -/// %1 = dma_cpy_nd +/// dma_cpy_nd(connection=0, bd_id=2) +/// dma_cpy_nd(connection=0, bd_id=3) +/// dma_cpy_nd(connection=0, bd_id=4) +/// %1 = dma_cpy_nd(connection=0, bd_id=5) /// dma_wait(%1) /// From the bottom up, for every four DMA copy operations, only one DMA wait /// operation is retained. @@ -101,14 +100,11 @@ FailureOr canFoldByConnection( /// Reverse traversal simplifies handling duplicate BD IDs, preventing /// the need to revisit and modify earlier operations after processing later /// ones. -LogicalResult foldDmaWaitsByConnection( - const AMDAIE::AMDAIEDeviceModel &deviceModel, - AMDAIE::ControlCodeOp controlCodeOp) { +LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, + AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); std::vector waitOpsToErase; - DenseMap, - SmallVector> - tileConnectToBdIdQueue; + DenseMap> dmaQueueToBdIds; // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { @@ -117,8 +113,8 @@ LogicalResult foldDmaWaitsByConnection( if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = canFoldByConnection( - deviceModel, npuHalfDmaCpyNdOp, tileConnectToBdIdQueue); + FailureOr result = + canFoldByQueue(deviceModel, npuHalfDmaCpyNdOp, dmaQueueToBdIds); if (failed(result)) return WalkResult::interrupt(); toErase &= *result; } @@ -153,144 +149,113 @@ LogicalResult foldDmaWaitsByConnection( return success(); } -struct DmaColumnBatch { - uint32_t row; - uint32_t channel; - AMDAIE::DMAChannelDir direction; - - // Sorted by column. - std::map colWaitOpMap; -}; - -/// Updates a batch of asynchronous DMA wait operations by combining their -/// async tokens into a single NpuDmaWaitOp. 
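// (Editorial note: patch 2 replaces the column-based folding below with
// connection-based batching (`updateBatchTokens` / `canFoldByBatch`);
// merging of consecutive columns moves into the TCT sync lowering in
// AMDAIEControlCodeToTransaction.cpp.)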
-void updateColumnBatchTokens( - IRRewriter &rewriter, - std::map &colWaitOpMap) { - if (colWaitOpMap.size() < 2) return; - - // Check if there is any discontinuity in the columns, and if so, split into - // separate batches. - SmallVector> waitOpsList; - uint32_t prevCol = 0; - for (auto &entry : colWaitOpMap) { - uint32_t col = entry.first; - AMDAIE::NpuDmaWaitOp waitOp = entry.second; - if (waitOpsList.empty() || col != prevCol + 1) { - waitOpsList.push_back({}); +/// For each batch, combine the async tokens into a single NpuDmaWaitOp. +LogicalResult updateBatchTokens(IRRewriter &rewriter, + SmallVector &waitOps) { + // Skip if there are less than two DMA wait operations. + if (waitOps.size() < 2) return success(); + + SmallVector asyncTokens; + Operation *parentOp = waitOps[0]->getParentOp(); + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + if (waitOp->getParentOp() != parentOp) { + return waitOp.emitError( + "DMA operations to be batched must belong to the same scope"); } - waitOpsList.back().push_back(waitOp); - prevCol = col; + asyncTokens.append(waitOp.getAsyncTokens().begin(), + waitOp.getAsyncTokens().end()); } - for (SmallVector &waitOps : waitOpsList) { - // For each batch, combine the async tokens into a single NpuDmaWaitOp. - SmallVector asyncTokens; - for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { - asyncTokens.append(waitOp.getAsyncTokens().begin(), - waitOp.getAsyncTokens().end()); - } - rewriter.setInsertionPointAfter(waitOps.back()); - rewriter.create(waitOps.back().getLoc(), asyncTokens); - for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { - rewriter.eraseOp(waitOp); - } + rewriter.setInsertionPointAfter(waitOps.back()); + rewriter.create(waitOps.back().getLoc(), asyncTokens); + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + rewriter.eraseOp(waitOp); } + return success(); } -/// Utility function to determine if a DMA wait operation can be folded. -/// This is achieved by verifying whether it shares the same row, channel, -/// and direction with preceding wait operations. -LogicalResult foldByColumn(IRRewriter &rewriter, DmaColumnBatch &dmaBatch, - AMDAIE::NpuHalfDmaCpyNdOp dmaOp, - AMDAIE::NpuDmaWaitOp waitOp) { - // Get the row and column. - std::optional maybeBdIdOp = dmaOp.getBdIdOp(); - if (!maybeBdIdOp) return dmaOp.emitOpError() << "must have a BD ID op"; - AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); - AMDAIE::TileOp tileOp = - dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); - if (!tileOp) - return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; - uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); - uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); +/// Utility function to determine if a DMA wait operation can be folded into a +/// a batch based on its half DMA copy operation. +FailureOr canFoldByBatch( + AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, + SmallVector &connectionOps) { + // Retrieve the connection op. + std::optional maybeConnectionOp = + npuHalfDmaCpyNdOp.getConnectionOp(); + if (!maybeConnectionOp) { + return npuHalfDmaCpyNdOp.emitOpError() + << "expected to operate on an `amdaie.connection`"; + } + AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); - // Get the channel. 
- std::optional maybeChannelOp = dmaOp.getChannelOp(); - if (!maybeChannelOp) - return dmaOp.emitOpError() << "found non-`amdaie.channel` channel"; - AMDAIE::ChannelOp channelOp = maybeChannelOp.value(); - std::optional maybeDirection = - channelOp.getDirection(); - std::optional maybeChannel = channelOp.getValue(); - if (!maybeDirection || !maybeChannel) - return channelOp.emitOpError() << "direction and channel needed"; - AMDAIE::DMAChannelDir direction = maybeDirection.value(); - uint32_t channel = maybeChannel.value(); + // Retrieve the flow op. + std::optional maybeFlowOp = connectionOp.getFlowOp(); + if (!maybeFlowOp) { + return connectionOp.emitOpError() + << "expected to operate on an `amdaie.flow`"; + } + AMDAIE::FlowOp flowOp = maybeFlowOp.value(); + bool isPacketFlow = flowOp.getIsPacketFlow(); - if (dmaBatch.colWaitOpMap.empty() || row != dmaBatch.row || - channel != dmaBatch.channel || direction != dmaBatch.direction) { - updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); - dmaBatch = {row, channel, direction, {}}; + bool canFold = true; + // Can't fold if the current connection op already occurs in the batch, or + // if the current operation is a packet flow, or if the batch is empty. + if (llvm::is_contained(connectionOps, connectionOp) || isPacketFlow || + connectionOps.empty()) { + connectionOps.clear(); + canFold = false; } - dmaBatch.colWaitOpMap[col] = waitOp; - return success(); + connectionOps.push_back(connectionOp); + return canFold; } /// Traverses the control code forward, ensuring that only one DMA wait op is -/// retained for all the columns. +/// retained for every batch of DMA copy operations. /// /// Example Input: -/// %0 = dma_cpy_nd(col=0) -/// %1 = dma_cpy_nd(col=1) -/// %2 = dma_cpy_nd(col=2) -/// %3 = dma_cpy_nd(col=3) +/// %0 = dma_cpy_nd(connection0) /// dma_wait(%0) +/// %1 = dma_cpy_nd(connection1) +/// %2 = dma_cpy_nd(connection2) +/// %3 = dma_cpy_nd(connection3) /// dma_wait(%1) /// dma_wait(%2) /// dma_wait(%3) /// Example Output: -/// %0 = dma_cpy_nd(col=0) -/// %1 = dma_cpy_nd(col=1) -/// %2 = dma_cpy_nd(col=2) -/// %3 = dma_cpy_nd(col=3) +/// %0 = dma_cpy_nd(connection0) +/// %1 = dma_cpy_nd(connection1) +/// %2 = dma_cpy_nd(connection2) +/// %3 = dma_cpy_nd(connection3) /// dma_wait(%0, %1, %2, %3) -LogicalResult foldDmaWaitsByColumn(const AMDAIE::AMDAIEDeviceModel &deviceModel, - AMDAIE::ControlCodeOp controlCodeOp) { +LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); - DmaColumnBatch dmaBatch = {}; - - WalkResult res = controlCodeOp->walk([&](Operation *op) { - auto waitOp = dyn_cast(op); - // Skip if not a DMA wait op or if it already has multiple async tokens. - if (!waitOp || waitOp.getAsyncTokens().size() != 1) { - updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); - dmaBatch.colWaitOpMap.clear(); - return WalkResult::advance(); + SmallVector waitOps; + SmallVector connectionOps; + WalkResult res = controlCodeOp->walk([&](AMDAIE::NpuDmaWaitOp waitOp) { + bool toBatch = true; + for (Value token : waitOp.getAsyncTokens()) { + if (auto npuHalfDmaCpyNdOp = + dyn_cast_if_present( + token.getDefiningOp())) { + FailureOr result = + canFoldByBatch(npuHalfDmaCpyNdOp, connectionOps); + if (failed(result)) return WalkResult::interrupt(); + toBatch &= *result; + } } - - // Get the half DMA copy operation. 
- Value token = waitOp.getAsyncTokens().front(); - auto npuHalfDmaCpyNdOp = - dyn_cast_if_present(token.getDefiningOp()); - if (!npuHalfDmaCpyNdOp) { - waitOp.emitOpError() << "expected to operate on an " - "`amdaie.npu.half_dma_cpy_nd`"; - return WalkResult::interrupt(); - } - - // Check if the DMA wait op can be folded into the column batch. - if (succeeded( - foldByColumn(rewriter, dmaBatch, npuHalfDmaCpyNdOp, waitOp))) { - return WalkResult::advance(); - } else { - return WalkResult::interrupt(); + // Process the previous batch of wait ops, and start a new batch. + if (!toBatch) { + if (failed(updateBatchTokens(rewriter, waitOps))) + return WalkResult::interrupt(); + waitOps.clear(); } + waitOps.push_back(waitOp); + return WalkResult::advance(); }); - // Process the remaining wait ops. - updateColumnBatchTokens(rewriter, dmaBatch.colWaitOpMap); if (res.wasInterrupted()) return failure(); + // Process the remaining wait ops. + if (failed(updateBatchTokens(rewriter, waitOps))) return failure(); return success(); } @@ -323,10 +288,10 @@ void AMDAIEFoldDmaWaitsPass::runOnOperation() { WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { AMDAIE::ControlCodeOp controlCodeOp = workgroupOp.getControlCode(); - if (failed(foldDmaWaitsByConnection(deviceModel, controlCodeOp))) { + if (failed(foldDmaWaitsByQueue(deviceModel, controlCodeOp))) { return WalkResult::interrupt(); } - if (failed(foldDmaWaitsByColumn(deviceModel, controlCodeOp))) { + if (failed(foldDmaWaitsByBatch(controlCodeOp))) { return WalkResult::interrupt(); } return WalkResult::advance(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir index a75546cff..f36ad7fa2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -153,6 +153,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Same channel, direction, and row, but different col. +// Expect one TCT sync operation (0x00000080), with col_num = 4. 
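// (Editorial note: the queue-write words below keep the program order of the
// push ops (cols 0, 3, 2, 1), while the wait's tokens are first sorted,
// roughly as in this hypothetical sketch mirroring the lowering, where
// `Push` is a stand-in for `NpuPushToQueueOp`:
//   std::sort(ops.begin(), ops.end(), [](const Push &a, const Push &b) {
//     return std::tie(a.channel, a.direction, a.row, a.col) <
//            std::tie(b.channel, b.direction, b.row, b.col);
//   });
// so the four consecutive columns still collapse into one TCT sync with
// col_num = 4.)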
// CHECK: 0x06030100 // CHECK: 0x00000105 // CHECK: 0x00000005 @@ -165,7 +167,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000018 // CHECK: 0x00000000 // CHECK: 0x00000000 -// CHECK: 0x0201D214 +// CHECK: 0x0601D214 // CHECK: 0x00000000 // CHECK: 0x80000000 // CHECK: 0x00000018 @@ -177,7 +179,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: 0x00000018 // CHECK: 0x00000000 // CHECK: 0x00000000 -// CHECK: 0x0601D214 +// CHECK: 0x0201D214 // CHECK: 0x00000000 // CHECK: 0x80000000 // CHECK: 0x00000018 @@ -193,9 +195,76 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { amdaie.controlcode { %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} - %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// Completely different channels, directions, rows, and cols. +// Expect four TCT sync operations (0x00000080). 
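// (Editorial note: no two tokens here share channel, direction, and row, so
// nothing can be merged; each of the four tokens lowers to its own TCT sync
// record.)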
+// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000008 +// CHECK: 0x000000B0 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0201D21C +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0401D204 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0601D20C +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00020000 +// CHECK: 0x00010100 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00000001 +// CHECK: 0x00010100 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00030000 +// CHECK: 0x01010100 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00010001 +// CHECK: 0x01010100 +// CHECK-LABEL: @wait_different_row_col_channel_direction +// CHECK: npu_instructions = dense_resource : tensor<44xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @wait_different_row_col_channel_direction() { + amdaie.workgroup { + amdaie.controlcode { + %0 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %1 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 1 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %2 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + %3 = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 1 : ui32, col = 3 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} amdaie.npu.dma_wait(%0, %1, %2, %3 : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) amdaie.end } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir index a0034a971..954f86687 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -29,7 +29,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// Expect no DMA waits to be folded, since the same BD ID is used. +// Expect no DMA waits to be folded, since the same BD ID is used on the same connection. // CHECK-LABEL: @fold_dma_waits_same_bd_id // CHECK-COUNT-2: amdaie.npu.dma_wait // CHECK-NOT: amdaie.npu.dma_wait @@ -70,9 +70,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// DMA queue has a maximum size of 4. To optimize, starting from -// the end of the control code, retain every 4th DMA wait operation -// while folding the others. +// Same connection, but different BD IDs are used. Expect the DMA waits to be folded. +// DMA queue has a maximum size of 4. To optimize, starting from the end of the control code, +// retain every 4th DMA wait operation, while folding the others and removing their tokens. 
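// (Editorial note: `maxQueueSize` comes from the device model; with a depth
// of four, keeping one wait per four pushes on the same connection is,
// presumably, what prevents the hardware task queue from overflowing.)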
// CHECK-LABEL: @fold_dma_waits_max_queue_size // CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers // CHECK: %[[CHANNEL_0:.+]] = amdaie.channel @@ -141,92 +141,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// Two circuit connections are used, corresponding to two separate channels. -// Each channel operates with its own independent queue. -// CHECK-LABEL: @fold_dma_waits_two_connections -// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers -// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_buffers -// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel -// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel -// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection -// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection -// CHECK: %[[OBJECT_FIFO_2:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[OBJECT_FIFO_3:.+]] = amdaie.logicalobjectfifo.from_memref -// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> -// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id -// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> -// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id -// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) -// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id -// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @fold_dma_waits_two_connections() { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c3 = arith.constant 3 : index - amdaie.workgroup { - %tile = amdaie.tile(%c0, %c1) - %tile_0 = amdaie.tile(%c0, %c0) - %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> - %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> - %buffer_2 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> - %buffer_3 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> - %lock = amdaie.lock(%tile(4), 4) - %lock_4 = amdaie.lock(%tile(5), 0) - %lock_5 = amdaie.lock(%tile(6), 4) - %lock_6 = amdaie.lock(%tile(7), 0) - %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_4}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> - %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_5}, {%lock_6}) : memref<2048xi32, 1 : i32>, 
memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> - %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> - %5 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> - %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) - %channel_7 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S) - %channel_8 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) - %channel_9 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM) - %6 = amdaie.flow({%channel} -> {%channel_7}) {is_packet_flow = false} - %7 = amdaie.flow({%channel_8} -> {%channel_9}) {is_packet_flow = false} - %8 = amdaie.connection(%0 {%channel_7}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %9 = amdaie.connection(%3 {%channel_9}, %5 {%channel_8}, flow = %7) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - amdaie.controlcode { - %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> - memref.assume_alignment %1, 64 : memref<64x32xi32> - %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> - memref.assume_alignment %4, 64 : memref<64x32xi32> - %bd_id = amdaie.bd_id(%tile_0, %c0) - %12 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12 : !amdaie.async_token) - %bd_id_1 = amdaie.bd_id(%tile_0, %c1) - %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_1 channel = %channel_8) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13 : !amdaie.async_token) - %bd_id_2 = amdaie.bd_id(%tile_0, %c2) - %14 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14 : !amdaie.async_token) - %bd_id_3 = amdaie.bd_id(%tile_0, %c3) - %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_3 channel = %channel_8) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15 : !amdaie.async_token) - amdaie.end - } - } - return - } -} - -// ----- - -// The first two DMA operations are expected to be batched into a single DMA wait, as they share the same row, -// channel, and direction, with consecutive columns (0 and 1). The third DMA operation is not batched because -// its column (3) is not consecutive with the previous operations. -// CHECK-LABEL: @fold_dma_waits_column_batch +// The three DMA operations are accessed through different connections. +// They are expected to be batched into a single DMA wait. 
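// (Editorial note: batching succeeds here because each token comes from a
// distinct connection; `canFoldByBatch` starts a new batch as soon as a
// connection repeats or a packet flow is seen.)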
+// CHECK-LABEL: @fold_dma_waits_batching // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index @@ -239,12 +156,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_1]] // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_3_0]], %[[C0]]) // CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]] -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]], %[[TOKEN_1]] : !amdaie.async_token, !amdaie.async_token) -// CHECK: amdaie.npu.dma_wait(%[[TOKEN_2]] : !amdaie.async_token) +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]], %[[TOKEN_1]], %[[TOKEN_2]] : !amdaie.async_token, !amdaie.async_token, !amdaie.async_token) #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @fold_dma_waits_column_batch() { + func.func @fold_dma_waits_batching() { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c3 = arith.constant 3 : index @@ -310,3 +226,159 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} return } } + +// ----- + +// The three DMA are operating on two different connections. +// Expect the last two DMA operations to be batched into a single DMA wait, +// while the first DMA operation is retained standalone, as each connection can only be accessed once per batch. +// CHECK-LABEL: @fold_dma_waits_batching +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_0]] +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]]) +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_1]] +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]]) +// CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]] +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]], %[[TOKEN_2]] : !amdaie.async_token, !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_batching() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_1 = amdaie.tile(%c1, %c1) + %tile_1_0 = amdaie.tile(%c1, %c0) + %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_1_1) : 
memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(4), 4) + %lock_3 = amdaie.lock(%tile_0_1(5), 0) + %lock_4 = amdaie.lock(%tile_1_1(4), 4) + %lock_5 = amdaie.lock(%tile_1_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false} + %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false} + %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0_0, %c0) + %12 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%12 : !amdaie.async_token) + %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0) + %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo> + %bd_id_10 = amdaie.bd_id(%tile_1_0, %c0) + %14 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_10 channel = %channel_7) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13 : !amdaie.async_token) + amdaie.npu.dma_wait(%14 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// Four DMA operations interleaved on two connections. +// DMA operations on the same connection are expected to be folded using the DMA task queue. +// DMA operations on different connections are expected to be folded using DMA batching. +// With both optimizations, a single DMA wait is retained. 
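// (Editorial note: queue folding first removes the waits on the repeated
// connections, leaving one async token per connection; batch folding then
// merges those two remaining waits into the single `dma_wait` checked below.)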
+// CHECK-LABEL: @fold_dma_waits_two_connections +// CHECK: %[[OBJECT_FIFO_0:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[OBJECT_FIFO_1:.+]] = amdaie.logicalobjectfifo.from_buffers +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel +// CHECK: %[[CONNECTION_0:.+]] = amdaie.connection +// CHECK: %[[CONNECTION_1:.+]] = amdaie.connection +// CHECK: %[[OBJECT_FIFO_2:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[OBJECT_FIFO_3:.+]] = amdaie.logicalobjectfifo.from_memref +// CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_0]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_1:.+]] = amdaie.bd_id +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_1]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[OBJECT_FIFO_2]] [] [] [] bd_id = %[[BD_ID_2]] channel = %[[CHANNEL_0]]) : !amdaie.logicalobjectfifo> +// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_1]](%[[OBJECT_FIFO_3]] [] [] [] bd_id = %[[BD_ID_3]] channel = %[[CHANNEL_2]]) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]], %[[TOKEN_1]] : !amdaie.async_token, !amdaie.async_token) +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_two_connections() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_3 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_4 = amdaie.lock(%tile(5), 0) + %lock_5 = amdaie.lock(%tile(6), 4) + %lock_6 = amdaie.lock(%tile(7), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_4}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_5}, {%lock_6}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_7 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S) + 
%channel_8 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %channel_9 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel} -> {%channel_7}) {is_packet_flow = false} + %7 = amdaie.flow({%channel_8} -> {%channel_9}) {is_packet_flow = false} + %8 = amdaie.connection(%0 {%channel_7}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %9 = amdaie.connection(%3 {%channel_9}, %5 {%channel_8}, flow = %7) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + %bd_id = amdaie.bd_id(%tile_0, %c0) + %12 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%12 : !amdaie.async_token) + %bd_id_1 = amdaie.bd_id(%tile_0, %c1) + %13 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_1 channel = %channel_8) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13 : !amdaie.async_token) + %bd_id_2 = amdaie.bd_id(%tile_0, %c2) + %14 = amdaie.npu.half_dma_cpy_nd async %8(%10 [] [] [] bd_id = %bd_id_2 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%14 : !amdaie.async_token) + %bd_id_3 = amdaie.bd_id(%tile_0, %c3) + %15 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_3 channel = %channel_8) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%15 : !amdaie.async_token) + amdaie.end + } + } + return + } +} From fb6d4d28130f48120d73fe535daf54def6542acc Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Tue, 17 Dec 2024 12:20:50 +0000 Subject: [PATCH 3/8] resolve comments --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 80 +++++----- .../Transforms/AMDAIEInsertDmaBdChain.cpp | 17 ++- .../Transforms/test/fold_dma_waits.mlir | 142 ++++++++++++++---- 3 files changed, 168 insertions(+), 71 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index b6bf6c877..7bdd8d0d8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -16,14 +16,14 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -using DmaQueue = std::pair; +using DmaQueueKey = std::pair; /// Utility function to determine whether a DMA wait op can be folded into a /// queue based on its half DMA copy operation. FailureOr canFoldByQueue( const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, - DenseMap> &dmaQueueToBdIds) { + DenseMap> &dmaQueueToBdIds) { // Retrieve the connection op. std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -104,7 +104,7 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); std::vector waitOpsToErase; - DenseMap> dmaQueueToBdIds; + DenseMap> dmaQueueToBdIds; // Traverse the control code in reverse. 
WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { @@ -168,17 +168,15 @@ LogicalResult updateBatchTokens(IRRewriter &rewriter, rewriter.setInsertionPointAfter(waitOps.back()); rewriter.create(waitOps.back().getLoc(), asyncTokens); - for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { - rewriter.eraseOp(waitOp); - } + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) rewriter.eraseOp(waitOp); return success(); } /// Utility function to determine if a DMA wait operation can be folded into a /// a batch based on its half DMA copy operation. -FailureOr canFoldByBatch( - AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, - SmallVector &connectionOps) { +FailureOr canFoldByBatch(Operation *batchParentOp, + AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, + DenseSet &connectionOps) { // Retrieve the connection op. std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -199,17 +197,19 @@ FailureOr canFoldByBatch( bool canFold = true; // Can't fold if the current connection op already occurs in the batch, or - // if the current operation is a packet flow, or if the batch is empty. - if (llvm::is_contained(connectionOps, connectionOp) || isPacketFlow || - connectionOps.empty()) { + // if the current operation is a packet flow, or if the batch is empty, or + // if the current operation is not in the same scope as the batch. + if (connectionOps.contains(connectionOp) || isPacketFlow || + connectionOps.empty() || + (batchParentOp != npuHalfDmaCpyNdOp->getParentOp())) { connectionOps.clear(); canFold = false; } - connectionOps.push_back(connectionOp); + connectionOps.insert(connectionOp); return canFold; } -/// Traverses the control code forward, ensuring that only one DMA wait op is +/// Traverses the control code in reverse, ensuring that only one DMA wait op is /// retained for every batch of DMA copy operations. /// /// Example Input: @@ -227,34 +227,42 @@ FailureOr canFoldByBatch( /// %2 = dma_cpy_nd(connection2) /// %3 = dma_cpy_nd(connection3) /// dma_wait(%0, %1, %2, %3) +/// Reverse traversal simplifies handling duplicate connections, preventing +/// the need to revisit and modify earlier operations after processing later +/// ones. LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); SmallVector waitOps; - SmallVector connectionOps; - WalkResult res = controlCodeOp->walk([&](AMDAIE::NpuDmaWaitOp waitOp) { - bool toBatch = true; - for (Value token : waitOp.getAsyncTokens()) { - if (auto npuHalfDmaCpyNdOp = - dyn_cast_if_present( - token.getDefiningOp())) { - FailureOr result = - canFoldByBatch(npuHalfDmaCpyNdOp, connectionOps); - if (failed(result)) return WalkResult::interrupt(); - toBatch &= *result; - } - } - // Process the previous batch of wait ops, and start a new batch. - if (!toBatch) { - if (failed(updateBatchTokens(rewriter, waitOps))) - return WalkResult::interrupt(); - waitOps.clear(); - } - waitOps.push_back(waitOp); - return WalkResult::advance(); - }); + DenseSet connectionOps; + WalkResult res = controlCodeOp->walk( + [&](AMDAIE::NpuDmaWaitOp waitOp) { + bool toBatch = true; + Operation *batchParentOp = + waitOps.empty() ? 
waitOp->getParentOp() : waitOps[0]->getParentOp(); + for (Value token : waitOp.getAsyncTokens()) { + if (auto npuHalfDmaCpyNdOp = + dyn_cast_if_present( + token.getDefiningOp())) { + FailureOr result = + canFoldByBatch(batchParentOp, npuHalfDmaCpyNdOp, connectionOps); + if (failed(result)) return WalkResult::interrupt(); + toBatch &= *result; + } + } + // Process the previous batch of wait ops, and start a new batch. + if (!toBatch) { + std::reverse(waitOps.begin(), waitOps.end()); + if (failed(updateBatchTokens(rewriter, waitOps))) + return WalkResult::interrupt(); + waitOps.clear(); + } + waitOps.push_back(waitOp); + return WalkResult::advance(); + }); if (res.wasInterrupted()) return failure(); // Process the remaining wait ops. + std::reverse(waitOps.begin(), waitOps.end()); if (failed(updateBatchTokens(rewriter, waitOps))) return failure(); return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp index b21ceb025..352c8e500 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp @@ -17,7 +17,7 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -using DmaChain = std::pair; +using DmaChainKey = std::pair; /// Utility function to update `next_bd` and `start_bd` operands. LogicalResult updateChainOperands( @@ -83,9 +83,9 @@ LogicalResult updateChainOperands( /// - Chain X: [0] (the newly added BD ID). /// - Chain Y: [] (emptied after breaking). void checkForChainsToBeBroken( - uint32_t currBdId, const DmaChain &currDmaChain, - const DenseMap> &dmaChainToBdIds, - SmallVector &chainsToBreak) { + uint32_t currBdId, const DmaChainKey &currDmaChain, + const DenseMap> &dmaChainToBdIds, + SmallVector &chainsToBreak) { for (auto &[entry, bdIds] : dmaChainToBdIds) { if (entry.first == currDmaChain.first && bdIds.contains(currBdId)) { // Break the chain that contains the duplicate BD ID. @@ -120,9 +120,10 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, } // BD IDs that have been assigned in each tile. - DenseMap> dmaChainToBdIds; + DenseMap> dmaChainToBdIds; // Buffers the DMA ops that will be chained. - DenseMap> dmaChainToDmaOps; + DenseMap> + dmaChainToDmaOps; res = controlCodeOp->walk([&](Operation *op) { @@ -185,8 +186,8 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, // Any duplicate BD ID from the same tile indicates that the chain // cannot grow further and requires breaking to release the // conflicting BD ID. - SmallVector chainsToBreak; - DmaChain currDmaChain = {tileOp, connectionOp}; + SmallVector chainsToBreak; + DmaChainKey currDmaChain = {tileOp, connectionOp}; checkForChainsToBeBroken(bdId, currDmaChain, dmaChainToBdIds, chainsToBreak); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir index 954f86687..f74b8bad6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/fold_dma_waits.mlir @@ -70,6 +70,66 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- +// Expect no DMA waits to be folded, since they are operating on different scopes. 
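+// An illustrative shape of the input (hypothetical, simplified):
+//   scf.for ... { %0 = cpy(c0); wait(%0) }   // wait inside the loop body
+//   %1 = cpy(c1); wait(%1)                   // wait in the outer scope
+// The two waits have different parent ops, so neither queue folding nor
+// batching may combine them.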
+// CHECK-LABEL: @fold_dma_waits_loop +// CHECK-COUNT-2: amdaie.npu.dma_wait +// CHECK-NOT: amdaie.npu.dma_wait +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @fold_dma_waits_loop() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + amdaie.workgroup { + %tile_0_1 = amdaie.tile(%c0, %c1) + %tile_0_0 = amdaie.tile(%c0, %c0) + %tile_1_1 = amdaie.tile(%c1, %c1) + %tile_1_0 = amdaie.tile(%c1, %c0) + %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile_0_1(4), 4) + %lock_3 = amdaie.lock(%tile_0_1(5), 0) + %lock_4 = amdaie.lock(%tile_1_1(4), 4) + %lock_5 = amdaie.lock(%tile_1_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false} + %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false} + %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> + %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %4, 64 : memref<64x32xi32> + scf.for %arg0 = %c0 to %c1 step %c8 { + %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0) + %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%13 : !amdaie.async_token) + } + %bd_id = amdaie.bd_id(%tile_1_0, %c0) + %12 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id channel = %channel_7) : !amdaie.logicalobjectfifo> + 
amdaie.npu.dma_wait(%12 : !amdaie.async_token)
+ amdaie.end
+ }
+ }
+ return
+ }
+}
+
+// -----
+
 // Same connection, but different BD IDs are used. Expect the DMA waits to be folded.
 // DMA queue has a maximum size of 4. To optimize, starting from the end of the control code,
 // retain every 4th DMA wait operation, while folding the others and removing their tokens.
@@ -229,14 +289,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}

 // -----

-// The three DMA are operating on two different connections.
-// Expect the last two DMA operations to be batched into a single DMA wait,
-// while the first DMA operation is retained standalone, as each connection can only be accessed once per batch.
-// CHECK-LABEL: @fold_dma_waits_batching
+// The five DMA operations are operating on three different connections.
+// Expect the first DMA operation to be retained standalone, while the rest are batched into two DMA waits.
+// This is because each connection can only be accessed once per batch.
+// CHECK-LABEL: @fold_dma_waits_multi_batching
 // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
 // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index
+// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index
 // CHECK: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]])
 // CHECK: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]])
+// CHECK: %[[TILE_3_0:.+]] = amdaie.tile(%[[C3]], %[[C0]])
 // CHECK: %[[BD_ID_0:.+]] = amdaie.bd_id(%[[TILE_0_0]], %[[C0]])
 // CHECK: %[[TOKEN_0:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_0]]
 // CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token)
@@ -245,10 +307,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK: %[[BD_ID_2:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
 // CHECK: %[[TOKEN_2:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_2]]
 // CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]], %[[TOKEN_2]] : !amdaie.async_token, !amdaie.async_token)
+// CHECK: %[[BD_ID_3:.+]] = amdaie.bd_id(%[[TILE_3_0]], %[[C0]])
+// CHECK: %[[TOKEN_3:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_3]]
+// CHECK: %[[BD_ID_4:.+]] = amdaie.bd_id(%[[TILE_1_0]], %[[C0]])
+// CHECK: %[[TOKEN_4:.+]] = amdaie.npu.half_dma_cpy_nd async %{{.+}}(%{{.+}} [] [] [] bd_id = %[[BD_ID_4]]
+// CHECK: amdaie.npu.dma_wait(%[[TOKEN_3]], %[[TOKEN_4]] : !amdaie.async_token, !amdaie.async_token)
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
 #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
-  func.func @fold_dma_waits_batching() {
+  func.func @fold_dma_waits_multi_batching() {
 %c0 = arith.constant 0 : index
 %c1 = arith.constant 1 : index
 %c3 = arith.constant 3 : index
@@ -257,42 +324,63 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 %tile_0_0 = amdaie.tile(%c0, %c0)
 %tile_1_1 = amdaie.tile(%c1, %c1)
 %tile_1_0 = amdaie.tile(%c1, %c0)
+ %tile_3_1 = amdaie.tile(%c3, %c1)
+ %tile_3_0 = amdaie.tile(%c3, %c0)
 %buffer = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
 %buffer_0 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32>
 %buffer_1 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
 %buffer_2 = amdaie.buffer(%tile_1_1) : memref<2048xi32, 1 : i32>
+ %buffer_3 = amdaie.buffer(%tile_3_1) :
memref<2048xi32, 1 : i32> + %buffer_4 = amdaie.buffer(%tile_3_1) : memref<2048xi32, 1 : i32> %lock = amdaie.lock(%tile_0_1(4), 4) - %lock_3 = amdaie.lock(%tile_0_1(5), 0) - %lock_4 = amdaie.lock(%tile_1_1(4), 4) - %lock_5 = amdaie.lock(%tile_1_1(5), 0) - %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_3}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %lock_5 = amdaie.lock(%tile_0_1(5), 0) + %lock_6 = amdaie.lock(%tile_1_1(4), 4) + %lock_7 = amdaie.lock(%tile_1_1(5), 0) + %lock_8 = amdaie.lock(%tile_3_1(4), 4) + %lock_9 = amdaie.lock(%tile_3_1(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_0}, {%lock}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_1, %buffer_2}, {%lock_6}, {%lock_7}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> %5 = amdaie.logicalobjectfifo.placeholder{%tile_1_0} : !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_buffers({%buffer_3, %buffer_4}, {%lock_8}, {%lock_9}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %8 = amdaie.logicalobjectfifo.placeholder{%tile_3_0} : !amdaie.logicalobjectfifo> %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) - %channel_6 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) - %channel_7 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) - %channel_8 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) - %6 = amdaie.flow({%channel} -> {%channel_6}) {is_packet_flow = false} - %7 = amdaie.connection(%0 {%channel_6}, %2 {%channel}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %8 = amdaie.flow({%channel_7} -> {%channel_8}) {is_packet_flow = false} - %9 = amdaie.connection(%3 {%channel_8}, %5 {%channel_7}, flow = %8) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %channel_10 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_11 = amdaie.channel(%tile_1_0, 0, port_type = DMA, direction = MM2S) + %channel_12 = amdaie.channel(%tile_1_1, 0, port_type = DMA, direction = S2MM) + %channel_13 = amdaie.channel(%tile_3_0, 0, port_type = DMA, direction = MM2S) + %channel_14 = amdaie.channel(%tile_3_1, 0, port_type = DMA, direction = S2MM) + %9 = amdaie.flow({%channel} -> {%channel_10}) {is_packet_flow = false} + %10 = amdaie.connection(%0 {%channel_10}, %2 {%channel}, flow = %9) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %11 = amdaie.flow({%channel_11} -> {%channel_12}) {is_packet_flow = false} + %12 = amdaie.connection(%3 
{%channel_12}, %5 {%channel_11}, flow = %11) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + %13 = amdaie.flow({%channel_13} -> {%channel_14}) {is_packet_flow = false} + %14 = amdaie.connection(%6 {%channel_14}, %8 {%channel_13}, flow = %13) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) amdaie.controlcode { - %10 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + %15 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> memref.assume_alignment %1, 64 : memref<64x32xi32> - %11 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + %16 = amdaie.logicalobjectfifo.from_memref %4, {%tile_1_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> memref.assume_alignment %4, 64 : memref<64x32xi32> + %17 = amdaie.logicalobjectfifo.from_memref %7, {%tile_3_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %7, 64 : memref<64x32xi32> %bd_id = amdaie.bd_id(%tile_0_0, %c0) - %12 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%12 : !amdaie.async_token) - %bd_id_9 = amdaie.bd_id(%tile_0_0, %c0) - %13 = amdaie.npu.half_dma_cpy_nd async %7(%10 [] [] [] bd_id = %bd_id_9 channel = %channel) : !amdaie.logicalobjectfifo> - %bd_id_10 = amdaie.bd_id(%tile_1_0, %c0) - %14 = amdaie.npu.half_dma_cpy_nd async %9(%11 [] [] [] bd_id = %bd_id_10 channel = %channel_7) : !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13 : !amdaie.async_token) - amdaie.npu.dma_wait(%14 : !amdaie.async_token) + %18 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%18 : !amdaie.async_token) + %bd_id_15 = amdaie.bd_id(%tile_0_0, %c0) + %19 = amdaie.npu.half_dma_cpy_nd async %10(%15 [] [] [] bd_id = %bd_id_15 channel = %channel) : !amdaie.logicalobjectfifo> + %bd_id_16 = amdaie.bd_id(%tile_1_0, %c0) + %20 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_16 channel = %channel_11) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%19 : !amdaie.async_token) + amdaie.npu.dma_wait(%20 : !amdaie.async_token) + %bd_id_17 = amdaie.bd_id(%tile_3_0, %c0) + %21 = amdaie.npu.half_dma_cpy_nd async %14(%17 [] [] [] bd_id = %bd_id_17 channel = %channel_13) : !amdaie.logicalobjectfifo> + %bd_id_18 = amdaie.bd_id(%tile_1_0, %c0) + %22 = amdaie.npu.half_dma_cpy_nd async %12(%16 [] [] [] bd_id = %bd_id_18 channel = %channel_11) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%21 : !amdaie.async_token) + amdaie.npu.dma_wait(%22 : !amdaie.async_token) amdaie.end } } From b13c577440f469ed31a07ab776a2e5ba6dbf745b Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Tue, 17 Dec 2024 16:53:12 +0000 Subject: [PATCH 4/8] fix test and refactor --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 183 +++++++++++++----- 1 file changed, 131 insertions(+), 52 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 7bdd8d0d8..b622c7146 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -16,14 +16,58 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -using 
DmaQueueKey = std::pair; +using DmaBdIdKey = std::pair; + +/// Utility function to erase the DMA wait operations in the queue, except for +/// the last one. +LogicalResult eraseQueueOperations(IRRewriter &rewriter, + SmallVector &waitOps) { + // Skip if there are less than two DMA wait operations in the queue. + if (waitOps.size() < 2) return success(); + + Operation *parentOp = waitOps.back()->getParentOp(); + // Do not modify the last wait op, it will be kept. + waitOps.pop_back(); + + for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { + if (waitOp->getParentOp() != parentOp) { + return waitOp.emitError( + "DMA operations to be queued must belong to the same scope"); + } + // Erase the wait op. + SmallVector asyncTokens(waitOp.getAsyncTokens()); + rewriter.eraseOp(waitOp); + for (Value token : asyncTokens) { + auto dmaOp = + dyn_cast_if_present(token.getDefiningOp()); + if (!dmaOp) + waitOp.emitError("expected to operate on an `amdaie.half_dma_cpy_nd`"); + if (dmaOp.use_empty()) { + rewriter.setInsertionPoint(dmaOp); + TypeRange resultTypeRange = TypeRange{}; + // Nullify the result to avoid issuing a token. + rewriter.create( + dmaOp.getLoc(), resultTypeRange, dmaOp.getConnection(), + dmaOp.getInput(), dmaOp.getMixedOffsets(), dmaOp.getMixedSizes(), + dmaOp.getMixedStrides(), dmaOp.getBdId(), dmaOp.getChannel(), + dmaOp.getNextBd(), dmaOp.getStartBd()); + rewriter.eraseOp(dmaOp); + } + } + } + return success(); +} /// Utility function to determine whether a DMA wait op can be folded into a /// queue based on its half DMA copy operation. FailureOr canFoldByQueue( - const AMDAIE::AMDAIEDeviceModel &deviceModel, + const AMDAIE::AMDAIEDeviceModel &deviceModel, Operation *queueParentOp, AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, - DenseMap> &dmaQueueToBdIds) { + DenseMap> &dmaBdIdsMap) { + // Check if the current operation is in the same scope as the rest of the + // queue. + bool isSameScope = npuHalfDmaCpyNdOp->getParentOp() == queueParentOp; + // Retrieve the connection op. std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -63,22 +107,24 @@ FailureOr canFoldByQueue( uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); - // Keep wait op if, either reaches the maximum queue size, or a - // duplicate BD ID in the same tile, or packet flow, or the queue is - // empty uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); - bool isDuplicateBdId = llvm::any_of(dmaQueueToBdIds, [&](const auto &entry) { - return entry.first.first == tileOp && - llvm::is_contained(entry.second, bdId); + bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { + return entry.first.first == tileOp && entry.second.contains(bdId); }); - SmallVector &bdIds = dmaQueueToBdIds[{tileOp, connectionOp}]; + DenseSet &bdIds = dmaBdIdsMap[{tileOp, connectionOp}]; bool canFold = true; + // Can't fold wait op if: + // (1) the current BD ID on the same tile already occurs in the queue, or + // (2) the current operation is a packet flow, or + // (3) reaches the maximum queue size, or + // (4) the queue is empty, or + // (5) the current operation is not in the same scope as the queue. 
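+  // For example (hypothetical trace): with maxQueueSize == 4 and BD IDs
+  // {0, 1, 2, 3} already queued for this {tile, connection} pair, the next
+  // wait op trips the size check, so it is kept and a new queue is started.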
if (isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || - bdIds.empty()) { + bdIds.empty() || !isSameScope) { bdIds.clear(); canFold = false; } - bdIds.push_back(bdId); + bdIds.insert(bdId); return canFold; } @@ -103,49 +149,43 @@ FailureOr canFoldByQueue( LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); - std::vector waitOpsToErase; - DenseMap> dmaQueueToBdIds; + SmallVector> waitOpQueues; + DenseMap> dmaBdIdsMap; // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { - bool toErase = true; + bool toFold = true; + Operation *queueParentOp = + waitOpQueues.empty() ? waitOp->getParentOp() + : waitOpQueues.back().front()->getParentOp(); for (Value token : waitOp.getAsyncTokens()) { if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = - canFoldByQueue(deviceModel, npuHalfDmaCpyNdOp, dmaQueueToBdIds); + FailureOr result = canFoldByQueue( + deviceModel, queueParentOp, npuHalfDmaCpyNdOp, dmaBdIdsMap); if (failed(result)) return WalkResult::interrupt(); - toErase &= *result; + toFold &= *result; } } - // Erase later to avoid invalidating the iterator. - if (toErase) waitOpsToErase.push_back(waitOp); + // Store all the queues, and modify later to avoid invalidating the + // iterator. + if (toFold) { + // Append the wait op to the last queue if it can be folded. + waitOpQueues.back().push_back(waitOp); + } else { + // Create a new queue if the wait op cannot be folded. + waitOpQueues.push_back(SmallVector{waitOp}); + } return WalkResult::advance(); }); if (res.wasInterrupted()) return failure(); - - for (AMDAIE::NpuDmaWaitOp waitOp : waitOpsToErase) { - SmallVector asyncTokens(waitOp.getAsyncTokens()); - // Erase the wait op. - rewriter.eraseOp(waitOp); - for (Value token : asyncTokens) { - if (auto op = dyn_cast_if_present( - token.getDefiningOp())) { - if (op.use_empty()) { - rewriter.setInsertionPoint(op); - TypeRange resultTypeRange = TypeRange{}; - // Nullify the result to avoid issuing a token. - rewriter.create( - op.getLoc(), resultTypeRange, op.getConnection(), op.getInput(), - op.getMixedOffsets(), op.getMixedSizes(), op.getMixedStrides(), - op.getBdId(), op.getChannel(), op.getNextBd(), op.getStartBd()); - rewriter.eraseOp(op); - } - } - } + for (SmallVector &waitOps : waitOpQueues) { + // Since the controlcode is traversed in reverse order, we need to + // restore the original order of the DMA operations. + std::reverse(waitOps.begin(), waitOps.end()); + if (failed(eraseQueueOperations(rewriter, waitOps))) return failure(); } - return success(); } @@ -174,9 +214,14 @@ LogicalResult updateBatchTokens(IRRewriter &rewriter, /// Utility function to determine if a DMA wait operation can be folded into a /// a batch based on its half DMA copy operation. -FailureOr canFoldByBatch(Operation *batchParentOp, - AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, - DenseSet &connectionOps) { +FailureOr canFoldByBatch( + Operation *batchParentOp, AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, + DenseSet &connectionOps, + DenseMap> &dmaBdIdsMap) { + // Check if the current operation is in the same scope as the rest of the + // batch. + bool isSameScope = npuHalfDmaCpyNdOp->getParentOp() == batchParentOp; + // Retrieve the connection op. 
std::optional maybeConnectionOp = npuHalfDmaCpyNdOp.getConnectionOp(); @@ -195,17 +240,48 @@ FailureOr canFoldByBatch(Operation *batchParentOp, AMDAIE::FlowOp flowOp = maybeFlowOp.value(); bool isPacketFlow = flowOp.getIsPacketFlow(); + // Retrieve the BD ID op. + std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); + if (!maybeBdIdOp) { + return npuHalfDmaCpyNdOp.emitOpError() + << "must have a BD ID op to lower to " + "`amdaie.npu.write_bd`"; + } + AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + + // Retrieve the tile op. + AMDAIE::TileOp tileOp = + dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); + if (!tileOp) { + return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + } + + bool isDuplicateConnection = connectionOps.contains(connectionOp); + uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); + bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { + return entry.first.first == tileOp && entry.second.contains(bdId); + }); + bool canFold = true; - // Can't fold if the current connection op already occurs in the batch, or - // if the current operation is a packet flow, or if the batch is empty, or - // if the current operation is not in the same scope as the batch. - if (connectionOps.contains(connectionOp) || isPacketFlow || - connectionOps.empty() || - (batchParentOp != npuHalfDmaCpyNdOp->getParentOp())) { + // Can't fold wait op if: + // (1) the current connection op already occurs in the batch, or + // (2) the current BD ID on the same tile already occurs in the batch, or + // (3) the current operation is a packet flow, or + // (4) the batch is empty, or + // (5) the current operation is not in the same scope as the batch. + if (isDuplicateConnection || isDuplicateBdId || isPacketFlow || + connectionOps.empty() || !isSameScope) { + // Clear the BD IDs for all the connections in the batch. + for (auto &entry : dmaBdIdsMap) { + ConnectionOp connectionOp = entry.first.second; + DenseSet &bdIds = entry.second; + if (connectionOps.contains(connectionOp)) bdIds.clear(); + } connectionOps.clear(); canFold = false; } connectionOps.insert(connectionOp); + dmaBdIdsMap[{tileOp, connectionOp}].insert(bdId); return canFold; } @@ -234,6 +310,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); SmallVector waitOps; DenseSet connectionOps; + DenseMap> dmaBdIdsMap; WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { bool toBatch = true; @@ -243,14 +320,16 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = - canFoldByBatch(batchParentOp, npuHalfDmaCpyNdOp, connectionOps); + FailureOr result = canFoldByBatch( + batchParentOp, npuHalfDmaCpyNdOp, connectionOps, dmaBdIdsMap); if (failed(result)) return WalkResult::interrupt(); toBatch &= *result; } } // Process the previous batch of wait ops, and start a new batch. if (!toBatch) { + // Since the controlcode is traversed in reverse order, we need to + // restore the original order of the DMA operations. 
std::reverse(waitOps.begin(), waitOps.end()); if (failed(updateBatchTokens(rewriter, waitOps))) return WalkResult::interrupt(); From a10aedd5b56184d850f1619bf18668998e041c4b Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Tue, 17 Dec 2024 21:30:12 +0000 Subject: [PATCH 5/8] separate canFold decisions with update --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 134 +++++++++++------- 1 file changed, 81 insertions(+), 53 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index b622c7146..21b0af3f5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -61,18 +61,20 @@ LogicalResult eraseQueueOperations(IRRewriter &rewriter, /// Utility function to determine whether a DMA wait op can be folded into a /// queue based on its half DMA copy operation. FailureOr canFoldByQueue( - const AMDAIE::AMDAIEDeviceModel &deviceModel, Operation *queueParentOp, - AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, - DenseMap> &dmaBdIdsMap) { + const AMDAIE::AMDAIEDeviceModel &deviceModel, + const Operation *queueParentOp, + const DenseMap> &dmaBdIdsMap, + DmaBdIdKey &currBdIdKey, uint32_t &currBdIdVal, + AMDAIE::NpuHalfDmaCpyNdOp &currHalfDmaCpyNdOp) { // Check if the current operation is in the same scope as the rest of the // queue. - bool isSameScope = npuHalfDmaCpyNdOp->getParentOp() == queueParentOp; + bool isSameScope = currHalfDmaCpyNdOp->getParentOp() == queueParentOp; // Retrieve the connection op. std::optional maybeConnectionOp = - npuHalfDmaCpyNdOp.getConnectionOp(); + currHalfDmaCpyNdOp.getConnectionOp(); if (!maybeConnectionOp) { - return npuHalfDmaCpyNdOp.emitOpError() + return currHalfDmaCpyNdOp.emitOpError() << "expected to operate on an `amdaie.connection`"; } AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); @@ -87,13 +89,14 @@ FailureOr canFoldByQueue( bool isPacketFlow = flowOp.getIsPacketFlow(); // Retrieve the BD ID op. - std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); + std::optional maybeBdIdOp = currHalfDmaCpyNdOp.getBdIdOp(); if (!maybeBdIdOp) { - return npuHalfDmaCpyNdOp.emitOpError() + return currHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op to lower to " "`amdaie.npu.write_bd`"; } AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue()); // Retrieve the tile op. AMDAIE::TileOp tileOp = @@ -101,31 +104,26 @@ FailureOr canFoldByQueue( if (!tileOp) { return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; } + currBdIdKey = {tileOp, connectionOp}; // Get the maximum queue size. 
uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); - uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { - return entry.first.first == tileOp && entry.second.contains(bdId); + return entry.first.first == tileOp && entry.second.contains(currBdIdVal); }); - DenseSet &bdIds = dmaBdIdsMap[{tileOp, connectionOp}]; - bool canFold = true; + const DenseSet &bdIds = dmaBdIdsMap.lookup(currBdIdKey); + // Can't fold wait op if: // (1) the current BD ID on the same tile already occurs in the queue, or // (2) the current operation is a packet flow, or // (3) reaches the maximum queue size, or // (4) the queue is empty, or // (5) the current operation is not in the same scope as the queue. - if (isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || - bdIds.empty() || !isSameScope) { - bdIds.clear(); - canFold = false; - } - bdIds.insert(bdId); - return canFold; + return !(isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || + bdIds.empty() || !isSameScope); } /// Traverses the control code in reverse, ensuring that for each connection, @@ -151,6 +149,16 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, IRRewriter rewriter(controlCodeOp->getContext()); SmallVector> waitOpQueues; DenseMap> dmaBdIdsMap; + + auto updateWithCurrBdId = + [&](bool canFold, DenseMap> &dmaBdIdsMap, + DmaBdIdKey &currBdIdKey, uint32_t currBdIdVal) { + assert(currBdIdKey.first && "TileOp must not be null"); + assert(currBdIdKey.second && "ConnectionOp must not be null"); + if (!canFold) dmaBdIdsMap[currBdIdKey].clear(); + dmaBdIdsMap[currBdIdKey].insert(currBdIdVal); + }; + // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { @@ -162,10 +170,14 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = canFoldByQueue( - deviceModel, queueParentOp, npuHalfDmaCpyNdOp, dmaBdIdsMap); + DmaBdIdKey currBdIdKey = {nullptr, nullptr}; + uint32_t currBdIdVal = 0; + FailureOr result = + canFoldByQueue(deviceModel, queueParentOp, dmaBdIdsMap, + currBdIdKey, currBdIdVal, npuHalfDmaCpyNdOp); if (failed(result)) return WalkResult::interrupt(); toFold &= *result; + updateWithCurrBdId(*result, dmaBdIdsMap, currBdIdKey, currBdIdVal); } } // Store all the queues, and modify later to avoid invalidating the @@ -190,8 +202,8 @@ LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, } /// For each batch, combine the async tokens into a single NpuDmaWaitOp. -LogicalResult updateBatchTokens(IRRewriter &rewriter, - SmallVector &waitOps) { +LogicalResult eraseBatchOperations(IRRewriter &rewriter, + SmallVector &waitOps) { // Skip if there are less than two DMA wait operations. if (waitOps.size() < 2) return success(); @@ -215,21 +227,24 @@ LogicalResult updateBatchTokens(IRRewriter &rewriter, /// Utility function to determine if a DMA wait operation can be folded into a /// a batch based on its half DMA copy operation. 
FailureOr canFoldByBatch( - Operation *batchParentOp, AMDAIE::NpuHalfDmaCpyNdOp npuHalfDmaCpyNdOp, - DenseSet &connectionOps, - DenseMap> &dmaBdIdsMap) { + const Operation *batchParentOp, + const DenseSet &connectionOps, + const DenseMap> &dmaBdIdsMap, + DmaBdIdKey &currBdIdKey, uint32_t &currBdIdVal, + AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp) { // Check if the current operation is in the same scope as the rest of the // batch. - bool isSameScope = npuHalfDmaCpyNdOp->getParentOp() == batchParentOp; + bool isSameScope = currHalfDmaCpyNdOp->getParentOp() == batchParentOp; // Retrieve the connection op. std::optional maybeConnectionOp = - npuHalfDmaCpyNdOp.getConnectionOp(); + currHalfDmaCpyNdOp.getConnectionOp(); if (!maybeConnectionOp) { - return npuHalfDmaCpyNdOp.emitOpError() + return currHalfDmaCpyNdOp.emitOpError() << "expected to operate on an `amdaie.connection`"; } AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); + bool isDuplicateConnection = connectionOps.contains(connectionOp); // Retrieve the flow op. std::optional maybeFlowOp = connectionOp.getFlowOp(); @@ -241,13 +256,14 @@ FailureOr canFoldByBatch( bool isPacketFlow = flowOp.getIsPacketFlow(); // Retrieve the BD ID op. - std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); + std::optional maybeBdIdOp = currHalfDmaCpyNdOp.getBdIdOp(); if (!maybeBdIdOp) { - return npuHalfDmaCpyNdOp.emitOpError() + return currHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op to lower to " "`amdaie.npu.write_bd`"; } AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); + currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue()); // Retrieve the tile op. AMDAIE::TileOp tileOp = @@ -255,34 +271,20 @@ FailureOr canFoldByBatch( if (!tileOp) { return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; } + currBdIdKey = {tileOp, connectionOp}; - bool isDuplicateConnection = connectionOps.contains(connectionOp); - uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { - return entry.first.first == tileOp && entry.second.contains(bdId); + return entry.first.first == tileOp && entry.second.contains(currBdIdVal); }); - bool canFold = true; // Can't fold wait op if: // (1) the current connection op already occurs in the batch, or // (2) the current BD ID on the same tile already occurs in the batch, or // (3) the current operation is a packet flow, or // (4) the batch is empty, or // (5) the current operation is not in the same scope as the batch. - if (isDuplicateConnection || isDuplicateBdId || isPacketFlow || - connectionOps.empty() || !isSameScope) { - // Clear the BD IDs for all the connections in the batch. 
- for (auto &entry : dmaBdIdsMap) { - ConnectionOp connectionOp = entry.first.second; - DenseSet &bdIds = entry.second; - if (connectionOps.contains(connectionOp)) bdIds.clear(); - } - connectionOps.clear(); - canFold = false; - } - connectionOps.insert(connectionOp); - dmaBdIdsMap[{tileOp, connectionOp}].insert(bdId); - return canFold; + return !(isDuplicateConnection || isDuplicateBdId || isPacketFlow || + connectionOps.empty() || !isSameScope); } /// Traverses the control code in reverse, ensuring that only one DMA wait op is @@ -311,6 +313,27 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { SmallVector waitOps; DenseSet connectionOps; DenseMap> dmaBdIdsMap; + + auto updateWithCurrBdId = + [&](bool canFold, DenseSet &connectionOps, + DenseMap> &dmaBdIdsMap, + DmaBdIdKey &currBdIdKey, uint32_t currBdIdVal) { + assert(currBdIdKey.first && "TileOp must not be null"); + assert(currBdIdKey.second && "ConnectionOp must not be null"); + if (!canFold) { + // Clear the BD IDs for all the connections in the batch. + for (auto &entry : dmaBdIdsMap) { + ConnectionOp connectionOp = entry.first.second; + DenseSet &bdIds = entry.second; + if (connectionOps.contains(connectionOp)) bdIds.clear(); + } + connectionOps.clear(); + } + connectionOps.insert(currBdIdKey.second); + dmaBdIdsMap[currBdIdKey].insert(currBdIdVal); + }; + + // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { bool toBatch = true; @@ -320,10 +343,15 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - FailureOr result = canFoldByBatch( - batchParentOp, npuHalfDmaCpyNdOp, connectionOps, dmaBdIdsMap); + DmaBdIdKey currBdIdKey = {nullptr, nullptr}; + uint32_t currBdIdVal = 0; + FailureOr result = + canFoldByBatch(batchParentOp, connectionOps, dmaBdIdsMap, + currBdIdKey, currBdIdVal, npuHalfDmaCpyNdOp); if (failed(result)) return WalkResult::interrupt(); toBatch &= *result; + updateWithCurrBdId(*result, connectionOps, dmaBdIdsMap, currBdIdKey, + currBdIdVal); } } // Process the previous batch of wait ops, and start a new batch. @@ -331,7 +359,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { // Since the controlcode is traversed in reverse order, we need to // restore the original order of the DMA operations. std::reverse(waitOps.begin(), waitOps.end()); - if (failed(updateBatchTokens(rewriter, waitOps))) + if (failed(eraseBatchOperations(rewriter, waitOps))) return WalkResult::interrupt(); waitOps.clear(); } @@ -342,7 +370,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) { if (res.wasInterrupted()) return failure(); // Process the remaining wait ops. 
std::reverse(waitOps.begin(), waitOps.end()); - if (failed(updateBatchTokens(rewriter, waitOps))) return failure(); + if (failed(eraseBatchOperations(rewriter, waitOps))) return failure(); return success(); } From cc1ae9e027fc2a580421325de63725b90896cdfd Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Tue, 17 Dec 2024 21:39:10 +0000 Subject: [PATCH 6/8] separate refactor --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 176 +++++++----------- .../Transforms/AMDAIEInsertDmaBdChain.cpp | 17 +- 2 files changed, 71 insertions(+), 122 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 21b0af3f5..973d30449 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -18,63 +18,17 @@ namespace { using DmaBdIdKey = std::pair; -/// Utility function to erase the DMA wait operations in the queue, except for -/// the last one. -LogicalResult eraseQueueOperations(IRRewriter &rewriter, - SmallVector &waitOps) { - // Skip if there are less than two DMA wait operations in the queue. - if (waitOps.size() < 2) return success(); - - Operation *parentOp = waitOps.back()->getParentOp(); - // Do not modify the last wait op, it will be kept. - waitOps.pop_back(); - - for (AMDAIE::NpuDmaWaitOp waitOp : waitOps) { - if (waitOp->getParentOp() != parentOp) { - return waitOp.emitError( - "DMA operations to be queued must belong to the same scope"); - } - // Erase the wait op. - SmallVector asyncTokens(waitOp.getAsyncTokens()); - rewriter.eraseOp(waitOp); - for (Value token : asyncTokens) { - auto dmaOp = - dyn_cast_if_present(token.getDefiningOp()); - if (!dmaOp) - waitOp.emitError("expected to operate on an `amdaie.half_dma_cpy_nd`"); - if (dmaOp.use_empty()) { - rewriter.setInsertionPoint(dmaOp); - TypeRange resultTypeRange = TypeRange{}; - // Nullify the result to avoid issuing a token. - rewriter.create( - dmaOp.getLoc(), resultTypeRange, dmaOp.getConnection(), - dmaOp.getInput(), dmaOp.getMixedOffsets(), dmaOp.getMixedSizes(), - dmaOp.getMixedStrides(), dmaOp.getBdId(), dmaOp.getChannel(), - dmaOp.getNextBd(), dmaOp.getStartBd()); - rewriter.eraseOp(dmaOp); - } - } - } - return success(); -} - -/// Utility function to determine whether a DMA wait op can be folded into a -/// queue based on its half DMA copy operation. +/// Utility function to determine whether a DMA wait op can be folded based on +/// its half DMA copy operation. FailureOr canFoldByQueue( const AMDAIE::AMDAIEDeviceModel &deviceModel, - const Operation *queueParentOp, - const DenseMap> &dmaBdIdsMap, - DmaBdIdKey &currBdIdKey, uint32_t &currBdIdVal, - AMDAIE::NpuHalfDmaCpyNdOp &currHalfDmaCpyNdOp) { - // Check if the current operation is in the same scope as the rest of the - // queue. - bool isSameScope = currHalfDmaCpyNdOp->getParentOp() == queueParentOp; - + AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp, + DenseMap> &tileConnectToBdIdQueue) { // Retrieve the connection op. std::optional maybeConnectionOp = - currHalfDmaCpyNdOp.getConnectionOp(); + npuHalfDmaCpyNdOp.getConnectionOp(); if (!maybeConnectionOp) { - return currHalfDmaCpyNdOp.emitOpError() + return npuHalfDmaCpyNdOp.emitOpError() << "expected to operate on an `amdaie.connection`"; } AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value(); @@ -82,21 +36,20 @@ FailureOr canFoldByQueue( // Retrieve the flow op. 
std::optional maybeFlowOp = connectionOp.getFlowOp(); if (!maybeFlowOp) { - return connectionOp.emitOpError() + return connectionOp->emitOpError() << "expected to operate on an `amdaie.flow`"; } AMDAIE::FlowOp flowOp = maybeFlowOp.value(); bool isPacketFlow = flowOp.getIsPacketFlow(); // Retrieve the BD ID op. - std::optional maybeBdIdOp = currHalfDmaCpyNdOp.getBdIdOp(); + std::optional maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp(); if (!maybeBdIdOp) { - return currHalfDmaCpyNdOp.emitOpError() + return npuHalfDmaCpyNdOp.emitOpError() << "must have a BD ID op to lower to " "`amdaie.npu.write_bd`"; } AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value(); - currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue()); // Retrieve the tile op. AMDAIE::TileOp tileOp = @@ -104,39 +57,44 @@ FailureOr canFoldByQueue( if (!tileOp) { return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; } - currBdIdKey = {tileOp, connectionOp}; // Get the maximum queue size. uint32_t col = getConstantIndexOrAssert(tileOp.getCol()); uint32_t row = getConstantIndexOrAssert(tileOp.getRow()); uint32_t maxQueueSize = deviceModel.getDmaMaxQueueSize(col, row); - bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) { - return entry.first.first == tileOp && entry.second.contains(currBdIdVal); - }); - const DenseSet &bdIds = dmaBdIdsMap.lookup(currBdIdKey); - - // Can't fold wait op if: - // (1) the current BD ID on the same tile already occurs in the queue, or - // (2) the current operation is a packet flow, or - // (3) reaches the maximum queue size, or - // (4) the queue is empty, or - // (5) the current operation is not in the same scope as the queue. - return !(isDuplicateBdId || isPacketFlow || bdIds.size() >= maxQueueSize || - bdIds.empty() || !isSameScope); + // Keep wait op if, either reaches the maximum queue size, or a + // duplicate BD ID in the same tile, or packet flow, or the queue is + // empty + uint32_t bdId = getConstantIndexOrAssert(bdIdOp.getValue()); + bool isDuplicateBdId = + llvm::any_of(tileConnectToBdIdQueue, [&](const auto &entry) { + return entry.first.first == tileOp && + llvm::is_contained(entry.second, bdId); + }); + SmallVector &bdIdQueue = + tileConnectToBdIdQueue[{tileOp, connectionOp}]; + bool canFold = true; + if (isDuplicateBdId || isPacketFlow || bdIdQueue.size() >= maxQueueSize || + bdIdQueue.empty()) { + bdIdQueue.clear(); + canFold = false; + } + bdIdQueue.push_back(bdId); + return canFold; } /// Traverses the control code in reverse, ensuring that for each connection, /// only one DMA wait op is retained for every maximum queue size. /// /// Example Output: assuming a maximum queue size of 4. -/// dma_cpy_nd(connection=0, bd_id=0) -/// %0 = dma_cpy_nd(connection=0, bd_id=1) +/// dma_cpy_nd +/// %0 = dma_cpy_nd /// dma_wait(%0) -/// dma_cpy_nd(connection=0, bd_id=2) -/// dma_cpy_nd(connection=0, bd_id=3) -/// dma_cpy_nd(connection=0, bd_id=4) -/// %1 = dma_cpy_nd(connection=0, bd_id=5) +/// dma_cpy_nd +/// dma_cpy_nd +/// dma_cpy_nd +/// %1 = dma_cpy_nd /// dma_wait(%1) /// From the bottom up, for every four DMA copy operations, only one DMA wait /// operation is retained. 
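+/// For instance (hypothetical trace, queue size 4): walking up from
+/// dma_wait(%1), the waits of the three copies directly above it are folded
+/// into %1's queue; a fifth copy would overflow the queue, so dma_wait(%0)
+/// is retained and starts a new queue.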
@@ -147,57 +105,49 @@ FailureOr canFoldByQueue( LogicalResult foldDmaWaitsByQueue(const AMDAIE::AMDAIEDeviceModel &deviceModel, AMDAIE::ControlCodeOp controlCodeOp) { IRRewriter rewriter(controlCodeOp->getContext()); - SmallVector> waitOpQueues; - DenseMap> dmaBdIdsMap; - - auto updateWithCurrBdId = - [&](bool canFold, DenseMap> &dmaBdIdsMap, - DmaBdIdKey &currBdIdKey, uint32_t currBdIdVal) { - assert(currBdIdKey.first && "TileOp must not be null"); - assert(currBdIdKey.second && "ConnectionOp must not be null"); - if (!canFold) dmaBdIdsMap[currBdIdKey].clear(); - dmaBdIdsMap[currBdIdKey].insert(currBdIdVal); - }; - + std::vector waitOpsToErase; + DenseMap> tileConnectToBdIdQueue; // Traverse the control code in reverse. WalkResult res = controlCodeOp->walk( [&](AMDAIE::NpuDmaWaitOp waitOp) { - bool toFold = true; - Operation *queueParentOp = - waitOpQueues.empty() ? waitOp->getParentOp() - : waitOpQueues.back().front()->getParentOp(); + bool toErase = true; for (Value token : waitOp.getAsyncTokens()) { if (auto npuHalfDmaCpyNdOp = dyn_cast_if_present( token.getDefiningOp())) { - DmaBdIdKey currBdIdKey = {nullptr, nullptr}; - uint32_t currBdIdVal = 0; - FailureOr result = - canFoldByQueue(deviceModel, queueParentOp, dmaBdIdsMap, - currBdIdKey, currBdIdVal, npuHalfDmaCpyNdOp); + FailureOr result = canFoldByQueue( + deviceModel, npuHalfDmaCpyNdOp, tileConnectToBdIdQueue); if (failed(result)) return WalkResult::interrupt(); - toFold &= *result; - updateWithCurrBdId(*result, dmaBdIdsMap, currBdIdKey, currBdIdVal); + toErase &= *result; } } - // Store all the queues, and modify later to avoid invalidating the - // iterator. - if (toFold) { - // Append the wait op to the last queue if it can be folded. - waitOpQueues.back().push_back(waitOp); - } else { - // Create a new queue if the wait op cannot be folded. - waitOpQueues.push_back(SmallVector{waitOp}); - } + // Erase later to avoid invalidating the iterator. + if (toErase) waitOpsToErase.push_back(waitOp); return WalkResult::advance(); }); if (res.wasInterrupted()) return failure(); - for (SmallVector &waitOps : waitOpQueues) { - // Since the controlcode is traversed in reverse order, we need to - // restore the original order of the DMA operations. - std::reverse(waitOps.begin(), waitOps.end()); - if (failed(eraseQueueOperations(rewriter, waitOps))) return failure(); + + for (AMDAIE::NpuDmaWaitOp waitOp : waitOpsToErase) { + SmallVector asyncTokens(waitOp.getAsyncTokens()); + // Erase the wait op. + rewriter.eraseOp(waitOp); + for (Value token : asyncTokens) { + if (auto op = dyn_cast_if_present( + token.getDefiningOp())) { + if (op.use_empty()) { + rewriter.setInsertionPoint(op); + TypeRange resultTypeRange = TypeRange{}; + // Nullify the result to avoid issuing a token. 
+ rewriter.create( + op.getLoc(), resultTypeRange, op.getConnection(), op.getInput(), + op.getMixedOffsets(), op.getMixedSizes(), op.getMixedStrides(), + op.getBdId(), op.getChannel(), op.getNextBd(), op.getStartBd()); + rewriter.eraseOp(op); + } + } + } } + return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp index 352c8e500..b21ceb025 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEInsertDmaBdChain.cpp @@ -17,7 +17,7 @@ namespace mlir::iree_compiler::AMDAIE { namespace { -using DmaChainKey = std::pair; +using DmaChain = std::pair; /// Utility function to update `next_bd` and `start_bd` operands. LogicalResult updateChainOperands( @@ -83,9 +83,9 @@ LogicalResult updateChainOperands( /// - Chain X: [0] (the newly added BD ID). /// - Chain Y: [] (emptied after breaking). void checkForChainsToBeBroken( - uint32_t currBdId, const DmaChainKey &currDmaChain, - const DenseMap> &dmaChainToBdIds, - SmallVector &chainsToBreak) { + uint32_t currBdId, const DmaChain &currDmaChain, + const DenseMap> &dmaChainToBdIds, + SmallVector &chainsToBreak) { for (auto &[entry, bdIds] : dmaChainToBdIds) { if (entry.first == currDmaChain.first && bdIds.contains(currBdId)) { // Break the chain that contains the duplicate BD ID. @@ -120,10 +120,9 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, } // BD IDs that have been assigned in each tile. - DenseMap> dmaChainToBdIds; + DenseMap> dmaChainToBdIds; // Buffers the DMA ops that will be chained. - DenseMap> - dmaChainToDmaOps; + DenseMap> dmaChainToDmaOps; res = controlCodeOp->walk([&](Operation *op) { @@ -186,8 +185,8 @@ LogicalResult insertDmaBdChain(const AMDAIE::AMDAIEDeviceModel &deviceModel, // Any duplicate BD ID from the same tile indicates that the chain // cannot grow further and requires breaking to release the // conflicting BD ID. - SmallVector chainsToBreak; - DmaChainKey currDmaChain = {tileOp, connectionOp}; + SmallVector chainsToBreak; + DmaChain currDmaChain = {tileOp, connectionOp}; checkForChainsToBeBroken(bdId, currDmaChain, dmaChainToBdIds, chainsToBreak); From e47e0f315870692eb55ac5fb1def653d78977a93 Mon Sep 17 00:00:00 2001 From: Yu-Zhewen Date: Wed, 18 Dec 2024 09:53:13 +0000 Subject: [PATCH 7/8] retrive current BD ID key value in a separate function --- .../Transforms/AMDAIEFoldDmaWaits.cpp | 126 ++++++++++-------- 1 file changed, 71 insertions(+), 55 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp index 973d30449..1446c55e3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp @@ -17,6 +17,39 @@ namespace mlir::iree_compiler::AMDAIE { namespace { using DmaBdIdKey = std::pair; +using DmaBdIdPair = std::pair; + +FailureOr retriveDmaBdIdPair( + AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp) { + // Retrieve the connection op. 
+  std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
+      npuHalfDmaCpyNdOp.getConnectionOp();
+  if (!maybeConnectionOp) {
+    return npuHalfDmaCpyNdOp.emitOpError()
+           << "expected to operate on an `amdaie.connection`";
+  }
+  AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value();
+
+  // Retrieve the BD ID op.
+  std::optional<AMDAIE::BdIdOp> maybeBdIdOp = npuHalfDmaCpyNdOp.getBdIdOp();
+  if (!maybeBdIdOp) {
+    return npuHalfDmaCpyNdOp.emitOpError()
+           << "must have a BD ID op to lower to "
+              "`amdaie.npu.write_bd`";
+  }
+  AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value();
+  uint32_t currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue());
+
+  // Retrieve the tile op.
+  AMDAIE::TileOp tileOp =
+      dyn_cast_if_present<AMDAIE::TileOp>(bdIdOp.getTile().getDefiningOp());
+  if (!tileOp) {
+    return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`";
+  }
+
+  DmaBdIdKey currBdIdKey = {tileOp, connectionOp};
+  return DmaBdIdPair{currBdIdKey, currBdIdVal};
+}
 
 /// Utility function to determine whether a DMA wait op can be folded based on
 /// its half DMA copy operation.
@@ -176,65 +209,44 @@ LogicalResult eraseBatchOperations(IRRewriter &rewriter,
 
 /// Utility function to determine if a DMA wait operation can be folded into
 /// a batch based on its half DMA copy operation.
+/// Can't fold wait op if:
+/// (1) the current operation is not in the same scope as the batch, or
+/// (2) the current connection op already occurs in the batch, or
+/// (3) the batch is empty, or
+/// (4) the current operation is a packet flow, or
+/// (5) the current BD ID on the same tile already occurs in the batch.
 FailureOr<bool> canFoldByBatch(
     const Operation *batchParentOp,
     const DenseSet<AMDAIE::ConnectionOp> &connectionOps,
     const DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap,
-    DmaBdIdKey &currBdIdKey, uint32_t &currBdIdVal,
-    AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp) {
-  // Check if the current operation is in the same scope as the rest of the
-  // batch.
-  bool isSameScope = currHalfDmaCpyNdOp->getParentOp() == batchParentOp;
+    AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp, DmaBdIdPair &currBdIdPair) {
+  // Not in the same scope? Can't fold.
+  if (currHalfDmaCpyNdOp->getParentOp() != batchParentOp) return false;
 
-  // Retrieve the connection op.
-  std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
-      currHalfDmaCpyNdOp.getConnectionOp();
-  if (!maybeConnectionOp) {
-    return currHalfDmaCpyNdOp.emitOpError()
-           << "expected to operate on an `amdaie.connection`";
-  }
-  AMDAIE::ConnectionOp connectionOp = maybeConnectionOp.value();
-  bool isDuplicateConnection = connectionOps.contains(connectionOp);
+  // Connection op already in the batch, or an empty batch? Can't fold.
+  AMDAIE::ConnectionOp connectionOp = currBdIdPair.first.second;
+  if (connectionOps.contains(connectionOp) || connectionOps.empty())
+    return false;
 
-  // Retrieve the flow op.
+  // Packet flow? Can't fold.
   std::optional<AMDAIE::FlowOp> maybeFlowOp = connectionOp.getFlowOp();
   if (!maybeFlowOp) {
     return connectionOp.emitOpError()
            << "expected to operate on an `amdaie.flow`";
   }
   AMDAIE::FlowOp flowOp = maybeFlowOp.value();
-  bool isPacketFlow = flowOp.getIsPacketFlow();
-
-  // Retrieve the BD ID op.
-  std::optional<AMDAIE::BdIdOp> maybeBdIdOp = currHalfDmaCpyNdOp.getBdIdOp();
-  if (!maybeBdIdOp) {
-    return currHalfDmaCpyNdOp.emitOpError()
-           << "must have a BD ID op to lower to "
-              "`amdaie.npu.write_bd`";
-  }
-  AMDAIE::BdIdOp bdIdOp = maybeBdIdOp.value();
-  currBdIdVal = getConstantIndexOrAssert(bdIdOp.getValue());
-
-  // Retrieve the tile op.
-  AMDAIE::TileOp tileOp =
-      dyn_cast_if_present<AMDAIE::TileOp>(bdIdOp.getTile().getDefiningOp());
-  if (!tileOp) {
-    return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`";
-  }
-  currBdIdKey = {tileOp, connectionOp};
+  if (flowOp.getIsPacketFlow()) return false;
 
+  // Duplicate BD ID on the same tile? Can't fold.
+  AMDAIE::TileOp tileOp = currBdIdPair.first.first;
+  uint32_t currBdIdVal = currBdIdPair.second;
   bool isDuplicateBdId = llvm::any_of(dmaBdIdsMap, [&](const auto &entry) {
     return entry.first.first == tileOp && entry.second.contains(currBdIdVal);
   });
+  if (isDuplicateBdId) return false;
 
-  // Can't fold wait op if:
-  // (1) the current connection op already occurs in the batch, or
-  // (2) the current BD ID on the same tile already occurs in the batch, or
-  // (3) the current operation is a packet flow, or
-  // (4) the batch is empty, or
-  // (5) the current operation is not in the same scope as the batch.
-  return !(isDuplicateConnection || isDuplicateBdId || isPacketFlow ||
-           connectionOps.empty() || !isSameScope);
+  // Can fold.
+  return true;
 }
 
 /// Traverses the control code in reverse, ensuring that only one DMA wait op is
@@ -265,11 +277,11 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
   DenseMap<DmaBdIdKey, DenseSet<uint32_t>> dmaBdIdsMap;
 
   auto updateWithCurrBdId =
-      [&](bool canFold, DenseSet<AMDAIE::ConnectionOp> &connectionOps,
-          DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap,
-          DmaBdIdKey &currBdIdKey, uint32_t currBdIdVal) {
-        assert(currBdIdKey.first && "TileOp must not be null");
-        assert(currBdIdKey.second && "ConnectionOp must not be null");
+      [&](bool canFold, DmaBdIdPair &currBdIdPair,
+          DenseSet<AMDAIE::ConnectionOp> &connectionOps,
+          DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap) {
+        DmaBdIdKey currBdIdKey = currBdIdPair.first;
+        uint32_t currBdIdVal = currBdIdPair.second;
         if (!canFold) {
           // Clear the BD IDs for all the connections in the batch.
           for (auto &entry : dmaBdIdsMap) {
@@ -293,15 +305,19 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
           if (auto npuHalfDmaCpyNdOp =
                   dyn_cast_if_present<AMDAIE::NpuHalfDmaCpyNdOp>(
                       token.getDefiningOp())) {
-            DmaBdIdKey currBdIdKey = {nullptr, nullptr};
-            uint32_t currBdIdVal = 0;
-            FailureOr<bool> result =
+            // Retrieve the TileOp, ConnectionOp, and BD ID.
+            FailureOr<DmaBdIdPair> currBdIdPair =
+                retriveDmaBdIdPair(npuHalfDmaCpyNdOp);
+            if (failed(currBdIdPair)) return WalkResult::interrupt();
+            // Check if the current DMA wait op can be folded into the batch.
+            FailureOr<bool> canFold =
                 canFoldByBatch(batchParentOp, connectionOps, dmaBdIdsMap,
-                               currBdIdKey, currBdIdVal, npuHalfDmaCpyNdOp);
-            if (failed(result)) return WalkResult::interrupt();
-            toBatch &= *result;
-            updateWithCurrBdId(*result, connectionOps, dmaBdIdsMap, currBdIdKey,
-                               currBdIdVal);
+                               npuHalfDmaCpyNdOp, *currBdIdPair);
+            if (failed(canFold)) return WalkResult::interrupt();
+            // Update the `connectionOps` and `dmaBdIdsMap`.
+            updateWithCurrBdId(*canFold, *currBdIdPair, connectionOps,
+                               dmaBdIdsMap);
+            toBatch &= *canFold;
           }
         }
        // Process the previous batch of wait ops, and start a new batch.
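
To make the five folding rules above concrete, the following standalone sketch reimplements just that predicate outside the pass. It is illustrative only: DmaRecord and the integer scope/connection/tile IDs are hypothetical stand-ins for the Operation *, AMDAIE::ConnectionOp, and AMDAIE::TileOp handles used by the real canFoldByBatch, and none of these names exist in the pass.

// sketch_can_fold_by_batch.cpp -- illustrative only; not the pass itself.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>

// Hypothetical stand-in for one half DMA copy op: integer IDs replace the
// MLIR op handles used by the real pass.
struct DmaRecord {
  int scope;          // parent op of the half DMA copy
  int connection;     // connection the copy runs over
  int tile;           // tile owning the BD ID
  uint32_t bdId;      // BD ID value
  bool isPacketFlow;  // packet-switched flow?
};

// Mirrors the five "can't fold" rules from the PATCH 7 doc comment.
bool canFoldByBatch(int batchScope, const std::set<int> &connectionsInBatch,
                    const std::map<int, std::set<uint32_t>> &bdIdsPerTile,
                    const DmaRecord &curr) {
  if (curr.scope != batchScope) return false;                      // rule (1)
  if (connectionsInBatch.count(curr.connection) > 0) return false; // rule (2)
  if (connectionsInBatch.empty()) return false;                    // rule (3)
  if (curr.isPacketFlow) return false;                             // rule (4)
  auto it = bdIdsPerTile.find(curr.tile);                          // rule (5)
  if (it != bdIdsPerTile.end() && it->second.count(curr.bdId) > 0)
    return false;
  return true;
}

int main() {
  std::set<int> connections = {7};                       // batch holds connection 7
  std::map<int, std::set<uint32_t>> bdIds = {{0, {2}}};  // tile 0 already uses BD 2
  DmaRecord next{/*scope=*/0, /*connection=*/8, /*tile=*/0, /*bdId=*/3, false};
  std::cout << std::boolalpha << canFoldByBatch(0, connections, bdIds, next)
            << "\n";  // true: new connection, fresh BD ID
  next.bdId = 2;      // duplicate BD ID on tile 0
  std::cout << canFoldByBatch(0, connections, bdIds, next) << "\n";  // false
  return 0;
}

In the pass itself, a wait op whose tokens all satisfy these rules is folded into the batch, so each batch is eventually covered by a single remaining DMA wait op.
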
From f4e5f07def39fa43839701d6159c77e839d42877 Mon Sep 17 00:00:00 2001
From: Yu-Zhewen
Date: Wed, 18 Dec 2024 11:08:36 +0000
Subject: [PATCH 8/8] resolve comments

---
 .../iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
index 1446c55e3..8699ecf25 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEFoldDmaWaits.cpp
@@ -19,7 +19,9 @@ namespace {
 using DmaBdIdKey = std::pair<AMDAIE::TileOp, AMDAIE::ConnectionOp>;
 using DmaBdIdPair = std::pair<DmaBdIdKey, uint32_t>;
 
-FailureOr<DmaBdIdPair> retriveDmaBdIdPair(
+/// Utility function to retrieve TileOp, ConnectionOp, and BD ID from a given
+/// half DMA copy operation.
+FailureOr<DmaBdIdPair> retrieveDmaBdIdPair(
     AMDAIE::NpuHalfDmaCpyNdOp &npuHalfDmaCpyNdOp) {
   // Retrieve the connection op.
   std::optional<AMDAIE::ConnectionOp> maybeConnectionOp =
@@ -219,7 +221,7 @@ FailureOr<bool> canFoldByBatch(
     const Operation *batchParentOp,
     const DenseSet<AMDAIE::ConnectionOp> &connectionOps,
     const DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap,
-    AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp, DmaBdIdPair &currBdIdPair) {
+    AMDAIE::NpuHalfDmaCpyNdOp currHalfDmaCpyNdOp, DmaBdIdPair currBdIdPair) {
   // Not in the same scope? Can't fold.
   if (currHalfDmaCpyNdOp->getParentOp() != batchParentOp) return false;
 
@@ -277,7 +279,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
   DenseMap<DmaBdIdKey, DenseSet<uint32_t>> dmaBdIdsMap;
 
   auto updateWithCurrBdId =
-      [&](bool canFold, DmaBdIdPair &currBdIdPair,
+      [&](bool canFold, DmaBdIdPair currBdIdPair,
          DenseSet<AMDAIE::ConnectionOp> &connectionOps,
          DenseMap<DmaBdIdKey, DenseSet<uint32_t>> &dmaBdIdsMap) {
        DmaBdIdKey currBdIdKey = currBdIdPair.first;
@@ -307,7 +309,7 @@ LogicalResult foldDmaWaitsByBatch(AMDAIE::ControlCodeOp controlCodeOp) {
                      token.getDefiningOp())) {
            // Retrieve the TileOp, ConnectionOp, and BD ID.
            FailureOr<DmaBdIdPair> currBdIdPair =
-                retriveDmaBdIdPair(npuHalfDmaCpyNdOp);
+                retrieveDmaBdIdPair(npuHalfDmaCpyNdOp);
            if (failed(currBdIdPair)) return WalkResult::interrupt();
            // Check if the current DMA wait op can be folded into the batch.
            FailureOr<bool> canFold =
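
Besides the rename, PATCH 8 switches canFoldByBatch and updateWithCurrBdId from taking DmaBdIdPair by non-const reference to taking it by value. The sketch below illustrates a plausible rationale under stated assumptions: the placeholder pointer pair stands in for the real TileOp/ConnectionOp key, and bdIdValue is a hypothetical consumer, not a function from the pass.

// sketch_pair_by_value.cpp -- illustrative only.
#include <cstdint>
#include <utility>

// Placeholder key: the real DmaBdIdKey pairs TileOp with ConnectionOp.
using DmaBdIdKey = std::pair<const void *, const void *>;
using DmaBdIdPair = std::pair<DmaBdIdKey, uint32_t>;

// A read-only consumer can take the pair by value: copying two pointers and
// a uint32_t costs about the same as passing a reference, and the signature
// now documents that the callee never mutates the caller's pair (the
// by-reference version bound a mutable reference it never wrote through).
uint32_t bdIdValue(DmaBdIdPair currBdIdPair) { return currBdIdPair.second; }

int main() {
  DmaBdIdPair p{{nullptr, nullptr}, 4};
  return bdIdValue(p) == 4 ? 0 : 1;  // exits 0
}
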