[LoweringStrategy] Refactor to take num of rows/cols as inputs #955

Merged · 5 commits · Dec 5, 2024
69 changes: 41 additions & 28 deletions build_tools/ci/cpu_comparison/run.py
@@ -289,33 +289,6 @@ def generate(self, config, template_name):
)


class MatmulFullBias(BaseMatmul):
"""
A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN
"""

def __init__(self, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]):
super().__init__(
run_on_target=run_on_target,
aie_compilation_flags=None,
M=M,
N=N,
K=K,
input_type=input_type,
acc_type=acc_type,
lower_to_aie_pipeline="air",
)
self.labels.append("MatmulFullBias")
self.name = f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}"

def _execute(self, config):
matmul_template_dir = config.file_dir / "matmul_template"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir"
self.generate(config, template_name)
self.vs_cpu(config)
return True


class Matmul(BaseMatmul):
"""
A test of the form matmul(A,B) where A:MxK, B:KxN
@@ -467,9 +440,50 @@ def _execute(self, config):
matmul_template_dir = config.file_dir / "matmul_template"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_N.mlir"
self.generate(config, template_name)
self.add_aie_compilation_flags(
[
"--iree-amdaie-matmul-elementwise-fusion",
"--iree-amdaie-num-rows=2",
Contributor:
Doesn't this change the default for all matmuls from 4x4 to 2x2? Maybe not what we want?

I'd be interested to know how many of the tests

for n_rows in [1,4]:
  for n_cols in [1,2,3,4]:
    # run matmul with these values (on npu1_4col for, say, M=N=K=3*256).

work (see the sketch after this method).

@jtuyls (Collaborator), Dec 4, 2024:
It's being shown in a confusing way here, but these changes are actually in MatmulThinBias, not Matmul.

Contributor Author:
Yes, it's added to the MatmulThinBias class and used for matmul-elementwise tests in AIR.

Contributor:
Ha, I totally missed that...

"--iree-amdaie-num-cols=2",
]
)
return self.vs_cpu(config)
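
Picking up the reviewer's sweep suggestion above, here is a minimal sketch of how it might look in this test harness. It assumes a Matmul constructor taking (M, N, K, input_type, acc_type) like MatmulFullBias below, and that add_aie_compilation_flags, vs_cpu, and config behave as in this file; none of this is part of the PR itself.

# Sketch only: sweep the AIE array shape for one matmul size, using the
# --iree-amdaie-num-rows/--iree-amdaie-num-cols flags introduced in this PR.
# The exact Matmul constructor signature is an assumption.
for n_rows in [1, 4]:
    for n_cols in [1, 2, 3, 4]:
        test = Matmul(768, 768, 768, "bf16", "f32")  # M = N = K = 3 * 256
        test.add_aie_compilation_flags(
            [
                f"--iree-amdaie-num-rows={n_rows}",
                f"--iree-amdaie-num-cols={n_cols}",
            ]
        )
        test.vs_cpu(config)  # compare the AIE result against a CPU reference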


class MatmulFullBias(BaseMatmul):
"""
A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN
"""

def __init__(self, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]):
super().__init__(
run_on_target=run_on_target,
aie_compilation_flags=None,
M=M,
N=N,
K=K,
input_type=input_type,
acc_type=acc_type,
lower_to_aie_pipeline="air",
)
self.labels.append("MatmulFullBias")
self.name = f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}"

def _execute(self, config):
matmul_template_dir = config.file_dir / "matmul_template"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir"
self.generate(config, template_name)
self.add_aie_compilation_flags(
[
"--iree-amdaie-matmul-elementwise-fusion",
"--iree-amdaie-num-rows=2",
"--iree-amdaie-num-cols=2",
]
)
self.vs_cpu(config)
return True


class BatchMatmul(BaseMatmul):
"""
A test of the form batch_matmul(A,B) where A:BxMxK, B:BxKxN
@@ -679,7 +693,6 @@ def generate_aie_vmfb(
f"--iree-amdaie-target-device={config.target_device}",
f"--iree-amdaie-tile-pipeline={tile_pipeline}",
f"--iree-amdaie-lower-to-aie-pipeline={lower_to_aie_pipeline}",
"--iree-amdaie-matmul-elementwise-fusion",
f"--iree-amd-aie-peano-install-dir={config.peano_dir}",
f"--iree-amd-aie-install-dir={config.iree_dir}",
f"--iree-amd-aie-vitis-install-dir={config.vitis_dir}",
compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
@@ -150,6 +150,27 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend {
Builder b(context);
SmallVector<NamedAttribute> configItems;

  // Make sure the input number of rows/cols is smaller than or equal to the
  // max number of rows/cols of the device.
AMDAIEDeviceModel deviceModel =
AMDAIE::getDeviceModel(options.AMDAIETargetDevice);
uint32_t maxCoreRows = deviceModel.getNumCoreRows();
uint32_t maxCoreCols = deviceModel.getNumCoreCols();
if (options.AMDAIENumRows <= 0 || options.AMDAIENumRows > maxCoreRows) {
llvm::report_fatal_error(llvm::Twine("Invalid number of core rows (") +
std::to_string(options.AMDAIENumRows) +
"), must be in the range [1, " +
std::to_string(maxCoreRows) + "] for device " +
stringifyEnum(deviceModel.device));
}
if (options.AMDAIENumCols <= 0 || options.AMDAIENumCols > maxCoreCols) {
llvm::report_fatal_error(llvm::Twine("Invalid number of core cols (") +
std::to_string(options.AMDAIENumCols) +
"), must be in the range [1, " +
std::to_string(maxCoreCols) + "] for device " +
stringifyEnum(deviceModel.device));
}

// Add some configurations to the `hal.executable.target` attribute.
auto addConfig = [&](StringRef name, Attribute value) {
configItems.emplace_back(StringAttr::get(context, name), value);
@@ -161,6 +182,11 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend {
// Set microkernel enabling flag.
addConfig("ukernels",
StringAttr::get(context, options.enableAMDAIEUkernels));
// Set number of rows/cols used in an AIE array.
addConfig("num_rows", IntegerAttr::get(IntegerType::get(context, 32),
options.AMDAIENumRows));
addConfig("num_cols", IntegerAttr::get(IntegerType::get(context, 32),
options.AMDAIENumCols));
auto configAttr = b.getDictionaryAttr(configItems);

switch (options.deviceHal) {
@@ -204,7 +230,8 @@
void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr,
OpPassManager &passManager) override {
buildAMDAIETransformPassPipeline(
passManager, options.AMDAIETargetDevice, options.useTilePipeline,
passManager, options.AMDAIETargetDevice, options.AMDAIENumRows,
options.AMDAIENumCols, options.useTilePipeline,
options.useLowerToAIEPipeline, options.matmulElementwiseFusion,
options.enableVectorizationPasses, options.pathToUkernels,
options.enablePacketFlow, options.enableCoalescingLoops,
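
As a hedged illustration of the semantics of the new checks in this backend (not code from this PR), the validation amounts to a simple range check against the device model's core-array size:

def validate_array_shape(num_rows, num_cols, max_rows, max_cols):
    # Mirrors the report_fatal_error checks added above: the requested
    # number of core rows/cols must lie in [1, device maximum].
    if not 1 <= num_rows <= max_rows:
        raise ValueError(f"Invalid number of core rows ({num_rows}), "
                         f"must be in the range [1, {max_rows}]")
    if not 1 <= num_cols <= max_cols:
        raise ValueError(f"Invalid number of core cols ({num_cols}), "
                         f"must be in the range [1, {max_cols}]")

# npu1_4col reports a 4x4 core array (see the lit test defaults below), so:
validate_array_shape(2, 2, 4, 4)    # passes
# validate_array_shape(5, 2, 4, 4)  # would raise ValueError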
19 changes: 19 additions & 0 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h
@@ -10,6 +10,7 @@
#include <string>

#include "iree-amd-aie/Transforms/KernelDispatch.h"
#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
#include "iree/compiler/Dialect/HAL/Target/TargetBackend.h"
#include "iree/compiler/Dialect/HAL/Target/TargetDevice.h"
#include "iree/compiler/Utils/OptionUtils.h"
@@ -59,6 +60,8 @@ struct AMDAIEOptions {
bool insertLoopAroundCoreBlock{false};
bool matmulElementwiseFusion{false};
AMDAIEDevice AMDAIETargetDevice{AMDAIEDevice::npu1_4col};
unsigned AMDAIENumRows{getDeviceModel(AMDAIETargetDevice).getNumCoreRows()};
unsigned AMDAIENumCols{getDeviceModel(AMDAIETargetDevice).getNumCoreCols()};
std::string enableAMDAIEUkernels{"none"};
bool enablePacketFlow{false};

@@ -231,6 +234,22 @@ struct AMDAIEOptions {
clEnumValN(AMDAIEDevice::npu4, "npu4",
"Strix B0 NPU with 8 columns and 6 rows")));

binder.opt<unsigned>(
"iree-amdaie-num-rows", AMDAIENumRows, llvm::cl::cat(category),
llvm::cl::desc(
"Number of rows used in an AIE core array. The compiler will "
"choose a tiling strategy that uses no more than this number of "
"rows. However, some workloads (like convolution) currently ignore "
"this flag, and use a hardcoded number of rows."));

binder.opt<unsigned>(
"iree-amdaie-num-cols", AMDAIENumCols, llvm::cl::cat(category),
llvm::cl::desc(
"Number of columns used in an AIE core array. The compiler will "
"choose a tiling strategy that uses no more than this number of "
"columns. However, some workloads (like convolution) currently "
"ignore this flag, and use a hardcoded number of cols."));

binder.opt<bool>("iree-amdaie-enable-packet-flow", enablePacketFlow,
llvm::cl::cat(category),
llvm::cl::desc("Enable packet routing data movement."));
@@ -1,8 +1,10 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets %s | FileCheck %s --check-prefix=DEFAULT
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets --iree-amdaie-enable-ukernels=all %s | FileCheck %s --check-prefix=ENABLE_UKERNEL
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets --iree-amdaie-num-rows=2 --iree-amdaie-num-cols=2 %s | FileCheck %s --check-prefix=NUM_ROWS_COLS

// DEFAULT: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {target_device = "npu1_4col", ukernels = "none"}>) {
// ENABLE_UKERNEL: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {target_device = "npu1_4col", ukernels = "all"}>) {
// DEFAULT: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {num_cols = 4 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>) {
// ENABLE_UKERNEL: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {num_cols = 4 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "all"}>) {
// NUM_ROWS_COLS: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 2 : i32, target_device = "npu1_4col", ukernels = "none"}>) {
func.func @matmul_small(%lhs : tensor<16x16xi32>,
%rhs : tensor<16x32xi32>) -> tensor<16x32xi32> {
%empty = tensor.empty() : tensor<16x32xi32>
@@ -1,4 +1,4 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=air --iree-amdaie-tile-pipeline=pack-peel --iree-amdaie-matmul-elementwise-fusion --split-input-file | FileCheck %s
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets --iree-amdaie-num-rows=2 --iree-amdaie-num-cols=2 --iree-amdaie-lower-to-aie-pipeline=air --iree-amdaie-tile-pipeline=pack-peel --iree-amdaie-matmul-elementwise-fusion --split-input-file %s | FileCheck %s

func.func @matmul_elementwise_i32(%lhs: tensor<1024x512xi32>, %rhs: tensor<512x1024xi32>, %ele: tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
{
@@ -49,6 +49,7 @@ func.func @matmul_elementwise_bf16_f32(%arg0: tensor<1024x512xbf16>, %arg1: tens
// CHECK-COUNT-3: aie.shim_dma_allocation

// -----

func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x16384xbf16>, %arg2: tensor<512xf32>) -> tensor<512x16384xbf16> {
%cst = arith.constant 0.000000e+00 : f32
%7 = tensor.empty() : tensor<512x16384xbf16>
@@ -52,7 +52,7 @@ void AMDAIELoweringStrategyPass::runOnOperation() {

// Detect unsupported pipeline combinations.
{
bool padPack = usePassPipeline == TilePassPipeline::PadPackPipeline;
bool padPack = useTilePipeline == TilePassPipeline::PadPackPipeline;
bool objectFifo =
useLowerToAIEPipeline == LowerToAIEPassPipeline::ObjectFifo;
if (padPack && objectFifo) {
@@ -66,8 +66,9 @@ void AMDAIELoweringStrategyPass::runOnOperation() {

for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
// Set the strategy with default heuristics.
if (failed(initAIELaunchConfig(funcOp, usePassPipeline,
useLowerToAIEPipeline, targetDevice))) {
if (failed(initAIELaunchConfig(funcOp, useTilePipeline,
useLowerToAIEPipeline, targetDevice, numRows,
numCols))) {
funcOp.emitOpError("failed to have a lowering configuration set for it.");
return signalPassFailure();
}
@@ -107,15 +107,15 @@ void AMDAIELowerExecutableTargetPass::runOnOperation() {
return;
case IREE::Codegen::DispatchLoweringPassPipeline::Custom: {
TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp);
if (usePassPipeline == TilePassPipeline::PackPeelPipeline) {
if (useTilePipeline == TilePassPipeline::PackPeelPipeline) {
addPackPeelBasedPassPipeline(executableLoweringPipeline, tilingConfig,
pathToUkernels, enableVectorizationPasses,
TilePassPipeline::PackPeelPipeline);
} else if (usePassPipeline == TilePassPipeline::PadPackPipeline) {
} else if (useTilePipeline == TilePassPipeline::PadPackPipeline) {
addPadPackBasedPassPipeline(executableLoweringPipeline, tilingConfig,
pathToUkernels, enableVectorizationPasses,
TilePassPipeline::PadPackPipeline);
} else if (usePassPipeline == TilePassPipeline::ConvDecomposePipeline) {
} else if (useTilePipeline == TilePassPipeline::ConvDecomposePipeline) {
addConvDecomposePassPipeline(executableLoweringPipeline, tilingConfig,
enableVectorizationPasses,
TilePassPipeline::ConvDecomposePipeline);