[LoweringStrategy] Refactor to take num of rows/cols as inputs #955

Merged · 5 commits · Dec 5, 2024
69 changes: 41 additions & 28 deletions build_tools/ci/cpu_comparison/run.py
@@ -289,33 +289,6 @@ def generate(self, config, template_name):
)


class MatmulFullBias(BaseMatmul):
"""
A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN
"""

def __init__(self, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]):
super().__init__(
run_on_target=run_on_target,
aie_compilation_flags=None,
M=M,
N=N,
K=K,
input_type=input_type,
acc_type=acc_type,
lower_to_aie_pipeline="air",
)
self.labels.append("MatmulFullBias")
self.name = f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}"

def _execute(self, config):
matmul_template_dir = config.file_dir / "matmul_template"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir"
self.generate(config, template_name)
self.vs_cpu(config)
return True


class Matmul(BaseMatmul):
"""
A test of the form matmul(A,B) where A:MxK, B:KxN
@@ -467,9 +440,50 @@ def _execute(self, config):
matmul_template_dir = config.file_dir / "matmul_template"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_N.mlir"
self.generate(config, template_name)
self.add_aie_compilation_flags(
[
"--iree-amdaie-matmul-elementwise-fusion",
"--iree-amdaie-num-rows=2",
Contributor:
Doesn't this change the default for all matmuls from 4x4 to 2x2? Maybe not what we want?

I'd be interested to know how many of the tests

for n_rows in [1,4]:
  for n_cols in [1,2,3,4]:
    # run matmul with these values (on npu1_4col for, say, M=N=K=3*256).

work (see the sketch after this method).

@jtuyls (Collaborator), Dec 4, 2024:
It's being shown in a confusing way here, but these changes are actually in MatmulThinBias, not Matmul.

Contributor Author:
Yes, it's added to the MatmulThinBias class and used for matmul-elementwise tests in AIR.

Contributor:
Ha, I totally missed that...

"--iree-amdaie-num-cols=2",
]
)
return self.vs_cpu(config)
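
Picking up the reviewer's sweep suggestion above, here is a minimal sketch of how it might look in this test harness. It assumes a Matmul constructor taking (M, N, K, input_type, acc_type) like MatmulFullBias below, and that add_aie_compilation_flags, vs_cpu, and config behave as in this file; none of this is part of the PR itself.

# Sketch only: sweep the AIE array shape for one matmul size, using the
# --iree-amdaie-num-rows/--iree-amdaie-num-cols flags introduced in this PR.
# The exact Matmul constructor signature is an assumption.
for n_rows in [1, 4]:
    for n_cols in [1, 2, 3, 4]:
        test = Matmul(768, 768, 768, "bf16", "f32")  # M = N = K = 3 * 256
        test.add_aie_compilation_flags(
            [
                f"--iree-amdaie-num-rows={n_rows}",
                f"--iree-amdaie-num-cols={n_cols}",
            ]
        )
        test.vs_cpu(config)  # compare the AIE result against a CPU reference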


class MatmulFullBias(BaseMatmul):
"""
A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:MxN
"""

def __init__(self, M, N, K, input_type, acc_type, run_on_target=["npu1_4col"]):
super().__init__(
run_on_target=run_on_target,
aie_compilation_flags=None,
M=M,
N=N,
K=K,
input_type=input_type,
acc_type=acc_type,
lower_to_aie_pipeline="air",
)
self.labels.append("MatmulFullBias")
self.name = f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}"

def _execute(self, config):
matmul_template_dir = config.file_dir / "matmul_template"
template_name = matmul_template_dir / "matmul_bias_MxK_KxN_MxN.mlir"
self.generate(config, template_name)
self.add_aie_compilation_flags(
[
"--iree-amdaie-matmul-elementwise-fusion",
"--iree-amdaie-num-rows=2",
"--iree-amdaie-num-cols=2",
]
)
self.vs_cpu(config)
return True


class BatchMatmul(BaseMatmul):
"""
A test of the form batch_matmul(A,B) where A:BxMxK, B:BxKxN
@@ -679,7 +693,6 @@ def generate_aie_vmfb(
f"--iree-amdaie-target-device={config.target_device}",
f"--iree-amdaie-tile-pipeline={tile_pipeline}",
f"--iree-amdaie-lower-to-aie-pipeline={lower_to_aie_pipeline}",
"--iree-amdaie-matmul-elementwise-fusion",
f"--iree-amd-aie-peano-install-dir={config.peano_dir}",
f"--iree-amd-aie-install-dir={config.iree_dir}",
f"--iree-amd-aie-vitis-install-dir={config.vitis_dir}",
compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
@@ -150,6 +150,27 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend {
Builder b(context);
SmallVector<NamedAttribute> configItems;

  // Make sure the input number of rows/cols is smaller than or equal to the
  // max number of rows/cols of the device.
AMDAIEDeviceModel deviceModel =
AMDAIE::getDeviceModel(options.AMDAIETargetDevice);
uint32_t maxCoreRows = deviceModel.getNumCoreRows();
uint32_t maxCoreCols = deviceModel.getNumCoreCols();
if (options.AMDAIENumRows <= 0 || options.AMDAIENumRows > maxCoreRows) {
llvm::report_fatal_error(llvm::Twine("Invalid number of core rows (") +
std::to_string(options.AMDAIENumRows) +
"), must be in the range [1, " +
std::to_string(maxCoreRows) + "] for device " +
stringifyEnum(deviceModel.device));
}
if (options.AMDAIENumCols <= 0 || options.AMDAIENumCols > maxCoreCols) {
llvm::report_fatal_error(llvm::Twine("Invalid number of core cols (") +
std::to_string(options.AMDAIENumCols) +
"), must be in the range [1, " +
std::to_string(maxCoreCols) + "] for device " +
stringifyEnum(deviceModel.device));
}

// Add some configurations to the `hal.executable.target` attribute.
auto addConfig = [&](StringRef name, Attribute value) {
configItems.emplace_back(StringAttr::get(context, name), value);
@@ -161,6 +182,11 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend {
// Set microkernel enabling flag.
addConfig("ukernels",
StringAttr::get(context, options.enableAMDAIEUkernels));
// Set number of rows/cols used in an AIE array.
addConfig("num_rows", IntegerAttr::get(IntegerType::get(context, 32),
options.AMDAIENumRows));
addConfig("num_cols", IntegerAttr::get(IntegerType::get(context, 32),
options.AMDAIENumCols));
auto configAttr = b.getDictionaryAttr(configItems);

switch (options.deviceHal) {
@@ -204,7 +230,8 @@
void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr,
OpPassManager &passManager) override {
buildAMDAIETransformPassPipeline(
passManager, options.AMDAIETargetDevice, options.useTilePipeline,
passManager, options.AMDAIETargetDevice, options.AMDAIENumRows,
options.AMDAIENumCols, options.useTilePipeline,
options.useLowerToAIEPipeline, options.matmulElementwiseFusion,
options.enableVectorizationPasses, options.pathToUkernels,
options.enablePacketFlow, options.enableCoalescingLoops,
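
As a hedged illustration of the semantics of the new checks in this backend (not code from this PR), the validation amounts to a simple range check against the device model's core-array size:

def validate_array_shape(num_rows, num_cols, max_rows, max_cols):
    # Mirrors the report_fatal_error checks added above: the requested
    # number of core rows/cols must lie in [1, device maximum].
    if not 1 <= num_rows <= max_rows:
        raise ValueError(f"Invalid number of core rows ({num_rows}), "
                         f"must be in the range [1, {max_rows}]")
    if not 1 <= num_cols <= max_cols:
        raise ValueError(f"Invalid number of core cols ({num_cols}), "
                         f"must be in the range [1, {max_cols}]")

# npu1_4col reports a 4x4 core array (see the lit test defaults below), so:
validate_array_shape(2, 2, 4, 4)    # passes
# validate_array_shape(5, 2, 4, 4)  # would raise ValueError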
19 changes: 19 additions & 0 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h
@@ -10,6 +10,7 @@
#include <string>

#include "iree-amd-aie/Transforms/KernelDispatch.h"
#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
#include "iree/compiler/Dialect/HAL/Target/TargetBackend.h"
#include "iree/compiler/Dialect/HAL/Target/TargetDevice.h"
#include "iree/compiler/Utils/OptionUtils.h"
@@ -59,6 +60,8 @@ struct AMDAIEOptions {
bool insertLoopAroundCoreBlock{false};
bool matmulElementwiseFusion{false};
AMDAIEDevice AMDAIETargetDevice{AMDAIEDevice::npu1_4col};
unsigned AMDAIENumRows{getDeviceModel(AMDAIETargetDevice).getNumCoreRows()};
unsigned AMDAIENumCols{getDeviceModel(AMDAIETargetDevice).getNumCoreCols()};
std::string enableAMDAIEUkernels{"none"};
bool enablePacketFlow{false};

@@ -231,6 +234,22 @@ struct AMDAIEOptions {
clEnumValN(AMDAIEDevice::npu4, "npu4",
"Strix B0 NPU with 8 columns and 6 rows")));

binder.opt<unsigned>(
"iree-amdaie-num-rows", AMDAIENumRows, llvm::cl::cat(category),
llvm::cl::desc(
"Number of rows used in an AIE core array. The compiler will "
"choose a tiling strategy that uses no more than this number of "
"rows. However, some workloads (like convolution) currently ignore "
"this flag, and use a hardcoded number of rows."));

binder.opt<unsigned>(
"iree-amdaie-num-cols", AMDAIENumCols, llvm::cl::cat(category),
llvm::cl::desc(
"Number of columns used in an AIE core array. The compiler will "
"choose a tiling strategy that uses no more than this number of "
"columns. However, some workloads (like convolution) currently "
"ignore this flag, and use a hardcoded number of cols."));

binder.opt<bool>("iree-amdaie-enable-packet-flow", enablePacketFlow,
llvm::cl::cat(category),
llvm::cl::desc("Enable packet routing data movement."));
@@ -1,8 +1,10 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets %s | FileCheck %s --check-prefix=DEFAULT
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets --iree-amdaie-enable-ukernels=all %s | FileCheck %s --check-prefix=ENABLE_UKERNEL
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets --iree-amdaie-num-rows=2 --iree-amdaie-num-cols=2 %s | FileCheck %s --check-prefix=NUM_ROWS_COLS

// DEFAULT: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {target_device = "npu1_4col", ukernels = "none"}>) {
// ENABLE_UKERNEL: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {target_device = "npu1_4col", ukernels = "all"}>) {
// DEFAULT: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {num_cols = 4 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>) {
// ENABLE_UKERNEL: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {num_cols = 4 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "all"}>) {
// NUM_ROWS_COLS: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 2 : i32, target_device = "npu1_4col", ukernels = "none"}>) {
func.func @matmul_small(%lhs : tensor<16x16xi32>,
%rhs : tensor<16x32xi32>) -> tensor<16x32xi32> {
%empty = tensor.empty() : tensor<16x32xi32>
@@ -1,4 +1,4 @@
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-sources %s | iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-translate-target-executable-variants{target=amd-aie})))" --iree-amdaie-lower-to-aie-pipeline=air --iree-amdaie-tile-pipeline=pack-peel --iree-amdaie-matmul-elementwise-fusion --split-input-file | FileCheck %s
// RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets --iree-amdaie-num-rows=2 --iree-amdaie-num-cols=2 --iree-amdaie-lower-to-aie-pipeline=air --iree-amdaie-tile-pipeline=pack-peel --iree-amdaie-matmul-elementwise-fusion --split-input-file %s | FileCheck %s

func.func @matmul_elementwise_i32(%lhs: tensor<1024x512xi32>, %rhs: tensor<512x1024xi32>, %ele: tensor<1024x1024xi32>) -> tensor<1024x1024xi32>
{
@@ -49,6 +49,7 @@ func.func @matmul_elementwise_bf16_f32(%arg0: tensor<1024x512xbf16>, %arg1: tens
// CHECK-COUNT-3: aie.shim_dma_allocation

// -----

func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x16384xbf16>, %arg2: tensor<512xf32>) -> tensor<512x16384xbf16> {
%cst = arith.constant 0.000000e+00 : f32
%7 = tensor.empty() : tensor<512x16384xbf16>
@@ -52,7 +52,7 @@ void AMDAIELoweringStrategyPass::runOnOperation() {

// Detect unsupported pipeline combinations.
{
bool padPack = usePassPipeline == TilePassPipeline::PadPackPipeline;
bool padPack = useTilePipeline == TilePassPipeline::PadPackPipeline;
bool objectFifo =
useLowerToAIEPipeline == LowerToAIEPassPipeline::ObjectFifo;
if (padPack && objectFifo) {
@@ -66,8 +66,9 @@ void AMDAIELoweringStrategyPass::runOnOperation() {

for (auto funcOp : moduleOp.getOps<FunctionOpInterface>()) {
// Set the strategy with default heuristics.
if (failed(initAIELaunchConfig(funcOp, usePassPipeline,
useLowerToAIEPipeline, targetDevice))) {
if (failed(initAIELaunchConfig(funcOp, useTilePipeline,
useLowerToAIEPipeline, targetDevice, numRows,
numCols))) {
funcOp.emitOpError("failed to have a lowering configuration set for it.");
return signalPassFailure();
}
@@ -107,15 +107,15 @@ void AMDAIELowerExecutableTargetPass::runOnOperation() {
return;
case IREE::Codegen::DispatchLoweringPassPipeline::Custom: {
TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp);
if (usePassPipeline == TilePassPipeline::PackPeelPipeline) {
if (useTilePipeline == TilePassPipeline::PackPeelPipeline) {
addPackPeelBasedPassPipeline(executableLoweringPipeline, tilingConfig,
pathToUkernels, enableVectorizationPasses,
TilePassPipeline::PackPeelPipeline);
} else if (usePassPipeline == TilePassPipeline::PadPackPipeline) {
} else if (useTilePipeline == TilePassPipeline::PadPackPipeline) {
addPadPackBasedPassPipeline(executableLoweringPipeline, tilingConfig,
pathToUkernels, enableVectorizationPasses,
TilePassPipeline::PadPackPipeline);
} else if (usePassPipeline == TilePassPipeline::ConvDecomposePipeline) {
} else if (useTilePipeline == TilePassPipeline::ConvDecomposePipeline) {
addConvDecomposePassPipeline(executableLoweringPipeline, tilingConfig,
enableVectorizationPasses,
TilePassPipeline::ConvDecomposePipeline);