From dbd7975c466db0521986c216954175ec3635d986 Mon Sep 17 00:00:00 2001 From: Stanley Winata Date: Wed, 13 Dec 2023 02:43:16 -0600 Subject: [PATCH] [LLVMGPU] Add a slowpath if expected to overflow gridDim. For really large models such as llama_70b it may overflow the number of blocks/ gridDim because parallel dims is always tiled to 1. Add some heuristic to use slowpath if compute is expected to overflow max grid dim. Context length can be set/tuned by: --iree-codegen-llvmgpu-context-length=512 512 is the default. This should be a temporary or at most complementary fix. The real fix should be implementing better distribution for parallel dims, and/or adding a specialization kernel/dispatch that can pick slowpath vs fast path depending on the size of dynamic dims during runtime. --- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 37 ++++++++++++++++++- .../iree/compiler/Codegen/LLVMGPU/Passes.cpp | 3 +- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 577c248cc09e..f28a20105500 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -48,6 +48,13 @@ llvm::cl::opt<bool> llvm::cl::desc("force use mma sync instead of wmma ops"), llvm::cl::init(false)); +/// Flag used to toggle expected context length. +/// Note: With system prompt we start with ~141 tokens. 
+llvm::cl::opt<int> clGPUContextLength( + "iree-codegen-llvmgpu-context-length", + llvm::cl::desc("Sets expected context length to prevent overflow"), + llvm::cl::init(512)); + namespace { constexpr StringLiteral kCudaTarget = "cuda"; @@ -299,7 +306,33 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint, staticNonUnitParallelDimCount += bounds[nDim] != 1 && !ShapedType::isDynamic(bounds[nDim]); } - if (staticNonUnitParallelDimCount <= 1) + + // Since all parallel dims are tiled to 1 along the grid, there are + // cases where we overflow the gridDim. We add a temporary heuristic + // use slow path if context length and MaxGridDim may cause overflow. + // TODO: This is conservative and temporary solution, we'd need to + // add better distribution of parallel dims. + bool mayOverflowGridinWarpReduc = false; + { + SmallVector<unsigned> parallelDims; + op.getParallelDims(parallelDims); + std::optional<int64_t> parallelSize = 1; + const int kMaxGridDim = 4190000; + bool hasDynParallelDim = false; + for (int64_t dim : parallelDims) { + if (ShapedType::isDynamic(bounds[dim])) { + hasDynParallelDim = true; + continue; + } + *parallelSize *= bounds[dim]; + } + if (parallelSize && hasDynParallelDim && + *parallelSize * clGPUContextLength >= kMaxGridDim) { + mayOverflowGridinWarpReduc = true; + } + } + + if (staticNonUnitParallelDimCount <= 1 && !mayOverflowGridinWarpReduc) return failure(); // Don't consider operations that don't have a broadcast, those should go @@ -311,7 +344,7 @@ static LogicalResult setContractConfig(func::FuncOp entryPoint, // TODO: Properly rematerialize leading elementwise with shared memory // promotion. 
- if (hasFusedLeadingOp(op)) { + if (hasFusedLeadingOp(op) && !mayOverflowGridinWarpReduc) { return failure(); } diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 8271580b58a1..45385e21faa7 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -163,7 +163,8 @@ void addGPUVectorizationPassPipeline(OpPassManager &pm) { void addGPUMatmulSimtPassPipeline(OpPassManager &pm) { tileAndDistributeToWorkgroup(pm); auto &nestedModulePM = pm.nest<ModuleOp>(); - + nestedModulePM.addNestedPass<func::FuncOp>( + createRematerializeParallelOpsPass()); nestedModulePM.addPass(createCanonicalizerPass()); nestedModulePM.addNestedPass<func::FuncOp>( createWorkgroupSpecializationPass());