Skip to content

Commit fd26708

Browse files
authored
[VPlan] Refactor VPlan creation, add transform introducing region (NFC). (#128419)
Create an empty VPlan first, then let the HCFG builder create a plain CFG for the top-level loop (w/o a top-level region). The top-level region is introduced by a separate VPlan-transform. This is instead of creating the vector loop region before building the VPlan CFG for the input loop. This simplifies the HCFG builder (which should probably be renamed) and moves along the roadmap ('buildLoop') outlined in [1]. As follow-up, I plan to also preserve the exit branches in the initial VPlan out of the CFG builder, including connections to the exit blocks. The conversion from plain CFG with potentially multiple exits to a single entry/exit region will be done as VPlan transform in a follow-up. This is needed to enable VPlan-based predication. Currently early exit support relies on building the block-in masks on the original CFG, because exiting branches and conditions aren't preserved in the VPlan. So in order to switch to VPlan-based predication, we will have to preserve them in the initial plain CFG, so the exit conditions are available explicitly when we convert to single entry/exit regions. Another follow-up is updating the outer loop handling to also introduce VPRegionBlocks for nested loops as transform. Currently the existing logic in the builder will take care of creating VPRegionBlocks for nested loops, but not the top-level loop. [1] https://llvm.org/devmtg/2023-10/slides/techtalks/Hahn-VPlan-StatusUpdateAndRoadmap.pdf PR: #128419
1 parent f3dd9c9 commit fd26708

File tree

10 files changed

+193
-207
lines changed

10 files changed

+193
-207
lines changed

llvm/lib/Transforms/Vectorize/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ add_llvm_component_library(LLVMVectorize
2323
VectorCombine.cpp
2424
VPlan.cpp
2525
VPlanAnalysis.cpp
26+
VPlanConstruction.cpp
2627
VPlanHCFGBuilder.cpp
2728
VPlanRecipes.cpp
2829
VPlanSLP.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+11-7
Original file line numberDiff line numberDiff line change
@@ -9312,14 +9312,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93129312
return !CM.requiresScalarEpilogue(VF.isVector());
93139313
},
93149314
Range);
9315-
VPlanPtr Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(),
9316-
PSE, RequiresScalarEpilogueCheck,
9317-
CM.foldTailByMasking(), OrigLoop);
9318-
9315+
auto Plan = std::make_unique<VPlan>(OrigLoop);
93199316
// Build hierarchical CFG.
9317+
// Convert to VPlan-transform and consolidate all transforms for VPlan
9318+
// creation.
93209319
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
93219320
HCFGBuilder.buildHierarchicalCFG();
93229321

9322+
VPlanTransforms::introduceTopLevelVectorLoopRegion(
9323+
*Plan, Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
9324+
CM.foldTailByMasking(), OrigLoop);
9325+
93239326
// Don't use getDecisionAndClampRange here, because we don't know the UF
93249327
// so this function is better to be conservative, rather than to split
93259328
// it up into different VPlans.
@@ -9615,13 +9618,14 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
96159618
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
96169619

96179620
// Create new empty VPlan
9618-
auto Plan = VPlan::createInitialVPlan(Legal->getWidestInductionType(), PSE,
9619-
true, false, OrigLoop);
9620-
9621+
auto Plan = std::make_unique<VPlan>(OrigLoop);
96219622
// Build hierarchical CFG
96229623
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
96239624
HCFGBuilder.buildHierarchicalCFG();
96249625

9626+
VPlanTransforms::introduceTopLevelVectorLoopRegion(
9627+
*Plan, Legal->getWidestInductionType(), PSE, true, false, OrigLoop);
9628+
96259629
for (ElementCount VF : Range)
96269630
Plan->addVF(VF);
96279631

llvm/lib/Transforms/Vectorize/VPlan.cpp

+7-84
Original file line numberDiff line numberDiff line change
@@ -880,85 +880,6 @@ VPlan::~VPlan() {
880880
delete BackedgeTakenCount;
881881
}
882882

883-
VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
884-
PredicatedScalarEvolution &PSE,
885-
bool RequiresScalarEpilogueCheck,
886-
bool TailFolded, Loop *TheLoop) {
887-
auto Plan = std::make_unique<VPlan>(TheLoop);
888-
VPBlockBase *ScalarHeader = Plan->getScalarHeader();
889-
890-
// Connect entry only to vector preheader initially. Entry will also be
891-
// connected to the scalar preheader later, during skeleton creation when
892-
// runtime guards are added as needed. Note that when executing the VPlan for
893-
// an epilogue vector loop, the original entry block here will be replaced by
894-
// a new VPIRBasicBlock wrapping the entry to the epilogue vector loop after
895-
// generating code for the main vector loop.
896-
VPBasicBlock *VecPreheader = Plan->createVPBasicBlock("vector.ph");
897-
VPBlockUtils::connectBlocks(Plan->getEntry(), VecPreheader);
898-
899-
// Create SCEV and VPValue for the trip count.
900-
// We use the symbolic max backedge-taken-count, which works also when
901-
// vectorizing loops with uncountable early exits.
902-
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
903-
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
904-
"Invalid loop count");
905-
ScalarEvolution &SE = *PSE.getSE();
906-
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
907-
InductionTy, TheLoop);
908-
Plan->TripCount =
909-
vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE);
910-
911-
// Create VPRegionBlock, with empty header and latch blocks, to be filled
912-
// during processing later.
913-
VPBasicBlock *HeaderVPBB = Plan->createVPBasicBlock("vector.body");
914-
VPBasicBlock *LatchVPBB = Plan->createVPBasicBlock("vector.latch");
915-
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
916-
auto *TopRegion = Plan->createVPRegionBlock(
917-
HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/);
918-
919-
VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
920-
VPBasicBlock *MiddleVPBB = Plan->createVPBasicBlock("middle.block");
921-
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
922-
923-
VPBasicBlock *ScalarPH = Plan->createVPBasicBlock("scalar.ph");
924-
VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
925-
if (!RequiresScalarEpilogueCheck) {
926-
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
927-
return Plan;
928-
}
929-
930-
// If needed, add a check in the middle block to see if we have completed
931-
// all of the iterations in the first vector loop. Three cases:
932-
// 1) If (N - N%VF) == N, then we *don't* need to run the remainder.
933-
// Thus if tail is to be folded, we know we don't need to run the
934-
// remainder and we can set the condition to true.
935-
// 2) If we require a scalar epilogue, there is no conditional branch as
936-
// we unconditionally branch to the scalar preheader. Do nothing.
937-
// 3) Otherwise, construct a runtime check.
938-
BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
939-
VPIRBasicBlock *VPExitBlock = Plan->getExitBlock(IRExitBlock);
940-
// The connection order corresponds to the operands of the conditional branch.
941-
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
942-
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
943-
944-
auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
945-
// Here we use the same DebugLoc as the scalar loop latch terminator instead
946-
// of the corresponding compare because they may have ended up with
947-
// different line numbers and we want to avoid awkward line stepping while
948-
// debugging. Eg. if the compare has got a line number inside the loop.
949-
VPBuilder Builder(MiddleVPBB);
950-
VPValue *Cmp =
951-
TailFolded
952-
? Plan->getOrAddLiveIn(ConstantInt::getTrue(
953-
IntegerType::getInt1Ty(TripCount->getType()->getContext())))
954-
: Builder.createICmp(CmpInst::ICMP_EQ, Plan->getTripCount(),
955-
&Plan->getVectorTripCount(),
956-
ScalarLatchTerm->getDebugLoc(), "cmp.n");
957-
Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp},
958-
ScalarLatchTerm->getDebugLoc());
959-
return Plan;
960-
}
961-
962883
void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
963884
VPTransformState &State) {
964885
Type *TCTy = TripCountV->getType();
@@ -1135,11 +1056,13 @@ void VPlan::printLiveIns(raw_ostream &O) const {
11351056
}
11361057

11371058
O << "\n";
1138-
if (TripCount->isLiveIn())
1139-
O << "Live-in ";
1140-
TripCount->printAsOperand(O, SlotTracker);
1141-
O << " = original trip-count";
1142-
O << "\n";
1059+
if (TripCount) {
1060+
if (TripCount->isLiveIn())
1061+
O << "Live-in ";
1062+
TripCount->printAsOperand(O, SlotTracker);
1063+
O << " = original trip-count";
1064+
O << "\n";
1065+
}
11431066
}
11441067

11451068
LLVM_DUMP_METHOD

llvm/lib/Transforms/Vectorize/VPlan.h

+8-16
Original file line numberDiff line numberDiff line change
@@ -3503,21 +3503,6 @@ class VPlan {
35033503
VPBB->setPlan(this);
35043504
}
35053505

3506-
/// Create initial VPlan, having an "entry" VPBasicBlock (wrapping
3507-
/// original scalar pre-header) which contains SCEV expansions that need
3508-
/// to happen before the CFG is modified (when executing a VPlan for the
3509-
/// epilogue vector loop, the original entry needs to be replaced by a new
3510-
/// one); a VPBasicBlock for the vector pre-header, followed by a region for
3511-
/// the vector loop, followed by the middle VPBasicBlock. If a check is needed
3512-
/// to guard executing the scalar epilogue loop, it will be added to the
3513-
/// middle block, together with VPBasicBlocks for the scalar preheader and
3514-
/// exit blocks. \p InductionTy is the type of the canonical induction and
3515-
/// used for related values, like the trip count expression.
3516-
static VPlanPtr createInitialVPlan(Type *InductionTy,
3517-
PredicatedScalarEvolution &PSE,
3518-
bool RequiresScalarEpilogueCheck,
3519-
bool TailFolded, Loop *TheLoop);
3520-
35213506
/// Prepare the plan for execution, setting up the required live-in values.
35223507
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
35233508
VPTransformState &State);
@@ -3579,11 +3564,18 @@ class VPlan {
35793564
return TripCount;
35803565
}
35813566

3567+
/// Set the trip count, assuming it is currently null; if it is not, use
3568+
/// resetTripCount() instead.
3569+
void setTripCount(VPValue *NewTripCount) {
3570+
assert(!TripCount && NewTripCount && "TripCount should not be set yet.");
3571+
TripCount = NewTripCount;
3572+
}
3573+
35823574
/// Resets the trip count for the VPlan. The caller must make sure all uses of
35833575
/// the original trip count have been replaced.
35843576
void resetTripCount(VPValue *NewTripCount) {
35853577
assert(TripCount && NewTripCount && TripCount->getNumUsers() == 0 &&
3586-
"TripCount always must be set");
3578+
"TripCount must be set when resetting");
35873579
TripCount = NewTripCount;
35883580
}
35893581

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
//===-- VPlanConstruction.cpp - Transforms for initial VPlan construction -===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
///
9+
/// \file
10+
/// This file implements transforms for initial VPlan construction.
11+
///
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "LoopVectorizationPlanner.h"
15+
#include "VPlan.h"
16+
#include "VPlanCFG.h"
17+
#include "VPlanTransforms.h"
18+
#include "llvm/Analysis/LoopInfo.h"
19+
#include "llvm/Analysis/ScalarEvolution.h"
20+
21+
using namespace llvm;
22+
23+
void VPlanTransforms::introduceTopLevelVectorLoopRegion(
24+
VPlan &Plan, Type *InductionTy, PredicatedScalarEvolution &PSE,
25+
bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop) {
26+
// TODO: Generalize to introduce all loop regions.
27+
auto *HeaderVPBB = cast<VPBasicBlock>(Plan.getEntry()->getSingleSuccessor());
28+
VPBlockUtils::disconnectBlocks(Plan.getEntry(), HeaderVPBB);
29+
30+
VPBasicBlock *OriginalLatch =
31+
cast<VPBasicBlock>(HeaderVPBB->getSinglePredecessor());
32+
VPBlockUtils::disconnectBlocks(OriginalLatch, HeaderVPBB);
33+
VPBasicBlock *VecPreheader = Plan.createVPBasicBlock("vector.ph");
34+
VPBlockUtils::connectBlocks(Plan.getEntry(), VecPreheader);
35+
assert(OriginalLatch->getNumSuccessors() == 0 &&
36+
"Plan should end at top level latch");
37+
38+
// Create SCEV and VPValue for the trip count.
39+
// We use the symbolic max backedge-taken-count, which works also when
40+
// vectorizing loops with uncountable early exits.
41+
const SCEV *BackedgeTakenCountSCEV = PSE.getSymbolicMaxBackedgeTakenCount();
42+
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCountSCEV) &&
43+
"Invalid loop count");
44+
ScalarEvolution &SE = *PSE.getSE();
45+
const SCEV *TripCount = SE.getTripCountFromExitCount(BackedgeTakenCountSCEV,
46+
InductionTy, TheLoop);
47+
Plan.setTripCount(
48+
vputils::getOrCreateVPValueForSCEVExpr(Plan, TripCount, SE));
49+
50+
// Create VPRegionBlock, with existing header and new empty latch block, to be
51+
// filled.
52+
VPBasicBlock *LatchVPBB = Plan.createVPBasicBlock("vector.latch");
53+
VPBlockUtils::insertBlockAfter(LatchVPBB, OriginalLatch);
54+
auto *TopRegion = Plan.createVPRegionBlock(
55+
HeaderVPBB, LatchVPBB, "vector loop", false /*isReplicator*/);
56+
// All VPBBs reachable shallowly from HeaderVPBB belong to top level loop,
57+
// because VPlan is expected to end at top level latch.
58+
for (VPBlockBase *VPBB : vp_depth_first_shallow(HeaderVPBB))
59+
VPBB->setParent(TopRegion);
60+
61+
VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
62+
VPBasicBlock *MiddleVPBB = Plan.createVPBasicBlock("middle.block");
63+
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
64+
65+
VPBasicBlock *ScalarPH = Plan.createVPBasicBlock("scalar.ph");
66+
VPBlockUtils::connectBlocks(ScalarPH, Plan.getScalarHeader());
67+
if (!RequiresScalarEpilogueCheck) {
68+
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
69+
return;
70+
}
71+
72+
// If needed, add a check in the middle block to see if we have completed
73+
// all of the iterations in the first vector loop. Three cases:
74+
// 1) If (N - N%VF) == N, then we *don't* need to run the remainder.
75+
// Thus if tail is to be folded, we know we don't need to run the
76+
// remainder and we can set the condition to true.
77+
// 2) If we require a scalar epilogue, there is no conditional branch as
78+
// we unconditionally branch to the scalar preheader. Do nothing.
79+
// 3) Otherwise, construct a runtime check.
80+
BasicBlock *IRExitBlock = TheLoop->getUniqueLatchExitBlock();
81+
auto *VPExitBlock = Plan.getExitBlock(IRExitBlock);
82+
// The connection order corresponds to the operands of the conditional branch.
83+
VPBlockUtils::insertBlockAfter(VPExitBlock, MiddleVPBB);
84+
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
85+
86+
auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
87+
// Here we use the same DebugLoc as the scalar loop latch terminator instead
88+
// of the corresponding compare because they may have ended up with
89+
// different line numbers and we want to avoid awkward line stepping while
90+
// debugging. E.g. if the compare has a line number inside the loop.
91+
VPBuilder Builder(MiddleVPBB);
92+
VPValue *Cmp =
93+
TailFolded
94+
? Plan.getOrAddLiveIn(ConstantInt::getTrue(
95+
IntegerType::getInt1Ty(TripCount->getType()->getContext())))
96+
: Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
97+
&Plan.getVectorTripCount(),
98+
ScalarLatchTerm->getDebugLoc(), "cmp.n");
99+
Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp},
100+
ScalarLatchTerm->getDebugLoc());
101+
}

0 commit comments

Comments
 (0)