diff --git a/configs/example/xiangshan.py b/configs/example/xiangshan.py index a3d1a88486..86ddef69d5 100644 --- a/configs/example/xiangshan.py +++ b/configs/example/xiangshan.py @@ -352,7 +352,9 @@ def setKmhV3IdealParams(args, system): cpu.branchPred.tage.enableSC = False # TODO(bug): When numBr changes, enabling SC will trigger an assert cpu.branchPred.ftq_size = 256 cpu.branchPred.fsq_size = 256 - cpu.branchPred.uftb.numEntries = 1024 + uftb_size = 1024 + cpu.branchPred.uftb.numEntries = uftb_size + cpu.branchPred.uftb.numWays = uftb_size cpu.branchPred.ftb.numEntries = 16384 cpu.branchPred.tage.numPredictors = 14 cpu.branchPred.tage.baseTableSize = 16384 diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py index 7a887aa654..a449638db1 100644 --- a/src/cpu/pred/BranchPredictor.py +++ b/src/cpu/pred/BranchPredictor.py @@ -879,7 +879,7 @@ class DefaultFTB(TimedBaseFTBPredictor): class UFTB(DefaultFTB): numEntries = 32 - tagBits = 38 + tagBits = 16 numWays = 32 numDelay = 0 diff --git a/src/cpu/pred/SConscript b/src/cpu/pred/SConscript index b9170497e7..a3badc6ccd 100644 --- a/src/cpu/pred/SConscript +++ b/src/cpu/pred/SConscript @@ -106,6 +106,7 @@ DebugFlag('DecoupleBPRAS') DebugFlag('DecoupleBPuRAS') DebugFlag('DecoupleBPUseful') DebugFlag('Override') +DebugFlag('OverrideByL1') DebugFlag('DecoupleBPFTB') DebugFlag('FTB') DebugFlag('FTBTAGE') diff --git a/src/cpu/pred/ftb/decoupled_bpred.cc b/src/cpu/pred/ftb/decoupled_bpred.cc index 06b106256b..06ab31549d 100644 --- a/src/cpu/pred/ftb/decoupled_bpred.cc +++ b/src/cpu/pred/ftb/decoupled_bpred.cc @@ -449,6 +449,18 @@ DecoupledBPUWithFTB::DBPFTBStats::DBPFTBStats(statistics::Group* parent, unsigne ADD_STAT(otherMiss, statistics::units::Count::get(), "the number of other branch misses"), ADD_STAT(staticBranchNum, statistics::units::Count::get(), "the number of all (different) static branches"), ADD_STAT(staticBranchNumEverTaken, statistics::units::Count::get(), "the number of all 
(different) static branches that are once taken"), + ADD_STAT(overrideByL1, statistics::units::Count::get(), "the number of preds override by L1"), + ADD_STAT(overrideByL1WhenL0Hit, statistics::units::Count::get(), "the number of preds override by L1, when L0 Hit and L1 Hit"), + ADD_STAT(overrideByL1WhenL0HitButTargetDiff, statistics::units::Count::get(), "the number of preds override by L1, when L0 Hit and L1 Hit, but target diff"), + ADD_STAT(overrideByL1WhenL0HitButIsReturn, statistics::units::Count::get(), "the number of preds override by L1, when L0 Hit and L1 Hit, but return target diff"), + ADD_STAT(overrideByL1WhenL0HitButTakenDiff, statistics::units::Count::get(), "the number of preds override by L1, when L0 Hit and L1 Hit, but taken diff"), + ADD_STAT(overrideByL1WhenL0HitButBranchDiff, statistics::units::Count::get(), "the number of preds override by L1, when L0 Hit and L1 Hit, but branch diff"), + ADD_STAT(overrideByL1WhenL0HitButEntryDiff, statistics::units::Count::get(), "the number of preds override by L1, when L0 Hit and L1 Hit, but entry diff"), + ADD_STAT(overrideByL1WhenL0Miss, statistics::units::Count::get(), "the number of preds override by L1, when L0 Miss and L1 Hit"), + ADD_STAT(overrideByL2, statistics::units::Count::get(), "the number of preds override by L2"), + ADD_STAT(squashWhenOverriding, statistics::units::Count::get(), "the number of squash when overriding"), + ADD_STAT(overrideBubbles, statistics::units::Count::get(), "number of bpu pred override bubbles"), + ADD_STAT(s1PredTakenChangeAtSamePC, statistics::units::Count::get(), "s1 pred different taken at the same pc"), ADD_STAT(predsOfEachStage, statistics::units::Count::get(), "the number of preds of each stage that account for final pred"), ADD_STAT(commitPredsFromEachStage, statistics::units::Count::get(), "the number of preds of each stage that account for a committed stream"), ADD_STAT(fsqEntryDist, statistics::units::Count::get(), "the distribution of number of entries in fsq"), @@ 
-616,6 +628,10 @@ DecoupledBPUWithFTB::tick() numOverrideBubbles = generateFinalPredAndCreateBubbles(); } + if (squashing && numOverrideBubbles > 0 && receivedPred && s0PC != MaxAddr && !sentPCHist) { + dbpFtbStats.squashWhenOverriding++; + } + if (!squashing) { DPRINTF(DecoupleBP, "DecoupledBPUWithFTB::tick()\n"); DPRINTF(Override, "DecoupledBPUWithFTB::tick()\n"); @@ -656,7 +672,6 @@ DecoupledBPUWithFTB::tick() sentPCHist = true; } - // query loop buffer with start pc if (enableLoopBuffer && !lb.isActive() && lb.streamBeforeLoop.getTakenTarget() == lb.streamBeforeLoop.startPC && @@ -806,6 +821,7 @@ DecoupledBPUWithFTB::generateFinalPredAndCreateBubbles() } first_hit_stage++; } + // generate bubbles bubblesToCreate = first_hit_stage; // assign pred source @@ -837,6 +853,90 @@ DecoupledBPUWithFTB::generateFinalPredAndCreateBubbles() printFullFTBPrediction(*chosen); dbpFtbStats.predsOfEachStage[first_hit_stage]++; + if (!squashing && s0PC != MaxAddr && receivedPred && bubblesToCreate > 0) { + if (first_hit_stage == 1) { + assert(predsOfEachStage[1].valid); + + for (int b = 0; b < numBr; ++b) { + if (b < predsOfEachStage[1].ftbEntry.slots.size()){ + Addr slot_pc = predsOfEachStage[1].ftbEntry.slots[b].pc; + bool s1_taken = predsOfEachStage[1].condTakens[b]; + auto it = s1PrevPredTakens.find(slot_pc); + if (it == s1PrevPredTakens.end()) { + // first s1 override seen at this pc: only record it, there is no previous direction to differ from + s1PrevPredTakens.emplace(slot_pc, s1_taken); + } else if (it->second != s1_taken) { + it->second = s1_taken; + dbpFtbStats.s1PredTakenChangeAtSamePC++; + } + } + } + + if (predsOfEachStage[0].valid) { + assert(predsOfEachStage[0].bbStart == predsOfEachStage[1].bbStart); + dbpFtbStats.overrideByL1WhenL0Hit++; + // printTwoFullFTBPrediction(predsOfEachStage[0], predsOfEachStage[1]); + + auto s0_entry = predsOfEachStage[0].ftbEntry; + auto s1_entry = predsOfEachStage[1].ftbEntry; + auto s0_condTakens = 
predsOfEachStage[0].condTakens; + auto s1_condTakens = predsOfEachStage[1].condTakens; + + for (int b = 0; b < numBr; ++b){ + if (b >= s0_entry.slots.size() || b >= s1_entry.slots.size()) { + if (s0_entry.slots.size() != s1_entry.slots.size()) { + dbpFtbStats.overrideByL1WhenL0HitButEntryDiff++; + } + break; + } + + FTBSlot s0_entry_slot = s0_entry.slots[b]; + FTBSlot s1_entry_slot = s1_entry.slots[b]; + + if (s0_entry_slot.condValid() && s1_entry_slot.condValid()) { + if (s0_entry_slot.pc == s1_entry_slot.pc && s0_condTakens[b] != s1_condTakens[b]) { + dbpFtbStats.overrideByL1WhenL0HitButTakenDiff++; + // uftb->updateUftbWhenOverrideByL1(predsOfEachStage[0].bbStart, b, s1_condTakens[b]); + break; + } else if (s0_entry_slot.pc == s1_entry_slot.pc && s0_condTakens[b] == 1 && s1_condTakens[b] == 1) { + assert(0); // if taken, then predsOfEachStage0 should equal to predsOfEachStage1 + } else if (s0_entry_slot.pc == s1_entry_slot.pc && s0_condTakens[b] == 0 && s1_condTakens[b] == 0) { + continue; + } else if (s0_entry_slot.pc != s1_entry_slot.pc) { + dbpFtbStats.overrideByL1WhenL0HitButBranchDiff++; + break; + } else { + assert(0); // should not reach here + } + } else if (s0_entry_slot.uncondValid() && s1_entry_slot.uncondValid()) { + if (s0_entry_slot.pc == s1_entry_slot.pc && s0_entry_slot.target != s1_entry_slot.target) { + dbpFtbStats.overrideByL1WhenL0HitButTargetDiff++; + break; + } else if (s0_entry_slot.pc == s1_entry_slot.pc && s0_entry_slot.target == s1_entry_slot.target) { + if (predsOfEachStage[0].returnTarget != predsOfEachStage[1].returnTarget) { + dbpFtbStats.overrideByL1WhenL0HitButIsReturn++; // RAS + break; + } + continue; + } else if (s0_entry_slot.pc != s1_entry_slot.pc) { + dbpFtbStats.overrideByL1WhenL0HitButBranchDiff++; + break; + } else { + assert(0); + } + } else { + dbpFtbStats.overrideByL1WhenL0HitButBranchDiff++; + break; + } + } + } else { + dbpFtbStats.overrideByL1WhenL0Miss++; + } + dbpFtbStats.overrideByL1++; + }else if 
(first_hit_stage == 2) { + dbpFtbStats.overrideByL2++; + } + } } else { bubblesToCreate = 0; receivedPred = true; @@ -2123,6 +2223,7 @@ DecoupledBPUWithFTB::tryEnqFetchStream() } // prediction valid, but not ready to enq because of bubbles if (numOverrideBubbles > 0) { + dbpFtbStats.overrideBubbles++; DPRINTF(DecoupleBP, "Waiting for bubble caused by overriding, bubbles rest: %u\n", numOverrideBubbles); DPRINTF(Override, "Waiting for bubble caused by overriding, bubbles rest: %u\n", numOverrideBubbles); return; diff --git a/src/cpu/pred/ftb/decoupled_bpred.hh b/src/cpu/pred/ftb/decoupled_bpred.hh index b8ffa7707d..b645f882c9 100644 --- a/src/cpu/pred/ftb/decoupled_bpred.hh +++ b/src/cpu/pred/ftb/decoupled_bpred.hh @@ -9,6 +9,7 @@ #include "arch/generic/pcstate.hh" #include "base/statistics.hh" +#include "base/trace.hh" #include "config/the_isa.hh" // #include "cpu/base.hh" #include "cpu/o3/cpu_def.hh" @@ -34,6 +35,7 @@ #include "debug/DecoupleBPuRAS.hh" #include "debug/DecoupleBPVerbose.hh" #include "debug/DBPFTBStats.hh" +#include "debug/OverrideByL1.hh" #include "debug/JumpAheadPredictor.hh" #include "debug/LoopBuffer.hh" #include "debug/LoopPredictor.hh" @@ -281,6 +283,8 @@ class DecoupledBPUWithFTB : public BPredUnit unsigned numOverrideBubbles{0}; + std::map<Addr, bool> s1PrevPredTakens; + using JAInfo = JumpAheadPredictor::JAInfo; JAInfo jaInfo; @@ -374,6 +378,15 @@ class DecoupledBPUWithFTB : public BPredUnit } } + void printFTBEntryWhenOverrideByL1(const FTBEntry &entry) { + DPRINTF(OverrideByL1, "FTB entry: valid %d, tag %#lx, fallThruAddr:%#lx, slots:\n", + entry.valid, entry.tag, entry.fallThruAddr); + for (auto &slot : entry.slots) { + DPRINTF(OverrideByL1, " valid %d, pc:%#lx, size:%d, target:%#lx, cond:%d, indirect:%d, call:%d, return:%d\n", + slot.valid, slot.pc, slot.size, slot.target, slot.isCond, slot.isIndirect, slot.isCall, slot.isReturn); + } + } + void printFullFTBPrediction(const FullFTBPrediction &pred) { 
DPRINTF(DecoupleBP, "dumping FullFTBPrediction\n"); DPRINTF(DecoupleBP, "bbStart: %#lx, ftbEntry:\n", pred.bbStart); @@ -387,6 +400,34 @@ class DecoupledBPUWithFTB : public BPredUnit pred.indirectTarget, pred.returnTarget); } + void printTwoFullFTBPrediction(FullFTBPrediction &pred0, FullFTBPrediction &pred1) { + DPRINTF(OverrideByL1, "============================\n"); + DPRINTF(OverrideByL1, "dumping FullFTBPrediction0\n"); + DPRINTF(OverrideByL1, "bbStart: %#lx, ftbEntry:\n", pred0.bbStart); + printFTBEntryWhenOverrideByL1(pred0.ftbEntry); + DPRINTF(OverrideByL1, "condTakens: "); + for (auto taken : pred0.condTakens) { + DPRINTFR(OverrideByL1, "%d ", taken); + } + DPRINTFR(OverrideByL1, "\n"); + DPRINTF(OverrideByL1, "indirectTarget: %#lx, returnTarget: %#lx\n", + pred0.indirectTarget, pred0.returnTarget); + DPRINTF(OverrideByL1, "npc: %#lx\n", pred0.getTarget()); + DPRINTF(OverrideByL1, "-----------------------------\n"); + DPRINTF(OverrideByL1, "dumping FullFTBPrediction1\n"); + DPRINTF(OverrideByL1, "bbStart: %#lx, ftbEntry:\n", pred1.bbStart); + printFTBEntryWhenOverrideByL1(pred1.ftbEntry); + DPRINTF(OverrideByL1, "condTakens: "); + for (auto taken : pred1.condTakens) { + DPRINTFR(OverrideByL1, "%d ", taken); + } + DPRINTFR(OverrideByL1, "\n"); + DPRINTF(OverrideByL1, "indirectTarget: %#lx, returnTarget: %#lx\n", + pred1.indirectTarget, pred1.returnTarget); + DPRINTF(OverrideByL1, "npc: %#lx\n", pred1.getTarget()); + DPRINTF(OverrideByL1, "============================\n"); + } + struct DBPFTBStats : public statistics::Group { statistics::Scalar condNum; statistics::Scalar uncondNum; @@ -401,6 +442,19 @@ class DecoupledBPUWithFTB : public BPredUnit statistics::Scalar staticBranchNum; statistics::Scalar staticBranchNumEverTaken; + statistics::Scalar overrideByL1; + statistics::Scalar overrideByL1WhenL0Hit; + statistics::Scalar overrideByL1WhenL0HitButTargetDiff; + statistics::Scalar overrideByL1WhenL0HitButIsReturn; + statistics::Scalar 
overrideByL1WhenL0HitButTakenDiff; + statistics::Scalar overrideByL1WhenL0HitButBranchDiff; + statistics::Scalar overrideByL1WhenL0HitButEntryDiff; + statistics::Scalar overrideByL1WhenL0Miss; + statistics::Scalar overrideByL2; + statistics::Scalar squashWhenOverriding; + statistics::Scalar overrideBubbles; + statistics::Scalar s1PredTakenChangeAtSamePC; + statistics::Vector predsOfEachStage; statistics::Vector commitPredsFromEachStage; statistics::Distribution fsqEntryDist; diff --git a/src/cpu/pred/ftb/ftb.cc b/src/cpu/pred/ftb/ftb.cc index 1ef9673282..e4169068ae 100644 --- a/src/cpu/pred/ftb/ftb.cc +++ b/src/cpu/pred/ftb/ftb.cc @@ -32,6 +32,7 @@ #include "cpu/o3/dyn_inst.hh" #include "cpu/pred/ftb/ftb.hh" #include "debug/Fetch.hh" +#include "debug/Override.hh" namespace gem5 { @@ -221,16 +222,35 @@ DefaultFTB::lookup(Addr inst_pc) Addr ftb_tag = getTag(inst_pc); DPRINTF(FTB, "FTB: Looking up FTB entry index %#lx tag %#lx\n", ftb_idx, ftb_tag); + if (isL0()) { + auto tag_it = predTagSet.find(ftb_tag); + if (tag_it == predTagSet.end()) { + predTagSet.insert(ftb_tag); + ftbStats.predTagSetSize++; + } + } + assert(ftb_idx < numSets); // ignore false hit when lowest bit is 1 const auto &it = ftb[ftb_idx].find(ftb_tag); if (it != ftb[ftb_idx].end()) { + assert(it->second.valid); if (it->second.valid) { it->second.tick = curTick(); std::make_heap(mruList[ftb_idx].begin(), mruList[ftb_idx].end(), older()); return it->second; } } + + // ftb not hit + bool ftb_is_full = (ftb[ftb_idx].size() >= numWays); + if (ftb_is_full) { + ftbStats.predMissWhenFull++; + DPRINTF(FTB, "FTB: Looking up FTB entry index %#lx tag %#lx miss, ftb is full\n", ftb_idx, ftb_tag); + } else { + ftbStats.predMissWhenNotFull++; + DPRINTF(FTB, "FTB: Looking up FTB entry index %#lx tag %#lx miss, ftb is not full\n", ftb_idx, ftb_tag); + } return TickedFTBEntry(); } @@ -381,6 +401,14 @@ DefaultFTB::update(const FetchStream &stream) Addr ftb_idx = getIndex(startPC); Addr ftb_tag = getTag(startPC); + if 
(isL0()) { + auto tag_it = updateTagSet.find(ftb_tag); + if (tag_it == updateTagSet.end()) { + updateTagSet.insert(ftb_tag); + ftbStats.updateTagSetSize++; + } + } + DPRINTF(FTB, "FTB: Updating FTB entry index %#lx tag %#lx\n", ftb_idx, ftb_tag); auto it = ftb[ftb_idx].find(ftb_tag); @@ -391,6 +419,11 @@ DefaultFTB::update(const FetchStream &stream) std::pop_heap(mruList[ftb_idx].begin(), mruList[ftb_idx].end(), older()); const auto& old_entry = mruList[ftb_idx].back(); DPRINTF(FTB, "FTB: Replacing entry with tag %#lx in set %#lx\n", old_entry->first, ftb_idx); + if (old_entry->second.tick == 0) { + ftbStats.updateUseEmptyEntry++; + } else { + ftbStats.updateUseOldEntry++; + } ftb[ftb_idx].erase(old_entry->first); } @@ -447,6 +480,7 @@ DefaultFTB::update(const FetchStream &stream) } } + assert(entry_to_write.valid); ftb[ftb_idx][ftb_tag] = TickedFTBEntry(entry_to_write, curTick()); ftb[ftb_idx][ftb_tag].tag = ftb_tag; // in case different ftb has different tags @@ -468,6 +502,22 @@ DefaultFTB::update(const FetchStream &stream) // ftb[ftb_idx].tag = getTag(inst_pc); } +void +DefaultFTB::updateUftbWhenOverrideByL1(Addr bbStart, int brIdx, bool condTaken){ + // Train only the direction counter of slot brIdx in the entry stored for bbStart. + assert(getDelay() == 0); + + Addr ftb_idx = getIndex(bbStart); + auto it = ftb[ftb_idx].find(getTag(bbStart)); + assert(it != ftb[ftb_idx].end()); + // asserts vanish in NDEBUG builds: never fall through to operator[], which would + // insert a default entry that mruList does not track; also reject a bad slot index + if (it == ftb[ftb_idx].end() || brIdx < 0 || brIdx >= (int)it->second.slots.size()) { + return; + } + updateCtr(it->second.slots[brIdx].ctr, condTaken); // only update the ctr +} + void DefaultFTB::commitBranch(const FetchStream &stream, const DynInstPtr &inst) { @@ -580,9 +630,15 @@ DefaultFTB::FTBStats::FTBStats(statistics::Group* parent) : ADD_STAT(oldEntryWithNewCond, statistics::units::Count::get(), "number of old ftb entries with new conditional branches"), ADD_STAT(oldEntryWithNewUncond, statistics::units::Count::get(), "number of old ftb entries with new unconditional branches"), ADD_STAT(predMiss, 
statistics::units::Count::get(), "misses encountered on prediction"), + ADD_STAT(predMissWhenFull, statistics::units::Count::get(), "misses encountered on pred when ftb full"), + ADD_STAT(predMissWhenNotFull, statistics::units::Count::get(), "misses encountered on pred when ftb not full"), ADD_STAT(predHit, statistics::units::Count::get(), "hits encountered on prediction"), ADD_STAT(updateMiss, statistics::units::Count::get(), "misses encountered on update"), ADD_STAT(updateHit, statistics::units::Count::get(), "hits encountered on update"), + ADD_STAT(updateUseEmptyEntry, statistics::units::Count::get(), "use empty entry when update"), + ADD_STAT(updateUseOldEntry, statistics::units::Count::get(), "update old entry when update"), + ADD_STAT(predTagSetSize, statistics::units::Count::get(), "uftb pred tag set size"), + ADD_STAT(updateTagSetSize, statistics::units::Count::get(), "uftb update tag set size"), ADD_STAT(eraseSlotBehindUncond, statistics::units::Count::get(), "erase slots behind unconditional slot"), ADD_STAT(predUseL0OnL1Miss, statistics::units::Count::get(), "use l0 result on l1 miss when pred"), ADD_STAT(updateUseL0OnL1Miss, statistics::units::Count::get(), "use l0 result on l1 miss when update"), diff --git a/src/cpu/pred/ftb/ftb.hh b/src/cpu/pred/ftb/ftb.hh index 0a23cc7fa7..9c72d9b2d4 100644 --- a/src/cpu/pred/ftb/ftb.hh +++ b/src/cpu/pred/ftb/ftb.hh @@ -37,6 +37,7 @@ #include "cpu/pred/ftb/timed_base_pred.hh" #include "debug/FTB.hh" #include "debug/FTBStats.hh" +#include "debug/OverrideByL1.hh" #include "params/DefaultFTB.hh" @@ -118,6 +119,8 @@ class DefaultFTB : public TimedBaseFTBPredictor */ void update(const FetchStream &stream) override; + void updateUftbWhenOverrideByL1(Addr bbStart, int brIdx, bool condTaken); + void commitBranch(const FetchStream &stream, const DynInstPtr &inst) override; /** @@ -185,6 +188,15 @@ class DefaultFTB : public TimedBaseFTBPredictor } } + void printFTBEntryWhenOverrideByL1(const FTBEntry &entry) { + 
DPRINTF(OverrideByL1, "FTB entry: valid %d, tag %#lx, fallThruAddr:%#lx, slots:\n", + entry.valid, entry.tag, entry.fallThruAddr); + for (auto &slot : entry.slots) { + DPRINTF(OverrideByL1, " valid %d, pc:%#lx, size:%d, target:%#lx, ctr:%d, cond:%d, indirect:%d, call:%d, return:%d\n", + slot.valid, slot.pc, slot.size, slot.target, slot.ctr, slot.isCond, slot.isIndirect, slot.isCall, slot.isReturn); + } + } + void printTickedFTBEntry(TickedFTBEntry &e) { printFTBEntry(e, e.tick); } @@ -235,6 +247,10 @@ class DefaultFTB : public TimedBaseFTBPredictor /** The number of tag bits per entry. */ unsigned tagBits; + /** Distinct tags seen by lookup() / update() when isL0(); only used for stats. */ + std::set<Addr> predTagSet; + std::set<Addr> updateTagSet; + /** The tag mask. */ Addr tagMask; @@ -277,9 +293,16 @@ class DefaultFTB : public TimedBaseFTBPredictor statistics::Scalar oldEntryWithNewUncond; statistics::Scalar predMiss; + statistics::Scalar predMissWhenFull; + statistics::Scalar predMissWhenNotFull; statistics::Scalar predHit; statistics::Scalar updateMiss; statistics::Scalar updateHit; + statistics::Scalar updateUseEmptyEntry; + statistics::Scalar updateUseOldEntry; + + statistics::Scalar predTagSetSize; + statistics::Scalar updateTagSetSize; statistics::Scalar eraseSlotBehindUncond;