Skip to content

Commit

Permalink
Merge branch 'xs-dev' into add-stats-override-bubble
Browse files Browse the repository at this point in the history
  • Loading branch information
Lawrence-ID authored Jan 16, 2025
2 parents 893f614 + 49749b6 commit be9283c
Show file tree
Hide file tree
Showing 43 changed files with 1,923 additions and 380 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,6 @@ run_fs.sh
.vscode/
llvm-pgo/
*.profdata
*.profraw
*.profraw
*nemu*
ready-to-run/
31 changes: 30 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,12 @@ Notes:
- If you have already built GEM5, you should rebuild GEM5 after installing DRAMSim3
- If simulating a Xiangshan system, use DRAMSim3 with our customized config

Use init.sh to clone and build DRAMSim3.

```shell
bash ./init.sh
```

## Build GEM5

```shell
Expand Down Expand Up @@ -235,7 +241,30 @@ Simulation error without Difftest **will NOT be responded.**

### Example command

Firstly, one should ensure GEM5 is properly built and workloads are prepared by running a single workload:
#### Easy to run
Run a single workload (a plain binary file, not a checkpoint):

```shell
# prepare the binary file
git clone https://github.com/OpenXiangShan/ready-to-run.git
# prepare nemu reference design
wget https://github.com/OpenXiangShan/GEM5/releases/download/2024-10-16/riscv64-nemu-interpreter-c1469286ca32-so
# set environment variables
export GCBV_REF_SO=`realpath riscv64-nemu-interpreter-c1469286ca32-so`
# run the workload
./build/RISCV/gem5.opt ./configs/example/xiangshan.py --raw-cpt --generic-rv-cpt=./ready-to-run/coremark-2-iteration.bin
# get the ipc
grep 'cpu.ipc' m5out/stats.txt
```
xiangshan.py is the default configuration for XS-GEM5.

`--raw-cpt` means the input is a single binary file.

`--generic-rv-cpt` is the path to the binary file.

You can then see the output in the terminal and find the gem5 output in the `m5out` directory.

Otherwise, if you want to run a checkpoint, you should ensure GEM5 is properly built and workloads are prepared by running a single workload:
```shell
mkdir util/xs_scripts/example
cd util/xs_scripts/example
Expand Down
3 changes: 3 additions & 0 deletions configs/common/Caches.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ class L2Cache(Cache):
cache_level = 2
enable_wayprediction = False

slice_num = 4

class L3Cache(Cache):
mshrs = 64
tgts_per_mshr = 20
Expand Down Expand Up @@ -159,6 +161,7 @@ class L1ToL2Bus(CoherentXBar):
forward_latency = 3 # l1 -> l2 req/snoop latency
response_latency = 3 # l2 -> l1 resp latency
snoop_response_latency = 1
hint_wakeup_ahead_cycles = 2 # send Hint to L1 N cycles in advance with TimingResp

# Use a snoop-filter by default, and set the latency to zero as
# the lookup is assumed to overlap with the frontend latency of
Expand Down
1 change: 0 additions & 1 deletion configs/common/FSConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,7 +670,6 @@ def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, ruby=
self.iobus = IOXBar()
if not ruby:
self.membus = MemBus()
self.membus.width = 32

self.bridge = Bridge(delay='50ns')
self.bridge.mem_side_port = self.iobus.cpu_side_ports
Expand Down
20 changes: 12 additions & 8 deletions configs/example/xiangshan.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,14 +337,17 @@ def setKmhV3IdealParams(args, system):
cpu.mmu.itb.size = 96

cpu.BankConflictCheck = False # real bank conflict 0.2 score
cpu.EnableLdMissReplay = False
cpu.EnablePipeNukeCheck = False
cpu.StoreWbStage = 2 # store writeback at s2

cpu.scheduler = IdealScheduler()
cpu.scheduler = IdealScheduler()
# use centralized load/store issue queue, for hmmer

# ideal decoupled frontend
if args.bp_type is None or args.bp_type == 'DecoupledBPUWithFTB':
cpu.branchPred.enableTwoTaken = True
cpu.branchPred.numBr = 6
cpu.branchPred.numBr = 8 # numBr must be a power of 2, see getShuffledBrIndex()
cpu.branchPred.predictWidth = 64
cpu.branchPred.tage.enableSC = False # TODO(bug): When numBr changes, enabling SC will trigger an assert
cpu.branchPred.ftq_size = 256
Expand All @@ -353,12 +356,12 @@ def setKmhV3IdealParams(args, system):
cpu.branchPred.uftb.numEntries = uftb_size
cpu.branchPred.uftb.numWays = uftb_size
cpu.branchPred.ftb.numEntries = 16384
cpu.branchPred.tage.numPredictors = 9
cpu.branchPred.tage.baseTableSize = 4096
cpu.branchPred.tage.tableSizes = [4096] * 9
cpu.branchPred.tage.TTagBitSizes = [8] * 9
cpu.branchPred.tage.TTagPcShifts = [1] * 9
cpu.branchPred.tage.histLengths = [8, 13, 21, 35, 57, 93, 151, 246, 401]
cpu.branchPred.tage.numPredictors = 14
cpu.branchPred.tage.baseTableSize = 16384
cpu.branchPred.tage.tableSizes = [2048] * 14
cpu.branchPred.tage.TTagBitSizes = [13] * 14
cpu.branchPred.tage.TTagPcShifts = [1] * 14
cpu.branchPred.tage.histLengths = [4, 7, 12, 16, 21, 29, 38, 51, 68, 90, 120, 160, 283, 499]

# ideal l1 caches
if args.caches:
Expand All @@ -376,6 +379,7 @@ def setKmhV3IdealParams(args, system):
system.l2_caches[i].slice_num = 0 # 4 -> 0, no slice
system.tol2bus_list[i].forward_latency = 0 # 3->0
system.tol2bus_list[i].response_latency = 0 # 3->0
system.tol2bus_list[i].hint_wakeup_ahead_cycles = 0 # 2->0

if args.l3cache:
system.l3.enable_wayprediction = False
Expand Down
12 changes: 12 additions & 0 deletions init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
#
# Clone and build DRAMsim3 under ext/dramsim3.
#
# Run from the repository root (GEM5_HOME is taken from the current
# directory):
#     bash ./init.sh
#
# Environment:
#     MAKE_JOBS  number of parallel make jobs (default: 48)

export GEM5_HOME="$(pwd)"

# build DRAMSim
# The build runs in a subshell so that (a) the caller's working directory is
# left untouched, and (b) `set -e` can abort on the first failing step without
# terminating the caller's shell if this file is sourced.
(
    set -e
    cd ext/dramsim3
    # A previous run may already have fetched the sources; cloning again
    # would fail because the target directory exists.
    if [ ! -d DRAMsim3 ]; then
        git clone https://github.com/umd-memsys/DRAMsim3.git DRAMsim3
    fi
    mkdir -p DRAMsim3/build
    cd DRAMsim3/build
    cmake ..
    make -j "${MAKE_JOBS:-48}"
)
2 changes: 1 addition & 1 deletion src/arch/riscv/isa/decoder.isa
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ decode QUADRANT default Unknown::unknown() {
0x03: decode FUNCT3 {
format FenceOp {
0x0: fence({{
}}, uint64_t, IsReadBarrier, IsWriteBarrier, No_OpClass);
}}, uint64_t, IsReadBarrier, IsWriteBarrier, MemReadOp);
0x1: fence_i({{
}}, uint64_t, IsNonSpeculative, IsSerializeAfter, No_OpClass);
}
Expand Down
34 changes: 32 additions & 2 deletions src/arch/riscv/isa/formats/amo.isa
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,36 @@ def template LRSCMacroConstructor {{
}
}};

// Strictly order-preserving LRSC
//
// Macro-op constructor template that expands into exactly three micro-ops,
// in this order:
//   1. rel_fence - a MemFenceMicro placed before the memory micro-op
//   2. lrsc      - the class's own Micro op
//   3. acq_fence - a MemFenceMicro placed after the memory micro-op
// Both fences carry IsReadBarrier and IsWriteBarrier.
// The LoadReserved and StoreCond formats substitute this template to
// produce their decoder output.
def template LRSCStrictMacroConstructor {{
%(class_name)s::%(class_name)s(ExtMachInst machInst):
%(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
{
%(constructor)s;

StaticInstPtr rel_fence;
StaticInstPtr lrsc;
StaticInstPtr acq_fence;

// leading fence: first micro-op of the sequence, read + write barrier
rel_fence = new MemFenceMicro(machInst, No_OpClass);
rel_fence->setFlag(IsFirstMicroop);
rel_fence->setFlag(IsReadBarrier);
rel_fence->setFlag(IsWriteBarrier);
rel_fence->setFlag(IsDelayedCommit);

// the memory micro-op itself; like the leading fence it is flagged
// IsDelayedCommit, the trailing fence is not
lrsc = new %(class_name)sMicro(machInst, this);
lrsc->setFlag(IsDelayedCommit);

// trailing fence: last micro-op of the sequence, read + write barrier
acq_fence = new MemFenceMicro(machInst, No_OpClass);
acq_fence->setFlag(IsLastMicroop);
acq_fence->setFlag(IsReadBarrier);
acq_fence->setFlag(IsWriteBarrier);

// order matters: fence, memory op, fence
microops = {rel_fence, lrsc, acq_fence};
}
}};

def template LRSCMicroConstructor {{
%(class_name)s::%(class_name)sMicro::%(class_name)sMicro(
ExtMachInst machInst, %(class_name)s *_p)
Expand Down Expand Up @@ -435,7 +465,7 @@ def format LoadReserved(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}},
macro_iop = InstObjParams(name, Name, 'LoadReserved', macro_ea_code,
macro_inst_flags)
header_output = LRSCDeclare.subst(macro_iop)
decoder_output = LRSCMacroConstructor.subst(macro_iop)
decoder_output = LRSCStrictMacroConstructor.subst(macro_iop)
decode_block = BasicDecode.subst(macro_iop)

exec_output = ''
Expand Down Expand Up @@ -463,7 +493,7 @@ def format StoreCond(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}},
macro_iop = InstObjParams(name, Name, 'StoreCond', macro_ea_code,
macro_inst_flags)
header_output = LRSCDeclare.subst(macro_iop)
decoder_output = LRSCMacroConstructor.subst(macro_iop)
decoder_output = LRSCStrictMacroConstructor.subst(macro_iop)
decode_block = BasicDecode.subst(macro_iop)

exec_output = ''
Expand Down
12 changes: 10 additions & 2 deletions src/arch/riscv/tlb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ TLB::TLB(const Params &p) :
openBackPre(p.open_back_pre),
backPrePrecision(p.initial_back_pre_precision_value),
forwardPrePrecision(p.initial_forward_pre_precision_value),
controlNum(0),
allForwardPre(0),removeNoUseForwardPre(0),removeNoUseBackPre(0),
usedBackPre(0),test_num(0),allUsed(0),forwardUsedPre(0),
lastVaddr(0),lastPc(0), traceFlag(false),
Expand Down Expand Up @@ -1980,6 +1981,9 @@ TLB::translate(const RequestPtr &req, ThreadContext *tc,
req->setFlags(Request::PHYSICAL);

Fault fault;
if (req->getVaddr() == 0)
warn("notice vaddr == 0 pc %lx \n", req->getPC());

if (req->getFlags() & Request::PHYSICAL) {
req->setTwoStageState(false, 0, 0);
/**
Expand All @@ -1989,8 +1993,6 @@ TLB::translate(const RequestPtr &req, ThreadContext *tc,
if ((hgatp.mode == 8 || vsatp.mode == 8) && (pmode < PrivilegeMode::PRV_M)) {
fault = doTwoStageTranslate(req, tc, translation, mode, delayed);
} else {
if (req->getVaddr() == 0)
warn("vaddr ==0 pc %lx \n", req->getPC());
req->setPaddr(req->getVaddr());
fault = NoFault;
assert(!req->get_h_inst());
Expand All @@ -2006,6 +2008,12 @@ TLB::translate(const RequestPtr &req, ThreadContext *tc,
fault = doTwoStageTranslate(req, tc, translation, mode, delayed);
} else {
req->setTwoStageState(false, 0, 0);
if (controlNum < 5) {
uint16_t control_address = (req->getVaddr() >> VADDR_BITS) & VPN_MASK;
if ((control_address != VPN_MASK) && (control_address != 0))
warn("notice sv48,the vaddr %lx may valid sv48\n", req->getVaddr());
}
controlNum++;
fault = doTranslate(req, tc, translation, mode, delayed);
}
}
Expand Down
1 change: 1 addition & 0 deletions src/arch/riscv/tlb.hh
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class TLB : public BaseTLB
bool openBackPre;
bool backPrePrecision;
bool forwardPrePrecision;
uint64_t controlNum;
uint64_t allForwardPre;
uint64_t removeNoUseForwardPre;
uint64_t removeNoUseBackPre;
Expand Down
7 changes: 7 additions & 0 deletions src/cpu/o3/BaseO3CPU.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,15 @@ def support_take_over(cls):
LQEntries = Param.Unsigned(80, "Number of load queue entries")
SQEntries = Param.Unsigned(64, "Number of store queue entries")

LdPipeStages = Param.Unsigned(4, "Number of load pipeline stages")
StPipeStages = Param.Unsigned(5, "Number of store pipeline stages")

SbufferEntries = Param.Unsigned(16, "Number of store buffer entries")
SbufferEvictThreshold = Param.Unsigned(7, "store buffer eviction threshold")
storeBufferInactiveThreshold = Param.Unsigned(800, "store buffer writeback timeout threshold")

StoreWbStage = Param.Unsigned(4, "Which PipeLine Stage store instruction writeback, 4 means S4")

LSQDepCheckShift = Param.Unsigned(0,
"Number of places to shift addr before check")
LSQCheckLoads = Param.Bool(True,
Expand All @@ -188,6 +193,8 @@ def support_take_over(cls):
LFSTEntrySize = Param.Unsigned(4,"The number of store table inst in every entry of LFST can contain")
SSITSize = Param.Unsigned(8192, "Store set ID table size")
BankConflictCheck = Param.Bool(True, "open Bank conflict check")
EnableLdMissReplay = Param.Bool(True, "Replay Cache missed load instrution from ReplayQ if True")
EnablePipeNukeCheck = Param.Bool(True, "Replay load if Raw violation is detected in loadPipe if True")


numRobs = Param.Unsigned(1, "Number of Reorder Buffers");
Expand Down
1 change: 1 addition & 0 deletions src/cpu/o3/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ if env['CONF']['TARGET_ISA'] != 'null':
DebugFlag('IQ')
DebugFlag('LSQ')
DebugFlag('LSQUnit')
DebugFlag('Hint')
DebugFlag('TagReadFail')
DebugFlag('StoreBuffer')
DebugFlag('MemDepUnit')
Expand Down
22 changes: 22 additions & 0 deletions src/cpu/o3/dyn_inst.hh
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ class DynInst : public ExecContext, public RefCounted
NotAnInst,
TranslationStarted,
TranslationCompleted,
WaitingCacheRefill,
HasPendingCacheReq,
PossibleLoadViolation,
HitExternalSnoop,
EffAddrValid,
Expand Down Expand Up @@ -462,6 +464,22 @@ class DynInst : public ExecContext, public RefCounted
}
void translationCompleted(bool f) { instFlags[TranslationCompleted] = f; }

/** True if inst is waiting for Dcache refill. */
bool
waitingCacheRefill() const
{
return instFlags[WaitingCacheRefill];
}
/** Sets the WaitingCacheRefill flag to f. */
void waitingCacheRefill(bool f) { instFlags[WaitingCacheRefill] = f; }

/** True if inst has a pending cache request. */
bool
hasPendingCacheReq() const
{
return instFlags[HasPendingCacheReq];
}
/** Sets the HasPendingCacheReq flag to f. */
void hasPendingCacheReq(bool f) { instFlags[HasPendingCacheReq] = f; }

/** True if this address was found to match a previous load and they issued
* out of order. If that happend, then it's only a problem if an incoming
* snoop invalidate modifies the line, in which case we need to squash.
Expand Down Expand Up @@ -1395,6 +1413,10 @@ class DynInst : public ExecContext, public RefCounted
return squashVer.getVersion();
}

/** Returns lqIdx (presumably this inst's load queue index -- confirm). */
ssize_t getLqIdx()
{
return lqIdx;
}

Addr getPC()
{
Expand Down
Loading

0 comments on commit be9283c

Please sign in to comment.