Skip to content

Commit be9283c

Browse files
authored
Merge branch 'xs-dev' into add-stats-override-bubble
2 parents 893f614 + 49749b6 commit be9283c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1923
-380
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,4 +45,6 @@ run_fs.sh
4545
.vscode/
4646
llvm-pgo/
4747
*.profdata
48-
*.profraw
48+
*.profraw
49+
*nemu*
50+
ready-to-run/

README.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,12 @@ Notes:
171171
- If you have already built GEM5, you should rebuild GEM5 after installing DRAMSim3
172172
- If simulating the Xiangshan system, use DRAMSim3 with our customized config
173173

174+
Use init.sh to clone and build DRAMSim3.
175+
176+
```shell
177+
bash ./init.sh
178+
```
179+
174180
## Build GEM5
175181

176182
```shell
@@ -235,7 +241,30 @@ Simulation error without Difftest **will NOT be responded.**
235241

236242
### Example command
237243

238-
Firstly, one should ensure GEM5 is properly built and workloads are prepared by running a single workload:
244+
#### Easy to run
245+
The easiest way to get started is to run a single workload (a single binary file, not a checkpoint):
246+
247+
```shell
248+
# prepare the binary file
249+
git clone https://github.com/OpenXiangShan/ready-to-run.git
250+
# prepare nemu reference design
251+
wget https://github.com/OpenXiangShan/GEM5/releases/download/2024-10-16/riscv64-nemu-interpreter-c1469286ca32-so
252+
# set environment variables
253+
export GCBV_REF_SO=`realpath riscv64-nemu-interpreter-c1469286ca32-so`
254+
# run the workload
255+
./build/RISCV/gem5.opt ./configs/example/xiangshan.py --raw-cpt --generic-rv-cpt=./ready-to-run/coremark-2-iteration.bin
256+
# get the ipc
257+
grep 'cpu.ipc' m5out/stats.txt
258+
```
259+
xiangshan.py is the default configuration for XS-GEM5.
260+
261+
raw-cpt means the input is a single binary file.
262+
263+
generic-rv-cpt is the path to the binary file.
264+
265+
You can then see the simulation output in the terminal and find the gem5 output files in the `m5out` directory.
266+
267+
Otherwise, if you want to run a checkpoint, first ensure that GEM5 is properly built and that the workloads are prepared by running a single workload:
239268
``` shell
240269
mkdir util/xs_scripts/example
241270
cd util/xs_scripts/example

configs/common/Caches.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ class L2Cache(Cache):
104104
cache_level = 2
105105
enable_wayprediction = False
106106

107+
slice_num = 4
108+
107109
class L3Cache(Cache):
108110
mshrs = 64
109111
tgts_per_mshr = 20
@@ -159,6 +161,7 @@ class L1ToL2Bus(CoherentXBar):
159161
forward_latency = 3 # l1 -> l2 req/snoop latency
160162
response_latency = 3 # l2 -> l1 resp latency
161163
snoop_response_latency = 1
164+
hint_wakeup_ahead_cycles = 2 # send Hint to L1 N cycles in advance with TimingResp
162165

163166
# Use a snoop-filter by default, and set the latency to zero as
164167
# the lookup is assumed to overlap with the frontend latency of

configs/common/FSConfig.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -670,7 +670,6 @@ def makeBareMetalXiangshanSystem(mem_mode, mdesc=None, cmdline=None, np=1, ruby=
670670
self.iobus = IOXBar()
671671
if not ruby:
672672
self.membus = MemBus()
673-
self.membus.width = 32
674673

675674
self.bridge = Bridge(delay='50ns')
676675
self.bridge.mem_side_port = self.iobus.cpu_side_ports

configs/example/xiangshan.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -337,14 +337,17 @@ def setKmhV3IdealParams(args, system):
337337
cpu.mmu.itb.size = 96
338338

339339
cpu.BankConflictCheck = False # real bank conflict 0.2 score
340+
cpu.EnableLdMissReplay = False
341+
cpu.EnablePipeNukeCheck = False
342+
cpu.StoreWbStage = 2 # store writeback at s2
340343

341-
cpu.scheduler = IdealScheduler()
344+
cpu.scheduler = IdealScheduler()
342345
# use centralized load/store issue queue, for hmmer
343346

344347
# ideal decoupled frontend
345348
if args.bp_type is None or args.bp_type == 'DecoupledBPUWithFTB':
346349
cpu.branchPred.enableTwoTaken = True
347-
cpu.branchPred.numBr = 6
350+
cpu.branchPred.numBr = 8 # numBr must be a power of 2, see getShuffledBrIndex()
348351
cpu.branchPred.predictWidth = 64
349352
cpu.branchPred.tage.enableSC = False # TODO(bug): When numBr changes, enabling SC will trigger an assert
350353
cpu.branchPred.ftq_size = 256
@@ -353,12 +356,12 @@ def setKmhV3IdealParams(args, system):
353356
cpu.branchPred.uftb.numEntries = uftb_size
354357
cpu.branchPred.uftb.numWays = uftb_size
355358
cpu.branchPred.ftb.numEntries = 16384
356-
cpu.branchPred.tage.numPredictors = 9
357-
cpu.branchPred.tage.baseTableSize = 4096
358-
cpu.branchPred.tage.tableSizes = [4096] * 9
359-
cpu.branchPred.tage.TTagBitSizes = [8] * 9
360-
cpu.branchPred.tage.TTagPcShifts = [1] * 9
361-
cpu.branchPred.tage.histLengths = [8, 13, 21, 35, 57, 93, 151, 246, 401]
359+
cpu.branchPred.tage.numPredictors = 14
360+
cpu.branchPred.tage.baseTableSize = 16384
361+
cpu.branchPred.tage.tableSizes = [2048] * 14
362+
cpu.branchPred.tage.TTagBitSizes = [13] * 14
363+
cpu.branchPred.tage.TTagPcShifts = [1] * 14
364+
cpu.branchPred.tage.histLengths = [4, 7, 12, 16, 21, 29, 38, 51, 68, 90, 120, 160, 283, 499]
362365

363366
# ideal l1 caches
364367
if args.caches:
@@ -376,6 +379,7 @@ def setKmhV3IdealParams(args, system):
376379
system.l2_caches[i].slice_num = 0 # 4 -> 0, no slice
377380
system.tol2bus_list[i].forward_latency = 0 # 3->0
378381
system.tol2bus_list[i].response_latency = 0 # 3->0
382+
system.tol2bus_list[i].hint_wakeup_ahead_cycles = 0 # 2->0
379383

380384
if args.l3cache:
381385
system.l3.enable_wayprediction = False

init.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
2+
3+
export GEM5_HOME=$(pwd)
4+
5+
# build DRAMSim
6+
cd ext/dramsim3
7+
git clone https://github.com/umd-memsys/DRAMsim3.git DRAMsim3
8+
cd DRAMsim3 && mkdir -p build
9+
cd build
10+
cmake ..
11+
make -j 48
12+
cd $GEM5_HOME

src/arch/riscv/isa/decoder.isa

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -679,7 +679,7 @@ decode QUADRANT default Unknown::unknown() {
679679
0x03: decode FUNCT3 {
680680
format FenceOp {
681681
0x0: fence({{
682-
}}, uint64_t, IsReadBarrier, IsWriteBarrier, No_OpClass);
682+
}}, uint64_t, IsReadBarrier, IsWriteBarrier, MemReadOp);
683683
0x1: fence_i({{
684684
}}, uint64_t, IsNonSpeculative, IsSerializeAfter, No_OpClass);
685685
}

src/arch/riscv/isa/formats/amo.isa

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,36 @@ def template LRSCMacroConstructor {{
151151
}
152152
}};
153153

154+
// Strictly order-preserving LRSC
155+
def template LRSCStrictMacroConstructor {{
156+
%(class_name)s::%(class_name)s(ExtMachInst machInst):
157+
%(base_class)s("%(mnemonic)s", machInst, %(op_class)s)
158+
{
159+
%(constructor)s;
160+
161+
StaticInstPtr rel_fence;
162+
StaticInstPtr lrsc;
163+
StaticInstPtr acq_fence;
164+
165+
rel_fence = new MemFenceMicro(machInst, No_OpClass);
166+
rel_fence->setFlag(IsFirstMicroop);
167+
rel_fence->setFlag(IsReadBarrier);
168+
rel_fence->setFlag(IsWriteBarrier);
169+
rel_fence->setFlag(IsDelayedCommit);
170+
171+
// set up atomic rmw op
172+
lrsc = new %(class_name)sMicro(machInst, this);
173+
lrsc->setFlag(IsDelayedCommit);
174+
175+
acq_fence = new MemFenceMicro(machInst, No_OpClass);
176+
acq_fence->setFlag(IsLastMicroop);
177+
acq_fence->setFlag(IsReadBarrier);
178+
acq_fence->setFlag(IsWriteBarrier);
179+
180+
microops = {rel_fence, lrsc, acq_fence};
181+
}
182+
}};
183+
154184
def template LRSCMicroConstructor {{
155185
%(class_name)s::%(class_name)sMicro::%(class_name)sMicro(
156186
ExtMachInst machInst, %(class_name)s *_p)
@@ -435,7 +465,7 @@ def format LoadReserved(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}},
435465
macro_iop = InstObjParams(name, Name, 'LoadReserved', macro_ea_code,
436466
macro_inst_flags)
437467
header_output = LRSCDeclare.subst(macro_iop)
438-
decoder_output = LRSCMacroConstructor.subst(macro_iop)
468+
decoder_output = LRSCStrictMacroConstructor.subst(macro_iop)
439469
decode_block = BasicDecode.subst(macro_iop)
440470

441471
exec_output = ''
@@ -463,7 +493,7 @@ def format StoreCond(memacc_code, postacc_code={{ }}, ea_code={{EA = Rs1;}},
463493
macro_iop = InstObjParams(name, Name, 'StoreCond', macro_ea_code,
464494
macro_inst_flags)
465495
header_output = LRSCDeclare.subst(macro_iop)
466-
decoder_output = LRSCMacroConstructor.subst(macro_iop)
496+
decoder_output = LRSCStrictMacroConstructor.subst(macro_iop)
467497
decode_block = BasicDecode.subst(macro_iop)
468498

469499
exec_output = ''

src/arch/riscv/tlb.cc

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ TLB::TLB(const Params &p) :
9191
openBackPre(p.open_back_pre),
9292
backPrePrecision(p.initial_back_pre_precision_value),
9393
forwardPrePrecision(p.initial_forward_pre_precision_value),
94+
controlNum(0),
9495
allForwardPre(0),removeNoUseForwardPre(0),removeNoUseBackPre(0),
9596
usedBackPre(0),test_num(0),allUsed(0),forwardUsedPre(0),
9697
lastVaddr(0),lastPc(0), traceFlag(false),
@@ -1980,6 +1981,9 @@ TLB::translate(const RequestPtr &req, ThreadContext *tc,
19801981
req->setFlags(Request::PHYSICAL);
19811982

19821983
Fault fault;
1984+
if (req->getVaddr() == 0)
1985+
warn("notice vaddr == 0 pc %lx \n", req->getPC());
1986+
19831987
if (req->getFlags() & Request::PHYSICAL) {
19841988
req->setTwoStageState(false, 0, 0);
19851989
/**
@@ -1989,8 +1993,6 @@ TLB::translate(const RequestPtr &req, ThreadContext *tc,
19891993
if ((hgatp.mode == 8 || vsatp.mode == 8) && (pmode < PrivilegeMode::PRV_M)) {
19901994
fault = doTwoStageTranslate(req, tc, translation, mode, delayed);
19911995
} else {
1992-
if (req->getVaddr() == 0)
1993-
warn("vaddr ==0 pc %lx \n", req->getPC());
19941996
req->setPaddr(req->getVaddr());
19951997
fault = NoFault;
19961998
assert(!req->get_h_inst());
@@ -2006,6 +2008,12 @@ TLB::translate(const RequestPtr &req, ThreadContext *tc,
20062008
fault = doTwoStageTranslate(req, tc, translation, mode, delayed);
20072009
} else {
20082010
req->setTwoStageState(false, 0, 0);
2011+
if (controlNum < 5) {
2012+
uint16_t control_address = (req->getVaddr() >> VADDR_BITS) & VPN_MASK;
2013+
if ((control_address != VPN_MASK) && (control_address != 0))
2014+
warn("notice sv48,the vaddr %lx may valid sv48\n", req->getVaddr());
2015+
}
2016+
controlNum++;
20092017
fault = doTranslate(req, tc, translation, mode, delayed);
20102018
}
20112019
}

src/arch/riscv/tlb.hh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ class TLB : public BaseTLB
8787
bool openBackPre;
8888
bool backPrePrecision;
8989
bool forwardPrePrecision;
90+
uint64_t controlNum;
9091
uint64_t allForwardPre;
9192
uint64_t removeNoUseForwardPre;
9293
uint64_t removeNoUseBackPre;

src/cpu/o3/BaseO3CPU.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,10 +171,15 @@ def support_take_over(cls):
171171
LQEntries = Param.Unsigned(80, "Number of load queue entries")
172172
SQEntries = Param.Unsigned(64, "Number of store queue entries")
173173

174+
LdPipeStages = Param.Unsigned(4, "Number of load pipeline stages")
175+
StPipeStages = Param.Unsigned(5, "Number of store pipeline stages")
176+
174177
SbufferEntries = Param.Unsigned(16, "Number of store buffer entries")
175178
SbufferEvictThreshold = Param.Unsigned(7, "store buffer eviction threshold")
176179
storeBufferInactiveThreshold = Param.Unsigned(800, "store buffer writeback timeout threshold")
177180

181+
StoreWbStage = Param.Unsigned(4, "Which PipeLine Stage store instruction writeback, 4 means S4")
182+
178183
LSQDepCheckShift = Param.Unsigned(0,
179184
"Number of places to shift addr before check")
180185
LSQCheckLoads = Param.Bool(True,
@@ -188,6 +193,8 @@ def support_take_over(cls):
188193
LFSTEntrySize = Param.Unsigned(4,"The number of store table inst in every entry of LFST can contain")
189194
SSITSize = Param.Unsigned(8192, "Store set ID table size")
190195
BankConflictCheck = Param.Bool(True, "open Bank conflict check")
196+
EnableLdMissReplay = Param.Bool(True, "Replay Cache missed load instrution from ReplayQ if True")
197+
EnablePipeNukeCheck = Param.Bool(True, "Replay load if Raw violation is detected in loadPipe if True")
191198

192199

193200
numRobs = Param.Unsigned(1, "Number of Reorder Buffers");

src/cpu/o3/SConscript

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ if env['CONF']['TARGET_ISA'] != 'null':
6565
DebugFlag('IQ')
6666
DebugFlag('LSQ')
6767
DebugFlag('LSQUnit')
68+
DebugFlag('Hint')
6869
DebugFlag('TagReadFail')
6970
DebugFlag('StoreBuffer')
7071
DebugFlag('MemDepUnit')

src/cpu/o3/dyn_inst.hh

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,8 @@ class DynInst : public ExecContext, public RefCounted
194194
NotAnInst,
195195
TranslationStarted,
196196
TranslationCompleted,
197+
WaitingCacheRefill,
198+
HasPendingCacheReq,
197199
PossibleLoadViolation,
198200
HitExternalSnoop,
199201
EffAddrValid,
@@ -462,6 +464,22 @@ class DynInst : public ExecContext, public RefCounted
462464
}
463465
void translationCompleted(bool f) { instFlags[TranslationCompleted] = f; }
464466

467+
/** True if inst is waiting for Dcache refill. */
468+
bool
469+
waitingCacheRefill() const
470+
{
471+
return instFlags[WaitingCacheRefill];
472+
}
473+
void waitingCacheRefill(bool f) { instFlags[WaitingCacheRefill] = f; }
474+
475+
/** True if inst has a pending cache request. */
476+
bool
477+
hasPendingCacheReq() const
478+
{
479+
return instFlags[HasPendingCacheReq];
480+
}
481+
void hasPendingCacheReq(bool f) { instFlags[HasPendingCacheReq] = f; }
482+
465483
/** True if this address was found to match a previous load and they issued
466484
* out of order. If that happend, then it's only a problem if an incoming
467485
* snoop invalidate modifies the line, in which case we need to squash.
@@ -1395,6 +1413,10 @@ class DynInst : public ExecContext, public RefCounted
13951413
return squashVer.getVersion();
13961414
}
13971415

1416+
ssize_t getLqIdx()
1417+
{
1418+
return lqIdx;
1419+
}
13981420

13991421
Addr getPC()
14001422
{

0 commit comments

Comments
 (0)