Skip to content

Commit

Permalink
feat(memory): add mem bandwidth benchmark (#45)
Browse files Browse the repository at this point in the history
- We are assuming a well-tuned prefetcher

Co-authored-by: jueshiwenli <[email protected]>
  • Loading branch information
shinezyy and jueshiwenli authored Dec 4, 2024
1 parent 015fa83 commit c030f21
Show file tree
Hide file tree
Showing 3 changed files with 166 additions and 0 deletions.
3 changes: 3 additions & 0 deletions apps/mem_test/mem_test_bw/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Name of the application being built.
NAME = mem_test_bw
# C sources that make up the application.
SRCS = mem_test_bw.c
# Shared application build rules; AM_HOME must point at the root of this repo.
include $(AM_HOME)/Makefile.app
47 changes: 47 additions & 0 deletions apps/mem_test/mem_test_bw/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Build the benchmark

Remember to set `AM_HOME` to the root of this repo.

Clear the old build:
```
rm -rf $AM_HOME/am/build build
```

Build the bandwidth benchmark:
```
make ARCH=riscv64-xs
```

# Build DUT with well-tuned prefetcher

To fully utilize the memory bandwidth, this benchmark assumes well-tuned spatial prefetchers, such as
a stream prefetcher, a stride prefetcher, or BOP.

If the prefetcher is not well-tuned, this benchmark merely stresses other parts of the CPU,
such as the instruction window size and the MSHR count,
and its results are meaningless as a bandwidth measurement.

# Run

First run it on NEMU to check that the build is correct:
```
$NEMU_HOME/build/riscv64-nemu-interpreter -b build/mem_test_bw-riscv64-xs.bin
```

Then test it with Xiangshan:
```
/path/to/xiangshan/build/emu -i build/mem_test_bw-riscv64-xs.bin
```

Xiangshan with a single-channel DDR4-3200 configuration is expected to print something like:
```
start ddr test
mem band width 7.145648 B/cycle (2000 samples) inst 6016, checksum=0
Core 0: HIT GOOD TRAP at pc = 0x80000288
```

# Compute bandwidth

Memory bandwidth = 7.145648 B/cycle * 3 GHz ≈ 21.437 GB/s

Here 3 GHz is the simulated CPU frequency.
116 changes: 116 additions & 0 deletions apps/mem_test/mem_test_bw/mem_test_bw.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
//#ifndef PROBE_H
//#define PROBE_H

#include <klib.h>
#include <csr.h>
//#include "bitutils.h"
//#include "resultmat.h"

// config
// #define PERF_SIM // probe runs in simulator, disable perf counters

// perf const: size units, expressed in bytes
#define BYTE (1)
#define KB (1024*BYTE)
#define MB (1024*KB)
#define GB (1024*MB)

// platform dependent const
// Base addresses of the regions the benchmarks read from. Each one has its
// own guard so that overriding _PERF_TEST_ADDR_BASE from the build does not
// leave _PERF_TEST_ADDR_BASE1 undefined.
#ifndef _PERF_TEST_ADDR_BASE
#define _PERF_TEST_ADDR_BASE 0x80400000
// #define _PERF_TEST_ADDR_BASE 0x2000400000
#endif
#ifndef _PERF_TEST_ADDR_BASE1
#define _PERF_TEST_ADDR_BASE1 0x80100000
#endif

// Cache / memory geometry used by the probes.
// NOTE(review): these values describe one particular DUT configuration;
// confirm them against the design actually being measured.
#define _PERF_CACHELINE_SIZE_BYTE (64 * BYTE)
#define _PERF_CACHELINE_SIZE_BYTE_STEP (8 * BYTE)
#define _PERF_PAGE_SIZE_BYTE (4 * KB)
#define _PERF_L1_NOALIAS_SIZE_BYTE (16 * KB)
#define _PERF_L1_SIZE_BYTE (64 * KB)
#define _PERF_L2_SIZE_BYTE (1 * MB)
#define _PERF_L3_SIZE_BYTE (6 * MB)
#define _PERF_MEM_SIZE_BYTE (1024 * MB)
#define _PERF_L1_NUM_WAYS 4
#define _PERF_L1_NUM_SETS 256
#define _PERF_L2_NUM_WAYS 8
#define _PERF_L2_NUM_SLICES 4
#define _PERF_L2_NUM_SETS 512

// Address strides derived from the geometry above.
#define _PERF_ADDR_STRIDE_L1_SAME_BANK _PERF_CACHELINE_SIZE_BYTE
#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
#define _PERF_ADDR_STRIDE_L2_SAME_SLICE (_PERF_L2_NUM_SLICES * _PERF_CACHELINE_SIZE_BYTE)
#define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SLICES * _PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
#define _PERF_ADDR_STRIDE_NEXT_PAGE (_PERF_PAGE_SIZE_BYTE)

// probe const
// Address the benchmarks store their accumulated result to, so the loads
// feeding that result stay live.
#define _PERF_BLACKHOLE _PERF_TEST_ADDR_BASE

/*
 * Measure read bandwidth over consecutive cache lines.
 *
 * Loads one 64-bit word from each of `iter` addresses, starting at
 * _PERF_TEST_ADDR_BASE and advancing by _PERF_CACHELINE_SIZE_BYTE per load.
 * The loop is bracketed by reads of the mcycle and minstret CSRs, and the
 * function prints bytes per cycle (counting one full cache line per load),
 * the sample count, the retired-instruction delta and the sum of the loaded
 * words.
 *
 * iter: number of loads to issue; must be a multiple of STEP. Zero is
 *       accepted and performs no loads.
 */
void legacy_test_mem_throughput(uint64_t iter)
{
    uint64_t result = 0;
    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
    uint64_t cycle_1;
    uint64_t cycle_2;
    uint64_t inst_1;
    uint64_t inst_2;
    uint64_t cycle;
    uint64_t inst;

    // Unroll this loop by four times to avoid fragmenting instruction supply
#define STEP 4
    // Checked before the counters are sampled so it is not part of the
    // measured region.
    assert(iter % STEP == 0);

    cycle_1 = csr_read(CSR_MCYCLE);
    inst_1 = csr_read(CSR_MINSTRET);

    // Exactly `iter` loads: the reported bandwidth is computed from `iter`,
    // so the loop count has to match it (and iter == 0 must not wrap around).
#pragma GCC unroll 4
    for (uint64_t remain = iter; remain != 0; remain--) {
        const uint64_t *ptr = (const uint64_t *)access_addr;
        result += *ptr;
        access_addr += _PERF_CACHELINE_SIZE_BYTE;
    }

    cycle_2 = csr_read(CSR_MCYCLE);
    inst_2 = csr_read(CSR_MINSTRET);

    cycle = cycle_2 - cycle_1;
    inst = inst_2 - inst_1;

    // Store the sum so the loads above cannot be discarded as dead code.
    *(uint64_t *)_PERF_BLACKHOLE = result;

    // Arguments are cast to the exact types the conversion specifiers expect.
    printf("mem band width %f B/cycle (%ld samples) inst %ld, checksum=%lx\n",
           (float)iter * _PERF_CACHELINE_SIZE_BYTE / cycle,
           (long)iter, (long)inst, (unsigned long)result);
}

/*
 * Measure read bandwidth over the region starting at _PERF_TEST_ADDR_BASE1.
 *
 * Loads one 64-bit word from each of `iter` addresses, advancing by
 * _PERF_CACHELINE_SIZE_BYTE per load, with the loop bracketed by reads of
 * the mcycle and minstret CSRs. Prints bytes per cycle (counting one full
 * cache line per load), the sample count and the retired-instruction delta.
 *
 * NOTE(review): despite the "same_set" name the stride is one cache line,
 * not _PERF_ADDR_STRIDE_L1_SAME_SET -- confirm which access pattern is
 * intended before relying on this variant.
 *
 * iter: number of loads to issue.
 */
void legacy_test_mem_throughput_same_set(uint64_t iter)
{
    uint64_t result = 0;
    uint64_t access_addr = _PERF_TEST_ADDR_BASE1;
    uint64_t cycle_1;
    uint64_t cycle_2;
    uint64_t inst_1;
    uint64_t inst_2;
    uint64_t cycle;
    uint64_t inst;

    cycle_1 = csr_read(CSR_MCYCLE);
    inst_1 = csr_read(CSR_MINSTRET);

    for (uint64_t remain = iter; remain != 0; remain--) {
        result += *(const uint64_t *)access_addr;
        access_addr += _PERF_CACHELINE_SIZE_BYTE;
    }

    cycle_2 = csr_read(CSR_MCYCLE);
    inst_2 = csr_read(CSR_MINSTRET);

    cycle = cycle_2 - cycle_1;
    inst = inst_2 - inst_1;

    // Store the sum so the loads above cannot be discarded as dead code.
    *(uint64_t *)_PERF_BLACKHOLE = result;

    // Arguments are cast to the exact types the conversion specifiers expect.
    printf("mem band width %f B/cycle (%ld samples) inst %ld\n",
           (float)iter * _PERF_CACHELINE_SIZE_BYTE / cycle,
           (long)iter, (long)inst);
}

/*
 * Benchmark entry point: prints a start banner and runs the consecutive
 * cache-line bandwidth test with 2000 samples.
 */
int main(){
    printf("start ddr test\n");
    legacy_test_mem_throughput(2000);
    // Alternative variant, disabled by default:
    // legacy_test_mem_throughput_same_set(500);

    // Return explicitly: the implicit "return 0" at the end of main is only
    // guaranteed in a hosted environment.
    return 0;
}

0 comments on commit c030f21

Please sign in to comment.