From c030f219a40287dd65c6d18f04e5042f5a62341a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=91=A8=E8=80=80=E9=98=B3=20=28Zhou=20Yaoyang=29?= Date: Wed, 4 Dec 2024 16:12:19 +0800 Subject: [PATCH] feat(memory): add mem bandwidth benchmark (#45) - We are assuming a well-tuned prefetcher Co-authored-by: jueshiwenli <275626310@qq.com> --- apps/mem_test/mem_test_bw/Makefile | 3 + apps/mem_test/mem_test_bw/README.md | 47 ++++++++++ apps/mem_test/mem_test_bw/mem_test_bw.c | 116 ++++++++++++++++++++++++ 3 files changed, 166 insertions(+) create mode 100644 apps/mem_test/mem_test_bw/Makefile create mode 100644 apps/mem_test/mem_test_bw/README.md create mode 100644 apps/mem_test/mem_test_bw/mem_test_bw.c diff --git a/apps/mem_test/mem_test_bw/Makefile b/apps/mem_test/mem_test_bw/Makefile new file mode 100644 index 00000000..7e3b8f46 --- /dev/null +++ b/apps/mem_test/mem_test_bw/Makefile @@ -0,0 +1,3 @@ +NAME = mem_test_bw +SRCS = mem_test_bw.c +include $(AM_HOME)/Makefile.app \ No newline at end of file diff --git a/apps/mem_test/mem_test_bw/README.md b/apps/mem_test/mem_test_bw/README.md new file mode 100644 index 00000000..74aaf41c --- /dev/null +++ b/apps/mem_test/mem_test_bw/README.md @@ -0,0 +1,47 @@ +# build bmk + +Remember to set `AM_HOME` to the root of this repo. + +Clear old build +``` +rm -rf $AM_HOME/am/build build +``` + +Build bandwidth bmk: +``` +make ARCH=riscv64-xs +``` + +# Build DUT with well-tuned prefetcher + +To fully utilize the memory bandwidth, this benchmark assumes well-tuned spatial prefetchers, such as +a stream prefetcher, a stride prefetcher, or BOP. + +If the prefetcher is not well-tuned, this benchmark merely stresses other parts of the CPU, +such as the instruction window size or the MSHR count, +and its result is not a meaningful memory bandwidth figure. 
+
+# Run
+
+With NEMU to test building correctness:
+```
+$NEMU_HOME/build/riscv64-nemu-interpreter -b build/mem_test_bw-riscv64-xs.bin
+```
+
+Then test it with Xiangshan:
+```
+/path/to/xiangshan/build/emu -i build/mem_test_bw-riscv64-xs.bin
+```
+
+Xiangshan with single channel DDR4 3200 configuration is expected to print something like
+```
+start ddr test
+mem band width 7.145648 B/cycle (2000 samples) inst 6016, checksum=0
+Core 0: HIT GOOD TRAP at pc = 0x80000288
+```
+
+# Compute bandwidth
+
+Memory bandwidth = 7.145648 B/cycle * 3GHz = 21.435 GB/s
+
+3GHz is the simulated frequency of CPU
diff --git a/apps/mem_test/mem_test_bw/mem_test_bw.c b/apps/mem_test/mem_test_bw/mem_test_bw.c
new file mode 100644
index 00000000..bb2ed0d4
--- /dev/null
+++ b/apps/mem_test/mem_test_bw/mem_test_bw.c
@@ -0,0 +1,139 @@
+// Memory bandwidth micro-benchmark: issues one 8-byte load per cache line
+// over a sequential address range and prints the achieved bytes per cycle.
+
+// NOTE(review): the two header names below were lost when this patch was
+// extracted; <klib.h> and <csr.h> are presumed -- confirm against the repo.
+#include <klib.h>
+#include <csr.h>
+
+// config
+// #define PERF_SIM // probe runs in simulator, disable perf counters
+
+// perf const
+#define BYTE (1)
+#define KB (1024*BYTE)
+#define MB (1024*KB)
+#define GB (1024*MB)
+
+// platform dependent const
+// Each base address has its own guard so either can be overridden from the
+// build flags without leaving the other undefined.
+#ifndef _PERF_TEST_ADDR_BASE
+#define _PERF_TEST_ADDR_BASE 0x80400000
+// #define _PERF_TEST_ADDR_BASE 0x2000400000
+#endif
+#ifndef _PERF_TEST_ADDR_BASE1
+#define _PERF_TEST_ADDR_BASE1 0x80100000
+#endif
+#define _PERF_CACHELINE_SIZE_BYTE (64 * BYTE)
+#define _PERF_CACHELINE_SIZE_BYTE_STEP (8 * BYTE)
+#define _PERF_PAGE_SIZE_BYTE (4 * KB)
+#define _PERF_L1_NOALIAS_SIZE_BYTE (16 * KB)
+#define _PERF_L1_SIZE_BYTE (64 * KB)
+#define _PERF_L2_SIZE_BYTE (1 * MB)
+#define _PERF_L3_SIZE_BYTE (6 * MB)
+#define _PERF_MEM_SIZE_BYTE (1024 * MB)
+#define _PERF_L1_NUM_WAYS 4
+#define _PERF_L1_NUM_SETS 256
+#define _PERF_L2_NUM_WAYS 8
+#define _PERF_L2_NUM_SLICES 4
+#define _PERF_L2_NUM_SETS 512
+
+#define _PERF_ADDR_STRIDE_L1_SAME_BANK _PERF_CACHELINE_SIZE_BYTE
+#define _PERF_ADDR_STRIDE_L1_SAME_SET (_PERF_L1_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
+#define _PERF_ADDR_STRIDE_L2_SAME_SLICE (_PERF_L2_NUM_SLICES * _PERF_CACHELINE_SIZE_BYTE)
+#define _PERF_ADDR_STRIDE_L2_SAME_SET (_PERF_L2_NUM_SLICES * _PERF_L2_NUM_SETS * _PERF_CACHELINE_SIZE_BYTE)
+#define _PERF_ADDR_STRIDE_NEXT_PAGE (_PERF_PAGE_SIZE_BYTE)
+
+// probe const
+// Address the accumulated checksum is written to at the end of a run.
+#define _PERF_BLACKHOLE _PERF_TEST_ADDR_BASE
+
+// Unroll factor of the measurement loop; keep in sync with the
+// `#pragma GCC unroll` in legacy_test_mem_throughput().
+#define STEP 4
+
+/*
+ * Load `iter` consecutive cache lines starting at _PERF_TEST_ADDR_BASE and
+ * print the measured bandwidth.
+ *
+ * iter: number of 8-byte loads to issue, one per cache line; must be a
+ *       multiple of STEP (asserted).
+ *
+ * Cycle and instruction counts are taken from csr_read(CSR_MCYCLE) and
+ * csr_read(CSR_MINSTRET) before and after the loop. The sum of the loaded
+ * words is stored to _PERF_BLACKHOLE and printed as the checksum.
+ */
+void legacy_test_mem_throughput(uint64_t iter)
+{
+    uint64_t remain = iter;
+    uint64_t result = 0;
+    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
+
+    // Checked before the counters are sampled so it is not part of the timing.
+    assert(iter % STEP == 0);
+
+    uint64_t cycle_1 = csr_read(CSR_MCYCLE);
+    uint64_t inst_1 = csr_read(CSR_MINSTRET);
+
+    // Unroll this loop by four times to avoid fragmenting instruction supply.
+    // Post-decrement so exactly `iter` lines are loaded (none when iter is 0),
+    // matching the `iter` used in the bandwidth computation below.
+#pragma GCC unroll 4
+    while (remain--) {
+        uint64_t * restrict ptr = (uint64_t *)access_addr;
+        result += *ptr;
+        access_addr += _PERF_CACHELINE_SIZE_BYTE;
+    }
+
+    uint64_t cycle_2 = csr_read(CSR_MCYCLE);
+    uint64_t inst_2 = csr_read(CSR_MINSTRET);
+    uint64_t cycle = cycle_2 - cycle_1;
+    uint64_t inst = inst_2 - inst_1;
+
+    *(uint64_t *)_PERF_BLACKHOLE = result;
+    // Cast each argument to the type its conversion specifier expects.
+    printf("mem band width %f B/cycle (%d samples) inst %ld, checksum=%lx\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / cycle, (int)iter, (long)inst, (unsigned long)result);
+}
+
+/*
+ * Same measurement as legacy_test_mem_throughput(), but starting at
+ * _PERF_TEST_ADDR_BASE1 and without printing the checksum.
+ *
+ * iter: number of 8-byte loads to issue, one per cache line.
+ *
+ * NOTE(review): despite the name, the stride is one cache line rather than
+ * one of the _PERF_ADDR_STRIDE_*_SAME_SET values -- confirm the intent.
+ */
+void legacy_test_mem_throughput_same_set(uint64_t iter)
+{
+    uint64_t remain = iter;
+    uint64_t result = 0;
+    uint64_t access_addr = _PERF_TEST_ADDR_BASE1;
+
+    uint64_t cycle_1 = csr_read(CSR_MCYCLE);
+    uint64_t inst_1 = csr_read(CSR_MINSTRET);
+
+    while (remain--) {
+        result += *(uint64_t *)access_addr;
+        access_addr += _PERF_CACHELINE_SIZE_BYTE;
+    }
+
+    uint64_t cycle_2 = csr_read(CSR_MCYCLE);
+    uint64_t inst_2 = csr_read(CSR_MINSTRET);
+    uint64_t cycle = cycle_2 - cycle_1;
+    uint64_t inst = inst_2 - inst_1;
+
+    *(uint64_t *)_PERF_BLACKHOLE = result;
+    // Cast each argument to the type its conversion specifier expects.
+    printf("mem band width %f B/cycle (%d samples) inst %ld\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / cycle, (int)iter, (long)inst);
+}
+
+// Entry point: runs the sequential bandwidth test with 2000 samples.
+int main(){
+    printf("start ddr test\n");
+    legacy_test_mem_throughput(2000);
+    // legacy_test_mem_throughput_same_set(500);
+
+    return 0;
+}