-
Notifications
You must be signed in to change notification settings - Fork 84
Add only TLB support #118
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Add only TLB support #118
Changes from all commits
0c81128
bed64bb
18dc15a
10caaec
e23c475
698d0f4
a42b23b
f1ab0bb
6c655bb
3f493cd
241e180
aea50c2
d6f67bf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -93,6 +93,7 @@ tr1_hash_map<new_addr_type, unsigned> address_random_interleaving; | |
#define L2 0x02 | ||
#define DRAM 0x04 | ||
#define ICNT 0x08 | ||
#define GMMU 0x10 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor nitpicking: Can you rewrite this in the form of "1 << n"? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we were to adopt the form "1 << n", we had better wrap them in a pair of parentheses in the macro definition. The C++ shift operators have low precedence. |
||
|
||
#define MEM_LATENCY_STAT_IMPL | ||
|
||
|
@@ -322,6 +323,12 @@ void memory_config::reg_options(class OptionParser *opp) { | |
// SST mode activate | ||
option_parser_register(opp, "-SST_mode", OPT_BOOL, &SST_mode, "SST mode", | ||
"0"); | ||
// TLB related options | ||
option_parser_register( | ||
opp, "-page_table_walk_latency", OPT_INT64, &page_table_walk_latency, | ||
"Average page table walk latency (in core cycle).", "100"); | ||
option_parser_register(opp, "-page_size", OPT_CSTR, &page_size_string, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This argument seems unused in the whole PR. Is it used anywhere? If used, please consider implementing safety check code to test its validity; if not, please consider removing it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's used inside gmmu_t constructor to decide the page number from an address. What do you mean by the validity? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The description says "GDDR page size, only 4KB/2MB avaliable." It would be nice to have a sanity check in the code for this. |
||
"GDDR page size, only 4KB/2MB avaliable.", "4KB"); | ||
m_address_mapping.addrdec_setoption(opp); | ||
} | ||
|
||
|
@@ -654,6 +661,8 @@ void shader_core_config::reg_options(class OptionParser *opp) { | |
option_parser_register(opp, "-gpgpu_reg_file_port_throughput", OPT_INT32, | ||
®_file_port_throughput, | ||
"the number ports of the register file", "1"); | ||
option_parser_register(opp, "-tlb_size", OPT_INT32, &tlb_size, | ||
"Number of tlb entries per SM.", "4096"); | ||
|
||
for (unsigned j = 0; j < SPECIALIZED_UNIT_NUM; ++j) { | ||
std::stringstream ss; | ||
|
@@ -993,7 +1002,8 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) | |
m_power_stats = | ||
new power_stat_t(m_shader_config, average_pipeline_duty_cycle, active_sms, | ||
m_shader_stats, m_memory_config, m_memory_stats); | ||
|
||
m_gmmu = new gmmu_t(this, config, m_memory_stats); | ||
|
||
gpu_sim_insn = 0; | ||
gpu_tot_sim_insn = 0; | ||
gpu_tot_issued_cta = 0; | ||
|
@@ -1140,6 +1150,7 @@ void gpgpu_sim::reinit_clock_domains(void) { | |
dram_time = 0; | ||
icnt_time = 0; | ||
l2_time = 0; | ||
gmmu_time = 0; | ||
} | ||
|
||
bool gpgpu_sim::active() { | ||
|
@@ -1636,6 +1647,7 @@ void gpgpu_sim::gpu_print_stat(unsigned long long streamID) { | |
printf("icnt_total_pkts_simt_to_mem=%ld\n", total_simt_to_mem); | ||
|
||
time_vector_print(); | ||
m_memory_stats->tlb_print(stdout); | ||
fflush(stdout); | ||
|
||
clear_executed_kernel_info(); | ||
|
@@ -1933,7 +1945,7 @@ void dram_t::dram_log(int task) { | |
|
||
// Find next clock domain and increment its time | ||
int gpgpu_sim::next_clock_domain(void) { | ||
double smallest = min3(core_time, icnt_time, dram_time); | ||
double smallest = min4(core_time, icnt_time, dram_time, gmmu_time); | ||
int mask = 0x00; | ||
if (l2_time <= smallest) { | ||
smallest = l2_time; | ||
|
@@ -1952,6 +1964,10 @@ int gpgpu_sim::next_clock_domain(void) { | |
mask |= CORE; | ||
core_time += m_config.core_period; | ||
} | ||
if (gmmu_time <= smallest) { | ||
mask |= GMMU; | ||
gmmu_time += m_config.core_period; | ||
} | ||
return mask; | ||
} | ||
|
||
|
@@ -1970,9 +1986,86 @@ void gpgpu_sim::issue_block2core() { | |
unsigned long long g_single_step = | ||
0; // set this in gdb to single step the pipeline | ||
|
||
// Builds the GMMU model.
//
// gpu       - owning simulator; stored in m_gpu.
// config    - simulator configuration; the shader and memory sub-configs are
//             cached as pointers into it.
// mem_stats - statistics object; stored in m_memory_stats.
//
// Also derives m_log2_page_size (the shift used by get_page_num) from
// memory_config::page_size, which must be a non-zero power of two.
gmmu_t::gmmu_t(class gpgpu_sim *gpu, const gpgpu_sim_config &config,
               class memory_stats_t *mem_stats)
    : m_gpu(gpu), m_config(config) {
  m_shader_config = &m_config.m_shader_config;
  m_memory_config = &m_config.m_memory_config;
  m_memory_stats = mem_stats;

  // (unsigned)-1 marks "no set bit seen yet".
  const unsigned kUnset = (unsigned)-1;
  m_log2_page_size = kUnset;
  for (unsigned n = 0, mask = 1; mask != 0; mask <<= 1, n++) {
    if (m_memory_config->page_size & mask) {
      // A second set bit means the page size is not a power of two.
      assert(m_log2_page_size == kUnset);
      m_log2_page_size = n;
    }
  }
  // A page size of 0 would leave the sentinel in place, and get_page_num()
  // would then shift by an out-of-range amount (undefined behaviour).
  assert(m_log2_page_size != kUnset &&
         "page_size must be a non-zero power of two");
}
|
||
void gmmu_t::register_tlbflush_callback( | ||
std::function<void(mem_addr_t)> cb_tlb) { | ||
callback_tlb_flush.push_back(cb_tlb); | ||
} | ||
|
||
// Calls every registered TLB-flush callback, in registration order, passing
// page_num to each one.
void gmmu_t::tlb_flush(mem_addr_t page_num) {
  for (const auto &flush_cb : callback_tlb_flush) {
    flush_cb(page_num);
  }
}
|
||
void gmmu_t::cycle() { | ||
int simt_cluster_id = 0; | ||
|
||
size_t num_read_stage_queue = 0; | ||
std::map<mem_addr_t, std::list<mem_fetch *>> page_fault_this_turn; | ||
|
||
// check the page_table_walk_delay_queue | ||
while (!page_table_walk_queue.empty() && | ||
((m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle) >= | ||
page_table_walk_queue.front().ready_cycle)) { | ||
|
||
mem_fetch *mf = page_table_walk_queue.front().mf; | ||
|
||
simt_cluster_id = mf->get_sid() / m_config.num_core_per_cluster(); | ||
|
||
(m_gpu->getSIMTCluster(simt_cluster_id))->push_gmmu_cu_queue(mf); | ||
|
||
m_memory_stats->mf_page_hit[simt_cluster_id]++; | ||
page_table_walk_queue.pop_front(); | ||
} | ||
|
||
// fetch from cluster's cu to gmmu queue and push it into the page table way | ||
// delay queue | ||
for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) { | ||
|
||
if (!(m_gpu->getSIMTCluster(i))->empty_cu_gmmu_queue()) { | ||
|
||
mem_fetch *mf = (m_gpu->getSIMTCluster(i))->front_cu_gmmu_queue(); | ||
|
||
struct page_table_walk_latency_t pt_t; | ||
pt_t.mf = mf; | ||
pt_t.ready_cycle = | ||
m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memory_config->page_table_walk_latency; | ||
|
||
page_table_walk_queue.push_back(pt_t); | ||
|
||
(m_gpu->getSIMTCluster(i))->pop_cu_gmmu_queue(); | ||
} | ||
} | ||
} | ||
|
||
void gpgpu_sim::cycle() { | ||
int clock_mask = next_clock_domain(); | ||
|
||
// the gmmu has the same clock as the core | ||
if (clock_mask & GMMU) { | ||
m_gmmu->cycle(); | ||
} | ||
|
||
if (clock_mask & CORE) { | ||
// shader core loading (pop from ICNT into core) follows CORE clock | ||
for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) | ||
|
@@ -2297,7 +2390,7 @@ const shader_core_config *gpgpu_sim::getShaderCoreConfig() { | |
|
||
const memory_config *gpgpu_sim::getMemoryConfig() { return m_memory_config; } | ||
|
||
simt_core_cluster *gpgpu_sim::getSIMTCluster() { return *m_cluster; } | ||
// Returns the SIMT core cluster stored at position `index` of m_cluster.
// `index` must lie in [0, n_simt_clusters); this is checked with an assert
// because an out-of-range value would read past the cluster array.
simt_core_cluster *gpgpu_sim::getSIMTCluster(int index) {
  assert(index >= 0 &&
         static_cast<unsigned>(index) < m_shader_config->n_simt_clusters);
  return m_cluster[index];
}
|
||
void sst_gpgpu_sim::SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) { | ||
if (m_shader_config->n_simt_clusters != sst_numcores) { | ||
|
@@ -2312,6 +2405,9 @@ void sst_gpgpu_sim::SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) { | |
} | ||
|
||
void sst_gpgpu_sim::SST_cycle() { | ||
// the gmmu has the same clock as the core | ||
m_gmmu->cycle(); | ||
|
||
// shader core loading (pop from ICNT into core) follows CORE clock | ||
for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) | ||
static_cast<sst_simt_core_cluster *>(m_cluster[i])->icnt_cycle_SST(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,7 @@ | |
#include <fstream> | ||
#include <iostream> | ||
#include <list> | ||
#include <functional> | ||
#include "../abstract_hardware_model.h" | ||
#include "../option_parser.h" | ||
#include "../trace.h" | ||
|
@@ -401,6 +402,11 @@ class memory_config { | |
bool m_perf_sim_memcpy; | ||
bool simple_dram_model; | ||
bool SST_mode; | ||
|
||
unsigned long long page_table_walk_latency; | ||
int page_size; | ||
char *page_size_string; | ||
|
||
gpgpu_context *gpgpu_ctx; | ||
}; | ||
|
||
|
@@ -446,6 +452,7 @@ class gpgpu_sim_config : public power_config, | |
unsigned get_core_freq() const { return core_freq; } | ||
unsigned num_shader() const { return m_shader_config.num_shader(); } | ||
unsigned num_cluster() const { return m_shader_config.n_simt_clusters; } | ||
unsigned num_core_per_cluster() const { return m_shader_config.n_simt_cores_per_cluster; } | ||
unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; } | ||
|
||
/** | ||
|
@@ -521,6 +528,60 @@ class gpgpu_sim_config : public power_config, | |
|
||
friend class gpgpu_sim; | ||
friend class sst_gpgpu_sim; | ||
friend class gmmu_t; | ||
}; | ||
|
||
// GMMU model: holds memory fetches in a page-table-walk delay queue and | ||
// keeps a list of TLB-flush callbacks. | ||
class gmmu_t { | ||
public: | ||
// Stores gpu and memory_stats, caches the shader/memory sub-configs of | ||
// config, and derives m_log2_page_size from memory_config::page_size. | ||
gmmu_t(class gpgpu_sim *gpu, const gpgpu_sim_config &config, | ||
class memory_stats_t *memory_stats); | ||
// Per-cycle step: forwards requests whose walk delay has elapsed to their | ||
// cluster and queues newly arrived requests from the clusters. | ||
void cycle(); | ||
// Adds cb_tlb to callback_tlb_flush. | ||
void register_tlbflush_callback(std::function<void(mem_addr_t)> cb_tlb); | ||
// Invokes every callback in callback_tlb_flush with page_num. | ||
void tlb_flush(mem_addr_t page_num); | ||
// Returns addr shifted right by m_log2_page_size. | ||
mem_addr_t get_page_num(mem_addr_t addr) { | ||
return addr >> m_log2_page_size; | ||
} | ||
|
||
private: | ||
// Bit position of the single set bit of memory_config::page_size; | ||
// computed in the constructor. | ||
unsigned m_log2_page_size; | ||
// data structure to wrap memory fetch and page table walk delay | ||
struct page_table_walk_latency_t { | ||
mem_fetch *mf; | ||
// Absolute cycle at which cycle() may forward mf to its cluster. | ||
unsigned long long ready_cycle; | ||
}; | ||
|
||
// page table walk delay queue | ||
std::list<page_table_walk_latency_t> page_table_walk_queue; | ||
|
||
// NOTE(review): no use of latency_type is visible in this diff; confirm | ||
// whether it is needed. | ||
enum class latency_type { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same for this enum, is it ever used? |
||
PCIE_READ, | ||
PCIE_WRITE_BACK, | ||
INVALIDATE, | ||
PAGE_FAULT, | ||
DMA | ||
}; | ||
|
||
// Owning simulator; used for the cycle counters and cluster access. | ||
class gpgpu_sim *m_gpu; | ||
|
||
// config file | ||
const gpgpu_sim_config &m_config; | ||
const struct memory_config *m_memory_config; | ||
const struct shader_core_config *m_shader_config; | ||
|
||
// callback functions to invalidate the tlb in ldst unit | ||
std::list<std::function<void(mem_addr_t)>> callback_tlb_flush; | ||
|
||
// Statistics sink; cycle() increments its mf_page_hit counters. | ||
class memory_stats_t *m_memory_stats; | ||
}; | ||
|
||
// Node of a binary tree: carries an address, two sizes, two child links, an | ||
// access counter and an RW byte. | ||
// NOTE(review): no use of this struct is visible in this diff, and the exact | ||
// meaning of size vs. valid_size and of RW is not shown here; confirm. | ||
struct lp_tree_node { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this struct ever used? |
||
mem_addr_t addr; | ||
size_t size; | ||
size_t valid_size; | ||
// Child links to other lp_tree_node objects. | ||
struct lp_tree_node *left; | ||
struct lp_tree_node *right; | ||
uint32_t access_counter; | ||
uint8_t RW; | ||
}; | ||
|
||
struct occupancy_stats { | ||
|
@@ -535,7 +596,7 @@ struct occupancy_stats { | |
|
||
float get_occ_fraction() const { | ||
return float(aggregate_warp_slot_filled) / | ||
float(aggregate_theoretical_warp_slots); | ||
float(aggregate_theoretical_warp_slots); | ||
} | ||
|
||
occupancy_stats &operator+=(const occupancy_stats &rhs) { | ||
|
@@ -654,7 +715,9 @@ class gpgpu_sim : public gpgpu_t { | |
* Returning the cluster of of the shader core, used by the functional | ||
* simulation so far | ||
*/ | ||
simt_core_cluster *getSIMTCluster(); | ||
simt_core_cluster *getSIMTCluster(int index); | ||
|
||
gmmu_t *getGmmu() { return m_gmmu; } | ||
|
||
void hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd, | ||
const ptx_instruction *pI); | ||
|
@@ -687,6 +750,7 @@ class gpgpu_sim : public gpgpu_t { | |
|
||
protected: | ||
///// data ///// | ||
class gmmu_t *m_gmmu; | ||
class simt_core_cluster **m_cluster; | ||
class memory_partition_unit **m_memory_partition_unit; | ||
class memory_sub_partition **m_memory_sub_partition; | ||
|
@@ -709,6 +773,7 @@ class gpgpu_sim : public gpgpu_t { | |
double icnt_time; | ||
double dram_time; | ||
double l2_time; | ||
double gmmu_time; | ||
|
||
// debug | ||
bool gpu_deadlock; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need the dev-tlb branch of accel-sim for gpgpu-sim to work?
If the config is not configured to use tlbs, then does it still need this branch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is a circular dependency issue: the tlb version of Accel-Sim needs the tlb version of gpgpu-sim and vice versa. We should have some ways to deal with this. Some ideas I have:
- `dev` branch to test `Accel-Sim/dev` with `GPGPU-Sim/dev-tlb`, make sure things are not broken. Then add an additional CI test for `Accel-Sim/dev-tlb` and `GPGPU-Sim/dev-tlb`.
Also, is it possible to control whether to enable TLB functionality or not?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The only change in the dev-tlb branch is adding the m_memory_stats argument to trace_shader_core_ctx() within trace_driven.cc. It still needs this change regardless of whether the tlb config is used or not. So is it better to overload this function and create two variants?