
Add only TLB support #118


Open
wants to merge 13 commits into base: dev
2 changes: 1 addition & 1 deletion .github/workflows/accelsim.yml
@@ -16,7 +16,7 @@ on:
# By default regress against accel-sim's dev branch
env:
ACCELSIM_REPO: https://github.com/purdue-aalp/accel-sim-framework-public.git
ACCELSIM_BRANCH: dev
ACCELSIM_BRANCH: dev-tlb

Do we need the dev-tlb branch of Accel-Sim for GPGPU-Sim to work?
If the config does not enable TLBs, does it still need this branch?


I think this is a circular-dependency issue: the TLB version of Accel-Sim needs the TLB version of GPGPU-Sim and vice versa. We should have some way to deal with this. Some ideas I have:

  • Specify the branch to use for CI runs? Not sure if GitHub allows inputs from users.
  • First use the dev branch to test Accel-Sim/dev with GPGPU-Sim/dev-tlb and make sure nothing is broken; then add an additional CI test for Accel-Sim/dev-tlb with GPGPU-Sim/dev-tlb.

Also, is it possible to control whether TLB functionality is enabled or not?

Author

The only change in the dev-tlb branch is adding the m_memory_stats argument to trace_shader_core_ctx() within trace_driven.cc. It still needs this change regardless of whether the TLB config is used or not. So would it be better to overload this function and create two variants?
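For illustration, a minimal sketch of the two-variant idea using a delegating constructor; the types and parameter lists below are simplified placeholders, not the actual trace_driven.cc signature:

```cpp
// Hypothetical, simplified stand-ins for the real GPGPU-Sim classes.
struct shader_core_config;
struct memory_stats_t;

class trace_shader_core_ctx {
 public:
  // Variant added on dev-tlb: takes the TLB/memory statistics object.
  trace_shader_core_ctx(const shader_core_config *config,
                        memory_stats_t *memory_stats)
      : m_config(config), m_memory_stats(memory_stats) {}

  // Compatibility variant with the old parameter list: delegates with a null
  // stats pointer, so Accel-Sim/dev could keep compiling against dev-tlb.
  explicit trace_shader_core_ctx(const shader_core_config *config)
      : trace_shader_core_ctx(config, nullptr) {}

 private:
  const shader_core_config *m_config;
  memory_stats_t *m_memory_stats;  // null when TLB stats are not collected
};
```

A defaulted memory_stats parameter would have the same effect as the second overload and would avoid touching the Accel-Sim call site at all.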


# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
1 change: 1 addition & 0 deletions src/abstract_hardware_model.cc
@@ -178,6 +178,7 @@ void gpgpu_functional_sim_config::ptx_set_tex_cache_linesize(
m_texcache_linesize = linesize;
}


gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx)
: m_function_model_config(config) {
gpgpu_ctx = ctx;
17 changes: 16 additions & 1 deletion src/abstract_hardware_model.h
@@ -558,7 +558,7 @@ class gpgpu_functional_sim_config {
int get_resume_CTA() const { return resume_CTA; }
int get_checkpoint_CTA_t() const { return checkpoint_CTA_t; }
int get_checkpoint_insn_Y() const { return checkpoint_insn_Y; }

private:
// PTX options
int m_ptx_convert_to_ptxplus;
@@ -1064,6 +1064,8 @@ class warp_inst_t : public inst_t {
m_is_depbar = false;

m_depbar_group_no = 0;

m_tlb_miss = false;
}
warp_inst_t(const core_config *config) {
m_uid = 0;
@@ -1085,6 +1087,8 @@
m_is_depbar = false;

m_depbar_group_no = 0;

m_tlb_miss = false;
}
virtual ~warp_inst_t() {}

@@ -1212,6 +1216,14 @@

bool accessq_empty() const { return m_accessq.empty(); }
unsigned accessq_count() const { return m_accessq.size(); }

// for queue, always push back and pop front
mem_access_t &accessq_front() { return m_accessq.front(); }
void accessq_pop_front() { m_accessq.pop_front(); }
void accessq_push_back(mem_access_t mem_access) {
m_accessq.push_back(mem_access);
}

const mem_access_t &accessq_back() { return m_accessq.back(); }
void accessq_pop_back() { m_accessq.pop_back(); }

@@ -1277,6 +1289,9 @@
bool m_is_depbar;

unsigned int m_depbar_group_no;

bool m_tlb_miss; // TLB miss for this instruction
std::list<mem_access_t> m_tlb_miss_map;
};

void move_warp(warp_inst_t *&dst, warp_inst_t *&src);
2 changes: 2 additions & 0 deletions src/gpgpu-sim/gpu-misc.h
@@ -38,5 +38,7 @@ unsigned int LOGB2(unsigned int v);

#define gs_min2(a, b) (((a) < (b)) ? (a) : (b))
#define min3(x, y, z) (((x) < (y) && (x) < (z)) ? (x) : (gs_min2((y), (z))))
#define min4(w, x, y, z) \
((gs_min2(w, x) < gs_min2(y, z)) ? gs_min2(w, x) : gs_min2(y, z))

#endif
102 changes: 99 additions & 3 deletions src/gpgpu-sim/gpu-sim.cc
@@ -93,6 +93,7 @@ tr1_hash_map<new_addr_type, unsigned> address_random_interleaving;
#define L2 0x02
#define DRAM 0x04
#define ICNT 0x08
#define GMMU 0x10


Minor nitpick: can you rewrite these in the form 1 << 1, 1 << 2? Just to keep things a bit cleaner.

@FJShen FJShen Aug 4, 2025

If we adopt the "1 << n" form, we should wrap each definition in a pair of parentheses at the macro definition; the C++ shift operators have low precedence.
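For example, a sketch of the parenthesized form (assuming the existing CORE mask just above this hunk is 0x01):

```cpp
// Clock-domain masks written as shifts. The outer parentheses matter: without
// them, an expression such as (GMMU - 1) would expand to 1 << 4 - 1, which
// parses as 1 << (4 - 1) because << binds more loosely than the arithmetic
// operators.
#define CORE (1 << 0)
#define L2   (1 << 1)
#define DRAM (1 << 2)
#define ICNT (1 << 3)
#define GMMU (1 << 4)
```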


#define MEM_LATENCY_STAT_IMPL

@@ -322,6 +323,12 @@ void memory_config::reg_options(class OptionParser *opp) {
// SST mode activate
option_parser_register(opp, "-SST_mode", OPT_BOOL, &SST_mode, "SST mode",
"0");
// TLB related options
option_parser_register(
opp, "-page_table_walk_latency", OPT_INT64, &page_table_walk_latency,
"Average page table walk latency (in core cycle).", "100");
option_parser_register(opp, "-page_size", OPT_CSTR, &page_size_string,

This argument seems unused in the whole PR. Is it used anywhere? If it is, please consider adding a safety check to validate it; if not, please consider removing it.

Author
@yechen3 yechen3 Jul 14, 2025

It's used inside the gmmu_t constructor to derive the page number from an address. What do you mean by validity?


The description says "GDDR page size, only 4KB/2MB avaliable." It would be nice to have a sanity check in the code for this; one possible shape for such a check is sketched after this hunk.

"GDDR page size, only 4KB/2MB avaliable.", "4KB");
m_address_mapping.addrdec_setoption(opp);
}
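One possible shape for the sanity check discussed above; the helper name and where it gets called from are assumptions, only the option string and its "4KB"/"2MB" values come from this PR:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Hypothetical helper: translate the -page_size string into bytes and reject
// anything other than the two documented values.
static unsigned parse_page_size_or_die(const char *page_size_string) {
  if (strcmp(page_size_string, "4KB") == 0) return 4u * 1024;
  if (strcmp(page_size_string, "2MB") == 0) return 2u * 1024 * 1024;
  fprintf(stderr,
          "GPGPU-Sim: invalid -page_size '%s' (only 4KB and 2MB are supported)\n",
          page_size_string);
  exit(1);
}
```

memory_config could then set page_size = parse_page_size_or_die(page_size_string) wherever its other string options are post-processed, before gmmu_t derives m_log2_page_size from it.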

@@ -654,6 +661,8 @@ void shader_core_config::reg_options(class OptionParser *opp) {
option_parser_register(opp, "-gpgpu_reg_file_port_throughput", OPT_INT32,
&reg_file_port_throughput,
"the number ports of the register file", "1");
option_parser_register(opp, "-tlb_size", OPT_INT32, &tlb_size,
"Number of tlb entries per SM.", "4096");

for (unsigned j = 0; j < SPECIALIZED_UNIT_NUM; ++j) {
std::stringstream ss;
@@ -993,7 +1002,8 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx)
m_power_stats =
new power_stat_t(m_shader_config, average_pipeline_duty_cycle, active_sms,
m_shader_stats, m_memory_config, m_memory_stats);

m_gmmu = new gmmu_t(this, config, m_memory_stats);

gpu_sim_insn = 0;
gpu_tot_sim_insn = 0;
gpu_tot_issued_cta = 0;
@@ -1140,6 +1150,7 @@ void gpgpu_sim::reinit_clock_domains(void) {
dram_time = 0;
icnt_time = 0;
l2_time = 0;
gmmu_time = 0;
}

bool gpgpu_sim::active() {
@@ -1636,6 +1647,7 @@ void gpgpu_sim::gpu_print_stat(unsigned long long streamID) {
printf("icnt_total_pkts_simt_to_mem=%ld\n", total_simt_to_mem);

time_vector_print();
m_memory_stats->tlb_print(stdout);
fflush(stdout);

clear_executed_kernel_info();
@@ -1933,7 +1945,7 @@ void dram_t::dram_log(int task) {

// Find next clock domain and increment its time
int gpgpu_sim::next_clock_domain(void) {
double smallest = min3(core_time, icnt_time, dram_time);
double smallest = min4(core_time, icnt_time, dram_time, gmmu_time);
int mask = 0x00;
if (l2_time <= smallest) {
smallest = l2_time;
@@ -1952,6 +1964,10 @@
mask |= CORE;
core_time += m_config.core_period;
}
if (gmmu_time <= smallest) {
mask |= GMMU;
gmmu_time += m_config.core_period;
}
return mask;
}

@@ -1970,9 +1986,86 @@ void gpgpu_sim::issue_block2core() {
unsigned long long g_single_step =
0; // set this in gdb to single step the pipeline

gmmu_t::gmmu_t(class gpgpu_sim *gpu, const gpgpu_sim_config &config,
class memory_stats_t *mem_stats)
: m_gpu(gpu), m_config(config) {
m_shader_config = &m_config.m_shader_config;
m_memory_config = &m_config.m_memory_config;
m_memory_stats = mem_stats;

m_log2_page_size = -1;
for (unsigned n = 0, mask = 1; mask != 0; mask <<= 1, n++) {
if (m_memory_config->page_size & mask) {
assert(m_log2_page_size == (unsigned)-1);
m_log2_page_size = n;
}
}
//gpu_sim_cycle = m_gpu->gpu_sim_cycle;
//gpu_tot_sim_cycle = m_gpu->gpu_tot_sim_cycle;
}

void gmmu_t::register_tlbflush_callback(
std::function<void(mem_addr_t)> cb_tlb) {
callback_tlb_flush.push_back(cb_tlb);
}

void gmmu_t::tlb_flush(mem_addr_t page_num) {
for (list<std::function<void(mem_addr_t)>>::iterator iter =
callback_tlb_flush.begin();
iter != callback_tlb_flush.end(); iter++) {
(*iter)(page_num);
}
}

void gmmu_t::cycle() {
int simt_cluster_id = 0;

size_t num_read_stage_queue = 0;
std::map<mem_addr_t, std::list<mem_fetch *>> page_fault_this_turn;

// check the page_table_walk_queue for entries whose walk latency has elapsed
while (!page_table_walk_queue.empty() &&
((m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle) >=
page_table_walk_queue.front().ready_cycle)) {

mem_fetch *mf = page_table_walk_queue.front().mf;

simt_cluster_id = mf->get_sid() / m_config.num_core_per_cluster();

(m_gpu->getSIMTCluster(simt_cluster_id))->push_gmmu_cu_queue(mf);

m_memory_stats->mf_page_hit[simt_cluster_id]++;
page_table_walk_queue.pop_front();
}

// fetch from the cluster's cu-to-gmmu queue and push it into the page table
// walk delay queue
for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) {

if (!(m_gpu->getSIMTCluster(i))->empty_cu_gmmu_queue()) {

mem_fetch *mf = (m_gpu->getSIMTCluster(i))->front_cu_gmmu_queue();

struct page_table_walk_latency_t pt_t;
pt_t.mf = mf;
pt_t.ready_cycle =
m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle + m_memory_config->page_table_walk_latency;

page_table_walk_queue.push_back(pt_t);

(m_gpu->getSIMTCluster(i))->pop_cu_gmmu_queue();
}
}
}

void gpgpu_sim::cycle() {
int clock_mask = next_clock_domain();

// the gmmu has the same clock as the core
if (clock_mask & GMMU) {
m_gmmu->cycle();
}

if (clock_mask & CORE) {
// shader core loading (pop from ICNT into core) follows CORE clock
for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++)
@@ -2297,7 +2390,7 @@ const shader_core_config *gpgpu_sim::getShaderCoreConfig() {

const memory_config *gpgpu_sim::getMemoryConfig() { return m_memory_config; }

simt_core_cluster *gpgpu_sim::getSIMTCluster() { return *m_cluster; }
simt_core_cluster *gpgpu_sim::getSIMTCluster(int index) { return *(m_cluster + index); }

void sst_gpgpu_sim::SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) {
if (m_shader_config->n_simt_clusters != sst_numcores) {
@@ -2312,6 +2405,9 @@ void sst_gpgpu_sim::SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) {
}

void sst_gpgpu_sim::SST_cycle() {
// the gmmu has the same clock as the core
m_gmmu->cycle();

// shader core loading (pop from ICNT into core) follows CORE clock
for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++)
static_cast<sst_simt_core_cluster *>(m_cluster[i])->icnt_cycle_SST();
69 changes: 67 additions & 2 deletions src/gpgpu-sim/gpu-sim.h
@@ -37,6 +37,7 @@
#include <fstream>
#include <iostream>
#include <list>
#include <functional>
#include "../abstract_hardware_model.h"
#include "../option_parser.h"
#include "../trace.h"
@@ -401,6 +402,11 @@ class memory_config {
bool m_perf_sim_memcpy;
bool simple_dram_model;
bool SST_mode;

unsigned long long page_table_walk_latency;
int page_size;
char *page_size_string;

gpgpu_context *gpgpu_ctx;
};

@@ -446,6 +452,7 @@ class gpgpu_sim_config : public power_config,
unsigned get_core_freq() const { return core_freq; }
unsigned num_shader() const { return m_shader_config.num_shader(); }
unsigned num_cluster() const { return m_shader_config.n_simt_clusters; }
unsigned num_core_per_cluster() const { return m_shader_config.n_simt_cores_per_cluster; }
unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; }

/**
@@ -521,6 +528,60 @@

friend class gpgpu_sim;
friend class sst_gpgpu_sim;
friend class gmmu_t;
};

class gmmu_t {
public:
gmmu_t(class gpgpu_sim *gpu, const gpgpu_sim_config &config,
class memory_stats_t *memory_stats);
void cycle();
void register_tlbflush_callback(std::function<void(mem_addr_t)> cb_tlb);
void tlb_flush(mem_addr_t page_num);
mem_addr_t get_page_num(mem_addr_t addr) {
return addr >> m_log2_page_size;
}

private:
unsigned m_log2_page_size;
// data structure to wrap memory fetch and page table walk delay
struct page_table_walk_latency_t {
mem_fetch *mf;
unsigned long long ready_cycle;
};

// page table walk delay queue
std::list<page_table_walk_latency_t> page_table_walk_queue;

enum class latency_type {


Same for this enum, is it ever used?

PCIE_READ,
PCIE_WRITE_BACK,
INVALIDATE,
PAGE_FAULT,
DMA
};

class gpgpu_sim *m_gpu;

// config file
const gpgpu_sim_config &m_config;
const struct memory_config *m_memory_config;
const struct shader_core_config *m_shader_config;

// callback functions to invalidate the tlb in ldst unit
std::list<std::function<void(mem_addr_t)>> callback_tlb_flush;

class memory_stats_t *m_memory_stats;
};
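For reference, a sketch of how the flush hook might be wired up from a core-side TLB; only register_tlbflush_callback, tlb_flush, and get_page_num come from this PR, while the core_tlb type and the connect function are hypothetical:

```cpp
#include <functional>

// Hypothetical per-core TLB wrapper; the real per-SM TLB added by this PR
// lives on the shader/ldst side and may expose a different interface.
struct core_tlb {
  void invalidate(mem_addr_t page_num) {
    // evict the entry for this virtual page (details omitted)
    (void)page_num;
  }
};

// Each core registers a flush callback with the GMMU once during setup.
void connect_core_tlb_to_gmmu(core_tlb *tlb, gmmu_t *gmmu) {
  gmmu->register_tlbflush_callback(
      [tlb](mem_addr_t page_num) { tlb->invalidate(page_num); });
}

// The GMMU can later invalidate a page in every registered core TLB:
//   gmmu->tlb_flush(gmmu->get_page_num(addr));
```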

struct lp_tree_node {


Is this struct ever used?

mem_addr_t addr;
size_t size;
size_t valid_size;
struct lp_tree_node *left;
struct lp_tree_node *right;
uint32_t access_counter;
uint8_t RW;
};

struct occupancy_stats {
@@ -535,7 +596,7 @@

float get_occ_fraction() const {
return float(aggregate_warp_slot_filled) /
float(aggregate_theoretical_warp_slots);
float(aggregate_theoretical_warp_slots);
}

occupancy_stats &operator+=(const occupancy_stats &rhs) {
@@ -654,7 +715,9 @@ class gpgpu_sim : public gpgpu_t {
* Returning the cluster of the shader core, used by the functional
* simulation so far
*/
simt_core_cluster *getSIMTCluster();
simt_core_cluster *getSIMTCluster(int index);

gmmu_t *getGmmu() { return m_gmmu; }

void hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd,
const ptx_instruction *pI);
@@ -687,6 +750,7 @@

protected:
///// data /////
class gmmu_t *m_gmmu;
class simt_core_cluster **m_cluster;
class memory_partition_unit **m_memory_partition_unit;
class memory_sub_partition **m_memory_sub_partition;
@@ -709,6 +773,7 @@
double icnt_time;
double dram_time;
double l2_time;
double gmmu_time;

// debug
bool gpu_deadlock;
1 change: 1 addition & 0 deletions src/gpgpu-sim/mem_fetch.h
@@ -81,6 +81,7 @@ class mem_fetch {
void set_partition(unsigned sub_partition_id) {
m_raw_addr.sub_partition = sub_partition_id;
}
mem_access_t get_m_access() const { return m_access; }
unsigned get_data_size() const { return m_data_size; }
void set_data_size(unsigned size) { m_data_size = size; }
unsigned get_ctrl_size() const { return m_ctrl_size; }