# This config models the Turing RTX 2060
# For more info about turing architecture:
# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf
# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020

# functional simulator specification
-gpgpu_ptx_instruction_classification 0
-gpgpu_ptx_sim_mode 0
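# (assuming the usual option semantics: 0 selects performance simulation,
#  1 would select functional-only PTX simulation)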

-gpgpu_runtime_sync_depth_limit 2
-gpgpu_runtime_pending_launch_count_limit 2048
-gpgpu_kernel_launch_latency 5000
-gpgpu_TB_launch_latency 0
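# (assumed units: core-clock cycles, i.e. a 5000-cycle kernel launch overhead
#  and no extra per-thread-block launch cost)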

# Compute Capability
-gpgpu_compute_capability_major 7

-gpgpu_n_clusters 30
-gpgpu_n_cores_per_cluster 1
-gpgpu_n_mem 12
-gpgpu_n_sub_partition_per_mchannel 2
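# (12 memory channels x 2 sub partitions per channel = 24 L2 sub partitions)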

# clock domains
#-gpgpu_clock_domains <Core Clock>:<Interconnect Clock>:<L2 Clock>:<DRAM Clock>
-gpgpu_clock_domains 1365.0:1365.0:1365.0:3500.0
# boost mode
# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0
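# (clocks are in MHz; with the 4x data:command ratio set below, the 3500 MHz
#  DRAM clock corresponds to 14 Gbps GDDR6)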

# shader core pipeline config
-gpgpu_shader_registers 65536
-gpgpu_registers_per_block 65536
-gpgpu_occupancy_sm_number 75

# This implies a maximum of 32 warps/SM
-gpgpu_shader_core_pipeline 1024:32
-gpgpu_shader_cta 16
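# (1024 threads per core / 32 threads per warp = 32 warps/SM;
#  -gpgpu_shader_cta caps concurrent CTAs per core at 16)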
-gpgpu_simd_model 1

# Pipeline widths and number of FUs
# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE
## Turing has 4 SP SIMD units, 4 INT units, 4 SFU units, 4 DP units, and 4 Tensor Core units
## We need to scale the number of pipeline registers to be equal to the number of SP units
-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4
-gpgpu_num_sp_units 4
-gpgpu_num_sfu_units 4
-gpgpu_num_dp_units 4
-gpgpu_num_int_units 4
-gpgpu_tensor_core_avail 1
-gpgpu_num_tensor_core_units 4

# Instruction latencies and initiation intervals
# "ADD,MAX,MUL,MAD,DIV"
# All Div operations are executed on SFU unit
-ptx_opcode_latency_int 4,4,4,4,21
-ptx_opcode_initiation_int 2,2,2,2,2
-ptx_opcode_latency_fp 4,4,4,4,39
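# (each list follows the ADD,MAX,MUL,MAD,DIV order above: e.g. int
#  ADD/MAX/MUL/MAD take 4 cycles and int DIV takes 21; fp DIV at 39 cycles
#  runs on the SFU)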
-ptx_opcode_initiation_fp 2,2,2,2,4
-ptx_opcode_latency_dp 64,64,64,64,330
-ptx_opcode_initiation_dp 64,64,64,64,130
-ptx_opcode_latency_sfu 21
-ptx_opcode_initiation_sfu 8
-ptx_opcode_latency_tesnor 64
-ptx_opcode_initiation_tensor 64
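# (note: the "tesnor" spelling above is kept deliberately; it matches the
#  option name as the simulator registers it)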

# sub core model: each scheduler has its own register file and EUs
# i.e. schedulers are isolated
-gpgpu_sub_core_model 1
# disable specialized operand collectors and use generic operand collectors instead
-gpgpu_enable_specialized_operand_collector 0
-gpgpu_operand_collector_num_units_gen 8
-gpgpu_operand_collector_num_in_ports_gen 8
-gpgpu_operand_collector_num_out_ports_gen 8
# register banks
# Turing has 8 dual-port banks: 4 schedulers, two banks per scheduler
-gpgpu_num_reg_banks 8
-gpgpu_reg_file_port_throughput 2

# warp scheduling
-gpgpu_num_sched_per_core 4
-gpgpu_scheduler lrr
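# (lrr = loose round-robin among ready warps; gto, greedy-then-oldest, is the
#  usual alternative)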
# each warp scheduler can issue 1 inst per cycle
-gpgpu_max_insn_issue_per_warp 1
-gpgpu_dual_issue_diff_exec_units 1

## L1/shared memory configuration
# <nsets>:<bsize>:<assoc>,<rep>:<wr>:<alloc>:<wr_alloc>:<set_index_fn>,<mshr>:<N>:<merge>,<mq>:**<fifo_entry>
# ** Optional parameter - Required when mshr_type==Texture Fifo

# In the adaptive cache model, shared memory not claimed by a kernel is assigned to the L1 cache
# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
-gpgpu_adaptive_cache_config 1
-gpgpu_shmem_option 32,64
-gpgpu_unified_l1d_size 96
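# (carveout options are in KB: with the 96 KB unified L1D/shared structure,
#  a 32 KB shared carveout leaves 64 KB of L1, and a 64 KB carveout leaves 32 KB)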
# L1 cache configuration
-gpgpu_l1_banks 4
-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:256:32,16:0,32
-gpgpu_l1_latency 32
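# (decoding dl1 with the format above: 4 sets x 128 B blocks x 64-way assoc
#  = 32 KB of L1D per core before any adaptive resize, with a 32-cycle L1 latency)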
-gpgpu_gmem_skip_L1D 0
-gpgpu_flush_l1_cache 1
-gpgpu_n_cluster_ejection_buffer_size 32
-gpgpu_l1_cache_write_ratio 25

# shared memory configuration
-gpgpu_shmem_size 65536
-gpgpu_shmem_sizeDefault 65536
-gpgpu_shmem_per_block 49152
-gpgpu_smem_latency 30
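# (65536 B = 64 KB of shared memory per SM; the 49152 B = 48 KB per-block
#  limit matches the CUDA default)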
# shared memory bank conflict detection
-gpgpu_shmem_num_banks 32
-gpgpu_shmem_limited_broadcast 0
-gpgpu_shmem_warp_parts 1
-gpgpu_coalesce_arch 75

# L2 cache
# 64 sets, each 128 bytes, 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 3MB L2 cache
-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32
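# (sanity check: 64 sets x 128 B x 16 ways = 128 KB per sub partition;
#  24 sub partitions x 128 KB = 3 MB total L2)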
-gpgpu_cache:dl2_texture_only 0
-gpgpu_dram_partition_queues 64:64:64:64

-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4
-gpgpu_inst_fetch_throughput 4
# 128 KB Tex
# Note: TEX is deprecated since Volta; it is kept for legacy apps only. Use the L1D cache instead, via the .nc modifier or the __ldg() method
-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2
# 64 KB Const
-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4
-gpgpu_perfect_inst_const_cache 1

# interconnection
# use built-in local xbar
-network_mode 2
-icnt_in_buffer_limit 512
-icnt_out_buffer_limit 512
-icnt_subnets 2
-icnt_flit_size 40
-icnt_arbiter_algo 1

# memory partition latency config
-gpgpu_l2_rop_latency 194
-dram_latency 96

# dram sched config
-gpgpu_dram_scheduler 1
-gpgpu_frfcfs_dram_sched_queue_size 64
-gpgpu_dram_return_queue_size 192
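# (scheduler 1 selects FR-FCFS, which is what the frfcfs queue-size option
#  above feeds)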

# dram model config
# Turing has GDDR6
# http://monitorinsider.com/GDDR6.html
-gpgpu_n_mem_per_ctrlr 1
-gpgpu_dram_buswidth 2
-gpgpu_dram_burst_length 16
-dram_data_command_freq_ratio 4
-gpgpu_mem_address_mask 1
-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS
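# (2 B bus width x burst length 16 = 32 B per DRAM burst, matching the 32 B
#  sectors of the sectored (S) caches above)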

# Mem timing
-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4
-dram_dual_bus_interface 0
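# (assumed meanings per standard DRAM timing names: nbk=16 banks in nbkgrp=4
#  bank groups; CCD/CCDL short/long column-to-column delay, RCD row-to-column,
#  RAS row active, RP precharge, RC row cycle, CL/WL read/write latency,
#  WR write recovery, CDLR write-to-read turnaround, RTPL read-to-precharge;
#  all in DRAM command-clock cycles)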

# select lower bits for bnkgrp to increase bnkgrp parallelism
-dram_bnk_indexing_policy 0

-enable_ptx_file_line_stats 1
-visualizer_enabled 0

# power model configs, disabled until we create a real energy model
-power_simulation_enabled 0

# tracing functionality