diff --git a/python/PerfAI/PerfAI.doc/include/dma.py b/python/PerfAI/PerfAI.doc/include/dma.py index f8d5b0ac6..e877235d5 100644 --- a/python/PerfAI/PerfAI.doc/include/dma.py +++ b/python/PerfAI/PerfAI.doc/include/dma.py @@ -36,7 +36,7 @@ def __init__(self, core_id, writer): :param core_id: the id of current core :param writer: the writer of Excel to write """ - self.columns = ['Engine Id', 'Core Id', 'Cmd Id', 'Layer Id', 'Layer Name', + self.columns = ['Engine Id', 'Core Id', 'Global Idx', 'Cmd Id', 'Layer Id', 'Layer Name', 'Function Type', 'Function Name', 'DMA data size(B)', 'Start Cycle', 'End Cycle', 'Asic Cycle', 'Stall Cycle', 'DDR Bandwidth(GB/s)', 'L2M Bandwidth(GB/s)', 'Direction', 'AvgBurstLength', 'Data Type', 'Non32ByteRatio', 'MaskWriteRatio', 'cmd_id_dep', 'cmd_special_function', 'src_start_addr', 'dst_start_addr', @@ -144,7 +144,7 @@ def load(self, reg_info_file, dma_layer_map): self.reg_list.append(reg_dict) self.height = len(self.reg_list) - def add_kpi_field(self): + def add_kpi_field(self, is_cdma=False): """ Add some indicators which are convenient for performance analysis artificially. :return: None @@ -152,22 +152,31 @@ def add_kpi_field(self): for i in range(len(self.reg_list)): reg_dict = self.reg_list[i] name_key = (int(reg_dict['cmd_type'])) - if reg_dict['cmd_type'] == 6: + sys_cmd_id = 7 if is_cdma else 6 + sys_wait_id = [4, 6] if is_cdma else [4] + transfer_bytes = 0 + if reg_dict['cmd_type'] == sys_cmd_id: reg_dict['Data Type'] = 'None' # dma_sys do not transfer data reg_dict['Direction'] = '-' + if reg_dict['cmd_special_function'] in sys_wait_id: + self.wait_msg_total_time += reg_dict['Asic Cycle'] + if isinstance(reg_dict['DMA data size(B)'], int) and reg_dict['DMA data size(B)'] > 0: + transfer_bytes = reg_dict['DMA data size(B)'] self.dma_cycle += int(reg_dict['Asic Cycle']) self.stall_cycle += int(reg_dict['Stall Cycle']) if 'DDR' in reg_dict['Direction'] and isinstance(reg_dict['DMA data size(B)'], int): - self.ddr_total_datasize += reg_dict['DMA data size(B)'] - self.ddr_total_cycle += reg_dict['Asic Cycle'] - self.ddr_burst_length_sum += reg_dict['gmem_bl_sum'] - self.ddr_xact_cnt += reg_dict['gmem_xact_cnt'] + if not is_cdma or transfer_bytes: + self.ddr_total_datasize += reg_dict['DMA data size(B)'] + self.ddr_total_cycle += reg_dict['Asic Cycle'] + self.ddr_burst_length_sum += reg_dict['gmem_bl_sum'] + self.ddr_xact_cnt += reg_dict['gmem_xact_cnt'] elif 'L2' in reg_dict['Direction'] and isinstance(reg_dict['DMA data size(B)'], int): - self.l2_total_datasize += reg_dict['DMA data size(B)'] - self.l2_total_cycle += reg_dict['Asic Cycle'] - if reg_dict['cmd_type'] == 6 and reg_dict['cmd_special_function'] == 4: - self.wait_msg_total_time += reg_dict['Asic Cycle'] + if not is_cdma or transfer_bytes: + self.l2_total_datasize += reg_dict['DMA data size(B)'] + self.l2_total_cycle += reg_dict['Asic Cycle'] + # if reg_dict['cmd_type'] == 6 and reg_dict['cmd_special_function'] == 4: + # self.wait_msg_total_time += reg_dict['Asic Cycle'] if reg_dict['gmem_xact_cnt'] > 0: reg_dict['AvgBurstLength'] = get_ratio_float_2f(reg_dict['gmem_bl_sum'], reg_dict['gmem_xact_cnt']) reg_dict['Non32ByteRatio'] = get_ratio_float_2f(reg_dict['gmem_n32Ba_sa_cnt'], reg_dict['gmem_xact_cnt']) @@ -516,8 +525,9 @@ def load(self, reg_info_file): elif reg_count == 0: fields = row.split(': ') attr = fields[0][1:] - val = fields[1][:-1] - self.chip_arch_dict[attr] = val + if len(fields) > 1: + val = fields[1][:-1] + self.chip_arch_dict[attr] = val idx = 0 else: fields = row.split(': ') diff --git a/python/PerfAI/PerfAI.doc/include/tiu.py b/python/PerfAI/PerfAI.doc/include/tiu.py index b464bd0d4..6e2a5b0a5 100644 --- a/python/PerfAI/PerfAI.doc/include/tiu.py +++ b/python/PerfAI/PerfAI.doc/include/tiu.py @@ -42,7 +42,7 @@ def __init__(self, core_id, writer): :param writer: the writer of Excel to write """ self.writer = writer - self.columns = ['Engine Id', 'Core Id', 'Cmd Id', 'Layer Id', 'Layer Name', 'Function Type', 'Function Name', + self.columns = ['Engine Id', 'Core Id', 'Global Idx', 'Cmd Id', 'Layer Id', 'Layer Name', 'Function Type', 'Function Name', 'Alg Cycle', 'Asic Cycle', 'Start Cycle', 'End Cycle', 'Avg Cycle Last 200', 'Alg Ops', 'uArch Ops', 'uArch Rate', 'Bank Conflict Ratio', 'Initial Cycle Ratio', 'Data Type', 'Sim Power(W)', 'des_cmd_id_dep', diff --git a/python/PerfAI/PerfAI.doc/src/generator/details.py b/python/PerfAI/PerfAI.doc/src/generator/details.py index e4575ef80..bb306ac6f 100644 --- a/python/PerfAI/PerfAI.doc/src/generator/details.py +++ b/python/PerfAI/PerfAI.doc/src/generator/details.py @@ -11,7 +11,7 @@ # @Time : 2023/8/7 11:26 # @Author : chongqing.zeng@sophgo.com # @Project: PerfAI -import os +import os, glob, re import pandas as pd from tqdm import tqdm @@ -83,6 +83,7 @@ def generate_details(input_fold, out_file, g_info, writer, core_num=8, split_ins tiu_instance_map, gdma_instance_map = dict(), dict() tiu_layer_map, gdma_layer_map = get_engine_layer(g_info) chip_arch_act = None + file_names = sorted(glob.glob(cdma_reg_file + '_*.txt')) for core_id in tqdm(range(act_core_num)): # tiu cur_tiu_reg_file = tiu_reg_file + '_' + str(core_id) + '.txt' @@ -111,22 +112,40 @@ def generate_details(input_fold, out_file, g_info, writer, core_num=8, split_ins sdma_instance.add_kpi_field() sdma_instance.write() # cdma - cdma_instance = Cdma(core_id, writer, 'CDMA') - if act_core_num: - cur_cdma_reg_file = cdma_reg_file + '_' + str(core_id) + '.txt' - if os.path.exists(cur_cdma_reg_file) and os.path.getsize(cur_cdma_reg_file): - tmp_chip_arch = cdma_instance.load(cur_cdma_reg_file) - chip_arch_act = chip_arch_act if chip_arch_act else tmp_chip_arch - cdma_instance.add_kpi_field() - cdma_instance.write() - reg_list += tiu_instance.reg_list + gdma_instance.reg_list + sdma_instance.reg_list + cdma_instance.reg_list - else: + if chip_arch['Chip Arch'] == 'sg2260' : reg_list += tiu_instance.reg_list + gdma_instance.reg_list + sdma_instance.reg_list + cdma_instance = None + if act_core_num and core_id == 7 and file_names: + for f in file_names: + port = eval(re.search(rf"{cdma_reg_file}_(\d+)\.txt", f).group(1)) + cdma_instance = Cdma(port, writer, 'CDMA') + if os.path.exists(f) and os.path.getsize(f): + tmp_chip_arch = cdma_instance.load(f) + chip_arch_act = chip_arch_act if chip_arch_act else tmp_chip_arch + cdma_instance.add_kpi_field(True) + cdma_instance.write() + reg_list += cdma_instance.reg_list + cdma_instances.append(cdma_instance) + else: + cdma_instance = Cdma(core_id, writer, 'CDMA') + cdma_instances.append(cdma_instance) + else: + cdma_instance = Cdma(core_id, writer, 'CDMA') + if act_core_num: + cur_cdma_reg_file = cdma_reg_file + '_' + str(core_id) + '.txt' + if os.path.exists(cur_cdma_reg_file) and os.path.getsize(cur_cdma_reg_file): + tmp_chip_arch = cdma_instance.load(cur_cdma_reg_file) + chip_arch_act = chip_arch_act if chip_arch_act else tmp_chip_arch + cdma_instance.add_kpi_field(True) + cdma_instance.write() + reg_list += tiu_instance.reg_list + gdma_instance.reg_list + sdma_instance.reg_list + cdma_instance.reg_list + else: + reg_list += tiu_instance.reg_list + gdma_instance.reg_list + sdma_instance.reg_list + cdma_instances.append(cdma_instance) instr_cols = get_instr_cols(tiu_instance.columns, gdma_instance.columns) tiu_instances.append(tiu_instance) gdma_instances.append(gdma_instance) sdma_instances.append(sdma_instance) - cdma_instances.append(cdma_instance) if act_core_num: # instr world @@ -218,7 +237,7 @@ def generate_divided_details(input_fold, g_info, core_num=8): if os.path.exists(cur_cdma_reg_file) and os.path.getsize(cur_cdma_reg_file): tmp_chip_arch = cdma_instance.load(cur_cdma_reg_file) chip_arch_act = chip_arch_act if chip_arch_act else tmp_chip_arch - cdma_instance.add_kpi_field() + cdma_instance.add_kpi_field(True) cdma_instance.write() reg_list += tiu_instance.reg_list + gdma_instance.reg_list + sdma_instance.reg_list + cdma_instance.reg_list else: diff --git a/python/PerfAI/PerfAI.web/src/dma.py b/python/PerfAI/PerfAI.web/src/dma.py index 45be2d01e..4b791e139 100644 --- a/python/PerfAI/PerfAI.web/src/dma.py +++ b/python/PerfAI/PerfAI.web/src/dma.py @@ -11,6 +11,7 @@ from decimal import Decimal import os import sys +import glob, re from utils.utils import * @@ -34,7 +35,7 @@ def __init__(self, dirpath, dmaType): self.total_burst_length = 0 self.total_xact_cnt = 0 self.frequency = 0 - self.columns = ['Engine Id', 'Core Id', 'Cmd Id', 'Layer Id', 'Layer Name', 'Subnet Id', 'Subnet Type', 'File Line', + self.columns = ['Engine Id', 'Core Id', 'Global Idx', 'Cmd Id', 'Layer Id', 'Layer Name', 'Subnet Id', 'Subnet Type', 'File Line', 'Function Type', 'Function Name', 'DMA data size(B)', 'Start Cycle', 'End Cycle', 'Asic Cycle', 'Stall Cycle', 'DDR Bandwidth(GB/s)','L2M Bandwidth(GB/s)', 'Direction', 'AvgBurstLength', 'Data Type', 'Non32ByteRatio', 'MaskWriteRatio', 'cmd_id_dep', 'cmd_special_function', 'src_start_addr', 'dst_start_addr', @@ -48,6 +49,12 @@ def __init__(self, dirpath, dmaType): 'mask_start_addr_h8', 'mask_start_addr_l32', 'mask_data_format', 'localmem_mask_h32', 'localmem_mask_l32', 'fill_constant_en', 'constant_value', 'index', 'cmd_short', 'intr_en', 'Msg Id', 'Sd\Wt Count'] + self.sys_cmd_id = '6'; + self.sys_wait_id = ['4'] + if self.dmaType == 'CDMA': + self.sys_cmd_id = '7' + self.sys_wait_id = ['4', '6'] + def dma_engine_type(self): if self.dmaType == 'CDMA': return '4' @@ -58,12 +65,42 @@ def dma_engine_type(self): self.dmaType = 'TDMA' return '3' + # def process_file(self, layer_map): + # engineId = self.dma_engine_type() + # # file_name = f"{self.dirpath}/{self.dmaType.lower()}RegInfo_0.txt" + # file_name = os.path.join(self.dirpath,f'{self.dmaType.lower()}RegInfo_0.txt') + # if os.path.exists(file_name) and os.path.getsize(file_name) > 0: + # with open(file_name, "r") as f: + # lines = f.readlines() + # for line in lines: + # self.linecount += 1 + # if "\t" in line: + # fields = line.split(': ') + # attr = fields[0][1:] + # val = fields[1][:-1] + # self.chipArgs[attr] = val + # if f'__{self.dmaType}_REG_INFO__' in line: + # break + # self.frequency = int(self.chipArgs['DMA Frequency(MHz)']) + # coreNum = int(self.chipArgs['Core Num']) + # for coreId in range(int(coreNum)): + # curDmaRegFile = f"{self.dirpath}/{self.dmaType.lower()}RegInfo" + '_' + str(coreId) + '.txt' + # if os.path.exists(curDmaRegFile) and os.path.getsize(curDmaRegFile) != 0: + # self.actual_corenum += 1 + # dmaDf_list = [] #list of tiu dataframes + # for coreId in range(self.actual_corenum): + # dmaDf_list.append(self.process_data(coreId,engineId,layer_map)) + # return dmaDf_list + # else: + # self.dma_cycle_list.append(0) + # return [] def process_file(self, layer_map): engineId = self.dma_engine_type() # file_name = f"{self.dirpath}/{self.dmaType.lower()}RegInfo_0.txt" - file_name = os.path.join(self.dirpath,f'{self.dmaType.lower()}RegInfo_0.txt') - if os.path.exists(file_name) and os.path.getsize(file_name) > 0: - with open(file_name, "r") as f: + # file_name = os.path.join(self.dirpath,f'{self.dmaType.lower()}RegInfo_0.txt') + file_names = sorted(glob.glob(self.dirpath + f"{self.dmaType.lower()}RegInfo_*.txt")) + if file_names and os.path.exists(file_names[0]) and os.path.getsize(file_names[0]) > 0: + with open(file_names[0], "r") as f: lines = f.readlines() for line in lines: self.linecount += 1 @@ -76,14 +113,25 @@ def process_file(self, layer_map): break self.frequency = int(self.chipArgs['DMA Frequency(MHz)']) coreNum = int(self.chipArgs['Core Num']) - for coreId in range(int(coreNum)): - curDmaRegFile = f"{self.dirpath}/{self.dmaType.lower()}RegInfo" + '_' + str(coreId) + '.txt' - if os.path.exists(curDmaRegFile) and os.path.getsize(curDmaRegFile) != 0: - self.actual_corenum += 1 - dmaDf_list = [] #list of tiu dataframes - for coreId in range(self.actual_corenum): - dmaDf_list.append(self.process_data(coreId,engineId,layer_map)) - return dmaDf_list + if engineId != '4': + for coreId in range(int(coreNum)): + curDmaRegFile = f"{self.dirpath}/{self.dmaType.lower()}RegInfo" + '_' + str(coreId) + '.txt' + if os.path.exists(curDmaRegFile) and os.path.getsize(curDmaRegFile) != 0: + self.actual_corenum += 1 + dmaDf_list = [] #list of tiu dataframes + for coreId in range(self.actual_corenum): + dmaDf_list.append(self.process_data(coreId,engineId,layer_map)) + return dmaDf_list + else: + self.actual_corenum = 1 + dmaDf_list = [] + for f in file_names: + port = eval(re.search(rf"{self.dmaType.lower()}RegInfo_(\d+)\.txt", f).group(1)) + data = self.process_data(port,engineId,layer_map) + data.port = port + dmaDf_list.append(data) + return dmaDf_list + else: self.dma_cycle_list.append(0) return [] @@ -151,21 +199,26 @@ def process_data(self, coreId, engineId, layer_map): totalInstRegList = [] for i in range(len(new_reglist)): regDict = new_reglist[i] - if regDict['cmd_type'] == '6': + # dma_sys do not transfer data + if regDict['cmd_type'] == self.sys_cmd_id: regDict['Data Type'] = 'None' - if int(regDict['cmd_type']) == 6: # dma_sys do not transfer data regDict['Direction'] = '-' + transfer_bytes = 0 + if regDict['DMA data size(B)'].isnumeric(): + transfer_bytes = int(regDict['DMA data size(B)']) if regDict['Asic Cycle'].isnumeric(): DmaCycle += int(regDict['Asic Cycle']) if 'DDR' in regDict['Direction'] and regDict['DMA data size(B)'].isnumeric(): - dmaDdrTotalDataSize += int(regDict['DMA data size(B)']) - dmaDdrCycle += float(regDict['Asic Cycle']) - dmaDdrBurstLength += int(regDict['gmem_bl_sum']) - dmaDdrXactCnt += int(regDict['gmem_xact_cnt']) + if self.dmaType != 'CDMA' or transfer_bytes: + dmaDdrTotalDataSize += transfer_bytes + dmaDdrCycle += float(regDict['Asic Cycle']) + dmaDdrBurstLength += int(regDict['gmem_bl_sum']) + dmaDdrXactCnt += int(regDict['gmem_xact_cnt']) elif 'L2' in regDict['Direction'] and regDict['DMA data size(B)'].isnumeric(): - dmaL2TotalDataSize += int(regDict['DMA data size(B)']) - dmaL2Cycle += float(regDict['Asic Cycle']) - if regDict['cmd_type'] == '6' and regDict['cmd_special_function'] == '4': + if self.dmaType != 'CDMA' or transfer_bytes: + dmaL2TotalDataSize += transfer_bytes + dmaL2Cycle += float(regDict['Asic Cycle']) + if regDict['cmd_type'] == self.sys_cmd_id and regDict['cmd_special_function'] in self.sys_wait_id: dmaWaitMsgTotalTime += eval(regDict['Asic Cycle']) if int(regDict['gmem_xact_cnt']) > 0: regDict['AvgBurstLength'] = Decimal( diff --git a/python/PerfAI/PerfAI.web/src/summary.py b/python/PerfAI/PerfAI.web/src/summary.py index ae192a8e4..70b33342f 100644 --- a/python/PerfAI/PerfAI.web/src/summary.py +++ b/python/PerfAI/PerfAI.web/src/summary.py @@ -68,7 +68,7 @@ def _process_dma(self, dmaProcessor, add_l2=True): if len(temp_timelist) >= len(self.total_time_list): self.total_time_list = temp_timelist dmaProcessor.dma_cycle_list.append(max(dmaProcessor.dma_cycle_list)) - last_row = len(self.total_time_list) - 1 + last_row = len(dmaProcessor.dma_cycle_list) - 1 dmaProcessor.dma_cycle_list[last_row] = str( (Decimal(dmaProcessor.dma_cycle_list[last_row] / dmaProcessor.frequency)).quantize(Decimal("0.00"))) + 'us' dmaProcessor.dma_ddr_total_datasize_list.append(sum(dmaProcessor.dma_ddr_total_datasize_list)) @@ -157,6 +157,10 @@ def make_summary(self): self.data[0] = CoreIdList self.data[1] = ParallelismList self.data[2] = self.total_time_list + max_len = max([len(i)for i in self.data]) + for i in self.data: + if len(i) < max_len: + i[1:1] = [0] * (max_len - len(i)) summaryData = transpose(self.data).tolist() summaryDf = pd.DataFrame(summaryData, columns=self.columns, index=None) return summaryDf diff --git a/python/PerfAI/PerfAI.web/src/tiu.py b/python/PerfAI/PerfAI.web/src/tiu.py index 3eeb1ddb5..379af9889 100644 --- a/python/PerfAI/PerfAI.web/src/tiu.py +++ b/python/PerfAI/PerfAI.web/src/tiu.py @@ -29,7 +29,7 @@ def __init__(self, dirpath): self.total_alg_ops_list = [] self.uArach_rate_list = [] self.total_uarch_ops_list = [] - self.columns = ['Engine Id', 'Core Id', 'Cmd Id', 'Layer Id', 'Layer Name', 'Subnet Id', 'Subnet Type', 'File Line', + self.columns = ['Engine Id', 'Core Id', 'Global Idx', 'Cmd Id', 'Layer Id', 'Layer Name', 'Subnet Id', 'Subnet Type', 'File Line', 'Function Type', 'Function Name', 'Alg Cycle', 'Alg Ops','Asic Cycle', 'Start Cycle', 'End Cycle', 'uArch Ops', 'uArch Rate', 'Bank Conflict Ratio', diff --git a/python/PerfAI/PerfAI.web/utils/js_prep.py b/python/PerfAI/PerfAI.web/utils/js_prep.py index 3b4ff4d1f..e9ee33cc0 100644 --- a/python/PerfAI/PerfAI.web/utils/js_prep.py +++ b/python/PerfAI/PerfAI.web/utils/js_prep.py @@ -55,7 +55,7 @@ def generate_jsfile(dirpath, name, out_path, file_path, layerinfo_path): ddrBw = pd.to_numeric(chipArchArgs['DDR Max BW(GB/s/Core)']) L2Bw = pd.to_numeric(chipArchArgs['L2 Max BW(GB/s)']) dependCmds = parse_cmdgroups(file_path) - time_header = ["category", "begin_time", "end_time", "Duration", "stall_time", "func_type", "height", "cmd", "func_name","uArchRate/BW", "Data Type", "Info","Msg_Id","Sd/Wt_Count"] + time_header = ["category", "begin_time", "end_time", "Duration", "stall_time", "func_type", "height", "cmd", "global_idx", "func_name","uArchRate/BW", "Data Type", "Info","Msg_Id","Sd/Wt_Count"] filter_cols = [time_header.index(c) for c in ["category", "func_type"]] # time_header = ["category", "begin_time", "end_time", "Duration", "stall_time", "func_type", "height", "cmd", "func_name", 'layer_id','layer_name','subnet_id','subnet_type',"uArchRate/BW", "Data Type", "Info","Msg_Id","Sd/Wt_Count"] # filter_cols.extend([time_header.index(c) for c in ['layer_id','layer_name','subnet_id','subnet_type']]) @@ -64,12 +64,13 @@ def generate_jsfile(dirpath, name, out_path, file_path, layerinfo_path): if len(sdma_instances[0]) > 0: categories.append("TPU_SDMA") if len(cdma_instances) > 0: - categories.append("TPU_CDMA") + for inst in cdma_instances: + categories.append(f"TPU_CDMA_PORT{inst.port}") if tiu_layer_map or dma_layer_map: include_layer = True categories.append("TPU_LAYER") categories.append("TPU_GROUP_LAYER") - time_header = ["category", "begin_time", "end_time", "Duration", "stall_time", "func_type", "height", "cmd", "func_name", 'layer_id','layer_name','subnet_id','subnet_type',"uArchRate/BW", "Data Type", "Info","Msg_Id","Sd/Wt_Count"] + time_header = ["category", "begin_time", "end_time", "Duration", "stall_time", "func_type", "height", "cmd", "global_idx", "func_name", 'layer_id','layer_name','subnet_id','subnet_type',"uArchRate/BW", "Data Type", "Info","Msg_Id","Sd/Wt_Count"] filter_cols.extend([time_header.index(c) for c in ['layer_id','layer_name','subnet_id','subnet_type']]) lmem_size = int(chipArchArgs['TPU Lmem Size(MiB)']) lane_num = int(chipArchArgs['NPU Num']) @@ -94,9 +95,15 @@ def generate_jsfile(dirpath, name, out_path, file_path, layerinfo_path): if not sdmadf.empty: prepare_data(include_layer, sdmadf, sdmaProcessor.frequency, idx, categories.index("TPU_SDMA"), [ddrBw, L2Bw], lane_num, cycle_data_dict, lmem_op_dict, lane_size) - if idx < len(cdma_instances): - cdmadf = cdma_instances[idx] - prepare_data(include_layer, cdmadf, cdmaProcessor.frequency,idx, categories.index("TPU_CDMA"), [ddrBw, L2Bw], lane_num, cycle_data_dict, lmem_op_dict, lane_size) + # if idx < len(cdma_instances): + if idx == 7: + for i in range(len(cdma_instances)): + cdmadf = cdma_instances[i] + prepare_data(include_layer, cdmadf, cdmaProcessor.frequency,idx, categories.index(f"TPU_CDMA_PORT{cdmadf.port}"), [ddrBw, L2Bw], lane_num, cycle_data_dict, lmem_op_dict, lane_size) + # if i == 1: + # prepare_data(include_layer, cdmadf, cdmaProcessor.frequency,idx, categories.index("TPU_CDMA"), [ddrBw, L2Bw], lane_num, cycle_data_dict, lmem_op_dict, lane_size) + # else: + # prepare_data(include_layer, cdmadf, cdmaProcessor.frequency,idx, categories.index("TPU_CDMA1"), [ddrBw, L2Bw], lane_num, cycle_data_dict, lmem_op_dict, lane_size) cycle_data_dict = merge_layer_data(cycle_data_dict, categories) cycle_data_dict = merge_group_layer_data(cycle_data_dict, categories, file_line_dict) @@ -139,7 +146,7 @@ def generate_jsfile(dirpath, name, out_path, file_path, layerinfo_path): js.write(f'window.{keyname} = [{js_content}]\n') def prepare_data(if_layer, data, frequency, idx, ip_type, bwlist, lane_num, cycle_data_dict, lmem_op_dict, lane_size): - if data.empty: + if data is None or data.empty: return read_directions = ['DDR->LMEM'] + [f'DDR->LMEM{i}' for i in range(8)] + [f'L2M->LMEM{i}' for i in range(8)] + [f'L2M{i}->LMEM{i}' for i in range(8)] write_directions = ['LMEM->DDR'] + [f'LMEM{i}->DDR' for i in range(8)] + [f'LMEM{i}->L2M' for i in range(8)] + [f'LMEM{i}->L2M{i}' for i in range(8)] @@ -168,6 +175,7 @@ def prepare_data(if_layer, data, frequency, idx, ip_type, bwlist, lane_num, cycl data['Function Type'][i] if 'Function Type' in data else '', height, cmd, + data['Global Idx'][i], data['Function Name'][i] ] if if_layer: diff --git a/python/debugger/target_1690/context.py b/python/debugger/target_1690/context.py index 25b799a8e..11c7a58e2 100644 --- a/python/debugger/target_1690/context.py +++ b/python/debugger/target_1690/context.py @@ -9,7 +9,7 @@ from functools import partial, lru_cache import numpy as np -from .regdef import sDMA_sys_reg as dma_sys, SYS_reg as tiu_sys, SYS_TR_ACC_reg +from .regdef import sDMA_sys_reg as dma_sys, SYS_reg as tiu_sys, SYS_TR_ACC_reg, sCDMA_sys_reg as cdma_sys from .memmap import * from .decoder import Decoder from typing import List, Type @@ -29,6 +29,7 @@ class BM1690Context(BModelContext): dma_sys = dma_sys tiu_sys = tiu_sys + cdma_sys = cdma_sys local_layout_to_stride = local_layout_to_stride valid_tag = {1: 0, 2: 1} # {tag : corresponding index in self.base_addr} @@ -118,7 +119,7 @@ def fix_tgcr_cmd_id_dp(tiu_cmd: List[BaseTpuCmd]): @classmethod def is_sys(cls, cmd: BaseTpuCmd): - return isinstance(cmd.reg, (dma_sys, tiu_sys)) + return isinstance(cmd.reg, (dma_sys, tiu_sys, cdma_sys)) def get_runner(self, memory_size: int) -> CModelRunner: assert self.using_cmodel, "2260 currently only support cmodel mode" diff --git a/python/debugger/target_1690/decoder.py b/python/debugger/target_1690/decoder.py index f56869a9d..3f9f52268 100644 --- a/python/debugger/target_1690/decoder.py +++ b/python/debugger/target_1690/decoder.py @@ -16,7 +16,7 @@ import ctypes from .regdef import op_class_dic -from .regdef import sDMA_sys_reg as dma_sys, SYS_reg as tiu_sys +from .regdef import sDMA_sys_reg as dma_sys, SYS_reg as tiu_sys, sCDMA_sys_reg as cdma_sys from ..target_common import ( atomic_reg, CMDType, @@ -24,7 +24,7 @@ BaseTpuCmd, HeadDef, ) -from .opdef import tiu_index, tiu_cls, dma_index, dma_cls, TiuCmd, DmaCmd +from .opdef import tiu_index, tiu_cls, dma_index, dma_cls, cdma_index, TiuCmd, DmaCmd if TYPE_CHECKING: @@ -107,6 +107,26 @@ class DmaHead(HeadDef): def __hash__(self): return hash((bool(self.cmd_short), self.cmd_type, self.cmd_sp_func)) +class CDmaHead(HeadDef): + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("stride_enable", ctypes.c_uint64, 1), + ("nchw_copy", ctypes.c_uint64, 1), + ("reserved", ctypes.c_uint64, 1), + ("cmd_type", ctypes.c_uint64, 4), + ("cmd_sp_func", ctypes.c_uint64, 3), + ] + intr_en: int + stride_enable: int + nchw_copy: int + cmd_short: int + reserved: int + cmd_type: int + cmd_sp_func: int + + def __hash__(self): + return hash((False, self.cmd_type, self.cmd_sp_func)) + TiuHeads: List[ctypes.Structure] = [TiuHead, SYS_TR_ACC_HEAD] @@ -173,6 +193,32 @@ def decode_dma_cmd( ) return cmd + def decode_cdma_cmd( + self, reg_buf: memoryview, *, offset, core_id, cmd_id, subnet_id): + assert cmd_id is not None, "1688 must assign cmd_id manully" + head = CDmaHead.from_buffer(reg_buf, offset) # type: DmaHead + op_info = cdma_index.get(head, None) + + assert op_info is not None, ( + f"Unable to decode DMA code at offset {offset} out of {len(reg_buf)} total." + f" Potential head identified as {head}" + ) + # get op struct + op_clazz = op_class_dic[op_info.name] + + reg = self.decode_reg(op_clazz, buf=reg_buf, offset=offset) + buf = reg_buf[offset : offset + op_clazz.length // 8] + param_fn = self.context.opparam_converter.get(reg.OP_NAME, None) + cmd = op_info( + reg, + buf=buf, + cmd_id=cmd_id, + subnet_id=subnet_id, + core_id=core_id, + param_fn=param_fn, + ) + return cmd + def decode_dma_cmds( self, reg_buf: bytes, @@ -224,6 +270,35 @@ def decode_tiu_cmds( break return res + def decode_cdma_cmds( + self, + reg_buf: bytes, + *, + core_id=0, + subnet_id=0, + ) -> List[atomic_reg]: + offset = 0 + res = [] + cmd_id = 1 + while offset < len(reg_buf): + cmd = self.decode_cdma_cmd( + reg_buf, + offset=offset, + core_id=core_id, + subnet_id=subnet_id, + cmd_id=cmd_id, + ) + + cmd_id += 1 + offset += cmd.reg.length // 8 + res.append(cmd) + # reached sys_end already + if isinstance(cmd.reg, cdma_sys) and cmd.reg.cmd_special_function == 0: + break + if self.buf_is_end(reg_buf[offset:], cmd, cdma_sys): + break + return res + @staticmethod def buf_is_end(reg_buf, operation: BaseTpuCmd, end_op): is_sys = isinstance(operation.reg, end_op) diff --git a/python/debugger/target_1690/opdef.py b/python/debugger/target_1690/opdef.py index fd3dcb12e..dfab12b03 100644 --- a/python/debugger/target_1690/opdef.py +++ b/python/debugger/target_1690/opdef.py @@ -12,15 +12,17 @@ should be maintained manully. """ from typing import Dict, Tuple -from ..target_common import BaseTpuCmd, atomic_reg, OpInfo, Tiu, Dma, ALIGN, RegIndex +from ..target_common import BaseTpuCmd, atomic_reg, OpInfo, Tiu, Dma, Cdma, ALIGN, RegIndex from .regdef import SYS_TR_ACC_reg, SYS_reg, sDMA_sys_reg # global data and type # ------------------------------------------------------------ tiu_cls = dict() dma_cls = dict() +cdma_cls = dict() tiu_index = RegIndex() dma_index = RegIndex() +cdma_index = RegIndex() def fix_cmd_id_dep(value): @@ -245,6 +247,108 @@ def get_op_info(self) -> list: op_info_list.append(self.operands[i]) return op_info_list +class CdmaCmd(BaseTpuCmd, Cdma): + opparam_converter = None # assigned by BM1690Context instance + + description = "CDMA Operation." + opcode_bits = (4, 8) + fun_bits = (8, 11) + sp_fun = () + short_cmd = False + + def __init__( + self, + reg: atomic_reg, + *, + buf: memoryview, + cmd_id, + param_fn, + subnet_id=0, + core_id=0, + ) -> None: + assert param_fn is not None + super().__init__( + reg, buf=buf, subnet_id=subnet_id, core_id=core_id, param_fn=param_fn + ) + self.cmd_id = cmd_id + # lazy assigned in merge_instruction + # self.cmd_id_dep = fix_cmd_id_dep(reg.cmd_id_dep) + + def ops(self, is_arch): + return 0 + @property + def op_name(self): + op_name = self.reg.OP_NAME + op_info = cdma_cls[self.reg.OP_NAME] + sp_func_id = self.reg["cmd_special_function"] + if len(op_info["sp_fun"]) != 0: + op_name = op_info["sp_fun"][sp_func_id] + + return op_name + + + def __init_subclass__(cls) -> None: + if len(cls.sp_fun) == 0: + cls.sp_fun = {0: "none"} + cdma_index[(cls.short_cmd, cls.opcode, cls.sp_fun)] = cls + + cdma_cls[cls.name] = { + "description": cls.description, + "tsk_typ": cls.opcode, + "sp_fun": cls.sp_fun, + # "short_cmd": cls.short_cmd, + } + return cls + + def __repr__(self): + ci = self.core_id + if self.operands == [] or self.results == []: + if self.attribute: + tmp_attr = self.attribute.copy() + has_msg_id = "msg_id" in tmp_attr + if has_msg_id: + msg_id = tmp_attr.pop("msg_id") + attribute = f" {self.attribute}".replace(":", " =").replace("'", "") + return ( + f'%D{self.cmd_id}C{ci} = "{self.op_name}"' + + (f"(%msg{msg_id})" if has_msg_id else "") + + attribute + ) + else: + return self.description + res_name, res_type_t = zip(*((x.name, x.type_str) for x in self.results)) + opd_name, opd_type_t = zip(*((x.name, x.type_str) for x in self.operands)) + + + attribute_dic = {} + if self.attribute: + attribute_dic.update(self.attribute) + + op_name = self.op_name + + attribute = f"{attribute_dic}" if len(attribute_dic) > 0 else "" + attribute = f" {attribute}".replace(":", " =").replace("'", "") + + return ( + f"{', '.join(res_name)}, %D{self.cmd_id}C{ci} = \"{op_name}\"" + + f"({', '.join(opd_name)})" + + attribute + + f": ({', '.join(opd_type_t)}, none) -> ({res_type_t[0]}, none)" + ) + + def get_op_info(self) -> list: + op_info_list = [] + op_info_list.append(self.op_name) + id_info = f"%D{self.cmd_id}C{self.core_id}" + op_info_list.append(id_info) + if len(self.results) > 0: + op_info_list.append(self.results[0]) + else: + op_info_list.append("") + if len(self.operands) > 0: + for i in range(len(self.operands)): + op_info_list.append(self.operands[i]) + return op_info_list class conv_op(TiuCmd): name = "CONV" @@ -809,3 +913,70 @@ class dma_transfer(DmaCmd): opcode = 16 sp_fun = {0: "dma.transfer"} description = "DMA transfer" + +class cdma_send(CdmaCmd): + name = "CDMA_send" + opcode = 0 + sp_fun = {0: "cdma.send"} + description = "CDMA send" + +class cdma_read(CdmaCmd): + name = "CDMA_read" + opcode = 1 + sp_fun = {0: "cdma.read"} + description = "CDMA read" + +class cdma_write(CdmaCmd): + name = "CDMA_write" + opcode = 2 + sp_fun = {0: "cdma.write"} + description = "CDMA write" + +class cdma_general(CdmaCmd): + name = "CDMA_general" + opcode = 3 + sp_fun = {0: "cdma.general"} + description = "CDMA general" + +class cdma_receive(CdmaCmd): + name = "CDMA_receive" + opcode = 4 + sp_fun = {0: "cdma.receive"} + description = "CDMA receive" + +class cdma_lossy_compress(CdmaCmd): + name = "CDMA_lossy_compress" + opcode = 5 + sp_fun = {0: "cdma.lossy_compress"} + description = "CDMA lossy compress" + +class cdma_lossy_decompress(CdmaCmd): + name = "CDMA_lossy_decompress" + opcode = 6 + sp_fun = {0: "cdma.lossy_decompress"} + description = "CDMA lossy decompress" + +class cdma_sys(CdmaCmd): + name = "sCDMA_sys" + opcode = 7 + sp_fun = {0: "cdma.chain_end", + 1: "cdma.nop", + 2: "cdma.sys_tr_wr", + 3: "cdma.sys_msg_tx_send", + 4: "cdma.sys_msg_tx_wait", + 5: "cdma.sys_msg_rx_send", + 6: "cdma.sys_msg_rx_wait", + } + description = "sCDMA sys" + +class cdma_tcp_send(CdmaCmd): + name = "CDMA_tcp_send" + opcode = 8 + sp_fun = {} # no sp_fun + description = "CDMA tcp send" + +class cdma_tcp_receive(CdmaCmd): + name = "CDMA_tcp_receive" + opcode = 9 + sp_fun = {} # no sp_fun + description = "CDMA tcp receive" diff --git a/python/debugger/target_1690/opparam.py b/python/debugger/target_1690/opparam.py index 2a338b557..a5e9f7a6d 100644 --- a/python/debugger/target_1690/opparam.py +++ b/python/debugger/target_1690/opparam.py @@ -1372,3 +1372,147 @@ def sDMA_sys_converter(context: "BM1690Context", reg: sDMA_sys_reg): elif reg.cmd_special_function > 4: raise KeyError(f"cmd_special_function {reg.cmd_special_function} not supported") return ([], attr, []) + + +def cdma_base(context, reg): + src_n, src_c, src_h, src_w = (reg[f"src_{d}size"] for d in "nchw") + dst_n, dst_c, dst_h, dst_w = (reg[f"dst_{d}size"] for d in "nchw") + src_sn, src_sc, src_sh = (reg[f"src_{d}stride"] for d in "nch") + dst_sn, dst_sc, dst_sh = (reg[f"dst_{d}stride"] for d in "nch") + opd0 = dict( + address=dma_addr(reg.src_start_addr_h13, reg.src_start_addr_l32), + dtype=DType(reg.src_data_format), + shape=(src_n, src_c, src_h, src_w), + stride=(src_sn, src_sc, src_sh, 1), + layout=Layout.compact, + ) + res0 = dict( + address=dma_addr(reg.dst_start_addr_h13, reg.dst_start_addr_l32), + dtype=DType(reg.src_data_format), + shape=(dst_n, dst_c, dst_h, dst_w), + stride=(dst_sn, dst_sc, dst_sh, 1), + layout=Layout.compact, + ) + if reg.nchw_copy: + res0["shape"] = opd0["shape"] + + attr = dict() + operands = [get_value(context, **opd0)] + results = [get_value(context, **res0)] + + return (results, attr, operands) + +def cdma_operand_base(context, reg): + n, c, h, w = (reg[f"src_{d}size"] for d in "nchw") + sn, sc, sh = (reg[f"src_{d}stride"] for d in "nch") + stride = (sn, sc, sh, 1) + opd0 = dict( + address=dma_addr(reg.src_start_addr_h13, reg.src_start_addr_l32), + dtype=DType(reg.src_data_format), + shape=(n, c, h, w), + stride=stride, + layout=Layout.alignLine, + ) + attr = dict() + operands = [get_value(context, **opd0)] + return ([], attr, operands) + +def cdma_result_base(context, reg): + n, c, h, w = (reg[f"dst_{d}size"] for d in "nchw") + sn, sc, sh = (reg[f"dst_{d}stride"] for d in "nch") + stride = (sn, sc, sh, 1) + res0 = dict( + address=dma_addr(reg.dst_start_addr_h13, reg.dst_start_addr_l32), + dtype=DType(0), + shape=(n, c, h, w), + stride=stride, + layout=Layout.compact, + ) + attr = dict() + results = [get_value(context, **res0)] + return (results, attr, []) + + +@opparam_converter_regitstry("CDMA_send") +def CDMA_send_converter(context: "BM1690Context", reg: CDMA_send_reg): + rao = cdma_operand_base(context, reg) + rao[1]["psum_op"] = "rd+wr" if reg.psum_op else "wo" + return rao + +@opparam_converter_regitstry("CDMA_read") +def CDMA_read_converter(context: "BM1690Context", reg: CDMA_read_reg): + return cdma_base(context, reg) + +@opparam_converter_regitstry("CDMA_write") +def CDMA_write_converter(context: "BM1690Context", reg: CDMA_write_reg): + return cdma_base(context, reg) + +@opparam_converter_regitstry("CDMA_general") +def CDMA_general_converter(context: "BM1690Context", reg: CDMA_general_reg): + copy_len = reg.cmd_length + opd0 = dict( + address=dma_addr(reg.src_start_addr_h13, reg.src_start_addr_l32), + dtype=DType(reg.src_data_format), + shape=(copy_len,), + stride=(1,), + layout=Layout.DMAlinear, + ) + res0 = dict( + address=dma_addr(reg.dst_start_addr_h13, reg.dst_start_addr_l32), + dtype=DType(reg.src_data_format), + shape=(copy_len,), + stride=(1,), + layout=Layout.DMAlinear, + ) + # attr = dict(base=reg.dst_nstride) + attr = dict() + operands = [get_value(context, **opd0)] + results = [get_value(context, **res0)] + return (results, attr, operands) + +@opparam_converter_regitstry("CDMA_receive") +def CDMA_receive_converter(context: "BM1690Context", reg: CDMA_receive_reg): + reduce_type = {0: "nop", 1:"mul", 2:"max", 3:"min", 4:"add"} + rao = cdma_result_base(context, reg) + rao[1]["reduce_op"] = reduce_type[reg.reduce_op] + return rao + +@opparam_converter_regitstry("CDMA_lossy_compress") +def CDMA_lossy_compress_converter(context: "BM1690Context", reg: CDMA_lossy_compress_reg): + rao = cdma_operand_base(context, reg) + rao[1]["psum_op"] = "rd+wr" if reg.psum_op else "wo" + return rao + +@opparam_converter_regitstry("CDMA_lossy_decompress") +def CDMA_lossy_decompress_converter(context: "BM1690Context", reg: CDMA_lossy_decompress_reg): + rao = cdma_operand_base(context, reg) + rao[1]["psum_op"] = "rd+wr" if reg.psum_op else "wo" + return rao + +@opparam_converter_regitstry("sCDMA_sys") +def sCDMA_sys_converter(context: "BM1690Context", reg: sCDMA_sys_reg): + constant_value_l32 = reg.constant_value_l32 + attr = {} + if reg.cmd_special_function in (3, 4, 5, 6): + msg_id = constant_value_l32 & 0x1FF + cnt = (constant_value_l32 >> 16) & 0x7F + attr = dict(msg_id=msg_id, cnt=cnt) + elif reg.cmd_special_function == 2: + const_value = reg.constant_value_h32<<32|constant_value_l32 + reg_sel = reg.reg_sel + if reg_sel == 0: + reg_sel = "current_id" + elif reg_sel == 1: + reg_sel = "src_mac_id" + elif reg_sel == 2: + reg_sel = "dst_mac_id" + attr = dict(const=const_value, reg_sel=reg_sel) + return ([], attr, []) + +@opparam_converter_regitstry("CDMA_tcp_send ") +def CDMA_tcp_send_converter(context: "BM1690Context", reg: CDMA_tcp_send_reg): + return [] * 3 + +@opparam_converter_regitstry("CDMA_tcp_rcv ") +def CDMA_tcp_rcv_converter(context: "BM1690Context", reg: CDMA_tcp_rcv_reg): + return [] * 3 diff --git a/python/debugger/target_1690/regdef.py b/python/debugger/target_1690/regdef.py index 6815b20d0..5453e84f9 100644 --- a/python/debugger/target_1690/regdef.py +++ b/python/debugger/target_1690/regdef.py @@ -5652,6 +5652,510 @@ class DMA_tansfer_reg(atomic_reg): length: int = 768 +class CDMA_send_reg(atomic_reg): + OP_NAME = "CDMA_send" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("stride_enable", ctypes.c_uint64, 1), + ("nchw_copy", ctypes.c_uint64, 1), + ("breakpoint", ctypes.c_uint64, 1), + ("cmd_type", ctypes.c_uint64, 4), + ("cmd_special_function", ctypes.c_uint64, 3), + ("reserved", ctypes.c_uint64, 1), + ("src_data_format", ctypes.c_uint64, 4), + ("src_start_addr_h13", ctypes.c_uint64, 13), + # ("src_start_addr_h13", ctypes.c_uint64, 13), + ("psum_op", ctypes.c_uint64, 3), + ("src_nstride", ctypes.c_uint64, 32), + ("src_cstride", ctypes.c_uint64, 32), + ("src_hstride", ctypes.c_uint64, 32), + ("src_nsize", ctypes.c_uint64, 16), + ("src_csize", ctypes.c_uint64, 16), + ("src_hsize", ctypes.c_uint64, 32), + ("src_wsize", ctypes.c_uint64, 32), + ("src_start_addr_l32", ctypes.c_uint64, 32), + ] + + intr_en: int + stride_enable: int + nchw_copy: int + breakpoint: int + cmd_type: int + cmd_special_function: int + reserved: int + src_data_format: int + src_start_addr_h13: int + psum_op: int + src_nstride: int + src_cstride: int + src_hstride: int + src_nsize: int + src_csize: int + src_hsize: int + src_wsize: int + src_start_addr_l32: int + + length: int = 256 + +class CDMA_read_reg(atomic_reg): + OP_NAME = "CDMA_read" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("stride_enable", ctypes.c_uint64, 1), + ("nchw_copy", ctypes.c_uint64, 1), + ("breakpoint", ctypes.c_uint64, 1), + ("cmd_type", ctypes.c_uint64, 4), + ("cmd_special_function", ctypes.c_uint64, 3), + ("reserved", ctypes.c_uint64, 1), + ("src_data_format", ctypes.c_uint64, 4), + ("src_start_addr_h13", ctypes.c_uint64, 13), + ("reserved", ctypes.c_uint64, 3), + ("dst_start_addr_h13", ctypes.c_uint64, 13), + ("reserved", ctypes.c_uint64, 19), + ("src_nstride", ctypes.c_uint64, 32), + ("src_cstride", ctypes.c_uint64, 32), + ("src_hstride", ctypes.c_uint64, 32), + ("dst_nstride", ctypes.c_uint64, 32), + ("dst_cstride", ctypes.c_uint64, 32), + ("dst_hstride", ctypes.c_uint64, 32), + ("src_nsize", ctypes.c_uint64, 16), + ("src_csize", ctypes.c_uint64, 16), + ("src_hsize", ctypes.c_uint64, 32), + ("src_wsize", ctypes.c_uint64, 32), + ("dst_nsize", ctypes.c_uint64, 16), + ("dst_csize", ctypes.c_uint64, 16), + ("dst_hsize", ctypes.c_uint64, 32), + ("dst_wsize", ctypes.c_uint64, 32), + ("src_start_addr_l32", ctypes.c_uint64, 32), + ("dst_start_addr_l32", ctypes.c_uint64, 32), + ] + + intr_en: int + stride_enable: int + nchw_copy: int + breakpoint: int + cmd_type: int + cmd_special_function: int + reserved: int + src_data_format: int + src_start_addr_h13: int + reserved: int + dst_start_addr_h13: int + reserved: int + src_nstride: int + src_cstride: int + src_hstride: int + dst_nstride: int + dst_cstride: int + dst_hstride: int + src_nsize: int + src_csize: int + src_hsize: int + src_wsize: int + dst_nsize: int + dst_csize: int + dst_hsize: int + dst_wsize: int + src_start_addr_l32: int + dst_start_addr_l32: int + + length: int = 512 + +class CDMA_write_reg(atomic_reg): + OP_NAME = "CDMA_write" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("stride_enable", ctypes.c_uint64, 1), + ("nchw_copy", ctypes.c_uint64, 1), + ("breakpoint", ctypes.c_uint64, 1), + ("cmd_type", ctypes.c_uint64, 4), + ("cmd_special_function", ctypes.c_uint64, 3), + ("fill_constant_en", ctypes.c_uint64, 1), + ("src_data_format", ctypes.c_uint64, 4), + ("src_start_addr_h13", ctypes.c_uint64, 13), + ("reserved", ctypes.c_uint64, 3), + ("dst_start_addr_h13", ctypes.c_uint64, 13), + ("reserved", ctypes.c_uint64, 19), + ("src_nstride", ctypes.c_uint64, 32), + ("src_cstride", ctypes.c_uint64, 32), + ("src_hstride", ctypes.c_uint64, 32), + ("dst_nstride", ctypes.c_uint64, 32), + ("dst_cstride", ctypes.c_uint64, 32), + ("dst_hstride", ctypes.c_uint64, 32), + ("src_nsize", ctypes.c_uint64, 16), + ("src_csize", ctypes.c_uint64, 16), + ("src_hsize", ctypes.c_uint64, 32), + ("src_wsize", ctypes.c_uint64, 32), + ("dst_nsize", ctypes.c_uint64, 16), + ("dst_csize", ctypes.c_uint64, 16), + ("dst_hsize", ctypes.c_uint64, 32), + ("dst_wsize", ctypes.c_uint64, 32), + ("src_start_addr_l32", ctypes.c_uint64, 32), + ("dst_start_addr_l32", ctypes.c_uint64, 32), + ] + + intr_en: int + stride_enable: int + nchw_copy: int + breakpoint: int + cmd_type: int + cmd_special_function: int + fill_constant_en: int + src_data_format: int + src_start_addr_h13: int + reserved: int + dst_start_addr_h13: int + reserved: int + src_nstride: int + src_cstride: int + src_hstride: int + dst_nstride: int + dst_cstride: int + dst_hstride: int + src_nsize: int + src_csize: int + src_hsize: int + src_wsize: int + dst_nsize: int + dst_csize: int + dst_hsize: int + dst_wsize: int + src_start_addr_l32: int + dst_start_addr_l32: int + + length: int = 512 + + +class CDMA_general_reg(atomic_reg): + OP_NAME = "CDMA_general" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("rcv_intr_en", ctypes.c_uint64, 1), + ("reserved", ctypes.c_uint64, 1), + ("breakpoint", ctypes.c_uint64, 1), + ("cmd_type", ctypes.c_uint64, 4), + ("cmd_special_function", ctypes.c_uint64, 3), + ("reserved", ctypes.c_uint64, 1), + ("src_data_format", ctypes.c_uint64, 4), + ("src_start_addr_h13", ctypes.c_uint64, 13), + ("reserved", ctypes.c_uint64, 3), # 32bit + ("dst_start_addr_h13", ctypes.c_uint64, 13), + ("reserved", ctypes.c_uint64, 19), + ("cmd_length", ctypes.c_uint64, 32), + ("src_start_addr_l32", ctypes.c_uint64, 32), # constant_value + ("dst_start_addr_l32", ctypes.c_uint64, 32), + ("reserved", ctypes.c_uint64, 32), + ("reserved", ctypes.c_uint64, 32), + ("reserved", ctypes.c_uint64, 32), + ] + + intr_en: int + rcv_intr_en: int + reserved: int + breakpoint: int + cmd_type: int + cmd_special_function: int + reserved: int + src_data_format: int + src_start_addr_h13: int + reserved: int + dst_start_addr_h13: int + reserved: int + cmd_length: int + src_start_addr_l32: int + dst_start_addr_l32: int + reserved: int + reserved: int + reserved: int + + length: int = 256 + +class CDMA_receive_reg(atomic_reg): + OP_NAME = "CDMA_receive" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("stride_enable", ctypes.c_uint64, 1), + ("reserved", ctypes.c_uint64, 1), + ("breakpoint", ctypes.c_uint64, 1), + ("cmd_type", ctypes.c_uint64, 4), + ("cmd_special_function", ctypes.c_uint64, 3), + ("reserved", ctypes.c_uint64, 5), + ("dst_start_addr_h13", ctypes.c_uint64, 13), + ("reduce_op", ctypes.c_uint64, 3), # 32bit + ("dst_nstride", ctypes.c_uint64, 32), + ("dst_cstride", ctypes.c_uint64, 32), + ("dst_hstride", ctypes.c_uint64, 32), + ("dst_nsize", ctypes.c_uint64, 16), + ("dst_csize", ctypes.c_uint64, 16), + ("dst_hsize", ctypes.c_uint64, 32), + ("dst_wsize", ctypes.c_uint64, 32), + ("dst_start_addr_l32", ctypes.c_uint64, 32), + ] + + intr_en: int + stride_enable: int + reserved: int + breakpoint: int + cmd_type: int + cmd_special_function: int + reserved: int + dst_start_addr_h13: int + reduce_op: int + dst_nstride: int + dst_cstride: int + dst_hstride: int + dst_nsize: int + dst_csize: int + dst_hsize: int + dst_wsize: int + dst_start_addr_l32: int + + length: int = 256 + + +class CDMA_lossy_compress_reg(atomic_reg): + OP_NAME = "CDMA_lossy_compress" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("stride_enable", ctypes.c_uint64, 1), + ("nchw_copy", ctypes.c_uint64, 1), + ("breakpoint", ctypes.c_uint64, 1), + ("cmd_type", ctypes.c_uint64, 4), + ("cmd_special_function", ctypes.c_uint64, 3), + ("reserved", ctypes.c_uint64, 1), + ("src_data_format", ctypes.c_uint64, 4), + ("src_start_addr_h13", ctypes.c_uint64, 13), + ("reduce_op", ctypes.c_uint64, 3), # 32bit + ("src_nstride", ctypes.c_uint64, 32), + ("src_cstride", ctypes.c_uint64, 32), + ("src_hstride", ctypes.c_uint64, 32), + ("src_nsize", ctypes.c_uint64, 16), + ("src_csize", ctypes.c_uint64, 16), + ("src_hsize", ctypes.c_uint64, 16), + ("reserved", ctypes.c_uint64, 16), + ("src_wsize", ctypes.c_uint64, 32), + ("src_start_addr_l32", ctypes.c_uint64, 32), + ] + + intr_en: int + stride_enable: int + nchw_copy: int + breakpoint: int + cmd_type: int + cmd_special_function: int + reserved: int + src_data_format: int + src_start_addr_h13: int + reduce_op: int + src_nstride: int + src_cstride: int + src_hstride: int + src_nsize: int + src_csize: int + src_hsize: int + reserved: int + src_wsize: int + reserved: int + src_start_addr_l32: int + + length: int = 256 + + +class CDMA_lossy_decompress_reg(atomic_reg): + OP_NAME = "CDMA_lossy_decompress" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("stride_enable", ctypes.c_uint64, 1), + ("nchw_copy", ctypes.c_uint64, 1), + ("breakpoint", ctypes.c_uint64, 1), + ("cmd_type", ctypes.c_uint64, 4), + ("cmd_special_function", ctypes.c_uint64, 3), + ("reserved", ctypes.c_uint64, 1), + ("src_data_format", ctypes.c_uint64, 4), + ("src_start_addr_h13", ctypes.c_uint64, 13), + ("reduce_op", ctypes.c_uint64, 3), # 32bit + ("src_nstride", ctypes.c_uint64, 32), + ("src_cstride", ctypes.c_uint64, 32), + ("src_hstride", ctypes.c_uint64, 32), + ("src_nsize", ctypes.c_uint64, 16), + ("src_csize", ctypes.c_uint64, 16), + ("src_hsize", ctypes.c_uint64, 16), + ("reserved", ctypes.c_uint64, 16), + ("src_wsize", ctypes.c_uint64, 32), + ("src_start_addr_l32", ctypes.c_uint64, 32), + ] + + intr_en: int + stride_enable: int + nchw_copy: int + breakpoint: int + cmd_type: int + cmd_special_function: int + reserved: int + src_data_format: int + src_start_addr_h13: int + reduce_op: int + src_nstride: int + src_cstride: int + src_hstride: int + src_nsize: int + src_csize: int + src_hsize: int + reserved: int + src_wsize: int + reserved: int + src_start_addr_l32: int + + length: int = 256 + + +class sCDMA_sys_reg(atomic_reg): + OP_NAME = "sCDMA_sys" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("stride_enable", ctypes.c_uint64, 1), + ("reserved", ctypes.c_uint64, 2), + ("cmd_type", ctypes.c_uint64, 4), + ("cmd_special_function", ctypes.c_uint64, 3), + ("reserved", ctypes.c_uint64, 21), # 32bit + ("constant_value_l32", ctypes.c_uint64, 32), + ("constant_value_h32", ctypes.c_uint64, 32), + ("reg_sel", ctypes.c_uint64, 4), + ("reserved", ctypes.c_uint64, 28), + ] + + intr_en: int + stride_enable: int + reserved: int + cmd_type: int + cmd_special_function: int + reserved: int + constant_value_l32: int + constant_value_h32: int + reg_sel: int + reserved: int + + length: int = 128 + +class CDMA_tcp_send_reg(atomic_reg): + OP_NAME = "CDMA_tcp_send" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("own", ctypes.c_uint64, 1), + ("FD", ctypes.c_uint64, 1), + ("LD", ctypes.c_uint64, 1), + ("cmd_type", ctypes.c_uint64, 4), + ("buffer_length", ctypes.c_uint64, 16), + ("breakpoint", ctypes.c_uint64, 1), + ("reserved", ctypes.c_uint64, 7), # 32bit + ("frame_length", ctypes.c_uint64, 16), + ("reserved", ctypes.c_uint64, 16), + ("buffer_addr_l32", ctypes.c_uint64, 32), + ("buffer_addr_h13", ctypes.c_uint64, 8), + ("cmd_id", ctypes.c_uint64, 24), + ] + + intr_en: int + own: int + FD: int + LD: int + cmd_type: int + buffer_length: int + breakpoint: int + reserved: int + frame_length: int + reserved: int + buffer_addr_l32: int + buffer_addr_h13: int + cmd_id: int + + length: int = 128 + +class CDMA_tcp_rcv_reg(atomic_reg): + OP_NAME = "CDMA_tcp_rcv" + _fields_ = [ + ("intr_en", ctypes.c_uint64, 1), + ("own", ctypes.c_uint64, 1), + ("reserved", ctypes.c_uint64, 2), + ("cmd_type", ctypes.c_uint64, 4), + ("buffer_length", ctypes.c_uint64, 16), + ("breakpoint", ctypes.c_uint64, 1), + ("reserved", ctypes.c_uint64, 7), # 32bit + ("reserved", ctypes.c_uint64, 32), + ("buffer_addr_l32", ctypes.c_uint64, 32), + ("buffer_addr_h13", ctypes.c_uint64, 8), + ("cmd_id", ctypes.c_uint64, 24), + ] + + intr_en: int + own: int + reserved: int + cmd_type: int + buffer_length: int + breakpoint: int + reserved: int + reserved: int + buffer_addr_l32: int + buffer_addr_h13: int + cmd_id: int + + length: int = 128 + +class CDMA_tcp_send_wb_reg(atomic_reg): + OP_NAME = "CDMA_tcp_send_wb" + _fields_ = [ + ("DERR", ctypes.c_uint64, 1), + ("own", ctypes.c_uint64, 1), + ("FD", ctypes.c_uint64, 1), + ("LD", ctypes.c_uint64, 1), + ("reserved", ctypes.c_uint64, 28), # 32bit + ("reserved", ctypes.c_uint64, 32), + ("reserved", ctypes.c_uint64, 32), + ("reserved", ctypes.c_uint64, 32), + ] + + DERR: int + own: int + FD: int + LD: int + reserved: int + reserved: int + reserved: int + reserved: int + + length: int = 128 + +class CDMA_tcp_rcv_wb_reg(atomic_reg): + OP_NAME = "CDMA_tcp_rcv_wb" + _fields_ = [ + ("reserved", ctypes.c_uint64, 1), + ("own", ctypes.c_uint64, 1), + ("FD", ctypes.c_uint64, 1), + ("LD", ctypes.c_uint64, 1), + ("MAC_filter_status", ctypes.c_uint64, 18), + ("reserved", ctypes.c_uint64, 10), # 32bit + ("packet_status", ctypes.c_uint64, 9), + ("error_summary", ctypes.c_uint64, 1), + ("packet_length", ctypes.c_uint64, 14), + ("reserved", ctypes.c_uint64, 8), + ("reserved", ctypes.c_uint64, 32), + ("reserved", ctypes.c_uint64, 32), + ] + + reserved: int + own: int + FD: int + LD: int + MAC_filter_status: int + reserved: int + packet_status: int + error_summary: int + packet_length: int + reserved: int + reserved: int + reserved: int + + length: int = 128 op_class_dic: Dict[str, Type[atomic_reg]] = { @@ -5705,4 +6209,16 @@ class DMA_tansfer_reg(atomic_reg): "DMA_lossy_decompress": DMA_lossy_decompress_reg, "DMA_randmask": DMA_randmask_reg, "DMA_tansfer": DMA_tansfer_reg, + "CDMA_send": CDMA_send_reg, + "CDMA_read": CDMA_read_reg, + "CDMA_write": CDMA_write_reg, + "CDMA_general": CDMA_general_reg, + "CDMA_receive": CDMA_receive_reg, + "CDMA_lossy_compress": CDMA_lossy_compress_reg, + "CDMA_lossy_decompress": CDMA_lossy_decompress_reg, + "sCDMA_sys": sCDMA_sys_reg, + "CDMA_tcp_send": CDMA_tcp_send_reg, + "CDMA_tcp_rcv": CDMA_tcp_rcv_reg, + # "CDMA_tcp_send_wb": CDMA_tcp_send_wb_reg, + # "CDMA_tcp_rcv_wb": CDMA_tcp_rcv_wb_reg, } diff --git a/python/debugger/target_common/__init__.py b/python/debugger/target_common/__init__.py index 55f344965..190442f4e 100644 --- a/python/debugger/target_common/__init__.py +++ b/python/debugger/target_common/__init__.py @@ -47,6 +47,7 @@ CpuCmd, Tiu, Dma, + Cdma, CMDType, DynIrCmd, RegIndex, diff --git a/python/debugger/target_common/decoder.py b/python/debugger/target_common/decoder.py index 1354effb1..ca94b863b 100644 --- a/python/debugger/target_common/decoder.py +++ b/python/debugger/target_common/decoder.py @@ -72,6 +72,8 @@ def decode_dma_cmds(self, reg_buf: bytes, *, subnet_id, **kw) -> List[BaseTpuCmd def decode_tiu_cmds(self, reg_buf: bytes, *, subnet_id, **kw) -> List[BaseTpuCmd]: raise NotImplementedError() + def decode_cdma_cmds(self, reg_buf: bytes, *, subnet_id, **kw) -> List[BaseTpuCmd]: + raise NotImplementedError() def decode_cmds(self, cmd_arry: bytes, core_id: int, cmd_id: int, t: int) -> list: raise NotImplementedError() def decode_cpu_cmd( diff --git a/python/debugger/target_common/op_support.py b/python/debugger/target_common/op_support.py index 16fcd439e..b43610f86 100644 --- a/python/debugger/target_common/op_support.py +++ b/python/debugger/target_common/op_support.py @@ -629,6 +629,8 @@ def tuple_key(self): key = (self.subnet_id, self.cmd_id, None, self.core_id) elif isinstance(self, Dma): key = (self.subnet_id, None, self.cmd_id, self.core_id) + elif isinstance(self, Cdma): + key = (self.subnet_id, None, self.cmd_id, self.core_id) else: raise NotImplementedError() return key @@ -649,6 +651,10 @@ class Dma: # Mixin for detect cmd type pass +class Cdma: + # Mixin for detect cmd type + pass + class RegIndex: def __init__(self): diff --git a/python/profile_helper/bm1690_defs.py b/python/profile_helper/bm1690_defs.py index 9edbbe0a2..267b5c75b 100755 --- a/python/profile_helper/bm1690_defs.py +++ b/python/profile_helper/bm1690_defs.py @@ -27,6 +27,8 @@ dma_sys_code = 6 cdma_sys_code = 7 profile_sys_num = 2 +profile_init_cmd_num = 2 +profile_nop_num = 1 class DynRecordType(Enum): FUNC = 0 @@ -221,7 +223,7 @@ class CDMAProfileFormat(dictStructure): ("inst_id", ct.c_uint32, 24), ("thread_id", ct.c_uint32, 1), ("reserved0", ct.c_uint32, 7), ("_reserved0", ct.c_uint32), # H1 - ("m0_data_aw_cntr", ct.c_uint32), ("m0_data_w_cntr", ct.c_uint32), + ("m0_data_aw_cntr", ct.c_uint32), ("m0_data_wr_cntr", ct.c_uint32), ("m0_data_ar_cntr", ct.c_uint32), ("reserved1", ct.c_uint32), # H2 ("m0_data_wr_valid_cntr", ct.c_uint32), ("m0_data_wr_stall_cntr", ct.c_uint32), @@ -292,6 +294,14 @@ class ProfileFormat(dictStructure): ("extra_info", ct.c_uint32, 11), ("inst_id", ct.c_uint32) ] +class BDCommandParser(): + def __init__(self) -> None: + self.ctx = get_target_context("BM1690") + + def parse(self, raw_data): + tmp = bytearray(raw_data) + return self.ctx.decoder.decode_tiu_cmds(tmp) + class GDMACommandParser(): _byte_len_ = 256 def __init__(self) -> None: @@ -301,14 +311,13 @@ def parse(self, raw_data): tmp = bytearray(raw_data) return self.ctx.decoder.decode_dma_cmds(tmp) -class BDCommandParser(): +class CDMACommandParser(): def __init__(self) -> None: self.ctx = get_target_context("BM1690") def parse(self, raw_data): tmp = bytearray(raw_data) - return self.ctx.decoder.decode_tiu_cmds(tmp) - + return self.ctx.decoder.decode_cdma_cmds(tmp) DMA_ARCH = { "Chip Arch": "sg2260", @@ -345,13 +354,10 @@ def get_src_dst_type(v): return "LMEM", "LMEM" def mem_type(v): + if v in range(30): + return "DDR" if v == TagType.TAG_LMEM: return "LMEM" - if v == TagType.TAG_USERS \ - or v == TagType.TAG_WEIGHT \ - or v == TagType.TAG_ACTIVATION \ - or v == TagType.TAG_GLOBAL: - return "DDR" if v == TagType.TAG_L2M: return "L2M" raise ValueError(f"Unknow dma mem_type: {v}") @@ -375,26 +381,43 @@ def get_dma_info_dyn(monitor_info, reg_info, engine_id=1): dma_info["src_data_format"] = dtype dma_info["cmd_type"] = reg_info.des_tsk_typ dma_info["cmd_special_function"] = reg_info.des_tsk_eu_typ - dma_info["Function Type"], dma_info["Function Name"] = getDmaFunctionName( + parser = getDmaFunctionName + if engine_id == 4: + parser = getCdmaFunctionName + dma_info["Function Type"], dma_info["Function Name"] = parser( reg_info.des_tsk_typ, reg_info.des_tsk_eu_typ, dma_info["Direction"]) dma_info["Start Cycle"] = monitor_info.inst_start_time dma_info["End Cycle"] = monitor_info.inst_end_time - dma_info["Cmd Id"] = monitor_info.inst_id + 1 + dma_info["Cmd Id"] = monitor_info.inst_id dma_info["Data Type"] = data_type.name - dma_info["Asic Cycle"] = monitor_info.inst_end_time - \ - monitor_info.inst_start_time + 1 - dma_info["Stall Cycle"] = monitor_info.gif_wr_rd_stall_cntr - dma_info["gmem_xfer_bytes(B)"] = monitor_info.gif_mem_w_cntr + monitor_info.axi_d0_w_cntr - dma_info["gmem_bandwidth"] = round(dma_info["gmem_xfer_bytes(B)"] / - dma_info["Asic Cycle"], 4) - dma_info["gmem_dma_data_size(B)"] = dma_info["gmem_xfer_bytes(B)"] - dma_info["lmem_xfer_bytes"] = monitor_info.gif_mem_w_cntr + monitor_info.axi_d0_w_cntr - dma_info["lmem_bandwidth"] = round(dma_info["lmem_xfer_bytes"] / dma_info["Asic Cycle"], 4) - - dma_info["lmem_dma_data_size(B)"] = dma_info["lmem_xfer_bytes"] - - dma_info["DMA data size(B)"] = max(dma_info["gmem_dma_data_size(B)"], dma_info["lmem_dma_data_size(B)"]) - dma_info["DDR Bandwidth(GB/s)"] = max(dma_info["lmem_bandwidth"], dma_info["gmem_bandwidth"]) + dma_info["Asic Cycle"] = monitor_info.inst_end_time - monitor_info.inst_start_time + 1 + if engine_id != 4: + dma_info["Cmd Id"] += 1 + dma_info["Stall Cycle"] = monitor_info.gif_wr_rd_stall_cntr + dma_info["gmem_xfer_bytes(B)"] = monitor_info.gif_mem_w_cntr + monitor_info.axi_d0_w_cntr + dma_info["gmem_bandwidth"] = round(dma_info["gmem_xfer_bytes(B)"] / + dma_info["Asic Cycle"], 4) + dma_info["gmem_dma_data_size(B)"] = dma_info["gmem_xfer_bytes(B)"] + dma_info["lmem_xfer_bytes"] = monitor_info.gif_mem_w_cntr + monitor_info.axi_d0_w_cntr + dma_info["lmem_bandwidth"] = round(dma_info["lmem_xfer_bytes"] / dma_info["Asic Cycle"], 4) + + dma_info["lmem_dma_data_size(B)"] = dma_info["lmem_xfer_bytes"] + + dma_info["DMA data size(B)"] = max(dma_info["gmem_dma_data_size(B)"], dma_info["lmem_dma_data_size(B)"]) + dma_info["DDR Bandwidth(GB/s)"] = max(dma_info["lmem_bandwidth"], dma_info["gmem_bandwidth"]) + dma_info["lmem_xact_cnt"] = monitor_info.gif_wr_valid_cntr + monitor_info.gif_rd_valid_cntr + dma_info["gmem_xact_cnt"] = monitor_info.axi_d0_wr_vaild_cntr + monitor_info.axi_d0_rd_vaild_cntr + else: + # pcie <-> cdma + mac <-> cdma + # dma_info["Stall Cycle"] = monitor_info.m0_data_wr_stall_cntr + monitor_info.m0_data_rd_stall_cntr \ + # + monitor_info.ari_data_stall_cntr + monitor_info.ati_data_stall_cntr + dma_info["Stall Cycle"] = 0 + dma_info["DMA data size(B)"] = monitor_info.m0_data_ar_cntr + monitor_info.m0_data_aw_cntr + dma_info["DDR Bandwidth(GB/s)"] = round(dma_info["DMA data size(B)"] / dma_info["Asic Cycle"], 4) + # dma_info["lmem_xact_cnt"] = 1 + dma_info["gmem_xact_cnt"] = monitor_info.ari_data_valid_cntr + monitor_info.ati_data_valid_cntr \ + + monitor_info.m0_data_rd_valid_cntr + monitor_info.m0_data_wr_valid_cntr + dma_info["Direction"] = "DDR->DDR" # not implemented dma_info["gmem_bl_sum"] = 0 dma_info["gmem_avg_burst_length"] = 0 @@ -404,8 +427,6 @@ def get_dma_info_dyn(monitor_info, reg_info, engine_id=1): # no need # dma_info["lmem_xact_cnt"] = monitor_info.axi_d0_ar_cntr + monitor_info.axi_d0_aw_cntr # dma_info["gmem_xact_cnt"] = dma_info["gmem_xfer_bytes(B)"] // BYTE_PER_BEAT - dma_info["lmem_xact_cnt"] = monitor_info.gif_wr_valid_cntr + monitor_info.gif_rd_valid_cntr - dma_info["gmem_xact_cnt"] = monitor_info.axi_d0_wr_vaild_cntr + monitor_info.axi_d0_rd_vaild_cntr dma_info["lmem_msk_wr_cnt"] = 0 dma_info["gmem_msk_wr_cnt"] = 0 dma_info["lmem_n32Ba_sa_cnt"] = 0 @@ -424,8 +445,6 @@ def get_dma_info_dyn(monitor_info, reg_info, engine_id=1): return dma_info, None def get_dma_info(monitor_info, reg_info, core_id, engine_id=1): - is_sys = reg_info.name == 'sDMA_sys' - _reg_info = reg_info reg_info = reg_info.reg dma_info = dict() # step1 : get registor information from command @@ -441,47 +460,55 @@ def get_dma_info(monitor_info, reg_info, core_id, engine_id=1): dma_info[trans_key] = value dma_info["mask_start_addr_h8"] = dma_info.get("mask_start_addr_h8", 0) dma_info["mask_start_addr_l32"] = dma_info.get("mask_start_addr_l32", 0) - if is_sys: - dma_info["dst_start_addr"] = 0 - dma_info["src_start_addr"] = 0 - dma_info["src_start_addr_h13"] = 0 - dma_info["dst_start_addr_h13"] = 0 - else: - dma_info["dst_start_addr"] = ( - int(dma_info["dst_start_addr_h13"]) << 32) + int(dma_info["dst_start_addr_l32"]) - dma_info["src_start_addr"] = ( - int(dma_info["src_start_addr_h13"]) << 32) + int(dma_info["src_start_addr_l32"]) - + src_h13 = dma_info.get("src_start_addr_h13", 0) + dst_h13 = dma_info.get("dst_start_addr_h13", 0) + src_l32 = dma_info.get("src_start_addr_l32", 0) + dst_l32 = dma_info.get("dst_start_addr_l32", 0) + dma_info["dst_start_addr"] = (int(dst_h13) << 32) + int(dst_l32) + dma_info["src_start_addr"] = (int(src_h13) << 32) + int(src_l32) # step2: get custom information - src_type = mem_type(dma_info['src_start_addr_h13'] >> 8) - dst_type = mem_type(dma_info['dst_start_addr_h13'] >> 8) + src_type = mem_type(src_h13 >> 8) + dst_type = mem_type(dst_h13 >> 8) # src_type = mem_type(dma_info['src_start_addr'], core_id) # dst_type = mem_type(dma_info['dst_start_addr'], core_id) - data_type = DATATYPE(reg_info.src_data_format) + data_type = None + if 'src_data_format' in reg_info: + data_type = DATATYPE(reg_info.src_data_format) + parser = getCdmaFunctionName if engine_id == 4 else getDmaFunctionName dma_info["Engine Id"] = engine_id dma_info["Direction"] = "{}->{}".format(src_type, dst_type) dma_info["from_addr"] = src_type dma_info["to_addr"] = dst_type - dma_info["Function Type"], dma_info["Function Name"] = getDmaFunctionName( + dma_info["Function Type"], dma_info["Function Name"] = parser( reg_info.cmd_type, reg_info.cmd_special_function, dma_info["Direction"]) dma_info["Start Cycle"] = monitor_info.inst_start_time dma_info["End Cycle"] = monitor_info.inst_end_time - dma_info["Cmd Id"] = monitor_info.inst_id + 1 - dma_info["Data Type"] = data_type.name - dma_info["Asic Cycle"] = monitor_info.inst_end_time - \ - monitor_info.inst_start_time + 1 - dma_info["Stall Cycle"] = monitor_info.gif_wr_rd_stall_cntr - # print(monitor_info.axi_d0_w_cntr, monitor_info.axi_d0_ar_cntr, monitor_info.axi_d0_aw_cntr) - dma_info["gmem_xfer_bytes(B)"] = monitor_info.gif_mem_w_cntr + monitor_info.axi_d0_w_cntr - dma_info["gmem_bandwidth"] = round(dma_info["gmem_xfer_bytes(B)"] / - dma_info["Asic Cycle"], 4) - dma_info["gmem_dma_data_size(B)"] = dma_info["gmem_xfer_bytes(B)"] - dma_info["lmem_xfer_bytes"] = monitor_info.gif_mem_w_cntr + monitor_info.axi_d0_w_cntr - dma_info["lmem_bandwidth"] = round(dma_info["lmem_xfer_bytes"] / dma_info["Asic Cycle"], 4) - - dma_info["lmem_dma_data_size(B)"] = dma_info["lmem_xfer_bytes"] - dma_info["DMA data size(B)"] = max(dma_info["gmem_dma_data_size(B)"], dma_info["lmem_dma_data_size(B)"]) + dma_info["Cmd Id"] = monitor_info.inst_id + dma_info["Data Type"] = data_type.name if data_type else data_type + dma_info["Asic Cycle"] = monitor_info.inst_end_time - monitor_info.inst_start_time + 1 + if engine_id != 4: + dma_info["Cmd Id"] += 1 # cdma pmu cmd_id start from 1, others from 0 + dma_info["Stall Cycle"] = monitor_info.gif_wr_rd_stall_cntr + dma_info["gmem_xfer_bytes(B)"] = monitor_info.gif_mem_w_cntr + monitor_info.axi_d0_w_cntr + dma_info["gmem_bandwidth"] = round(dma_info["gmem_xfer_bytes(B)"] / + dma_info["Asic Cycle"], 4) + dma_info["gmem_dma_data_size(B)"] = dma_info["gmem_xfer_bytes(B)"] + dma_info["lmem_xfer_bytes"] = monitor_info.gif_mem_w_cntr + monitor_info.axi_d0_w_cntr + dma_info["lmem_bandwidth"] = round(dma_info["lmem_xfer_bytes"] / dma_info["Asic Cycle"], 4) + + dma_info["lmem_dma_data_size(B)"] = dma_info["lmem_xfer_bytes"] + dma_info["DMA data size(B)"] = max(dma_info["gmem_dma_data_size(B)"], dma_info["lmem_dma_data_size(B)"]) + dma_info["lmem_xact_cnt"] = monitor_info.gif_wr_valid_cntr + monitor_info.gif_rd_valid_cntr + dma_info["gmem_xact_cnt"] = monitor_info.axi_d0_wr_vaild_cntr + monitor_info.axi_d0_rd_vaild_cntr + else: + dma_info["Stall Cycle"] = 0 + dma_info["lmem_bandwidth"] = 0 + dma_info["DMA data size(B)"] = monitor_info.m0_data_ar_cntr + monitor_info.m0_data_aw_cntr + dma_info["gmem_bandwidth"] = round(dma_info["DMA data size(B)"] / dma_info["Asic Cycle"], 4) + # dma_info["lmem_xact_cnt"] = 1 + dma_info["gmem_xact_cnt"] = monitor_info.ari_data_valid_cntr + monitor_info.ati_data_valid_cntr \ + + monitor_info.m0_data_rd_valid_cntr + monitor_info.m0_data_wr_valid_cntr if "DDR" in dma_info["Direction"]: dma_info["DDR Bandwidth(GB/s)"] = max(dma_info["lmem_bandwidth"], dma_info["gmem_bandwidth"]) dma_info['L2M Bandwidth(GB/s)'] = 0 @@ -502,8 +529,6 @@ def get_dma_info(monitor_info, reg_info, core_id, engine_id=1): # no need # dma_info["lmem_xact_cnt"] = monitor_info.axi_d0_ar_cntr + monitor_info.axi_d0_aw_cntr # dma_info["gmem_xact_cnt"] = dma_info["gmem_xfer_bytes(B)"] // BYTE_PER_BEAT - dma_info["lmem_xact_cnt"] = monitor_info.gif_wr_valid_cntr + monitor_info.gif_rd_valid_cntr - dma_info["gmem_xact_cnt"] = monitor_info.axi_d0_wr_vaild_cntr + monitor_info.axi_d0_rd_vaild_cntr dma_info["lmem_msk_wr_cnt"] = 0 dma_info["gmem_msk_wr_cnt"] = 0 dma_info["lmem_n32Ba_sa_cnt"] = 0 @@ -675,7 +700,7 @@ def getDmaFunctionName(cmd_type, cmd_special_function, direction): (1, 0): 'DMA_matrix', (1, 1): 'matrix transpose', (2, 0): 'DMA_masked_select', (2, 1): 'ncw mode', (3, 0): 'DMA_general', (3, 1): 'broadcast', - (4, 0): 'DMA_cw transpose', (4, 1): 'DMA transpose', + (4, 0): 'DMA_cw transpose',(4, 1): 'DMA_cw transpose',(4, 5): 'DMA_cw transpose', (5, 0): 'DMA_nonzero', (6, 0): 'DMA_sys', (6, 1): 'nop', (6, 2): 'sys_tr_wr', (6, 3): 'sys_send', (6, 4): 'sys_wait', (7, 0): 'DMA_gather', @@ -702,7 +727,7 @@ def getDmaFunctionName(cmd_type, cmd_special_function, direction): return functionType, functinName -def getCdmaFunctionName(cmd_type, cmd_special_function): +def getCdmaFunctionName(cmd_type, cmd_special_function, no_use): cdmaFunctionNameDict = { # DMA_send 0: "send", @@ -717,8 +742,8 @@ def getCdmaFunctionName(cmd_type, cmd_special_function): 3: "general", (3, 0): 'general', # DMA_receive_tensor - 4: "recevive_tensor", - (4, 0): 'recevive_tensor', + 4: "recevive", + (4, 0): 'recevive', # DMA_lossy_compress 5: "lossy_compress", (5, 0): 'lossy_compress', @@ -727,7 +752,7 @@ def getCdmaFunctionName(cmd_type, cmd_special_function): (6, 0): 'lossy_decompress', # DMA_sys 7: 'sys', - (7, 0): 'chain_end', (7, 1): 'nop', (7, 2): 'sys_tr_wr', (7, 3): 'sys_msg_tx_send', + (7, 0): 'end', (7, 1): 'nop', (7, 2): 'sys_tr_wr', (7, 3): 'sys_msg_tx_send', (7, 4): 'sys_msg_tx_wait', (7, 5): 'sys_msg_rx_send', (7, 6): 'sys_msg_rx_wait', # DMA_tcp_send 8: "tcp_send", diff --git a/python/profile_helper/bmprofile_common.py b/python/profile_helper/bmprofile_common.py index cb341327d..7e251d447 100644 --- a/python/profile_helper/bmprofile_common.py +++ b/python/profile_helper/bmprofile_common.py @@ -35,6 +35,10 @@ class BlockType(Enum): BMLIB_EXTRA = 10 MONITOR_SDMA = 11 MONITOR_CDMA = 12 + BLOCK_DES_BDC = 13 + BLOCK_DES_GDMA = 14 + BLOCK_DES_SDMA = 15 + BLOCK_DES_CDMA = 16 class DynExtraType(Enum): STRING=0 diff --git a/python/profile_helper/bmprofile_perfAI_2260.py b/python/profile_helper/bmprofile_perfAI_2260.py index 21f354dca..b3692f016 100755 --- a/python/profile_helper/bmprofile_perfAI_2260.py +++ b/python/profile_helper/bmprofile_perfAI_2260.py @@ -12,7 +12,7 @@ from profile_helper.bmprofile_common import BlockType, GlobalInfo, Arch from profile_helper.bmprofile_utils import re_key_value from profile_helper.bm1690_defs import get_tiu_info, get_dma_info, get_tiu_info_dyn, get_dma_info_dyn -import os, re +import os, re, math import logging from typing import List from pathlib import Path @@ -45,31 +45,35 @@ def __init__(self): self.in_dir = None self.out_dir = None self.is_dyn = False + self.cdma_cord_id = None def parse_cdma_cmd(self, file_list): print("Parsing...") self.cdma_pairs = [[] for _ in range(self.archlib.CDMA_NUM)] - for idx, infile in enumerate(file_list): + for infile in file_list: + idx = eval(re.search(r'cdma_(\d+)\.profile', infile).group(1)) + # if len(self.cdma_cmd[idx]) == 0: + # continue blocks = parse_data_blocks(infile) if blocks is None or blocks == []: continue - item = IterRecord() - item.command_info = [] + monitor_cdma = [] + des_cdma = [] blocks_factory = { - BlockType.MONITOR_CDMA.value: (item.monitor_cdma, self.__parse_monitor_cdma), + BlockType.MONITOR_CDMA.value: (monitor_cdma, self.__parse_monitor_cdma), + BlockType.BLOCK_DES_CDMA.value: (des_cdma, lambda l, raw_data: l.extend(self.cdma_parser.parse(raw_data))), } for block in blocks: item_list, item_func = blocks_factory.get( block.type.value, (0, lambda x, y: 0)) item_func(item_list, block.content) - for m in item.monitor_cdma[0]: - self.cdma_pairs[idx].append({"monitor": m, "cmd": None}) - # todo pars cmd and pmu pair (cdma has des mode?) - + self.cdma_pairs[idx] = self.make_pairs(self.cdma_cmd[idx], monitor_cdma[0], + self.archlib.cdma_sys_code, des_cdma, is_cdma=True) def parse_cmd(self, file_list): - self.gdma_parser = self.archlib.GDMACommandParser() self.bdc_parser = self.archlib.BDCommandParser() + self.gdma_parser = self.archlib.GDMACommandParser() + self.cdma_parser = self.archlib.CDMACommandParser() print("Parsing...") self.cdma_cmd = [[] for _ in range(self.archlib.CDMA_NUM)] for infile in tqdm(file_list): @@ -79,6 +83,10 @@ def parse_cmd(self, file_list): item = IterRecord() item.command_info = [] item.dyn_extra = [] + item.des_bdc = [] + item.des_gdma = [] + item.des_sdma = [] + item.des_cdma = [] blocks_factory = { BlockType.MONITOR_GDMA.value: (item.monitor_gdma, self.__parse_monitor_gdma), # # include sdma, vsdma pmu data @@ -86,7 +94,10 @@ def parse_cmd(self, file_list): BlockType.MONITOR_BD.value: (item.monitor_bd, self.__parse_monitor_tiu), BlockType.DYN_DATA.value: (item.dyn_data, self.__parse_dyn_data), BlockType.COMMAND.value: (item.command_info, self.__parse_command_info), - BlockType.DYN_EXTRA.value: (item.dyn_extra, self.__parse_dyn_extra) + BlockType.DYN_EXTRA.value: (item.dyn_extra, self.__parse_dyn_extra), + BlockType.BLOCK_DES_BDC.value: (item.des_bdc, lambda l, raw_data: l.extend(self.bdc_parser.parse(raw_data))), + BlockType.BLOCK_DES_GDMA.value: (item.des_gdma, lambda l, raw_data: l.extend(self.gdma_parser.parse(raw_data))), + BlockType.BLOCK_DES_SDMA.value: (item.des_sdma, lambda l, raw_data: l.extend(self.gdma_parser.parse(raw_data))), } for block in blocks: item_list, item_func = blocks_factory.get( @@ -94,10 +105,8 @@ def parse_cmd(self, file_list): item_func(item_list, block.content) if item.command_info: self.__read_command_data(item) - elif item.dyn_data: - self.__read_dyn_command_data(item) else: - self.__read_pure_pmu_data(item) + self.__read_dyn_command_data(item) def parse(self, in_dir): def sort_key_func(filename): @@ -117,6 +126,7 @@ def sort_key_func(filename): elif dyn_cmd: self.is_dyn = True self.parse_cmd(dyn_cmd) + self.cdma_cord_id = max(len(self.bd_pairs), len(self.gdma_pairs), len(self.sdma_pairs)) - 1 if cdma_cmd: self.parse_cdma_cmd(cdma_cmd) # else: @@ -132,7 +142,7 @@ def to_txt(self, out_dir): Path(self.out_dir).mkdir(parents=True, exist_ok=True) dma_file = os.path.join(self.out_dir, "tdmaRegInfo_{}.txt") tiu_file = os.path.join(self.out_dir, "tiuRegInfo_{}.txt") - cdma_file = os.path.join(self.out_dir, "cdmaPmuInfo_{}.txt") + cdma_file = os.path.join(self.out_dir, "cdmaRegInfo_{}.txt") # write engine info print("Write engine info...") for idx, pair in tqdm(enumerate(self.gdma_pairs)): @@ -155,6 +165,8 @@ def to_txt(self, out_dir): f.write(info) def __write_engine_info(self, nfile, idx, pairs, engine, new_file=True): + g_idx = 0 + core_id = idx fmode = 'w' if not new_file: fmode = 'a' @@ -167,37 +179,30 @@ def __write_engine_info(self, nfile, idx, pairs, engine, new_file=True): arch = self.archlib.TIU_ARCH tag = "__TIU_REG_INFO__\n" elif engine == self.archlib.EngineType.CDMA: - if pairs: - with open(nfile.format(idx), fmode) as f: - for n, p in enumerate(pairs): - p = p["monitor"] - info = f"[{idx:<2}]---> cdma record #{n:<7} inst_id: {p.inst_id:<10} thread_id: {p.thread_id:<4} start_time: {p.inst_start_time:<14} " \ - f"inst_end_time: {p.inst_end_time:<14} cycle: {p.inst_end_time - p.inst_start_time}\n" - f.write(info) - with open(nfile.format(f'{idx}_cmd'), fmode) as f: - for n, p in enumerate(self.cdma_cmd[idx]): - cmd_type, cmd_func = self.archlib.getCdmaFunctionName(p.des_tsk_typ, p.des_tsk_eu_typ) - info = f"[{idx:<2}]---> cdma record #{n:<7} inst_id: {p.inst_id:<10} cmd_type: {cmd_type:<20} cmd_func: {cmd_func:<8}\n" - f.write(info) - return - + fn = self.__get_gdma_info + arch = self.archlib.DMA_ARCH + tag = "__CDMA_REG_INFO__\n" + core_id = self.cdma_cord_id else: raise ValueError(f"Not support parse {self.archlib.EngineType(engine).name} now.") - with open(nfile.format(idx), fmode) as f: - if new_file: - f.write("__CHIP_ARCH_ARGS__\n") - f.write("".join(f"\t{key}: {value}\n" for key, - value in arch.items())) - for p in pairs: - info, extra = fn(p["monitor"], p["cmd"], idx, engine.value) - info["Core Id"] = idx - f.write(tag) - f.write( - "".join(f"\t{key}: {value}\n" for key, value in info.items())) - if extra is not None: - f.write('{}:\n'.format(info["Function Type"])) + if len(pairs): + with open(nfile.format(idx), fmode) as f: + if new_file: + f.write("__CHIP_ARCH_ARGS__\n") + f.write("".join(f"\t{key}: {value}\n" for key, + value in arch.items())) + for p in pairs: + info, extra = fn(p["monitor"], p["cmd"], idx, engine.value) + info["Global Idx"] = g_idx + g_idx += 1 + info["Core Id"] = core_id + f.write(tag) f.write( - "".join(f"\t{key}: {value}\n" for key, value in extra.items())) + "".join(f"\t{key}: {value}\n" for key, value in info.items())) + if extra is not None: + f.write('{}:\n'.format(info["Function Type"])) + f.write( + "".join(f"\t{key}: {value}\n" for key, value in extra.items())) def __align_core_time(self): assert(len(self.profile_sync_points) == len(self.bd_pairs)) @@ -215,30 +220,23 @@ def __align_core_time(self): for j1 in sdma: j1["monitor"].inst_start_time = int(j1["monitor"].inst_start_time - delta_cyle) j1["monitor"].inst_end_time = int(j1["monitor"].inst_end_time - delta_cyle) - # for i, (cdma, cycle) in enumerate(zip(self.cdma_pairs, self.profile_sync_points)): - # if i == 0: - # continue - # delta_cyle = cycle - self.profile_sync_points[0] - # for j1 in cdma: - # j1["monitor"].inst_start_time = int(j1["monitor"].inst_start_time - delta_cyle) - # j1["monitor"].inst_end_time = int(j1["monitor"].inst_end_time - delta_cyle) - # for i, (cpu, cycle) in enumerate(zip(self.cdmlib_extra, self.profile_sync_points)): - # if i == 0: - # continue - # delta_cyle = cycle - self.profile_sync_points[0] - # for j1 in cpu: - # j1.begin_cycle = int(j1.begin_cycle - delta_cyle) + for cdma in self.cdma_pairs: + cycle = self.profile_sync_points[self.cdma_cord_id] + delta_cyle = cycle - self.profile_sync_points[0] + for j1 in cdma: + j1["monitor"].inst_start_time = int(j1["monitor"].inst_start_time - delta_cyle) + j1["monitor"].inst_end_time = int(j1["monitor"].inst_end_time - delta_cyle) def __shift_time(self): - start_cycle = self.gdma_pairs[0][0]["monitor"].inst_start_time - - for _, (bd_pair, gdma_pair) in enumerate(zip(self.bd_pairs, self.gdma_pairs)): - start_cycle = min(bd_pair[0]["monitor"].inst_start_time, - start_cycle, gdma_pair[0]["monitor"].inst_start_time) - for sdma_pair in self.sdma_pairs: + start_cycle = math.inf + # start_cycle = self.gdma_pairs[0][0]["monitor"].inst_start_time + for _, (bd_pair, gdma_pair, sdma_pair) in enumerate(zip(self.bd_pairs, self.gdma_pairs, self.sdma_pairs)): + if bd_pair: + start_cycle = min(bd_pair[0]["monitor"].inst_start_time, start_cycle) + if gdma_pair: + start_cycle = min(gdma_pair[0]["monitor"].inst_start_time, start_cycle) if sdma_pair: start_cycle = min(sdma_pair[0]["monitor"].inst_start_time, start_cycle) - for _, (bd_pair, gdma_pair) in enumerate(zip(self.bd_pairs, self.gdma_pairs)): for j1 in itertools.chain(bd_pair, gdma_pair): j1["monitor"].inst_start_time = int(j1["monitor"].inst_start_time - start_cycle) @@ -249,18 +247,10 @@ def __shift_time(self): j1["monitor"].inst_start_time = int(j1["monitor"].inst_start_time - start_cycle) j1["monitor"].inst_end_time = int(j1["monitor"].inst_end_time - start_cycle) assert(j1["monitor"].inst_start_time >= 0 and j1["monitor"].inst_end_time >= 0) - - # for cdma_pair in self.cdma_pairs: - # for j1 in cdma_pair: - # print(j1["monitor"].inst_start_time, start_cycle) - # j1["monitor"].inst_start_time = int(j1["monitor"].inst_start_time - start_cycle) - # j1["monitor"].inst_end_time = int(j1["monitor"].inst_end_time - start_cycle) - # # for i, cdm_cpu in enumerate(self.cdmlib_extra): - # g_cmd_time = self.gdma_pairs[i][0]["cmd"].begin_cycle - # shift = min(self.gdma_pairs[i][0]["monitor"].inst_start_time, - # self.bd_pairs[i][0]["monitor"].inst_start_time) - # for j1 in cdm_cpu: - # j1.begin_cycle = int(j1.begin_cycle - g_cmd_time + shift) + for cdma_pair in self.cdma_pairs: + for j1 in cdma_pair: + j1["monitor"].inst_start_time = int(j1["monitor"].inst_start_time - start_cycle) + j1["monitor"].inst_end_time = int(j1["monitor"].inst_end_time - start_cycle) def __parse_dyn_data(self, dyn_data: List, raw_data): tmp = parse_fixed_length_items(raw_data, self.archlib.ProfileFormat) @@ -284,7 +274,7 @@ def __veryfy_time(self, data): delta_time = 0 uint32_max = 4294967295 for c in data: - current_time = c.inst_start_time + current_time = c.inst_start_time + delta_time if current_time < last_time: delta_time += uint32_max # uint32 max c.inst_start_time += delta_time @@ -294,6 +284,24 @@ def __veryfy_time(self, data): c.inst_end_time += uint32_max last_time = c.inst_end_time + def __veryfy_cdma_time(self, data): + last_st = 0 + last_et = 0 + delta_time = 0 + uint32_max = 4294967295 + for c in data: + current_st = c.inst_start_time + delta_time + current_et = c.inst_end_time + delta_time + if current_st < last_st and current_et < last_et: + delta_time += uint32_max # uint32 max + c.inst_start_time += delta_time + c.inst_end_time += delta_time + if c.inst_end_time < c.inst_start_time: + # start not overflow but end does + c.inst_end_time += uint32_max + last_st = c.inst_start_time + last_et = c.inst_end_time + def __parse_monitor_tiu(self, monitor_tiu: List, raw_data): tmp = parse_monitor_bd(raw_data, self.archlib) self.__veryfy_cmd_id(tmp) @@ -303,10 +311,10 @@ def __parse_monitor_tiu(self, monitor_tiu: List, raw_data): def __parse_monitor_cdma(self, monitor_cdma: List, raw_data): tmp = parse_monitor_cdma(raw_data, self.archlib) self.__veryfy_cmd_id(tmp) - self.__veryfy_time(tmp) + self.__veryfy_cdma_time(tmp) + self.__adjust_cmd_id(tmp) monitor_cdma.append(tmp) - def __parse_monitor_dma_base(self, raw_data): tmp = parse_monitor_gdma(raw_data, self.archlib) self.__veryfy_cmd_id(tmp) @@ -319,6 +327,7 @@ def __parse_monitor_gdma(self, monitor_gdma: List, raw_data): def __parse_monitor_sdma(self, monitor_sdma: List, raw_data): tmp = self.__parse_monitor_dma_base(raw_data) + self.__adjust_cmd_id(tmp) monitor_sdma.append(tmp) def __parse_command_info(self, command_info: List, raw_data): @@ -359,12 +368,9 @@ def __read_command_data(self, item): self.archlib.EngineType.VSDMA, core_num, self.gdma_parser) - bd_pair, _ = self.__find_profile_sync_points(bd_cmd, item.monitor_bd[core_num], - self.archlib.bd_sys_code, self.archlib.profile_sys_num) - gdma_pair, _ = self.__find_profile_sync_points(gdma_cmd, item.monitor_gdma[core_num], - self.archlib.dma_sys_code, self.archlib.profile_sys_num) - sdma_pair, _ = self.__find_profile_sync_points(sdma_cmd, item.monitor_sdma[core_num], - self.archlib.dma_sys_code, self.archlib.profile_sys_num) + bd_pair, _ = self.make_pairs(bd_cmd, item.monitor_bd[core_num], self.archlib.bd_sys_code) + gdma_pair, _ = self.make_pairs(gdma_cmd, item.monitor_gdma[core_num], self.archlib.dma_sys_code) + sdma_pair, _ = self.make_pairs(sdma_cmd, item.monitor_sdma[core_num], self.archlib.dma_sys_code) if core_num <= len(self.bd_pairs): if item.monitor_bd[core_num]: @@ -439,37 +445,37 @@ def __match_sections(self, monitor, cmd): result.reverse() return result - def __make_mix_pairs(self, cmd, monitor, sys_code): - _cmd = [] - _monitor = [] - cmd_slice = [] - monitor_slice = [] - first_idx = monitor[0].inst_id + 1 if monitor else 1 - start_idx, last_idx = first_idx, first_idx - for m in monitor: - m_id = m.inst_id + 1 - if m_id <= last_idx and monitor_slice: - _monitor.append((last_idx - start_idx + 1, monitor_slice)) - monitor_slice = [] - start_idx = m_id - monitor_slice.append(m) - last_idx = m_id - if monitor_slice: - _monitor.append((last_idx - start_idx + 1, monitor_slice)) - first_idx = get_cmd_id(cmd[0]) if cmd else 1 - start_idx, last_idx = first_idx, first_idx - for c in cmd: - cmd_id = get_cmd_id(c) - if cmd_id <= last_idx and cmd_slice: - _cmd.append((last_idx - start_idx + 1, cmd_slice)) - cmd_slice = [] - start_idx = cmd_id - cmd_slice.append(c) - last_idx = cmd_id - if cmd_slice: - _cmd.append((last_idx - start_idx + 1, cmd_slice)) + def __make_mix_pairs(self, cmd, monitor, sys_code, des_cmd): + + def get_sections(data): + # Notice: + # pmu: bd gdma sdma start idx == 0, cdma strat idx == 1 + # cmd: trat idx == 1 + sections = [] + slice = [] + first_idx = get_cmd_id(data[0]) if data else 1 + start_idx, last_idx = first_idx, first_idx + for c in data: + cmd_id = get_cmd_id(c) + if cmd_id <= last_idx and slice: + sections.append((last_idx - start_idx + 1, slice)) + slice = [] + start_idx = cmd_id + slice.append(c) + last_idx = cmd_id + if slice: + sections.append((last_idx - start_idx + 1, slice)) + return sections + _monitor = get_sections(monitor) + _cmd = get_sections(cmd) + + # print("+++++++++++++") + # print("cmd", [c[0] for c in _cmd], [[i.inst_id for i in c[1]] for c in _cmd]) + # print("pmu", [m[0] for m in _monitor], [[i.inst_id for i in c[1]] for c in _monitor]) + # print("pmu", [str(m[0]) for m in _monitor]) + # compatible code for tpu-train: pmu miss sys pair reason unknow - # force align + # force align TODO remove this if len(_cmd) == len(_monitor): for i in range(len(_cmd))[::-1]: if _cmd[i][0] == 2 and _monitor[i][0] == 2: @@ -485,129 +491,163 @@ def __make_mix_pairs(self, cmd, monitor, sys_code): _cmd[-1] = (max_len, _cmd[-1][1]) _monitor[-1] = (max_len, _monitor[-1][1]) pairs = self.__match_sections(_monitor, _cmd) - # print("####################### pmu ###########################") - # # for m in _monitor: - # # print(m[0], [i.inst_id + 1 for i in m[1]]) - # print([m[0] for m in _monitor]) - # print("####################### cmd ###########################") - # # for m in _cmd: - # # print(m[0], [i.inst_id for i in m[1]]) - # print([m[0] for m in _cmd]) + # des cmd + if des_cmd: + _des_cmd = get_sections(des_cmd) + idx, rest_monitor = [], [] + for i, p in enumerate(pairs): + if p[1] is None: + idx.append(i) + rest_monitor.append(_monitor[i]) + des_pairs = self.__match_sections(rest_monitor, _des_cmd) + for i, p in enumerate(des_pairs): + pairs[idx[i]] = p return pairs - def __make_pairs(self, cmd, monitor, sys_code, mix_mode=True): + def __make_pairs(self, cmd, monitor, sys_code, des_cmd=None): pairs = [] - if cmd == []: + if cmd == [] and des_cmd is None: for m in monitor: pairs.append({"monitor": m, "cmd": None}) return pairs - for p_monitor, p_cmd in self.__make_mix_pairs(cmd, monitor, sys_code): + section = 0 + for p_monitor, p_cmd in self.__make_mix_pairs(cmd, monitor, sys_code, des_cmd): + # print(section, [i.inst_id for i in p_monitor], p_monitor is not None and p_cmd is not None) + section += 1 if p_monitor is None: continue if p_cmd is None: # TODO get cmd from des comand and dyn command here - if mix_mode: + if des_cmd is not None: # mix mode for tpudnn, otherwise bmodel for m in p_monitor: pairs.append({"monitor": m, "cmd": None}) continue # len(cmd) >= len(pmu) cause pmu will drop some data m_start_idx = p_monitor[0].inst_id - c_start_idx = get_cmd_id(p_cmd[0]) for m in p_monitor: m_idx = m.inst_id - m_start_idx - for i, c in enumerate(p_cmd): - c_idx = get_cmd_id(c) - c_start_idx - if m_idx == c_idx: - pairs.append({"monitor": m, "cmd": c}) - p_cmd = p_cmd[i+1:] - break + if m_idx <= len(p_cmd): + pairs.append({"monitor": m, "cmd": p_cmd[m_idx]}) return pairs - def __find_profile_sync_points(self, cmd, monitor, sys_code, cmd_offset=0, omit_end_sys=True, pure_pmu=False): - # sys_code: - # dma sys tsk_typ 6 eu_typ:[3 send, 4 wait] - # tiu sys tsk_typ 15 eu_typ:[8 send, 9 wait] - send_wait = {self.archlib.dma_sys_code: [3, 4], - self.archlib.bd_sys_code: [8, 9]} - sys_num = 0 - time_point = 0 - # for bmodel - if cmd_offset: - for _ in range(cmd_offset): - # current for bmodel case cmdbuf don't contain forfile's sync_all cmd - m = monitor.pop(0) - time_point = m.inst_end_time - sys_num += 1 - # skip last dma cpy for bmodel outputs, TODO parse from dyn_cmd - mix_mode = True - if cmd_offset and sys_code == self.archlib.dma_sys_code: - mix_mode = False - pairs = self.__make_pairs(cmd, monitor, sys_code, mix_mode) - if cmd_offset and not pure_pmu: - # for bmodel remove monitor date without cmd - n = 0 - for item in pairs: - if item["cmd"] is None: - n += 1 - else: - break - for i in range(n): - pairs.pop(0) - for i, j in enumerate(pairs): - if j["cmd"] is None: - break - des_tsk_typ, des_tsk_eu_typ = self.__get_cmd_type(j["cmd"]) - if des_tsk_typ != sys_code or sys_num == self.archlib.profile_sys_num: - break - if des_tsk_eu_typ not in send_wait[sys_code]: - break - if des_tsk_eu_typ == 9: - time_point = j["monitor"].inst_end_time - sys_num += 1 - if sys_code == self.archlib.bd_sys_code: + def correct_ids(self, data): + data.sort(key=lambda x: x["start_time"]) + next_id = 1 + id_mapping = {} + + for record in data: + original_id = record["id"] + if original_id not in id_mapping: + id_mapping[original_id] = next_id + next_id += 1 + record["id"] = id_mapping[original_id] + + for i in range(len(data)): + if data[i]["id"] == id_mapping[8] and data[i]["start_time"] > data[i - 1]["start_time"]: + data[i - 1], data[i] = data[i], data[i - 1] + + return data + + @staticmethod + def __adjust_cmd_id(monitor): + pre = None + for i, m in enumerate(monitor): + # if pre and m.inst_id < pre.inst_id and pre.thread_id == 1: + # compatiable code for cdma pmu receive thread_id == 0 + if pre and m.inst_id < pre.inst_id and (pre.thread_id == 1 or pre.inst_id - m.inst_id == 1): + monitor[i], monitor[i-1] = monitor[i-1], monitor[i] + pre = m + + def __rm_tx_wait_points(self, monitor): + # tx_wait (wait, nop) + for i in range(self.archlib.profile_init_cmd_num): + _m = monitor.pop(0) + if i == 0: + start_time = _m.inst_end_time + # align internal engine + if len(self.sdma_pairs): + sdma_send_time = self.sdma_pairs[self.cdma_cord_id].pop(0)["monitor"].inst_end_time + delta_cycle = start_time - sdma_send_time + if delta_cycle: + for j1 in monitor: + j1.inst_start_time = j1.inst_start_time - delta_cycle + j1.inst_end_time = j1.inst_end_time - delta_cycle + + + def __rm_sync_points(self, monitor, record_time_stamp=False): + # sync points (send, wait) + for _ in range(self.archlib.profile_init_cmd_num): + m = monitor.pop(0) + time_point = m.inst_end_time + if record_time_stamp: + # we use bd to set base time stamp self.profile_sync_points.append(time_point) - if cmd_offset != self.archlib.profile_sys_num: - for _ in range(sys_num): - _m = pairs.pop(0) - if sys_num == 0: - logging.warn("can't find sync cmd at begin.") - sys_end_num = 0 - if omit_end_sys: - extra_sys = [] - part = [] - for i in range(len(pairs))[::-1]: - if pairs[i]["cmd"] is None: - if self.is_dyn: - break - else: - # for bmodel - extra_sys.append(i) - continue - des_tsk_typ, _ = self.__get_cmd_type(pairs[i]["cmd"]) - if des_tsk_typ != sys_code: + else: + # align internal engine, not sure if is needed + delta_cycle = time_point - self.profile_sync_points[-1] + if delta_cycle: + for j1 in monitor: + j1.inst_start_time = j1.inst_start_time - delta_cycle + j1.inst_end_time = j1.inst_end_time - delta_cycle + + def __omit_end_sys(self, pairs, sys_code): + extra_sys = [] + part = [] + for i in range(len(pairs))[::-1]: + if pairs[i]["cmd"] is None: + if self.is_dyn: break - else: - part.append(i) - if pairs[i]["monitor"].inst_id == 0: - extra_sys.extend(part) - part = [] - sys_end_num = len(extra_sys) - for i in extra_sys: + else: # for bmodel + extra_sys.append(i) + continue + tsk_type, _ = self.__get_cmd_type(pairs[i]["cmd"]) + if tsk_type != sys_code: + break + else: + part.append(i) + if pairs[i]["monitor"].inst_id == 0: + extra_sys.extend(part) + part = [] + for i in extra_sys: + if len(pairs) > 2: pairs.pop(i) - # assert(len(pairs) != 0) - return pairs, sys_num + + def __compatiable_make_pairs(self, cmd, monitor, sys_code, des_cmd=None): + pairs = [] + if len(cmd) == len(monitor): + for c, m in zip(cmd, monitor): + pairs.append({"monitor": m, "cmd": c}) + else: + pairs = self.__make_pairs(cmd, monitor, sys_code, des_cmd) + return pairs + + + def make_pairs(self, cmd, monitor, sys_code, des_cmd=None, is_cdma=False): + offset = self.archlib.profile_init_cmd_num + if is_cdma: + self.__rm_tx_wait_points(monitor) + # tmp compatiable code cause pio mode pmu inst_id is not correct + pairs = self.__compatiable_make_pairs(cmd[offset:], monitor, sys_code, des_cmd) # todo support des + return pairs + else: + self.__rm_sync_points(monitor, sys_code == self.archlib.bd_sys_code) + pairs = self.__make_pairs(cmd[offset:], monitor, sys_code, des_cmd) + self.__omit_end_sys(pairs, sys_code) + return pairs def __read_dyn_command_data(self, item): # just for tpudnn # dyn cmd include single core cmd per cdm.profile + gdma_cmd = [] + sdma_cmd = [] + bd_cmd = [] if item.dyn_data: dyn_data = item.dyn_data assert(len(dyn_data)>0) - gdma_cmd = [] - sdma_cmd = [] - bd_cmd = [] + core_id = len(self.bd_pairs) for i, d in enumerate(dyn_data[1:]): # skip init record + d.core_id = core_id if d.type == self.archlib.DynRecordType.NODE_SET.value: if item.dyn_extra: d.detailed_cmd = item.dyn_extra[i].content @@ -621,33 +661,15 @@ def __read_dyn_command_data(self, item): bd_cmd.append(d) elif d.engine == self.archlib.EngineType.CDMA.value: self.cdma_cmd[d.extra_info >> 7].append(d) - bd_pair, bd_sys_num = self.__find_profile_sync_points(bd_cmd, item.monitor_bd[0], self.archlib.bd_sys_code) - sdma_pair, sdma_sys_num = self.__find_profile_sync_points(sdma_cmd, item.monitor_sdma[0], self.archlib.dma_sys_code) - gdma_pair, gdma_sys_num = self.__find_profile_sync_points(gdma_cmd, item.monitor_gdma[0], self.archlib.dma_sys_code) - self.bd_pairs.append(bd_pair) - self.gdma_pairs.append(gdma_pair) - self.sdma_pairs.append(sdma_pair) - # skip profile init and call sync_all - self.cdmlib_extra.append(dyn_data[1 + bd_sys_num + sdma_sys_num + gdma_sys_num:]) - # if item.dyn_extra: - # print(item.dyn_extra ) - # for k, v in item.dyn_extra.items(): - # print(k) - # engine = self.archlib.EngineType((k >> 8) & 0x7) - # print(engine, engine == self.archlib.EngineType.GDMA) - # parser = self.archlib.GDMACommandParser() - # if engine == self.archlib.EngineType.GDMA: - # gdma_parser = parser.parse(v[0].content) - # print(gdma_parser) - - - def __read_pure_pmu_data(self, item): - bd_pair, _ = self.__find_profile_sync_points([], item.monitor_bd[0], self.archlib.bd_sys_code, self.archlib.profile_sys_num, pure_pmu=True) - sdma_pair, _ = self.__find_profile_sync_points([], item.monitor_sdma[0], self.archlib.dma_sys_code, self.archlib.profile_sys_num, pure_pmu=True) - gdma_pair, _ = self.__find_profile_sync_points([], item.monitor_gdma[0], self.archlib.dma_sys_code, self.archlib.profile_sys_num, pure_pmu=True) - self.bd_pairs.append(bd_pair) - self.gdma_pairs.append(gdma_pair) - self.sdma_pairs.append(sdma_pair) + bd_pair = self.make_pairs(bd_cmd, item.monitor_bd[0], self.archlib.bd_sys_code, + item.des_bdc) + gdma_pair = self.make_pairs(gdma_cmd, item.monitor_gdma[0], self.archlib.dma_sys_code, + item.des_gdma) + sdma_pair = self.make_pairs(sdma_cmd, item.monitor_sdma[0], self.archlib.dma_sys_code, + item.des_sdma) + self.bd_pairs.append(bd_pair) + self.gdma_pairs.append(gdma_pair) + self.sdma_pairs.append(sdma_pair) def __base_read_command_data(self, base, offset, engine_type, core_num, command_parser): basename = "cmd_%x_%d_%d.dat" @@ -674,8 +696,10 @@ def __base_read_command_data(self, base, offset, engine_type, core_num, command_ return command_list def __get_gdma_info(self, monitor_info, reg_info, core_id, engine_id=1): - if self.is_dyn: - if hasattr(reg_info, 'detailed_cmd'): + if reg_info is None: + return get_dma_info_dyn(monitor_info, reg_info, engine_id) + if self.is_dyn and hasattr(reg_info, "extra_info"): + if hasattr(reg_info, 'detailed_cmd') and engine_id != 4: _reg_info = self.gdma_parser.parse(reg_info.detailed_cmd)[0] return get_dma_info(monitor_info, _reg_info, core_id, engine_id) return get_dma_info_dyn(monitor_info, reg_info, engine_id) @@ -683,7 +707,9 @@ def __get_gdma_info(self, monitor_info, reg_info, core_id, engine_id=1): return get_dma_info(monitor_info, reg_info, core_id, engine_id) def __get_tiu_info(self, monitor_info, reg_info, core_id=None, engine_id=0): - if self.is_dyn: + if reg_info is None: + return get_tiu_info_dyn(monitor_info, reg_info) + if self.is_dyn and hasattr(reg_info, "extra_info"): if hasattr(reg_info, 'detailed_cmd'): _reg_info = self.bdc_parser.parse(reg_info.detailed_cmd)[0] return get_tiu_info(monitor_info, _reg_info)