From b6017e8701847ebb9ebebf77e3e7f33bf0fbcf9f Mon Sep 17 00:00:00 2001
From: haozhe
Date: Sun, 19 Jan 2025 18:54:47 +0800
Subject: [PATCH] feat: make lmem assignment stage more analyzable

- define some commonly used LOG macros (Logger.h)
- define some stringify functions to show lmem type and timestep mode
  (LayerGroupDefs.h)
- add show_timestep_table to print a readable timestep table
  (BasicTimeStep.h/BasicTimeStep.cpp)
- add many DEBUG_WITH_TYPE logs and comments in the lmem assignment stage
  (BasicTimeStep.cpp/LmemAllocator.cpp/TimeStepMethod.cpp/SwPipeline.cpp)
- rename some variables and function names to better represent the process
  (gen_all_mem_buffer_ts/tgt_min_address/...)
- reduce assignLmemAddr cyclomatic complexity (LmemAllocator.cpp:989)

Change-Id: I31dadb9424be334da481f9dfbd45985ca89dc058
---
 .../Tpu/Transforms/LayerGroup/BasicTimeStep.h |   5 +-
 .../Transforms/LayerGroup/LayerGroupDefs.h    |  30 ++
 .../Tpu/Transforms/LayerGroup/TimeStep.h      |   2 +-
 include/tpu_mlir/Support/Logger.h             |  23 +-
 include/tpu_mlir/Support/Module.h             |   1 +
 .../Transforms/LayerGroup/BasicTimeStep.cpp   | 173 ++++++-
 .../Transforms/LayerGroup/CoeffReloadOpt.cpp  |   2 +-
 .../Tpu/Transforms/LayerGroup/GroupMethod.cpp |  44 +-
 .../Transforms/LayerGroup/LmemAllocator.cpp   | 437 ++++++++++++------
 .../Tpu/Transforms/LayerGroup/SwPipeline.cpp  |  13 +-
 .../Transforms/LayerGroup/TimeStepMethod.cpp  |  40 +-
 lib/Support/Module.cpp                        |   7 +
 python/tools/logdebug_tool.py                 |   4 +-
 13 files changed, 598 insertions(+), 183 deletions(-)
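Note on the log format used throughout this patch: the LOG_* macros added in
Logger.h emit single-line records of semicolon-separated "key = value" pairs,
so every DEBUG_WITH_TYPE site produces output such as the following
(op name and values here are hypothetical):

  ; action = assignLmemAddr; step = allocated_memory; op_type = LMEM_ACTIVATION; op_name = conv1; addr = 0x00001000; size = 4096

Lines in this shape can be filtered by "action"/"step" and lifted into a
table by python/tools/logdebug_tool.py (updated at the end of this patch),
which is what makes the assignment stage analyzable offline.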
diff --git a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.h b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.h
index df318107f..4ac7b0e21 100644
--- a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.h
+++ b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.h
@@ -39,6 +39,7 @@ class BasicTimeStep {
   void add_tpu0_gdma0_ts_field(const TpuTsField &tpu_field,
                                const GdmaTsField &gdma_field);
   void update_gdma0_ts_field(int64_t ts, const GdmaTsField &field);
+  void show_timestep_table();
 
   std::vector<TimestepRow> &get_timestep_table() { return timestep_table_; }
   size_t get_timestep_num() { return timestep_table_.size(); }
@@ -79,12 +80,12 @@ class BasicTimeStep {
   }
 
   TensorInfo &get_tensor_infos();
-
+  std::string get_tensor_mode_str(Value v);
   // setter
   void set_lmem_addr(const mem_buffer_key_t &buffer_key, int64_t lmem_addr);
   void set_lmem_occupy(int64_t occupy) { lmem_occupy_ = occupy; }
-  void gen_all_mem_buffer();
+  void gen_all_mem_buffer_ts();
   void update_all_mem_buffer_size(const LgInfo &lg_info);
   void gen_hold_coeff();
   bool is_tensor_hold_in_lmem(Value v);
diff --git a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupDefs.h b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupDefs.h
index a708d378e..1d8e24971 100644
--- a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupDefs.h
+++ b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupDefs.h
@@ -98,6 +98,20 @@ typedef struct mem_buffer_key {
     }
     return false;
   }
+
+  std::string lmem_type_str() {
+    switch (type) {
+    case LMEM_WEIGHT:
+      return "LMEM_WEIGHT";
+    case LMEM_ACTIVATION:
+      return "LMEM_ACTIVATION";
+    case LMEM_OPERATION:
+      return "LMEM_OPERATION";
+    case LMEM_ANY:
+      return "LMEM_ANY";
+    }
+    return "LMEM_UNKNOWN";
+  }
 } mem_buffer_key_t;
 
 typedef struct mem_buffer_value {
@@ -155,6 +169,22 @@ struct tensor_info_t {
   void add_slice_info(Operation *next_op, slice_info_t slice_info) {
     slice_infos[next_op] = slice_info;
   }
+
+  const std::string mode_str() const {
+    switch (mode) {
+    case TIMESTEP_LOAD:
+      return "TIMESTEP_LOAD";
+    case TIMESTEP_STORE:
+      return "TIMESTEP_STORE";
+    case TIMESTEP_MOVE:
+      return "TIMESTEP_MOVE";
+    case TIMESTEP_LD_G2L2:
+      return "TIMESTEP_LD_G2L2";
+    case TIMESTEP_LDST_UNKNOWN:
+      return "TIMESTEP_LDST_UNKNOWN";
+    }
+    return "TIMESTEP_UNKNOWN";
+  }
 };
 
 using ValueSet = std::set;
diff --git a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/TimeStep.h b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/TimeStep.h
index a4d490eb1..b5a6f4c68 100644
--- a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/TimeStep.h
+++ b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/TimeStep.h
@@ -74,7 +74,7 @@ class BasicTimeStep {
   void show_timestep();
   void clear();
 
-  void gen_all_mem_buffer();
+  void gen_all_mem_buffer_ts();
 
 protected:
   LgOptions options_;
diff --git a/include/tpu_mlir/Support/Logger.h b/include/tpu_mlir/Support/Logger.h
index 16733f812..fc2870106 100644
--- a/include/tpu_mlir/Support/Logger.h
+++ b/include/tpu_mlir/Support/Logger.h
@@ -46,26 +46,27 @@ inline std::string formatString(const char *format, ...) {
 
 inline void SetLogFlag(int32_t log_level) { cur_log_level = log_level; }
 
+#define LOG_KV(key, value) "; " << key << " = " << value
+
+#define LOG_ITEM(key) "; " << key
+
+#define LOG_ACTION(action) "; action = " << action
+
+#define LOG_STEP(step) "; step = " << step
+
 #define PROFILE_LOG(step, begin)                                              \
   do {                                                                        \
     DEBUG_WITH_TYPE("profile", {                                              \
       auto current_time = std::chrono::high_resolution_clock::now();          \
       auto time_string = std::chrono::system_clock::to_time_t(current_time);  \
       if (begin) {                                                            \
-        llvm::dbgs() << "; action = profile"                                  \
-                     << "; step = " << step                                   \
-                     << "; begin = " << std::ctime(&time_string) << "\n";     \
+        llvm::dbgs() << LOG_ACTION("profile") << LOG_STEP(step)               \
+                     << LOG_KV("begin", std::ctime(&time_string)) << "\n";    \
       } else {                                                                \
-        llvm::dbgs() << "; action = profile"                                  \
-                     << "; step = " << step                                   \
-                     << "; end = " << std::ctime(&time_string) << "\n";       \
+        llvm::dbgs() << LOG_ACTION("profile") << LOG_STEP(step)               \
+                     << LOG_KV("end", std::ctime(&time_string)) << "\n";      \
       }                                                                       \
     });                                                                       \
   } while (0)
 
-#define DEBUG_KV(key, value)                                                  \
-  do {                                                                        \
-    llvm::dbgs() << "; " << key << " = " << value << "\n";                    \
-  } while (0)
-
 } // namespace tpu_mlir
diff --git a/include/tpu_mlir/Support/Module.h b/include/tpu_mlir/Support/Module.h
index 9b254c633..f676d0b81 100644
--- a/include/tpu_mlir/Support/Module.h
+++ b/include/tpu_mlir/Support/Module.h
@@ -200,6 +200,7 @@ bool IsHdimIsBatch(Value value);
 bool isOpInCoreMatch(Operation *Op);
 bool isOpInCoreParallel(Operation *Op);
 bool isOpInGroupParallel(Operation *Op);
+bool isValueBlockArgument(Value v);
 bool isOpInDevParallel(Operation *Op);
 bool isOpInBlock(Operation *op);
 FuncOp getFuncOp(ModuleOp module, StringRef func_name);
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp
index b87f7d976..712662e82 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp
@@ -16,6 +16,38 @@ namespace tpu {
 using namespace tpu_mlir::tpu;
 using namespace tpu_mlir::backend;
 
+static inline void stream_tpu_field(const TpuTsField &field) {
+  llvm::dbgs() << " [ ";
+  for (int i = 0; i < field.size(); ++i) {
+    if (i > 0)
+      llvm::dbgs() << ", ";
+    llvm::dbgs() << "C(\"" << field[i]->getName() << "\"), \""
+                 << module::getName(field[i]) << "\"";
+  }
+  llvm::dbgs() << " ]";
+}
+
+static inline void stream_gdma_field(const GdmaTsField &field) {
+  llvm::dbgs() << " [ ";
+  for (int i = 0; i < field.size(); ++i) {
+    auto mode = field[i].second.mode;
+    auto modestr = "L";
+    if (mode == TIMESTEP_STORE) {
+      modestr = "S";
+    }
+
+    if (i > 0)
+      llvm::dbgs() << ", ";
+    std::string op_type =
+        module::isValueBlockArgument(field[i].first)
+            ? "block_arg"
+            : field[i].first.getDefiningOp()->getName().getStringRef().str();
+    llvm::dbgs() << modestr << "(\"" << module::getName(field[i].first)
+                 << "\")->" << op_type;
+  }
+  llvm::dbgs() << " ]";
+}
+
 BasicTimeStep::BasicTimeStep() {
   // options_ = options;
   swpipl_ = std::make_shared<SoftwarePipeline>();
@@ -50,12 +82,29 @@ void BasicTimeStep::add_tpu0_ts_field(const TpuTsField &field) {
   TimestepRow row;
   row.tpu0_ts_field = field;
   timestep_table_.push_back(row);
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "; action = add_tpu0_ts_field"
+                 << "; ts = " << timestep_table_.size() - 1;
+
+    stream_tpu_field(field);
+
+    llvm::dbgs() << "\n";
+  });
 }
 
 void BasicTimeStep::add_gdma0_ts_field(const GdmaTsField &field) {
   TimestepRow row;
   row.gdma0_ts_field = field;
   timestep_table_.push_back(row);
+
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "; action = add_gdma0_ts_field"
+                 << "; ts = " << timestep_table_.size() - 1;
+
+    stream_gdma_field(field);
+
+    llvm::dbgs() << "\n";
+  });
 }
 
 void BasicTimeStep::add_tpu0_gdma0_ts_field(const TpuTsField &tpu_field,
@@ -64,12 +113,43 @@ void BasicTimeStep::add_tpu0_gdma0_ts_field(const TpuTsField &tpu_field,
   row.tpu0_ts_field = tpu_field;
   row.gdma0_ts_field = gdma_field;
   timestep_table_.push_back(row);
+
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "; action = add_tpu0_gdma0_ts_field"
+                 << "; ts = " << timestep_table_.size() - 1;
+
+    stream_tpu_field(tpu_field);
+    stream_gdma_field(gdma_field);
+
+    llvm::dbgs() << "\n";
+  });
 }
 
 void BasicTimeStep::update_gdma0_ts_field(int64_t ts,
                                           const GdmaTsField &field) {
   this->timestep_table_[ts].gdma0_ts_field.clear();
   this->timestep_table_[ts].gdma0_ts_field = field;
+
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "; action = update_gdma0_ts_field"
+                 << "; ts = " << ts;
+
+    stream_gdma_field(field);
+
+    llvm::dbgs() << "\n";
+  });
+}
+
+void BasicTimeStep::show_timestep_table() {
+  DEBUG_WITH_TYPE("timestep_assign", {
+    for (int i = 0; i < timestep_table_.size(); ++i) {
+      llvm::dbgs() << "; ts = " << i << "; ";
+      stream_tpu_field(timestep_table_[i].tpu0_ts_field);
+      llvm::dbgs() << " || ";
+      stream_gdma_field(timestep_table_[i].gdma0_ts_field);
+      llvm::dbgs() << "\n";
+    }
+  });
 }
 
 int64_t BasicTimeStep::get_layer_swpipl_stage(Operation *op) {
@@ -208,7 +288,7 @@ void BasicTimeStep::gen_hold_coeff() {
 //   }
 // }
 
-void BasicTimeStep::gen_all_mem_buffer() {
+void BasicTimeStep::gen_all_mem_buffer_ts() {
   // input: need_imm_buffers
   lmem_buffer_.clear();
   l2mem_buffer_.clear();
@@ -219,10 +299,24 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
 
   for (int64_t stg = 0; stg < this->swpipl_stage_num_; ++stg) {
     // add for software pipeline
+    // swpipl_stage_num_ is always 3 after software pipelining
     bool layer_timestep_valid =
         (swpipl_stage_num_ == 1) || (swpipl_stage_num_ > 1 && stg == 1);
+    DEBUG_WITH_TYPE("lmem_buffer_assign", {
+      llvm::dbgs() << "; action = lmem_buffer_assign"
+                   << "; step = "
+                   << "process_current_stage"
+                   << "; stg = " << stg
+                   << "; swpipl_stage_num_ = " << swpipl_stage_num_ << "\n";
+    });
     for (size_t ts = 0; ts < get_timestep_num(); ++ts) {
       // process current timestep layers
+      DEBUG_WITH_TYPE("lmem_buffer_assign", {
+        llvm::dbgs() << "; action = lmem_buffer_assign"
+                     << "; step = "
+                     << "process_current_timestep_layers"
+                     << "; ts = " << ts << "\n";
+      });
       const TpuTsField &cur_tpu_field = timestep_table_[ts].tpu0_ts_field;
       if (layer_timestep_valid) {
         for (auto op : cur_tpu_field) {
@@ -235,6 +329,16 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
           lmem_value.start_ts = ts;
           lmem_value.end_ts = -1;
 
+          DEBUG_WITH_TYPE("lmem_buffer_assign", {
+            llvm::dbgs() << "; action = lmem_buffer_assign"
+                         << "; step = "
+                         << "initial_results_buffer"
+                         << "; lmem_key = " << module::getName(lmem_key.value)
+                         << "; lmem_type = " << lmem_key.lmem_type_str()
+                         << "; lmem_start_ts = " << lmem_value.start_ts
+                         << "; lmem_end_ts = " << lmem_value.end_ts << "\n";
+          });
+
           lmem_buffer_[lmem_key] = lmem_value;
         }
 
@@ -250,11 +354,36 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
           }
           lmem_key.value = in;
 
-          // lmem_buffer_[lmem_key].end_ts = ts;
           if (lmem_buffer_.find(lmem_key) != lmem_buffer_.end()) {
             lmem_buffer_[lmem_key].end_ts = ts;
+            DEBUG_WITH_TYPE("lmem_buffer_assign", {
+              llvm::dbgs()
+                  << "; action = lmem_buffer_assign"
+                  << "; step = "
+                  << "update_operands_lmem_buffer"
+                  << "; lmem_key = " << module::getName(lmem_key.value)
+                  << "; lmem_type = " << lmem_key.lmem_type_str()
+                  << "; lmem_start_ts = " << lmem_buffer_[lmem_key].start_ts
+                  << "; timestep_mode = "
+                  << get_tensor_mode_str(lmem_key.value)
+                  << "; lmem_end_ts = " << lmem_buffer_[lmem_key].end_ts
+                  << "\n";
+            });
           } else {
             l2mem_buffer_[lmem_key].end_ts = ts;
+            DEBUG_WITH_TYPE("lmem_buffer_assign", {
+              llvm::dbgs()
+                  << "; action = lmem_buffer_assign"
+                  << "; step = "
+                  << "update_operands_l2mem_buffer"
+                  << "; lmem_key = " << module::getName(lmem_key.value)
+                  << "; lmem_type = " << lmem_key.lmem_type_str()
+                  << "; lmem_start_ts = " << l2mem_buffer_[lmem_key].start_ts
+                  << "; timestep_mode = "
+                  << get_tensor_mode_str(lmem_key.value)
+                  << "; lmem_end_ts = " << l2mem_buffer_[lmem_key].end_ts
+                  << "\n";
+            });
           }
         }
 
@@ -264,7 +393,17 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
 
           lmem_value.start_ts = ts;
           lmem_value.end_ts = ts;
-
+          DEBUG_WITH_TYPE("lmem_buffer_assign", {
+            llvm::dbgs() << "; action = lmem_buffer_assign"
+                         << "; step = "
+                         << "update_imm_buffer"
+                         << "; lmem_key = " << module::getName(lmem_key.value)
+                         << "; lmem_type = " << lmem_key.lmem_type_str()
+                         << "; lmem_start_ts = " << lmem_value.start_ts
+                         << "; timestep_mode = "
+                         << get_tensor_mode_str(lmem_key.value)
+                         << "; lmem_end_ts = " << lmem_value.end_ts << "\n";
+          });
           lmem_buffer_[lmem_key] = lmem_value;
         } // cur_tpu_field
       }
@@ -294,12 +433,34 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
           l2mem_buffer_[lmem_key] = lmem_value;
         } else {
           lmem_buffer_[lmem_key] = lmem_value;
+          DEBUG_WITH_TYPE("lmem_buffer_assign", {
+            llvm::dbgs() << "; action = lmem_buffer_assign"
+                         << "; step = "
+                         << "update_load_buffer"
+                         << "; lmem_key = " << module::getName(lmem_key.value)
+                         << "; lmem_type = " << lmem_key.lmem_type_str()
+                         << "; timestep_mode = " << tensor_info.mode_str()
+                         << "; lmem_start_ts = " << lmem_value.start_ts
+                         << "; lmem_end_ts = " << lmem_value.end_ts << "\n";
+          });
         }
       } else if (tensor_info.mode == TIMESTEP_STORE) {
         lmem_key.value = tensor.first;
         lmem_key.type = LMEM_ACTIVATION;
 
         lmem_buffer_[lmem_key].end_ts = ts;
+        DEBUG_WITH_TYPE("lmem_buffer_assign", {
+          llvm::dbgs() << "; action = lmem_buffer_assign"
+                       << "; step = "
+                       << "update_store_buffer"
+                       << "; lmem_key = " << module::getName(lmem_key.value)
+                       << "; lmem_type = " << lmem_key.lmem_type_str()
+                       << "; timestep_mode = " << tensor_info.mode_str()
+                       << "; lmem_start_ts = "
+                       << lmem_buffer_[lmem_key].start_ts
+                       << "; lmem_end_ts = " << lmem_buffer_[lmem_key].end_ts
+                       << "\n";
+        });
       }
     }
   }
@@ -308,7 +469,7 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
 
 void BasicTimeStep::update_all_mem_buffer_size(const LgInfo &lg_info) {
   if (lmem_buffer_.empty()) {
-    gen_all_mem_buffer();
+    gen_all_mem_buffer_ts();
   }
   auto &tensor_infos = tensor_infos_;
@@ -424,6 +585,10 @@ bool BasicTimeStep::is_tensor_hold_in_lmem(Value v) {
 
 TensorInfo &BasicTimeStep::get_tensor_infos() { return tensor_infos_; }
 
+std::string BasicTimeStep::get_tensor_mode_str(Value v) {
+  return tensor_infos_[v].mode_str();
+}
+
 typedef struct {
   Value value;
   int64_t addr;
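A note to make the "lmem_buffer_assign" logs above easier to interpret: each
buffer ends up with a life cycle [start_ts, end_ts] over the timestep ring,
and the interval may wrap around once software pipelining is applied. A
minimal Python model of that wrap-around and of per-timestep occupancy
(illustrative only, not the pass's actual data structures):

  def live_timesteps(start_ts, end_ts, timestep_num):
      """Yield the timesteps on which a buffer is live, end inclusive."""
      ts = start_ts
      while True:
          yield ts
          if ts == end_ts:
              break
          ts = (ts + 1) % timestep_num

  def occupancy(buffers, timestep_num):
      """buffers: list of (start_ts, end_ts, size) -> bytes live per ts."""
      occ = [0] * timestep_num
      for start_ts, end_ts, size in buffers:
          for ts in live_timesteps(start_ts, end_ts, timestep_num):
              occ[ts] += size
      return occ

  # a weight live on ts 3..1 (wraps around) plus an activation on ts 0..2:
  print(occupancy([(3, 1, 1024), (0, 2, 512)], 5))
  # -> [1536, 1536, 512, 1024, 1024]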
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/CoeffReloadOpt.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/CoeffReloadOpt.cpp
index 8ba96c35c..0774cb6c3 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/CoeffReloadOpt.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/CoeffReloadOpt.cpp
@@ -38,7 +38,7 @@ void coeff_reload_open(BasicTimeStepPtr &time_step, TensorInfo &tensor_infos) {
     Bm168xCycleCalculator *cyc_ptr = new Bm168xCycleCalculator();
     cycle_calculator.reset(cyc_ptr);
   }
-  time_step->gen_all_mem_buffer();
+  time_step->gen_all_mem_buffer_ts();
   for (int64_t ts = 0; ts < timestep_num; ++ts) {
     int64_t slack = 0;
     tensor_to_coeff_cost.clear();
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp
index 2b4c93c41..ef7b60398 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp
@@ -1582,6 +1582,10 @@ void GroupMethod::load_lg_results(
   auto &root = *jsonOrErr;
   int opt = options_.opt;
   // Load group layers
+
+  std::vector<std::vector<Operation *>> base_groups;
+  get_base_groups(base_groups, subnet_ops);
+
   if (auto *rootObj = root.getAsObject()) {
     if (auto opt_ = rootObj->getInteger("opt")) {
       opt = *opt_;
@@ -1619,6 +1623,12 @@ void GroupMethod::load_lg_results(
           }
         }
       }
+    } else {
+      // assume the base_group partition did not change when locs are
+      // not assigned in the cache
+      get_layer_group(lg_info, base_groups[lg_info.base_group_idx],
+                      lg_info.start_idx, lg_info.end_idx,
+                      lg_info.base_group_idx);
     }
     // Get shape_secs if available
     if (auto shapeArray = groupObj_->getArray("shape_secs")) {
@@ -1683,25 +1693,23 @@ void GroupMethod::load_lg_results(
   for (auto &lg_info : lg_infos) {
     int64_t cost = 0;
     lg_info.use_cache = true;
-    if (lg_info.group_cost > 0) {
-      DEBUG_WITH_TYPE("lg_index", {
-        llvm::dbgs() << "; action = lg_index"
-                     << "; start_idx = " << lg_info.start_idx
-                     << "; end_idx = " << lg_info.end_idx
-                     << "; group_idx = " << lg_info.base_group_idx << "\n";
-      });
-      if (!is_layer_group_valid(lg_info, true, &cost)) {
-        llvm_unreachable("group_cost is not valid");
-      }
-      DEBUG_WITH_TYPE("lg_cost", {
-        llvm::dbgs() << "; action = lg_cost"
-                     << "; step = group_layer"
-                     << "; start_idx = " << lg_info.start_idx
-                     << "; end_idx = " << lg_info.end_idx
-                     << "; group_idx = " << lg_info.base_group_idx
-                     << "; group_cost = " << lg_info.group_cost << "\n";
-      });
+    DEBUG_WITH_TYPE("lg_index", {
+      llvm::dbgs() << "; action = lg_index"
+                   << "; start_idx = " << lg_info.start_idx
+                   << "; end_idx = " << lg_info.end_idx
+                   << "; group_idx = " << lg_info.base_group_idx << "\n";
+    });
+    if (!is_layer_group_valid(lg_info, true, &cost)) {
+      llvm_unreachable("group_cost is not valid");
     }
+    DEBUG_WITH_TYPE("lg_cost", {
+      llvm::dbgs() << "; action = lg_cost"
+                   << "; step = group_layer"
+                   << "; start_idx = " << lg_info.start_idx
+                   << "; end_idx = " << lg_info.end_idx
+                   << "; group_idx = " << lg_info.base_group_idx
+                   << "; group_cost = " << lg_info.group_cost << "\n";
+    });
   }
   llvm::outs() << "load lg results\n";
 }
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/LmemAllocator.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/LmemAllocator.cpp
index 5fae8bc36..f22297193 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/LmemAllocator.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/LmemAllocator.cpp
@@ -70,14 +70,18 @@ static int64_t get_membuf_area(int64_t start_ts, int64_t end_ts,
 
 static bool is_buffer_used_by_npu(const mem_buffer_key_t &buffer_key,
                                   const TpuTsField &cur_layers) {
+  // LMEM_OPERATION is an operation buffer, so it is always used by npu.
+  // LMEM_ACTIVATION/LMEM_WEIGHT can be used by gdma (in store/load ops),
+  // so we cannot return true for them directly
   if (buffer_key.type == LMEM_OPERATION) {
     return true;
   }
   auto users = buffer_key.value.getUsers();
   auto src_op = buffer_key.value.getDefiningOp();
   for (auto op : cur_layers) {
-    if (src_op == op ||
-        std::find(users.begin(), users.end(), op) != users.end()) {
+    if (src_op == op /* src_op is an output */ ||
+        std::find(users.begin(), users.end(), op) !=
+            users.end() /* src_op is an input */) {
       return true;
     }
   }
@@ -91,6 +95,8 @@ static bool is_buffer_used_by_gdma(const mem_buffer_key_t &buffer_key,
   for (auto &tensor : cur_tensors) {
     if (tensor.first == buffer_key.value &&
         is_lmem_ldst(tensor.second.mode)) {
+      // TODO: maybe this check can be moved out into the
+      // update_exclude_banks function
       if (is_npu_use && tensor.second.mode != TIMESTEP_STORE) {
         llvm::errs() << "tensor is loaded and used by npu simultaneously in "
                         "timestep\n";
@@ -206,21 +212,49 @@ bool LmemAllocator::update_avail_lmems(std::list &avail_lmems,
   for (avail_iter = avail_lmems.begin(); avail_iter != avail_lmems.end();) {
     int64_t avail_start = avail_iter->first;
     int64_t avail_end = avail_iter->first + avail_iter->second;
+    /**
+     * Case 1: full overlap
+     *   avail:     |--------|
+     *   exclude: |------------|
+     *   result:    (delete)
+     */
     if (avail_start >= exclude_start && avail_end <= exclude_end) {
       avail_iter = avail_lmems.erase(avail_iter);
-    } else if (avail_start < exclude_start && avail_end > exclude_start &&
-               avail_end <= exclude_end) {
+    }
+    /**
+     * Case 2: right overlap
+     *   avail:   |--------|
+     *   exclude:      |--------|
+     *   result:  |---|
+     */
+    else if (avail_start < exclude_start && avail_end > exclude_start &&
+             avail_end <= exclude_end) {
       avail_iter->second = exclude_start - avail_start;
       avail_iter++;
-    } else if (avail_start >= exclude_start && avail_start < exclude_end &&
-               avail_end > exclude_end) {
+    }
+    /**
+     * Case 3: left overlap
+     *   avail:        |--------|
+     *   exclude: |-------|
+     *   result:          |---|
+     */
+    else if (avail_start >= exclude_start && avail_start < exclude_end &&
+             avail_end > exclude_end) {
       if (avail_start == exclude_start) {
         space_split = true;
       }
       avail_iter->second = avail_end - exclude_end;
       avail_iter->first = exclude_end;
       avail_iter++;
-    } else if (avail_start < exclude_start && avail_end > exclude_end) {
+    }
+    /**
+     * Case 4: full split
+     *   avail:   |--------------|
+     *   exclude:      |-----|
+     *   result:  |---|     |---|
+     */
+    else if (avail_start < exclude_start && avail_end > exclude_end) {
       int new_buffer_addr = exclude_end;
       int new_buffer_size = avail_end - exclude_end;
       avail_iter->second = exclude_start - avail_start;
@@ -508,9 +542,8 @@ MemBlock LmemAllocator::find_avail_lmem_location(
   MemBlock alloc_lmem(-1, -1);
   if (avail_space.avail_lmems.empty()) {
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = find_avail_lmem"
-                   << "; step = avail_lmems_empty"
-                   << "\n";
+      llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                   << LOG_STEP("avail_lmems_empty") << "\n";
     });
     return alloc_lmem;
   }
@@ -518,10 +551,10 @@ MemBlock LmemAllocator::find_avail_lmem_location(
   if (allow_bank_conflict) {
     alloc_lmem = avail_space.avail_lmems.front();
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = find_avail_lmem"
-                   << "; step = use_bank_conflict_buffer"
-                   << "; lmem = " << alloc_lmem.first
-                   << "; size = " << alloc_lmem.second << "\n";
+      llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                   << LOG_STEP("use_bank_conflict_buffer")
+                   << LOG_KV("lmem", alloc_lmem.first)
+                   << LOG_KV("size", alloc_lmem.second) << "\n";
     });
     return alloc_lmem;
   }
@@ -538,18 +571,18 @@ MemBlock LmemAllocator::find_avail_lmem_location(
   for (auto avail_iter = avail_lmems_tmp.begin();
        avail_iter != avail_lmems_tmp.end(); ++avail_iter) {
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = find_avail_lmem"
-                   << "; step = iter_avail_lmem"
-                   << "; lmem = " << avail_iter->first
-                   << "; size = " << avail_iter->second << "\n";
+      llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                   << LOG_STEP("iter_avail_lmem")
+                   << LOG_KV("lmem", avail_iter->first)
+                   << LOG_KV("size", avail_iter->second) << "\n";
     });
     if (avail_iter->second >= buffer_value.size) {
       alloc_lmem = *avail_iter;
       DEBUG_WITH_TYPE("assign_lmem", {
-        llvm::dbgs() << "; action = find_avail_lmem"
-                     << "; step = find_availble_buffer"
-                     << "; lmem = " << alloc_lmem.first
-                     << "; size = " << alloc_lmem.second << "\n";
+        llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                     << LOG_STEP("find_available_buffer")
+                     << LOG_KV("lmem", alloc_lmem.first)
+                     << LOG_KV("size", alloc_lmem.second) << "\n";
       });
       break;
     }
@@ -559,10 +592,10 @@ MemBlock LmemAllocator::find_avail_lmem_location(
   if (alloc_lmem.first == -1) {
     alloc_lmem = avail_space.avail_lmems.front();
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = find_avail_lmem"
-                   << "; step = use_bank_conflict_buffer"
-                   << "; lmem = " << alloc_lmem.first
-                   << "; size = " << alloc_lmem.second << "\n";
+      llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                   << LOG_STEP("use_bank_conflict_buffer")
+                   << LOG_KV("lmem", alloc_lmem.first)
+                   << LOG_KV("size", alloc_lmem.second) << "\n";
     });
   }
 
@@ -582,6 +615,7 @@ void LmemAllocator::update_exclude_banks(
   bool is_npu_use, is_gdma_use;
   bool is_recent_used_banks_updated = false;
   std::set<int64_t> recent_used_banks;
+  // visit all timesteps in the buffer's life cycle
   for (int64_t ts = buffer_value.start_ts;
        (ts != ((buffer_value.end_ts + 1) % timestep_num)) || first_step;
        ts = (ts + 1) % timestep_num) {
@@ -590,7 +624,12 @@ void LmemAllocator::update_exclude_banks(
     const GdmaTsField &cur_tensors = time_step->getTensors(ts);
     is_npu_use = is_buffer_used_by_npu(buffer_key, cur_layers);
     is_gdma_use = is_buffer_used_by_gdma(buffer_key, cur_tensors, is_npu_use);
-
+    DEBUG_WITH_TYPE("assign_lmem", {
+      llvm::dbgs() << LOG_ACTION("update_exclude_banks")
+                   << LOG_STEP("find_banks_used") << LOG_KV("ts", ts)
+                   << LOG_KV("is_npu_use", is_npu_use)
+                   << LOG_KV("is_gdma_use", is_gdma_use) << "\n";
+    });
     // find the banks that have been used by npu if the current buffer is used
     // by gdma
     if (is_gdma_use || is_npu_use) {
@@ -833,8 +872,36 @@ bool LmemAllocator::assignLmemAddr(const LgInfo &lg_info,
                                    BasicTimeStepPtr &time_step,
                                    const shape_secs_t &shape_secs,
                                    bool allow_bank_conflict) {
+  /**
+   * assignLmemAddr assigns an lmem address to every mem buffer recorded
+   * in lmem_buffer_.
+   *
+   * lmem_buffer_ is a map from mem_buffer_key_t to mem_buffer_value_t.
+   *
+   * mem_buffer_value_t is defined in LayerGroupDefs.h:
+   *   typedef struct mem_buffer_value {
+   *     int64_t start_ts;
+   *     int64_t end_ts;
+   *     int64_t addr;
+   *     int64_t size;
+   *     int64_t align_bytes;
+   *   } mem_buffer_value_t;
+   *
+   * we have to fill start_ts, end_ts, addr, size and align_bytes for each
+   * mem_buffer_key_t.
+   *
+   * start_ts and end_ts are filled by
+   * TimeStepMethod::memory_aware_timestep_assignment;
+   * addr, size and align_bytes are then filled in this function
+   * (assignLmemAddr).
+   */
   PROFILE_LOG("assignLmemAddr", true);
+  // iterate over all mem_buffer_key_t and update mem_buffer_value_t.size
   time_step->update_all_mem_buffer_size(lg_info);
+
   bool one_loop =
       (shape_secs.nsecs == 1 && shape_secs.hsecs == 1 &&
        shape_secs.csecs == 1 && shape_secs.dsecs == 1 && shape_secs.wsecs == 1);
@@ -852,129 +919,219 @@ bool LmemAllocator::assignLmemAddr(const LgInfo &lg_info,
   membuf_heap_create(npu_membuf_heap, gdma_membuf_heap, membuf_list,
                      time_step);
   MemBlock alloc_lmem; // consider use alloc_position instead
-  int64_t tgt_position = 0;
+  int64_t tgt_min_address = 0;
   int64_t lmem_occupy = 0;
-  bool first_alloc = true;
+  bool is_first_alloc = true;
   mem_buffer_key_t recent_buffer_allocated;
   std::list::iterator buflist_it;
-  std::list::iterator tgt_buflist_it;
+  std::list::iterator tgt_membuf;
   while (!membuf_list.empty()) {
-    tgt_position = Arch::LMEM_BYTES;
+    tgt_min_address = Arch::LMEM_BYTES;
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = assign_lmem"
-                   << "; step = initial"
-                   << "; tgt_position = " << tgt_position
-                   << "; lmem_occupy = " << lmem_occupy << "\n";
+      llvm::dbgs() << LOG_ACTION("assignLmemAddr")
+                   << LOG_STEP("start_iteration")
+                   << LOG_KV("lmem_occupy", lmem_occupy)
+                   << LOG_KV("lmem_eu_bytes", Arch::EU_BYTES)
+                   << LOG_KV("lmem_npu_num", Arch::NPU_NUM)
+                   << LOG_KV("lmem_bytes", Arch::LMEM_BYTES)
+                   << LOG_KV("lmem_banks", Arch::LMEM_BANKS)
+                   << LOG_KV("lmem_bank_bytes", Arch::LMEM_BANK_BYTES)
+                   << LOG_KV("remaining_buffers", membuf_list.size()) << "\n";
     });
+
     update_membuf_conflict_param(npu_membuf_heap, gdma_membuf_heap,
                                  membuf_list);
+
+    DEBUG_WITH_TYPE("assign_lmem_membuf_list", {
+      llvm::dbgs() << LOG_ACTION("assignLmemAddr") << LOG_STEP("before_sort")
+                   << "\n";
+
+      int i = 0;
+      for (auto &iter : membuf_list) {
+        llvm::dbgs() << LOG_KV("buf_idx", i);
+
+        if (iter.first.type == LMEM_OPERATION) {
+          llvm::dbgs() << LOG_KV("op_name", module::getName(iter.first.op));
+        } else {
+          llvm::dbgs() << LOG_KV("op_name", module::getName(iter.first.value));
+        }
+
+        llvm::dbgs() << LOG_KV("op_conflict", iter.first.conflict)
+                     << LOG_KV("op_type", iter.first.lmem_type_str())
+                     << LOG_KV("start_ts", iter.second.start_ts)
+                     << LOG_KV("area", iter.second.area) << "\n";
+        ++i;
+      }
+    });
     membuf_list.sort(membuf_sort_std_cmp);
+    DEBUG_WITH_TYPE("assign_lmem_membuf_list", {
+      llvm::dbgs() << LOG_ACTION("assignLmemAddr") << LOG_STEP("after_sort")
+                   << "\n";
+
+      int i = 0;
+      for (auto &iter : membuf_list) {
+        llvm::dbgs() << LOG_KV("buf_idx", i);
+        if (iter.first.type == LMEM_OPERATION) {
+          llvm::dbgs() << LOG_KV("op_name", module::getName(iter.first.op));
+        } else {
+          llvm::dbgs() << LOG_KV("op_name", module::getName(iter.first.value));
+        }
+
+        llvm::dbgs() << LOG_KV("op_conflict", iter.first.conflict)
+                     << LOG_KV("op_type", iter.first.lmem_type_str())
+                     << LOG_KV("start_ts", iter.second.start_ts)
+                     << LOG_KV("area", iter.second.area) << "\n";
+        ++i;
+      }
+    });
+
+    // 1. find the minimal feasible address over membuf_list, then allocate
     for (buflist_it = membuf_list.begin(); buflist_it != membuf_list.end();
          ++buflist_it) {
-      if (first_alloc) {
-        first_alloc = false;
+
+      // 1.1 the first allocation can start at address 0 directly
+      if (is_first_alloc) {
+        is_first_alloc = false;
         DEBUG_WITH_TYPE("assign_lmem", {
-          llvm::dbgs() << "; action = assign_lmem"
-                       << "; step = first_alloc"
-                       << "; op = " << module::getName(buflist_it->first.value)
-                       << "\n";
+          std::set<int64_t> used_banks;
+          find_used_banks(used_banks, 0,
+                          time_step->get_lmem_size(buflist_it->first));
+          llvm::dbgs() << LOG_ACTION("assignLmemAddr")
+                       << LOG_STEP("first_allocation")
+                       << LOG_KV("op_type", buflist_it->first.lmem_type_str())
+                       << LOG_KV("op_name",
+                                 module::getName(buflist_it->first.value))
+                       << LOG_KV("size",
+                                 time_step->get_lmem_size(buflist_it->first))
+                       << "; banks = ";
+          for (auto bank : used_banks) {
+            llvm::dbgs() << bank << ",";
          }
+          llvm::dbgs() << "\n";
         });
-        if (time_step->get_lmem_size(buflist_it->first) <= Arch::LMEM_BYTES) {
-          tgt_position = 0;
-          tgt_buflist_it = buflist_it;
-          DEBUG_WITH_TYPE("assign_lmem", {
-            llvm::dbgs() << "; action = assign_lmem"
-                         << "; step = first_alloc_success"
-                         << "; tgt_position = " << tgt_position
-                         << "; lmem_occupy = " << lmem_occupy << "\n";
-          });
-        } else {
-          DEBUG_WITH_TYPE("assign_lmem", {
-            llvm::dbgs() << "; action = assign_lmem"
-                         << "; step = find_op_assign_failed"
-                         << "; tgt_position = " << tgt_position
-                         << "; lmem_occupy = " << lmem_occupy << "; op = "
-                         << module::getName(buflist_it->first.value) << "\n";
-          });
+
+        if (time_step->get_lmem_size(buflist_it->first) > Arch::LMEM_BYTES) {
           PROFILE_LOG("assignLmemAddr", false);
           return false;
         }
+
+        tgt_min_address = 0;
+        tgt_membuf = buflist_it;
         break;
-      } else {
-        alloc_lmem = global_find_avail_lmem_localtion(
-            buffer_avail_space[buflist_it->first], buflist_it->first,
-            recent_buffer_allocated, time_step, one_loop, allow_bank_conflict);
+      }
+      // 1.2 search for an available lmem location
+      alloc_lmem = global_find_avail_lmem_localtion(
+          buffer_avail_space[buflist_it->first], buflist_it->first,
+          recent_buffer_allocated, time_step, one_loop, allow_bank_conflict);
+
+      // 1.3 early return: if this membuf cannot find an available lmem
+      // location in this round, it will not fit in any later round either
+      if (alloc_lmem.first == -1) {
         DEBUG_WITH_TYPE("assign_lmem", {
-          llvm::dbgs() << "; action = assign_lmem"
-                       << "; step = find_avail_lmem_location"
-                       << "; op = " << module::getName(buflist_it->first.value)
-                       << "\n";
+          llvm::dbgs() << LOG_ACTION("assignLmemAddr")
+                       << LOG_STEP("allocation_failed")
+                       << LOG_KV("op_type", buflist_it->first.lmem_type_str())
+                       << LOG_KV("op_name",
+                                 module::getName(buflist_it->first.value))
+                       << LOG_KV("required_size",
+                                 time_step->get_lmem_size(buflist_it->first))
+                       << LOG_KV("max_addr", Arch::LMEM_BYTES) << "\n";
         });
-        if (alloc_lmem.first != -1) {
-          if (alloc_lmem.first < tgt_position) {
-            tgt_position = alloc_lmem.first;
-            tgt_buflist_it = buflist_it;
-            DEBUG_WITH_TYPE("assign_lmem", {
-              llvm::dbgs() << "; action = assign_lmem"
-                           << "; step = update_min_tgt_position"
-                           << "; tgt_position = " << tgt_position
-                           << "; lmem_occupy = " << lmem_occupy << "\n";
-            });
-          }
-        } else {
-          DEBUG_WITH_TYPE("assign_lmem", {
-            llvm::dbgs() << "; action = assign_lmem"
-                         << "; step = find_op_assign_failed"
-                         << "; op = "
-                         << module::getName(buflist_it->first.value) << "\n";
-          });
-          PROFILE_LOG("assignLmemAddr", false);
-          return false;
-        }
+        PROFILE_LOG("assignLmemAddr", false);
+        return false;
       }
-    }
-    if (tgt_position < Arch::LMEM_BYTES) {
-      recent_buffer_allocated = tgt_buflist_it->first;
-      time_step->set_lmem_addr(tgt_buflist_it->first, tgt_position);
-      int64_t buffer_end =
-          tgt_position + time_step->get_lmem_size(tgt_buflist_it->first);
-      lmem_occupy = buffer_end > lmem_occupy ? buffer_end : lmem_occupy;
-      conflict_heap_delete(npu_membuf_heap, gdma_membuf_heap,
-                           &(tgt_buflist_it->first));
-      membuf_list.erase(tgt_buflist_it);
-      buffer_avail_space.erase(tgt_buflist_it->first);
       DEBUG_WITH_TYPE("assign_lmem", {
-        llvm::dbgs() << "; action = assign_lmem"
-                     << "; step = set_lmem_addr"
-                     << "; tgt_position = " << tgt_position
-                     << "; lmem_occupy = " << lmem_occupy
-                     << "; buffer_end = " << buffer_end << "; op = "
-                     << module::getName(tgt_buflist_it->first.value) << "\n";
+        std::set<int64_t> used_banks;
+        find_used_banks(used_banks, alloc_lmem.first, alloc_lmem.second);
+        llvm::dbgs()
+            << LOG_ACTION("assignLmemAddr")
+            << LOG_STEP("found_available_location")
+            << LOG_KV("op_type", buflist_it->first.lmem_type_str())
+            << LOG_KV("op_name", module::getName(buflist_it->first.value))
+            << LOG_KV("addr", llvm::format_hex(alloc_lmem.first, 8))
+            << LOG_KV("size", alloc_lmem.second)
+            << LOG_KV(
+                   "timestep",
+                   time_step->get_lmem_buffer_value(buflist_it->first).start_ts)
+            << "->"
+            << time_step->get_lmem_buffer_value(buflist_it->first).end_ts
+            << "; banks = ";
+        for (auto bank : used_banks) {
+          llvm::dbgs() << bank << ",";
+        }
+        llvm::dbgs() << "\n";
       });
-    } else {
+      // 1.4 update tgt_min_address and tgt_membuf
+      if (alloc_lmem.first < tgt_min_address) {
+        tgt_min_address = alloc_lmem.first;
+        tgt_membuf = buflist_it;
+        DEBUG_WITH_TYPE("assign_lmem", {
+          llvm::dbgs() << LOG_ACTION("assignLmemAddr")
+                       << LOG_STEP("update_min_tgt_min_address")
+                       << LOG_KV("tgt_min_address", tgt_min_address)
+                       << LOG_KV("lmem_occupy", lmem_occupy) << "\n";
+        });
+      }
+    }
+
+    // 2.a if no available lmem location was found after the search,
+    // return false
+    if (tgt_min_address >= Arch::LMEM_BYTES) {
       llvm::errs() << "Cannot find local memory location for memory buffers\n";
-      DEBUG_WITH_TYPE("assign_lmem", {
-        llvm::dbgs() << "; action = assign_lmem"
-                     << "; step = op_assign_failed_in_loop_end"
-                     << "; op = " << module::getName(buflist_it->first.value)
-                     << "\n";
-      });
       PROFILE_LOG("assignLmemAddr", false);
       return false;
     }
+
+    // 2.b allocate the chosen address to this membuf
+    recent_buffer_allocated = tgt_membuf->first;
+    time_step->set_lmem_addr(tgt_membuf->first, tgt_min_address);
+    int64_t buffer_end =
+        tgt_min_address + time_step->get_lmem_size(tgt_membuf->first);
+    lmem_occupy = buffer_end > lmem_occupy ? buffer_end : lmem_occupy;
+    conflict_heap_delete(npu_membuf_heap, gdma_membuf_heap,
+                         &(tgt_membuf->first));
+    membuf_list.erase(tgt_membuf);
+    buffer_avail_space.erase(tgt_membuf->first);
+    DEBUG_WITH_TYPE("assign_lmem", {
+      std::set<int64_t> used_banks;
+      find_used_banks(used_banks, tgt_min_address,
+                      time_step->get_lmem_size(tgt_membuf->first));
+      llvm::dbgs()
+          << LOG_ACTION("assignLmemAddr") << LOG_STEP("allocated_memory")
+          << LOG_KV("op_type", tgt_membuf->first.lmem_type_str())
+          << LOG_KV("op_name", module::getName(tgt_membuf->first.value))
+          << LOG_KV("addr", llvm::format_hex(tgt_min_address, 8))
+          << LOG_KV("size", time_step->get_lmem_size(tgt_membuf->first))
+          << LOG_KV(
+                 "timestep_start",
+                 time_step->get_lmem_buffer_value(tgt_membuf->first).start_ts)
+          << LOG_KV("timestep_end",
+                    time_step->get_lmem_buffer_value(tgt_membuf->first).end_ts)
+          << LOG_KV("timestep_mode",
+                    time_step->get_tensor_mode_str(tgt_membuf->first.value))
+          << LOG_KV("lmem_occupy", lmem_occupy) << "; banks = ";
+      for (auto bank : used_banks) {
+        llvm::dbgs() << bank << ",";
      }
+      llvm::dbgs() << "\n";
+    });
   }
 
   time_step->set_lmem_occupy(lmem_occupy);
-  assignL2memAddr(lg_info, time_step);
+  assignL2memAddr(lg_info, time_step);
+
   DEBUG_WITH_TYPE("assign_lmem", {
-    llvm::dbgs() << "; action = assign_lmem"
-                 << "; step = final_assign_lmem_success"
-                 << "\n";
+    llvm::dbgs() << LOG_ACTION("assignLmemAddr") << LOG_STEP("completed")
+                 << LOG_KV("total_lmem_used", lmem_occupy)
+                 << LOG_KV("utilization",
+                           (lmem_occupy * 100.0 / Arch::LMEM_BYTES))
+                 << "%\n";
   });
+
   PROFILE_LOG("assignLmemAddr", false);
   return true;
 }
@@ -1215,14 +1372,14 @@ void LmemAllocator::sc_method_multi_core(const LgInfo &lg_info,
         try_this_shape_secs(lg_info, shape_secs, allow_bank_conflict,
                             time_step);
     if (ret >= SECS_VALID) {
      DEBUG_WITH_TYPE("shape_secs", {
-        llvm::dbgs() << "; action = shape_secs"
-                     << "; step = sc_method_multi_core"
-                     << "; nsecs = " << shape_secs.nsecs
-                     << "; csecs = " << shape_secs.csecs
-                     << "; dsecs = " << shape_secs.dsecs
-                     << "; hsecs = " << shape_secs.hsecs
-                     << "; wsecs = " << shape_secs.wsecs
-                     << "; cost = " << last_group_cost_ << "\n";
+        llvm::dbgs() << LOG_ACTION("shape_secs")
+                     << LOG_STEP("sc_method_multi_core")
+                     << LOG_KV("nsecs", shape_secs.nsecs)
+                     << LOG_KV("csecs", shape_secs.csecs)
+                     << LOG_KV("dsecs", shape_secs.dsecs)
+                     << LOG_KV("hsecs", shape_secs.hsecs)
+                     << LOG_KV("wsecs", shape_secs.wsecs)
+                     << LOG_KV("cost", last_group_cost_) << "\n";
       });
     }
   }
@@ -1280,15 +1437,15 @@ void LmemAllocator::sc_method_multi_core_v2(const LgInfo &lg_info,
 
     if (ret >= SECS_VALID) {
       DEBUG_WITH_TYPE("shape_secs", {
-        llvm::dbgs() << "; action = shape_secs"
-                     << "; step = sc_method_multi_core_v2"
-                     << "; nch_secs = " << i
-                     << "; nsecs = " << core_shape_secs.nsecs
-                     << "; csecs = " << core_shape_secs.csecs
-                     << "; dsecs = " << core_shape_secs.dsecs
-                     << "; hsecs = " << core_shape_secs.hsecs
-                     << "; wsecs = " << core_shape_secs.wsecs
-                     << "; cost = " << last_group_cost_ << "\n";
+        llvm::dbgs() << LOG_ACTION("shape_secs")
+                     << LOG_STEP("sc_method_multi_core_v2")
+                     << LOG_KV("nch_secs", i)
+                     << LOG_KV("nsecs", core_shape_secs.nsecs)
+                     << LOG_KV("csecs", core_shape_secs.csecs)
+                     << LOG_KV("dsecs", core_shape_secs.dsecs)
+                     << LOG_KV("hsecs", core_shape_secs.hsecs)
+                     << LOG_KV("wsecs", core_shape_secs.wsecs)
+                     << LOG_KV("cost", last_group_cost_) << "\n";
       });
     }
     if (not_best_count >= MAX_TRY_NUM) {
@@ -1333,14 +1490,14 @@ void LmemAllocator::sc_method_multi_core_v3(const LgInfo &lg_info,
                             allow_bank_conflict, time_step);
     if (ret >= SECS_VALID) {
       DEBUG_WITH_TYPE("shape_secs", {
-        llvm::dbgs() << "; action = shape_secs"
-                     << "; step = sc_method_multi_core_v3"
-                     << "; nsecs = " << shape_secs.nsecs
-                     << "; csecs = " << shape_secs.csecs
-                     << "; dsecs = " << shape_secs.dsecs
-                     << "; hsecs = " << shape_secs.hsecs
-                     << "; wsecs = " << shape_secs.wsecs
-                     << "; cost = " << last_group_cost_ << "\n";
+        llvm::dbgs() << LOG_ACTION("shape_secs")
+                     << LOG_STEP("sc_method_multi_core_v3")
+                     << LOG_KV("nsecs", shape_secs.nsecs)
+                     << LOG_KV("csecs", shape_secs.csecs)
+                     << LOG_KV("dsecs", shape_secs.dsecs)
+                     << LOG_KV("hsecs", shape_secs.hsecs)
+                     << LOG_KV("wsecs", shape_secs.wsecs)
+                     << LOG_KV("cost", last_group_cost_) << "\n";
       });
     }
   }
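The restructured assignLmemAddr loop above is, at its core, the following
greedy scheme: every unallocated buffer proposes the lowest address at which
it fits beside already-placed buffers whose life cycles overlap its own, the
buffer with the smallest proposal is placed, and the search repeats. A toy
Python rendering of just that core (bank exclusion, alignment and the
conflict heaps are omitted; LMEM_BYTES here is an illustrative constant, not
the real arch value):

  LMEM_BYTES = 16 * 1024

  def overlaps(a, b):  # time intervals, ignoring wrap-around for brevity
      return not (a[1] < b[0] or b[1] < a[0])

  def lowest_fit(size, taken):
      """First-fit address given occupied [start, end) address spans."""
      addr = 0
      for s, e in sorted(taken):
          if addr + size <= s:
              break
          addr = max(addr, e)
      return addr if addr + size <= LMEM_BYTES else None

  def assign(buffers):  # buffers: {name: (start_ts, end_ts, size)}
      placed = {}
      while len(placed) < len(buffers):
          best = None
          for name, (st, et, size) in buffers.items():
              if name in placed:
                  continue
              taken = [(placed[n], placed[n] + buffers[n][2])
                       for n in placed
                       if overlaps((st, et), buffers[n][:2])]
              addr = lowest_fit(size, taken)
              if addr is None:
                  return None  # no fit now means no fit later; give up
              if best is None or addr < best[1]:
                  best = (name, addr)
          placed[best[0]] = best[1]
      return placed

Like the real pass's early return, the sketch fails as soon as any buffer has
no feasible location, since later rounds only add constraints.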
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/SwPipeline.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/SwPipeline.cpp
index b55b3c588..f7b1f8c67 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/SwPipeline.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/SwPipeline.cpp
@@ -94,7 +94,8 @@ int64_t SoftwarePipeline::software_pipeline_schedule(
 
   // delete the last row of time step table
   timestep_table.erase(last_row_iter);
-  // move the last tensor timestep to the first
+
+  // 1. (try to) move the last tensor timestep to the first
   bool move_valid;
   // consider time step 1, it is the second row of the table
   auto second_row_iter = timestep_table.begin() + 1;
@@ -103,6 +104,8 @@ int64_t SoftwarePipeline::software_pipeline_schedule(
   for (uint32_t i = 0; i < last_tensor_timestep.size(); ++i) {
     move_valid = true;
     auto v = last_tensor_timestep[i].first;
+    // if v is used by a tpu op in the second row (as operand or result),
+    // it cannot be moved from the last row to the second row (move_valid = false)
     for (auto op : second_row_iter->tpu0_ts_field) {
       auto opds = op->getOperands();
       auto results = get_output_values(op);
@@ -128,12 +131,18 @@ int64_t SoftwarePipeline::software_pipeline_schedule(
     timestep_table[0].gdma0_ts_field = rest_last_tensors_;
   }
 
-  // move the first tensor timestep to the last
+  // 2. (try to) move the first tensor timestep to the last
   last_row_iter = timestep_table.end() - 1;
+  // consider time step n-1, it is the (n-1)-th row of the table
   GdmaTsField rest_first_tensors_;
   for (uint32_t i = 0; i < first_tensor_timestep.size(); ++i) {
     move_valid = true;
     auto v = first_tensor_timestep[i].first;
+
+    // if v is used by a tpu op in the "new" last row (as an operand), it
+    // cannot be moved from the first row to the "new" last row
+    // (move_valid = false); note: the "new" last row is the row just before
+    // the original last row, which was deleted above
     for (auto op : last_row_iter->tpu0_ts_field) {
       auto opds = op->getOperands();
       if (std::find(opds.begin(), opds.end(), v) != opds.end()) {
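The staging that the comments above describe can be visualized with a small
script: with three pipeline stages, timestep t loads for layer t, computes
layer t-1 and stores layer t-2, which is the steady-state overlap the
timestep table shows after pipelining. Purely illustrative, not the pass's
actual scheduling code:

  layers = ["A", "B", "C"]
  n = len(layers)
  for t in range(n + 2):
      load = layers[t] if t < n else "-"
      compute = layers[t - 1] if 0 <= t - 1 < n else "-"
      store = layers[t - 2] if 0 <= t - 2 < n else "-"
      print(f"ts{t}: load={load} compute={compute} store={store}")

  # ts0: load=A compute=- store=-
  # ts1: load=B compute=A store=-
  # ts2: load=C compute=B store=A
  # ts3: load=- compute=C store=B
  # ts4: load=- compute=- store=C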
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp
index 19c42e190..f7f989095 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp
@@ -24,10 +24,18 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
   Operation *op;
   tensor_info_t tensor_info;
 
+  // in the nearest algorithm, each op's computation is assigned to its own
+  // timestep
   for (size_t i = 0; i < group_ops.size(); ++i) {
     op = group_ops[i];
-    // layer: 0
+    // timestep 0: load the first layer's inputs into lmem; the layer itself
+    // gets its tpu_field in the next timestep
+    DEBUG_WITH_TYPE("timestep_assign", {
+      llvm::dbgs() << "; action = layer_nearest_timestep_assignment"
+                   << "; ts = " << i << "\n";
+    });
     if (i == 0) {
+      // stage 0: a load-only timestep
       gdma_field.clear();
       have_load_tensor = false;
       for (auto in : op->getOperands()) {
@@ -52,12 +60,17 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
 
     tpu_field.clear();
     gdma_field.clear();
+
+    // stage 1: in the pipeline, compute, load and store ops share the same
+    // timestep
     for (auto out : get_output_values(op)) {
       tensor_in_lmem.insert(out);
     }
+    // stage 1.1: the current layer's computation joins this timestep
     tpu_field.push_back(op);
 
     // layer: [1, N-1)
+    // stage 1.2: preload the next layer's inputs in the current timestep
     if (i != group_ops.size() - 1) {
       auto next_op = group_ops[i + 1];
       for (auto next_in : next_op->getOperands()) {
@@ -76,6 +89,8 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
       }
     }
 
+    // layer: [1, N-1)
+    // stage 1.3: store the previous layer's outputs back to global memory
     if (i > 0) {
       auto pre_op = group_ops[i - 1];
       for (auto pre_out : get_output_values(pre_op)) {
@@ -88,10 +103,15 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
       }
     }
 
+    // stage 1: finally commit the tpu and gdma fields as one timestep
     if (!(tpu_field.empty() && gdma_field.empty())) {
       time_step->add_tpu0_gdma0_ts_field(tpu_field, gdma_field);
     }
 
+    // last layer
+    // stage 2: the last layer's outputs are stored back to global memory in
+    // a new, gdma-only timestep
     if (i == group_ops.size() - 1) {
       gdma_field.clear();
       for (auto out : get_output_values(op)) {
@@ -102,11 +122,18 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
       time_step->add_gdma0_ts_field(gdma_field);
     }
   }
-
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "============= nearest algorithm =============\n";
+    time_step->show_timestep_table();
+  });
   // use software pipeline
   if (group_ops.size() > 1) {
     time_step->software_pipeline();
   }
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "============= software pipeline =============\n";
+    time_step->show_timestep_table();
+  });
 }
 
 bool is_tensor_accessed_by_npu(Value v, BasicTimeStep *time_step, int64_t ts) {
@@ -239,6 +266,10 @@ void TimeStepMethod::memory_aware_timestep_assignment(BasicTimeStep *time_step,
   ValueIntMap tensor_to_bufsize;
   std::vector> tensor_timesteps;
 
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "============= memory aware algorithm =============\n";
+  });
+
   // remove it after pid_node is extracted
 #pragma omp critical(get_cycle)
   get_timestep_cycle_slack(time_step, lg_info, tensor_to_cycle,
@@ -275,6 +306,11 @@ void TimeStepMethod::memory_aware_timestep_assignment(BasicTimeStep *time_step,
     }
     time_step->update_gdma0_ts_field(ts, new_tensor_timestep);
   }
+  time_step->show_timestep_table();
+
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "=======================================\n";
+  });
 }
 
 void TimeStepMethod::get_timestep_cycle_slack(
diff --git a/lib/Support/Module.cpp b/lib/Support/Module.cpp
index 99fb04063..414f362c2 100644
--- a/lib/Support/Module.cpp
+++ b/lib/Support/Module.cpp
@@ -868,6 +868,13 @@ bool isOpInGroupParallel(Operation *Op) {
   return false;
 }
 
+bool isValueBlockArgument(Value v) {
+  if (auto blockArg = dyn_cast<BlockArgument>(v)) {
+    return true;
+  }
+  return false;
+}
+
 // op in [CoreBegin, CoreEnd]
 bool isOpInCoreMatch(Operation *op) {
   while (!op->use_empty()) {
diff --git a/python/tools/logdebug_tool.py b/python/tools/logdebug_tool.py
index 837f3f644..6ce1500b2 100755
--- a/python/tools/logdebug_tool.py
+++ b/python/tools/logdebug_tool.py
@@ -13,7 +13,7 @@ import pandas as pd
 
 from itertools import combinations
 
-kv_pattern = re.compile(r";\s*([\w/.]+)\s*=\s*(['\"]?)([:\w\s/\-\".]+)\2")
+kv_pattern = re.compile(r";\s*([\w/.]+)\s*=\s*(['\"]?)([:,_\w\s/\-\".]+)\2")
 
 
 def comsume_in_main(func):
@@ -48,7 +48,7 @@ def parse_dic(line, filter=None):
     dic = {}
     for k, _, v in ret:
         try:
-            dic[k] = int(v)
+            dic[k] = int(v) if v.isdigit() else v.strip()
         except Exception:
             dic[k] = v.strip()
     return dic
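Finally, a sketch of how these logs are consumed end to end. Assuming a debug
build was run with the relevant debug types enabled and stderr captured, e.g.
with LLVM's "-debug-only=assign_lmem 2> lg.log" (the file name is arbitrary),
the key/value records can be lifted into a DataFrame with the same pattern
logdebug_tool.py uses:

  import re
  import pandas as pd

  kv_pattern = re.compile(r";\s*([\w/.]+)\s*=\s*(['\"]?)([:,_\w\s/\-\".]+)\2")

  def load_log(path, action):
      rows = []
      with open(path) as f:
          for line in f:
              kvs = {k: v.strip() for k, _, v in kv_pattern.findall(line)}
              if kvs.get("action") == action:
                  rows.append(kvs)
      return pd.DataFrame(rows)

  # every allocation decision made by assignLmemAddr, one row per record;
  # the column names follow the LOG_KV keys emitted by this patch
  df = load_log("lg.log", "assignLmemAddr")
  print(df[["step", "op_name", "addr", "size"]].dropna().head())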