From b6017e8701847ebb9ebebf77e3e7f33bf0fbcf9f Mon Sep 17 00:00:00 2001
From: haozhe
Date: Sun, 19 Jan 2025 18:54:47 +0800
Subject: [PATCH] feat: make lmem assignment stage more analyzable

- define some commonly used LOG macros (Logger.h)
- define some stringify functions to show lmem type and timestep mode
  (LayerGroupDefs.h)
- add show_timestep_table to print a readable timestep table
  (BasicTimeStep.h/BasicTimeStep.cpp)
- add many DEBUG_WITH_TYPE logs and comments in the lmem assignment stage
  (BasicTimeStep.cpp/LmemAllocator.cpp/TimeStepMethod.cpp/SwPipeline.cpp)
- rename some variables and function names to better represent the process
  (gen_all_mem_buffer_ts/tgt_min_address/...)
- reduce assignLmemAddr cyclomatic complexity (LmemAllocator.cpp:989)

Change-Id: I31dadb9424be334da481f9dfbd45985ca89dc058
---
 .../Tpu/Transforms/LayerGroup/BasicTimeStep.h |   5 +-
 .../Transforms/LayerGroup/LayerGroupDefs.h    |  30 ++
 .../Tpu/Transforms/LayerGroup/TimeStep.h      |   2 +-
 include/tpu_mlir/Support/Logger.h             |  23 +-
 include/tpu_mlir/Support/Module.h             |   1 +
 .../Transforms/LayerGroup/BasicTimeStep.cpp   | 173 ++++++-
 .../Transforms/LayerGroup/CoeffReloadOpt.cpp  |   2 +-
 .../Tpu/Transforms/LayerGroup/GroupMethod.cpp |  44 +-
 .../Transforms/LayerGroup/LmemAllocator.cpp   | 437 ++++++++++++------
 .../Tpu/Transforms/LayerGroup/SwPipeline.cpp  |  13 +-
 .../Transforms/LayerGroup/TimeStepMethod.cpp  |  40 +-
 lib/Support/Module.cpp                        |   7 +
 python/tools/logdebug_tool.py                 |   4 +-
 13 files changed, 598 insertions(+), 183 deletions(-)
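Note on the log format used throughout this patch: the LOG_* macros added in
Logger.h emit single-line records of semicolon-separated "key = value" pairs,
so every DEBUG_WITH_TYPE site produces output such as the following
(op name and values here are hypothetical):

  ; action = assignLmemAddr; step = allocated_memory; op_type = LMEM_ACTIVATION; op_name = conv1; addr = 0x00001000; size = 4096

Lines in this shape can be filtered by "action"/"step" and lifted into a
table by python/tools/logdebug_tool.py (updated at the end of this patch),
which is what makes the assignment stage analyzable offline.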
diff --git a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.h b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.h
index df318107f..4ac7b0e21 100644
--- a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.h
+++ b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.h
@@ -39,6 +39,7 @@ class BasicTimeStep {
   void add_tpu0_gdma0_ts_field(const TpuTsField &tpu_field,
                                const GdmaTsField &gdma_field);
   void update_gdma0_ts_field(int64_t ts, const GdmaTsField &field);
+  void show_timestep_table();
 
   std::vector<TimestepRow> &get_timestep_table() { return timestep_table_; }
   size_t get_timestep_num() { return timestep_table_.size(); }
@@ -79,12 +80,12 @@ class BasicTimeStep {
   }
 
   TensorInfo &get_tensor_infos();
-
+  std::string get_tensor_mode_str(Value v);
   // setter
   void set_lmem_addr(const mem_buffer_key_t &buffer_key, int64_t lmem_addr);
   void set_lmem_occupy(int64_t occupy) { lmem_occupy_ = occupy; }
-  void gen_all_mem_buffer();
+  void gen_all_mem_buffer_ts();
   void update_all_mem_buffer_size(const LgInfo &lg_info);
   void gen_hold_coeff();
   bool is_tensor_hold_in_lmem(Value v);
diff --git a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupDefs.h b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupDefs.h
index a708d378e..1d8e24971 100644
--- a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupDefs.h
+++ b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupDefs.h
@@ -98,6 +98,20 @@ typedef struct mem_buffer_key {
     }
     return false;
   }
+
+  std::string lmem_type_str() {
+    switch (type) {
+    case LMEM_WEIGHT:
+      return "LMEM_WEIGHT";
+    case LMEM_ACTIVATION:
+      return "LMEM_ACTIVATION";
+    case LMEM_OPERATION:
+      return "LMEM_OPERATION";
+    case LMEM_ANY:
+      return "LMEM_ANY";
+    }
+    return "LMEM_UNKNOWN";
+  }
 } mem_buffer_key_t;
 
 typedef struct mem_buffer_value {
@@ -155,6 +169,22 @@ struct tensor_info_t {
   void add_slice_info(Operation *next_op, slice_info_t slice_info) {
     slice_infos[next_op] = slice_info;
   }
+
+  const std::string mode_str() const {
+    switch (mode) {
+    case TIMESTEP_LOAD:
+      return "TIMESTEP_LOAD";
+    case TIMESTEP_STORE:
+      return "TIMESTEP_STORE";
+    case TIMESTEP_MOVE:
+      return "TIMESTEP_MOVE";
+    case TIMESTEP_LD_G2L2:
+      return "TIMESTEP_LD_G2L2";
+    case TIMESTEP_LDST_UNKNOWN:
+      return "TIMESTEP_LDST_UNKNOWN";
+    }
+    return "TIMESTEP_UNKNOWN";
+  }
 };
 
 using ValueSet = std::set;
diff --git a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/TimeStep.h b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/TimeStep.h
index a4d490eb1..b5a6f4c68 100644
--- a/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/TimeStep.h
+++ b/include/tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/TimeStep.h
@@ -74,7 +74,7 @@ class BasicTimeStep {
   void show_timestep();
   void clear();
 
-  void gen_all_mem_buffer();
+  void gen_all_mem_buffer_ts();
 
 protected:
   LgOptions options_;
diff --git a/include/tpu_mlir/Support/Logger.h b/include/tpu_mlir/Support/Logger.h
index 16733f812..fc2870106 100644
--- a/include/tpu_mlir/Support/Logger.h
+++ b/include/tpu_mlir/Support/Logger.h
@@ -46,26 +46,27 @@ inline std::string formatString(const char *format, ...) {
 
 inline void SetLogFlag(int32_t log_level) { cur_log_level = log_level; }
 
+#define LOG_KV(key, value) "; " << key << " = " << value
+
+#define LOG_ITEM(key) "; " << key
+
+#define LOG_ACTION(action) "; action = " << action
+
+#define LOG_STEP(step) "; step = " << step
+
 #define PROFILE_LOG(step, begin)                                              \
   do {                                                                        \
     DEBUG_WITH_TYPE("profile", {                                              \
       auto current_time = std::chrono::high_resolution_clock::now();          \
       auto time_string = std::chrono::system_clock::to_time_t(current_time);  \
       if (begin) {                                                            \
-        llvm::dbgs() << "; action = profile"                                  \
-                     << "; step = " << step                                   \
-                     << "; begin = " << std::ctime(&time_string) << "\n";     \
+        llvm::dbgs() << LOG_ACTION("profile") << LOG_STEP(step)               \
+                     << LOG_KV("begin", std::ctime(&time_string)) << "\n";    \
       } else {                                                                \
-        llvm::dbgs() << "; action = profile"                                  \
-                     << "; step = " << step                                   \
-                     << "; end = " << std::ctime(&time_string) << "\n";       \
+        llvm::dbgs() << LOG_ACTION("profile") << LOG_STEP(step)               \
+                     << LOG_KV("end", std::ctime(&time_string)) << "\n";      \
       }                                                                       \
     });                                                                       \
   } while (0)
 
-#define DEBUG_KV(key, value)                                                  \
-  do {                                                                        \
-    llvm::dbgs() << "; " << key << " = " << value << "\n";                    \
-  } while (0)
-
 } // namespace tpu_mlir
diff --git a/include/tpu_mlir/Support/Module.h b/include/tpu_mlir/Support/Module.h
index 9b254c633..f676d0b81 100644
--- a/include/tpu_mlir/Support/Module.h
+++ b/include/tpu_mlir/Support/Module.h
@@ -200,6 +200,7 @@ bool IsHdimIsBatch(Value value);
 bool isOpInCoreMatch(Operation *Op);
 bool isOpInCoreParallel(Operation *Op);
 bool isOpInGroupParallel(Operation *Op);
+bool isValueBlockArgument(Value v);
 bool isOpInDevParallel(Operation *Op);
 bool isOpInBlock(Operation *op);
 FuncOp getFuncOp(ModuleOp module, StringRef func_name);
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp
index b87f7d976..712662e82 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp
@@ -16,6 +16,38 @@ namespace tpu {
 using namespace tpu_mlir::tpu;
 using namespace tpu_mlir::backend;
 
+static inline void stream_tpu_field(const TpuTsField &field) {
+  llvm::dbgs() << " [ ";
+  for (int i = 0; i < field.size(); ++i) {
+    if (i > 0)
+      llvm::dbgs() << ", ";
+    llvm::dbgs() << "C(\"" << field[i]->getName() << "\"), \""
+                 << module::getName(field[i]) << "\"";
+  }
+  llvm::dbgs() << " ]";
+}
+
+static inline void stream_gdma_field(const GdmaTsField &field) {
+  llvm::dbgs() << " [ ";
+  for (int i = 0; i < field.size(); ++i) {
+    auto mode = field[i].second.mode;
+    auto modestr = "L";
+    if (mode == TIMESTEP_STORE) {
+      modestr = "S";
+    }
+
+    if (i > 0)
+      llvm::dbgs() << ", ";
+    std::string op_type =
+        module::isValueBlockArgument(field[i].first)
+            ? "block_arg"
+            : field[i].first.getDefiningOp()->getName().getStringRef().str();
+    llvm::dbgs() << modestr << "(\"" << module::getName(field[i].first)
+                 << "\")->" << op_type;
+  }
+  llvm::dbgs() << " ]";
+}
+
 BasicTimeStep::BasicTimeStep() {
   // options_ = options;
   swpipl_ = std::make_shared<SoftwarePipeline>();
@@ -50,12 +82,29 @@ void BasicTimeStep::add_tpu0_ts_field(const TpuTsField &field) {
   TimestepRow row;
   row.tpu0_ts_field = field;
   timestep_table_.push_back(row);
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "; action = add_tpu0_ts_field"
+                 << "; ts = " << timestep_table_.size() - 1;
+
+    stream_tpu_field(field);
+
+    llvm::dbgs() << "\n";
+  });
 }
 
 void BasicTimeStep::add_gdma0_ts_field(const GdmaTsField &field) {
   TimestepRow row;
   row.gdma0_ts_field = field;
   timestep_table_.push_back(row);
+
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "; action = add_gdma0_ts_field"
+                 << "; ts = " << timestep_table_.size() - 1;
+
+    stream_gdma_field(field);
+
+    llvm::dbgs() << "\n";
+  });
 }
 
 void BasicTimeStep::add_tpu0_gdma0_ts_field(const TpuTsField &tpu_field,
@@ -64,12 +113,43 @@ void BasicTimeStep::add_tpu0_gdma0_ts_field(const TpuTsField &tpu_field,
   row.tpu0_ts_field = tpu_field;
   row.gdma0_ts_field = gdma_field;
   timestep_table_.push_back(row);
+
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "; action = add_tpu0_gdma0_ts_field"
+                 << "; ts = " << timestep_table_.size() - 1;
+
+    stream_tpu_field(tpu_field);
+    stream_gdma_field(gdma_field);
+
+    llvm::dbgs() << "\n";
+  });
 }
 
 void BasicTimeStep::update_gdma0_ts_field(int64_t ts,
                                           const GdmaTsField &field) {
   this->timestep_table_[ts].gdma0_ts_field.clear();
   this->timestep_table_[ts].gdma0_ts_field = field;
+
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "; action = update_gdma0_ts_field"
+                 << "; ts = " << ts;
+
+    stream_gdma_field(field);
+
+    llvm::dbgs() << "\n";
+  });
+}
+
+void BasicTimeStep::show_timestep_table() {
+  DEBUG_WITH_TYPE("timestep_assign", {
+    for (int i = 0; i < timestep_table_.size(); ++i) {
+      llvm::dbgs() << "; ts = " << i << "; ";
+      stream_tpu_field(timestep_table_[i].tpu0_ts_field);
+      llvm::dbgs() << " || ";
+      stream_gdma_field(timestep_table_[i].gdma0_ts_field);
+      llvm::dbgs() << "\n";
+    }
+  });
 }
 
 int64_t BasicTimeStep::get_layer_swpipl_stage(Operation *op) {
@@ -208,7 +288,7 @@ void BasicTimeStep::gen_hold_coeff() {
 //   }
 // }
 
-void BasicTimeStep::gen_all_mem_buffer() {
+void BasicTimeStep::gen_all_mem_buffer_ts() {
   // input: need_imm_buffers
   lmem_buffer_.clear();
   l2mem_buffer_.clear();
@@ -219,10 +299,24 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
 
   for (int64_t stg = 0; stg < this->swpipl_stage_num_; ++stg) {
     // add for software pipeline
+    // swpipl_stage_num_ is always 3 after software pipelining
     bool layer_timestep_valid =
         (swpipl_stage_num_ == 1) || (swpipl_stage_num_ > 1 && stg == 1);
+    DEBUG_WITH_TYPE("lmem_buffer_assign", {
+      llvm::dbgs() << "; action = lmem_buffer_assign"
+                   << "; step = "
+                   << "process_current_stage"
+                   << "; stg = " << stg
+                   << "; swpipl_stage_num_ = " << swpipl_stage_num_ << "\n";
+    });
     for (size_t ts = 0; ts < get_timestep_num(); ++ts) {
       // process current timestep layers
+      DEBUG_WITH_TYPE("lmem_buffer_assign", {
+        llvm::dbgs() << "; action = lmem_buffer_assign"
+                     << "; step = "
+                     << "process_current_timestep_layers"
+                     << "; ts = " << ts << "\n";
+      });
       const TpuTsField &cur_tpu_field = timestep_table_[ts].tpu0_ts_field;
       if (layer_timestep_valid) {
         for (auto op : cur_tpu_field) {
@@ -235,6 +329,16 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
           lmem_value.start_ts = ts;
           lmem_value.end_ts = -1;
 
+          DEBUG_WITH_TYPE("lmem_buffer_assign", {
+            llvm::dbgs() << "; action = lmem_buffer_assign"
+                         << "; step = "
+                         << "initial_results_buffer"
+                         << "; lmem_key = " << module::getName(lmem_key.value)
+                         << "; lmem_type = " << lmem_key.lmem_type_str()
+                         << "; lmem_start_ts = " << lmem_value.start_ts
+                         << "; lmem_end_ts = " << lmem_value.end_ts << "\n";
+          });
+
           lmem_buffer_[lmem_key] = lmem_value;
         }
 
@@ -250,11 +354,36 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
           }
           lmem_key.value = in;
 
-          // lmem_buffer_[lmem_key].end_ts = ts;
           if (lmem_buffer_.find(lmem_key) != lmem_buffer_.end()) {
             lmem_buffer_[lmem_key].end_ts = ts;
+            DEBUG_WITH_TYPE("lmem_buffer_assign", {
+              llvm::dbgs()
+                  << "; action = lmem_buffer_assign"
+                  << "; step = "
+                  << "update_operands_lmem_buffer"
+                  << "; lmem_key = " << module::getName(lmem_key.value)
+                  << "; lmem_type = " << lmem_key.lmem_type_str()
+                  << "; lmem_start_ts = " << lmem_buffer_[lmem_key].start_ts
+                  << "; timestep_mode = "
+                  << get_tensor_mode_str(lmem_key.value)
+                  << "; lmem_end_ts = " << lmem_buffer_[lmem_key].end_ts
+                  << "\n";
+            });
           } else {
             l2mem_buffer_[lmem_key].end_ts = ts;
+            DEBUG_WITH_TYPE("lmem_buffer_assign", {
+              llvm::dbgs()
+                  << "; action = lmem_buffer_assign"
+                  << "; step = "
+                  << "update_operands_l2mem_buffer"
+                  << "; lmem_key = " << module::getName(lmem_key.value)
+                  << "; lmem_type = " << lmem_key.lmem_type_str()
+                  << "; lmem_start_ts = " << l2mem_buffer_[lmem_key].start_ts
+                  << "; timestep_mode = "
+                  << get_tensor_mode_str(lmem_key.value)
+                  << "; lmem_end_ts = " << l2mem_buffer_[lmem_key].end_ts
+                  << "\n";
+            });
           }
         }
 
@@ -264,7 +393,17 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
 
           lmem_value.start_ts = ts;
           lmem_value.end_ts = ts;
-
+          DEBUG_WITH_TYPE("lmem_buffer_assign", {
+            llvm::dbgs() << "; action = lmem_buffer_assign"
+                         << "; step = "
+                         << "update_imm_buffer"
+                         << "; lmem_key = " << module::getName(lmem_key.value)
+                         << "; lmem_type = " << lmem_key.lmem_type_str()
+                         << "; lmem_start_ts = " << lmem_value.start_ts
+                         << "; timestep_mode = "
+                         << get_tensor_mode_str(lmem_key.value)
+                         << "; lmem_end_ts = " << lmem_value.end_ts << "\n";
+          });
           lmem_buffer_[lmem_key] = lmem_value;
         } // cur_tpu_field
       }
@@ -294,12 +433,34 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
           l2mem_buffer_[lmem_key] = lmem_value;
         } else {
           lmem_buffer_[lmem_key] = lmem_value;
+          DEBUG_WITH_TYPE("lmem_buffer_assign", {
+            llvm::dbgs() << "; action = lmem_buffer_assign"
+                         << "; step = "
+                         << "update_load_buffer"
+                         << "; lmem_key = " << module::getName(lmem_key.value)
+                         << "; lmem_type = " << lmem_key.lmem_type_str()
+                         << "; timestep_mode = " << tensor_info.mode_str()
+                         << "; lmem_start_ts = " << lmem_value.start_ts
+                         << "; lmem_end_ts = " << lmem_value.end_ts << "\n";
+          });
         }
       } else if (tensor_info.mode == TIMESTEP_STORE) {
         lmem_key.value = tensor.first;
         lmem_key.type = LMEM_ACTIVATION;
 
         lmem_buffer_[lmem_key].end_ts = ts;
+        DEBUG_WITH_TYPE("lmem_buffer_assign", {
+          llvm::dbgs() << "; action = lmem_buffer_assign"
+                       << "; step = "
+                       << "update_store_buffer"
+                       << "; lmem_key = " << module::getName(lmem_key.value)
+                       << "; lmem_type = " << lmem_key.lmem_type_str()
+                       << "; timestep_mode = " << tensor_info.mode_str()
+                       << "; lmem_start_ts = "
+                       << lmem_buffer_[lmem_key].start_ts
+                       << "; lmem_end_ts = " << lmem_buffer_[lmem_key].end_ts
+                       << "\n";
+        });
       }
     }
   }
@@ -308,7 +469,7 @@ void BasicTimeStep::gen_all_mem_buffer_ts() {
 
 void BasicTimeStep::update_all_mem_buffer_size(const LgInfo &lg_info) {
   if (lmem_buffer_.empty()) {
-    gen_all_mem_buffer();
+    gen_all_mem_buffer_ts();
   }
   auto &tensor_infos = tensor_infos_;
@@ -424,6 +585,10 @@ bool BasicTimeStep::is_tensor_hold_in_lmem(Value v) {
 
 TensorInfo &BasicTimeStep::get_tensor_infos() { return tensor_infos_; }
 
+std::string BasicTimeStep::get_tensor_mode_str(Value v) {
+  return tensor_infos_[v].mode_str();
+}
+
 typedef struct {
   Value value;
   int64_t addr;
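A note to make the "lmem_buffer_assign" logs above easier to interpret: each
buffer ends up with a life cycle [start_ts, end_ts] over the timestep ring,
and the interval may wrap around once software pipelining is applied. A
minimal Python model of that wrap-around and of per-timestep occupancy
(illustrative only, not the pass's actual data structures):

  def live_timesteps(start_ts, end_ts, timestep_num):
      """Yield the timesteps on which a buffer is live, end inclusive."""
      ts = start_ts
      while True:
          yield ts
          if ts == end_ts:
              break
          ts = (ts + 1) % timestep_num

  def occupancy(buffers, timestep_num):
      """buffers: list of (start_ts, end_ts, size) -> bytes live per ts."""
      occ = [0] * timestep_num
      for start_ts, end_ts, size in buffers:
          for ts in live_timesteps(start_ts, end_ts, timestep_num):
              occ[ts] += size
      return occ

  # a weight live on ts 3..1 (wraps around) plus an activation on ts 0..2:
  print(occupancy([(3, 1, 1024), (0, 2, 512)], 5))
  # -> [1536, 1536, 512, 1024, 1024]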
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/CoeffReloadOpt.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/CoeffReloadOpt.cpp
index 8ba96c35c..0774cb6c3 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/CoeffReloadOpt.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/CoeffReloadOpt.cpp
@@ -38,7 +38,7 @@ void coeff_reload_open(BasicTimeStepPtr &time_step, TensorInfo &tensor_infos) {
     Bm168xCycleCalculator *cyc_ptr = new Bm168xCycleCalculator();
     cycle_calculator.reset(cyc_ptr);
   }
-  time_step->gen_all_mem_buffer();
+  time_step->gen_all_mem_buffer_ts();
   for (int64_t ts = 0; ts < timestep_num; ++ts) {
     int64_t slack = 0;
     tensor_to_coeff_cost.clear();
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp
index 2b4c93c41..ef7b60398 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp
@@ -1582,6 +1582,10 @@ void GroupMethod::load_lg_results(
   auto &root = *jsonOrErr;
   int opt = options_.opt;
   // Load group layers
+
+  std::vector<std::vector<Operation *>> base_groups;
+  get_base_groups(base_groups, subnet_ops);
+
   if (auto *rootObj = root.getAsObject()) {
     if (auto opt_ = rootObj->getInteger("opt")) {
       opt = *opt_;
@@ -1619,6 +1623,12 @@ void GroupMethod::load_lg_results(
           }
         }
       }
+    } else {
+      // assume the base_group partition did not change when locs are
+      // not assigned in the cache
+      get_layer_group(lg_info, base_groups[lg_info.base_group_idx],
+                      lg_info.start_idx, lg_info.end_idx,
+                      lg_info.base_group_idx);
     }
     // Get shape_secs if available
     if (auto shapeArray = groupObj_->getArray("shape_secs")) {
@@ -1683,25 +1693,23 @@ void GroupMethod::load_lg_results(
   for (auto &lg_info : lg_infos) {
     int64_t cost = 0;
     lg_info.use_cache = true;
-    if (lg_info.group_cost > 0) {
-      DEBUG_WITH_TYPE("lg_index", {
-        llvm::dbgs() << "; action = lg_index"
-                     << "; start_idx = " << lg_info.start_idx
-                     << "; end_idx = " << lg_info.end_idx
-                     << "; group_idx = " << lg_info.base_group_idx << "\n";
-      });
-      if (!is_layer_group_valid(lg_info, true, &cost)) {
-        llvm_unreachable("group_cost is not valid");
-      }
-      DEBUG_WITH_TYPE("lg_cost", {
-        llvm::dbgs() << "; action = lg_cost"
-                     << "; step = group_layer"
-                     << "; start_idx = " << lg_info.start_idx
-                     << "; end_idx = " << lg_info.end_idx
-                     << "; group_idx = " << lg_info.base_group_idx
-                     << "; group_cost = " << lg_info.group_cost << "\n";
-      });
+    DEBUG_WITH_TYPE("lg_index", {
+      llvm::dbgs() << "; action = lg_index"
+                   << "; start_idx = " << lg_info.start_idx
+                   << "; end_idx = " << lg_info.end_idx
+                   << "; group_idx = " << lg_info.base_group_idx << "\n";
+    });
+    if (!is_layer_group_valid(lg_info, true, &cost)) {
+      llvm_unreachable("group_cost is not valid");
     }
+    DEBUG_WITH_TYPE("lg_cost", {
+      llvm::dbgs() << "; action = lg_cost"
+                   << "; step = group_layer"
+                   << "; start_idx = " << lg_info.start_idx
+                   << "; end_idx = " << lg_info.end_idx
+                   << "; group_idx = " << lg_info.base_group_idx
+                   << "; group_cost = " << lg_info.group_cost << "\n";
+    });
   }
   llvm::outs() << "load lg results\n";
 }
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/LmemAllocator.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/LmemAllocator.cpp
index 5fae8bc36..f22297193 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/LmemAllocator.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/LmemAllocator.cpp
@@ -70,14 +70,18 @@ static int64_t get_membuf_area(int64_t start_ts, int64_t end_ts,
 
 static bool is_buffer_used_by_npu(const mem_buffer_key_t &buffer_key,
                                   const TpuTsField &cur_layers) {
+  // LMEM_OPERATION is an operation buffer, so it is always used by npu.
+  // LMEM_ACTIVATION/LMEM_WEIGHT can be used by gdma (in store/load ops),
+  // so we cannot return true for them directly
   if (buffer_key.type == LMEM_OPERATION) {
     return true;
   }
   auto users = buffer_key.value.getUsers();
   auto src_op = buffer_key.value.getDefiningOp();
   for (auto op : cur_layers) {
-    if (src_op == op ||
-        std::find(users.begin(), users.end(), op) != users.end()) {
+    if (src_op == op /* src_op is an output */ ||
+        std::find(users.begin(), users.end(), op) !=
+            users.end() /* src_op is an input */) {
       return true;
     }
   }
@@ -91,6 +95,8 @@ static bool is_buffer_used_by_gdma(const mem_buffer_key_t &buffer_key,
   for (auto &tensor : cur_tensors) {
     if (tensor.first == buffer_key.value &&
         is_lmem_ldst(tensor.second.mode)) {
+      // TODO: maybe this check can be moved out into the
+      // update_exclude_banks function
       if (is_npu_use && tensor.second.mode != TIMESTEP_STORE) {
         llvm::errs() << "tensor is loaded and used by npu simultaneously in "
                         "timestep\n";
@@ -206,21 +212,49 @@ bool LmemAllocator::update_avail_lmems(std::list &avail_lmems,
   for (avail_iter = avail_lmems.begin(); avail_iter != avail_lmems.end();) {
     int64_t avail_start = avail_iter->first;
     int64_t avail_end = avail_iter->first + avail_iter->second;
+    /**
+     * Case 1: full overlap
+     *   avail:     |--------|
+     *   exclude: |------------|
+     *   result:    (delete)
+     */
     if (avail_start >= exclude_start && avail_end <= exclude_end) {
       avail_iter = avail_lmems.erase(avail_iter);
-    } else if (avail_start < exclude_start && avail_end > exclude_start &&
-               avail_end <= exclude_end) {
+    }
+    /**
+     * Case 2: right overlap
+     *   avail:   |--------|
+     *   exclude:      |--------|
+     *   result:  |---|
+     */
+    else if (avail_start < exclude_start && avail_end > exclude_start &&
+             avail_end <= exclude_end) {
       avail_iter->second = exclude_start - avail_start;
       avail_iter++;
-    } else if (avail_start >= exclude_start && avail_start < exclude_end &&
-               avail_end > exclude_end) {
+    }
+    /**
+     * Case 3: left overlap
+     *   avail:        |--------|
+     *   exclude: |-------|
+     *   result:          |---|
+     */
+    else if (avail_start >= exclude_start && avail_start < exclude_end &&
+             avail_end > exclude_end) {
       if (avail_start == exclude_start) {
         space_split = true;
       }
       avail_iter->second = avail_end - exclude_end;
       avail_iter->first = exclude_end;
       avail_iter++;
-    } else if (avail_start < exclude_start && avail_end > exclude_end) {
+    }
+    /**
+     * Case 4: full split
+     *   avail:   |--------------|
+     *   exclude:      |-----|
+     *   result:  |---|     |---|
+     */
+    else if (avail_start < exclude_start && avail_end > exclude_end) {
       int new_buffer_addr = exclude_end;
       int new_buffer_size = avail_end - exclude_end;
       avail_iter->second = exclude_start - avail_start;
@@ -508,9 +542,8 @@ MemBlock LmemAllocator::find_avail_lmem_location(
   MemBlock alloc_lmem(-1, -1);
   if (avail_space.avail_lmems.empty()) {
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = find_avail_lmem"
-                   << "; step = avail_lmems_empty"
-                   << "\n";
+      llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                   << LOG_STEP("avail_lmems_empty") << "\n";
     });
     return alloc_lmem;
   }
@@ -518,10 +551,10 @@ MemBlock LmemAllocator::find_avail_lmem_location(
   if (allow_bank_conflict) {
     alloc_lmem = avail_space.avail_lmems.front();
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = find_avail_lmem"
-                   << "; step = use_bank_conflict_buffer"
-                   << "; lmem = " << alloc_lmem.first
-                   << "; size = " << alloc_lmem.second << "\n";
+      llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                   << LOG_STEP("use_bank_conflict_buffer")
+                   << LOG_KV("lmem", alloc_lmem.first)
+                   << LOG_KV("size", alloc_lmem.second) << "\n";
     });
     return alloc_lmem;
   }
@@ -538,18 +571,18 @@ MemBlock LmemAllocator::find_avail_lmem_location(
   for (auto avail_iter = avail_lmems_tmp.begin();
        avail_iter != avail_lmems_tmp.end(); ++avail_iter) {
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = find_avail_lmem"
-                   << "; step = iter_avail_lmem"
-                   << "; lmem = " << avail_iter->first
-                   << "; size = " << avail_iter->second << "\n";
+      llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                   << LOG_STEP("iter_avail_lmem")
+                   << LOG_KV("lmem", avail_iter->first)
+                   << LOG_KV("size", avail_iter->second) << "\n";
     });
     if (avail_iter->second >= buffer_value.size) {
       alloc_lmem = *avail_iter;
       DEBUG_WITH_TYPE("assign_lmem", {
-        llvm::dbgs() << "; action = find_avail_lmem"
-                     << "; step = find_availble_buffer"
-                     << "; lmem = " << alloc_lmem.first
-                     << "; size = " << alloc_lmem.second << "\n";
+        llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                     << LOG_STEP("find_available_buffer")
+                     << LOG_KV("lmem", alloc_lmem.first)
+                     << LOG_KV("size", alloc_lmem.second) << "\n";
       });
       break;
     }
@@ -559,10 +592,10 @@ MemBlock LmemAllocator::find_avail_lmem_location(
   if (alloc_lmem.first == -1) {
     alloc_lmem = avail_space.avail_lmems.front();
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = find_avail_lmem"
-                   << "; step = use_bank_conflict_buffer"
-                   << "; lmem = " << alloc_lmem.first
-                   << "; size = " << alloc_lmem.second << "\n";
+      llvm::dbgs() << LOG_ACTION("find_avail_lmem")
+                   << LOG_STEP("use_bank_conflict_buffer")
+                   << LOG_KV("lmem", alloc_lmem.first)
+                   << LOG_KV("size", alloc_lmem.second) << "\n";
     });
   }
 
@@ -582,6 +615,7 @@ void LmemAllocator::update_exclude_banks(
   bool is_npu_use, is_gdma_use;
   bool is_recent_used_banks_updated = false;
   std::set<int64_t> recent_used_banks;
+  // visit all timesteps in the buffer's life cycle
   for (int64_t ts = buffer_value.start_ts;
        (ts != ((buffer_value.end_ts + 1) % timestep_num)) || first_step;
        ts = (ts + 1) % timestep_num) {
@@ -590,7 +624,12 @@ void LmemAllocator::update_exclude_banks(
     const GdmaTsField &cur_tensors = time_step->getTensors(ts);
     is_npu_use = is_buffer_used_by_npu(buffer_key, cur_layers);
     is_gdma_use = is_buffer_used_by_gdma(buffer_key, cur_tensors, is_npu_use);
-
+    DEBUG_WITH_TYPE("assign_lmem", {
+      llvm::dbgs() << LOG_ACTION("update_exclude_banks")
+                   << LOG_STEP("find_banks_used") << LOG_KV("ts", ts)
+                   << LOG_KV("is_npu_use", is_npu_use)
+                   << LOG_KV("is_gdma_use", is_gdma_use) << "\n";
+    });
     // find the banks that have been used by npu if the current buffer is used
     // by gdma
     if (is_gdma_use || is_npu_use) {
@@ -833,8 +872,36 @@ bool LmemAllocator::assignLmemAddr(const LgInfo &lg_info,
                                    BasicTimeStepPtr &time_step,
                                    const shape_secs_t &shape_secs,
                                    bool allow_bank_conflict) {
+  /**
+   * assignLmemAddr assigns an lmem address to every mem buffer recorded
+   * in lmem_buffer_.
+   *
+   * lmem_buffer_ is a map from mem_buffer_key_t to mem_buffer_value_t.
+   *
+   * mem_buffer_value_t is defined in LayerGroupDefs.h:
+   *   typedef struct mem_buffer_value {
+   *     int64_t start_ts;
+   *     int64_t end_ts;
+   *     int64_t addr;
+   *     int64_t size;
+   *     int64_t align_bytes;
+   *   } mem_buffer_value_t;
+   *
+   * we have to fill start_ts, end_ts, addr, size and align_bytes for each
+   * mem_buffer_key_t.
+   *
+   * start_ts and end_ts are filled by
+   * TimeStepMethod::memory_aware_timestep_assignment;
+   * addr, size and align_bytes are then filled in this function
+   * (assignLmemAddr).
+   */
   PROFILE_LOG("assignLmemAddr", true);
+  // iterate over all mem_buffer_key_t and update mem_buffer_value_t.size
   time_step->update_all_mem_buffer_size(lg_info);
+
   bool one_loop =
       (shape_secs.nsecs == 1 && shape_secs.hsecs == 1 &&
        shape_secs.csecs == 1 && shape_secs.dsecs == 1 && shape_secs.wsecs == 1);
@@ -852,129 +919,219 @@ bool LmemAllocator::assignLmemAddr(const LgInfo &lg_info,
   membuf_heap_create(npu_membuf_heap, gdma_membuf_heap, membuf_list,
                      time_step);
   MemBlock alloc_lmem; // consider use alloc_position instead
-  int64_t tgt_position = 0;
+  int64_t tgt_min_address = 0;
   int64_t lmem_occupy = 0;
-  bool first_alloc = true;
+  bool is_first_alloc = true;
   mem_buffer_key_t recent_buffer_allocated;
   std::list::iterator buflist_it;
-  std::list::iterator tgt_buflist_it;
+  std::list::iterator tgt_membuf;
   while (!membuf_list.empty()) {
-    tgt_position = Arch::LMEM_BYTES;
+    tgt_min_address = Arch::LMEM_BYTES;
     DEBUG_WITH_TYPE("assign_lmem", {
-      llvm::dbgs() << "; action = assign_lmem"
-                   << "; step = initial"
-                   << "; tgt_position = " << tgt_position
-                   << "; lmem_occupy = " << lmem_occupy << "\n";
+      llvm::dbgs() << LOG_ACTION("assignLmemAddr")
+                   << LOG_STEP("start_iteration")
+                   << LOG_KV("lmem_occupy", lmem_occupy)
+                   << LOG_KV("lmem_eu_bytes", Arch::EU_BYTES)
+                   << LOG_KV("lmem_npu_num", Arch::NPU_NUM)
+                   << LOG_KV("lmem_bytes", Arch::LMEM_BYTES)
+                   << LOG_KV("lmem_banks", Arch::LMEM_BANKS)
+                   << LOG_KV("lmem_bank_bytes", Arch::LMEM_BANK_BYTES)
+                   << LOG_KV("remaining_buffers", membuf_list.size()) << "\n";
     });
+
     update_membuf_conflict_param(npu_membuf_heap, gdma_membuf_heap,
                                  membuf_list);
+
+    DEBUG_WITH_TYPE("assign_lmem_membuf_list", {
+      llvm::dbgs() << LOG_ACTION("assignLmemAddr") << LOG_STEP("before_sort")
+                   << "\n";
+
+      int i = 0;
+      for (auto &iter : membuf_list) {
+        llvm::dbgs() << LOG_KV("buf_idx", i);
+
+        if (iter.first.type == LMEM_OPERATION) {
+          llvm::dbgs() << LOG_KV("op_name", module::getName(iter.first.op));
+        } else {
+          llvm::dbgs() << LOG_KV("op_name", module::getName(iter.first.value));
+        }
+
+        llvm::dbgs() << LOG_KV("op_conflict", iter.first.conflict)
+                     << LOG_KV("op_type", iter.first.lmem_type_str())
+                     << LOG_KV("start_ts", iter.second.start_ts)
+                     << LOG_KV("area", iter.second.area) << "\n";
+        ++i;
+      }
+    });
     membuf_list.sort(membuf_sort_std_cmp);
+    DEBUG_WITH_TYPE("assign_lmem_membuf_list", {
+      llvm::dbgs() << LOG_ACTION("assignLmemAddr") << LOG_STEP("after_sort")
+                   << "\n";
+
+      int i = 0;
+      for (auto &iter : membuf_list) {
+        llvm::dbgs() << LOG_KV("buf_idx", i);
+        if (iter.first.type == LMEM_OPERATION) {
+          llvm::dbgs() << LOG_KV("op_name", module::getName(iter.first.op));
+        } else {
+          llvm::dbgs() << LOG_KV("op_name", module::getName(iter.first.value));
+        }
+
+        llvm::dbgs() << LOG_KV("op_conflict", iter.first.conflict)
+                     << LOG_KV("op_type", iter.first.lmem_type_str())
+                     << LOG_KV("start_ts", iter.second.start_ts)
+                     << LOG_KV("area", iter.second.area) << "\n";
+        ++i;
+      }
+    });
+
+    // 1. find the minimal feasible address over membuf_list, then allocate
     for (buflist_it = membuf_list.begin(); buflist_it != membuf_list.end();
          ++buflist_it) {
-      if (first_alloc) {
-        first_alloc = false;
+
+      // 1.1 the first allocation can start at address 0 directly
+      if (is_first_alloc) {
+        is_first_alloc = false;
         DEBUG_WITH_TYPE("assign_lmem", {
-          llvm::dbgs() << "; action = assign_lmem"
-                       << "; step = first_alloc"
-                       << "; op = " << module::getName(buflist_it->first.value)
-                       << "\n";
+          std::set<int64_t> used_banks;
+          find_used_banks(used_banks, 0,
+                          time_step->get_lmem_size(buflist_it->first));
+          llvm::dbgs() << LOG_ACTION("assignLmemAddr")
+                       << LOG_STEP("first_allocation")
+                       << LOG_KV("op_type", buflist_it->first.lmem_type_str())
+                       << LOG_KV("op_name",
+                                 module::getName(buflist_it->first.value))
+                       << LOG_KV("size",
+                                 time_step->get_lmem_size(buflist_it->first))
+                       << "; banks = ";
+          for (auto bank : used_banks) {
+            llvm::dbgs() << bank << ",";
          }
+          llvm::dbgs() << "\n";
         });
-        if (time_step->get_lmem_size(buflist_it->first) <= Arch::LMEM_BYTES) {
-          tgt_position = 0;
-          tgt_buflist_it = buflist_it;
-          DEBUG_WITH_TYPE("assign_lmem", {
-            llvm::dbgs() << "; action = assign_lmem"
-                         << "; step = first_alloc_success"
-                         << "; tgt_position = " << tgt_position
-                         << "; lmem_occupy = " << lmem_occupy << "\n";
-          });
-        } else {
-          DEBUG_WITH_TYPE("assign_lmem", {
-            llvm::dbgs() << "; action = assign_lmem"
-                         << "; step = find_op_assign_failed"
-                         << "; tgt_position = " << tgt_position
-                         << "; lmem_occupy = " << lmem_occupy << "; op = "
-                         << module::getName(buflist_it->first.value) << "\n";
-          });
+
+        if (time_step->get_lmem_size(buflist_it->first) > Arch::LMEM_BYTES) {
           PROFILE_LOG("assignLmemAddr", false);
           return false;
         }
+
+        tgt_min_address = 0;
+        tgt_membuf = buflist_it;
         break;
-      } else {
-        alloc_lmem = global_find_avail_lmem_localtion(
-            buffer_avail_space[buflist_it->first], buflist_it->first,
-            recent_buffer_allocated, time_step, one_loop, allow_bank_conflict);
+      }
+      // 1.2 search for an available lmem location
+      alloc_lmem = global_find_avail_lmem_localtion(
+          buffer_avail_space[buflist_it->first], buflist_it->first,
+          recent_buffer_allocated, time_step, one_loop, allow_bank_conflict);
+
+      // 1.3 early return: if this membuf cannot find an available lmem
+      // location in this round, it will not fit in any later round either
+      if (alloc_lmem.first == -1) {
         DEBUG_WITH_TYPE("assign_lmem", {
-          llvm::dbgs() << "; action = assign_lmem"
-                       << "; step = find_avail_lmem_location"
-                       << "; op = " << module::getName(buflist_it->first.value)
-                       << "\n";
+          llvm::dbgs() << LOG_ACTION("assignLmemAddr")
+                       << LOG_STEP("allocation_failed")
+                       << LOG_KV("op_type", buflist_it->first.lmem_type_str())
+                       << LOG_KV("op_name",
+                                 module::getName(buflist_it->first.value))
+                       << LOG_KV("required_size",
+                                 time_step->get_lmem_size(buflist_it->first))
+                       << LOG_KV("max_addr", Arch::LMEM_BYTES) << "\n";
         });
-        if (alloc_lmem.first != -1) {
-          if (alloc_lmem.first < tgt_position) {
-            tgt_position = alloc_lmem.first;
-            tgt_buflist_it = buflist_it;
-            DEBUG_WITH_TYPE("assign_lmem", {
-              llvm::dbgs() << "; action = assign_lmem"
-                           << "; step = update_min_tgt_position"
-                           << "; tgt_position = " << tgt_position
-                           << "; lmem_occupy = " << lmem_occupy << "\n";
-            });
-          }
-        } else {
-          DEBUG_WITH_TYPE("assign_lmem", {
-            llvm::dbgs() << "; action = assign_lmem"
-                         << "; step = find_op_assign_failed"
-                         << "; op = "
-                         << module::getName(buflist_it->first.value) << "\n";
-          });
-          PROFILE_LOG("assignLmemAddr", false);
-          return false;
-        }
+        PROFILE_LOG("assignLmemAddr", false);
+        return false;
       }
-    }
-    if (tgt_position < Arch::LMEM_BYTES) {
-      recent_buffer_allocated = tgt_buflist_it->first;
-      time_step->set_lmem_addr(tgt_buflist_it->first, tgt_position);
-      int64_t buffer_end =
-          tgt_position + time_step->get_lmem_size(tgt_buflist_it->first);
-      lmem_occupy = buffer_end > lmem_occupy ? buffer_end : lmem_occupy;
-      conflict_heap_delete(npu_membuf_heap, gdma_membuf_heap,
-                           &(tgt_buflist_it->first));
-      membuf_list.erase(tgt_buflist_it);
-      buffer_avail_space.erase(tgt_buflist_it->first);
       DEBUG_WITH_TYPE("assign_lmem", {
-        llvm::dbgs() << "; action = assign_lmem"
-                     << "; step = set_lmem_addr"
-                     << "; tgt_position = " << tgt_position
-                     << "; lmem_occupy = " << lmem_occupy
-                     << "; buffer_end = " << buffer_end << "; op = "
-                     << module::getName(tgt_buflist_it->first.value) << "\n";
+        std::set<int64_t> used_banks;
+        find_used_banks(used_banks, alloc_lmem.first, alloc_lmem.second);
+        llvm::dbgs()
+            << LOG_ACTION("assignLmemAddr")
+            << LOG_STEP("found_available_location")
+            << LOG_KV("op_type", buflist_it->first.lmem_type_str())
+            << LOG_KV("op_name", module::getName(buflist_it->first.value))
+            << LOG_KV("addr", llvm::format_hex(alloc_lmem.first, 8))
+            << LOG_KV("size", alloc_lmem.second)
+            << LOG_KV(
+                   "timestep",
+                   time_step->get_lmem_buffer_value(buflist_it->first).start_ts)
+            << "->"
+            << time_step->get_lmem_buffer_value(buflist_it->first).end_ts
+            << "; banks = ";
+        for (auto bank : used_banks) {
+          llvm::dbgs() << bank << ",";
+        }
+        llvm::dbgs() << "\n";
       });
-    } else {
+      // 1.4 update tgt_min_address and tgt_membuf
+      if (alloc_lmem.first < tgt_min_address) {
+        tgt_min_address = alloc_lmem.first;
+        tgt_membuf = buflist_it;
+        DEBUG_WITH_TYPE("assign_lmem", {
+          llvm::dbgs() << LOG_ACTION("assignLmemAddr")
+                       << LOG_STEP("update_min_tgt_min_address")
+                       << LOG_KV("tgt_min_address", tgt_min_address)
+                       << LOG_KV("lmem_occupy", lmem_occupy) << "\n";
+        });
+      }
+    }
+
+    // 2.a if no available lmem location was found after the search,
+    // return false
+    if (tgt_min_address >= Arch::LMEM_BYTES) {
       llvm::errs() << "Cannot find local memory location for memory buffers\n";
-      DEBUG_WITH_TYPE("assign_lmem", {
-        llvm::dbgs() << "; action = assign_lmem"
-                     << "; step = op_assign_failed_in_loop_end"
-                     << "; op = " << module::getName(buflist_it->first.value)
-                     << "\n";
-      });
       PROFILE_LOG("assignLmemAddr", false);
       return false;
     }
+
+    // 2.b allocate the chosen address to this membuf
+    recent_buffer_allocated = tgt_membuf->first;
+    time_step->set_lmem_addr(tgt_membuf->first, tgt_min_address);
+    int64_t buffer_end =
+        tgt_min_address + time_step->get_lmem_size(tgt_membuf->first);
+    lmem_occupy = buffer_end > lmem_occupy ? buffer_end : lmem_occupy;
+    conflict_heap_delete(npu_membuf_heap, gdma_membuf_heap,
+                         &(tgt_membuf->first));
+    membuf_list.erase(tgt_membuf);
+    buffer_avail_space.erase(tgt_membuf->first);
+    DEBUG_WITH_TYPE("assign_lmem", {
+      std::set<int64_t> used_banks;
+      find_used_banks(used_banks, tgt_min_address,
+                      time_step->get_lmem_size(tgt_membuf->first));
+      llvm::dbgs()
+          << LOG_ACTION("assignLmemAddr") << LOG_STEP("allocated_memory")
+          << LOG_KV("op_type", tgt_membuf->first.lmem_type_str())
+          << LOG_KV("op_name", module::getName(tgt_membuf->first.value))
+          << LOG_KV("addr", llvm::format_hex(tgt_min_address, 8))
+          << LOG_KV("size", time_step->get_lmem_size(tgt_membuf->first))
+          << LOG_KV(
+                 "timestep_start",
+                 time_step->get_lmem_buffer_value(tgt_membuf->first).start_ts)
+          << LOG_KV("timestep_end",
+                    time_step->get_lmem_buffer_value(tgt_membuf->first).end_ts)
+          << LOG_KV("timestep_mode",
+                    time_step->get_tensor_mode_str(tgt_membuf->first.value))
+          << LOG_KV("lmem_occupy", lmem_occupy) << "; banks = ";
+      for (auto bank : used_banks) {
+        llvm::dbgs() << bank << ",";
      }
+      llvm::dbgs() << "\n";
+    });
   }
 
   time_step->set_lmem_occupy(lmem_occupy);
-  assignL2memAddr(lg_info, time_step);
+  assignL2memAddr(lg_info, time_step);
+
   DEBUG_WITH_TYPE("assign_lmem", {
-    llvm::dbgs() << "; action = assign_lmem"
-                 << "; step = final_assign_lmem_success"
-                 << "\n";
+    llvm::dbgs() << LOG_ACTION("assignLmemAddr") << LOG_STEP("completed")
+                 << LOG_KV("total_lmem_used", lmem_occupy)
+                 << LOG_KV("utilization",
+                           (lmem_occupy * 100.0 / Arch::LMEM_BYTES))
+                 << "%\n";
   });
+
   PROFILE_LOG("assignLmemAddr", false);
   return true;
 }
@@ -1215,14 +1372,14 @@ void LmemAllocator::sc_method_multi_core(const LgInfo &lg_info,
         try_this_shape_secs(lg_info, shape_secs, allow_bank_conflict,
                             time_step);
     if (ret >= SECS_VALID) {
      DEBUG_WITH_TYPE("shape_secs", {
-        llvm::dbgs() << "; action = shape_secs"
-                     << "; step = sc_method_multi_core"
-                     << "; nsecs = " << shape_secs.nsecs
-                     << "; csecs = " << shape_secs.csecs
-                     << "; dsecs = " << shape_secs.dsecs
-                     << "; hsecs = " << shape_secs.hsecs
-                     << "; wsecs = " << shape_secs.wsecs
-                     << "; cost = " << last_group_cost_ << "\n";
+        llvm::dbgs() << LOG_ACTION("shape_secs")
+                     << LOG_STEP("sc_method_multi_core")
+                     << LOG_KV("nsecs", shape_secs.nsecs)
+                     << LOG_KV("csecs", shape_secs.csecs)
+                     << LOG_KV("dsecs", shape_secs.dsecs)
+                     << LOG_KV("hsecs", shape_secs.hsecs)
+                     << LOG_KV("wsecs", shape_secs.wsecs)
+                     << LOG_KV("cost", last_group_cost_) << "\n";
       });
     }
   }
@@ -1280,15 +1437,15 @@ void LmemAllocator::sc_method_multi_core_v2(const LgInfo &lg_info,
 
     if (ret >= SECS_VALID) {
       DEBUG_WITH_TYPE("shape_secs", {
-        llvm::dbgs() << "; action = shape_secs"
-                     << "; step = sc_method_multi_core_v2"
-                     << "; nch_secs = " << i
-                     << "; nsecs = " << core_shape_secs.nsecs
-                     << "; csecs = " << core_shape_secs.csecs
-                     << "; dsecs = " << core_shape_secs.dsecs
-                     << "; hsecs = " << core_shape_secs.hsecs
-                     << "; wsecs = " << core_shape_secs.wsecs
-                     << "; cost = " << last_group_cost_ << "\n";
+        llvm::dbgs() << LOG_ACTION("shape_secs")
+                     << LOG_STEP("sc_method_multi_core_v2")
+                     << LOG_KV("nch_secs", i)
+                     << LOG_KV("nsecs", core_shape_secs.nsecs)
+                     << LOG_KV("csecs", core_shape_secs.csecs)
+                     << LOG_KV("dsecs", core_shape_secs.dsecs)
+                     << LOG_KV("hsecs", core_shape_secs.hsecs)
+                     << LOG_KV("wsecs", core_shape_secs.wsecs)
+                     << LOG_KV("cost", last_group_cost_) << "\n";
       });
     }
     if (not_best_count >= MAX_TRY_NUM) {
@@ -1333,14 +1490,14 @@ void LmemAllocator::sc_method_multi_core_v3(const LgInfo &lg_info,
                             allow_bank_conflict, time_step);
     if (ret >= SECS_VALID) {
       DEBUG_WITH_TYPE("shape_secs", {
-        llvm::dbgs() << "; action = shape_secs"
-                     << "; step = sc_method_multi_core_v3"
-                     << "; nsecs = " << shape_secs.nsecs
-                     << "; csecs = " << shape_secs.csecs
-                     << "; dsecs = " << shape_secs.dsecs
-                     << "; hsecs = " << shape_secs.hsecs
-                     << "; wsecs = " << shape_secs.wsecs
-                     << "; cost = " << last_group_cost_ << "\n";
+        llvm::dbgs() << LOG_ACTION("shape_secs")
+                     << LOG_STEP("sc_method_multi_core_v3")
+                     << LOG_KV("nsecs", shape_secs.nsecs)
+                     << LOG_KV("csecs", shape_secs.csecs)
+                     << LOG_KV("dsecs", shape_secs.dsecs)
+                     << LOG_KV("hsecs", shape_secs.hsecs)
+                     << LOG_KV("wsecs", shape_secs.wsecs)
+                     << LOG_KV("cost", last_group_cost_) << "\n";
       });
     }
   }
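The restructured assignLmemAddr loop above is, at its core, the following
greedy scheme: every unallocated buffer proposes the lowest address at which
it fits beside already-placed buffers whose life cycles overlap its own, the
buffer with the smallest proposal is placed, and the search repeats. A toy
Python rendering of just that core (bank exclusion, alignment and the
conflict heaps are omitted; LMEM_BYTES here is an illustrative constant, not
the real arch value):

  LMEM_BYTES = 16 * 1024

  def overlaps(a, b):  # time intervals, ignoring wrap-around for brevity
      return not (a[1] < b[0] or b[1] < a[0])

  def lowest_fit(size, taken):
      """First-fit address given occupied [start, end) address spans."""
      addr = 0
      for s, e in sorted(taken):
          if addr + size <= s:
              break
          addr = max(addr, e)
      return addr if addr + size <= LMEM_BYTES else None

  def assign(buffers):  # buffers: {name: (start_ts, end_ts, size)}
      placed = {}
      while len(placed) < len(buffers):
          best = None
          for name, (st, et, size) in buffers.items():
              if name in placed:
                  continue
              taken = [(placed[n], placed[n] + buffers[n][2])
                       for n in placed
                       if overlaps((st, et), buffers[n][:2])]
              addr = lowest_fit(size, taken)
              if addr is None:
                  return None  # no fit now means no fit later; give up
              if best is None or addr < best[1]:
                  best = (name, addr)
          placed[best[0]] = best[1]
      return placed

Like the real pass's early return, the sketch fails as soon as any buffer has
no feasible location, since later rounds only add constraints.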
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/SwPipeline.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/SwPipeline.cpp
index b55b3c588..f7b1f8c67 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/SwPipeline.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/SwPipeline.cpp
@@ -94,7 +94,8 @@ int64_t SoftwarePipeline::software_pipeline_schedule(
 
   // delete the last row of time step table
   timestep_table.erase(last_row_iter);
-  // move the last tensor timestep to the first
+
+  // 1. (try to) move the last tensor timestep to the first
   bool move_valid;
   // consider time step 1, it is the second row of the table
   auto second_row_iter = timestep_table.begin() + 1;
@@ -103,6 +104,8 @@ int64_t SoftwarePipeline::software_pipeline_schedule(
   for (uint32_t i = 0; i < last_tensor_timestep.size(); ++i) {
     move_valid = true;
     auto v = last_tensor_timestep[i].first;
+    // if v is used by a tpu op in the second row (as operand or result),
+    // it cannot be moved from the last row to the second row (move_valid = false)
     for (auto op : second_row_iter->tpu0_ts_field) {
       auto opds = op->getOperands();
       auto results = get_output_values(op);
@@ -128,12 +131,18 @@ int64_t SoftwarePipeline::software_pipeline_schedule(
     timestep_table[0].gdma0_ts_field = rest_last_tensors_;
   }
 
-  // move the first tensor timestep to the last
+  // 2. (try to) move the first tensor timestep to the last
   last_row_iter = timestep_table.end() - 1;
+  // consider time step n-1, it is the (n-1)-th row of the table
   GdmaTsField rest_first_tensors_;
   for (uint32_t i = 0; i < first_tensor_timestep.size(); ++i) {
     move_valid = true;
     auto v = first_tensor_timestep[i].first;
+
+    // if v is used by a tpu op in the "new" last row (as an operand), it
+    // cannot be moved from the first row to the "new" last row
+    // (move_valid = false); note: the "new" last row is the row just before
+    // the original last row, which was deleted above
     for (auto op : last_row_iter->tpu0_ts_field) {
       auto opds = op->getOperands();
       if (std::find(opds.begin(), opds.end(), v) != opds.end()) {
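The staging that the comments above describe can be visualized with a small
script: with three pipeline stages, timestep t loads for layer t, computes
layer t-1 and stores layer t-2, which is the steady-state overlap the
timestep table shows after pipelining. Purely illustrative, not the pass's
actual scheduling code:

  layers = ["A", "B", "C"]
  n = len(layers)
  for t in range(n + 2):
      load = layers[t] if t < n else "-"
      compute = layers[t - 1] if 0 <= t - 1 < n else "-"
      store = layers[t - 2] if 0 <= t - 2 < n else "-"
      print(f"ts{t}: load={load} compute={compute} store={store}")

  # ts0: load=A compute=- store=-
  # ts1: load=B compute=A store=-
  # ts2: load=C compute=B store=A
  # ts3: load=- compute=C store=B
  # ts4: load=- compute=- store=C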
diff --git a/lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp b/lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp
index 19c42e190..f7f989095 100644
--- a/lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp
+++ b/lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp
@@ -24,10 +24,18 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
   Operation *op;
   tensor_info_t tensor_info;
 
+  // in the nearest algorithm, each op's computation is assigned to its own
+  // timestep
   for (size_t i = 0; i < group_ops.size(); ++i) {
     op = group_ops[i];
-    // layer: 0
+    // timestep 0: load the first layer's inputs into lmem; the layer itself
+    // gets its tpu_field in the next timestep
+    DEBUG_WITH_TYPE("timestep_assign", {
+      llvm::dbgs() << "; action = layer_nearest_timestep_assignment"
+                   << "; ts = " << i << "\n";
+    });
     if (i == 0) {
+      // stage 0: a load-only timestep
       gdma_field.clear();
       have_load_tensor = false;
       for (auto in : op->getOperands()) {
@@ -52,12 +60,17 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
 
     tpu_field.clear();
     gdma_field.clear();
+
+    // stage 1: in the pipeline, compute, load and store ops share the same
+    // timestep
     for (auto out : get_output_values(op)) {
       tensor_in_lmem.insert(out);
     }
+    // stage 1.1: the current layer's computation joins this timestep
     tpu_field.push_back(op);
 
     // layer: [1, N-1)
+    // stage 1.2: preload the next layer's inputs in the current timestep
     if (i != group_ops.size() - 1) {
       auto next_op = group_ops[i + 1];
       for (auto next_in : next_op->getOperands()) {
@@ -76,6 +89,8 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
       }
     }
 
+    // layer: [1, N-1)
+    // stage 1.3: store the previous layer's outputs back to global memory
     if (i > 0) {
       auto pre_op = group_ops[i - 1];
       for (auto pre_out : get_output_values(pre_op)) {
@@ -88,10 +103,15 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
       }
     }
 
+    // stage 1: finally commit the tpu and gdma fields as one timestep
     if (!(tpu_field.empty() && gdma_field.empty())) {
       time_step->add_tpu0_gdma0_ts_field(tpu_field, gdma_field);
     }
 
+    // last layer
+    // stage 2: the last layer's outputs are stored back to global memory in
+    // a new, gdma-only timestep
     if (i == group_ops.size() - 1) {
       gdma_field.clear();
       for (auto out : get_output_values(op)) {
@@ -102,11 +122,18 @@ void TimeStepMethod::layer_nearest_timestep_assignment(BasicTimeStep *time_step,
       time_step->add_gdma0_ts_field(gdma_field);
     }
   }
-
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "============= nearest algorithm =============\n";
+    time_step->show_timestep_table();
+  });
   // use software pipeline
   if (group_ops.size() > 1) {
     time_step->software_pipeline();
   }
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "============= software pipeline =============\n";
+    time_step->show_timestep_table();
+  });
 }
 
 bool is_tensor_accessed_by_npu(Value v, BasicTimeStep *time_step, int64_t ts) {
@@ -239,6 +266,10 @@ void TimeStepMethod::memory_aware_timestep_assignment(BasicTimeStep *time_step,
   ValueIntMap tensor_to_bufsize;
   std::vector> tensor_timesteps;
 
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "============= memory aware algorithm =============\n";
+  });
+
   // remove it after pid_node is extracted
 #pragma omp critical(get_cycle)
   get_timestep_cycle_slack(time_step, lg_info, tensor_to_cycle,
@@ -275,6 +306,11 @@ void TimeStepMethod::memory_aware_timestep_assignment(BasicTimeStep *time_step,
     }
     time_step->update_gdma0_ts_field(ts, new_tensor_timestep);
   }
+  time_step->show_timestep_table();
+
+  DEBUG_WITH_TYPE("timestep_assign", {
+    llvm::dbgs() << "=======================================\n";
+  });
 }
 
 void TimeStepMethod::get_timestep_cycle_slack(
diff --git a/lib/Support/Module.cpp b/lib/Support/Module.cpp
index 99fb04063..414f362c2 100644
--- a/lib/Support/Module.cpp
+++ b/lib/Support/Module.cpp
@@ -868,6 +868,13 @@ bool isOpInGroupParallel(Operation *Op) {
   return false;
 }
 
+bool isValueBlockArgument(Value v) {
+  if (auto blockArg = dyn_cast<BlockArgument>(v)) {
+    return true;
+  }
+  return false;
+}
+
 // op in [CoreBegin, CoreEnd]
 bool isOpInCoreMatch(Operation *op) {
   while (!op->use_empty()) {
diff --git a/python/tools/logdebug_tool.py b/python/tools/logdebug_tool.py
index 837f3f644..6ce1500b2 100755
--- a/python/tools/logdebug_tool.py
+++ b/python/tools/logdebug_tool.py
@@ -13,7 +13,7 @@ import pandas as pd
 
 from itertools import combinations
 
-kv_pattern = re.compile(r";\s*([\w/.]+)\s*=\s*(['\"]?)([:\w\s/\-\".]+)\2")
+kv_pattern = re.compile(r";\s*([\w/.]+)\s*=\s*(['\"]?)([:,_\w\s/\-\".]+)\2")
 
 
 def comsume_in_main(func):
@@ -48,7 +48,7 @@ def parse_dic(line, filter=None):
     dic = {}
     for k, _, v in ret:
         try:
-            dic[k] = int(v)
+            dic[k] = int(v) if v.isdigit() else v.strip()
         except Exception:
             dic[k] = v.strip()
     return dic
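Finally, a sketch of how these logs are consumed end to end. Assuming a debug
build was run with the relevant debug types enabled and stderr captured, e.g.
with LLVM's "-debug-only=assign_lmem 2> lg.log" (the file name is arbitrary),
the key/value records can be lifted into a DataFrame with the same pattern
logdebug_tool.py uses:

  import re
  import pandas as pd

  kv_pattern = re.compile(r";\s*([\w/.]+)\s*=\s*(['\"]?)([:,_\w\s/\-\".]+)\2")

  def load_log(path, action):
      rows = []
      with open(path) as f:
          for line in f:
              kvs = {k: v.strip() for k, _, v in kv_pattern.findall(line)}
              if kvs.get("action") == action:
                  rows.append(kvs)
      return pd.DataFrame(rows)

  # every allocation decision made by assignLmemAddr, one row per record;
  # the column names follow the LOG_KV keys emitted by this patch
  df = load_log("lg.log", "assignLmemAddr")
  print(df[["step", "op_name", "addr", "size"]].dropna().head())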