From 81abcf9e741a7c0bcc10e514662f87896f1fe8dc Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Sat, 29 Mar 2025 01:47:32 +0800 Subject: [PATCH] smart_ptr multi-turn demo --- .../oss_scripts/llama/runner/io_manager.cpp | 77 +++++++++++++++++++ .../oss_scripts/llama/runner/io_manager.h | 11 +++ .../oss_scripts/llama/runner/runner.cpp | 57 +++++++++++--- 3 files changed, 135 insertions(+), 10 deletions(-) diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp index ce7baefa080..640896e71be 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp @@ -564,6 +564,77 @@ void ShiftPointerIoMgr::update_prefill_to_kv_io( } } +void ShiftPointerIoMgr::update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) { + std::vector>& v_cache_in_prefill = + v_cache_in_[prefill_forward_name_]; + std::vector>& v_cache_in_kv = + v_cache_in_[kv_forward_name_]; + std::vector>& v_cache_out_prefill = + v_cache_out_[prefill_forward_name_]; + std::vector>& v_cache_out_kv = + v_cache_out_[kv_forward_name_]; + + // update v_cache + // this is critical to make generated v_cache always aligned in both prefill & decode mode + size_t prefill_offset = (kv_cache_len_ - prefill_cache_len_) * head_dim_; + for (int i = 0, v_stride = head_dim_*pos; i < v_cache_in_prefill.size(); ++i) { + v_cache_in_prefill[i]->set_data(v_cache_in_kv[i]->mutable_data() + + prefill_offset); + v_cache_out_prefill[i]->set_data(v_cache_out_kv[i]->mutable_data()); + // reset decode mode pointer since it will be updated again in update_prefill_to_kv + v_cache_in_kv[i]->set_data(v_cache_in_kv[i]->mutable_data() - v_stride); + v_cache_out_kv[i]->set_data(v_cache_out_kv[i]->mutable_data() - v_stride); + } + + // make framework aware that output tensor pointers have changed + for (int shard = 0; shard < output_tensors.size(); shard++) { + for (int index = 0; index < 
output_tensors[shard].size(); index++) { + ET_CHECK_MSG( + modules_[shard]->set_output( + prefill_forward_name_, output_tensors[shard][index], index) == + Error::Ok, + "failed to set output tensor for module %d's %d'th output " + "while updating kv_cache output tensors", + shard, + index); + } + } + + // update k_cache + size_t copied_size = pos * sizeof(uint8_t); + std::vector>& k_cache_in_prefill = + k_cache_in_[prefill_forward_name_]; + std::vector>& k_cache_in_kv = + k_cache_in_[kv_forward_name_]; + + for (int i = 0; i < k_cache_in_prefill.size(); i++) { + // k_cache_in should always be the same between prefill & decode + k_cache_in_prefill[i]->set_data(k_cache_in_kv[i]->mutable_data()); + // always do deep copy from origin because the consumed tensor size is different in prefill & decode + uint8_t* ptr_in = k_cache_in_prefill[i]->mutable_data() - pos; + // reset decode mode pointer since it will be updated again in update_prefill_to_kv + k_cache_in_kv[i]->set_data(ptr_in); + // in order not to overwrite existing k_cache_out + // update_prefill_to_kv: copy from last dimension + // update_kv_to_prefill: copy from first dimension; dst/src rows can overlap, hence memmove + for (int j = 0; j <= head_dim_; ++j) { + uint8_t* dst = ptr_in + j * prefill_cache_len_; + const uint8_t* src = ptr_in + j * kv_cache_len_; + memmove(dst, src, copied_size); + } + } + + // probably could be more efficient + IO* ptr = static_cast(data_ptr_.get()); + for (int i = 0; i < pos; i++) { + int offset = context_len_ - prefill_ar_len_ - i - 1; + for (int j = 0; j < prefill_ar_len_; j++) { + ptr->prefill_attention_mask[j * context_len_ + offset] = 65535; + } + } +} + void ShiftPointerIoMgr::update_kv_io( int64_t cur_token, int64_t pos, @@ -667,6 +738,7 @@ void ShiftPointerIoMgr::fill_prefill_toks( int64_t start_pos, std::vector& prompt_tokens) { IO* ptr = static_cast(get_mutable_ptr()); + for (int i = 0; i < prefill_ar_len_; i++) { if (!is_bert_) { ptr->prefill_input_pos[i] = start_pos + i; @@ -1360,6 +1432,11 @@ void 
SmartMaskIoMgr::update_prefill_to_kv_io( } } +void SmartMaskIoMgr::update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) { +} + void SmartMaskIoMgr::update_prefill_io( int64_t cur_token, int64_t pos, diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h index 03808ede3bf..d91f650d273 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h +++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h @@ -55,6 +55,9 @@ class IoMgrBase { int64_t cur_token, int64_t pos, std::vector>& output_tensors) = 0; + virtual void update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) = 0; virtual void update_kv_io( int64_t cur_token, int64_t pos, @@ -126,6 +129,10 @@ class ShiftPointerIoMgr : public IoMgrBase { int64_t pos, std::vector>& output_tensors) override; + void update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) + override; void update_kv_io( int64_t cur_token, int64_t pos, @@ -234,6 +241,10 @@ class SmartMaskIoMgr : public IoMgrBase { int64_t pos, std::vector>& output_tensors) override; + void update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) + override; void update_kv_io( int64_t cur_token, int64_t pos, diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index dafc911a172..0eda06469c8 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -57,7 +57,8 @@ Runner::Runner( performance_output_path_(performance_output_path), logits_scale_(logits_scale), logits_offset_(logits_offset), - temperature_(temperature), + // hardcoded here for comparing with goldens + temperature_(0), eval_mode_(static_cast(eval_mode)), kv_updater_(kv_updater), num_iters_(num_iters) { @@ -367,11 +368,12 @@ Error Runner::generate( pos += prefill_ar_len_; } Tensor& logits_tensor = 
output_tensors[method_name].back()[0]; - prev_token = prompt_tokens[num_prompt_tokens - 1]; + prev_token = prompt_tokens.back(); long sample_start_time_ms = time_in_ms(); cur_token = logitsToToken( logits_tensor, (num_prompt_tokens + prefill_ar_len_ - 1) % prefill_ar_len_); + stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; auto piece_res = tokenizer_->decode(prev_token, cur_token); @@ -380,14 +382,17 @@ Error Runner::generate( token_callback(piece_res.get().c_str()); } - pos = num_prompt_tokens; + pos = prompt_tokens.size(); stats_.first_token_ms = time_in_ms(); stats_.prompt_eval_end_ms = time_in_ms(); }; auto kv_execute = [&](const std::string& method_name) { io_mgr_->fill_kv_tok_mask(pos, cur_token); - while (pos < seq_len - 1) { + // force decode to generate 5 runs at most + int64_t max_pos = std::min(pos + 5, (int64_t)seq_len - 1); + //while (pos < seq_len - 1) { + while (pos < max_pos) { // inference run_model_step(method_name, inputs[method_name]); Tensor& logits_tensor = output_tensors[method_name].back()[0]; @@ -401,6 +406,7 @@ Error Runner::generate( } } prev_token = cur_token; + prompt_tokens.push_back(prev_token); long sample_start_time_ms = time_in_ms(); cur_token = logitsToToken(logits_tensor, pos); stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; @@ -427,12 +433,43 @@ Error Runner::generate( case EvalMode::kKVCached: kv_execute(kv_forward_name_); break; - case EvalMode::kHybrid: - prefill_execute(prefill_forward_name_); - io_mgr_->update_prefill_to_kv_io( - cur_token, pos, output_tensors[kv_forward_name_]); - kv_execute(kv_forward_name_); - break; + case EvalMode::kHybrid: { + std::vector> new_prompt_token; + new_prompt_token.push_back({7826, 4257, 365, 2354, 29889}); + new_prompt_token.push_back({902, 304, 952, 322, 902, 25448, 304, 29891, 471, 263, 4802, 29892}); + new_prompt_token.push_back({29892, 365, 2354, 29915}); + new_prompt_token.push_back({1371, 902, 411, 278, 425, 870, 719}); + 
new_prompt_token.push_back({304, 1371, 322, 1183, 1925, 599, 278, 22095, 297, 278, 471, 2790, 4933, 29889, 29871, 13, 13555, 278, 22095, 892, 471, 17143, 29892, 365, 2354, 29915, 29879, 16823, 4433, 902, 304, 1371, 902, 13958, 963, 701, 304, 15589}); + new_prompt_token.push_back({}); + for (int i = 0; i < new_prompt_token.size(); ++i) { + prefill_execute(prefill_forward_name_); + io_mgr_->update_prefill_to_kv_io( + cur_token, pos, output_tensors[kv_forward_name_]); + kv_execute(kv_forward_name_); + io_mgr_->update_kv_to_prefill_io( + pos, output_tensors[prefill_forward_name_]); + + // check if generated tokens match goldens + ET_LOG(Info, "Current tokens after turn %d:", i); + for (size_t k = 0; k < prompt_tokens.size(); k += 10) { + std::string tokens; + for (size_t t = k; t < std::min(k+10, prompt_tokens.size()); ++t) { + tokens += std::to_string(prompt_tokens[t]) + " "; + } + ET_LOG(Info, "%s", tokens.c_str()); + } + // convert tokens into text + for (int j = 1; j < new_prompt_token[i].size(); ++j) { + auto piece = tokenizer_->decode(new_prompt_token[i][j-1], new_prompt_token[i][j]); + if (token_callback) { + token_callback(piece.get().c_str()); + } + } + prompt_tokens.insert( + prompt_tokens.end(), begin(new_prompt_token[i]), end(new_prompt_token[i])); + num_prompt_tokens = new_prompt_token[i].size(); + } + } break; default: ET_CHECK_MSG(false, "Unsupported eval mode"); break;