From 81abcf9e741a7c0bcc10e514662f87896f1fe8dc Mon Sep 17 00:00:00 2001 From: haowhsu-quic Date: Sat, 29 Mar 2025 01:47:32 +0800 Subject: [PATCH] smart_ptr multi-turn demo --- .../oss_scripts/llama/runner/io_manager.cpp | 77 +++++++++++++++++++ .../oss_scripts/llama/runner/io_manager.h | 11 +++ .../oss_scripts/llama/runner/runner.cpp | 57 +++++++++++--- 3 files changed, 135 insertions(+), 10 deletions(-) diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp index ce7baefa080..640896e71be 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp @@ -564,6 +564,77 @@ void ShiftPointerIoMgr::update_prefill_to_kv_io( } } +void ShiftPointerIoMgr::update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) { + std::vector>& v_cache_in_prefill = + v_cache_in_[prefill_forward_name_]; + std::vector>& v_cache_in_kv = + v_cache_in_[kv_forward_name_]; + std::vector>& v_cache_out_prefill = + v_cache_out_[prefill_forward_name_]; + std::vector>& v_cache_out_kv = + v_cache_out_[kv_forward_name_]; + + // update v_cache + // this is critical to make generated v_cache always aligned in both prefill & decode mode + size_t prefill_offset = (kv_cache_len_ - prefill_cache_len_) * head_dim_; + for (int i = 0, v_stride = head_dim_*pos; i < v_cache_in_prefill.size(); ++i) { + v_cache_in_prefill[i]->set_data(v_cache_in_kv[i]->mutable_data() + + prefill_offset); + v_cache_out_prefill[i]->set_data(v_cache_out_kv[i]->mutable_data()); + // reset decode mode pointer since it will be updated again in update_prefill_to_kv + v_cache_in_kv[i]->set_data(v_cache_in_kv[i]->mutable_data() - v_stride); + v_cache_out_kv[i]->set_data(v_cache_out_kv[i]->mutable_data() - v_stride); + } + + // make framework aware that output tensor pointers have changed + for (int shard = 0; shard < output_tensors.size(); shard++) { + for (int index = 0; index < 
output_tensors[shard].size(); index++) { + ET_CHECK_MSG( + modules_[shard]->set_output( + prefill_forward_name_, output_tensors[shard][index], index) == + Error::Ok, + "failed to set output tensor for module %d's %d'th output " + "while updating kv_cache output tensors", + shard, + index); + } + } + + // update k_cache + size_t copied_size = pos * sizeof(uint8_t); + std::vector>& k_cache_in_prefill = + k_cache_in_[prefill_forward_name_]; + std::vector>& k_cache_in_kv = + k_cache_in_[kv_forward_name_]; + + for (int i = 0; i < k_cache_in_prefill.size(); i++) { + // k_cache_in should always be the same between prefill & decode + k_cache_in_prefill[i]->set_data(k_cache_in_kv[i]->mutable_data()); + // always do deep copy from origin because the consumed tensor size is different in prefill & decode + uint8_t* ptr_in = k_cache_in_prefill[i]->mutable_data() - pos; + // reset decode mode pointer since it will be updated again in update_prefill_to_kv + k_cache_in_kv[i]->set_data(ptr_in); + // in order not to overwrite existing k_cache_out + // update_prefill_to_kv: copy from last dimension + // update_kv_to_prefill: copy from first dimension; dst/src rows can overlap, hence memmove + for (int j = 0; j <= head_dim_; ++j) { + uint8_t* dst = ptr_in + j * prefill_cache_len_; + const uint8_t* src = ptr_in + j * kv_cache_len_; + memmove(dst, src, copied_size); + } + } + + // probably could be more efficient + IO* ptr = static_cast(data_ptr_.get()); + for (int i = 0; i < pos; i++) { + int offset = context_len_ - prefill_ar_len_ - i - 1; + for (int j = 0; j < prefill_ar_len_; j++) { + ptr->prefill_attention_mask[j * context_len_ + offset] = 65535; + } + } +} + void ShiftPointerIoMgr::update_kv_io( int64_t cur_token, int64_t pos, @@ -667,6 +738,7 @@ void ShiftPointerIoMgr::fill_prefill_toks( int64_t start_pos, std::vector& prompt_tokens) { IO* ptr = static_cast(get_mutable_ptr()); + for (int i = 0; i < prefill_ar_len_; i++) { if (!is_bert_) { ptr->prefill_input_pos[i] = start_pos + i; @@ -1360,6 +1432,11 @@ void 
SmartMaskIoMgr::update_prefill_to_kv_io( } } +void SmartMaskIoMgr::update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) { +} + void SmartMaskIoMgr::update_prefill_io( int64_t cur_token, int64_t pos, diff --git a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h index 03808ede3bf..d91f650d273 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/io_manager.h +++ b/examples/qualcomm/oss_scripts/llama/runner/io_manager.h @@ -55,6 +55,9 @@ class IoMgrBase { int64_t cur_token, int64_t pos, std::vector>& output_tensors) = 0; + virtual void update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) = 0; virtual void update_kv_io( int64_t cur_token, int64_t pos, @@ -126,6 +129,10 @@ class ShiftPointerIoMgr : public IoMgrBase { int64_t pos, std::vector>& output_tensors) override; + void update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) + override; void update_kv_io( int64_t cur_token, int64_t pos, @@ -234,6 +241,10 @@ class SmartMaskIoMgr : public IoMgrBase { int64_t pos, std::vector>& output_tensors) override; + void update_kv_to_prefill_io( + int64_t pos, + std::vector>& output_tensors) + override; void update_kv_io( int64_t cur_token, int64_t pos, diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index dafc911a172..0eda06469c8 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -57,7 +57,8 @@ Runner::Runner( performance_output_path_(performance_output_path), logits_scale_(logits_scale), logits_offset_(logits_offset), - temperature_(temperature), + // hardcoded here for comparing with goldens + temperature_(0), eval_mode_(static_cast(eval_mode)), kv_updater_(kv_updater), num_iters_(num_iters) { @@ -367,11 +368,12 @@ Error Runner::generate( pos += prefill_ar_len_; } Tensor& logits_tensor = 
output_tensors[method_name].back()[0]; - prev_token = prompt_tokens[num_prompt_tokens - 1]; + prev_token = prompt_tokens.back(); long sample_start_time_ms = time_in_ms(); cur_token = logitsToToken( logits_tensor, (num_prompt_tokens + prefill_ar_len_ - 1) % prefill_ar_len_); + stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; auto piece_res = tokenizer_->decode(prev_token, cur_token); @@ -380,14 +382,17 @@ Error Runner::generate( token_callback(piece_res.get().c_str()); } - pos = num_prompt_tokens; + pos = prompt_tokens.size(); stats_.first_token_ms = time_in_ms(); stats_.prompt_eval_end_ms = time_in_ms(); }; auto kv_execute = [&](const std::string& method_name) { io_mgr_->fill_kv_tok_mask(pos, cur_token); - while (pos < seq_len - 1) { + // force decode to generate 5 runs at most + int64_t max_pos = std::min(pos + 5, (int64_t)seq_len - 1); + //while (pos < seq_len - 1) { + while (pos < max_pos) { // inference run_model_step(method_name, inputs[method_name]); Tensor& logits_tensor = output_tensors[method_name].back()[0]; @@ -401,6 +406,7 @@ Error Runner::generate( } } prev_token = cur_token; + prompt_tokens.push_back(prev_token); long sample_start_time_ms = time_in_ms(); cur_token = logitsToToken(logits_tensor, pos); stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms; @@ -427,12 +433,43 @@ Error Runner::generate( case EvalMode::kKVCached: kv_execute(kv_forward_name_); break; - case EvalMode::kHybrid: - prefill_execute(prefill_forward_name_); - io_mgr_->update_prefill_to_kv_io( - cur_token, pos, output_tensors[kv_forward_name_]); - kv_execute(kv_forward_name_); - break; + case EvalMode::kHybrid: { + std::vector> new_prompt_token; + new_prompt_token.push_back({7826, 4257, 365, 2354, 29889}); + new_prompt_token.push_back({902, 304, 952, 322, 902, 25448, 304, 29891, 471, 263, 4802, 29892}); + new_prompt_token.push_back({29892, 365, 2354, 29915}); + new_prompt_token.push_back({1371, 902, 411, 278, 425, 870, 719}); + 
new_prompt_token.push_back({304, 1371, 322, 1183, 1925, 599, 278, 22095, 297, 278, 471, 2790, 4933, 29889, 29871, 13, 13555, 278, 22095, 892, 471, 17143, 29892, 365, 2354, 29915, 29879, 16823, 4433, 902, 304, 1371, 902, 13958, 963, 701, 304, 15589}); + new_prompt_token.push_back({}); + for (int i = 0; i < new_prompt_token.size(); ++i) { + prefill_execute(prefill_forward_name_); + io_mgr_->update_prefill_to_kv_io( + cur_token, pos, output_tensors[kv_forward_name_]); + kv_execute(kv_forward_name_); + io_mgr_->update_kv_to_prefill_io( + pos, output_tensors[prefill_forward_name_]); + + // check if generated tokens match goldens + ET_LOG(Info, "Current tokens after turn %d:", i); + for (size_t k = 0; k < prompt_tokens.size(); k += 10) { + std::string tokens; + for (size_t t = k; t < std::min(k+10, prompt_tokens.size()); ++t) { + tokens += std::to_string(prompt_tokens[t]) + " "; + } + ET_LOG(Info, "%s", tokens.c_str()); + } + // convert tokens into text + for (int j = 1; j < new_prompt_token[i].size(); ++j) { + auto piece = tokenizer_->decode(new_prompt_token[i][j-1], new_prompt_token[i][j]); + if (token_callback) { + token_callback(piece.get().c_str()); + } + } + prompt_tokens.insert( + prompt_tokens.end(), begin(new_prompt_token[i]), end(new_prompt_token[i])); + num_prompt_tokens = new_prompt_token[i].size(); + } + } break; default: ET_CHECK_MSG(false, "Unsupported eval mode"); break;