Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

smart_ptr multi-turn demo #9743

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions examples/qualcomm/oss_scripts/llama/runner/io_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,77 @@ void ShiftPointerIoMgr::update_prefill_to_kv_io(
}
}

void ShiftPointerIoMgr::update_kv_to_prefill_io(
int64_t pos,
std::vector<std::vector<Tensor>>& output_tensors) {
std::vector<std::unique_ptr<executorch::aten::TensorImpl>>& v_cache_in_prefill =
v_cache_in_[prefill_forward_name_];
std::vector<std::unique_ptr<executorch::aten::TensorImpl>>& v_cache_in_kv =
v_cache_in_[kv_forward_name_];
std::vector<std::unique_ptr<executorch::aten::TensorImpl>>& v_cache_out_prefil =
v_cache_out_[prefill_forward_name_];
std::vector<std::unique_ptr<executorch::aten::TensorImpl>>& v_cache_out_kv =
v_cache_out_[kv_forward_name_];

// update v_cache
// this is critical to make generated v_cache always aligned in both prefill & decode mode
size_t prefill_offset = (kv_cache_len_ - prefill_cache_len_) * head_dim_;
for (int i = 0, v_stride = head_dim_*pos; i < v_cache_in_prefill.size(); ++i) {
v_cache_in_prefill[i]->set_data(v_cache_in_kv[i]->mutable_data<uint8_t>() + + prefill_offset);
v_cache_out_prefil[i]->set_data(v_cache_out_kv[i]->mutable_data<uint8_t>());
// reset decode mode pointer since it will be updated again in update_prefill_to_kv
v_cache_in_kv[i]->set_data(v_cache_in_kv[i]->mutable_data<uint8_t>() - v_stride);
v_cache_out_kv[i]->set_data(v_cache_out_kv[i]->mutable_data<uint8_t>() - v_stride);
}

// make framework aware that output tensor pointers have changed
for (int shard = 0; shard < output_tensors.size(); shard++) {
for (int index = 0; index < output_tensors[shard].size(); index++) {
ET_CHECK_MSG(
modules_[shard]->set_output(
prefill_forward_name_, output_tensors[shard][index], index) ==
Error::Ok,
"failed to set output tensor for module %d's %d'th output "
"while updating kv_cache output tensors",
shard,
index);
}
}

// update k_cache
size_t copied_size = pos * sizeof(uint8_t);
std::vector<std::unique_ptr<executorch::aten::TensorImpl>>& k_cache_in_prefill =
k_cache_in_[prefill_forward_name_];
std::vector<std::unique_ptr<executorch::aten::TensorImpl>>& k_cache_in_kv =
k_cache_in_[kv_forward_name_];

for (int i = 0; i < k_cache_in_prefill.size(); i++) {
// k_cache_in should be always the same between prefill & decode
k_cache_in_prefill[i]->set_data(k_cache_in_kv[i]->mutable_data<uint8_t>());
// always do deep copy from origin because of the consumed tensor size is different in prefill & decode
uint8_t* ptr_in = k_cache_in_prefill[i]->mutable_data<uint8_t>() - pos;
// reset decode mode pointer since it will be updated again in update_prefill_to_kv
k_cache_in_kv[i]->set_data(ptr_in);
// in order not to override existent k_cache_out
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, should be existent k_cache_in.

// update_prefill_to_kv: copy from last dimension
// update_kv_to_prefill: copy from first dimension
for (int j = 0; j <= head_dim_; ++j) {
uint8_t* dst = ptr_in + j * prefill_cache_len_;
const uint8_t* src = ptr_in + j * kv_cache_len_;
memcpy(dst, src, copied_size);
}
}

// probably could be more efficient
IO* ptr = static_cast<IO*>(data_ptr_.get());
for (int i = 0; i < pos; i++) {
int offset = context_len_ - prefill_ar_len_ - i - 1;
for (int j = 0; j < prefill_ar_len_; j++) {
ptr->prefill_attention_mask[j * context_len_ + offset] = 65535;
}
}
}

void ShiftPointerIoMgr::update_kv_io(
int64_t cur_token,
int64_t pos,
Expand Down Expand Up @@ -667,6 +738,7 @@ void ShiftPointerIoMgr::fill_prefill_toks(
int64_t start_pos,
std::vector<uint64_t>& prompt_tokens) {
IO* ptr = static_cast<IO*>(get_mutable_ptr());

for (int i = 0; i < prefill_ar_len_; i++) {
if (!is_bert_) {
ptr->prefill_input_pos[i] = start_pos + i;
Expand Down Expand Up @@ -1360,6 +1432,11 @@ void SmartMaskIoMgr::update_prefill_to_kv_io(
}
}

// Intentionally a no-op: the kv -> prefill switch is only implemented in
// ShiftPointerIoMgr. Calling this on SmartMaskIoMgr silently does nothing,
// so multi-turn hybrid mode is effectively unsupported with this updater.
// TODO: implement, or fail loudly, so the gap is not silent.
void SmartMaskIoMgr::update_kv_to_prefill_io(
    int64_t pos,
    std::vector<std::vector<Tensor>>& output_tensors) {
}

void SmartMaskIoMgr::update_prefill_io(
int64_t cur_token,
int64_t pos,
Expand Down
11 changes: 11 additions & 0 deletions examples/qualcomm/oss_scripts/llama/runner/io_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ class IoMgrBase {
int64_t cur_token,
int64_t pos,
std::vector<std::vector<executorch::aten::Tensor>>& output_tensors) = 0;
virtual void update_kv_to_prefill_io(
int64_t pos,
std::vector<std::vector<executorch::aten::Tensor>>& output_tensors) = 0;
virtual void update_kv_io(
int64_t cur_token,
int64_t pos,
Expand Down Expand Up @@ -126,6 +129,10 @@ class ShiftPointerIoMgr : public IoMgrBase {
int64_t pos,
std::vector<std::vector<executorch::aten::Tensor>>& output_tensors)
override;
void update_kv_to_prefill_io(
int64_t pos,
std::vector<std::vector<executorch::aten::Tensor>>& output_tensors)
override;
void update_kv_io(
int64_t cur_token,
int64_t pos,
Expand Down Expand Up @@ -234,6 +241,10 @@ class SmartMaskIoMgr : public IoMgrBase {
int64_t pos,
std::vector<std::vector<executorch::aten::Tensor>>& output_tensors)
override;
void update_kv_to_prefill_io(
int64_t pos,
std::vector<std::vector<executorch::aten::Tensor>>& output_tensors)
override;
void update_kv_io(
int64_t cur_token,
int64_t pos,
Expand Down
57 changes: 47 additions & 10 deletions examples/qualcomm/oss_scripts/llama/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ Runner::Runner(
performance_output_path_(performance_output_path),
logits_scale_(logits_scale),
logits_offset_(logits_offset),
temperature_(temperature),
// hardcoded here for comparing with goldens
temperature_(0),
eval_mode_(static_cast<EvalMode>(eval_mode)),
kv_updater_(kv_updater),
num_iters_(num_iters) {
Expand Down Expand Up @@ -367,11 +368,12 @@ Error Runner::generate(
pos += prefill_ar_len_;
}
Tensor& logits_tensor = output_tensors[method_name].back()[0];
prev_token = prompt_tokens[num_prompt_tokens - 1];
prev_token = prompt_tokens.back();
long sample_start_time_ms = time_in_ms();
cur_token = logitsToToken(
logits_tensor,
(num_prompt_tokens + prefill_ar_len_ - 1) % prefill_ar_len_);

stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms;

auto piece_res = tokenizer_->decode(prev_token, cur_token);
Expand All @@ -380,14 +382,17 @@ Error Runner::generate(
token_callback(piece_res.get().c_str());
}

pos = num_prompt_tokens;
pos = prompt_tokens.size();
stats_.first_token_ms = time_in_ms();
stats_.prompt_eval_end_ms = time_in_ms();
};

auto kv_execute = [&](const std::string& method_name) {
io_mgr_->fill_kv_tok_mask(pos, cur_token);
while (pos < seq_len - 1) {
// force decode to generate 5 runs at most
int64_t max_pos = std::min(pos + 5, (int64_t)seq_len - 1);
//while (pos < seq_len - 1) {
while (pos < max_pos) {
// inference
run_model_step(method_name, inputs[method_name]);
Tensor& logits_tensor = output_tensors[method_name].back()[0];
Expand All @@ -401,6 +406,7 @@ Error Runner::generate(
}
}
prev_token = cur_token;
prompt_tokens.push_back(prev_token);
long sample_start_time_ms = time_in_ms();
cur_token = logitsToToken(logits_tensor, pos);
stats_.aggregate_sampling_time_ms += time_in_ms() - sample_start_time_ms;
Expand All @@ -427,12 +433,43 @@ Error Runner::generate(
case EvalMode::kKVCached:
kv_execute(kv_forward_name_);
break;
case EvalMode::kHybrid:
prefill_execute(prefill_forward_name_);
io_mgr_->update_prefill_to_kv_io(
cur_token, pos, output_tensors[kv_forward_name_]);
kv_execute(kv_forward_name_);
break;
case EvalMode::kHybrid: {
std::vector<std::vector<int32_t>> new_prompt_token;
new_prompt_token.push_back({7826, 4257, 365, 2354, 29889});
new_prompt_token.push_back({902, 304, 952, 322, 902, 25448, 304, 29891, 471, 263, 4802, 29892});
new_prompt_token.push_back({29892, 365, 2354, 29915});
new_prompt_token.push_back({1371, 902, 411, 278, 425, 870, 719});
new_prompt_token.push_back({304, 1371, 322, 1183, 1925, 599, 278, 22095, 297, 278, 471, 2790, 4933, 29889, 29871, 13, 13555, 278, 22095, 892, 471, 17143, 29892, 365, 2354, 29915, 29879, 16823, 4433, 902, 304, 1371, 902, 13958, 963, 701, 304, 15589});
new_prompt_token.push_back({});
for (int i = 0; i < new_prompt_token.size(); ++i) {
prefill_execute(prefill_forward_name_);
io_mgr_->update_prefill_to_kv_io(
cur_token, pos, output_tensors[kv_forward_name_]);
kv_execute(kv_forward_name_);
io_mgr_->update_kv_to_prefill_io(
pos, output_tensors[prefill_forward_name_]);

// check if generated tokens match goldens
ET_LOG(Info, "Current tokens after turn %d:", i);
for (size_t k = 0; k < prompt_tokens.size() ; k += 10) {
std::string tokens;
for (int i = k; i < std::min(k+10, prompt_tokens.size()); ++i) {
tokens += std::to_string(prompt_tokens[i]) + " ";
}
ET_LOG(Info, "%s", tokens.c_str());
}
// convert tokens into text
for (int j = 1; j < new_prompt_token[i].size(); ++j) {
auto piece = tokenizer_->decode(new_prompt_token[i][j-1], new_prompt_token[i][j]);
if (token_callback) {
token_callback(piece.get().c_str());
}
}
prompt_tokens.insert(
prompt_tokens.end(), begin(new_prompt_token[i]), end(new_prompt_token[i]));
num_prompt_tokens = new_prompt_token[i].size();
}
} break;
default:
ET_CHECK_MSG(false, "Unsupported eval mode");
break;
Expand Down
Loading