Skip to content

Commit 07d903c

Browse files
ggerganov and jordankanter
authored and committed
main : add self-extend support (ggml-org#4815)
* examples : add passkey test
* passkey : better prints
* passkey : select pass key pos from CLI
* passkey : simplify n_past logic
* llama : "self-extend"-like context extension
* passkey : add comment
* main : add Self-Extend support
* llama : add comment about llama_kv_cache_seq_div
1 parent da05773 commit 07d903c

File tree

4 files changed

+85
-22
lines changed

4 files changed

+85
-22
lines changed

common/common.cpp

+18
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
220220
break;
221221
}
222222
params.n_ctx = std::stoi(argv[i]);
223+
} else if (arg == "--grp-attn-n" || arg == "-gan") {
224+
if (++i >= argc) {
225+
invalid_param = true;
226+
break;
227+
}
228+
229+
params.grp_attn_n = std::stoi(argv[i]);
230+
} else if (arg == "--grp-attn-w" || arg == "-gaw") {
231+
if (++i >= argc) {
232+
invalid_param = true;
233+
break;
234+
}
235+
236+
params.grp_attn_w = std::stoi(argv[i]);
223237
} else if (arg == "--rope-freq-base") {
224238
if (++i >= argc) {
225239
invalid_param = true;
@@ -904,6 +918,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
904918
printf(" Not recommended since this is both slower and uses more VRAM.\n");
905919
#endif // GGML_USE_CUBLAS
906920
#endif
921+
printf(" -gan N, --grp-attn-n N\n");
922+
printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
923+
printf(" -gat N, --grp-attn-w N\n");
924+
printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
907925
printf(" --verbose-prompt print prompt before generation\n");
908926
printf(" -dkvc, --dump-kv-cache\n");
909927
printf(" verbose print of the KV cache\n");

common/common.h

+2
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ struct gpt_params {
6262
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
6363
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
6464
int32_t n_beams = 0; // if non-zero then use beam search of given width.
65+
int32_t grp_attn_n = 1; // group-attention factor
66+
int32_t grp_attn_w = 512; // group-attention width
6567
float rope_freq_base = 0.0f; // RoPE base frequency
6668
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
6769
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor

examples/main/main.cpp

+61-22
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,21 @@ int main(int argc, char ** argv) {
439439
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
440440
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
441441
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
442+
443+
// group-attention state
444+
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
445+
int ga_i = 0;
446+
447+
const int ga_n = params.grp_attn_n;
448+
const int ga_w = params.grp_attn_w;
449+
450+
if (ga_n != 1) {
451+
GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
452+
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
453+
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
454+
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
455+
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
456+
}
442457
LOG_TEE("\n\n");
443458

444459
if (params.interactive) {
@@ -500,37 +515,61 @@ int main(int argc, char ** argv) {
500515
fflush(stdout);
501516
}
502517

503-
// infinite text generation via context swapping
504-
// if we run out of context:
505-
// - take the n_keep first tokens from the original prompt (via n_past)
506-
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
507-
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
508-
if (params.n_predict == -2) {
509-
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
510-
break;
511-
}
518+
if (ga_n == 1) {
519+
// infinite text generation via context shifting
520+
// if we run out of context:
521+
// - take the n_keep first tokens from the original prompt (via n_past)
522+
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
523+
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
524+
if (params.n_predict == -2) {
525+
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
526+
break;
527+
}
512528

513-
const int n_left = n_past - params.n_keep - 1;
514-
const int n_discard = n_left/2;
529+
const int n_left = n_past - params.n_keep - 1;
530+
const int n_discard = n_left/2;
515531

516-
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
517-
n_past, n_left, n_ctx, params.n_keep, n_discard);
532+
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
533+
n_past, n_left, n_ctx, params.n_keep, n_discard);
518534

519-
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
520-
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
535+
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
536+
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
521537

522-
n_past -= n_discard;
538+
n_past -= n_discard;
523539

524-
if (ctx_guidance) {
525-
n_past_guidance -= n_discard;
540+
if (ctx_guidance) {
541+
n_past_guidance -= n_discard;
542+
}
543+
544+
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
545+
546+
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
547+
548+
LOG("clear session path\n");
549+
path_session.clear();
526550
}
551+
} else {
552+
// context extension via Self-Extend
553+
while (n_past >= ga_i + ga_w) {
554+
const int ib = (ga_n*ga_i)/ga_w;
555+
const int bd = (ga_w/ga_n)*(ga_n - 1);
556+
const int dd = (ga_w/ga_n) - ib*bd - ga_w;
527557

528-
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
558+
LOG("\n");
559+
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
560+
LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
561+
LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
529562

530-
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
563+
llama_kv_cache_seq_shift(ctx, 0, ga_i, n_past, ib*bd);
564+
llama_kv_cache_seq_div (ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
565+
llama_kv_cache_seq_shift(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd);
531566

532-
LOG("clear session path\n");
533-
path_session.clear();
567+
n_past -= bd;
568+
569+
ga_i += ga_w/ga_n;
570+
571+
LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
572+
}
534573
}
535574

536575
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)

llama.h

+4
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,10 @@ extern "C" {
484484
llama_pos p1,
485485
llama_pos delta);
486486

487+
// Integer division of the positions by factor of `d > 1`
488+
// If the KV cache is RoPEd, the KV data is updated accordingly
489+
// p0 < 0 : [0, p1]
490+
// p1 < 0 : [p0, inf)
487491
LLAMA_API void llama_kv_cache_seq_div(
488492
struct llama_context * ctx,
489493
llama_seq_id seq_id,

0 commit comments

Comments
 (0)