Commit f5adaab

kv-cache : avoid using the backends from the llama_context
ref #13113

ggml-ci
1 parent ed7bb58 commit f5adaab

4 files changed: +3 -26 lines

src/llama-context.cpp (-4)

@@ -408,10 +408,6 @@ ggml_context * llama_context::get_ctx_compute() const {
     return ctx_compute.get();
 }
 
-const std::vector<ggml_backend_ptr> & llama_context::get_backends() const {
-    return backends;
-}
-
 uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }

src/llama-context.h (-3)

@@ -34,9 +34,6 @@ struct llama_context {
 
     ggml_context * get_ctx_compute() const;
 
-    // TODO: this method might be possible to avoid (seach for TAG_BACKENDS)
-    const std::vector<ggml_backend_ptr> & get_backends() const;
-
     uint32_t n_ctx() const;
     uint32_t n_ctx_per_seq() const;
     uint32_t n_batch() const;

src/llama-kv-cache.cpp (+2 -17)

@@ -598,12 +598,8 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
         float freq_base,
-        float freq_scale,
-        ggml_backend_buffer * bbuf) const {
+        float freq_scale) const {
     const auto & cparams = lctx.get_cparams();
-    const auto & backends = lctx.get_backends();
-
-    auto * sched = lctx.get_sched();
 
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

@@ -624,17 +620,6 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
             // dequantize to f32 -> RoPE -> quantize back
             tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
 
-            // TODO: can we simplify/avoid this? [TAG_BACKENDS]
-            if (bbuf) {
-                for (const auto & backend : backends) {
-                    // Figure out which backend KV cache belongs to
-                    if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                        ggml_backend_sched_set_tensor_backend(sched, tmp, backend.get());
-                        break;
-                    }
-                }
-            }
-
             tmp = ggml_rope_ext_inplace(ctx, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

@@ -719,7 +704,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
                 ggml_row_size(k_l[il]->type, n_embd_k_gqa),
                 0);
 
-        ggml_tensor * cur = build_rope_shift(lctx, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(lctx, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
 
         ggml_build_forward_expand(gf, cur);
     }
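
For reference, the block removed from build_rope_shift performed a manual tensor placement: it searched the context's backends for one that can host the KV buffer's buffer type and pinned the temporary f32 tensor to that backend. A minimal standalone sketch of that pattern follows; the helper name and parameter names are illustrative, and only the ggml_backend_* calls are taken from the diff above.

```cpp
// Sketch of the manual placement this commit removes (not the upstream code):
// find the backend that supports the KV buffer's buffer type and pin the
// temporary f32 tensor to it before scheduling the graph.
#include <vector>

#include "ggml-backend.h"

static void pin_tensor_to_kv_backend(
        ggml_backend_sched_t                sched,
        const std::vector<ggml_backend_t> & backends,
        ggml_tensor                       * tmp,
        ggml_backend_buffer_t               kv_buf) {
    if (kv_buf == nullptr) {
        return; // nothing to match against - let the scheduler decide
    }

    for (ggml_backend_t backend : backends) {
        // the backend that supports the KV buffer's type is assumed to be
        // the one the KV cache lives on
        if (ggml_backend_supports_buft(backend, ggml_backend_buffer_get_type(kv_buf))) {
            ggml_backend_sched_set_tensor_backend(sched, tmp, backend);
            break;
        }
    }
}
```

With that block gone, build_rope_shift no longer needs the KV buffer or the context's backend list, and ggml_backend_sched is left to pick a backend for the cast tensor on its own during graph scheduling.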

src/llama-kv-cache.h (+1 -2)

@@ -239,8 +239,7 @@ class llama_kv_cache_unified : public llama_kv_cache {
             ggml_tensor * shift,
             ggml_tensor * factors,
             float freq_base,
-            float freq_scale,
-            ggml_backend_buffer * bbuf) const;
+            float freq_scale) const;
 
     llm_graph_result_ptr build_graph_shift(
             llama_context & lctx,
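
Putting the header hunk together with the call site in build_graph_shift, the trimmed declaration plausibly reads as sketched below; the leading parameters (lctx, ctx, cur) are inferred from the call and are not shown in the hunk itself.

```cpp
// Assumed reconstruction of the post-change declaration in llama_kv_cache_unified.
// Only the trailing parameters appear verbatim in the hunk above.
ggml_tensor * build_rope_shift(
        llama_context & lctx,
        ggml_context  * ctx,
        ggml_tensor   * cur,
        ggml_tensor   * shift,
        ggml_tensor   * factors,
        float           freq_base,
        float           freq_scale) const;
```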
