Commit f5adaab

kv-cache : avoid using the backends from the llama_context
ref #13113

ggml-ci
1 parent ed7bb58 commit f5adaab

4 files changed: +3 -26 lines

src/llama-context.cpp (-4)

@@ -408,10 +408,6 @@ ggml_context * llama_context::get_ctx_compute() const {
     return ctx_compute.get();
 }
 
-const std::vector<ggml_backend_ptr> & llama_context::get_backends() const {
-    return backends;
-}
-
 uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }

src/llama-context.h (-3)

@@ -34,9 +34,6 @@ struct llama_context {
 
     ggml_context * get_ctx_compute() const;
 
-    // TODO: this method might be possible to avoid (seach for TAG_BACKENDS)
-    const std::vector<ggml_backend_ptr> & get_backends() const;
-
     uint32_t n_ctx() const;
     uint32_t n_ctx_per_seq() const;
     uint32_t n_batch() const;

src/llama-kv-cache.cpp (+2 -17)

@@ -598,12 +598,8 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
         float freq_base,
-        float freq_scale,
-        ggml_backend_buffer * bbuf) const {
+        float freq_scale) const {
     const auto & cparams = lctx.get_cparams();
-    const auto & backends = lctx.get_backends();
-
-    auto * sched = lctx.get_sched();
 
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

@@ -624,17 +620,6 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
             // dequantize to f32 -> RoPE -> quantize back
             tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
 
-            // TODO: can we simplify/avoid this? [TAG_BACKENDS]
-            if (bbuf) {
-                for (const auto & backend : backends) {
-                    // Figure out which backend KV cache belongs to
-                    if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                        ggml_backend_sched_set_tensor_backend(sched, tmp, backend.get());
-                        break;
-                    }
-                }
-            }
-
             tmp = ggml_rope_ext_inplace(ctx, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

@@ -719,7 +704,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
                 ggml_row_size(k_l[il]->type, n_embd_k_gqa),
                 0);
 
-        ggml_tensor * cur = build_rope_shift(lctx, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(lctx, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
 
         ggml_build_forward_expand(gf, cur);
     }
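
For reference, the block removed from build_rope_shift performed a manual tensor placement: it searched the context's backends for one that can host the KV buffer's buffer type and pinned the temporary f32 tensor to that backend. A minimal standalone sketch of that pattern follows; the helper name and parameter names are illustrative, and only the ggml_backend_* calls are taken from the diff above.

```cpp
// Sketch of the manual placement this commit removes (not the upstream code):
// find the backend that supports the KV buffer's buffer type and pin the
// temporary f32 tensor to it before scheduling the graph.
#include <vector>

#include "ggml-backend.h"

static void pin_tensor_to_kv_backend(
        ggml_backend_sched_t                sched,
        const std::vector<ggml_backend_t> & backends,
        ggml_tensor                       * tmp,
        ggml_backend_buffer_t               kv_buf) {
    if (kv_buf == nullptr) {
        return; // nothing to match against - let the scheduler decide
    }

    for (ggml_backend_t backend : backends) {
        // the backend that supports the KV buffer's type is assumed to be
        // the one the KV cache lives on
        if (ggml_backend_supports_buft(backend, ggml_backend_buffer_get_type(kv_buf))) {
            ggml_backend_sched_set_tensor_backend(sched, tmp, backend);
            break;
        }
    }
}
```

With that block gone, build_rope_shift no longer needs the KV buffer or the context's backend list, and ggml_backend_sched is left to pick a backend for the cast tensor on its own during graph scheduling.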

src/llama-kv-cache.h (+1 -2)

@@ -239,8 +239,7 @@ class llama_kv_cache_unified : public llama_kv_cache {
             ggml_tensor * shift,
             ggml_tensor * factors,
             float freq_base,
-            float freq_scale,
-            ggml_backend_buffer * bbuf) const;
+            float freq_scale) const;
 
     llm_graph_result_ptr build_graph_shift(
             llama_context & lctx,
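
Putting the header hunk together with the call site in build_graph_shift, the trimmed declaration plausibly reads as sketched below; the leading parameters (lctx, ctx, cur) are inferred from the call and are not shown in the hunk itself.

```cpp
// Assumed reconstruction of the post-change declaration in llama_kv_cache_unified.
// Only the trailing parameters appear verbatim in the hunk above.
ggml_tensor * build_rope_shift(
        llama_context & lctx,
        ggml_context  * ctx,
        ggml_tensor   * cur,
        ggml_tensor   * shift,
        ggml_tensor   * factors,
        float           freq_base,
        float           freq_scale) const;
```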
