Commit 780d6fb

kv-cache : do not pass the full llama_context for kv graphs
ggml-ci
1 parent 54f2bd4 commit 780d6fb
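The refactor narrows the KV-cache graph builders to take only the pieces of state they actually read, a const llama_cparams & and the compute ggml_context *, instead of the whole llama_context. The self-contained toy below sketches that pattern with hypothetical stand-in types; nothing here is llama.cpp API, it only mirrors the call shape of the updated call sites (lctx.get_cparams(), lctx.get_ctx_compute()):

// Toy sketch of the pattern (hypothetical stand-in types, not llama.cpp API):
// the graph builder takes only the parameters and compute context it uses,
// and the caller unpacks them from the full context at the call site.
#include <cstdio>

struct cparams_t { float freq_base; };   // stand-in for llama_cparams
struct compute_t { int   n_nodes;   };   // stand-in for ggml_context

struct context_t {                       // stand-in for llama_context
    cparams_t cparams;
    compute_t compute;

    const cparams_t & get_cparams() const { return cparams; }
    compute_t * get_ctx_compute() { return &compute; }
};

struct kv_cache_t {
    // after the refactor: no dependency on context_t in the builder itself
    void build_graph_shift(const cparams_t & cparams, compute_t * ctx) const {
        std::printf("freq_base=%.1f n_nodes=%d\n", cparams.freq_base, ctx->n_nodes);
    }
};

int main() {
    context_t  lctx = {{10000.0f}, {0}};
    kv_cache_t kv;

    // mirrors the updated call sites in this diff:
    // build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf)
    kv.build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute());
    return 0;
}

After the change, build_rope_shift, build_graph_shift and build_graph_defrag no longer take a llama_context at all; only the update() call sites, which already receive lctx, unpack it at the boundary.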

File tree: 2 files changed (+29 -35 lines changed)
src/llama-kv-cache.cpp (+16 -24)

@@ -386,7 +386,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {

             auto * gf = lctx.graph_init();

-            auto res = build_graph_shift(lctx, gf);
+            auto res = build_graph_shift(lctx.get_cparams(), lctx.get_ctx_compute(), gf);

             ggml_backend_sched_alloc_graph(sched, gf);

@@ -414,7 +414,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {

             auto * gf = lctx.graph_init();

-            auto res = build_graph_defrag(lctx, gf);
+            auto res = build_graph_defrag(lctx.get_cparams(), lctx.get_ctx_compute(), gf);

             ggml_backend_sched_alloc_graph(sched, gf);

@@ -592,15 +592,13 @@ size_t llama_kv_cache_unified::size_v_bytes() const {
 }

 ggml_tensor * llama_kv_cache_unified::build_rope_shift(
-          llama_context & lctx,
-           ggml_context * ctx,
-            ggml_tensor * cur,
-            ggml_tensor * shift,
-            ggml_tensor * factors,
-                  float   freq_base,
-                  float   freq_scale) const {
-    const auto & cparams = lctx.get_cparams();
-
+    const llama_cparams & cparams,
+           ggml_context * ctx,
+            ggml_tensor * cur,
+            ggml_tensor * shift,
+            ggml_tensor * factors,
+                  float   freq_base,
+                  float   freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -662,14 +660,11 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
 }

 llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
-          llama_context & lctx,
-            ggml_cgraph * gf) const {
+    const llama_cparams & cparams,
+           ggml_context * ctx,
+            ggml_cgraph * gf) const {
     auto res = std::make_unique<llm_graph_result>();

-    auto * ctx = lctx.get_ctx_compute();
-
-    const auto & cparams = lctx.get_cparams();
-
     const auto & n_layer = hparams.n_layer;

     const auto & n_embd_head_k = hparams.n_embd_head_k;
@@ -704,7 +699,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
                 ggml_row_size(k_l[il]->type, n_embd_k_gqa),
                 0);

-        ggml_tensor * cur = build_rope_shift(lctx, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
+        ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

         ggml_build_forward_expand(gf, cur);
     }
@@ -715,16 +710,13 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
 }

 llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
-          llama_context & lctx,
-            ggml_cgraph * gf) const {
+    const llama_cparams & cparams,
+           ggml_context * ctx,
+            ggml_cgraph * gf) const {
     auto res = std::make_unique<llm_graph_result>();

-    auto * ctx = lctx.get_ctx_compute();
-
     const auto & ids = defrag_info.ids;

-    const auto & cparams = lctx.get_cparams();
-
 #if 0
     // CPU defrag
     //

src/llama-kv-cache.h (+13 -11)

@@ -233,21 +233,23 @@ class llama_kv_cache_unified : public llama_kv_cache {
     size_t size_v_bytes() const;

     ggml_tensor * build_rope_shift(
-              llama_context & lctx,
-               ggml_context * ctx,
-                ggml_tensor * cur,
-                ggml_tensor * shift,
-                ggml_tensor * factors,
-                      float   freq_base,
-                      float   freq_scale) const;
+        const llama_cparams & cparams,
+               ggml_context * ctx,
+                ggml_tensor * cur,
+                ggml_tensor * shift,
+                ggml_tensor * factors,
+                      float   freq_base,
+                      float   freq_scale) const;

     llm_graph_result_ptr build_graph_shift(
-              llama_context & lctx,
-                ggml_cgraph * gf) const;
+        const llama_cparams & cparams,
+               ggml_context * ctx,
+                ggml_cgraph * gf) const;

     llm_graph_result_ptr build_graph_defrag(
-              llama_context & lctx,
-                ggml_cgraph * gf) const;
+        const llama_cparams & cparams,
+               ggml_context * ctx,
+                ggml_cgraph * gf) const;

     void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
     void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
