@@ -598,12 +598,8 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
                 ggml_tensor * shift,
                 ggml_tensor * factors,
                       float   freq_base,
-                      float   freq_scale,
-        ggml_backend_buffer * bbuf) const {
+                      float   freq_scale) const {
     const auto & cparams = lctx.get_cparams();
-    const auto & backends = lctx.get_backends();
-
-    auto * sched = lctx.get_sched();
 
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
 
@@ -624,17 +620,6 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
 
-        // TODO: can we simplify/avoid this? [TAG_BACKENDS]
-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched, tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
         tmp = ggml_rope_ext_inplace(ctx, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
@@ -719,7 +704,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
                 ggml_row_size(k_l[il]->type, n_embd_k_gqa),
                 0);
 
-        ggml_tensor * cur = build_rope_shift(lctx, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(lctx, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
 
         ggml_build_forward_expand(gf, cur);
     }
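For context, a sketch of how the rotation path of build_rope_shift reads once the [TAG_BACKENDS] workaround is gone, reconstructed from the context lines above. The ggml_is_quantized branch, the else branch, and the final ggml_cpy back into the cache tensor are assumptions about surrounding code the hunks do not show; only the ggml_cast and ggml_rope_ext_inplace calls appear in the diff itself.

    ggml_tensor * tmp;

    if (ggml_is_quantized(cur->type)) {
        // dequantize to f32 -> RoPE -> quantize back
        tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);

        // the temporary f32 tensor is no longer pinned to the backend that
        // owns the KV buffer; placement is left to the graph scheduler
        tmp = ggml_rope_ext_inplace(ctx, tmp,
                shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

        // quantize back into the original cache tensor (assumed)
        tmp = ggml_cpy(ctx, tmp, cur);
    } else {
        // non-quantized K can be rotated in place directly (assumed)
        tmp = ggml_rope_ext_inplace(ctx, cur,
                shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
    }

    return tmp;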