Mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-02-07 06:50:09 +00:00.
WIP: also allocate the KV cache using tensor split
This commit is contained in:
@@ -770,6 +770,8 @@ static bool llama_kv_cache_init(
                split_v_l.ggml.n_device = extra_V->n_device;
                split_v_l.ggml.split_dim = 0;
                split_v_l.ggml.splits = split_v_l.tensor_splits.data();
                k->extra = (void *)&split_k_l.ggml;
                v->extra = (void *)&split_v_l.ggml;
            } else {
                printf("Oops: don't have yet K and V for layer %d\n", i);
            }
Reference in New Issue
Block a user