This seems to work

Iwan Kawrakow
2025-11-13 06:23:10 +02:00
parent 59ee8d7823
commit a9671fe368
4 changed files with 53 additions and 7 deletions

View File

@@ -469,6 +469,7 @@ ggml_tensor * llm_build_context::llm_build_inp_embd(
 }

 void llm_build_context::llm_build_kv_store(
+        struct llama_context & lctx,
         struct ggml_context * ctx,
         const llama_hparams & hparams,
         const llama_cparams & cparams,
@@ -494,29 +495,36 @@ void llm_build_context::llm_build_kv_store(
     //        (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
     //cb(k_cache_view, "k_cache_view", il);

+    GGML_ASSERT(2*il+1 < (int)lctx.cache_copies.size());
     auto k_row_size = ggml_row_size(kv.k_l[il]->type, n_embd_head_k);
     ggml_tensor * k_cache_view = ggml_view_2d(ctx, kv.k_l[il], n_embd_head_k, n_tokens*n_head_kv,
             k_row_size, k_row_size*n_head_kv*kv_head);

+    lctx.cache_copies[2*il+0].cpy  = ggml_cpy(ctx, k_cur, k_cache_view);
+    lctx.cache_copies[2*il+0].step = k_row_size*n_head_kv;
+
     // note: storing RoPE-ed version of K in the KV cache
-    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
+    ggml_build_forward_expand(graph, lctx.cache_copies[2*il+0].cpy);

     struct ggml_tensor * v_cache_view = nullptr;

     if (cparams.flash_attn) {
         v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
                 (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
+        lctx.cache_copies[2*il+1].step = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa);
     } else {
         // note: the V cache is transposed when not using flash attention
         v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
                 (  n_ctx)*ggml_element_size(kv.v_l[il]),
                 (kv_head)*ggml_element_size(kv.v_l[il]));
+        lctx.cache_copies[2*il+1].step = ggml_element_size(kv.v_l[il]);

         v_cur = ggml_transpose(ctx, v_cur);
     }
     cb(v_cache_view, "v_cache_view", il);

-    ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
+    lctx.cache_copies[2*il+1].cpy = ggml_cpy(ctx, v_cur, v_cache_view);
+    ggml_build_forward_expand(graph, lctx.cache_copies[2*il+1].cpy);
 }

 ggml_tensor * llm_build_context::llm_build_lora_mm(
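The invariant set up above: in every branch the destination's byte offset is kv_head times the recorded step, so a later move of the cache head only needs one multiply. A minimal illustration of the three cases (helper names and values are ours, not from the commit):

    #include <cstddef>
    #include <cstdio>

    // Byte offset of the copy destination at cache slot kv_head.
    // The size arguments stand in for ggml_row_size()/ggml_element_size().
    static size_t k_offs(size_t k_row_size, size_t n_head_kv, size_t kv_head) {
        return k_row_size*n_head_kv*kv_head;   // K: all KV heads of one token per slot
    }
    static size_t v_offs_fa(size_t v_row_size, size_t kv_head) {
        return v_row_size*kv_head;             // V with flash attention: one row per slot
    }
    static size_t v_offs_tr(size_t elem_size, size_t kv_head) {
        return elem_size*kv_head;              // V transposed: one element per slot
    }

    int main() {
        // e.g. an f16 cache with n_embd_head_k = 128 (256 bytes/row) and 8 KV heads
        printf("K: %zu  V/fa: %zu  V/tr: %zu\n",
               k_offs(256, 8, 42), v_offs_fa(256*8, 42), v_offs_tr(2, 42));
        return 0;
    }

In all three branches the view handed to ggml_cpy is created at exactly kv_head*step, which is what makes the later fix-up in update_cache_copies() possible.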
@@ -1205,7 +1213,7 @@ ggml_tensor * llm_build_context::llm_build_kv(
     ggml_build_forward_expand(graph, k_cur);
     ggml_build_forward_expand(graph, v_cur);

-    llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
+    llm_build_kv_store(lctx, ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);

     struct ggml_tensor * cur;
@@ -6045,7 +6053,9 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
         auto row_size = ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope);
         ggml_tensor * kv_cache_view = ggml_view_2d(ctx0, kv_self.k_l[il], kv_self.k_l[il]->ne[0], n_tokens,
                 row_size, row_size*kv_head);
-        ggml_build_forward_expand(gf, ggml_cpy(ctx0, kvr, kv_cache_view));
+        lctx.cache_copies[2*il+0].cpy = ggml_cpy(ctx0, kvr, kv_cache_view);
+        lctx.cache_copies[2*il+0].step = row_size;
+        ggml_build_forward_expand(gf, lctx.cache_copies[2*il+0].cpy);
         ggml_tensor * kv_cache = ggml_view_2d(ctx0, kv_self.k_l[il],
                 kv_lora_rank + n_embd_head_qk_rope, n_kv,
                 ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope), 0);
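On this DeepSeek-2 (MLA) path only the K slot of cache_copies is populated, since the compressed KV lives in a single cache tensor; the step is one row of kv_lora_rank + n_embd_head_qk_rope elements. A quick arithmetic check with assumed example values (f16 cache, kv_lora_rank = 512, n_embd_head_qk_rope = 64):

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Assumed: f16 (2 bytes/element), so one cache row is (512+64)*2 bytes;
        // row_size stands in for ggml_row_size(kv_self.k_l[il]->type, 512+64).
        size_t row_size = (512 + 64) * 2;
        size_t kv_head  = 100;   // current write position in the cache
        printf("view offset = %zu bytes\n", kv_head*row_size);  // 115200
        return 0;
    }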
@@ -7082,7 +7092,7 @@ ggml_cgraph * llm_build_context::build_t5_decoder() {
                 model.layers[il].wk, nullptr,
                 model.layers[il].wv, nullptr, 0, il);

-        llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
+        llm_build_kv_store(lctx, ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);

        struct ggml_tensor * k =
            ggml_view_3d(ctx0, kv_self.k_l[il],

View File

@@ -292,7 +292,7 @@ struct llm_build_context {
             llm_norm_type type,
             const llm_build_cb & cb, int il, float scale_eps = 1);

-    static void llm_build_kv_store(ggml_context * ctx, const llama_hparams & hparams,
+    static void llm_build_kv_store(llama_context & lctx, ggml_context * ctx, const llama_hparams & hparams,
             const llama_cparams & cparams,
             const llama_kv_cache & kv,
             ggml_cgraph * graph,

View File

@@ -212,4 +212,12 @@ struct llama_context {
     void reset_scheduler();
     bool can_reuse_graph(const llama_batch & u_batch) const;

+    struct CacheCopy {
+        ggml_tensor * cpy = nullptr;
+        size_t step = 0;
+    };
+    std::vector<CacheCopy> cache_copies;
+
+    void update_cache_copies();
+
 };
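The vector holds two entries per layer (sized in the constructor change further down): index 2*il is the layer's K-cache copy and 2*il+1 its V-cache copy, the latter left unused when the model keeps no separate V cache. A sketch of that convention, with accessors that are hypothetical, not part of the commit:

    #include <cstddef>
    #include <vector>

    struct ggml_tensor;  // opaque for this sketch

    struct CacheCopy {
        ggml_tensor * cpy  = nullptr;  // the GGML_OP_CPY node recorded at graph build
        size_t        step = 0;        // destination stride in bytes per cache slot
    };

    // Hypothetical helpers, only to spell out the indexing convention:
    inline CacheCopy & k_copy(std::vector<CacheCopy> & cc, int il) { return cc[2*il + 0]; }
    inline CacheCopy & v_copy(std::vector<CacheCopy> & cc, int il) { return cc[2*il + 1]; }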

View File

@@ -559,8 +559,35 @@ bool llama_context::can_reuse_graph(const llama_batch & u_batch) const {
            n_outputs == prev->n_outputs;
 }

+void llama_context::update_cache_copies() {
+    int n_layer = cache_copies.size()/2;
+    GGML_ASSERT((int)kv_self.k_l.size() == n_layer);
+    GGML_ASSERT(kv_self.v_l.empty() || (int)kv_self.v_l.size() == n_layer);
+    //printf("%s: head = %d\n", __func__, kv_self.head);
+    for (int il = 0; il < n_layer; ++il) {
+        auto& c = cache_copies[2*il+0];
+        GGML_ASSERT(c.cpy->op == GGML_OP_CPY);
+        GGML_ASSERT(c.cpy->view_src == kv_self.k_l[il]);
+        c.cpy->view_offs = kv_self.head*c.step;
+        c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs;
+        c.cpy->data = c.cpy->src[1]->data;
+    }
+    if (kv_self.v_l.empty()) return;
+    for (int il = 0; il < n_layer; ++il) {
+        auto& c = cache_copies[2*il+1];
+        GGML_ASSERT(c.cpy->op == GGML_OP_CPY);
+        GGML_ASSERT(c.cpy->view_src == kv_self.v_l[il]);
+        c.cpy->view_offs = kv_self.head*c.step;
+        c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs;
+        c.cpy->data = c.cpy->src[1]->data;
+    }
+}
+
 llama_context::llama_context(const llama_model & model)
-    : model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {}
+    : model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {
+    const auto & hparams = model.hparams;
+    cache_copies.resize(2*hparams.n_layer);
+}

 llama_context::~llama_context() {
     ggml_backend_sched_free(sched);
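The fix-up works because a ggml view carries both its byte offset and its resolved data pointer on the tensor itself: a recorded copy writes to src[1], which views the cache tensor, and the copy's own result aliases that destination. Re-pointing three fields therefore moves the write to the new kv_self.head without touching graph topology. A toy model of the idea, with a simplified stand-in for ggml_tensor (this is not ggml):

    #include <cassert>
    #include <cstddef>

    // Simplified stand-in for ggml_tensor, just enough to show the fix-up.
    struct ToyTensor {
        char *      data      = nullptr;            // resolved pointer to this tensor's bytes
        size_t      view_offs = 0;                  // byte offset into view_src's buffer
        ToyTensor * view_src  = nullptr;            // tensor whose buffer this one views
        ToyTensor * src[2]    = {nullptr, nullptr}; // for a copy, src[1] is the destination view
    };

    // Mirrors what update_cache_copies() does per recorded copy: re-point the
    // destination view to head*step without rebuilding the graph.
    void repoint_copy(ToyTensor & cpy, ToyTensor & cache, size_t head, size_t step) {
        assert(cpy.view_src == &cache);
        cpy.view_offs    = head*step;
        cpy.src[1]->data = cache.data + cpy.view_offs;  // destination now targets the new slot
        cpy.data         = cpy.src[1]->data;            // the copy's result aliases its destination
    }

Note the early return above when kv_self.v_l is empty: models that keep no separate V cache (the MLA path) never populate the 2*il+1 slots, so only the K copies are patched.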
@@ -2944,6 +2971,7 @@ static int llama_decode_internal(
    } else {
        //printf("Reusing graph\n");
        gf = lctx.prev->graph;
+       lctx.update_cache_copies();
    }

    // the output is always the last tensor in the graph