mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-26 01:19:20 +00:00
This seems to work
This commit is contained in:
@@ -469,6 +469,7 @@ ggml_tensor * llm_build_context::llm_build_inp_embd(
|
||||
}
|
||||
|
||||
void llm_build_context::llm_build_kv_store(
|
||||
struct llama_context & lctx,
|
||||
struct ggml_context * ctx,
|
||||
const llama_hparams & hparams,
|
||||
const llama_cparams & cparams,
|
||||
@@ -494,29 +495,36 @@ void llm_build_context::llm_build_kv_store(
|
||||
// (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
|
||||
//cb(k_cache_view, "k_cache_view", il);
|
||||
|
||||
GGML_ASSERT(2*il+1 < (int)lctx.cache_copies.size());
|
||||
auto k_row_size = ggml_row_size(kv.k_l[il]->type, n_embd_head_k);
|
||||
ggml_tensor * k_cache_view = ggml_view_2d(ctx, kv.k_l[il], n_embd_head_k, n_tokens*n_head_kv,
|
||||
k_row_size, k_row_size*n_head_kv*kv_head);
|
||||
|
||||
lctx.cache_copies[2*il+0].cpy = ggml_cpy(ctx, k_cur, k_cache_view);
|
||||
lctx.cache_copies[2*il+0].step = k_row_size*n_head_kv;
|
||||
|
||||
// note: storing RoPE-ed version of K in the KV cache
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
|
||||
ggml_build_forward_expand(graph, lctx.cache_copies[2*il+0].cpy);
|
||||
|
||||
struct ggml_tensor * v_cache_view = nullptr;
|
||||
|
||||
if (cparams.flash_attn) {
|
||||
v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
|
||||
(kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa));
|
||||
lctx.cache_copies[2*il+1].step = ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa);
|
||||
} else {
|
||||
// note: the V cache is transposed when not using flash attention
|
||||
v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
|
||||
( n_ctx)*ggml_element_size(kv.v_l[il]),
|
||||
(kv_head)*ggml_element_size(kv.v_l[il]));
|
||||
lctx.cache_copies[2*il+1].step = ggml_element_size(kv.v_l[il]);
|
||||
|
||||
v_cur = ggml_transpose(ctx, v_cur);
|
||||
}
|
||||
cb(v_cache_view, "v_cache_view", il);
|
||||
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
|
||||
lctx.cache_copies[2*il+1].cpy = ggml_cpy(ctx, v_cur, v_cache_view);
|
||||
ggml_build_forward_expand(graph, lctx.cache_copies[2*il+1].cpy);
|
||||
}
|
||||
|
||||
ggml_tensor * llm_build_context::llm_build_lora_mm(
|
||||
@@ -1205,7 +1213,7 @@ ggml_tensor * llm_build_context::llm_build_kv(
|
||||
ggml_build_forward_expand(graph, k_cur);
|
||||
ggml_build_forward_expand(graph, v_cur);
|
||||
|
||||
llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
|
||||
llm_build_kv_store(lctx, ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
@@ -6045,7 +6053,9 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
|
||||
auto row_size = ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope);
|
||||
ggml_tensor * kv_cache_view = ggml_view_2d(ctx0, kv_self.k_l[il], kv_self.k_l[il]->ne[0], n_tokens,
|
||||
row_size, row_size*kv_head);
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, kvr, kv_cache_view));
|
||||
lctx.cache_copies[2*il+0].cpy = ggml_cpy(ctx0, kvr, kv_cache_view);
|
||||
lctx.cache_copies[2*il+0].step = row_size;
|
||||
ggml_build_forward_expand(gf, lctx.cache_copies[2*il+0].cpy);
|
||||
ggml_tensor * kv_cache = ggml_view_2d(ctx0, kv_self.k_l[il],
|
||||
kv_lora_rank + n_embd_head_qk_rope, n_kv,
|
||||
ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope), 0);
|
||||
@@ -7082,7 +7092,7 @@ ggml_cgraph * llm_build_context::build_t5_decoder() {
|
||||
model.layers[il].wk, nullptr,
|
||||
model.layers[il].wv, nullptr, 0, il);
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
|
||||
llm_build_kv_store(lctx, ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
|
||||
|
||||
struct ggml_tensor * k =
|
||||
ggml_view_3d(ctx0, kv_self.k_l[il],
|
||||
|
||||
@@ -292,7 +292,7 @@ struct llm_build_context {
|
||||
llm_norm_type type,
|
||||
const llm_build_cb & cb, int il, float scale_eps = 1);
|
||||
|
||||
static void llm_build_kv_store(ggml_context * ctx, const llama_hparams & hparams,
|
||||
static void llm_build_kv_store(llama_context & lctx, ggml_context * ctx, const llama_hparams & hparams,
|
||||
const llama_cparams & cparams,
|
||||
const llama_kv_cache & kv,
|
||||
ggml_cgraph * graph,
|
||||
|
||||
@@ -212,4 +212,12 @@ struct llama_context {
|
||||
void reset_scheduler();
|
||||
bool can_reuse_graph(const llama_batch & u_batch) const;
|
||||
|
||||
struct CacheCopy {
|
||||
ggml_tensor * cpy = nullptr;
|
||||
size_t step = 0;
|
||||
};
|
||||
std::vector<CacheCopy> cache_copies;
|
||||
|
||||
void update_cache_copies();
|
||||
|
||||
};
|
||||
|
||||
@@ -559,8 +559,35 @@ bool llama_context::can_reuse_graph(const llama_batch & u_batch) const {
|
||||
n_outputs == prev->n_outputs;
|
||||
}
|
||||
|
||||
void llama_context::update_cache_copies() {
|
||||
int n_layer = cache_copies.size()/2;
|
||||
GGML_ASSERT((int)kv_self.k_l.size() == n_layer);
|
||||
GGML_ASSERT(kv_self.v_l.empty() || (int)kv_self.v_l.size() == n_layer);
|
||||
//printf("%s: head = %d\n", __func__, kv_self.head);
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto& c = cache_copies[2*il+0];
|
||||
GGML_ASSERT(c.cpy->op == GGML_OP_CPY);
|
||||
GGML_ASSERT(c.cpy->view_src == kv_self.k_l[il]);
|
||||
c.cpy->view_offs = kv_self.head*c.step;
|
||||
c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs;
|
||||
c.cpy->data = c.cpy->src[1]->data;
|
||||
}
|
||||
if (kv_self.v_l.empty()) return;
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
auto& c = cache_copies[2*il+1];
|
||||
GGML_ASSERT(c.cpy->op == GGML_OP_CPY);
|
||||
GGML_ASSERT(c.cpy->view_src == kv_self.v_l[il]);
|
||||
c.cpy->view_offs = kv_self.head*c.step;
|
||||
c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs;
|
||||
c.cpy->data = c.cpy->src[1]->data;
|
||||
}
|
||||
}
|
||||
|
||||
llama_context::llama_context(const llama_model & model)
|
||||
: model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {}
|
||||
: model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {
|
||||
const auto & hparams = model.hparams;
|
||||
cache_copies.resize(2*hparams.n_layer);
|
||||
}
|
||||
|
||||
llama_context::~llama_context() {
|
||||
ggml_backend_sched_free(sched);
|
||||
@@ -2944,6 +2971,7 @@ static int llama_decode_internal(
|
||||
} else {
|
||||
//printf("Reusing graph\n");
|
||||
gf = lctx.prev->graph;
|
||||
lctx.update_cache_copies();
|
||||
}
|
||||
|
||||
// the output is always the last tensor in the graph
|
||||
|
||||
Reference in New Issue
Block a user