Make V mul mat follow QK mul mat

so they can be fused, which gives a slightly bbetter TG performance.
This commit is contained in:
Iwan Kawrakow
2025-11-04 10:39:23 +02:00
parent 1280304e46
commit 24f3cb644c

View File

@@ -1303,12 +1303,18 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
qk = ggml_add(ctx0, qk, bqk);
cb(qk, "qkv_b", il);
}
auto Vcur = llm_build_lora_mm(lctx, ctx0, wv, cur);
cb(Vcur, "Vcur", il);
if (bv) {
Vcur = ggml_add(ctx0, Vcur, bv);
cb(Vcur, "Vcur", il);
}
ggml_build_forward_expand(gf, qk);
ggml_build_forward_expand(gf, Vcur);
auto Qcur = ggml_view_3d(ctx0, qk, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), qk->nb[1], 0*sizeof(float)*(n_embd));
auto Kcur = ggml_view_3d(ctx0, qk, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), qk->nb[1], 1*sizeof(float)*Qcur->ne[0]*Qcur->ne[1]);
auto Vcur = llm_build_lora_mm(lctx, ctx0, wv, cur);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
if (q_norm) {
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur_normed", il);