Fused Q and K rms_norm for TG on CUDA (#882)

* Biased mmvq: minor optimization

* Fusing Q and K rms_norm for TG on CUDA

* Remove commented out code

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-10-31 14:41:28 +02:00
committed by GitHub
parent fd3757d4ee
commit 8c8a7fb7c8
5 changed files with 94 additions and 2 deletions

View File

@@ -1279,10 +1279,12 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
if (q_norm) {
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur_normed", il);
ggml_build_forward_expand(gf, Qcur);
}
if (k_norm) {
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
cb(Kcur, "Kcur_normed", il);
ggml_build_forward_expand(gf, Kcur);
}
return {Qcur, Kcur, Vcur};