mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-28 17:14:17 +00:00
Fused Q and K fused_rms_norm for TG on CUDA (#882)
* Biased mmvq: minor optimization * Fusing Q and K rms_norm for TG on CUDA * Remove commented out code --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -1279,10 +1279,12 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
|
||||
if (q_norm) {
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
ggml_build_forward_expand(gf, Qcur);
|
||||
}
|
||||
if (k_norm) {
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
ggml_build_forward_expand(gf, Kcur);
|
||||
}
|
||||
|
||||
return {Qcur, Kcur, Vcur};
|
||||
|
||||
Reference in New Issue
Block a user