mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-03-12 15:00:11 +00:00
Fused Q and K fused_rms_norm for TG on CUDA (#882)
* Biased mmvq: minor optimization
* Fusing Q and K rms_norm for TG on CUDA
* Remove commented-out code

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -1279,10 +1279,12 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
|
||||
if (q_norm) {
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
ggml_build_forward_expand(gf, Qcur);
|
||||
}
|
||||
if (k_norm) {
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
ggml_build_forward_expand(gf, Kcur);
|
||||
}
|
||||
|
||||
return {Qcur, Kcur, Vcur};
|
||||
|
||||
@@ -2451,7 +2451,6 @@ bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias) {
|
||||
layer.wk = ml.create_tensor_as_view(ctx_split, layer.wqkv, wk_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1]);
|
||||
layer.wv = ml.create_tensor_as_view(ctx_split, layer.wqkv, wv_name.c_str(), { wv->ne[0], wv->ne[1] }, wq->ne[1]*wq->nb[1] + wk->ne[1]*wk->nb[1] );
|
||||
fused_qkv = true;
|
||||
printf("================================== Created merged qkv %s\n", layer.wqkv->name);
|
||||
if (bias) {
|
||||
auto bq_name = tn(LLM_TENSOR_ATTN_Q, "bias", i);
|
||||
auto bk_name = tn(LLM_TENSOR_ATTN_K, "bias", i);
|
||||
|
||||
Reference in New Issue
Block a user