mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-09 16:00:12 +00:00
Even more fused ops (#868)
* Fuse Q, K, V gemv+add
* More gemv+add fusing
* Faster copy when tensors are contiguous

  Relevant for storing data into the KV cache. I see ~1% speedup for fast models (Ling-mini-2.0, gpt-oss-20b, etc.)
* Cleanup
* Make sure the bias really is 1 row to use fusion

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -1240,14 +1240,17 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_context::llm_buil
|
||||
if (bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
ggml_build_forward_expand(gf, Qcur);
|
||||
}
|
||||
if (bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
ggml_build_forward_expand(gf, Kcur);
|
||||
}
|
||||
if (bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
ggml_build_forward_expand(gf, Vcur);
|
||||
}
|
||||
return {Qcur, Kcur, Vcur};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user