mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-11 08:50:11 +00:00
Faster MoE token generation on CUDA (#248)
* This gives us ~20% TG speedup for DeepSeek on CUDA
* Slightly better
* Also do it for plain (not fused) mul_mat_id
* Guard against numerical precision issues for MLA on CUDA

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
@@ -13734,6 +13734,9 @@ struct llm_build_context {
|
||||
}
|
||||
|
||||
ggml_tensor * kq = ggml_mul_mat(ctx0, kv_cache, q);
|
||||
if (kv_cache->ne[1] < 256) {
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
}
|
||||
cb(kq, "kq", il);
|
||||
|
||||
if (!pp_opt) {
|
||||
|
||||
Reference in New Issue
Block a user