Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-02-08 07:20:12 +00:00)
Faster MLA on CUDA (#234)
* Slight MLA TG performance improvement on CUDA

  The low MLA TG performance on CUDA is due to the wk_b * q_nope operation: it turns into n_head separate matrix multiplications, each with its own quantization and GEMV step. The associated overhead is just too much for TG, where each GEMV is very fast (512 x 128 = 131 KFLOP for DeepSeek-Lite, 4X that for DeepSeekV3/R1). On top of that, each q_nope row was copied before quantization; that copy is now eliminated, which gives a ~2.5% speedup. What needs to happen instead is to launch a single computation that quantizes all heads, and then have one kernel do the GEMV for all heads instead of n_head sequential GEMVs.

* Slightly better

* CUDA: Quantize non-contiguous tensors

* Much better MLA

  It is a total hack, but it works.

* Cleanup

  Remove duplicated GEMVs.

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
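The core idea of the commit, sketched below in CUDA: instead of n_head sequential launches that each handle one tiny GEMV, a single launch indexed by blockIdx.y covers all heads. This is a minimal stand-in, not the actual ik_llama.cpp kernel: the shapes (512 x 128 per head, 16 heads) are assumed, the quantization step is left out (the data stays fp32), and all names (gemv_one_head, gemv_all_heads, N_HEAD, N, K) are made up for illustration.

// Minimal sketch of fusing per-head GEMVs into one launch (illustrative only).
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

constexpr int N_HEAD = 16;   // number of heads (assumed)
constexpr int N      = 512;  // output rows per head (wk_b rows)
constexpr int K      = 128;  // q_nope head dimension

// Old pattern: one block per output row of a single head; launched n_head times.
__global__ void gemv_one_head(const float* A, const float* x, float* y) {
    int row = blockIdx.x;
    float sum = 0.f;
    for (int k = threadIdx.x; k < K; k += blockDim.x) sum += A[(size_t)row*K + k] * x[k];
    for (int off = warpSize/2; off > 0; off >>= 1) sum += __shfl_down_sync(0xffffffff, sum, off);
    if (threadIdx.x == 0) y[row] = sum;
}

// New pattern: same math, but blockIdx.y selects the head, so one launch covers all heads.
__global__ void gemv_all_heads(const float* A, const float* x, float* y) {
    int head = blockIdx.y, row = blockIdx.x;
    const float* Ah = A + (size_t)head*N*K;
    const float* xh = x + (size_t)head*K;
    float sum = 0.f;
    for (int k = threadIdx.x; k < K; k += blockDim.x) sum += Ah[(size_t)row*K + k] * xh[k];
    for (int off = warpSize/2; off > 0; off >>= 1) sum += __shfl_down_sync(0xffffffff, sum, off);
    if (threadIdx.x == 0) y[(size_t)head*N + row] = sum;
}

int main() {
    std::vector<float> hA((size_t)N_HEAD*N*K, 0.01f), hx((size_t)N_HEAD*K, 1.0f);
    float *A, *x, *y;
    cudaMalloc(&A, hA.size()*sizeof(float));
    cudaMalloc(&x, hx.size()*sizeof(float));
    cudaMalloc(&y, (size_t)N_HEAD*N*sizeof(float));
    cudaMemcpy(A, hA.data(), hA.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(x, hx.data(), hx.size()*sizeof(float), cudaMemcpyHostToDevice);

    // n_head sequential launches: each one is only ~131 KFLOP of work.
    for (int h = 0; h < N_HEAD; ++h)
        gemv_one_head<<<N, 32>>>(A + (size_t)h*N*K, x + (size_t)h*K, y + (size_t)h*N);

    // One launch for all heads.
    gemv_all_heads<<<dim3(N, N_HEAD), 32>>>(A, x, y);

    cudaDeviceSynchronize();
    printf("done: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaFree(A); cudaFree(x); cudaFree(y);
    return 0;
}

The per-head quantization would be fused the same way, with one launch covering all heads; a sketch of that part follows the second hunk below.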
@@ -3212,7 +3212,7 @@ static bool llama_kv_cache_init(
        ggml_tensor * kv = ggml_new_tensor_2d(ctx, cache.type_k, kv_lora_rank + n_embd_head_qk_rope, kv_size);
        //ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_k, (kv_lora_rank + n_embd_head_qk_rope)*kv_size);
#else
        ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_v, (kv_lora_rank + n_embd_head_qk_rope)*kv_size);
        ggml_tensor * kv = ggml_new_tensor_2d(ctx, cache.type_v, kv_lora_rank + n_embd_head_qk_rope, kv_size);
#endif
        ggml_format_name(kv, "cache_kv_l%d", i);
        cache.kv_l.push_back(kv);
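The hunk above switches the per-layer MLA cache from a flat 1-D tensor to a 2-D tensor with one row of kv_lora_rank + n_embd_head_qk_rope elements per cache slot. The stored data is the same; the 2-D shape only makes the per-slot row explicit, presumably so that row-wise operations (views, quantized copies) can address individual cache slots directly. A tiny host-side sketch of the equivalence, with assumed sizes:

// Host-side sketch (illustrative sizes, not taken from the diff): both layouts
// hold the same (kv_lora_rank + n_embd_head_qk_rope) * kv_size floats; the 2-D
// tensor just makes "one row = one cache slot" explicit.
#include <cstdio>
#include <vector>

int main() {
    const int kv_lora_rank        = 512;   // assumed DeepSeek-style value
    const int n_embd_head_qk_rope = 64;    // assumed DeepSeek-style value
    const int row_len             = kv_lora_rank + n_embd_head_qk_rope;
    const int kv_size             = 4096;  // number of cache slots (assumed)

    std::vector<float> cache((size_t)row_len * kv_size);

    // 1-D layout: a flat buffer, the offset of cache slot i is computed by hand.
    float* slot7_flat = cache.data() + (size_t)7 * row_len;

    // 2-D layout: the same memory seen as kv_size rows of row_len floats,
    // so "row i" directly names cache slot i.
    auto rows = reinterpret_cast<float (*)[row_len]>(cache.data());
    float* slot7_row = rows[7];

    printf("same address: %s\n", slot7_flat == slot7_row ? "yes" : "no");
    return 0;
}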
@@ -13579,6 +13579,7 @@ struct llm_build_context {
                cb(wk_b, "wk_b", il);

                q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
                //if (q_nope->ne[1] <= 32) q_nope = ggml_cont(ctx0, q_nope);
                cb(q_nope, "q_nope_perm", il);

                struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope);
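In the hunk above, q_nope is permuted so the head index becomes the batch dimension of a single ggml_mul_mat against wk_b; the commented-out ggml_cont line suggests a contiguous copy is no longer needed here, since per the commit message the CUDA backend can now quantize non-contiguous tensors directly. Below is a simplified CUDA sketch of that kind of single-launch quantization of strided rows: per-row int8 with one scale, standing in for ggml's block formats. The kernel name and sizes are made up, and this is not the actual backend code.

// Sketch of quantizing a strided (non-contiguous) activation tensor in a single
// launch: each block handles one row, reading with an arbitrary row stride and
// writing a per-row scale plus int8 values.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int K = 128;  // elements per row (e.g. the q_nope head dimension)

__global__ void quantize_rows_q8(const float* src, int64_t row_stride,
                                 int8_t* q, float* scales, int nrows) {
    int row = blockIdx.x;
    if (row >= nrows) return;
    const float* x = src + (size_t)row*row_stride;   // stride may exceed K

    // per-row absolute maximum (single warp; K is a multiple of warpSize)
    float amax = 0.f;
    for (int k = threadIdx.x; k < K; k += blockDim.x) amax = fmaxf(amax, fabsf(x[k]));
    for (int off = warpSize/2; off > 0; off >>= 1)
        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, off));

    float d  = amax / 127.f;
    float id = d > 0.f ? 1.f/d : 0.f;
    if (threadIdx.x == 0) scales[row] = d;
    for (int k = threadIdx.x; k < K; k += blockDim.x)
        q[(size_t)row*K + k] = (int8_t)roundf(x[k]*id);
}

int main() {
    const int nrows = 16*32;            // e.g. n_head * n_tokens rows (assumed)
    const int64_t stride = K + 64;      // non-contiguous: padding between rows
    std::vector<float> h((size_t)nrows*stride, 0.5f);
    float *src, *scales; int8_t *q;
    cudaMalloc(&src, h.size()*sizeof(float));
    cudaMalloc(&scales, nrows*sizeof(float));
    cudaMalloc(&q, (size_t)nrows*K);
    cudaMemcpy(src, h.data(), h.size()*sizeof(float), cudaMemcpyHostToDevice);

    // one launch quantizes every row, regardless of the source stride
    quantize_rows_q8<<<nrows, 32>>>(src, stride, q, scales, nrows);
    cudaDeviceSynchronize();
    printf("quantize: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaFree(src); cudaFree(scales); cudaFree(q);
    return 0;
}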