From 3aaf602da5931cc4488895bbabebfaf8b4fdf33e Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Sat, 8 Feb 2025 12:08:17 +0200
Subject: [PATCH] Remove some unnecessary copies in the MLA attention

---
 src/llama.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 345a7ceb..67a80eb2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13463,7 +13463,7 @@ struct llm_build_context {
                                 0);
                     cb(kv_cache_trans, "kv_cache_trans", il);
 
-                    q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+                    //q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
                     q_pe = ggml_rope_ext(
                             ctx0, q_pe, inp_pos, nullptr,
                             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13472,7 +13472,7 @@ struct llm_build_context {
                     cb(q_pe, "q_pe", il);
 
                     // shared RoPE key
-                    k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
+                    //k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
                     k_pe = ggml_rope_ext(
                             ctx0, k_pe, inp_pos, nullptr,
                             n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -13508,8 +13508,9 @@ struct llm_build_context {
                     struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2_perm);
                     cb(kq_nope, "kq_nope", il);
 
-                    struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
-                    cb(q_pe_perm, "q_pe_perm", il);
+                    // q_pe_perm is not used anywhere: kq_pe below multiplies kr_cache by q_pe directly
+                    //struct ggml_tensor * q_pe_perm = ggml_permute(ctx0, q_pe, 0, 3, 2, 1);
+                    //cb(q_pe_perm, "q_pe_perm", il);
 
                     struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe);
                     cb(kq_pe, "kq_pe", il);
@@ -13517,6 +13518,7 @@ struct llm_build_context {
                     struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe);
                     cb(kq, "kq", il);
 
+                    // We need this copy because soft_max expects a contiguous tensor
                     kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3));
                     cb(kq, "kq_perm", il);