From fb0d5a995cd922227cfd98b97c39d5f680e43b8d Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Mon, 3 Nov 2025 18:42:20 +0200 Subject: [PATCH] RoPE cache (#887) * Introducing rope cache When computing RoPE, the rotation angles in each layer are exactly the same, and only depend on the token positions (and other constant, model dependent parameters). So, I wonder, why don't we compute the angles just once and then reuse for the Q and K RoPE in each layer? This commit does it as a POC on the CPU, and uses it in the Qwen3-MoE compute graph. * cuda: neox works * WIP * rope_cache: norm works * Fused rope+rope * Fused rope+rope (norm) * Fused rms+rms+rope+rope (neox) - not working * WIP * Also qwen3 * Add command line arg to disable rope cache * Disable RoPE cache if rope type is not neox or norm * Add missing break after merge with main * Fused fused_rms+fused_rms+rope+rope (with -mqkv) * Fused fused_rms+fused_rms+rope+rope (without -mqkv) --------- Co-authored-by: Iwan Kawrakow --- common/common.cpp | 9 +- common/common.h | 3 +- ggml/include/ggml.h | 22 ++ ggml/src/ggml-cuda.cu | 52 +++- ggml/src/ggml-cuda/rope.cu | 565 ++++++++++++++++++++++++++++++++++++ ggml/src/ggml-cuda/rope.cuh | 8 + ggml/src/ggml.c | 275 +++++++++++++++++- include/llama.h | 1 + src/llama-build-context.cpp | 134 ++++----- src/llama-build-context.h | 1 + src/llama-cparams.h | 1 + src/llama.cpp | 3 + 12 files changed, 1002 insertions(+), 72 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 098e8be2..674e975d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1106,6 +1106,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.fused_mmad = false; return true; } + if (arg == "-no-rcache" || arg == "--no-rope-cache") { + params.rope_cache = false; + return true; + } if (arg == "-ser" || arg == "--smart-expert-reduction") { CHECK_ARG auto values = string_split_pairs(argv[i], ','); @@ -1914,6 +1918,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-ger, --grouped-expert-routing", "enable grouped expert routing (default: %s)", params.grouped_expert_routing ? "enabled" : "disabled" }); options.push_back({ "*", "-no-fug, --no-fused-up-gate", "disaable fused up-gate (default: %s)", params.fused_up_gate ? "enabled" : "disabled" }); options.push_back({ "*", "-no-mmad, --no-fused-mul-multiadd", "disaable fused mul-multi_add (default: %s)", params.fused_mmad? "enabled" : "disabled" }); + options.push_back({ "*", "-no-rcache, --no-rope-cache", "disaable RoPE cache (default: %s)", params.rope_cache ? 
"enabled" : "disabled" }); options.push_back({ "*", "-ser, --smart-expert-reduction,","experts reduction (default: %d,%g)", params.min_experts, params.thresh_experts}); options.push_back({ "*", "-mqkv, --merge-qkv,", "merge Q,K,V (default: %d)", params.merge_qkv}); options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" @@ -2887,6 +2892,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.grouped_expert_routing = params.grouped_expert_routing; cparams.fused_up_gate = params.fused_up_gate; cparams.fused_mmad = params.fused_mmad; + cparams.rope_cache = params.rope_cache; cparams.min_experts = params.min_experts; cparams.thresh_experts = params.thresh_experts; cparams.only_active_experts = params.only_active_exps; @@ -4005,7 +4011,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "fused_moe: %s # default: false\n", params.fused_moe_up_gate ? "true" : "false"); fprintf(stream, "grouped_expert_routing: %s # default: false\n", params.grouped_expert_routing ? "true" : "false"); fprintf(stream, "fused_up_gate: %s # default: true\n", params.fused_up_gate ? "true" : "false"); - fprintf(stream, "fused_mmad: %s # default: true\n", params.fused_mmad? "true" : "false"); + fprintf(stream, "fused_mmad: %s # default: true\n", params.fused_mmad ? "true" : "false"); + fprintf(stream, "rope_cache: %s # default: true\n", params.rope_cache ? "true" : "false"); fprintf(stream, "ser: %d,%g # defaulr: -1,0\n", params.min_experts, params.thresh_experts); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); diff --git a/common/common.h b/common/common.h index 00ca2524..b202ad86 100644 --- a/common/common.h +++ b/common/common.h @@ -112,7 +112,7 @@ enum common_reasoning_format { enum common_webui { COMMON_WEBUI_NONE, COMMON_WEBUI_AUTO, - COMMON_WEBUI_LLAMACPP, + COMMON_WEBUI_LLAMACPP, }; common_webui common_webui_from_name(const std::string& format); @@ -249,6 +249,7 @@ struct gpt_params { bool fused_up_gate = true; // fused up*unary(gate) op bool fused_mmad = true; // fused mul+multi_add op bool grouped_expert_routing = false; // if to use grouped expert routing (BailingMoeV2 arch) + bool rope_cache = true; // if to use RoPE cache (for supported models) int min_experts = -1; float thresh_experts = 0; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d2cb164c..8104c045 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -639,6 +639,8 @@ extern "C" { GGML_OP_SOFT_MAX_BACK, GGML_OP_ROPE, GGML_OP_ROPE_BACK, + GGML_OP_ROPE_CACHE, + GGML_OP_ROPE_FAST, GGML_OP_CLAMP, GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, @@ -2020,6 +2022,26 @@ extern "C" { float beta_fast, float beta_slow); + GGML_API struct ggml_tensor * ggml_rope_cache( + struct ggml_context * ctx, + struct ggml_tensor * b, + struct ggml_tensor * c, + int ne0, + int n_dims, + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + GGML_API struct ggml_tensor * ggml_rope_fast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // clamp // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_clamp( diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 78456719..b478d593 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -3062,6 +3062,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg auto next = i < cgraph->n_nodes - 1 ? 
cgraph->nodes[i+1] : nullptr; + //printf("%4d %s(%s)\n", i, ggml_op_name(dst->op), dst->name); switch (dst->op) { case GGML_OP_ARGMAX: ggml_cuda_argmax(ctx, dst); @@ -3096,7 +3097,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ggml_are_same_shape(dst, cgraph->nodes[i+1]->src[1]) && cgraph->nodes[i+1] == cgraph->nodes[i+2]->src[0] && ops_are_same_device(cgraph, i, i+2)) { - //printf("Fusing add->add->fused_rms of %s, %s, %s\n", dst->name, cgraph->nodes[i+1]->name, cgraph->nodes[i+2]->name); ggml_cuda_op_fused_add_add_rms_norm(ctx, dst, cgraph->nodes[i+1], cgraph->nodes[i+2]); i += 2; } @@ -3244,7 +3244,27 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ggml_cuda_op_rms_norm(ctx, dst); break; case GGML_OP_FUSED_RMS_NORM: - if (i + 2 < cgraph->n_nodes && + //if (i + 6 < cgraph->n_nodes) { + // printf("=== Fused rms_norm(%s)\n", dst->name); + // for (int j = 1; j <= 6; ++j) printf(" %s(%s)\n", ggml_op_name(cgraph->nodes[i+j]->op), cgraph->nodes[i+j]->name); + //} + if (ENABLE_FUSION && i + 4 < cgraph->n_nodes && + cgraph->nodes[i+1]->op == GGML_OP_VIEW && + cgraph->nodes[i+2]->op == GGML_OP_FUSED_RMS_NORM && + cgraph->nodes[i+3]->op == GGML_OP_ROPE_FAST && + cgraph->nodes[i+4]->op == GGML_OP_ROPE_FAST && + ggml_cuda_op_fused_rms_rope_fast(ctx, cgraph->nodes[i+3], cgraph->nodes[i+4])) { + i += 4; + } + else if (ENABLE_FUSION && i + 4 < cgraph->n_nodes && + cgraph->nodes[i+1]->op == GGML_OP_ROPE_FAST && + cgraph->nodes[i+2]->op == GGML_OP_RESHAPE && + cgraph->nodes[i+3]->op == GGML_OP_FUSED_RMS_NORM && + cgraph->nodes[i+4]->op == GGML_OP_ROPE_FAST && + ggml_cuda_op_fused_rms_rope_fast(ctx, cgraph->nodes[i+1], cgraph->nodes[i+4])) { + i += 4; + } + else if (ENABLE_FUSION && i + 2 < cgraph->n_nodes && cgraph->nodes[i+1]->op == GGML_OP_VIEW && cgraph->nodes[i+2]->op == GGML_OP_FUSED_RMS_NORM && dst->ne[2] == 1 && cgraph->nodes[i+2]->ne[2] == 1) { @@ -3318,6 +3338,32 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_ROPE_BACK: ggml_cuda_op_rope_back(ctx, dst); break; + case GGML_OP_ROPE_FAST: + if (ENABLE_FUSION && i + 3 < cgraph->n_nodes && + (cgraph->nodes[i+1]->op == GGML_OP_RESHAPE || cgraph->nodes[i+1]->op == GGML_OP_VIEW) && + (cgraph->nodes[i+2]->op == GGML_OP_RESHAPE || cgraph->nodes[i+2]->op == GGML_OP_VIEW) && + cgraph->nodes[i+3]->op == GGML_OP_ROPE_FAST && + ggml_cuda_op_fused_rope_fast(ctx, dst, cgraph->nodes[i+3])) { + i += 3; + } + else if (ENABLE_FUSION && i + 2 < cgraph->n_nodes && + (cgraph->nodes[i+1]->op == GGML_OP_RESHAPE || cgraph->nodes[i+1]->op == GGML_OP_VIEW) && + cgraph->nodes[i+2]->op == GGML_OP_ROPE_FAST && + ggml_cuda_op_fused_rope_fast(ctx, dst, cgraph->nodes[i+2])) { + i += 2; + } + else if (ENABLE_FUSION && i + 1 < cgraph->n_nodes && + cgraph->nodes[i+1]->op == GGML_OP_ROPE_FAST && + ggml_cuda_op_fused_rope_fast(ctx, dst, cgraph->nodes[i+1])) { + i += 1; + } + else { + ggml_cuda_op_rope_fast(ctx, dst); + } + break; + case GGML_OP_ROPE_CACHE: + ggml_cuda_op_rope_cache(ctx, dst); + break; case GGML_OP_IM2COL: ggml_cuda_op_im2col(ctx, dst); break; @@ -4377,6 +4423,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_SOFT_CAP_MAX: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: + case GGML_OP_ROPE_FAST: + case GGML_OP_ROPE_CACHE: return true; //case GGML_OP_ROPE: // return ggml_is_contiguous(op->src[0]); diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index d058504c..b535854e 100644 --- 
a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -121,6 +121,226 @@ static __global__ void rope_neox( dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; } +static __global__ void rope_neox_fast(const float * src0, const float * src1, float * dst, int ne0, int ne1, int nelem, + int s01, int s02, int n_dims) { + int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); + + if (i >= nelem) { + return; + } + //i = i0 + i1*ne0 + i2*ne0*ne1; + int i2 = i / (ne0*ne1); i -= i2*ne0*ne1; + int i1 = i / ne0; + int i0 = i - i1*ne0; + + const int idst = i2*ne0*ne1 + i1*ne0 + i0/2; + const int ix = i2*s02 + i1*s01 + i0/2; + + if (i0 >= n_dims) { + dst[idst + i0/2 + 0] = src0[ix + i0/2 + 0]; + dst[idst + i0/2 + 1] = src0[ix + i0/2 + 1]; + + return; + } + + const float x0 = src0[ix + 0]; + const float x1 = src0[ix + n_dims/2]; + + const float cos_theta = src1[i2*ne0 + i0 + 0]; + const float sin_theta = src1[i2*ne0 + i0 + 1]; + + dst[idst + 0] = x0*cos_theta - x1*sin_theta; + dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; +} + +static __global__ void fused_rope_neox_fast(const float * src0_1, const float * src0_2, const float * src1, + float * dst_1, float * dst_2, int ne0, int ne1_1, int ne1_2, int nelem1, int nelem, + int s01_1, int s02_1, int s01_2, int s02_2, int n_dims) { + int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); + + if (i >= nelem) { + return; + } + const float * src0; + float * dst; + int ne1, s01, s02; + if (i < nelem1) { + src0 = src0_1; + dst = dst_1; + ne1 = ne1_1; + s01 = s01_1; + s02 = s02_1; + } else { + i -= nelem1; + src0 = src0_2; + dst = dst_2; + ne1 = ne1_2; + s01 = s01_2; + s02 = s02_2; + } + int i2 = i / (ne0*ne1); i -= i2*ne0*ne1; + int i1 = i / ne0; + int i0 = i - i1*ne0; + + const int idst = i2*ne0*ne1 + i1*ne0 + i0/2; + const int ix = i2*s02 + i1*s01 + i0/2; + + if (i0 >= n_dims) { + dst[idst + i0/2 + 0] = src0[ix + i0/2 + 0]; + dst[idst + i0/2 + 1] = src0[ix + i0/2 + 1]; + + return; + } + + const float x0 = src0[ix + 0]; + const float x1 = src0[ix + n_dims/2]; + + const float cos_theta = src1[i2*ne0 + i0 + 0]; + const float sin_theta = src1[i2*ne0 + i0 + 1]; + + dst[idst + 0] = x0*cos_theta - x1*sin_theta; + dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; +} + +static __global__ void fused_rms_rope_neox_fast(const float * src0_1, const float * src0_2, const float * src1, + const float * c_1, const float * c_2, + float * dst_1, float * dst_2, int ne0, int ne1_1, int ne1_2, + int s01_1, int s02_1, int s01_2, int s02_2, int n_dims, float eps) { + + int i0 = 2*threadIdx.y; + int i2 = blockIdx.x*blockDim.x + threadIdx.x; + int i1 = blockIdx.z*blockDim.z + threadIdx.z; + + const float * src0, *c; + float * dst; + int ne1, s01, s02; + + if (i1 < ne1_1) { + ne1 = ne1_1; + s01 = s01_1; s02 = s02_1; + src0 = src0_1 + i1*s01 + i2*s02; + dst = dst_1 + ne0*(i1 + i2*ne1); + c = c_1; + } else { + i1 -= ne1_1; + ne1 = ne1_2; + s01 = s01_2; s02 = s02_2; + src0 = src0_2 + i1*s01 + i2*s02; + dst = dst_2 + ne0*(i1 + i2*ne1); + c = c_2; + } + + float sum = i0 < ne0 ? src0[i0]*src0[i0] + src0[i0+1]*src0[i0+1] : 0.0f; + sum = warp_reduce_sum(sum); + if constexpr (CUDA_ROPE_BLOCK_SIZE > WARP_SIZE) { + __shared__ float s_sum[WARP_SIZE]; + int warp_id = (i0/2) / WARP_SIZE; + int lane_id = (i0/2) % WARP_SIZE; + if (lane_id == 0) s_sum[warp_id] = sum; + __syncthreads(); + sum = lane_id < CUDA_ROPE_BLOCK_SIZE / WARP_SIZE ? 
s_sum[lane_id] : 0; + sum = warp_reduce_sum(sum); + } + float norm = rsqrtf(sum/ne0 + eps); + + if (i0 >= ne0) return; + + if (i0 >= n_dims) { + dst[i0 + 0] = norm*c[i0 + 0]*src0[i0 + 0]; + dst[i0 + 1] = norm*c[i0 + 1]*src0[i0 + 1]; + return; + } + + const float cos_theta = src1[i2*ne0 + i0 + 0]; + const float sin_theta = src1[i2*ne0 + i0 + 1]; + + const float x0 = norm*c[i0/2 + 0]*src0[i0/2 + 0]; + const float x1 = norm*c[i0/2 + n_dims/2]*src0[i0/2 + n_dims/2]; + dst[i0/2 + 0] = x0*cos_theta - x1*sin_theta; + dst[i0/2 + n_dims/2] = x0*sin_theta + x1*cos_theta; + +} + +static __global__ void rope_norm_fast(const float * src0, const float * src1, float * dst, int ne0, int ne1, int nelem, + int s01, int s02, int n_dims) { + int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); + + if (i >= nelem) { + return; + } + + int i2 = i / (ne0*ne1); i -= i2*ne0*ne1; + int i1 = i / ne0; + int i0 = i - i1*ne0; + + const int idst = i2*ne0*ne1 + i1*ne0 + i0; + const int ix = i2*s02 + i1*s01 + i0; + + if (i0 >= n_dims) { + dst[idst + 0] = src0[ix + 0]; + dst[idst + 1] = src0[ix + 1]; + return; + } + + const float x0 = src0[ix + 0]; + const float x1 = src0[ix + 1]; + + const float cos_theta = src1[i2*ne0 + i0 + 0]; + const float sin_theta = src1[i2*ne0 + i0 + 1]; + + dst[idst + 0] = x0*cos_theta - x1*sin_theta; + dst[idst + 1] = x0*sin_theta + x1*cos_theta; +} + +static __global__ void fused_rope_norm_fast(const float * src0_1, const float * src0_2, const float * src1, + float * dst_1, float * dst_2, int ne0, int ne1_1, int ne1_2, int nelem1, int nelem, + int s01_1, int s02_1, int s01_2, int s02_2, int n_dims) { + int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); + + if (i >= nelem) { + return; + } + const float * src0; + float * dst; + int ne1, s01, s02; + if (i < nelem1) { + src0 = src0_1; + dst = dst_1; + ne1 = ne1_1; + s01 = s01_1; + s02 = s02_1; + } else { + i -= nelem1; + src0 = src0_2; + dst = dst_2; + ne1 = ne1_2; + s01 = s01_2; + s02 = s02_2; + } + int i2 = i / (ne0*ne1); i -= i2*ne0*ne1; + int i1 = i / ne0; + int i0 = i - i1*ne0; + + const int idst = i2*ne0*ne1 + i1*ne0 + i0; + const int ix = i2*s02 + i1*s01 + i0; + + if (i0 >= n_dims) { + dst[idst + 0] = src0[ix + 0]; + dst[idst + 1] = src0[ix + 1]; + return; + } + + const float x0 = src0[ix + 0]; + const float x1 = src0[ix + 1]; + + const float cos_theta = src1[i2*ne0 + i0 + 0]; + const float sin_theta = src1[i2*ne0 + i0 + 1]; + + dst[idst + 0] = x0*cos_theta - x1*sin_theta; + dst[idst + 1] = x0*sin_theta + x1*cos_theta; + +} + template static __global__ void rope_multi( const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, @@ -272,6 +492,84 @@ static void rope_neox_cuda( } } +static void rope_neox_fast_cuda(const float * src0, const float * src1, float * dst, int ne00, int ne01, int ne02, int s01, int s02, + int n_dims, cudaStream_t stream) { + GGML_ASSERT(ne00 % 2 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int n_blocks = (ne00*ne01*ne02 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(n_blocks, 1, 1); + rope_neox_fast<<>>(src0, src1, dst, ne00, ne01, ne01*ne02*ne02, s01, s02, n_dims); +} + +static void fused_rope_neox_fast_cuda(const float * src0_1, const float * src0_2, const float * src1, + float * dst_1, float * dst_2, int ne0, int ne1_1, int ne1_2, int ne2, int s01_1, int s02_1, int s01_2, int s02_2, + int n_dims, cudaStream_t stream) { + GGML_ASSERT(ne0 % 2 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int nelem1 = 
ne0*ne1_1*ne2; + const int nelem2 = ne0*ne1_2*ne2; + const int nelem = nelem1 + nelem2; + const int n_blocks = (nelem + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(n_blocks, 1, 1); + fused_rope_neox_fast<<>>(src0_1, src0_2, src1, dst_1, dst_2, ne0, ne1_1, ne1_2, nelem1, nelem, + s01_1, s02_1, s01_2, s02_2, n_dims); +} + +static void fused_rms_rope_neox_fast_cuda(const float * src0_1, const float * src0_2, const float * src1, + const float * c_1, const float * c_2, + float * dst_1, float * dst_2, int ne0, int ne1_1, int ne1_2, int ne2, int s01_1, int s02_1, int s01_2, int s02_2, + int n_dims, float eps, cudaStream_t stream) { + GGML_ASSERT(ne0 % 2 == 0); + GGML_ASSERT(ne0 <= 2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const dim3 block_nums(ne2, 1, ne1_1 + ne1_2); + fused_rms_rope_neox_fast<<>>(src0_1, src0_2, src1, c_1, c_2, dst_1, dst_2, ne0, ne1_1, ne1_2, + s01_1, s02_1, s01_2, s02_2, n_dims, eps); +} + +static void fused_rope_norm_fast_cuda(const float * src0_1, const float * src0_2, const float * src1, + float * dst_1, float * dst_2, int ne0, int ne1_1, int ne1_2, int ne2, int s01_1, int s02_1, int s01_2, int s02_2, + int n_dims, cudaStream_t stream) { + GGML_ASSERT(ne0 % 2 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int nelem1 = ne0*ne1_1*ne2; + const int nelem2 = ne0*ne1_2*ne2; + const int nelem = nelem1 + nelem2; + const int n_blocks = (nelem + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(n_blocks, 1, 1); + fused_rope_norm_fast<<>>(src0_1, src0_2, src1, dst_1, dst_2, ne0, ne1_1, ne1_2, nelem1, nelem, + s01_1, s02_1, s01_2, s02_2, n_dims); +} + +static void rope_norm_fast_cuda(const float * src0, const float * src1, float * dst, int ne00, int ne01, int ne02, int s01, int s02, + int n_dims, cudaStream_t stream) { + GGML_ASSERT(ne00 % 2 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int n_blocks = (ne00*ne01*ne02 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(n_blocks, 1, 1); + rope_norm_fast<<>>(src0, src1, dst, ne00, ne01, ne01*ne02*ne02, s01, s02, n_dims); +} + +static void rope_multi_fast_cuda(const float * src0, const float * src1, float * dst, int ne00, int ne01, int ne02, int s01, int s02, + int n_dims, cudaStream_t stream) { + GGML_ASSERT(ne00 % 2 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int n_blocks = (ne00*ne01*ne02 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(n_blocks, 1, 1); + // TODO + rope_neox_fast<<>>(src0, src1, dst, ne00, ne01, ne02, s01, s02, n_dims); +} + +static void rope_vision_fast_cuda(const float * src0, const float * src1, float * dst, int ne00, int ne01, int ne02, int s01, int s02, + int n_dims, cudaStream_t stream) { + GGML_ASSERT(ne00 % 2 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE, 1, 1); + const int n_blocks = (ne00*ne01*ne02 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(n_blocks, 1, 1); + // TODO + rope_neox_fast<<>>(src0, src1, dst, ne00, ne01, ne02, s01, s02, n_dims); +} + template static void rope_multi_cuda( const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr, @@ -448,3 +746,270 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_op_rope_impl(ctx, dst); } + +template 
+static __global__ void k_rope_cache(int nelem, int ne0, float * dst, const int * pos, const float * freq_factors, + float theta_scale, float freq_scale, rope_corr_dims corr_dims, float ext_factor, float attn_factor) { + + int i = 2*(blockIdx.x*blockDim.x + threadIdx.x); + if (i >= nelem) { + return; + } + int i2 = i / ne0; + int i0 = i % ne0; + + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); + + const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; + + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, dst[i], dst[i+1]); + if constexpr (!forward) { + dst[i+1] *= -1; + } +} + +template +void ggml_cuda_op_rope_cache_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + int sections[4]; + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int)*4); + + const struct ggml_tensor * tpos = dst->src[0]; + GGML_ASSERT(tpos->type == GGML_TYPE_I32); + GGML_ASSERT(tpos->ne[0] == dst->ne[1]); + + GGML_ASSERT(n_dims <= dst->ne[0]); + GGML_ASSERT(n_dims % 2 == 0); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); + + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_mrope) { + GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0); + } + + if (is_vision) { + GGML_ASSERT(n_dims == dst->ne[0]); + } + + const float * freq_factors = NULL; + if (dst->src[1] != NULL) { + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[1]->ne[0] >= n_dims / 2); + freq_factors = (const float *) dst->src[1]->data; + } + + const int * pos = (const int *) dst->src[0]->data; + + if (dst->src[1]!= nullptr) { + freq_factors = (const float *) dst->src[1]->data; + } + + int nelem = ggml_nelements(dst); + int nblocks = (nelem + 2*CUDA_ROPE_BLOCK_SIZE - 1)/(2*CUDA_ROPE_BLOCK_SIZE); + + if (freq_factors) { + k_rope_cache<<>>(ggml_nelements(dst), dst->ne[0], + (float *)dst->data, pos, freq_factors, theta_scale, freq_scale, corr_dims, ext_factor, attn_factor); + } else { + k_rope_cache<<>>(ggml_nelements(dst), dst->ne[0], + (float *)dst->data, pos, freq_factors, theta_scale, freq_scale, corr_dims, ext_factor, attn_factor); + } +} + +void ggml_cuda_op_rope_cache(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_rope_cache_impl(ctx, dst); +} + +void ggml_cuda_op_rope_fast(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + 
GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == dst->type); + + const int64_t ne00 = src0->ne[0]; // head dims + const int64_t ne01 = src0->ne[1]; // num heads + const int64_t ne02 = src0->ne[2]; // num heads + const int64_t nr = ggml_nrows(src0); + + const size_t s01 = src0->nb[1] / ggml_type_size(src0->type); + const size_t s02 = src0->nb[2] / ggml_type_size(src0->type); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((const int32_t *) src1->op_params)[1]; + const int mode = ((const int32_t *) src1->op_params)[2]; + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_vision) { + GGML_ASSERT(n_dims == ne00/2); + } + + // compute + if (is_neox) { + //printf("Using neox\n"); + rope_neox_fast_cuda( + (const float *)src0->data, (const float *)src1->data, (float *)dst->data, ne00, ne01, ne02, s01, s02, n_dims, stream); + } else if (is_mrope && !is_vision) { + rope_multi_fast_cuda( + (const float *)src0->data, (const float *)src1->data, (float *)dst->data, ne00, ne01, s01, s02, n_dims, nr, stream); + } else if (is_vision) { + rope_vision_fast_cuda( + (const float *)src0->data, (const float *)src1->data, (float *)dst->data, ne00, ne01, s01, s02, n_dims, nr, stream); + } else { + //printf("Using norm\n"); + rope_norm_fast_cuda( + (const float *)src0->data, (const float *)src1->data, (float *)dst->data, ne00, ne01, s01, s02, n_dims, nr, stream); + } +} + +bool ggml_cuda_op_fused_rope_fast(ggml_backend_cuda_context & ctx, ggml_tensor * dst1, ggml_tensor * dst2) { + + if (dst1->src[1] != dst2->src[1]) return false; + + const ggml_tensor * src0_1 = dst1->src[0]; + const ggml_tensor * src0_2 = dst2->src[0]; + const ggml_tensor * src1 = dst1->src[1]; + + if (src0_1->type != GGML_TYPE_F32) return false; + if (src0_2->type != GGML_TYPE_F32) return false; + if (dst1->type != GGML_TYPE_F32) return false; + if (dst2->type != GGML_TYPE_F32) return false; + if (src1->type != dst1->type) return false; + + if (src0_1->ne[0] != src0_2->ne[0]) return false; + if (src0_1->ne[2] != src0_2->ne[2]) return false; + if (src0_1->ne[3] != src0_2->ne[3]) return false; + + const int n_dims = ((const int32_t *) src1->op_params)[1]; + const int mode = ((const int32_t *) src1->op_params)[2]; + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_vision || is_mrope) return false; // not implemented + + const int64_t ne00 = src0_1->ne[0]; // head dims + const int64_t ne02 = src0_1->ne[2]; // num tokens + const int64_t ne01_1 = src0_1->ne[1]; // num heads + const int64_t ne01_2 = src0_2->ne[1]; // num heads + + const size_t s01_1 = src0_1->nb[1] / ggml_type_size(src0_1->type); + const size_t s02_1 = src0_1->nb[2] / ggml_type_size(src0_1->type); + const size_t s01_2 = src0_2->nb[1] / ggml_type_size(src0_2->type); + const size_t s02_2 = src0_2->nb[2] / ggml_type_size(src0_2->type); + + if (is_vision) { + GGML_ASSERT(n_dims == ne00/2); + } + + // compute + if (is_neox) { + fused_rope_neox_fast_cuda( + (const float *)src0_1->data, (const float *)src0_2->data, (const float *)src1->data, + (float *)dst1->data, (float *)dst2->data, ne00, ne01_1, ne01_2, ne02, s01_1, s02_1, s01_2, s02_2, n_dims, ctx.stream()); + } else { + fused_rope_norm_fast_cuda( + (const float *)src0_1->data, (const float *)src0_2->data, (const float *)src1->data, + (float 
*)dst1->data, (float *)dst2->data, ne00, ne01_1, ne01_2, ne02, s01_1, s02_1, s01_2, s02_2, n_dims, ctx.stream()); + } + return true; +} + +bool ggml_cuda_op_fused_rms_rope_fast(ggml_backend_cuda_context & ctx, ggml_tensor * dst1, ggml_tensor * dst2) { + + if (dst1->src[1] != dst2->src[1]) return false; + + const auto rms_1 = dst1->src[0]; + const auto rms_2 = dst2->src[0]; + const auto src1 = dst1->src[1]; + + if (rms_1->op != GGML_OP_FUSED_RMS_NORM) return false; + if (rms_2->op != GGML_OP_FUSED_RMS_NORM) return false; + + const auto src0_1 = rms_1->src[0]; + const auto src0_2 = rms_2->src[0]; + const auto c_1 = rms_1->src[1]; + const auto c_2 = rms_2->src[1]; + + if (src0_1->type != GGML_TYPE_F32) return false; + if (src0_2->type != GGML_TYPE_F32) return false; + if (dst1->type != GGML_TYPE_F32) return false; + if (dst2->type != GGML_TYPE_F32) return false; + if (src1->type != dst1->type) return false; + if (c_1->type != GGML_TYPE_F32) return false; + if (c_2->type != GGML_TYPE_F32) return false; + + if (src0_1->ne[0] != src0_2->ne[0]) return false; + if (src0_1->ne[2] != src0_2->ne[2]) return false; + if (src0_1->ne[3] != src0_2->ne[3]) return false; + if (src0_1->ne[0] > 2*CUDA_ROPE_BLOCK_SIZE) return false; + + GGML_ASSERT(ggml_nrows(c_1) == 1); + GGML_ASSERT(ggml_nrows(c_2) == 1); + GGML_ASSERT(c_1->ne[0] == src0_1->ne[0]); + GGML_ASSERT(c_2->ne[0] == src0_2->ne[0]); + + const int n_dims = ((const int32_t *) src1->op_params)[1]; + const int mode = ((const int32_t *) src1->op_params)[2]; + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_vision || is_mrope) return false; // not implemented + if (!is_neox) return false; // TODO + + float eps1, eps2; + memcpy(&eps1, rms_1->op_params, sizeof(float)); + memcpy(&eps2, rms_2->op_params, sizeof(float)); + if (eps1 != eps2) return false; + + const int64_t ne00 = src0_1->ne[0]; // head dims + const int64_t ne02 = src0_1->ne[2]; // num tokens + const int64_t ne01_1 = src0_1->ne[1]; // num heads + const int64_t ne01_2 = src0_2->ne[1]; // num heads + + const size_t s01_1 = src0_1->nb[1] / ggml_type_size(src0_1->type); + const size_t s02_1 = src0_1->nb[2] / ggml_type_size(src0_1->type); + const size_t s01_2 = src0_2->nb[1] / ggml_type_size(src0_2->type); + const size_t s02_2 = src0_2->nb[2] / ggml_type_size(src0_2->type); + + if (is_vision) { + GGML_ASSERT(n_dims == ne00/2); + } + + // compute + fused_rms_rope_neox_fast_cuda( + (const float *)src0_1->data, (const float *)src0_2->data, (const float *)src1->data, + (const float *)c_1->data, (const float *)c_2->data, + (float *)dst1->data, (float *)dst2->data, ne00, ne01_1, ne01_2, ne02, s01_1, s02_1, s01_2, s02_2, n_dims, eps1, + ctx.stream()); + return true; +} diff --git a/ggml/src/ggml-cuda/rope.cuh b/ggml/src/ggml-cuda/rope.cuh index 9139f3b2..4bf0ed42 100644 --- a/ggml/src/ggml-cuda/rope.cuh +++ b/ggml/src/ggml-cuda/rope.cuh @@ -5,3 +5,11 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_rope_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_rope_cache(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_rope_fast(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +bool ggml_cuda_op_fused_rope_fast(ggml_backend_cuda_context & ctx, ggml_tensor * dst1, ggml_tensor * dst2); + +bool ggml_cuda_op_fused_rms_rope_fast(ggml_backend_cuda_context & ctx, ggml_tensor * dst1, ggml_tensor 
* dst2); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 7d4c4feb..5a135cbb 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4242,6 +4242,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "SOFT_MAX_BACK", "ROPE", "ROPE_BACK", + "ROPE_CACHE", + "ROPE_FAST", "CLAMP", "CONV_TRANSPOSE_1D", "IM2COL", @@ -4290,7 +4292,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89"); +static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4347,6 +4349,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "soft_max_back(x)", "rope(x)", "rope_back(x)", + "rope_cache(pos)", + "rope_fast(x)", "clamp(x)", "conv_transpose_1d(x)", "im2col(x)", @@ -4395,7 +4399,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)," }; -static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89"); +static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -8664,6 +8668,80 @@ struct ggml_tensor * ggml_soft_max_back_inplace( // ggml_rope +struct ggml_tensor * ggml_rope_cache( + struct ggml_context * ctx, + struct ggml_tensor * b, + struct ggml_tensor * c, + int ne0, + int n_dims, + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow) { + GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported"); + + GGML_ASSERT(ggml_is_vector(b)); + GGML_ASSERT(b->type == GGML_TYPE_I32); + + bool mrope_used = mode & GGML_ROPE_TYPE_MROPE; + GGML_ASSERT(!mrope_used); + //if (mrope_used) { + // GGML_ASSERT(ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token + //} else { + // GGML_ASSERT(a->ne[2] == b->ne[0]); + //} + + if (c) { + GGML_ASSERT(c->type == GGML_TYPE_F32); + GGML_ASSERT(c->ne[0] >= n_dims / 2); + } + + struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne0, b->ne[0]); + + int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig }; + memcpy(params + 5, &freq_base, sizeof(float)); + memcpy(params + 6, &freq_scale, sizeof(float)); + memcpy(params + 7, &ext_factor, sizeof(float)); + memcpy(params + 8, &attn_factor, sizeof(float)); + memcpy(params + 9, &beta_fast, sizeof(float)); + memcpy(params + 10, &beta_slow, sizeof(float)); + memset(params + 11, 0, sizeof(int32_t) * GGML_MROPE_SECTIONS); + //if (mrope_used) { + // memcpy(params + 11, sections, sizeof(int32_t) * GGML_MROPE_SECTIONS); + //} else { + // memset(params + 11, 0, sizeof(int32_t) * GGML_MROPE_SECTIONS); + //} + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_ROPE_CACHE; + result->src[0] = b; + result->src[1] = c; + + return result; +} + +struct ggml_tensor * ggml_rope_fast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(a->ne[0] <= b->ne[0]); + GGML_ASSERT(a->ne[2] <= b->ne[1]); + GGML_ASSERT(a->type == GGML_TYPE_F32); + GGML_ASSERT(b->type == GGML_TYPE_F32); + + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_ROPE_FAST; + result->src[0] = a; + result->src[1] = b; + + return result; +} + static struct ggml_tensor * ggml_rope_impl( struct ggml_context * ctx, struct ggml_tensor * a, @@ -18396,6 +18474,181 @@ static void ggml_mrope_cache_init( } } +static void ggml_compute_forward_rope_cache_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, + const bool 
forward) { + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + int sections[4]; + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; + + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + memcpy(§ions, (int32_t *) dst->op_params + 11, sizeof(int)*4); + + const struct ggml_tensor * tpos = dst->src[0]; + GGML_ASSERT(tpos->type == GGML_TYPE_I32); + GGML_ASSERT(tpos->ne[0] == dst->ne[1]); + + GGML_ASSERT(n_dims <= dst->ne[0]); + GGML_ASSERT(n_dims % 2 == 0); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + if (is_mrope) { + GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0); + } + + if (is_vision) { + GGML_ASSERT(n_dims == dst->ne[0]); + } + + const float * freq_factors = NULL; + if (dst->src[1] != NULL) { + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[1]->ne[0] >= n_dims / 2); + freq_factors = (const float *) dst->src[1]->data; + } + + // backward process uses inverse rotation by cos and sin. + // cos and sin build a rotation matrix, where the inverse is the transpose. + // this essentially just switches the sign of sin. + const float sin_sign = forward ? 
1.0f : -1.0f; + + const int32_t * pos = (const int32_t *) tpos->data; + + int ith = params->ith; + int nth = params->nth; + const int npt = (dst->ne[1] + nth - 1)/nth; + + int first = npt*ith; + int last = MIN(dst->ne[1], first + npt); + + int64_t ne0 = dst->ne[0]; + int64_t ne2 = dst->ne[1]; + + for (int i1 = first; i1 < last; ++i1) { + float * cache = (float *)((char *)dst->data + dst->nb[1]*i1); + if (!is_mrope) { + const int64_t p = pos[i1]; + ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); + } + else { + const int64_t p_t = pos[i1]; + const int64_t p_h = pos[i1 + ne2]; + const int64_t p_w = pos[i1 + ne2 * 2]; + const int64_t p_e = pos[i1 + ne2 * 3]; + ggml_mrope_cache_init( + p_t, p_h, p_w, p_e, sections, is_vision, + freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); + } + } +} + +static void ggml_compute_forward_rope_fast_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[0] <= src1->ne[0]); + GGML_ASSERT(src0->ne[2] <= src1->ne[1]); + + const int n_dims = ((const int32_t *) src1->op_params)[1]; + const int mode = ((const int32_t *) src1->op_params)[2]; + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE; // ggml_rope_multi, multimodal rotary position embedding + const bool is_vision = mode == GGML_ROPE_TYPE_VISION; + + const int ith = params->ith; + const int nth = params->nth; + + const int nrows = ggml_nrows(src0); + const int npt = (nrows + nth - 1)/nth; + const int first = ith*npt; + const int last = MIN(first + npt, nrows); + + const int ne02 = src0->ne[2]; + const int ne01 = src0->ne[1]; + const int ne00 = src0->ne[0]; + + for (int ir = first; ir < last; ++ir) { + const int i3 = ir/(ne01*ne02); + const int i2 = (ir - i3*ne01*ne02)/ne01; + const int i1 = ir - i3*ne01*ne02 - i2*ne01; + const float * c = (const float *)((const char *)src1->data + i2*src1->nb[1]); + const float * x = (const float *)((const char *)src0->data + i1*src0->nb[1] + i2*src0->nb[2] + i3*src0->nb[3]); + float * y = ( float *)(( char *)dst->data + i1* dst->nb[1] + i2* dst->nb[2] + i3* dst->nb[3]); + if (is_neox || is_mrope) { + const int n_gap = is_vision ? 
n_dims : n_dims/2; + for (int i0 = 0; i0 < n_dims; i0 += 2) { + const int64_t ic = i0/2; + + const float cos_theta = c[i0 + 0]; + const float sin_theta = c[i0 + 1]; + + const float x0 = x[ic]; + const float x1 = x[ic+n_gap]; + + y[ic ] = x0*cos_theta - x1*sin_theta; + y[ic+n_gap] = x0*sin_theta + x1*cos_theta; + } + } else { + for (int i0 = 0; i0 < n_dims; i0 += 2) { + const float cos_theta = c[i0 + 0]; + const float sin_theta = c[i0 + 1]; + + const float x0 = x[i0+0]; + const float x1 = x[i0+1]; + + y[i0+0] = x0*cos_theta - x1*sin_theta; + y[i0+1] = x0*sin_theta + x1*cos_theta; + } + } + + if (is_vision) { + for (int i0 = n_dims; i0 < ne00; i0 += 2) { + const int64_t ic = i0/2; + + const float cos_theta = c[i0 + 0]; + const float sin_theta = c[i0 + 1]; + + const float x0 = x[ic]; + const float x1 = x[ic+n_dims]; + + y[ic] = x0*cos_theta - x1*sin_theta; + y[ic+n_dims] = x0*sin_theta + x1*cos_theta; + } + } else { + // fill the remain channels with data from src tensor + for (int i0 = n_dims; i0 < ne00; i0 += 2) { + y[i0+0] = x[i0+0]; + y[i0+1] = x[i0+1]; + } + } + } +} + static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst, @@ -22584,6 +22837,14 @@ static int ggml_compute_forward(struct ggml_compute_params * params, struct ggml { ggml_compute_forward_rope_back(params, tensor); } break; + case GGML_OP_ROPE_CACHE: + { + ggml_compute_forward_rope_cache_f32(params, tensor, true); + } break; + case GGML_OP_ROPE_FAST: + { + ggml_compute_forward_rope_fast_f32(params, tensor); + } break; case GGML_OP_CLAMP: { ggml_compute_forward_clamp(params, tensor); @@ -23635,6 +23896,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor zero_table); } } break; + case GGML_OP_ROPE_CACHE: + { + GGML_ABORT("fatal error"); // TODO: not implemented + } + case GGML_OP_ROPE_FAST: + { + GGML_ABORT("fatal error"); // TODO: not implemented + } case GGML_OP_GLU: { GGML_ABORT("fatal error"); // TODO: not implemented @@ -24408,6 +24677,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: + case GGML_OP_ROPE_CACHE: + case GGML_OP_ROPE_FAST: case GGML_OP_ADD_REL_POS: { n_tasks = n_threads; diff --git a/include/llama.h b/include/llama.h index ef00a1fb..e41c0b5f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -427,6 +427,7 @@ extern "C" { bool grouped_expert_routing; // whether to use grouped expert routing (BailingMoeV2 arch) bool fused_up_gate; // whether to use fused up/gate op [EXPERIMENTAL] bool fused_mmad; // whether to use fused mul+multi_add op [EXPERIMENTAL] + bool rope_cache; // whether to use RoPE cache [EXPERIMENTAL] int min_experts; float thresh_experts; bool only_active_experts; diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 24150b11..440db5aa 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -51,6 +51,7 @@ llm_build_context::llm_build_context( grouped_expert_routing(cparams.grouped_expert_routing), fused_up_gate (cparams.fused_up_gate), fused_mmad (cparams.fused_mmad), + rope_cache (cparams.rope_cache), min_experts (cparams.min_experts), thresh_experts (cparams.thresh_experts), pooling_type (cparams.pooling_type), @@ -3372,6 +3373,10 @@ ggml_cgraph * llm_build_context::build_qwen3() { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + auto rope_cache = cparams.rope_cache && (rope_type == 
LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? + ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -3388,14 +3393,16 @@ ggml_cgraph * llm_build_context::build_qwen3() { model.layers[il].wv, nullptr, model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0, il); - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); + if (rope_cache) { + Qcur = ggml_rope_fast(ctx0, Qcur, rope_cache); + Kcur = ggml_rope_fast(ctx0, Kcur, rope_cache); + } else { + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, @@ -3468,6 +3475,9 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + auto rope_cache = cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? + ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -3483,18 +3493,16 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { model.layers[il].wq, nullptr, model.layers[il].wk, nullptr, model.layers[il].wv, nullptr, model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0, il); - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); + if (rope_cache) { + Qcur = ggml_rope_fast(ctx0, Qcur, rope_cache); + Kcur = ggml_rope_fast(ctx0, Kcur, rope_cache); + } else { + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); cb(Kcur, "Kcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, @@ -6083,6 +6091,10 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { // output token IDs (for last layer cropping) struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + auto rope_cache = cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? 
+ ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; + // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; @@ -6103,12 +6115,15 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0.f, il); // apply RoPE - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); + if (rope_cache) { + Qcur = ggml_rope_fast(ctx0, Qcur, rope_cache); + Kcur = ggml_rope_fast(ctx0, Kcur, rope_cache); + } else { + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); @@ -7769,6 +7784,9 @@ ggml_cgraph * llm_build_context::build_hunyuan_moe() { ggml_cgraph * llm_build_context::build_openai_moe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + ggml_tensor * cur; ggml_tensor * inpL; @@ -7786,6 +7804,10 @@ ggml_cgraph * llm_build_context::build_openai_moe() { const int sliding_window_pattern = 2; + auto rope_cache = cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? 
+ ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; + for (int il = 0; il < n_layer; ++il) { const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); ggml_tensor * inpSA = inpL; @@ -7805,30 +7827,18 @@ ggml_cgraph * llm_build_context::build_openai_moe() { model.layers[il].wv, model.layers[il].bv, nullptr, nullptr, 0.0f, il); - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, - beta_fast, beta_slow); + if (rope_cache) { + Qcur = ggml_rope_fast(ctx0, Qcur, rope_cache); + Kcur = ggml_rope_fast(ctx0, Kcur, rope_cache); + } else { + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } cb(Qcur, "Qcur", il); - - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, - attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - //auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, model.layers[il].wq, model.layers[il].bq, - // model.layers[il].wk, model.layers[il].bk, - // model.layers[il].wv, model.layers[il].bv, 0.f, il); - - //Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr, - // n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, - // beta_fast, beta_slow); - //cb(Qcur, "Qcur", il); - - //Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr, - // n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, - // attn_factor, beta_fast, beta_slow); - //cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, kq_scale, cb, il, model.layers[il].attn_sinks, is_sliding ? hparams.n_swa : 0); @@ -7916,6 +7926,10 @@ ggml_cgraph * llm_build_context::build_bailingmoe2() { const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; + auto rope_cache = cparams.rope_cache && (rope_type == LLAMA_ROPE_TYPE_NEOX || rope_type == LLAMA_ROPE_TYPE_NORM) ? 
+ ggml_rope_cache(ctx0, inp_pos, nullptr, n_embd_head, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; + for (int il = 0; il < n_transformer_layers; ++il) { ggml_tensor * inpSA = inpL; @@ -7929,27 +7943,15 @@ ggml_cgraph * llm_build_context::build_bailingmoe2() { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, 0.0f, il); - //cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); - //cb(cur, "wqkv", il); - - //ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd)); - //ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd)); - ////ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - //ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); - - //Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il); - //cb(Qcur, "Qcur_normed", il); - - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); - - //Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il); - //cb(Kcur, "Kcur_normed", il); - - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow); + if (rope_cache) { + Qcur = ggml_rope_fast(ctx0, Qcur, rope_cache); + Kcur = ggml_rope_fast(ctx0, Kcur, rope_cache); + } else { + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); diff --git a/src/llama-build-context.h b/src/llama-build-context.h index 150f3591..a959d6d6 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ -81,6 +81,7 @@ struct llm_build_context { const bool grouped_expert_routing; const bool fused_up_gate; const bool fused_mmad; + const bool rope_cache; const int min_experts; const float thresh_experts; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 10a777f9..3c32e404 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -37,6 +37,7 @@ struct llama_cparams { bool grouped_expert_routing; bool fused_up_gate; bool fused_mmad; + bool rope_cache; int min_experts; float thresh_experts; diff --git a/src/llama.cpp b/src/llama.cpp index 2312fa2f..b2ffb0ec 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3833,6 +3833,7 @@ struct llama_context_params llama_context_default_params() { /*.grouped_expert_routing =*/ false, /*.fused_up_gate =*/ true, /*.fused_mmad =*/ true, + /*.rope_cache =*/ true, /*.min_experts =*/ -1, /*.thtesh_experts =*/ 0.0f, /*.only_active_experts =*/ false, @@ -4134,6 +4135,7 @@ struct llama_context * llama_new_context_with_model( cparams.grouped_expert_routing = params.grouped_expert_routing; cparams.fused_up_gate = params.fused_up_gate; cparams.fused_mmad = params.fused_mmad; + cparams.rope_cache = 
params.rope_cache; cparams.min_experts = params.min_experts; cparams.thresh_experts = params.thresh_experts; @@ -4216,6 +4218,7 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_INFO("%s: grouped er = %d\n", __func__, cparams.grouped_expert_routing); LLAMA_LOG_INFO("%s: fused_up_gate = %d\n", __func__, cparams.fused_up_gate); LLAMA_LOG_INFO("%s: fused_mmad = %d\n", __func__, cparams.fused_mmad); + LLAMA_LOG_INFO("%s: rope_cache = %d\n", __func__, cparams.rope_cache); LLAMA_LOG_INFO("%s: ser = %d, %g\n", __func__, cparams.min_experts, cparams.thresh_experts); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
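
What the patch exploits: for rope type "neox" or "norm", the rotation angle of dimension pair i0 at token position p is theta = p * freq_base^(-i0/n_dims), optionally adjusted by YaRN and per-dimension frequency factors, and nothing in that expression depends on the layer index or on whether Q or K is being rotated. The (cos, sin) pairs can therefore be computed once per graph into a [n_embd_head, n_tokens] cache (GGML_OP_ROPE_CACHE) and reused by every Q and K rotation (GGML_OP_ROPE_FAST). Below is a minimal standalone sketch of that two-step pattern for the NeoX layout; the kernel names are illustrative and the angle formula is simplified (no YaRN extrapolation, no freq_factors), so it is not a drop-in for the patch's kernels.

// rope_cache_sketch.cu -- illustrative only; simplified relative to the patch
#include <cuda_runtime.h>
#include <cstdio>
#include <cmath>
#include <vector>

// Step 1: one interleaved (cos, sin) pair per (token, even dimension index).
// cache layout: cache[token*ne0 + i0] = cos(theta), cache[token*ne0 + i0 + 1] = sin(theta)
__global__ void fill_rope_cache(float * cache, const int * pos, int ne0, int n_tokens, float theta_scale) {
    const int i = 2*(blockIdx.x*blockDim.x + threadIdx.x);
    if (i >= ne0*n_tokens) return;
    const int it = i / ne0;          // token index
    const int i0 = i % ne0;          // even dimension index (ne0 is even)
    const float theta = pos[it] * powf(theta_scale, i0/2.0f);
    cache[i + 0] = cosf(theta);
    cache[i + 1] = sinf(theta);
}

// Step 2: NeoX-style rotation of a contiguous [ne0, n_heads, n_tokens] activation,
// reading the angles from the shared cache instead of recomputing them per layer.
__global__ void apply_rope_neox(const float * x, const float * cache, float * y,
                                int ne0, int n_heads, int n_tokens) {
    const int i = 2*(blockIdx.x*blockDim.x + threadIdx.x);
    if (i >= ne0*n_heads*n_tokens) return;
    const int it  = i / (ne0*n_heads);          // token index
    const int i0  = (i - it*ne0*n_heads) % ne0; // even dimension index
    const int row = i - i0;                     // start of this head's row
    const float c  = cache[it*ne0 + i0 + 0];
    const float s  = cache[it*ne0 + i0 + 1];
    const float x0 = x[row + i0/2];
    const float x1 = x[row + i0/2 + ne0/2];
    y[row + i0/2        ] = x0*c - x1*s;
    y[row + i0/2 + ne0/2] = x0*s + x1*c;
}

int main() {
    const int ne0 = 128, n_heads = 8, n_tokens = 4;      // head dim must be even
    const float freq_base   = 10000.0f;
    const float theta_scale = powf(freq_base, -2.0f/ne0);

    std::vector<int>   pos = {0, 1, 2, 3};
    std::vector<float> x(ne0*n_heads*n_tokens, 1.0f);

    int * d_pos; float * d_cache, * d_x, * d_y;
    cudaMalloc(&d_pos,   pos.size()*sizeof(int));
    cudaMalloc(&d_cache, (size_t)ne0*n_tokens*sizeof(float));
    cudaMalloc(&d_x,     x.size()*sizeof(float));
    cudaMalloc(&d_y,     x.size()*sizeof(float));
    cudaMemcpy(d_pos, pos.data(), pos.size()*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x,   x.data(),   x.size()*sizeof(float), cudaMemcpyHostToDevice);

    // fill the cache once per batch of positions ...
    fill_rope_cache<<<(ne0*n_tokens/2 + 255)/256, 256>>>(d_cache, d_pos, ne0, n_tokens, theta_scale);
    // ... then reuse it for every Q and K rotation in every layer (here: a single call)
    apply_rope_neox<<<(ne0*n_heads*n_tokens/2 + 255)/256, 256>>>(d_x, d_cache, d_y, ne0, n_heads, n_tokens);

    cudaDeviceSynchronize();
    printf("sketch done: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaFree(d_pos); cudaFree(d_cache); cudaFree(d_x); cudaFree(d_y);
    return 0;
}

Compiled with nvcc, the sketch fills the cache once and applies it to a single activation; in the real graph the apply step runs twice per layer (Q and K) against the same cache.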
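
On the graph side, the pattern repeated in build_qwen3, build_qwen3moe, build_glm4_moe, build_openai_moe and build_bailingmoe2 is: build one rope cache tensor per graph before the layer loop, rotate every Q and K with ggml_rope_fast, and fall back to ggml_rope_ext when the rope type is neither NEOX nor NORM or when the cache is disabled with -no-rcache/--no-rope-cache. A reduced sketch against the two entry points added to ggml.h follows; the wrapper functions and the numeric parameter values are illustrative stand-ins, only ggml_rope_cache and ggml_rope_fast (and their argument order) come from the patch.

#include "ggml.h"

// Illustrative wrapper, not part of the patch: build the cache once per graph,
// before the layer loop. inp_pos is the usual I32 position tensor; the numeric
// parameters are placeholders. The result is an F32 tensor of shape
// [head_dim, n_tokens] holding interleaved cos/sin values.
static ggml_tensor * make_rope_cache(ggml_context * ctx, ggml_tensor * inp_pos) {
    const int   head_dim   = 128;        // n_embd_head
    const int   n_rot      = 128;        // number of rotated dimensions
    const int   n_ctx_orig = 32768;
    const float freq_base  = 1000000.0f, freq_scale = 1.0f;
    const float ext_factor = 0.0f, attn_factor = 1.0f, beta_fast = 32.0f, beta_slow = 1.0f;
    return ggml_rope_cache(ctx, inp_pos, /*freq_factors=*/nullptr,
                           head_dim, n_rot, GGML_ROPE_TYPE_NEOX, n_ctx_orig,
                           freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
}

// Illustrative wrapper: rotate the [head_dim, n_head, n_tokens] Q and K tensors
// of one layer through the shared cache instead of calling ggml_rope_ext twice.
static void rope_qk_with_cache(ggml_context * ctx, ggml_cgraph * gf,
                               ggml_tensor ** Qcur, ggml_tensor ** Kcur,
                               ggml_tensor * rope_cache) {
    *Qcur = ggml_rope_fast(ctx, *Qcur, rope_cache);   // reads cached (cos, sin)
    *Kcur = ggml_rope_fast(ctx, *Kcur, rope_cache);   // same cache, no angle recompute
    ggml_build_forward_expand(gf, *Qcur);
    ggml_build_forward_expand(gf, *Kcur);
}

For a model with n_layer layers this turns 2*n_layer ggml_rope_ext angle computations into a single ROPE_CACHE node whose output is read by 2*n_layer lightweight ROPE_FAST nodes; the fall-back path keeps the old behaviour available for rope types the cache does not yet support.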
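
A note on the CUDA fusions: because the Q and K ROPE_FAST nodes of a layer share the same cache tensor, fused_rope_neox_fast and fused_rope_norm_fast batch both rotations into a single launch by concatenating the two element ranges and letting each thread decide from its flat index which tensor it is working on. Isolated from the rotation itself, and with illustrative names, the selection step looks roughly like this:

#include <cuda_runtime.h>

struct fused_operand {
    const float * src;   // input activations
    float       * dst;   // rotated output
    int           ne1;   // number of heads for this tensor
    int           s01;   // row (head) stride, in floats
    int           s02;   // token stride, in floats
};

// Given a flat even element index i over [Q elements | K elements], pick the
// operand it belongs to and rebase i into that operand's own range. Mirrors the
// branch at the top of the fused kernels in rope.cu.
__device__ static fused_operand select_operand(int & i, int nelem_q,
                                               const fused_operand & q,
                                               const fused_operand & k) {
    if (i < nelem_q) return q;
    i -= nelem_q;
    return k;
}

The fused_rms_rope_neox_fast kernel extends the same idea one step further: each block also computes the per-row RMS normalization (with the usual warp reduction) before applying the cached rotation, which is what the FUSED_RMS_NORM + ROPE_FAST dispatch cases in ggml-cuda.cu feed into.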