diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 0c0ef048..90391d01 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -637,7 +637,7 @@ ggml_tensor * llm_build_context::llm_build_ffn( ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il, ggml_cgraph * graph) { + const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) { if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s && up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR && @@ -661,11 +661,11 @@ ggml_tensor * llm_build_context::llm_build_ffn( if (ffn_norm && ffn_norm->extra) { auto norm = (ggml_split_tensor_t *)ffn_norm->extra; GGML_ASSERT(norm->splits[id]); - cur = llm_build_norm(ctx, input, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il); + cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il); cb(cur, "ffn_inp_normed", il_cb); } - else if (input->type != GGML_TYPE_F32) { - cur = ggml_cast(ctx, input, GGML_TYPE_F32); + else if (cur->type != GGML_TYPE_F32) { + cur = ggml_cast(ctx, cur, GGML_TYPE_F32); } cur = ggml_fused_up_gate(ctx, split_u, split_g, cur, unary_op); cb(cur, "ffn_up_gate", il_cb); @@ -683,6 +683,10 @@ ggml_tensor * llm_build_context::llm_build_ffn( } ffn.push_back(cur); } + if (add_input) { + ffn.back() = ggml_add(ctx, ffn.back(), input); + cb(ffn.back(), "ffn_with_inp", il); + } if (ffn.size() == 1) return ffn.front(); auto cur = ggml_add(ctx, ffn[0], ffn[1]); cb(cur, "combine_ffn", il); @@ -849,6 +853,11 @@ ggml_tensor * llm_build_context::llm_build_ffn( cb(cur, "ffn_down_s", il); } + if (add_input) { + cur = ggml_add(ctx, cur, input); + cb(cur, "ffn_out_with_inp", il); + } + return cur; } @@ -868,7 +877,9 @@ ggml_tensor * llm_build_context::llm_build_moe_ffn( bool scale_w, float w_scale, llm_expert_gating_func_type gating_op, - const llm_build_cb & cb, int il, ggml_cgraph * 
graph) { + const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) { + + auto input = cur; int64_t n_embd = cur->ne[0]; int64_t n_tokens = cur->ne[1]; @@ -1040,20 +1051,30 @@ llm_expert_gating_func_type gating_op, if (lctx.cparams.fused_mmad) { experts = ggml_mul_multi_add(ctx, experts, weights); cb(experts, "ffn_moe_weighted", il); + if (add_input) { + experts = ggml_add(ctx, experts, input); + cb(experts, "ffn_out_with_inp", il); + } return experts; } experts = ggml_mul(ctx, experts, weights); cb(experts, "ffn_moe_weighted", il); } + ggml_tensor * result; if (n_expert_used == 1) { - return ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0)); + result = ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0)); } if (n_expert_used == 2) { - return ggml_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), + result = ggml_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], experts->nb[1])); } - return ggml_multi_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), n_expert_used); + if (n_expert_used > 2) result = ggml_multi_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), n_expert_used); + if (add_input) { + cb(result, "ffn_out", il); + result = ggml_add(ctx, result, input); + } + return result; } @@ -1076,7 +1097,7 @@ ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama float w_scale, llm_expert_gating_func_type gating_op, llm_ffn_op_type type_op_shexp, - const llm_build_cb & cb, int il, ggml_cgraph * graph) { + const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) { auto split_up_exps = (ggml_split_tensor_t *)up_exps->extra; auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra; @@ -1110,7 +1131,7 @@ llm_expert_gating_func_type gating_op, the_exp_probs_b, n_expert, n_expert_used, type_op, norm_w, scale_w, w_scale, - 
gating_op, cb, il, graph, add_input); cb(routed_out, "routed_out", il); ggml_build_forward_expand(graph, routed_out); @@ -1206,7 +1227,7 @@ llm_expert_gating_func_type gating_op, split_exp_probs_b ? split_exp_probs_b->splits[id] : nullptr, n_expert, n_expert_used, type_op, norm_w, scale_w, w_scale, - gating_op, cb, il, graph); + gating_op, cb, il, graph, add_input); cb(routed_out, "routed_out", il_cb); if (split_up_shexp) { @@ -1754,7 +1775,7 @@ ggml_cgraph * llm_build_context::build_llama() { // self-attention if (use_rope) { cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, - this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il); + this_KQ_mask, nullptr, nullptr, kq_scale, hparams.f_attention_scale, this_n_swa, il, true, false, true); } else { @@ -1807,9 +1828,11 @@ struct ggml_tensor * inp_out_ids = build_inp_out_ids(); n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); cb(cur, "last_attn", il); - cb(inpSA, "last_ffn_inp", il); + if (!use_rope) { + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + cb(inpSA, "last_ffn_inp", il); + } } // For Granite architecture @@ -1818,8 +1841,13 @@ cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + ggml_tensor * ffn_inp; + if (use_rope) { + ffn_inp = cur; + } else { + ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + } // feed-forward network if (model.layers[il].ffn_gate_inp == nullptr) { @@ -1829,7 +1857,7 @@ model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, 
cb, il, gf, true); cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_LLAMA4) { // llama4 MoE @@ -1846,7 +1874,7 @@ ggml_cgraph * llm_build_context::build_llama() { LLM_FFN_SILU, false, false, 0.0, LLM_EXPERT_GATING_FUNC_SIGMOID, - cb, il, gf); + cb, il, gf, true); // Shared experts ggml_tensor * shexp_out = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp_normed, @@ -1875,7 +1903,7 @@ ggml_cgraph * llm_build_context::build_llama() { LLM_FFN_SILU, true, false, 0.0, LLM_EXPERT_GATING_FUNC_SOFTMAX, - cb, il, gf); + cb, il, gf, true); cb(cur, "ffn_moe_out", il); } @@ -1885,8 +1913,8 @@ ggml_cgraph * llm_build_context::build_llama() { cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "ffn_out", il); + //cur = ggml_add(ctx0, cur, ffn_inp); + //cb(cur, "ffn_out", il); cur = lctx.cvec.apply_to(ctx0, cur, il); cb(cur, "l_out", il); @@ -9312,7 +9340,7 @@ ggml_cgraph * llm_build_context::llama_build_graph( ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * the_attn_norm, ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in, ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale, - int n_swa, int il, bool do_rope, bool add_graph_split) { + int n_swa, int il, bool do_rope, bool add_graph_split, bool add_input) { if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn && model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) { if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) { @@ -9489,6 +9517,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens attn.push_back(cur); } GGML_ASSERT(!attn.empty()); + if (add_input) { + attn.back() = ggml_add(ctx0, attn.back(), input); + cb(attn.back(), "attn_out_with_input", il); + } if (attn.size() == 1) return attn.front(); //if (attn.size() > 2 && attn.size()%2 == 
0) { // for (int id = 0; id < int(attn.size()/2); ++id) { @@ -9515,6 +9547,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens if (attn.size() > 2) { cur->op_params[0] = 0xff; } + //if (add_input) { + // cur = ggml_add(ctx0, cur, input); + // cb(cur, "combine_attn_inp", il); + //} return cur; } } @@ -9549,5 +9585,10 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, KQ_scale, cb, il, sinks, n_swa); + if (add_input) { + cb(cur, "attn_out", il); + cur = ggml_add(ctx0, cur, input); + } + return cur; } diff --git a/src/llama-build-context.h b/src/llama-build-context.h index 347b177a..d8350828 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ -335,7 +335,7 @@ struct llm_build_context { ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr); + const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false); static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx, ggml_tensor * cur, @@ -351,7 +351,7 @@ struct llm_build_context { bool scale_w, float w_scale, llm_expert_gating_func_type gating_op, - const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr); + const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false); static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx, ggml_tensor * cur, @@ -367,7 +367,7 @@ llm_expert_gating_func_type gating_op, bool scale_w, float w_scale, llm_expert_gating_func_type gating_op, - const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr) { + const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false) { return llm_build_moe_ffn(ctx, lctx, cur, gate_inp, nullptr, up_exps, nullptr, @@ -376,7 +376,7 @@ llm_expert_gating_func_type 
gating_op, exp_probs_b, n_expert, n_expert_used, type_op, norm_w, scale_w, w_scale, - gating_op, cb, il, graph); + gating_op, cb, il, graph, add_input); } static ggml_tensor * llm_build_std_moe_ffn(ggml_context * ctx, llama_context & lctx, @@ -398,7 +398,7 @@ llm_expert_gating_func_type gating_op, float w_scale, llm_expert_gating_func_type gating_op, llm_ffn_op_type type_op_shexp, - const llm_build_cb & cb, int il, ggml_cgraph * graph); + const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input = false); static ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids); @@ -410,6 +410,6 @@ llm_expert_gating_func_type gating_op, ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * attn_norm, ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * rope_factors, ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale, - int n_swa, int il, bool do_rope = true, bool add_graph_split = false); + int n_swa, int il, bool do_rope = true, bool add_graph_split = false, bool add_input = false); };