From bbb1b1da6c0fc2a4f2cb482d401c2b3b15d030ba Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sun, 30 Nov 2025 07:51:34 +0000
Subject: [PATCH] Slightly better

---
 src/llama-build-context.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index da9755cc..3f397a19 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1130,7 +1130,6 @@ llm_expert_gating_func_type gating_op,
                 if (shared_out->ne[1] > 32) {
                     shared_out = ggml_cast(ctx, shared_out, GGML_TYPE_F16);
                 }
-                ggml_build_forward_expand(graph, shared_out);
                 results[id] = shared_out;
             }
             cur = ggml_add(ctx, results[0], results[1]);
@@ -1140,10 +1139,12 @@ llm_expert_gating_func_type gating_op,
                 cur = ggml_add(ctx, cur, results[id]);
                 cb(cur, "ffn_shared_combined", il);
             }
-            if (cur->type == GGML_TYPE_F16) {
-                cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
+            if (routed_out->ne[1] > 32) {
+                auto routed_out_f16 = ggml_cast(ctx, routed_out, GGML_TYPE_F16);
+                cur = ggml_add(ctx, routed_out_f16, cur);
+            } else {
+                cur = ggml_add(ctx, routed_out, cur);
             }
-            cur = ggml_add(ctx, routed_out, cur);
             cb(cur, "ffn_out", il);
         } else {
             //printf("Using non-split ffn for shared experts in layer %d\n", il);
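
The pattern applied by the second hunk, shown as a stand-alone sketch: rather than casting the F16 shared-expert sum back to F32, the routed-expert output is cast to F16 when the batch is longer than 32 tokens, so the final add runs in F16. This is a minimal sketch, not the full llm_build_context code path; the helper name combine_expert_outputs and its signature are hypothetical, while ggml_cast, ggml_add, the tensor names, and the ">32" threshold come from the patch itself.

// Minimal sketch, assuming the ggml API used elsewhere in the patch.
#include "ggml.h"

static struct ggml_tensor * combine_expert_outputs(
        struct ggml_context * ctx,
        struct ggml_tensor  * routed_out,   // routed-expert output, F32
        struct ggml_tensor  * shared_out) { // shared-expert sum, already F16 when ne[1] > 32
    if (routed_out->ne[1] > 32) {
        // Long batch: cast the routed side to F16 and add in F16, instead of
        // casting the shared side back to F32 as the old code did.
        struct ggml_tensor * routed_f16 = ggml_cast(ctx, routed_out, GGML_TYPE_F16);
        return ggml_add(ctx, routed_f16, shared_out);
    }
    // Short batch: both tensors are still F32, add directly.
    return ggml_add(ctx, routed_out, shared_out);
}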