From bbb1b1da6c0fc2a4f2cb482d401c2b3b15d030ba Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sun, 30 Nov 2025 07:51:34 +0000
Subject: [PATCH] Slightly better

---
 src/llama-build-context.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index da9755cc..3f397a19 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1130,7 +1130,6 @@ llm_expert_gating_func_type gating_op,
                 if (shared_out->ne[1] > 32) {
                     shared_out = ggml_cast(ctx, shared_out, GGML_TYPE_F16);
                 }
-                ggml_build_forward_expand(graph, shared_out);
                 results[id] = shared_out;
             }
             cur = ggml_add(ctx, results[0], results[1]);
@@ -1140,10 +1139,12 @@ llm_expert_gating_func_type gating_op,
                 cur = ggml_add(ctx, cur, results[id]);
                 cb(cur, "ffn_shared_combined", il);
             }
-            if (cur->type == GGML_TYPE_F16) {
-                cur = ggml_cast(ctx, cur, GGML_TYPE_F32);
+            if (routed_out->ne[1] > 32) {
+                auto routed_out_f16 = ggml_cast(ctx, routed_out, GGML_TYPE_F16);
+                cur = ggml_add(ctx, routed_out_f16, cur);
+            } else {
+                cur = ggml_add(ctx, routed_out, cur);
             }
-            cur = ggml_add(ctx, routed_out, cur);
             cb(cur, "ffn_out", il);
         } else {
             //printf("Using non-split ffn for shared experts in layer %d\n", il);
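
The pattern applied by the second hunk, shown as a stand-alone sketch: rather than casting the F16 shared-expert sum back to F32, the routed-expert output is cast to F16 when the batch is longer than 32 tokens, so the final add runs in F16. This is a minimal sketch, not the full llm_build_context code path; the helper name combine_expert_outputs and its signature are hypothetical, while ggml_cast, ggml_add, the tensor names, and the ">32" threshold come from the patch itself.

// Minimal sketch, assuming the ggml API used elsewhere in the patch.
#include "ggml.h"

static struct ggml_tensor * combine_expert_outputs(
        struct ggml_context * ctx,
        struct ggml_tensor  * routed_out,   // routed-expert output, F32
        struct ggml_tensor  * shared_out) { // shared-expert sum, already F16 when ne[1] > 32
    if (routed_out->ne[1] > 32) {
        // Long batch: cast the routed side to F16 and add in F16, instead of
        // casting the shared side back to F32 as the old code did.
        struct ggml_tensor * routed_f16 = ggml_cast(ctx, routed_out, GGML_TYPE_F16);
        return ggml_add(ctx, routed_f16, shared_out);
    }
    // Short batch: both tensors are still F32, add directly.
    return ggml_add(ctx, routed_out, shared_out);
}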