From e65782de67db955c27539502e7f03e4e086358ef Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Wed, 14 Jan 2026 13:26:09 +0000
Subject: [PATCH] Fix experts/shared experts split

---
 src/llama-load-tensors.cpp | 76 +++++++++++++++++++++++++-------------
 1 file changed, 50 insertions(+), 26 deletions(-)

diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index cdbe7da1..bfcbcfc4 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -3189,25 +3189,7 @@ bool create_tensors_helper::create_tensors() {
                 }
             }
-            //bool any_ffn_split = false;
-            if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
-                bool use_split = split_tensors.find(layer.ffn_down_shexp) != split_tensors.end() &&
-                                 split_tensors.find(layer.ffn_gate_shexp) != split_tensors.end() &&
-                                 split_tensors.find(layer.ffn_up_shexp) != split_tensors.end();
-                if (use_split) {
-                    //any_ffn_split = true;
-                    int ffn_granularity = 16;
-                    if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
-                        auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
-                        if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
-                    }
-                    auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, cur_splits, mem_used);
-                    prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
-                    prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp,   layer.split_ffn_up_shexp,   split, mem_used);
-                    prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
-                }
-            }
-
+            std::vector<int> ffn_split;
             if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
                 bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() &&
                                  split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() &&
                                  split_tensors.find(layer.ffn_up_exps) != split_tensors.end();
@@ -3220,23 +3202,65 @@ bool create_tensors_helper::create_tensors() {
                         auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
                         if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
                     }
-                    auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, cur_splits, mem_used);
+                    ffn_split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, cur_splits, mem_used);
                     //printf("split(%2d):", il); for (auto & s : split) printf(" %d", s); printf("\n");
-                    prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used);
-                    prepare_split_tensors(1, ctx_split, layer.ffn_up_exps,   layer.split_ffn_up_exps,   split, mem_used);
-                    prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split, mem_used);
+                    prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, ffn_split, mem_used);
+                    prepare_split_tensors(1, ctx_split, layer.ffn_up_exps,   layer.split_ffn_up_exps,   ffn_split, mem_used);
+                    prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, ffn_split, mem_used);
                     if (layer.ffn_down_exps_b) {
-                        prepare_split_tensors(-1, ctx_split, layer.ffn_down_exps_b, layer.split_ffn_down_exps_b, split, mem_used);
+                        prepare_split_tensors(-1, ctx_split, layer.ffn_down_exps_b, layer.split_ffn_down_exps_b, ffn_split, mem_used);
                     }
                     if (layer.ffn_up_exps_b) {
-                        prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, split, mem_used);
+                        prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, ffn_split, mem_used);
                     }
                     if (layer.ffn_gate_exps_b) {
-                        prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, split, mem_used);
+                        prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, ffn_split, mem_used);
                     }
                 }
             }

+            if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
+                bool use_split = split_tensors.find(layer.ffn_down_shexp) != split_tensors.end() &&
+                                 split_tensors.find(layer.ffn_gate_shexp) != split_tensors.end() &&
+                                 split_tensors.find(layer.ffn_up_shexp) != split_tensors.end();
+                if (use_split) {
+                    int ffn_granularity = 16;
+                    if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
+                        auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
+                        if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
+                    }
+                    auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, cur_splits, mem_used);
+                    bool ok = true;
+                    if (!ffn_split.empty()) {
+                        ok = split.size() == ffn_split.size();
+                        if (ok) {
+                            for (int j = 0; j < int(ffn_split.size()); ++j) {
+                                if ((split[j] == 0 && ffn_split[j] > 0) || (split[j] > 0 && ffn_split[j] == 0)) {
+                                    ok = false; break;
+                                }
+                            }
+                        }
+                    }
+                    if (!ok) {
+                        printf("=== exp/shexp mismatch in layer %d\n", il);
+                        printf("    experts:"); for (auto& s : ffn_split) printf(" %d", s); printf("\n");
+                        printf(" sh_experts:"); for (auto& s : split    ) printf(" %d", s); printf("\n");
+                        std::vector<float> aux(ffn_split.size());
+                        float sum = 0;
+                        for (int j = 0; j < int(ffn_split.size()); ++j) {
+                            sum += ffn_split[j];
+                            aux[j] = sum;
+                        }
+                        for (auto& s : aux) s /= sum;
+                        split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, aux, mem_used);
+                        printf("        new:"); for (auto& s : split    ) printf(" %d", s); printf("\n");
+                    }
+                    prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
+                    prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp,   layer.split_ffn_up_shexp,   split, mem_used);
+                    prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
+                }
+            }
+
             if (layer.ffn_gate_inp) {
                 if (auto it = split_tensors.find(layer.ffn_gate_inp); it != split_tensors.end()) {
                     auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, cur_splits, mem_used);
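
For illustration, a minimal standalone sketch of the fallback taken in the new shexp branch above: when the shared-expert split disagrees with the routed-expert split about which devices get zero rows, the expert row counts are turned into cumulative fractions and a fresh shared-expert split is derived from them. The create_split() below is a simplified stand-in for the helper in src/llama-load-tensors.cpp (the real one also takes mem_used and handles more cases), and the row counts and granularity are made-up values.

// Standalone sketch only; create_split() here is a simplified stand-in,
// not the implementation from src/llama-load-tensors.cpp.
#include <cstdio>
#include <vector>

// Split n_rows over the devices at the cumulative fractions in `splits`,
// rounding each boundary to a multiple of `granularity` (simplified).
static std::vector<int> create_split(int n_rows, int granularity, const std::vector<float>& splits) {
    std::vector<int> result(splits.size());
    int prev = 0;
    for (size_t i = 0; i < splits.size(); ++i) {
        int boundary = int(splits[i] * n_rows / granularity + 0.5f) * granularity;
        if (boundary > n_rows) boundary = n_rows;
        if (i + 1 == splits.size()) boundary = n_rows;   // last device takes the remainder
        result[i] = boundary - prev;
        prev = boundary;
    }
    return result;
}

int main() {
    // Hypothetical per-device row counts chosen for the routed experts
    // (the middle device received 0 rows).
    std::vector<int> ffn_split = { 4096, 0, 3072 };

    // Re-derive the shared-expert split from the experts' cumulative fractions,
    // so both splits agree on which devices are skipped.
    std::vector<float> aux(ffn_split.size());
    float sum = 0;
    for (size_t j = 0; j < ffn_split.size(); ++j) {
        sum += ffn_split[j];
        aux[j] = sum;
    }
    for (auto& s : aux) s /= sum;                        // cumulative fractions in (0, 1]

    auto shexp_split = create_split(2048, 32, aux);      // hypothetical shexp rows and granularity
    printf("shexp split:"); for (auto s : shexp_split) printf(" %d", s); printf("\n");
    return 0;
}

With the values above this prints "shexp split: 1184 0 864": the middle device again gets zero rows, which is the property the consistency check in the patch enforces.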