mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Fix experts/shared experts split
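Previously the shared-experts (shexp) FFN tensors were split across devices before, and independently of, the routed-experts (exps) tensors, so the two splits could land on different sets of devices. The shexp split is now computed after the experts' split, which is kept in ffn_split; if the two splits disagree about which devices receive a non-empty shard, the shexp split is recomputed from the experts' per-device proportions (with a diagnostic printout of both splits and the corrected one).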
@@ -3189,25 +3189,7 @@ bool create_tensors_helper::create_tensors() {
             }
         }
 
-        //bool any_ffn_split = false;
-        if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
-            bool use_split = split_tensors.find(layer.ffn_down_shexp) != split_tensors.end() &&
-                             split_tensors.find(layer.ffn_gate_shexp) != split_tensors.end() &&
-                             split_tensors.find(layer.ffn_up_shexp) != split_tensors.end();
-            if (use_split) {
-                //any_ffn_split = true;
-                int ffn_granularity = 16;
-                if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
-                    auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
-                    if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
-                }
-                auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, cur_splits, mem_used);
-                prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
-                prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split, mem_used);
-                prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
-            }
-        }
-
+        std::vector<int> ffn_split;
         if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
             bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() &&
                              split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() &&
@@ -3220,23 +3202,65 @@ bool create_tensors_helper::create_tensors() {
                     auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
                     if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
                 }
-                auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, cur_splits, mem_used);
+                ffn_split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, cur_splits, mem_used);
                 //printf("split(%2d):", il); for (auto & s : split) printf(" %d", s); printf("\n");
-                prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used);
-                prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, split, mem_used);
-                prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split, mem_used);
+                prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, ffn_split, mem_used);
+                prepare_split_tensors(1, ctx_split, layer.ffn_up_exps, layer.split_ffn_up_exps, ffn_split, mem_used);
+                prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, ffn_split, mem_used);
                 if (layer.ffn_down_exps_b) {
-                    prepare_split_tensors(-1, ctx_split, layer.ffn_down_exps_b, layer.split_ffn_down_exps_b, split, mem_used);
+                    prepare_split_tensors(-1, ctx_split, layer.ffn_down_exps_b, layer.split_ffn_down_exps_b, ffn_split, mem_used);
                 }
                 if (layer.ffn_up_exps_b) {
-                    prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, split, mem_used);
+                    prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, ffn_split, mem_used);
                 }
                 if (layer.ffn_gate_exps_b) {
-                    prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, split, mem_used);
+                    prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, ffn_split, mem_used);
                 }
             }
         }
 
+        if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
+            bool use_split = split_tensors.find(layer.ffn_down_shexp) != split_tensors.end() &&
+                             split_tensors.find(layer.ffn_gate_shexp) != split_tensors.end() &&
+                             split_tensors.find(layer.ffn_up_shexp) != split_tensors.end();
+            if (use_split) {
+                int ffn_granularity = 16;
+                if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
+                    auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
+                    if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
+                }
+                auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, cur_splits, mem_used);
+                bool ok = true;
+                if (!ffn_split.empty()) {
+                    ok = split.size() == ffn_split.size();
+                    if (ok) {
+                        for (int j = 0; j < int(ffn_split.size()); ++j) {
+                            if ((split[j] == 0 && ffn_split[j] > 0) || (split[j] > 0 && ffn_split[j] == 0)) {
+                                ok = false; break;
+                            }
+                        }
+                    }
+                }
+                if (!ok) {
+                    printf("=== exp/shexp mismatch in layer %d\n", il);
+                    printf(" experts:"); for (auto& s : ffn_split) printf(" %d", s); printf("\n");
+                    printf(" sh_experts:"); for (auto& s : split ) printf(" %d", s); printf("\n");
+                    std::vector<float> aux(ffn_split.size());
+                    float sum = 0;
+                    for (int j = 0; j < int(ffn_split.size()); ++j) {
+                        sum += ffn_split[j];
+                        aux[j] = sum;
+                    }
+                    for (auto& s : aux) s /= sum;
+                    split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, aux, mem_used);
+                    printf(" new:"); for (auto& s : split ) printf(" %d", s); printf("\n");
+                }
+                prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
+                prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp, layer.split_ffn_up_shexp, split, mem_used);
+                prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
+            }
+        }
+
         if (layer.ffn_gate_inp) {
             if (auto it = split_tensors.find(layer.ffn_gate_inp); it != split_tensors.end()) {
                 auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, cur_splits, mem_used);
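Below is a minimal, self-contained sketch of the mismatch check and rebuild added by this commit. It assumes a split is a vector of per-device row counts (an entry of 0 meaning that device gets no shard), which is what the split[j] == 0 tests in the diff suggest. The create_split stand-in, the row count nrows, and the example splits are hypothetical illustrations: the real create_split in ik_llama.cpp also takes the cur_splits/mem_used state that is omitted here.

#include <cstdio>
#include <vector>

// Hypothetical stand-in for create_split(): distribute nrows over devices
// according to cumulative fractions (last entry 1.0), with each shard
// boundary rounded down to a multiple of `granularity`.
static std::vector<int> create_split(int nrows, int granularity, const std::vector<float>& cum_fractions) {
    std::vector<int> split(cum_fractions.size());
    int prev = 0;
    for (size_t j = 0; j < cum_fractions.size(); ++j) {
        int end = (j + 1 == cum_fractions.size()) ? nrows
                : int(nrows * cum_fractions[j]) / granularity * granularity;
        split[j] = end - prev;
        prev = end;
    }
    return split;
}

int main() {
    const int granularity = 16;    // raised to the quant type's block size in the real code
    const int nrows       = 14336; // hypothetical shared-expert row count (ne[0])

    // Hypothetical per-device row counts: the experts split skipped device 1,
    // but the independently computed shared-experts split did not.
    std::vector<int> ffn_split = {7168, 0,    7168}; // routed experts
    std::vector<int> split     = {4096, 6144, 4096}; // shared experts

    // The check from the commit: both splits must be zero/non-zero on the
    // same devices.
    bool ok = split.size() == ffn_split.size();
    if (ok) {
        for (size_t j = 0; j < ffn_split.size(); ++j) {
            if ((split[j] == 0 && ffn_split[j] > 0) || (split[j] > 0 && ffn_split[j] == 0)) {
                ok = false; break;
            }
        }
    }

    if (!ok) {
        // The rebuild: turn the experts split into normalized cumulative
        // fractions and re-split the shared-experts rows with them, so both
        // splits use the same devices in the same proportions.
        std::vector<float> aux(ffn_split.size());
        float sum = 0;
        for (size_t j = 0; j < ffn_split.size(); ++j) { sum += ffn_split[j]; aux[j] = sum; }
        for (auto& s : aux) s /= sum;
        split = create_split(nrows, granularity, aux);
    }

    printf("shexp split:"); for (int s : split) printf(" %d", s); printf("\n");
    return 0;
}

With these inputs the experts split leaves device 1 empty while the shared-experts split does not, so the check fails and the shexp split is rebuilt from the experts' cumulative fractions, yielding 7168 0 7168 and matching the experts' device layout.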