From e65782de67db955c27539502e7f03e4e086358ef Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Wed, 14 Jan 2026 13:26:09 +0000
Subject: [PATCH] Fix experts/shared experts split

---
 src/llama-load-tensors.cpp | 76 +++++++++++++++++++++++++-------------
 1 file changed, 50 insertions(+), 26 deletions(-)

diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index cdbe7da1..bfcbcfc4 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -3189,25 +3189,7 @@ bool create_tensors_helper::create_tensors() {
                 }
             }
-            //bool any_ffn_split = false;
-            if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
-                bool use_split = split_tensors.find(layer.ffn_down_shexp) != split_tensors.end() &&
-                                 split_tensors.find(layer.ffn_gate_shexp) != split_tensors.end() &&
-                                 split_tensors.find(layer.ffn_up_shexp) != split_tensors.end();
-                if (use_split) {
-                    //any_ffn_split = true;
-                    int ffn_granularity = 16;
-                    if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
-                        auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
-                        if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
-                    }
-                    auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, cur_splits, mem_used);
-                    prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
-                    prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp,   layer.split_ffn_up_shexp,   split, mem_used);
-                    prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
-                }
-            }
-
+            std::vector<int> ffn_split;
             if (layer.ffn_down_exps && layer.ffn_up_exps && layer.ffn_gate_exps) {
                 bool use_split = split_tensors.find(layer.ffn_down_exps) != split_tensors.end() &&
                                  split_tensors.find(layer.ffn_gate_exps) != split_tensors.end() &&
                                  split_tensors.find(layer.ffn_up_exps) != split_tensors.end();
@@ -3220,23 +3202,65 @@ bool create_tensors_helper::create_tensors() {
                         auto tt = ggml_internal_get_type_traits(layer.ffn_down_exps->type);
                         if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
                     }
-                    auto split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, cur_splits, mem_used);
+                    ffn_split = create_split(layer.ffn_down_exps->ne[0], ffn_granularity, cur_splits, mem_used);
                     //printf("split(%2d):", il); for (auto & s : split) printf(" %d", s); printf("\n");
-                    prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, split, mem_used);
-                    prepare_split_tensors(1, ctx_split, layer.ffn_up_exps,   layer.split_ffn_up_exps,   split, mem_used);
-                    prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, split, mem_used);
+                    prepare_split_tensors(0, ctx_split, layer.ffn_down_exps, layer.split_ffn_down_exps, ffn_split, mem_used);
+                    prepare_split_tensors(1, ctx_split, layer.ffn_up_exps,   layer.split_ffn_up_exps,   ffn_split, mem_used);
+                    prepare_split_tensors(1, ctx_split, layer.ffn_gate_exps, layer.split_ffn_gate_exps, ffn_split, mem_used);
                     if (layer.ffn_down_exps_b) {
-                        prepare_split_tensors(-1, ctx_split, layer.ffn_down_exps_b, layer.split_ffn_down_exps_b, split, mem_used);
+                        prepare_split_tensors(-1, ctx_split, layer.ffn_down_exps_b, layer.split_ffn_down_exps_b, ffn_split, mem_used);
                     }
                     if (layer.ffn_up_exps_b) {
-                        prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, split, mem_used);
+                        prepare_split_tensors( 0, ctx_split, layer.ffn_up_exps_b, layer.split_ffn_up_exps_b, ffn_split, mem_used);
                     }
                     if (layer.ffn_gate_exps_b) {
-                        prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, split, mem_used);
+                        prepare_split_tensors( 0, ctx_split, layer.ffn_gate_exps_b, layer.split_ffn_gate_exps_b, ffn_split, mem_used);
                     }
                 }
             }

+            if (layer.ffn_down_shexp && layer.ffn_up_shexp && layer.ffn_gate_shexp) {
+                bool use_split = split_tensors.find(layer.ffn_down_shexp) != split_tensors.end() &&
+                                 split_tensors.find(layer.ffn_gate_shexp) != split_tensors.end() &&
+                                 split_tensors.find(layer.ffn_up_shexp) != split_tensors.end();
+                if (use_split) {
+                    int ffn_granularity = 16;
+                    if (ggml_is_quantized(layer.ffn_down_shexp->type)) {
+                        auto tt = ggml_internal_get_type_traits(layer.ffn_down_shexp->type);
+                        if (tt.blck_size > ffn_granularity) ffn_granularity = tt.blck_size;
+                    }
+                    auto split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, cur_splits, mem_used);
+                    bool ok = true;
+                    if (!ffn_split.empty()) {
+                        ok = split.size() == ffn_split.size();
+                        if (ok) {
+                            for (int j = 0; j < int(ffn_split.size()); ++j) {
+                                if ((split[j] == 0 && ffn_split[j] > 0) || (split[j] > 0 && ffn_split[j] == 0)) {
+                                    ok = false; break;
+                                }
+                            }
+                        }
+                    }
+                    if (!ok) {
+                        printf("=== exp/shexp mismatch in layer %d\n", il);
+                        printf("    experts:"); for (auto& s : ffn_split) printf(" %d", s); printf("\n");
+                        printf(" sh_experts:"); for (auto& s : split    ) printf(" %d", s); printf("\n");
+                        std::vector<float> aux(ffn_split.size());
+                        float sum = 0;
+                        for (int j = 0; j < int(ffn_split.size()); ++j) {
+                            sum += ffn_split[j];
+                            aux[j] = sum;
+                        }
+                        for (auto& s : aux) s /= sum;
+                        split = create_split(layer.ffn_down_shexp->ne[0], ffn_granularity, aux, mem_used);
+                        printf("        new:"); for (auto& s : split    ) printf(" %d", s); printf("\n");
+                    }
+                    prepare_split_tensors(0, ctx_split, layer.ffn_down_shexp, layer.split_ffn_down_shexp, split, mem_used);
+                    prepare_split_tensors(1, ctx_split, layer.ffn_up_shexp,   layer.split_ffn_up_shexp,   split, mem_used);
+                    prepare_split_tensors(1, ctx_split, layer.ffn_gate_shexp, layer.split_ffn_gate_shexp, split, mem_used);
+                }
+            }
+
             if (layer.ffn_gate_inp) {
                 if (auto it = split_tensors.find(layer.ffn_gate_inp); it != split_tensors.end()) {
                     auto shared_split = create_split(ggml_nrows(layer.ffn_gate_inp), -1, cur_splits, mem_used);
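
For illustration, a minimal standalone sketch of the fallback taken in the new shexp branch above: when the shared-expert split disagrees with the routed-expert split about which devices get zero rows, the expert row counts are turned into cumulative fractions and a fresh shared-expert split is derived from them. The create_split() below is a simplified stand-in for the helper in src/llama-load-tensors.cpp (the real one also takes mem_used and handles more cases), and the row counts and granularity are made-up values.

// Standalone sketch only; create_split() here is a simplified stand-in,
// not the implementation from src/llama-load-tensors.cpp.
#include <cstdio>
#include <vector>

// Split n_rows over the devices at the cumulative fractions in `splits`,
// rounding each boundary to a multiple of `granularity` (simplified).
static std::vector<int> create_split(int n_rows, int granularity, const std::vector<float>& splits) {
    std::vector<int> result(splits.size());
    int prev = 0;
    for (size_t i = 0; i < splits.size(); ++i) {
        int boundary = int(splits[i] * n_rows / granularity + 0.5f) * granularity;
        if (boundary > n_rows) boundary = n_rows;
        if (i + 1 == splits.size()) boundary = n_rows;   // last device takes the remainder
        result[i] = boundary - prev;
        prev = boundary;
    }
    return result;
}

int main() {
    // Hypothetical per-device row counts chosen for the routed experts
    // (the middle device received 0 rows).
    std::vector<int> ffn_split = { 4096, 0, 3072 };

    // Re-derive the shared-expert split from the experts' cumulative fractions,
    // so both splits agree on which devices are skipped.
    std::vector<float> aux(ffn_split.size());
    float sum = 0;
    for (size_t j = 0; j < ffn_split.size(); ++j) {
        sum += ffn_split[j];
        aux[j] = sum;
    }
    for (auto& s : aux) s /= sum;                        // cumulative fractions in (0, 1]

    auto shexp_split = create_split(2048, 32, aux);      // hypothetical shexp rows and granularity
    printf("shexp split:"); for (auto s : shexp_split) printf(" %d", s); printf("\n");
    return 0;
}

With the values above this prints "shexp split: 1184 0 864": the middle device again gets zero rows, which is the property the consistency check in the patch enforces.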