From bfbfac0f1be7c8f359d49bd5c3043801bc8661c4 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Sun, 30 Nov 2025 06:38:18 +0000 Subject: [PATCH] This works but is slow Still does not work for row-interleaved quants --- src/llama-build-context.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 245dbe55..c0c718a0 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -1097,7 +1097,7 @@ llm_expert_gating_func_type gating_op, auto the_gate_inp = gate_inp->extra ? ((ggml_split_tensor_t *)gate_inp->extra)->splits[lctx.model.main_gpu] : gate_inp; auto the_gate_inp_b = gate_inp_b ? gate_inp_b->extra ? ((ggml_split_tensor_t *)gate_inp_b->extra)->splits[lctx.model.main_gpu] : gate_inp_b : nullptr; auto the_exp_probs_b = exp_probs_b ? exp_probs_b->extra ? ((ggml_split_tensor_t *)exp_probs_b->extra)->splits[lctx.model.main_gpu] : exp_probs_b : nullptr; - //printf("Using non-split llm_build_moe_ffn for layer %d\n", il); + //int n_before = graph->n_nodes; auto routed_out = llm_build_moe_ffn(ctx, lctx, cur, the_gate_inp, the_gate_inp_b, up_exps, up_exps_b, @@ -1108,6 +1108,8 @@ llm_expert_gating_func_type gating_op, type_op, norm_w, scale_w, w_scale, gating_op, cb, il, graph); cb(routed_out, "routed_out", il); + ggml_build_forward_expand(graph, routed_out); + //printf("Using non-split llm_build_moe_ffn for layer %d. n_before = %d, n_now = %d\n", il, n_before, graph->n_nodes); if (up_shexp && gate_shexp && down_shexp) { if (split_up_shexp) { @@ -1127,9 +1129,10 @@ llm_expert_gating_func_type gating_op, if (shared_out->ne[1] > 32) { shared_out = ggml_cast(ctx, shared_out, GGML_TYPE_F16); } + ggml_build_forward_expand(graph, shared_out); results[id] = shared_out; } - auto cur = ggml_add(ctx, results[0], results[1]); + cur = ggml_add(ctx, results[0], results[1]); cur->op_params[0] = 0xff; cb(cur, "ffn_shared_combined", il); for (int id = 2; id < int(results.size()); ++id) { @@ -1155,6 +1158,7 @@ llm_expert_gating_func_type gating_op, } else { cur = routed_out; } + ggml_build_forward_expand(graph, routed_out); return cur; } GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps);