From 4303587f1c8d383a167ba0d8b1ec4fc2ea64f66b Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Wed, 26 Nov 2025 12:11:27 +0000 Subject: [PATCH] WIP --- src/llama-build-context.cpp | 8 ++++++-- src/llama-build-context.h | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index f89d4a2f..59db6f75 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -634,7 +634,7 @@ ggml_tensor * llm_build_context::llm_build_ffn( ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il) { + const llm_build_cb & cb, int il, ggml_cgraph * graph) { if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s && up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR && @@ -662,6 +662,9 @@ ggml_tensor * llm_build_context::llm_build_ffn( // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } + if (graph) { + ggml_build_forward_expand(graph, cur); + } ffn.push_back(cur); } if (ffn.size() == 1) return ffn.front(); @@ -1526,7 +1529,7 @@ ggml_cgraph * llm_build_context::build_llama() { model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, - LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf); cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_LLAMA4) { // llama4 MoE @@ -9152,6 +9155,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } cb(cur, "kqv_wo", il_cb); + ggml_build_forward_expand(gf, cur); // TODO: wo_b attn.push_back(cur); } diff --git a/src/llama-build-context.h b/src/llama-build-context.h index 271dd689..543ce9ca 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ -331,7 +331,7 @@ struct llm_build_context { ggml_tensor * act_scales, llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, - const llm_build_cb & cb, int il); + const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr); static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx, ggml_tensor * cur,