diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index a1c5027d..21e60b66 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1237,7 +1237,8 @@ ggml_tensor * llm_build_context::llm_build_std_moe_ffn(ggml_context * ctx, llama
                       llm_expert_gating_func_type gating_op,
                       llm_ffn_op_type type_op_shexp,
                       const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input,
-                      ggml_tensor * up_gate_exps, ggml_tensor * up_gate_exps_b) {
+                      ggml_tensor * up_gate_exps, ggml_tensor * up_gate_exps_b,
+                      ggml_tensor * shexp_gate) {
 
     auto split_up_exps   = (ggml_split_tensor_t *)up_exps->extra;
     auto split_gate_exps = (ggml_split_tensor_t *)gate_exps->extra;
@@ -1327,6 +1328,18 @@ llm_expert_gating_func_type gating_op,
                     down_shexp, down_b_shexp, nullptr, nullptr, type_op_shexp, LLM_FFN_PAR, cb, il);
             cb(shared_out, "ffn_shexp_out", il);
+            if (shexp_gate) {
+                auto shared_gate = llm_build_lora_mm(lctx, ctx, shexp_gate, cur);
+                cb(shared_gate, "shared_expert_gate", il);
+                if (shared_gate->ne[1] == 1) {
+                    shared_out = ggml_fused_mul_unary(ctx, shared_gate, shared_out, GGML_UNARY_OP_SIGMOID);
+                } else {
+                    shared_gate = ggml_sigmoid(ctx, shared_gate);
+                    cb(shared_gate, "shared_expert_gate_sigmoid", il);
+                    shared_out = ggml_mul(ctx, shared_out, shared_gate);
+                }
+                cb(shared_out, "ffn_shexp_gated", il);
+            }
             cur = ggml_add(ctx, routed_out, shared_out);
             cb(cur, "ffn_out", il);
         }
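
The hunk above gates the shared expert with a sigmoid of a learned gate projection before the routed/shared sum, i.e. ffn_out = routed_out + sigmoid(shexp_gate . x) * shared_out, and fuses the sigmoid and multiply into a single op when the graph holds one token (shared_gate->ne[1] == 1). Below is a minimal scalar sketch of that arithmetic; plain floats stand in for ggml tensors, the values are made up, and the single-gate-logit-per-token shape is an assumption matching the broadcast ggml_mul in the hunk:

// Scalar sketch of the gated shared expert: one gate logit per token,
// broadcast-multiplied over the shared expert's output channels.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> routed_out = {0.10f, -0.20f,  0.30f}; // routed experts, one token
    std::vector<float> shared_out = {0.50f,  0.40f, -0.10f}; // shared expert, same token
    float gate_logit = 0.25f; // stands in for llm_build_lora_mm(shexp_gate, cur)

    // the sigmoid half of ggml_sigmoid / ggml_fused_mul_unary
    float g = 1.0f / (1.0f + std::exp(-gate_logit));

    for (size_t i = 0; i < routed_out.size(); ++i) {
        // the broadcast multiply plus the final ggml_add(routed_out, shared_out)
        float ffn_out = routed_out[i] + g * shared_out[i];
        printf("ffn_out[%zu] = %.4f\n", i, ffn_out);
    }
    return 0;
}
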
"ffn_out", il); - return cur; - }; - ggml_tensor * inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = n_tokens > 1 ? build_inp_out_ids() : nullptr; @@ -4487,12 +4438,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() { if (hparams.is_recurrent(il)) { cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb); } else { - GGML_ASSERT(model.layers[il].wq != nullptr); - GGML_ASSERT(model.layers[il].wk != nullptr); - GGML_ASSERT(model.layers[il].wv != nullptr); - GGML_ASSERT(model.layers[il].wo != nullptr); - GGML_ASSERT(model.layers[il].attn_q_norm != nullptr); - GGML_ASSERT(model.layers[il].attn_k_norm != nullptr); cur = build_layer_attn(cur, inp_pos, KQ_mask, il); } @@ -4504,24 +4449,38 @@ ggml_cgraph * llm_build_context::build_qwen3next() { cur = ggml_add(ctx0, cur, inpSA); cb(cur, "attn_residual", il); - ggml_tensor * ffn_residual = cur; - ggml_tensor * attn_post_norm = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, cb, il); - cb(attn_post_norm, "attn_post_norm", il); + if (!model.layers[il].ffn_gate_inp) { + // dense FFN + cur = llm_build_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf, true); + cb(cur, "ffn_out", il); + } else { + cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, cur, + model.layers[il].ffn_gate_inp, nullptr, + model.layers[il].ffn_up_exps, nullptr, + model.layers[il].ffn_gate_exps, nullptr, + model.layers[il].ffn_down_exps, nullptr, + nullptr, + model.layers[il].ffn_up_shexp, nullptr, // we don't have shared expert biases? 
diff --git a/src/llama-build-context.h b/src/llama-build-context.h
index 1aa4aa60..67ce81e5 100644
--- a/src/llama-build-context.h
+++ b/src/llama-build-context.h
@@ -410,7 +410,8 @@ llm_expert_gating_func_type gating_op,
                       llm_expert_gating_func_type gating_op,
                       llm_ffn_op_type type_op_shexp,
                       const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input = false,
-                      ggml_tensor * up_gate_exps = nullptr, ggml_tensor * up_gate_exps_b = nullptr);
+                      ggml_tensor * up_gate_exps = nullptr, ggml_tensor * up_gate_exps_b = nullptr,
+                      ggml_tensor * shexp_gate = nullptr);
 
     static ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids);
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index 8703cca8..8b8151e0 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -1328,6 +1328,7 @@ bool create_tensors_helper::create_qwen3next_tensors(const LLM_TN & tn) {
         layer.attn_norm      = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
         layer.attn_post_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+        layer.ffn_norm = layer.attn_post_norm;
 
         if (!hparams.is_recurrent(i)) {
             // Full-attention layer
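
The one-line loader change makes layer.ffn_norm an alias for the already-created layer.attn_post_norm, so the refactored graph code above, which passes model.layers[il].ffn_norm into the FFN builders, picks up Qwen3-Next's single post-attention norm weight without loading a second tensor. A tiny hypothetical illustration of the aliasing (the tensor name shown is illustrative):

// Two struct members pointing at one ggml_tensor: generic code that
// reads layer.ffn_norm sees the same weight the loader created as
// attn_post_norm.
#include <cstdio>

struct tensor { const char * name; };

struct layer_t {
    tensor * attn_post_norm = nullptr;
    tensor * ffn_norm       = nullptr;
};

int main() {
    tensor post_norm = {"blk.0.post_attention_norm.weight"}; // illustrative name
    layer_t layer;
    layer.attn_post_norm = &post_norm;
    layer.ffn_norm       = layer.attn_post_norm; // the alias from the hunk above
    printf("%s\n", layer.ffn_norm->name);        // same underlying weight
    return 0;
}
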