From c51968b6d81e924fc69cc76a779bb572745e8263 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 1 Dec 2025 11:56:05 +0000
Subject: [PATCH] Split mode graph for qwen3moe

---
 src/llama-build-context.cpp | 81 +++++++++++++++++++++++--------------
 src/llama-load-tensors.cpp  | 34 +++++++++-------
 src/llama-model.h           |  1 +
 src/llama.cpp               |  1 +
 4 files changed, 73 insertions(+), 44 deletions(-)

diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 28b92ea1..123fc185 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1637,6 +1637,41 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
     return cur;
 }
 
+// lm_head: optional final RMS norm followed by the output projection. If `output` is split
+// across devices (output->extra), every non-null split is normed and projected on its own
+// and the partial results are concatenated along dim 0; a single split needs no concat.
+static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml_tensor * cur, ggml_tensor * output, ggml_tensor * output_norm, const llm_build_cb & cb) {
+    if (output->extra) {
+        auto split_output = (ggml_split_tensor_t *)output->extra;
+        auto split_output_norm = output_norm && output_norm->extra ? (ggml_split_tensor_t *)output_norm->extra : nullptr;
+        std::vector<ggml_tensor *> o;
+        o.reserve(split_output->n_device);
+        for (int id = 0; id < split_output->n_device; ++id) {
+            auto split = split_output->splits[id];
+            if (!split) continue; // no slice of the output matrix at this device index
+            if (output_norm) {
+                auto the_norm = split_output_norm ? split_output_norm->splits[id] : output_norm; // fall back to the unsplit norm weight
+                auto cur_normed = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, the_norm, NULL, LLM_NORM_RMS, cb, -1);
+                cb(cur_normed, "output_normed", 1000*(id+1));
+                o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur_normed));
+            } else {
+                o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur));
+            }
+            cb(o.back(), "output", id);
+        }
+        GGML_ASSERT(!o.empty()); // at least one split must have produced a result
+        cur = o.front();         // start from the first result; never touch o[1] unless it exists
+        for (int id = 1; id < int(o.size()); ++id) cur = ggml_concat(ctx, cur, o[id], 0);
+    } else {
+        if (output_norm) {
+            cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1);
+            cb(cur, "output_normed", -1);
+        }
+        cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur);
+    }
+    return cur;
+}
+
 ggml_cgraph * llm_build_context::build_llama() {
     struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
@@ -1826,11 +1861,8 @@ ggml_cgraph * llm_build_context::build_llama() {
 
     cur = inpL;
 
-    cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
-    cb(cur, "result_norm", -1);
-
     // lm_head
-    cur = build_output(lctx, ctx0, cur, model.output, cb);
+    cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb);
 
     // For Granite architecture
     if (hparams.f_logit_scale) {
@@ -3792,22 +3824,19 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
         struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
         cb(ffn_inp, "ffn_inp", il);
 
-        // MoE branch
-        cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = llm_build_moe_ffn(ctx0, lctx, cur,
-                model.layers[il].ffn_gate_inp,
-                model.layers[il].ffn_up_exps,
-                model.layers[il].ffn_gate_exps,
-                model.layers[il].ffn_down_exps,
-                nullptr,
-                n_expert, n_expert_used,
-                LLM_FFN_SILU, true,
-                false, 0.0,
LLM_EXPERT_GATING_FUNC_SOFTMAX, - cb, il, gf); - cb(cur, "ffn_moe_out", il); + cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp, + model.layers[il].ffn_gate_inp, nullptr, + model.layers[il].ffn_up_exps, nullptr, + model.layers[il].ffn_gate_exps, nullptr, + model.layers[il].ffn_down_exps, nullptr, + model.layers[il].ffn_exp_probs_b, + nullptr, nullptr, // we don't have shared expert biases? + nullptr, nullptr, + nullptr, nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, false, 0.0f, + LLM_EXPERT_GATING_FUNC_SOFTMAX, + LLM_FFN_SILU, cb, il, gf); cur = ggml_add(ctx0, cur, ffn_inp); cur = lctx.cvec.apply_to(ctx0, cur, il); @@ -3819,11 +3848,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() { cur = inpL; - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - - // lm_head - cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -6777,12 +6802,8 @@ ggml_cgraph * llm_build_context::build_glm4_moe() { cur = inpL; - // final norm - cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); - cb(cur, "result_norm", -1); - // lm head - cur = build_output(lctx, ctx0, cur, model.output, cb); + cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb); cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 62148771..6ac5130d 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -352,11 +352,11 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std void create_tensors_helper::create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm, bool use_ctx_split) { model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), 
{n_embd, n_vocab}); - if (has_norm) { - model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - } auto out_ctx = use_ctx_split ? ctx_output_split : ctx_output; - model.output = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + if (has_norm) { + model.output_norm = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + } + model.output = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { @@ -1142,11 +1142,11 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) { // output { - model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed if (model.output == NULL) { - model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } } @@ -1156,18 +1156,19 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) { auto & layer = model.layers[i]; - layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); use_mmap_buffer &= !merge_qkv(tn, i, 0); layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}); - layer.attn_k_norm = 
create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}); - layer.attn_q_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}); + layer.attn_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}); + layer.attn_q_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}); - layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + auto ffn_ctx = model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer; + layer.ffn_norm = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + layer.ffn_gate_inp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); if (n_expert == 0) { throw std::runtime_error("n_expert must be > 0 for QWEN3MOE"); @@ -1179,9 +1180,9 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) { // MoE branch const int64_t n_ff_exp = hparams.n_ff_exp ? 
hparams.n_ff_exp : n_ff / n_expert_used; - layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); - layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}); - layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + layer.ffn_gate_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + layer.ffn_down_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}); + layer.ffn_up_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); } return use_mmap_buffer; } @@ -3061,9 +3062,14 @@ bool create_tensors_helper::create_tensors() { auto ctx_split = ctx_map[model.buft_output.buft_matrix]; auto split = create_split(model.output->ne[1], 16, model.splits); prepare_split_tensors(1, ctx_split, model.output, model.split_output, split, mem_used); + if (auto it = split_tensors.find(model.output_norm); it != split_tensors.end() && !ggml_backend_buft_is_host(model.buft_output.buft_matrix)) { + auto ctx_split = ctx_map[model.buft_output.buft_matrix]; + prepare_split_tensors(-1, ctx_split, model.output_norm, model.split_output_norm, split, mem_used); + } } } } + LLAMA_LOG_INFO("Estimated model buffer size per device:\n"); for (int i = 0; i < int(mem_used.size()); ++i) { LLAMA_LOG_INFO(" Device %d: %8.2f MiB\n", i, mem_used[i]/1024./1024.); diff --git a/src/llama-model.h b/src/llama-model.h index bb7134cc..d6188721 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -344,6 +344,7 @@ struct llama_model { struct ggml_tensor * output_norm_enc; llama_split_tensor split_output; + llama_split_tensor split_output_norm; std::vector layers; diff --git a/src/llama.cpp b/src/llama.cpp index c1ffd5ac..6ea2b4cf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ 
-1726,6 +1726,7 @@ static void ggml_backend_add_from_device(llama_context* ctx, ggml_backend_t back static bool is_model_split_supported(const llama_model & model) { static std::unordered_set k_supported = { LLM_ARCH_LLAMA, + LLM_ARCH_QWEN3MOE, LLM_ARCH_GLM4_MOE, }; auto it = k_supported.find(model.arch);