Split mode graph for qwen3moe

2026-05-11 08:30:19 +00:00 · 2025-12-01 11:56:05 +00:00
parent 63d0389e18
commit c51968b6d8
4 changed files with 73 additions and 44 deletions
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -1637,6 +1637,41 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
    return cur;
 }

+static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml_tensor * cur, ggml_tensor * output, ggml_tensor * output_norm, const llm_build_cb & cb) {
+    // lm_head: optionally RMS-normalize `cur` with `output_norm` (skipped when it is null), then apply `output` via llm_build_lora_mm; a split `output` is handled per device and the partial results are concatenated along dim 0.
+    if (output->extra) {
+        auto split_output = (ggml_split_tensor_t *)output->extra; // per-device pieces of the output matrix
+        auto split_output_norm = output_norm && output_norm->extra ? (ggml_split_tensor_t *)output_norm->extra : nullptr; // nullptr => use output_norm itself for every device
+        std::vector<ggml_tensor *> o; // one partial result per device that holds a piece
+        o.reserve(split_output->n_device);
+        for (int id = 0; id < split_output->n_device; ++id) {
+            auto split = split_output->splits[id];
+            if (!split) continue; // no piece of the output matrix for this device
+            if (output_norm) {
+                auto the_norm = split_output_norm ? split_output_norm->splits[id] : output_norm;
+                auto cur_normed = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, the_norm, NULL, LLM_NORM_RMS, cb, -1);
+                cb(cur_normed, "output_normed", 1000*(id+1));
+                o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur_normed));
+            } else {
+                o.push_back(llm_build_context::llm_build_lora_mm(lctx, ctx, split, cur));
+            }
+            cb(o.back(), "output", id);
+        }
+        GGML_ASSERT(!o.empty()); // all splits being null would leave no result to return
+        cur = o.front(); // a single piece is returned as is; o[1] must only be touched when it exists
+        for (int id = 1; id < int(o.size()); ++id) {
+            cur = ggml_concat(ctx, cur, o[id], 0);
+        }
+    } else {
+        if (output_norm) {
+            cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1);
+            cb(cur, "output_normed", -1);
+        }
+        cur = llm_build_context::llm_build_lora_mm(lctx, ctx, output, cur);
+    }
+    return cur;
+}
+
 ggml_cgraph * llm_build_context::build_llama() {
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

@@ -1826,11 +1861,8 @@ ggml_cgraph * llm_build_context::build_llama() {

    cur = inpL;

-    cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
-    cb(cur, "result_norm", -1);
-
    // lm_head
-    cur = build_output(lctx, ctx0, cur, model.output, cb);
+    cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb);

    // For Granite architecture
    if (hparams.f_logit_scale) {
@@ -3792,22 +3824,19 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
        struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
        cb(ffn_inp, "ffn_inp", il);

-        // MoE branch
-        cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = llm_build_moe_ffn(ctx0, lctx, cur,
-                    model.layers[il].ffn_gate_inp,
-                    model.layers[il].ffn_up_exps,
-                    model.layers[il].ffn_gate_exps,
-                    model.layers[il].ffn_down_exps,
-                    nullptr,
-                    n_expert, n_expert_used,
-                    LLM_FFN_SILU, true,
-                    false, 0.0,
-                    LLM_EXPERT_GATING_FUNC_SOFTMAX,
-                    cb, il, gf);
-        cb(cur, "ffn_moe_out", il);
+        cur = llm_build_std_moe_ffn(ctx0, lctx, model.layers[il].ffn_norm, ffn_inp,
+                model.layers[il].ffn_gate_inp,  nullptr,
+                model.layers[il].ffn_up_exps,   nullptr,
+                model.layers[il].ffn_gate_exps, nullptr,
+                model.layers[il].ffn_down_exps, nullptr,
+                model.layers[il].ffn_exp_probs_b,
+                nullptr,  nullptr, // we don't have shared expert biases?
+                nullptr,  nullptr,
+                nullptr,  nullptr,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, true, false, 0.0f,
+                LLM_EXPERT_GATING_FUNC_SOFTMAX,
+                LLM_FFN_SILU, cb, il, gf);

        cur = ggml_add(ctx0, cur, ffn_inp);
        cur = lctx.cvec.apply_to(ctx0, cur, il);
@@ -3819,11 +3848,7 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {

    cur = inpL;

-    cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
-    cb(cur, "result_norm", -1);
-
-    // lm_head
-    cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+    cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb);
    cb(cur, "result_output", -1);

    ggml_build_forward_expand(gf, cur);
@@ -6777,12 +6802,8 @@ ggml_cgraph * llm_build_context::build_glm4_moe() {

    cur = inpL;

-    // final norm
-    cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
-    cb(cur, "result_norm", -1);
-
    // lm head
-    cur = build_output(lctx, ctx0, cur, model.output, cb);
+    cur = build_output(lctx, ctx0, cur, model.output, model.output_norm, cb);
    cb(cur, "result_output", -1);

    ggml_build_forward_expand(gf, cur);
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -352,11 +352,11 @@ ggml_tensor * create_tensors_helper::create_tensor(ggml_context * ctx, const std
 void create_tensors_helper::create_embd_output(const LLM_TN & tn, int n_embd, int n_vocab, bool has_norm, bool use_ctx_split) {
    model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

-    if (has_norm) {
-    model.output_norm = create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-    }
    auto out_ctx = use_ctx_split ? ctx_output_split : ctx_output;
-    model.output      = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+    if (has_norm) {
+        model.output_norm = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+    }
+    model.output = create_tensor(out_ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);

    // if output is NULL, init from the input tok embed
    if (model.output == NULL) {
@@ -1142,11 +1142,11 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) {

    // output
    {
-        model.output_norm = create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+        model.output_norm = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
        model.output      = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (model.output == NULL) {
-            model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+            model.output = create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
        }
    }

@@ -1156,18 +1156,19 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) {

        auto & layer = model.layers[i];

-        layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+        layer.attn_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

        use_mmap_buffer &= !merge_qkv(tn, i, 0);

        layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

-        layer.attn_k_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k});
-        layer.attn_q_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k});
+        layer.attn_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k});
+        layer.attn_q_norm = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k});

-        layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+        auto ffn_ctx = model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer;
+        layer.ffn_norm = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

-        layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+        layer.ffn_gate_inp = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

        if (n_expert == 0) {
            throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
@@ -1179,9 +1180,9 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) {
        // MoE branch
        const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

-        layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
-        layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
-        layer.ffn_up_exps   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
+        layer.ffn_gate_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert});
+        layer.ffn_down_exps = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert});
+        layer.ffn_up_exps   = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert});
    }
    return use_mmap_buffer;
 }
@@ -3061,9 +3062,14 @@ bool create_tensors_helper::create_tensors() {
                    auto ctx_split = ctx_map[model.buft_output.buft_matrix];
                    auto split = create_split(model.output->ne[1], 16, model.splits);
                    prepare_split_tensors(1, ctx_split, model.output, model.split_output, split, mem_used);
+                    if (auto it = split_tensors.find(model.output_norm); it != split_tensors.end() && !ggml_backend_buft_is_host(model.buft_output.buft_matrix)) {
+                        auto ctx_split = ctx_map[model.buft_output.buft_matrix];
+                        prepare_split_tensors(-1, ctx_split, model.output_norm, model.split_output_norm, split, mem_used);
+                    }
                }
            }
        }
+
        LLAMA_LOG_INFO("Estimated model buffer size per device:\n");
        for (int i = 0; i < int(mem_used.size()); ++i) {
            LLAMA_LOG_INFO("    Device %d:  %8.2f MiB\n", i, mem_used[i]/1024./1024.);
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -344,6 +344,7 @@ struct llama_model {
    struct ggml_tensor * output_norm_enc;

    llama_split_tensor split_output;
+    llama_split_tensor split_output_norm;

    std::vector<llama_layer> layers;

--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1726,6 +1726,7 @@ static void ggml_backend_add_from_device(llama_context* ctx, ggml_backend_t back
 static bool is_model_split_supported(const llama_model & model) {
    static std::unordered_set<llm_arch> k_supported = {
        LLM_ARCH_LLAMA,
+        LLM_ARCH_QWEN3MOE,
        LLM_ARCH_GLM4_MOE,
    };
    auto it =  k_supported.find(model.arch);