From a2f561452988f75ab0c08070e0d080fb42e5b330 Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Tue, 9 Dec 2025 10:09:04 +0000
Subject: [PATCH] Try to split offloaded MoE up/gate up

It becomes much slower, despite the graph splits looking
OK. Not sure where it bottlenecks.
---
 ggml/src/ggml-backend.cpp   |  5 ++-
 ggml/src/ggml-cuda.cu       | 10 +++++
 src/llama-build-context.cpp | 88 +++++++++++++++++++++++++++++++++----
 3 files changed, 93 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 76d14127..b537482d 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -1633,7 +1633,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
 
             // check if we should start a new split based on the sources of the current node
             bool need_new_split = false;
-            if (node->op == GGML_OP_ADD && node->op_params[0] == 0xff) {
+            if ((node->op == GGML_OP_ADD               && node->op_params[0] == 0xff) ||
+                (node->op == GGML_OP_MOE_FUSED_UP_GATE && node->op_params[6] == 0xff) ||
+                (node->op == GGML_OP_CONCAT            && node->op_params[1] == 0xff)) {
                 need_new_split = true;
             }
             else if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
@@ -2023,6 +2025,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                         }
                         needs_sync[split_backend_id] = false;
                     }
+                    printf("Copying %s\n", input->name);
                     ggml_backend_tensor_copy(input, input_cpy);
                 }
             }
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index cd0bf889..f603b940 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -3417,6 +3417,16 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
     ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
 
+    if (!ggml_backend_is_cuda(backend_dst) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
+        return false;
+    }
+    if (!ggml_backend_is_cuda(backend_src)) {
+        if (ggml_backend_buffer_is_host(buf_src) && ggml_backend_buffer_get_usage(buf_src) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+            ggml_backend_cuda_set_tensor_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
+            return true;
+        }
+    }
+
     if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
         return false;
     }
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 7809d855..1cf9d5c5 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -851,6 +851,75 @@ ggml_tensor * llm_build_context::llm_build_ffn(
     return cur;
 }
 
+//if (!backend || !ggml_backend_supports_op(backend, cur) || !ggml_backend_offload_op(backend, cur)) {
+static ggml_tensor * build_up_gate_exps(llama_context & lctx, ggml_context * ctx, llm_ffn_op_type type_op,
+        ggml_tensor * up, ggml_tensor * gate, ggml_tensor * up_b, ggml_tensor * gate_b,
+        ggml_tensor * selected_experts, ggml_tensor * cur, const llm_build_cb & cb, int il) {
+
+    ggml_tensor * par = nullptr;
+
+    if (up_b || gate_b) {
+        par = ggml_moe_up_gate_ext(ctx, up, gate, cur, selected_experts, up_b, gate_b,
+                type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
+                type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
+        return par;
+    }
+    GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
+    par = ggml_moe_up_gate(ctx, up, gate, cur, selected_experts,
+            type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
+    return par;
+    //int n_backend = ggml_backend_sched_get_n_backends(lctx.sched);
+    //bool up_is_host = ggml_backend_buffer_is_host(up->buffer);
+    //bool gate_is_host = ggml_backend_buffer_is_host(gate->buffer);
+    //printf("%s: checking for split in layer %d with %d backends, %d, %d, %s, %s\n", __func__, il, n_backend, up_is_host, gate_is_host,
+    //        ggml_backend_buffer_name(up->buffer), ggml_backend_buffer_name(gate->buffer));
+    if (int n_backend = ggml_backend_sched_get_n_backends(lctx.sched); n_backend > 2 &&
+            ggml_backend_buffer_is_host(up->buffer) && ggml_backend_buffer_is_host(gate->buffer)) {
+        bool should_offload = true;
+        for (int b = 0; b < n_backend-1; ++b) {
+            auto backend = ggml_backend_sched_get_backend(lctx.sched, b);
+            if (!backend || !ggml_backend_offload_op(backend, par)) {
+                //printf("  Backend %d (%s) says no to offload\n", b, ggml_backend_name(backend));
+                should_offload = false; break;
+            }
+        }
+        if (should_offload) {
+            //printf("Selected experts: %s, %ld x %ld x %ld x %ld\n", ggml_type_name(selected_experts->type), selected_experts->ne[0], selected_experts->ne[1], selected_experts->ne[2], selected_experts->ne[3]);
+            auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU;
+            GGML_ASSERT(up->ne[1] % 16 == 0);
+            auto nrows16 = up->ne[1]/16;
+            auto nrows16_per_backend = (nrows16 + n_backend-2) / (n_backend-1);
+            nrows16_per_backend *= 16;
+            std::vector<ggml_tensor *> up_gate; up_gate.reserve(n_backend-1);
+            for (int b = 0; b < n_backend-1; ++b) {
+                auto first = nrows16_per_backend*b;
+                auto last  = std::min(up->ne[1], first + nrows16_per_backend);
+                if (last > first) {
+                    auto up_view   = ggml_view_3d(ctx, up,   up->ne[0],   last-first, up->ne[2],   up->nb[1],   up->nb[2],   first*up->nb[1]);
+                    auto gate_view = ggml_view_3d(ctx, gate, gate->ne[0], last-first, gate->ne[2], gate->nb[1], gate->nb[2], first*gate->nb[1]);
+                    auto backend = ggml_backend_sched_get_backend(lctx.sched, b);
+                    //printf("Adding up_gate %ld...%ld on backend %s\n", first, last, ggml_backend_name(backend));
+                    up_gate.push_back(ggml_moe_up_gate(ctx, up_view, gate_view, cur, selected_experts, unary_op));
+                    ggml_backend_sched_set_tensor_backend(lctx.sched, up_gate.back(), backend);
+                    up_gate.back()->op_params[6] = 0xff;
+                    int il_b = 1000*(b+1) + il;
+                    cb(up_gate.back(), "ffn_up_gate", il_b);
+                }
+            }
+            //printf("Split up_gate into %d parts\n", int(up_gate.size()));
+            GGML_ASSERT(up_gate.size() >= 2);
+            par = ggml_concat(ctx, up_gate[0], up_gate[1], 0);
+            par->op_params[1] = 0xff;
+            cb(par, "ffn_up_gate_concat", il);
+            for (int b = 2; b < int(up_gate.size()); ++b) {
+                par = ggml_concat(ctx, par, up_gate[b], 0);
+                cb(par, "ffn_up_gate_concat", il);
+            }
+        }
+    }
+    return par;
+}
+
 ggml_tensor * llm_build_context::llm_build_moe_ffn(
         ggml_context * ctx,
        llama_context & lctx,
@@ -981,15 +1050,16 @@ llm_expert_gating_func_type   gating_op,
 
     ggml_tensor * par;
     if (can_use_fmoe && lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
-        if (up_exps_b || gate_exps_b) {
-            par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b,
-                    type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
-                    type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
-        } else {
-            GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
-            par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts,
-                    type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
-        }
+        par = build_up_gate_exps(lctx, ctx, type_op, up_exps, gate_exps, up_exps_b, gate_exps_b, selected_experts, cur, cb, il);
+        //if (up_exps_b || gate_exps_b) {
+        //    par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b,
+        //            type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
+        //            type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
+        //} else {
+        //    GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
+        //    par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts,
+        //            type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
+        //}
     } else {
         ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
         cb(up, "ffn_moe_up", il);