From de1614e7532e0710d9207dd4d3f7f67373941b05 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Wed, 3 Dec 2025 04:44:23 +0000 Subject: [PATCH] Fix typo, add additional asserts --- src/llama-build-context.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index d8cb6237..fb7d407a 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -653,11 +653,12 @@ ggml_tensor * llm_build_context::llm_build_ffn( auto split_u = u->splits[id]; auto split_g = g->splits[id]; auto split_d = d->splits[id]; - GGML_ASSERT((!split_u && !split_g && split_d) || (split_u && split_g && split_d)); + GGML_ASSERT((!split_u && !split_g && !split_d) || (split_u && split_g && split_d)); if (!split_u) continue; auto cur = input; if (ffn_norm && ffn_norm->extra) { auto norm = (ggml_split_tensor_t *)ffn_norm->extra; + GGML_ASSERT(norm->splits[id]); cur = llm_build_norm(ctx, input, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il); cb(cur, "ffn_inp_normed", il_cb); } @@ -1088,6 +1089,7 @@ llm_expert_gating_func_type gating_op, auto cur = input; if (ffn_norm) { auto the_ffn_norm = ffn_norm->extra ? 
((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm; + GGML_ASSERT(the_ffn_norm); cur = llm_build_norm(ctx, input, lctx.model.hparams, the_ffn_norm, nullptr, LLM_NORM_RMS, cb, il); cb(cur, "ffn_inp_normed", il); } @@ -1172,7 +1174,7 @@ llm_expert_gating_func_type gating_op, } GGML_ASSERT(split_up_exps && split_gate_exps && split_down_exps); GGML_ASSERT(split_up_exps->n_device == split_gate_exps->n_device && split_up_exps->n_device == split_down_exps->n_device); - std::vector results(split_up_exps->n_device); + std::vector results; results.reserve(split_up_exps->n_device); GGML_ASSERT((!split_up_shexp && !split_gate_shexp && !split_down_shexp) || ( split_up_shexp && split_gate_shexp && split_down_shexp)); auto split_gate_inp = (ggml_split_tensor_t *)gate_inp->extra; @@ -1180,6 +1182,9 @@ llm_expert_gating_func_type gating_op, auto split_exp_probs_b = exp_probs_b ? (ggml_split_tensor_t *)exp_probs_b->extra : nullptr; GGML_ASSERT(!split_exp_probs_b || split_exp_probs_b->n_device == split_up_exps->n_device); for (int id = 0; id < split_up_exps->n_device; ++id) { + GGML_ASSERT((split_up_exps->splits[id] && split_gate_exps->splits[id] && split_down_exps->splits[id]) || + (!split_up_exps->splits[id] && !split_gate_exps->splits[id] && !split_down_exps->splits[id])); + if (!split_up_exps->splits[id]) continue; int il_cb = 1000*(id + 1) + il; auto cur = input; if (ffn_norm) { @@ -1222,8 +1227,9 @@ llm_expert_gating_func_type gating_op, cur = ggml_cast(ctx, cur, GGML_TYPE_F16); cb(cur, "ffn_out_f16", il_cb); } - results[id] = cur; + results.push_back(cur); } + GGML_ASSERT(!results.empty()); if (results.size() == 1) return results.front(); auto cur = ggml_add(ctx, results[0], results[1]); @@ -9359,6 +9365,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens ggml_build_forward_expand(gf, cur); attn.push_back(cur); } + GGML_ASSERT(!attn.empty()); if (attn.size() == 1) return attn.front(); auto cur = ggml_add(ctx0, 
attn[0], attn[1]); cb(cur, "combine_attn", il); @@ -9367,10 +9374,6 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens cur = ggml_add(ctx0, cur, attn[id]); cb(cur, "combine_attn", il); } - // TODO: for more than 2 GPUs, do we need to add another forced graph split? - //if (attn.size() > 2) { - // cur->op_params[0] = 0xff; - //} return cur; } }