Fix graph parallel when ngl < n_layers (#1241)

* Fix graph parallel when ngl < n_layers

* Fix using ffn_norm

When using graph parallel with ngl < n_layers, the ffn_norm tensor
may have ended up being split, while the ffn tensors are on the CPU.
In that case we will get a crash because we attempt to use the not-split
buffer of ffn_norm, which is invalid. This commit fixes that.

* Cleanup
This commit is contained in:
Kawrakow
2026-02-06 11:48:24 +02:00
committed by GitHub
parent 4d86907b18
commit c5d74f66e2
2 changed files with 68 additions and 62 deletions

View File

@@ -744,8 +744,12 @@ ggml_tensor * llm_build_context::llm_build_ffn(
}
auto cur = input;
//if (input->op == GGML_OP_REDUCE) {
// if (input->src[lctx.model.main_gpu]) cur = input->src[lctx.model.main_gpu];
//}
if (ffn_norm) {
cur = llm_build_norm(ctx, cur, lctx.model.hparams, ffn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
auto the_ffn_norm = ffn_norm->extra ? ((ggml_split_tensor_t *)ffn_norm->extra)->splits[lctx.model.main_gpu] : ffn_norm;
cur = llm_build_norm(ctx, cur, lctx.model.hparams, the_ffn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
cb(input, "ffn_norm", il);
}
if (cur->type != GGML_TYPE_F32) {