Don't split the output tensor (#1038)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-03-09 05:20:01 +00:00 · 2025-12-05 15:56:53 +01:00
parent 9264abfbaf
commit 2125f68636
1 changed file with 2 additions and 2 deletions
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -388,7 +388,7 @@ void create_tensors_helper::create_std_ffn(int i, const LLM_TN & tn, llama_layer

 bool create_tensors_helper::create_llama_tensors(const LLM_TN & tn) {
    LOADING_PRELUDE
-    create_embd_output(tn, n_embd, n_vocab, true, true);
+    create_embd_output(tn, n_embd, n_vocab, true, false); //true);

    for (int i = 0; i < n_layer; ++i) {
        ggml_context * ctx_layer = ctx_for_layer(i);
@@ -1843,7 +1843,7 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
    GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
    GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");

-    create_embd_output(tn, n_embd, n_vocab, true, true);
+    create_embd_output(tn, n_embd, n_vocab, true, false); //true);

    for (int i = 0; i < n_layer; ++i) {
        ggml_context * ctx_layer = ctx_for_layer(i);