Remove the redundant V reshaping, disable BOS by default for the dots1 architecture, and fix warmup to handle models without a BOS token

This commit is contained in:
Saood Karim
2025-07-09 12:26:20 -05:00
parent b5bc8dcde7
commit 692dc0d9b5

View File

@@ -6177,7 +6177,12 @@ static void llm_load_vocab(
}
// default special tokens
vocab.special_bos_id = 11;
if(model.arch == LLM_ARCH_DOTS1) {
vocab.special_bos_id = -1;
}
else {
vocab.special_bos_id = 11;
}
vocab.special_eos_id = 11;
vocab.special_unk_id = -1;
vocab.special_sep_id = -1;
@@ -17009,7 +17014,7 @@ struct llm_build_context {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
//Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur_normed", il);
@@ -17031,7 +17036,7 @@ struct llm_build_context {
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
//cb(Vcur, "Vcur", il);
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
@@ -17217,7 +17222,7 @@ static struct ggml_cgraph * llama_build_graph(
const llama_vocab * vocab = llama_get_vocab(&lctx);
llama_token bos = llama_token_bos_impl(*vocab);
llama_token eos = llama_token_eos_impl(*vocab);
bool is_warming_up = (batch.n_tokens == 1 && batch.token[0] == bos);
bool is_warming_up = (batch.n_tokens == 1 && (batch.token[0] == ((bos != -1) ? bos : eos)));
struct llm_build_context llm(lctx, batch, cb, worst_case, is_warming_up);
llm.init();