WIP: Cohere2

This commit is contained in:
Iwan Kawrakow
2025-12-22 09:57:38 +00:00
parent d50ef0165e
commit aa3f14b963
3 changed files with 69 additions and 42 deletions

View File

@@ -1414,7 +1414,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         // do not overwrite user assignments
         if (*leaf_backend_id == -1) {
             *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
-            //printf("Pass 1: assigned backend %d to leaf %d, %s\n", *leaf_backend_id, i, graph->leafs[i]->name);
         }
     }
@@ -1443,6 +1442,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 *node_backend_id = *this_node_backend_id;
             }
         }
+        else if (node->op == GGML_OP_MUL && node->src[0]->op == GGML_OP_NORM) {
+            // This is a hack for Cohere2. Without it, the scheduler creates
+            // completely nonsensical splits for that arch.
+            int * src1_id = &tensor_backend_id(node->src[1]);
+            if (*src1_id >= 0) {
+                int * src0_id = &tensor_backend_id(node->src[0]);
+                int * dst_id  = &tensor_backend_id(node);
+                *src0_id = *src1_id;
+                *dst_id  = *src1_id;
+                // For some reason that I don't understand, the norm's backend can already be
+                // assigned at this point. How? That's why the more logical approach of checking
+                // first is commented out.
+                //if (*src0_id < 0) {
+                //    *src0_id = *src1_id;
+                //} else {
+                //    printf("Oops: backend_id_src0(%s) = %d, backend_id_src1(%s) = %d\n", node->src[0]->name, *src0_id, node->src[1]->name, *src1_id);
+                //    //GGML_ASSERT(*src0_id == *src1_id);
+                //}
+                //if (*dst_id < 0) {
+                //    *dst_id = *src1_id;
+                //} else {
+                //    printf("Oops: backend_id_dst(%s) = %d, backend_id_src1(%s) = %d\n", node->name, *dst_id, node->src[1]->name, *src1_id);
+                //    //GGML_ASSERT(*dst_id == *src1_id);
+                //}
+            }
+        }
         // do not overwrite user assignments
         if (*node_backend_id == -1) {
             *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
View File

@@ -651,7 +651,8 @@ ggml_tensor * llm_build_context::llm_build_ffn(
         ggml_tensor * act_scales,
         llm_ffn_op_type type_op,
         llm_ffn_gate_type type_gate,
-        const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input) {
+        const llm_build_cb & cb, int il, ggml_cgraph * graph, bool add_input,
+        bool is_norm, ggml_tensor * add_extra) {
     if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s &&
         up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR &&
@@ -676,7 +677,13 @@ ggml_tensor * llm_build_context::llm_build_ffn(
         if (ffn_norm && ffn_norm->extra) {
             auto norm = (ggml_split_tensor_t *)ffn_norm->extra;
             GGML_ASSERT(norm->splits[id]);
-            cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
+            if (is_norm) {
+                cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM, cb, il);
+                GGML_ASSERT(cur->src[0]->op == GGML_OP_NORM);
+                cur->src[0]->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
+            } else {
+                cur = llm_build_norm(ctx, cur, lctx.model.hparams, norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
+            }
             cb(cur, "ffn_inp_normed", il_cb);
         }
         else if (cur->type != GGML_TYPE_F32) {
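
The 0xff written into the last op_params slot marks these plain-LayerNorm nodes. This diff does not show where the marker is consumed (presumably a later scheduling or fusion check), but ggml_tensor does carry a fixed int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)] array, and GGML_OP_NORM only uses the first slot for its eps, so the last slot is free for a tag. A sketch of writing and reading back such a tag; the helper names are hypothetical and the meaning of 0xff is specific to this WIP patch:

// Sketch only: tag a GGML_OP_NORM node through the last op_params slot and
// test for it later. tag_norm_node / norm_node_is_tagged are hypothetical.
#include "ggml.h"

static void tag_norm_node(struct ggml_tensor * t) {
    t->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
}

static bool norm_node_is_tagged(const struct ggml_tensor * t) {
    return t->op == GGML_OP_NORM &&
           t->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] == 0xff;
}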
@@ -702,6 +709,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(
             ffn.back() = ggml_add(ctx, ffn.back(), input);
             cb(ffn.back(), "ffn_with_inp", il);
         }
+        if (add_extra) {
+            ffn.back() = ggml_add(ctx, ffn.back(), add_extra);
+            cb(ffn.back(), "ffn_with_inp", il);
+        }
         if (ffn.size() == 1) return ffn.front();
         auto cur = ggml_reduce(ctx, ffn.data(), u->n_device, GGML_OP_ADD);
         cb(cur, "ffn_combined", il);
@@ -711,7 +722,7 @@ ggml_tensor * llm_build_context::llm_build_ffn(
     auto cur = input;
     if (ffn_norm) {
-        cur = llm_build_norm(ctx, cur, lctx.model.hparams, ffn_norm, NULL, LLM_NORM_RMS, cb, il);
+        cur = llm_build_norm(ctx, cur, lctx.model.hparams, ffn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
         cb(input, "ffn_norm", il);
     }
     if (cur->type != GGML_TYPE_F32) {
@@ -746,6 +757,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(
         cur = ggml_add(ctx, cur, input);
         cb(cur, "ffn_out_with_inp", il);
     }
+    if (add_extra) {
+        cur = ggml_add(ctx, cur, add_extra);
+        cb(cur, "ffn_out_with_inp", il);
+    }
     return cur;
 }
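
With the two new trailing parameters in place, the Cohere2 builder further down calls llm_build_ffn like this (a restatement of the call in build_cohere2(), with the trailing arguments spelled out):

// The build_cohere2() call below, annotated:
cur = llm_build_ffn(ctx0, lctx, model.layers[il].attn_norm, inpL,
        model.layers[il].ffn_up,   NULL, NULL,
        model.layers[il].ffn_gate, NULL, NULL,
        model.layers[il].ffn_down, NULL, NULL, NULL,
        LLM_FFN_SILU, LLM_FFN_PAR, cb, il, gf,
        /*add_input =*/ false,
        /*is_norm   =*/ true,       // plain LayerNorm instead of RMS norm
        /*add_extra =*/ attn_out);  // fold the attention output into the FFN result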
@@ -865,6 +880,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(
         cur = ggml_add(ctx, cur, input);
         cb(cur, "ffn_out_with_inp", il);
     }
+    if (add_extra) {
+        cur = ggml_add(ctx, cur, add_extra);
+        cb(cur, "ffn_out_with_inp", il);
+    }
     return cur;
 }
@@ -7254,48 +7273,25 @@ ggml_cgraph * llm_build_context::build_cohere2() {
         const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
         struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
-        // norm
-        auto attn_norm = model.layers[il].attn_norm;
-        int id = -1;
-        if (attn_norm->extra) {
-            auto extra = (ggml_split_tensor_t *)attn_norm->extra;
-            for (int i = extra->n_device-1; i >= 0; --i) {
-                if (extra->splits[i]) {
-                    attn_norm = extra->splits[i];
-                    id = i;
-                    break;
-                }
-            }
-        }
-        cur = llm_build_norm(ctx0, inpL, hparams, attn_norm, NULL, LLM_NORM, cb, il);
-        if (id >= 0) {
-            ggml_backend_sched_set_tensor_backend(lctx.sched, cur->src[0], ggml_backend_sched_get_backend(lctx.sched, id));
-        }
-        cb(cur, "attn_norm", il);
-        auto ffn_inp = cur;
         // self-attention
-        auto attn_out = build_std_attention(gf, nullptr, cur, inp_pos, nullptr, KQ_mask_l, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), 0.f,
-                is_sliding ? hparams.n_swa : 0, il, is_sliding, true);
+        auto attn_out = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, nullptr, KQ_mask_l, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), 0.f,
+                is_sliding ? hparams.n_swa : 0, il, is_sliding, false, true, true);
         cb(attn_out, "attn_out", il);
         if (il == n_layer - 1) {
             // skip computing output for unused tokens
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             attn_out = ggml_get_rows(ctx0, attn_out, inp_out_ids);
-            ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
+            inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
         }
         // feed-forward network
-        cur = llm_build_ffn(ctx0, lctx, nullptr, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
+        cur = llm_build_ffn(ctx0, lctx, model.layers[il].attn_norm, inpL, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
                 NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
-                cb, il, gf);
+                cb, il, gf, false, true, attn_out);
         cb(cur, "ffn_out", il);
         // add together residual + FFN + self-attention
-        cur = ggml_add(ctx0, cur, attn_out);
         cur = ggml_add(ctx0, cur, inpL);
         cur = lctx.cvec.apply_to(ctx0, cur, il);
         cb(cur, "l_out", il);
@@ -7304,9 +7300,6 @@ ggml_cgraph * llm_build_context::build_cohere2() {
     }
     cur = inpL;
-    //if (cur->type != GGML_TYPE_F32) {
-    //    cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
-    //}
     cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1);
     cb(cur, "result_norm", -1);
@@ -9337,7 +9330,7 @@ ggml_cgraph * llm_build_context::llama_build_graph(
 ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tensor * the_attn_norm,
         ggml_tensor * input, ggml_tensor * inp_pos, ggml_tensor * rope_factors_in,
         ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale,
-        int n_swa, int il, bool do_rope, bool add_graph_split, bool add_input) {
+        int n_swa, int il, bool do_rope, bool add_graph_split, bool add_input, bool is_norm) {
     if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
         model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
         if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {
@@ -9382,11 +9375,20 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
                 if (!split_wq) continue;
                 auto cur = get_input_tensor_sm_graph(input, id);
                 if (attn_norm) {
-                    auto split_norm = attn_norm->splits[id];
-                    cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, LLM_NORM_RMS, cb, il);
-                    cb(cur, "attn_norm", il_cb);
+                    if (is_norm) {
+                        cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM, cb, il);
+                        GGML_ASSERT(cur->src[0]->op == GGML_OP_NORM);
+                        cur->src[0]->op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t) - 1] = 0xff;
+                    } else {
+                        cur = llm_build_norm(ctx0, cur, lctx.model.hparams, attn_norm->splits[id], NULL, LLM_NORM_RMS, cb, il);
+                    }
                 }
-                else if (cur->type != GGML_TYPE_F32) {
+                //if (attn_norm) {
+                //    auto split_norm = attn_norm->splits[id];
+                //    cur = llm_build_norm(ctx0, cur, hparams, split_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
+                //    cb(cur, "attn_norm", il_cb);
+                //}
+                if (cur->type != GGML_TYPE_F32) {
                     cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
                 }
                 auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
@@ -9559,7 +9561,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
     auto cur = input;
     if (the_attn_norm) {
-        cur = llm_build_norm(ctx0, cur, hparams, the_attn_norm, NULL, LLM_NORM_RMS, cb, il);
+        cur = llm_build_norm(ctx0, cur, hparams, the_attn_norm, NULL, is_norm ? LLM_NORM : LLM_NORM_RMS, cb, il);
         cb(cur, "attn_norm", il);
     }

View File

@@ -335,7 +335,8 @@ struct llm_build_context {
         ggml_tensor * act_scales,
         llm_ffn_op_type type_op,
         llm_ffn_gate_type type_gate,
-        const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false);
+        const llm_build_cb & cb, int il, ggml_cgraph * graph = nullptr, bool add_input = false,
+        bool is_norm = false, ggml_tensor * add_extra = nullptr);
     static ggml_tensor * llm_build_moe_ffn(ggml_context * ctx, llama_context & lctx,
         ggml_tensor * cur,
@@ -410,6 +411,6 @@ llm_expert_gating_func_type gating_op,
     ggml_tensor * build_std_attention(ggml_cgraph * gf, ggml_tensor * attn_norm, ggml_tensor * cur, ggml_tensor * inp_pos, ggml_tensor * rope_factors,
         ggml_tensor * KQ_mask, ggml_tensor * sinks, ggml_tensor * inp_attn_scale, float KQ_scale, float f_attn_scale,
-        int n_swa, int il, bool do_rope = true, bool add_graph_split = false, bool add_input = false);
+        int n_swa, int il, bool do_rope = true, bool add_graph_split = false, bool add_input = false, bool is_norm = false);
 };
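
For reference, the extended build_std_attention signature is exercised by the Cohere2 attention call above; restated here with the trailing booleans annotated:

// The build_cohere2() attention call, annotated:
auto attn_out = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos,
        nullptr, KQ_mask_l, nullptr, nullptr,
        1.0f / sqrtf(float(n_embd_head)), 0.f,
        is_sliding ? hparams.n_swa : 0, il,
        /*do_rope         =*/ is_sliding,   // Cohere2 applies RoPE only on sliding-window layers
        /*add_graph_split =*/ false,
        /*add_input       =*/ true,
        /*is_norm         =*/ true);        // plain LayerNorm, tagged via op_params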