multi_sdd: WIP

2026-03-03 02:20:01 +00:00 · 2024-10-29 12:11:37 +02:00
parent 5ad6439486
commit 70dd470cbc
3 changed files with 150 additions and 2 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8351,16 +8351,42 @@ static struct ggml_tensor * llm_build_moe_ffn(

    experts = ggml_mul(ctx, experts, weights);

+    if (n_expert_used == 1) {
+        return ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0));
+    }
+    if (n_expert_used == 2) {
+        return ggml_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0),
+                             ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], experts->nb[1]));
+    }
+    if (n_expert_used <= GGML_MAX_SRC) {
+        ggml_tensor * src[GGML_MAX_SRC];
+        for (int i = 0; i < n_expert_used; ++i) {
+            src[i] = ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+        }
+        for (int i = n_expert_used; i < GGML_MAX_SRC; ++i) src[i] = nullptr;
+        return ggml_multi_add(ctx, src);
+    }
+
+    GGML_ABORT("fatal error");
+
+    //int nloop = (n_expert_used + GGML_MAX_SRC - 1)/GGML_MAX_SRC;
+
    // aggregate experts
    ggml_tensor * moe_out = nullptr;
+    //ggml_tensor * first_expert = nullptr;
    for (int i = 0; i < n_expert_used; ++i) {
        ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
                experts->nb[2], i*experts->nb[1]);

        if (i == 0) {
            moe_out = cur_expert;
+            //first_expert = cur_expert;
+            //printf("%s: %d: %d x %d x %d x %d | %d x %d x %d x %d\n", __func__, ggml_is_contiguous(first_expert),
+            //        (int)cur_expert->ne[0], (int)cur_expert->ne[1], (int)cur_expert->ne[2], (int)cur_expert->ne[3],
+            //        (int)cur_expert->nb[0], (int)cur_expert->nb[1], (int)cur_expert->nb[2], (int)cur_expert->nb[3]);
        } else {
            moe_out = ggml_add(ctx, moe_out, cur_expert);
+            //printf("%s: %d  %d\n", __func__, ggml_is_contiguous(cur_expert), ggml_are_same_shape(cur_expert, first_expert));
        }
    }

@@ -9011,6 +9037,7 @@ struct llm_build_context {
                // compute Q and K and RoPE them
                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                if (hparams.f_attention_scale != 0) {
+                    // Why is hparams.f_attention_scale not simply absorbed into model.layers[il].wq ?
                    Qcur = ggml_scale(ctx0, Qcur, hparams.f_attention_scale);
                }
                cb(Qcur, "Qcur", il);
@@ -9062,6 +9089,7 @@ struct llm_build_context {

            // For Granite architecture
            if (hparams.f_residual_scale) {
+                // Why is hparams.f_residual_scale not simply absorbed into model.layers[il].wv ?
                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
            }

@@ -9103,6 +9131,7 @@ struct llm_build_context {

            // For Granite architecture
            if (hparams.f_residual_scale) {
+                // Why is hparams.f_residual_scale not simply absorbed into model.layers[il].ffn_down_exps ?
                cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
            }

@@ -9128,6 +9157,7 @@ struct llm_build_context {

        // For Granite architecture
        if (hparams.f_logit_scale) {
+            // Why is hparams.f_logit_scale not simply absorbed into model.output ?
            cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
        }