llama4: this seems to be working

2026-02-24 23:24:13 +00:00 · 2025-04-09 12:02:22 +03:00
parent 7a1fc34023
commit b51661bbff
1 changed files with 29 additions and 17 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -321,6 +321,7 @@ enum llm_kv {
    LLM_KV_TIME_DECAY_EXTRA_DIM,
    LLM_KV_RESIDUAL_SCALE,
    LLM_KV_EMBEDDING_SCALE,
+    LLM_KV_TOKEN_SHIFT_COUNT,
    LLM_KV_INTERLEAVE_MOE_LAYER_STEP,

    LLM_KV_ATTENTION_HEAD_COUNT,
@@ -425,6 +426,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_FINAL_LOGIT_SOFTCAPPING,           "%s.final_logit_softcapping"           },
    { LLM_KV_RESIDUAL_SCALE,                    "%s.residual_scale"                    },
    { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
+    { LLM_KV_TOKEN_SHIFT_COUNT,                 "%s.token_shift_count"                 },
+    { LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         "%s.interleave_moe_layer_step"         },

    { LLM_KV_ATTENTION_HEAD_COUNT,             "%s.attention.head_count"             },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV,          "%s.attention.head_count_kv"          },
@@ -2983,6 +2986,8 @@ struct llama_context {
    struct llama_kv_cache       kv_self;
    struct llama_control_vector cvec;

+    std::vector<float> scale_data;
+
    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;

    std::vector<ggml_backend_t> backends;
@@ -3059,6 +3064,7 @@ struct llama_context {
    struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
    struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
    struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+    struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]
 };

 struct llama_lora_weight {
@@ -8761,8 +8767,6 @@ static void llm_build_kv_store(
    // note: storing RoPE-ed version of K in the KV cache
    ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));

-    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
-
    struct ggml_tensor * v_cache_view = nullptr;

    if (cparams.flash_attn) {
@@ -9643,6 +9647,14 @@ struct llm_build_context {
        return lctx.inp_pos;
    }

+    struct ggml_tensor * build_inpup_scale(int n_tokens) {
+        int n_pos_per_token = 1;
+        lctx.inp_scale = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token);
+        cb(lctx.inp_scale, "inp_scale", -1);
+        ggml_set_input(lctx.inp_scale);
+        return lctx.inp_scale;
+    }
+
    struct ggml_tensor * build_rope_factors(int il) {
        // choose long/short freq factors based on the context size
        const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -9833,14 +9845,7 @@ struct llm_build_context {
        struct ggml_tensor * inp_pos = build_inp_pos();

        if (model.arch == LLM_ARCH_LLAMA4) {
-            //inp_attn_scale = build_inp_attn_scale();
-            inp_attn_scale = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*1); // n_pos_per_token(), which is 4 for LLM_ARCH_QWEN2VL, 1 otherwise
-            ggml_set_input(inp_attn_scale);
-            auto scale_data = (float *)inp_attn_scale->data;
-            auto pos = (const int32_t *)inp_pos->data;
-            for (int i = 0; i < n_tokens; ++i) {
-                scale_data[i] = std::log(std::floor((pos[i] + 1.0f) / hparams.n_attn_temp_floor_scale) + 1.0f) * hparams.f_attn_temp_scale + 1.0f;
-            }
+            inp_attn_scale = build_inpup_scale(n_tokens);
        }

        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
@@ -9890,20 +9895,16 @@ struct llm_build_context {
                    cb(Vcur, "Vcur", il);
                }

-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
                if (use_rope) {
-                    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
+                    Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                            ext_factor, attn_factor, beta_fast, beta_slow);

-                    Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
+                    Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
                            n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                            ext_factor, attn_factor, beta_fast, beta_slow);
                } else if (inp_attn_scale) {
-                    Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                    Qcur = ggml_mul(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_attn_scale);
                }

                cb(Qcur, "Qcur", il);
@@ -15640,6 +15641,17 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
    }

+    if (lctx.inp_pos && lctx.inp_scale) {
+        int n_tokens = batch.n_tokens;
+        GGML_ASSERT(ggml_nelements(lctx.inp_scale) >= n_tokens);
+        if (int(lctx.scale_data.size()) < n_tokens) lctx.scale_data.resize(n_tokens);
+        int n_pos_per_token = 1;
+        for (int i = 0; i < n_tokens; ++i) {
+            lctx.scale_data[i] = std::log(std::floor((batch.pos[i] + 1.0f) / hparams.n_attn_temp_floor_scale) + 1.0f) * hparams.f_attn_temp_scale + 1.0f;
+        }
+        ggml_backend_tensor_set(lctx.inp_scale, lctx.scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(lctx.inp_scale));
+    }
+
    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
        const int64_t n_tokens = batch.n_tokens;