diff --git a/src/llama.cpp b/src/llama.cpp
index 6bdb0621..5565ffcd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -321,6 +321,7 @@ enum llm_kv {
     LLM_KV_TIME_DECAY_EXTRA_DIM,
     LLM_KV_RESIDUAL_SCALE,
     LLM_KV_EMBEDDING_SCALE,
+    LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
@@ -425,6 +426,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING,     "%s.final_logit_softcapping" },
     { LLM_KV_RESIDUAL_SCALE,              "%s.residual_scale" },
     { LLM_KV_EMBEDDING_SCALE,             "%s.embedding_scale" },
+    { LLM_KV_TOKEN_SHIFT_COUNT,           "%s.token_shift_count" },
+    { LLM_KV_INTERLEAVE_MOE_LAYER_STEP,   "%s.interleave_moe_layer_step" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,        "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,     "%s.attention.head_count_kv" },
@@ -2983,6 +2986,8 @@ struct llama_context {
     struct llama_kv_cache kv_self;
     struct llama_control_vector cvec;
 
+    std::vector<float> scale_data;
+
     std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
 
     std::vector<ggml_backend_t> backends;
@@ -3059,6 +3064,7 @@ struct llama_context {
     struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
     struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
     struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+    struct ggml_tensor * inp_scale = nullptr; // F32 [n_tokens]
 };
 
 struct llama_lora_weight {
@@ -8761,8 +8767,6 @@ static void llm_build_kv_store(
     // note: storing RoPE-ed version of K in the KV cache
     ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
 
-    assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
-
     struct ggml_tensor * v_cache_view = nullptr;
 
     if (cparams.flash_attn) {
@@ -9643,6 +9647,14 @@ struct llm_build_context {
         return lctx.inp_pos;
     }
 
+    struct ggml_tensor * build_inp_attn_scale(int n_tokens) {
+        int n_pos_per_token = 1;
+        lctx.inp_scale = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token);
+        cb(lctx.inp_scale, "inp_scale", -1);
+        ggml_set_input(lctx.inp_scale);
+        return lctx.inp_scale;
+    }
+
     struct ggml_tensor * build_rope_factors(int il) {
         // choose long/short freq factors based on the context size
         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
@@ -9833,14 +9845,7 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();
 
         if (model.arch == LLM_ARCH_LLAMA4) {
-            //inp_attn_scale = build_inp_attn_scale();
-            inp_attn_scale = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*1); // n_pos_per_token(), which is 4 for LLM_ARCH_QWEN2VL, 1 otherwise
-            ggml_set_input(inp_attn_scale);
-            auto scale_data = (float *)inp_attn_scale->data;
-            auto pos = (const int32_t *)inp_pos->data;
-            for (int i = 0; i < n_tokens; ++i) {
-                scale_data[i] = std::log(std::floor((pos[i] + 1.0f) / hparams.n_attn_temp_floor_scale) + 1.0f) * hparams.f_attn_temp_scale + 1.0f;
-            }
+            inp_attn_scale = build_inp_attn_scale(n_tokens);
         }
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
@@ -9890,20 +9895,16 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
                 if (use_rope) {
-                    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
+                    Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow);
 
-                    Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
+                    Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow);
                 } else if (inp_attn_scale) {
-                    Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                    Qcur = ggml_mul(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_attn_scale);
                 }
 
                 cb(Qcur, "Qcur", il);
@@ -15640,6 +15641,17 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
     }
 
+    if (lctx.inp_pos && lctx.inp_scale) {
+        int n_tokens = batch.n_tokens;
+        GGML_ASSERT(ggml_nelements(lctx.inp_scale) >= n_tokens);
+        if (int(lctx.scale_data.size()) < n_tokens) lctx.scale_data.resize(n_tokens);
+        int n_pos_per_token = 1;
+        for (int i = 0; i < n_tokens; ++i) {
+            lctx.scale_data[i] = std::log(std::floor((batch.pos[i] + 1.0f) / hparams.n_attn_temp_floor_scale) + 1.0f) * hparams.f_attn_temp_scale + 1.0f;
+        }
+        ggml_backend_tensor_set(lctx.inp_scale, lctx.scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(lctx.inp_scale));
+    }
+
     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
         GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
        const int64_t n_tokens = batch.n_tokens;
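Note (not part of the diff): the value written into inp_scale is the per-token Llama 4 attention temperature. The patch moves its computation out of graph construction (where the removed code wrote directly into tensor data) into llama_set_inputs(), where batch.pos is available on the host and the result is uploaded with ggml_backend_tensor_set. A minimal standalone sketch of the formula follows; the two constants stand in for hparams.n_attn_temp_floor_scale and hparams.f_attn_temp_scale and are illustrative placeholders, not values read from any model.

    // Standalone sketch: behavior of the attention temperature fed into inp_scale.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const float attn_temp_floor_scale = 8192.0f; // placeholder for hparams.n_attn_temp_floor_scale
        const float attn_temp_scale       = 0.1f;    // placeholder for hparams.f_attn_temp_scale

        const std::vector<int32_t> pos = {0, 1, 8191, 8192, 65535};
        for (int32_t p : pos) {
            // same expression as in llama_set_inputs() above
            const float scale = std::log(std::floor((p + 1.0f) / attn_temp_floor_scale) + 1.0f) * attn_temp_scale + 1.0f;
            std::printf("pos=%6d  attn scale=%.4f\n", (int) p, scale);
        }
        return 0;
    }

The scale stays at 1.0 for early positions and grows in logarithmic steps each time the position crosses another multiple of the floor scale, which is why it can be computed once per token from batch.pos alone.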