From 90d7499c2c03ff894d2d73a177b166fff8ce2c59 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sat, 7 Feb 2026 07:56:11 +0200
Subject: [PATCH] Step-3.5: llama.cpp compatibility changes (#1240)

* Step-3.5: llama.cpp compatibility changes

* Also read rope_freq_base_train_swa from the GGUF
---
 src/llama-arch.cpp    |  2 ++
 src/llama-arch.h      |  2 ++
 src/llama-hparams.cpp | 19 ++++++++++++++-----
 src/llama-hparams.h   |  2 +-
 4 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 10816d2f..6402491f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -131,6 +131,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
     { LLM_KV_SWIGLU_LIMITS, "%s.swiglu_limits" },
     { LLM_KV_SWIGLU_LIMITS_SHARED, "%s.swiglu_limits_shared" },
+    { LLM_KV_SWIGLU_CLAMP_EXP, "%s.swiglu_clamp_exp" },
+    { LLM_KV_SWIGLU_CLAMP_SHEXP, "%s.swiglu_clamp_shexp" },
 
     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 0dca2a06..9dd8df26 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -124,6 +124,8 @@ enum llm_kv {
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
     LLM_KV_SWIGLU_LIMITS,
     LLM_KV_SWIGLU_LIMITS_SHARED,
+    LLM_KV_SWIGLU_CLAMP_EXP,
+    LLM_KV_SWIGLU_CLAMP_SHEXP,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 38c0773c..c6697d4a 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -1129,11 +1129,21 @@ void llm_load_hparams(
                 if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
                     hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID;
                 }
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
                 ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, hparams.rope_dim_per_layer, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_limits, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_limits_shared, hparams.n_layer);
+                if (!ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, hparams.rope_dim_per_layer, hparams.n_layer, false)) {
+                    for (int i = 0; i < hparams.n_layer; ++i) {
+                        hparams.rope_dim_per_layer[i] = hparams.swa_layers[i] ? hparams.n_rot : hparams.n_rot/2;
+                    }
+                }
+                // The following two parameters: one of the two versions must be present in the GGUF
+                if (!ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_limits, hparams.n_layer, false)) {
+                    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_limits, hparams.n_layer, true);
+                }
+                if (!ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_limits_shared, hparams.n_layer, false)) {
+                    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_limits_shared, hparams.n_layer, true);
+                }
                 // Optional: Step35-only gating for applying rope scaling (HF: yarn_only_types).
                 // Default is 3 (apply on all layers) if the key is absent.
                 //ml.get_key(format("%s.rope.scaling.apply_mask", ml.get_arch_name().c_str()),
@@ -1145,7 +1155,6 @@ void llm_load_hparams(
                            hparams.rope_scaling_apply_mask, false);
                 hparams.has_rope_freq_base_per_layer = ml.get_key_or_arr(LLM_KV_ROPE_FREQ_BASE_PER_LAYER,
                            hparams.rope_freq_base_per_layer, hparams.n_layer, false);
-                //type = LLM_TYPE_UNKNOWN; <--- what is this?
             } break;
         default: (void)0;
     }
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 3c0cb328..e2a1007b 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -70,7 +70,7 @@ struct llama_hparams {
     float rope_freq_base_train_swa;
     float rope_freq_scale_train;
    float rope_freq_scale_train_swa;
-    uint32_t rope_scaling_apply_mask = 0x3;
+    uint32_t rope_scaling_apply_mask = 0x1;
     bool has_rope_freq_base_per_layer = false;
     uint32_t n_ctx_orig_yarn;
     float rope_yarn_log_mul = 0.0f;
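
Note on the compatibility fallback (not part of the patch itself): the sketch below is a minimal, self-contained illustration of the loading behaviour introduced by the llama-hparams.cpp hunk. The fake_loader type and its get_key_or_arr() helper are made-up stand-ins for the internal model-loader interface, and the "step35." key prefix is only an assumed substitution for the "%s" placeholder in the key names; what mirrors the patch is the prefer-existing-key / fall-back-to-clamp-key decision and the per-layer rope-dimension default derived from the SWA layer pattern.

#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Stand-in for the GGUF metadata reader; the real code goes through the model loader (ml).
struct fake_loader {
    std::map<std::string, std::vector<float>> kv; // keys actually present in the file

    // Returns true and fills `out` (broadcasting the trailing value to n entries) when the
    // key exists; returns false when it is absent, printing an error if it was required.
    bool get_key_or_arr(const std::string & key, std::vector<float> & out, uint32_t n, bool required) {
        auto it = kv.find(key);
        if (it == kv.end()) {
            if (required) { std::fprintf(stderr, "missing required key %s\n", key.c_str()); }
            return false;
        }
        out = it->second;
        out.resize(n, out.empty() ? 0.0f : out.back());
        return true;
    }
};

int main() {
    const uint32_t n_layer = 4;
    const uint32_t n_rot   = 128;

    fake_loader ml;
    // Pretend the GGUF was written with the llama.cpp-style key name only.
    ml.kv["step35.swiglu_clamp_exp"] = {7.0f};

    // Prefer the existing key, fall back to the llama.cpp-compatible clamp key (as in the patch).
    std::vector<float> swiglu_limits;
    if (!ml.get_key_or_arr("step35.swiglu_limits", swiglu_limits, n_layer, false)) {
        ml.get_key_or_arr("step35.swiglu_clamp_exp", swiglu_limits, n_layer, true);
    }

    // Per-layer rope dimensions: when the per-layer key is absent, SWA layers get the
    // full n_rot and the remaining layers get half of it (as in the patch).
    const std::vector<uint32_t> swa_layers = {1, 0, 1, 0};
    std::vector<uint32_t> rope_dim_per_layer(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {
        rope_dim_per_layer[i] = swa_layers[i] ? n_rot : n_rot/2;
    }

    for (uint32_t i = 0; i < n_layer; ++i) {
        std::printf("layer %u: swiglu limit = %.1f, rope dims = %u\n",
                    i, swiglu_limits[i], rope_dim_per_layer[i]);
    }
    return 0;
}

Per the patch title and the in-code comment ("one of the two versions must be present in the GGUF"), the intent of the fallback is that a GGUF carrying either naming of the SwiGLU limit metadata still loads.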