From 1f072ab135b210fad60474350d849a768ba055fd Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Mon, 20 Oct 2025 10:09:39 +0300
Subject: [PATCH] Do not allocate KV cache for unused layers (#843)

* Do not allocate KV cache for unused layers

* Do not apply experts weight scale if it is 1

---------

Co-authored-by: Iwan Kawrakow
---
 src/llama-build-context.cpp | 2 +-
 src/llama.cpp               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index dd0b62be..6693e7c2 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -863,7 +863,7 @@ llm_expert_gating_func_type gating_op,
         weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
     }

-    if (scale_w) {
+    if (scale_w && std::abs(w_scale-1) > 1e-5f) {
         weights = ggml_scale(ctx, weights, w_scale);
         cb(weights, "ffn_moe_weights_scaled", il);
     }
diff --git a/src/llama.cpp b/src/llama.cpp
index 928d66b0..57f2b75c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -532,7 +532,7 @@ static bool llama_kv_cache_init(

     const struct llama_hparams & hparams = model.hparams;

-    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_layer = hparams.n_layer - hparams.nextn_predict_layers;

     cache.has_shift = false;
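
A minimal standalone sketch of the first change, using a plain std::vector<float> as a
stand-in for the ggml weights tensor (scale_weights and weights_t are hypothetical names,
not the patch's API). The epsilon test mirrors the patch: scaling by 1.0 is a no-op, so
comparing w_scale against 1 with a small tolerance lets the graph builder skip emitting a
ggml_scale node, and the scaled-weights callback, when the scale would change nothing.

// Sketch only: illustrates the no-op-scale guard, not the ggml graph API.
#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the tensor of expert routing weights.
using weights_t = std::vector<float>;

static void scale_weights(weights_t & weights, bool scale_w, float w_scale) {
    // Only apply the scale when it would actually change the values.
    // A scale of exactly 1 is a no-op; the epsilon absorbs float noise.
    if (scale_w && std::abs(w_scale - 1.0f) > 1e-5f) {
        for (float & w : weights) {
            w *= w_scale;
        }
    }
}

int main() {
    weights_t w = {0.25f, 0.75f};
    scale_weights(w, /*scale_w=*/true, /*w_scale=*/1.0f); // skipped: scale is 1
    scale_weights(w, /*scale_w=*/true, /*w_scale=*/2.5f); // applied
    std::printf("%f %f\n", w[0], w[1]);
    return 0;
}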
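
The second change can be sketched the same way, with a hypothetical hparams_t struct in
place of llama_hparams. The trailing nextn_predict_layers are reserved for next-token
("nextn") prediction heads and do not serve ordinary attention during decoding, so
subtracting them before sizing the KV cache avoids allocating K/V buffers that would
never be read.

// Sketch only, assuming the two fields named in the patch; kv_cache_layers is illustrative.
#include <cstdint>
#include <cstdio>

struct hparams_t {
    int64_t n_layer;              // total transformer layers in the model
    int64_t nextn_predict_layers; // trailing layers used only for nextn prediction
};

static int64_t kv_cache_layers(const hparams_t & hp) {
    // Reserve K/V tensors only for layers that actually run attention
    // during normal decoding.
    return hp.n_layer - hp.nextn_predict_layers;
}

int main() {
    const hparams_t hp = {/*n_layer=*/61, /*nextn_predict_layers=*/1};
    std::printf("KV cache layers: %lld of %lld\n",
                (long long) kv_cache_layers(hp), (long long) hp.n_layer);
    return 0;
}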