Mirror of https://github.com/ikawrakow/ik_llama.cpp.git, synced 2026-01-26.
Do not allocate KV cache for unused layers (#843)
* Do not allocate KV cache for unused layers

* Do not apply experts weight scale if it is 1

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
@@ -863,7 +863,7 @@ llm_expert_gating_func_type gating_op,
         weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
     }
-    if (scale_w) {
+    if (scale_w && std::abs(w_scale-1) > 1e-5f) {
         weights = ggml_scale(ctx, weights, w_scale);
         cb(weights, "ffn_moe_weights_scaled", il);
     }

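A minimal standalone sketch of the new guard (the helper name is hypothetical; the real check sits inline in the MoE FFN graph-building code): scaling by a factor of 1 is a no-op, so comparing w_scale against 1 with a small epsilon avoids adding a pointless ggml_scale node to the compute graph.

#include <cmath>
#include <cstdio>

// Hypothetical stand-in for the guard in the diff above: apply the experts
// weight scale only when scaling is enabled AND the factor differs from 1
// by more than a small epsilon; scaling by exactly 1 would be a no-op node.
static bool needs_weight_scale(bool scale_w, float w_scale) {
    return scale_w && std::fabs(w_scale - 1.0f) > 1e-5f;
}

int main() {
    std::printf("%d\n", needs_weight_scale(true,  1.0f)); // 0: scale of 1 is skipped
    std::printf("%d\n", needs_weight_scale(true,  2.5f)); // 1: real scaling still happens
    std::printf("%d\n", needs_weight_scale(false, 2.5f)); // 0: scaling disabled entirely
    return 0;
}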
@@ -532,7 +532,7 @@ static bool llama_kv_cache_init(

     const struct llama_hparams & hparams = model.hparams;

-    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_layer = hparams.n_layer - hparams.nextn_predict_layers;

     cache.has_shift = false;

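To see the effect of this change, here is an illustrative back-of-the-envelope sketch (all sizes are made up; nextn_predict_layers counts the NextN/MTP prediction layers, which never use the KV cache): dropping those layers from the allocation loop bound shrinks the cache proportionally.

#include <cstdint>
#include <cstdio>

// Illustrative only: llama_kv_cache_init() allocates K/V tensors per layer.
// The numbers below are hypothetical; the point is that layers excluded
// from n_layer no longer get KV buffers at all.
int main() {
    const int64_t n_layer              = 62;          // total layers in the model
    const int64_t nextn_predict_layers = 1;           // MTP layers with no KV use
    const int64_t bytes_per_layer      = 64ll << 20;  // made-up per-layer K+V size

    const int64_t before = n_layer * bytes_per_layer;
    const int64_t after  = (n_layer - nextn_predict_layers) * bytes_per_layer;

    std::printf("KV cache before: %lld MiB\n", (long long)(before >> 20));
    std::printf("KV cache after : %lld MiB\n", (long long)(after  >> 20));
    return 0;
}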