mirror of https://github.com/ikawrakow/ik_llama.cpp.git
Warn user when disabling MLA
@@ -3168,6 +3168,8 @@ static bool llama_kv_cache_init(
     cache.kv_l.reserve(n_layer);
     cache.kvt_l.reserve(n_layer);
 
+    bool warn = true;
+    int n_mla = 0;
     for (int i = 0; i < (int) n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -3175,6 +3177,17 @@ static bool llama_kv_cache_init(
         struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
         ggml_tensor * k;
         ggml_tensor * v;
+        if (cparams.mla_attn) {
+            if (!model.layers[i].wk_b || !model.layers[i].wv_b) {
+                if (warn) {
+                    LLAMA_LOG_WARN("=======================================================================================\n");
+                    LLAMA_LOG_WARN("%s: missing MLA tensors => disabling MLA\n", __func__);
+                    LLAMA_LOG_WARN("%s: you need to reconvert your model in order to use MLA\n", __func__);
+                    LLAMA_LOG_WARN("=======================================================================================\n");
+                    warn = false;
+                }
+            }
+        }
         if (cparams.mla_attn && model.layers[i].wk_b && model.layers[i].wv_b) {
             // DeepSeek MLA
             const uint32_t n_embd_head_qk_rope = hparams.n_rot;
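The warning above is deliberately emitted at most once: warn starts out true and is cleared after the first layer found to be missing the decomposed attention tensors, so a model where every layer is affected still produces a single banner instead of one per layer. A minimal standalone sketch of that warn-once pattern, with a hypothetical layer_has_mla_tensors() helper standing in for the model.layers[i].wk_b / wv_b checks:

    #include <cstdio>

    // Hypothetical stand-in for the model.layers[i].wk_b / wv_b presence check.
    static bool layer_has_mla_tensors(int /*il*/) { return false; }

    static void init_cache_sketch(int n_layer, bool mla_requested) {
        bool warn = true;                          // print the banner at most once
        for (int il = 0; il < n_layer; ++il) {
            if (mla_requested && !layer_has_mla_tensors(il) && warn) {
                std::fprintf(stderr, "missing MLA tensors => disabling MLA\n");
                std::fprintf(stderr, "you need to reconvert your model in order to use MLA\n");
                warn = false;                      // later layers stay silent
            }
            // ... per-layer cache allocation would continue here ...
        }
    }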
@@ -3186,6 +3199,7 @@ static bool llama_kv_cache_init(
             ggml_format_name(kvt, "cache_kvt_l%d", i);
             cache.kv_l.push_back(kv);
             cache.kvt_l.push_back(kvt);
+            n_mla++;
         }
         else {
             k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
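Only the MLA branch increments n_mla; a layer whose wk_b / wv_b tensors are absent falls through to the else branch (continued in the next hunk) and gets the conventional K and V cache tensors. A rough sketch of that per-layer selection, with the element counts passed in as assumptions since the diff does not show how the MLA cache width is derived from hparams:

    #include <cstdint>
    #include <vector>

    enum class cache_kind { mla, standard };

    struct layer_cache_plan {
        cache_kind kind;        // which cache layout this layer uses
        int64_t    n_elements;  // how many elements to allocate for it
    };

    // has_mla[i] - whether layer i carries the decomposed attention tensors
    // mla_elems  - per-layer size of the compact MLA cache (assumed, not from the diff)
    // kv_elems   - per-layer size of a standard K or V cache (n_embd_*_gqa * kv_size in the diff)
    static std::vector<layer_cache_plan> plan_kv_cache(
            bool mla_requested, const std::vector<bool> & has_mla,
            int64_t mla_elems, int64_t kv_elems, int & n_mla) {
        std::vector<layer_cache_plan> plan;
        plan.reserve(has_mla.size());
        n_mla = 0;
        for (size_t il = 0; il < has_mla.size(); ++il) {
            if (mla_requested && has_mla[il]) {
                plan.push_back({cache_kind::mla, mla_elems});       // kv + kvt path in the diff
                ++n_mla;
            } else {
                plan.push_back({cache_kind::standard, kv_elems});   // plain K/V path
            }
        }
        return plan;
    }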
@@ -3196,6 +3210,11 @@ static bool llama_kv_cache_init(
             cache.v_l.push_back(v);
         }
     }
+    if (cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
+        LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer));
+        LLAMA_LOG_ERROR("%s: bailing out\n", __func__);
+        GGML_ABORT("fatal error");
+    }
 
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto it : ctx_map) {
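The post-loop check enforces an all-or-nothing invariant: with mla_attn requested, either every layer ended up on the MLA path (n_mla == n_layer) or none did (n_mla == 0, the warned fallback case); any value in between points at an inconsistent conversion, and the code aborts rather than run with a mixed cache. The same invariant as a self-contained sketch, with std::abort standing in for GGML_ABORT("fatal error"):

    #include <cstdio>
    #include <cstdlib>

    // All-or-nothing check over the per-layer MLA count.
    static void check_mla_count(bool mla_requested, int n_mla, int n_layer) {
        if (mla_requested && n_mla > 0 && n_mla < n_layer) {
            std::fprintf(stderr, "unexpected situation with %d out of %d layers having MLA enabled\n", n_mla, n_layer);
            std::fprintf(stderr, "bailing out\n");
            std::abort();   // the real code calls GGML_ABORT("fatal error")
        }
    }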