mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-30 11:21:56 +00:00
Warn user when disabling MLA
This commit is contained in:
@@ -3168,6 +3168,8 @@ static bool llama_kv_cache_init(
|
|||||||
cache.kv_l.reserve(n_layer);
|
cache.kv_l.reserve(n_layer);
|
||||||
cache.kvt_l.reserve(n_layer);
|
cache.kvt_l.reserve(n_layer);
|
||||||
|
|
||||||
|
bool warn = true;
|
||||||
|
int n_mla = 0;
|
||||||
for (int i = 0; i < (int) n_layer; i++) {
|
for (int i = 0; i < (int) n_layer; i++) {
|
||||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
||||||
@@ -3175,6 +3177,17 @@ static bool llama_kv_cache_init(
|
|||||||
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||||
ggml_tensor * k;
|
ggml_tensor * k;
|
||||||
ggml_tensor * v;
|
ggml_tensor * v;
|
||||||
|
if (cparams.mla_attn) {
|
||||||
|
if (!model.layers[i].wk_b || !model.layers[i].wv_b) {
|
||||||
|
if (warn) {
|
||||||
|
LLAMA_LOG_WARN("=======================================================================================\n");
|
||||||
|
LLAMA_LOG_WARN("%s: missing MLA tensors => disabling MLA\n", __func__);
|
||||||
|
LLAMA_LOG_WARN("%s: you need to reconvert your model in order to use MLA\n", __func__);
|
||||||
|
LLAMA_LOG_WARN("=======================================================================================\n");
|
||||||
|
warn = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if (cparams.mla_attn && model.layers[i].wk_b && model.layers[i].wv_b) {
|
if (cparams.mla_attn && model.layers[i].wk_b && model.layers[i].wv_b) {
|
||||||
// DeepSeek MLA
|
// DeepSeek MLA
|
||||||
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
||||||
@@ -3186,6 +3199,7 @@ static bool llama_kv_cache_init(
|
|||||||
ggml_format_name(kvt, "cache_kvt_l%d", i);
|
ggml_format_name(kvt, "cache_kvt_l%d", i);
|
||||||
cache.kv_l.push_back(kv);
|
cache.kv_l.push_back(kv);
|
||||||
cache.kvt_l.push_back(kvt);
|
cache.kvt_l.push_back(kvt);
|
||||||
|
n_mla++;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||||
@@ -3196,6 +3210,11 @@ static bool llama_kv_cache_init(
|
|||||||
cache.v_l.push_back(v);
|
cache.v_l.push_back(v);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
|
||||||
|
LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer));
|
||||||
|
LLAMA_LOG_ERROR("%s: bailing out\n", __func__);
|
||||||
|
GGML_ABORT("fatal error");
|
||||||
|
}
|
||||||
|
|
||||||
// allocate tensors and initialize the buffers to avoid NaNs in the padding
|
// allocate tensors and initialize the buffers to avoid NaNs in the padding
|
||||||
for (auto it : ctx_map) {
|
for (auto it : ctx_map) {
|
||||||
|
|||||||
Reference in New Issue
Block a user