Support GigaChat3 (#995)

* Fixing Gigachat support
* Gigachat: CUDA FA (needs 192 x 192 for MLA = 3)
* Gigachat: CPU FA (needs 192 x 192 for MLA = 3)

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-04-29 10:51:51 +00:00 · 2025-11-24 06:55:14 +01:00
parent f6163dd58f
commit 920f424929
11 changed files with 103 additions and 4 deletions
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -142,6 +142,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
    { LLM_KV_ATTENTION_OUTPUT_SCALE,           "%s.attention.output_scale"           },
    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH,     "%s.attention.temperature_length"     },
+    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,         "%s.attention.key_length_mla"         },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,       "%s.attention.value_length_mla"       },

    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
    { LLM_KV_ROPE_DIMENSION_SECTIONS,       "%s.rope.dimension_sections"              },
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -135,6 +135,8 @@ enum llm_kv {
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_OUTPUT_SCALE,
    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
+    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -5931,7 +5931,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
    // mutable variable, needed during the last layer of the computation to skip unused tokens
    int32_t n_tokens = this->n_tokens;

-    bool is_lite = (hparams.n_layer == 27);
+    bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);

    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -762,8 +762,10 @@ void llm_load_hparams(
                    for (auto& item : hparams.n_head_kv_arr) item = n_nead_kv;
                    hparams.n_embd_head_k = 192;
                    hparams.n_embd_head_v = 128;
+                    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,   hparams.n_embd_head_k);
+                    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v);
                }
-                bool is_lite = (hparams.n_layer == 27);
+                bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                if (!is_lite) {
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -1617,7 +1617,7 @@ bool create_tensors_helper::create_arctix_tensors(const LLM_TN & tn) {
 bool create_tensors_helper::create_deepseek2_tensors(const LLM_TN & tn) {
    LOADING_PRELUDE

-    const bool is_lite = (hparams.n_layer == 27);
+    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);

    const int64_t n_embd_head_qk_rope = hparams.n_rot;
    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;