diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6402491f..91ce5af9 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -72,6 +72,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_MIMO2, "mimo2" }, { LLM_ARCH_SEED_OSS, "seed_oss" }, { LLM_ARCH_STEP35, "step35" }, + { LLM_ARCH_GLM_DSA, "glm-dsa" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -154,6 +155,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, + { LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" }, + { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" }, + { LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,"%s.rope.dimension_count_per_layer" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 9dd8df26..97915945 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -71,6 +71,7 @@ enum llm_arch { LLM_ARCH_MIMO2, LLM_ARCH_SEED_OSS, LLM_ARCH_STEP35, + LLM_ARCH_GLM_DSA, LLM_ARCH_UNKNOWN, }; @@ -147,6 +148,10 @@ enum llm_kv { LLM_KV_ATTENTION_TEMPERATURE_SCALE, LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, + LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, + LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, + LLM_KV_ATTENTION_INDEXER_TOP_K, + LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, @@ -320,6 +325,10 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_INDEXER_K_NORM, + LLM_TENSOR_INDEXER_PROJ, + LLM_TENSOR_INDEXER_ATTN_K, + LLM_TENSOR_INDEXER_ATTN_Q_B, }; llm_arch llm_arch_from_string(const std::string & name); diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index eb6d3894..68e5a17a 100644 --- 
a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -6496,7 +6496,8 @@ ggml_cgraph * llm_build_context::build_deepseek2() { ggml_rope_cache(ctx0, inp_pos, nullptr, n_rot, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) : nullptr; - for (int il = 0; il < n_layer; ++il) { + int n_active_layers = hparams.n_layer - hparams.nextn_predict_layers; + for (int il = 0; il < n_active_layers; ++il) { struct ggml_tensor * inpSA = inpL; // norm @@ -6919,7 +6920,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() { } } - if (il == n_layer - 1) { + if (il == n_active_layers - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); n_tokens = n_outputs; @@ -9361,6 +9362,7 @@ ggml_cgraph * llm_build_context::llama_build_graph( result = llm.build_arctic(); } break; case LLM_ARCH_DEEPSEEK2: + case LLM_ARCH_GLM_DSA: { result = llm.build_deepseek2(); } break; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 8fe793e9..bbdd83a1 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -761,7 +761,7 @@ void llm_load_hparams( if (n_nead_kv%4 != 0 || hparams.n_embd_head_k != 576 || hparams.n_embd_head_v != 512 || hparams.n_rot != 64) { printf("==========================================================================\n"); - printf("Detected incompatible DeepSeek model without a known way to fixc it.\n"); + printf("Detected incompatible DeepSeek model without a known way to fix it.\n"); printf("Consider making your own ik_llama.cpp compatible model or\n"); printf("ask the model provider to make one for you,\n\n"); printf("Sorry, uknown model => cannot fix it => bailing out\n"); @@ -1157,6 +1157,67 @@ void llm_load_hparams( hparams.rope_freq_base_per_layer, hparams.n_layer, false); GGML_ASSERT(hparams.has_rope_freq_base_per_layer || have_rfb_train_swa); } break; + case LLM_ARCH_GLM_DSA: + { + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, 
hparams.n_ff_exp); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); + + // MoE parameters + ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert); + ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); + + // deepseek MLA parameters + ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + //ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false); + //ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + + // DSA parameters + ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head); + ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size); + ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); + + // Expert gating function (GLM-4.5 uses sigmoid) + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) { + hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID; + } + + // NextN/MTP parameters + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + + // TODO: when MTP is implemented, this should probably be updated if needed + hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + + switch (hparams.n_layer) { + case 79: model.type = MODEL_744B_A40B; break; + default: model.type = MODEL_UNKNOWN; + } + if 
(hparams.n_head_kv() == 1) { + int n_nead_kv = hparams.n_gqa(); + if (n_nead_kv%4 != 0 || hparams.n_embd_head_k != 576 || hparams.n_embd_head_v != 512 || + hparams.n_rot != 64) { + printf("==========================================================================\n"); + printf("Detected incompatible DeepSeek model without a known way to fix it.\n"); + printf("Sorry, unknown model => cannot fix it => bailing out\n"); + printf("==========================================================================\n"); + GGML_ABORT("Fatal error"); + } + printf("================= Adjusted mainline llama.cpp MLA tensors to ik_llama.cpp\n"); + for (auto& item : hparams.n_head_kv_arr) item = n_nead_kv; + hparams.n_embd_head_k = 192; + hparams.n_embd_head_v = 128; + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v); + } + } break; default: (void)0; } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index e2a1007b..0dd22303 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -115,6 +115,11 @@ struct llama_hparams { uint32_t n_attn_temp_floor_scale = 8192; float f_attn_temp_scale = 0.1; + // DSA (deepseek sparse attention) + uint32_t indexer_n_head = 0; + uint32_t indexer_head_size = 0; + uint32_t indexer_top_k = 0; + // qwen3vl deepstack uint32_t n_deepstack_layers = 0; diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 89cbee91..e2baacbb 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -105,6 +105,8 @@ struct create_tensors_helper : public create_tensors_helper_interface { bool create_deepseek2_tensors(const LLM_TN & tn); + bool create_glm_dsa_tensors(const LLM_TN & tn); + bool create_glm4_tensors(const LLM_TN & tn); bool create_glm4_moe_tensors(const LLM_TN & tn); @@ -1975,6 +1977,120 @@ bool create_tensors_helper::create_deepseek2_tensors(const LLM_TN & tn) { return use_mmap_buffer; } +bool
create_tensors_helper::create_glm_dsa_tensors(const LLM_TN & tn) { + LOADING_PRELUDE + + const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + + const int64_t n_ff_exp = hparams.n_ff_exp; + const int64_t n_expert_shared = hparams.n_expert_shared; + + model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + int flags = 0; + if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + flags |= llama_model_loader::TENSOR_SKIP | llama_model_loader::TENSOR_NOT_REQUIRED; + } + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); + layer.attn_q_a_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags); + + layer.attn_kv_a_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags); + + bool merged = false; + if (ml.merge_qkv) { + auto q_name = tn(LLM_TENSOR_ATTN_Q_A, "weight", i); + auto k_name = tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i); + auto wq = ml.require_tensor_meta(q_name.c_str()); + auto wk = ml.require_tensor_meta(k_name.c_str()); + GGML_ASSERT(wq && wk); + if (wq->type == wk->type) { + GGML_ASSERT(wq->ne[0] == wk->ne[0]); + layer.wkq_a_mqa = ggml_new_tensor_2d(ctx_split, wq->type, wq->ne[0], wq->ne[1] + wk->ne[1]); + snprintf(layer.wkq_a_mqa->name, GGML_MAX_NAME, "blk.%d.attn_qk_a_mqa.weight", 
i); + layer.wq_a = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, q_name.c_str(), { wq->ne[0], wq->ne[1] }, 0, flags); + layer.wkv_a_mqa = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, k_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1], flags); + merged = true; + use_mmap_buffer = false; + LLAMA_LOG_DEBUG("============== Merged %s (%ld x %ld) and %s (%ld x %ld)\n", q_name.c_str(), + wq->ne[0], wq->ne[1], k_name.c_str(), wk->ne[0], wk->ne[1]); + } + } + + if (!merged) { + layer.wq_a = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags); + } + layer.wq_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, flags); + + if (!merged) { + layer.wkv_a_mqa = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i),{n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, flags); + } + + // Incompatible mainline model. Let's see if we can still load it + layer.wk_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0); + layer.wv_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v, n_head}, 0); + layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v, n_embd}, flags); + + // DSA indexer + layer.indexer_k_norm = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags); + layer.indexer_k_norm_b = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags); + layer.indexer_proj = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags); + layer.indexer_attn_k = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags); + layer.indexer_attn_q_b = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), 
{q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags); + + layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); + + if (i < (int) hparams.n_layer_dense_lead) { + layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); + layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags); + layer.ffn_up = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); + } else { + layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags); + layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, flags); + + GGML_ASSERT(n_expert > 0); + GGML_ASSERT(n_expert_used > 0); + + // MoE branch + layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); + layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags); + layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); + + // Shared expert branch + layer.ffn_gate_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); + layer.ffn_down_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, flags); + layer.ffn_up_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); + } + + if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + layer.nextn.eh_proj = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); + layer.nextn.enorm = create_tensor(ctx_split, 
tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); + layer.nextn.hnorm = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); + + // Optional tensors + layer.nextn.embed_tokens = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | llama_model_loader::TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | llama_model_loader::TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | llama_model_loader::TENSOR_NOT_REQUIRED); + } + } + return use_mmap_buffer; +} + bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) { LOADING_PRELUDE @@ -3143,6 +3259,8 @@ bool create_tensors_helper::create_tensors() { use_mmap_buffer = create_arctix_tensors(tn); break; case LLM_ARCH_DEEPSEEK2: use_mmap_buffer = create_deepseek2_tensors(tn); break; + case LLM_ARCH_GLM_DSA: + use_mmap_buffer = create_glm_dsa_tensors(tn); break; case LLM_ARCH_GLM4_MOE: use_mmap_buffer = create_glm4_moe_tensors(tn); break; case LLM_ARCH_BITNET: diff --git a/src/llama-model.cpp b/src/llama-model.cpp index cd95dac1..fa66c1e3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1365,6 +1365,50 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, }, }, + { + LLM_ARCH_GLM_DSA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, + { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, + { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, + { 
LLM_TENSOR_ATTN_KQ_A_MQA, "blk.%d.attn_kq_a_mqa" }, + { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, + { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + { LLM_TENSOR_INDEXER_K_NORM, "blk.%d.indexer.k_norm" }, + { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" }, + { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" }, + { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, + { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, + { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" }, + { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" }, + { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" }, + { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" }, + { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, + + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1612,6 +1656,7 @@ const char * llama_model_type_name(e_model type) { case MODEL_310B_A15B: return "310B.A15B"; case MODEL_300B_A47B: return "300B.A47B"; case MODEL_355B_A32B: return "355B.A32B"; + case MODEL_744B_A40B: return "744B.A40B"; case MODEL_E2B: return "E2B"; case MODEL_E4B: return "E4B"; default: return "?B"; diff --git a/src/llama-model.h b/src/llama-model.h index 7c55645c..702fcae0 100644 --- 
a/src/llama-model.h +++ b/src/llama-model.h @@ -115,6 +115,7 @@ enum e_model { MODEL_310B_A15B, MODEL_300B_A47B, // Ernie MoE big MODEL_355B_A32B, + MODEL_744B_A40B, MODEL_E2B, MODEL_E4B, }; @@ -298,6 +299,13 @@ struct llama_layer { struct ggml_tensor * ssm_conv1d_b = nullptr; struct ggml_tensor * ssm_dt_b = nullptr; + // DSA (deepseek sparse attention) + struct ggml_tensor * indexer_k_norm = nullptr; + struct ggml_tensor * indexer_k_norm_b = nullptr; + struct ggml_tensor * indexer_proj = nullptr; + struct ggml_tensor * indexer_attn_k = nullptr; + struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias + // long rope factors struct ggml_tensor * rope_long = nullptr; struct ggml_tensor * rope_short = nullptr; diff --git a/src/llama.cpp b/src/llama.cpp index be46f5d1..c1314c13 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -677,8 +677,10 @@ static bool llama_kv_cache_init( } } + bool is_mla_attn = model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_GLM_DSA; + bool split_cache = false; - if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.arch != LLM_ARCH_DEEPSEEK2 && offload) { + if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && !is_mla_attn && offload) { cache.split_k_l.reserve(n_layer); cache.split_v_l.reserve(n_layer); split_cache = true; @@ -718,7 +720,7 @@ static bool llama_kv_cache_init( cache.ctxs.push_back(ctx); } - if (model.arch == LLM_ARCH_DEEPSEEK2) { + if (is_mla_attn) { bool have_wkv_b = true; for (auto& l : model.layers) { if (!l.wkv_b) { @@ -744,7 +746,7 @@ static bool llama_kv_cache_init( bool needs_v_cache = true; cache.k_l.reserve(n_layer); - if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) { + if (is_mla_attn && cparams.mla_attn) { needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn; } if (needs_v_cache) cache.v_l.reserve(n_layer); @@ -760,7 +762,7 @@ static bool llama_kv_cache_init( struct 
ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); ggml_tensor * k; ggml_tensor * v; - if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) { + if (is_mla_attn && cparams.mla_attn) { // DeepSeek MLA const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; @@ -841,7 +843,7 @@ static bool llama_kv_cache_init( } } } - if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) { + if (is_mla_attn && cparams.mla_attn && n_mla < n_layer && n_mla > 0) { LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer)); LLAMA_LOG_ERROR("%s: bailing out\n", __func__); GGML_ABORT("fatal error"); @@ -1379,7 +1381,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { // general kv LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); - if (model.arch == LLM_ARCH_DEEPSEEK2) { + if (model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_GLM_DSA) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); @@ -1424,7 +1426,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { } static void llm_prepare_mla(llama_model & model, int mla) { - if (model.arch != LLM_ARCH_DEEPSEEK2) return; + if (model.arch != LLM_ARCH_DEEPSEEK2 && model.arch != LLM_ARCH_GLM_DSA) return; const auto& hparams = model.hparams; const int n_layer = model.layers.size(); int n_to_compute = 0; @@ -2048,7 +2050,7 @@ static bool llm_load_tensors( } } - if (model.arch == LLM_ARCH_DEEPSEEK2) { + if (model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_GLM_DSA) { llm_prepare_mla(model, mla_attn); } @@ -3735,7 +3737,7 @@ static int32_t 
llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { - if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA + if (lctx.model.arch == LLM_ARCH_DEEPSEEK2 || lctx.model.arch == LLM_ARCH_GLM_DSA) { // not supported due to MLA return 1; } @@ -4542,20 +4544,10 @@ struct llama_context * llama_init_from_model( params.seed = time(NULL); } - if (model->arch != LLM_ARCH_DEEPSEEK2 && cparams.mla_attn != 0) { - //LLAMA_LOG_WARN("=====================================================================\n"); - //LLAMA_LOG_WARN(" MLA is only available for LLM_ARCH_DEEPSEEK2 -> turning off MLA\n"); - //LLAMA_LOG_WARN("=====================================================================\n"); + if (model->arch != LLM_ARCH_DEEPSEEK2 && model->arch != LLM_ARCH_GLM_DSA && cparams.mla_attn != 0) { cparams.mla_attn = 0; } if (model->arch == LLM_ARCH_OPENAI_MOE && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) { - //if (cparams.split_mode_f16) { - // LLAMA_LOG_WARN("=====================================================================\n"); - // LLAMA_LOG_WARN("GPT-OSS with split mode graph requires f32 precision\n"); - // LLAMA_LOG_WARN(" => changing cparams.split_mode_f16 to 'false'\n"); - // LLAMA_LOG_WARN("=====================================================================\n"); - // cparams.split_mode_f16 = false; - //} if (cparams.reduce_type == GGML_TYPE_F16) { LLAMA_LOG_WARN("=====================================================================\n"); LLAMA_LOG_WARN("GPT-OSS with split mode graph requires f32 precision\n"); @@ -4569,7 +4561,7 @@ struct llama_context * llama_init_from_model( LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); - if (model->arch == LLM_ARCH_DEEPSEEK2) { + if (model->arch 
== LLM_ARCH_DEEPSEEK2 || model->arch == LLM_ARCH_GLM_DSA) { LLAMA_LOG_INFO("%s: mla_attn = %d\n", __func__, cparams.mla_attn); } LLAMA_LOG_INFO("%s: attn_max_b = %d\n", __func__, cparams.attn_max_batch); @@ -5020,6 +5012,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_SMOLLM3: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_GLM_DSA: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2