GLM-5 support (#1268)

@@ -72,6 +72,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MIMO2,    "mimo2"     },
     { LLM_ARCH_SEED_OSS, "seed_oss"  },
     { LLM_ARCH_STEP35,   "step35"    },
+    { LLM_ARCH_GLM_DSA,  "glm-dsa"   },
     { LLM_ARCH_UNKNOWN,  "(unknown)" },
 };

@@ -154,6 +155,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_TEMPERATURE_SCALE,   "%s.attention.temperature_scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA,      "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,    "%s.attention.value_length_mla" },
+    { LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,  "%s.attention.indexer.head_count" },
+    { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,  "%s.attention.indexer.key_length" },
+    { LLM_KV_ATTENTION_INDEXER_TOP_K,       "%s.attention.indexer.top_k" },
+
     { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,"%s.rope.dimension_count_per_layer" },

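A note on the "%s" placeholders above: each key template is expanded with the architecture name before the key is looked up in the GGUF metadata, so for this new architecture the indexer keys resolve to "glm-dsa.attention.indexer.head_count", ".key_length" and ".top_k". A minimal sketch of that expansion (the helper name is illustrative, not the function in this commit):

    #include <cstdio>
    #include <string>

    // Expand an LLM_KV_NAMES template with the architecture name.
    static std::string llm_kv_name(const char * tmpl, const char * arch) {
        char buf[256];
        std::snprintf(buf, sizeof(buf), tmpl, arch);
        return buf;
    }

    // llm_kv_name("%s.attention.indexer.top_k", "glm-dsa")
    //   -> "glm-dsa.attention.indexer.top_k"
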
@@ -71,6 +71,7 @@ enum llm_arch {
     LLM_ARCH_MIMO2,
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_STEP35,
+    LLM_ARCH_GLM_DSA,
     LLM_ARCH_UNKNOWN,
 };

@@ -147,6 +148,10 @@ enum llm_kv {
     LLM_KV_ATTENTION_TEMPERATURE_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+    LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
+    LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
+    LLM_KV_ATTENTION_INDEXER_TOP_K,
+
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,

@@ -320,6 +325,10 @@ enum llm_tensor {
     LLM_TENSOR_NEXTN_HNORM,
     LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
     LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+    LLM_TENSOR_INDEXER_K_NORM,
+    LLM_TENSOR_INDEXER_PROJ,
+    LLM_TENSOR_INDEXER_ATTN_K,
+    LLM_TENSOR_INDEXER_ATTN_Q_B,
 };

 llm_arch llm_arch_from_string(const std::string & name);

@@ -6496,7 +6496,8 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
         ggml_rope_cache(ctx0, inp_pos, nullptr, n_rot, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow) : nullptr;

-    for (int il = 0; il < n_layer; ++il) {
+    int n_active_layers = hparams.n_layer - hparams.nextn_predict_layers;
+    for (int il = 0; il < n_active_layers; ++il) {
         struct ggml_tensor * inpSA = inpL;

         // norm

@@ -6919,7 +6920,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
             }
         }

-        if (il == n_layer - 1) {
+        if (il == n_active_layers - 1) {
             // skip computing output for unused tokens
             struct ggml_tensor * inp_out_ids = build_inp_out_ids();
             n_tokens = n_outputs;

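Both loop-bound changes above serve the same purpose: the trailing NextN/MTP (multi-token prediction) layers carry their own tensors but are not executed in the main decode graph, so the layer loop and the last-layer check both stop at n_active_layers. A sketch of the arithmetic, with hypothetical values for illustration only (a 79-layer model with one MTP layer, matching the MODEL_744B_A40B case further down):

    const int n_layer              = 79; // total layers in the GGUF (hypothetical)
    const int nextn_predict_layers = 1;  // trailing MTP layer(s) (hypothetical)

    // Only the leading layers run in the main graph; MTP layers are skipped.
    const int n_active_layers = n_layer - nextn_predict_layers; // 78
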
@@ -9361,6 +9362,7 @@ ggml_cgraph * llm_build_context::llama_build_graph(
             result = llm.build_arctic();
         } break;
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_GLM_DSA:
         {
             result = llm.build_deepseek2();
         } break;

@@ -761,7 +761,7 @@ void llm_load_hparams(
         if (n_nead_kv%4 != 0 || hparams.n_embd_head_k != 576 || hparams.n_embd_head_v != 512 ||
             hparams.n_rot != 64) {
             printf("==========================================================================\n");
-            printf("Detected incompatible DeepSeek model without a known way to fixc it.\n");
+            printf("Detected incompatible DeepSeek model without a known way to fix it.\n");
             printf("Consider making your own ik_llama.cpp compatible model or\n");
             printf("ask the model provider to make one for you,\n\n");
             printf("Sorry, uknown model => cannot fix it => bailing out\n");

@@ -1157,6 +1157,67 @@ void llm_load_hparams(
                     hparams.rope_freq_base_per_layer, hparams.n_layer, false);
                 GGML_ASSERT(hparams.has_rope_freq_base_per_layer || have_rfb_train_swa);
             } break;
+        case LLM_ARCH_GLM_DSA:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+                // MoE parameters
+                ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+                ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+                // deepseek MLA parameters
+                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+                //ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
+                //ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+
+                // DSA parameters
+                ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
+                ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
+                ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);
+
+                // Expert gating function (GLM-4.5 uses sigmoid)
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID;
+                }
+
+                // NextN/MTP parameters
+                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+                // TODO: when MTP is implemented, this should probably be updated if needed
+                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+                switch (hparams.n_layer) {
+                    case 79: model.type = MODEL_744B_A40B; break;
+                    default: model.type = MODEL_UNKNOWN;
+                }
+                if (hparams.n_head_kv() == 1) {
+                    int n_nead_kv = hparams.n_gqa();
+                    if (n_nead_kv%4 != 0 || hparams.n_embd_head_k != 576 || hparams.n_embd_head_v != 512 ||
+                        hparams.n_rot != 64) {
+                        printf("==========================================================================\n");
+                        printf("Detected incompatible DeepSeek model without a known way to fix it.\n");
+                        printf("Sorry, uknown model => cannot fix it => bailing out\n");
+                        printf("==========================================================================\n");
+                        GGML_ABORT("Fatal error");
+                    }
+                    printf("================= Adjusted mainline llama.cpp MLA tensors to ik_llama.cpp\n");
+                    for (auto& item : hparams.n_head_kv_arr) item = n_nead_kv;
+                    hparams.n_embd_head_k = 192;
+                    hparams.n_embd_head_v = 128;
+                    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k);
+                    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v);
+                }
+            } break;
         default: (void)0;
     }

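The compatibility block at the end of this case mirrors the existing DeepSeek2 handling: mainline llama.cpp GGUFs store MLA attention as a single fused KV head of width 576 (the kv_lora_rank of 512 plus 64 RoPE dims) with a 512-wide value head, while ik_llama.cpp expects per-head dimensions of 192 for K (128 "nope" plus 64 rope) and 128 for V spread across n_gqa() KV heads. A standalone sketch of that adjustment under those assumptions (names illustrative, not the code in this commit):

    // Returns false for layouts the loader does not know how to repack.
    static bool adjust_mainline_mla(int n_gqa, int n_rot,
                                    int & n_embd_head_k, int & n_embd_head_v) {
        // mainline MLA: one fused KV head, 576 = kv_lora_rank (512) + n_rot (64)
        if (n_gqa % 4 != 0 || n_embd_head_k != 576 || n_embd_head_v != 512 || n_rot != 64) {
            return false; // unknown layout => cannot fix it => bail out
        }
        n_embd_head_k = 192; // per-head K: 128 "nope" + 64 rope dims
        n_embd_head_v = 128; // per-head V
        return true;
    }
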
@@ -115,6 +115,11 @@ struct llama_hparams {
     uint32_t n_attn_temp_floor_scale = 8192;
     float    f_attn_temp_scale = 0.1;

+    // DSA (deepseek sparse attention)
+    uint32_t indexer_n_head    = 0;
+    uint32_t indexer_head_size = 0;
+    uint32_t indexer_top_k     = 0;
+
     // qwen3vl deepstack
     uint32_t n_deepstack_layers = 0;

@@ -105,6 +105,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {

     bool create_deepseek2_tensors(const LLM_TN & tn);

+    bool create_glm_dsa_tensors(const LLM_TN & tn);
+
     bool create_glm4_tensors(const LLM_TN & tn);

     bool create_glm4_moe_tensors(const LLM_TN & tn);

@@ -1975,6 +1977,120 @@ bool create_tensors_helper::create_deepseek2_tensors(const LLM_TN & tn) {
     return use_mmap_buffer;
 }

+bool create_tensors_helper::create_glm_dsa_tensors(const LLM_TN & tn) {
+    LOADING_PRELUDE
+
+    const int64_t n_embd_head_qk_rope = hparams.n_rot;
+    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+
+    const int64_t q_lora_rank  = hparams.n_lora_q;
+    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+    // output
+    {
+        model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+        model.output      = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+    }
+
+    for (int i = 0; i < n_layer; ++i) {
+        int flags = 0;
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            flags |= llama_model_loader::TENSOR_SKIP | llama_model_loader::TENSOR_NOT_REQUIRED;
+        }
+        ggml_context * ctx_layer = ctx_for_layer(i);
+        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+        auto & layer = model.layers[i];
+
+        layer.attn_norm     = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM,     "weight", i), {n_embd}, flags);
+        layer.attn_q_a_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, flags);
+
+        layer.attn_kv_a_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
+
+        bool merged = false;
+        if (ml.merge_qkv) {
+            auto q_name = tn(LLM_TENSOR_ATTN_Q_A,      "weight", i);
+            auto k_name = tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i);
+            auto wq = ml.require_tensor_meta(q_name.c_str());
+            auto wk = ml.require_tensor_meta(k_name.c_str());
+            GGML_ASSERT(wq && wk);
+            if (wq->type == wk->type) {
+                GGML_ASSERT(wq->ne[0] == wk->ne[0]);
+                layer.wkq_a_mqa = ggml_new_tensor_2d(ctx_split, wq->type, wq->ne[0], wq->ne[1] + wk->ne[1]);
+                snprintf(layer.wkq_a_mqa->name, GGML_MAX_NAME, "blk.%d.attn_qk_a_mqa.weight", i);
+                layer.wq_a = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, q_name.c_str(), { wq->ne[0], wq->ne[1] }, 0, flags);
+                layer.wkv_a_mqa = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, k_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1], flags);
+                merged = true;
+                use_mmap_buffer = false;
+                LLAMA_LOG_DEBUG("============== Merged %s (%ld x %ld) and %s (%ld x %ld)\n", q_name.c_str(),
+                        wq->ne[0], wq->ne[1], k_name.c_str(), wk->ne[0], wk->ne[1]);
+            }
+        }

+        if (!merged) {
+            layer.wq_a = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
+        }
+        layer.wq_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, flags);
+
+        if (!merged) {
+            layer.wkv_a_mqa = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, flags);
+        }
+
+        // Incompatible mainline model. Let's see if we can still load it
+        layer.wk_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+        layer.wv_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v, n_head}, 0);
+        layer.wo   = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v, n_embd}, flags);
+
+        // DSA indexer
+        layer.indexer_k_norm   = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
+        layer.indexer_k_norm_b = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
+        layer.indexer_proj     = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
+        layer.indexer_attn_k   = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
+        layer.indexer_attn_q_b = create_tensor(ctx_split, tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
+
+        layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+        if (i < (int) hparams.n_layer_dense_lead) {
+            layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+            layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+            layer.ffn_up   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
+        } else {
+            layer.ffn_gate_inp    = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,    "weight", i), {n_embd, n_expert}, flags);
+            layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias",   i), {n_expert}, flags);
+
+            GGML_ASSERT(n_expert      > 0);
+            GGML_ASSERT(n_expert_used > 0);
+
+            // MoE branch
+            layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+            layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+            layer.ffn_up_exps   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), { n_embd, n_ff_exp, n_expert}, flags);
+
+            // Shared expert branch
+            layer.ffn_gate_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+            layer.ffn_down_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, flags);
+            layer.ffn_up_shexp   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+        }
+
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            layer.nextn.eh_proj = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+            layer.nextn.enorm   = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), { n_embd }, flags);
+            layer.nextn.hnorm   = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), { n_embd }, flags);
+
+            // Optional tensors
+            layer.nextn.embed_tokens     = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), { n_embd, n_vocab }, flags | llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_head = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.nextn.shared_head_norm = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | llama_model_loader::TENSOR_NOT_REQUIRED);
+        }
+    }
+    return use_mmap_buffer;
+}
+
 bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
     LOADING_PRELUDE

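The merge_qkv branch above fuses the two attention down-projections, attn_q_a and attn_kv_a_mqa, into a single 2D allocation and registers the two file tensors as views into it, so both projections can later be computed with one matmul against the fused weight. A rough sketch of the same trick in plain ggml, assuming both weights share type and row length (create_tensor_as_view is the loader-internal equivalent of the views below):

    #include "ggml.h"

    // Fuse two row-compatible matrices into one allocation and expose views.
    static void fuse_rows(ggml_context * ctx, ggml_tensor * wq, ggml_tensor * wk,
                          ggml_tensor ** fused, ggml_tensor ** q_view, ggml_tensor ** k_view) {
        GGML_ASSERT(wq->type == wk->type && wq->ne[0] == wk->ne[0]);
        *fused = ggml_new_tensor_2d(ctx, wq->type, wq->ne[0], wq->ne[1] + wk->ne[1]);
        // wq occupies the first wq->ne[1] rows, wk the remainder (offset in bytes).
        *q_view = ggml_view_2d(ctx, *fused, wq->ne[0], wq->ne[1], (*fused)->nb[1], 0);
        *k_view = ggml_view_2d(ctx, *fused, wk->ne[0], wk->ne[1], (*fused)->nb[1],
                               wq->ne[1]*(*fused)->nb[1]);
    }

Note that merging disables the mmap buffer (use_mmap_buffer = false), since the fused tensor no longer maps one-to-one onto the file layout.
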
@@ -3143,6 +3259,8 @@ bool create_tensors_helper::create_tensors() {
             use_mmap_buffer = create_arctix_tensors(tn); break;
         case LLM_ARCH_DEEPSEEK2:
             use_mmap_buffer = create_deepseek2_tensors(tn); break;
+        case LLM_ARCH_GLM_DSA:
+            use_mmap_buffer = create_glm_dsa_tensors(tn); break;
         case LLM_ARCH_GLM4_MOE:
             use_mmap_buffer = create_glm4_moe_tensors(tn); break;
         case LLM_ARCH_BITNET:

@@ -1365,6 +1365,50 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_EXP_PROBS_B,        "blk.%d.exp_probs_b" },
         },
     },
+    {
+        LLM_ARCH_GLM_DSA,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,             "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,            "output_norm" },
+            { LLM_TENSOR_OUTPUT,                 "output" },
+            { LLM_TENSOR_ATTN_NORM,              "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_A_NORM,          "blk.%d.attn_q_a_norm" },
+            { LLM_TENSOR_ATTN_KV_A_NORM,         "blk.%d.attn_kv_a_norm" },
+            { LLM_TENSOR_ATTN_Q,                 "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_A,               "blk.%d.attn_q_a" },
+            { LLM_TENSOR_ATTN_Q_B,               "blk.%d.attn_q_b" },
+            { LLM_TENSOR_ATTN_KV_A_MQA,          "blk.%d.attn_kv_a_mqa" },
+            { LLM_TENSOR_ATTN_KQ_A_MQA,          "blk.%d.attn_kq_a_mqa" },
+            { LLM_TENSOR_ATTN_KV_B,              "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_K_B,               "blk.%d.attn_k_b" },
+            { LLM_TENSOR_ATTN_V_B,               "blk.%d.attn_v_b" },
+            { LLM_TENSOR_ATTN_OUT,               "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,               "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,               "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,                 "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,               "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE_INP,           "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,          "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,          "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,            "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP,     "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,         "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,         "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,           "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,        "blk.%d.exp_probs_b" },
+            { LLM_TENSOR_INDEXER_K_NORM,         "blk.%d.indexer.k_norm" },
+            { LLM_TENSOR_INDEXER_PROJ,           "blk.%d.indexer.proj" },
+            { LLM_TENSOR_INDEXER_ATTN_K,         "blk.%d.indexer.attn_k" },
+            { LLM_TENSOR_INDEXER_ATTN_Q_B,       "blk.%d.indexer.attn_q_b" },
+            { LLM_TENSOR_NEXTN_EH_PROJ,          "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS,     "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM,            "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM,            "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {

@@ -1612,6 +1656,7 @@ const char * llama_model_type_name(e_model type) {
         case MODEL_310B_A15B: return "310B.A15B";
         case MODEL_300B_A47B: return "300B.A47B";
         case MODEL_355B_A32B: return "355B.A32B";
+        case MODEL_744B_A40B: return "744B.A40B";
         case MODEL_E2B: return "E2B";
         case MODEL_E4B: return "E4B";
         default: return "?B";

@@ -115,6 +115,7 @@ enum e_model {
     MODEL_310B_A15B,
     MODEL_300B_A47B, // Ernie MoE big
     MODEL_355B_A32B,
+    MODEL_744B_A40B,
     MODEL_E2B,
     MODEL_E4B,
 };

@@ -298,6 +299,13 @@ struct llama_layer {
     struct ggml_tensor * ssm_conv1d_b = nullptr;
     struct ggml_tensor * ssm_dt_b = nullptr;

+    // DSA (deepseek sparse attention)
+    struct ggml_tensor * indexer_k_norm   = nullptr;
+    struct ggml_tensor * indexer_k_norm_b = nullptr;
+    struct ggml_tensor * indexer_proj     = nullptr;
+    struct ggml_tensor * indexer_attn_k   = nullptr;
+    struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
+
     // long rope factors
     struct ggml_tensor * rope_long  = nullptr;
     struct ggml_tensor * rope_short = nullptr;

@@ -677,8 +677,10 @@ static bool llama_kv_cache_init(
         }
     }

+    bool is_mla_attn = model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_GLM_DSA;
+
     bool split_cache = false;
-    if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.arch != LLM_ARCH_DEEPSEEK2 && offload) {
+    if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && !is_mla_attn && offload) {
         cache.split_k_l.reserve(n_layer);
         cache.split_v_l.reserve(n_layer);
         split_cache = true;

@@ -718,7 +720,7 @@ static bool llama_kv_cache_init(
         cache.ctxs.push_back(ctx);
     }

-    if (model.arch == LLM_ARCH_DEEPSEEK2) {
+    if (is_mla_attn) {
         bool have_wkv_b = true;
         for (auto& l : model.layers) {
             if (!l.wkv_b) {

@@ -744,7 +746,7 @@ static bool llama_kv_cache_init(

     bool needs_v_cache = true;
     cache.k_l.reserve(n_layer);
-    if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
+    if (is_mla_attn && cparams.mla_attn) {
         needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn;
     }
     if (needs_v_cache) cache.v_l.reserve(n_layer);

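With mla_attn enabled, the K cache stores one compressed row per token (the kv_lora_rank latent plus the n_rot RoPE dims, as the next hunk shows), and a separate V cache is only allocated for mla_attn == 1 without flash attention. A sketch of the sizing implied here, under those assumptions (helper names illustrative):

    #include <cstdint>

    // Per-token K-cache width for MLA: compressed latent + rope part,
    // e.g. 512 + 64 = 576 entries per token per layer for the dims in this PR.
    static uint32_t mla_k_row(uint32_t kv_lora_rank, uint32_t n_rot) {
        return kv_lora_rank + n_rot;
    }

    // V cache is only needed for mla_attn == 1 without flash attention.
    static bool mla_needs_v_cache(int mla_attn, bool flash_attn) {
        return mla_attn == 1 && !flash_attn;
    }
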
@@ -760,7 +762,7 @@ static bool llama_kv_cache_init(
         struct ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
         ggml_tensor * k;
         ggml_tensor * v;
-        if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn) {
+        if (is_mla_attn && cparams.mla_attn) {
             // DeepSeek MLA
             const uint32_t n_embd_head_qk_rope = hparams.n_rot;
             const uint32_t kv_lora_rank = hparams.n_lora_kv;

@@ -841,7 +843,7 @@ static bool llama_kv_cache_init(
             }
         }
     }
-    if (model.arch == LLM_ARCH_DEEPSEEK2 && cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
+    if (is_mla_attn && cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
         LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer));
         LLAMA_LOG_ERROR("%s: bailing out\n", __func__);
         GGML_ABORT("fatal error");

@@ -1379,7 +1381,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     // general kv
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

-    if (model.arch == LLM_ARCH_DEEPSEEK2) {
+    if (model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_GLM_DSA) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
         LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);

@@ -1424,7 +1426,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 }

 static void llm_prepare_mla(llama_model & model, int mla) {
-    if (model.arch != LLM_ARCH_DEEPSEEK2) return;
+    if (model.arch != LLM_ARCH_DEEPSEEK2 && model.arch != LLM_ARCH_GLM_DSA) return;
     const auto& hparams = model.hparams;
     const int n_layer = model.layers.size();
     int n_to_compute = 0;

@@ -2048,7 +2050,7 @@ static bool llm_load_tensors(
         }
     }

-    if (model.arch == LLM_ARCH_DEEPSEEK2) {
+    if (model.arch == LLM_ARCH_DEEPSEEK2 || model.arch == LLM_ARCH_GLM_DSA) {
         llm_prepare_mla(model, mla_attn);
     }

@@ -3735,7 +3737,7 @@ static int32_t llama_kv_cache_update_internal(struct llama_context & lctx) {

     // apply K-shift if needed
     if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
-        if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
+        if (lctx.model.arch == LLM_ARCH_DEEPSEEK2 || lctx.model.arch == LLM_ARCH_GLM_DSA) { // not supported due to MLA
             return 1;
         }

@@ -4542,20 +4544,10 @@ struct llama_context * llama_init_from_model(
         params.seed = time(NULL);
     }

-    if (model->arch != LLM_ARCH_DEEPSEEK2 && cparams.mla_attn != 0) {
-        //LLAMA_LOG_WARN("=====================================================================\n");
-        //LLAMA_LOG_WARN(" MLA is only available for LLM_ARCH_DEEPSEEK2 -> turning off MLA\n");
-        //LLAMA_LOG_WARN("=====================================================================\n");
+    if (model->arch != LLM_ARCH_DEEPSEEK2 && model->arch != LLM_ARCH_GLM_DSA && cparams.mla_attn != 0) {
         cparams.mla_attn = 0;
     }
     if (model->arch == LLM_ARCH_OPENAI_MOE && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
-        //if (cparams.split_mode_f16) {
-        //    LLAMA_LOG_WARN("=====================================================================\n");
-        //    LLAMA_LOG_WARN("GPT-OSS with split mode graph requires f32 precision\n");
-        //    LLAMA_LOG_WARN(" => changing cparams.split_mode_f16 to 'false'\n");
-        //    LLAMA_LOG_WARN("=====================================================================\n");
-        //    cparams.split_mode_f16 = false;
-        //}
         if (cparams.reduce_type == GGML_TYPE_F16) {
             LLAMA_LOG_WARN("=====================================================================\n");
             LLAMA_LOG_WARN("GPT-OSS with split mode graph requires f32 precision\n");

@@ -4569,7 +4561,7 @@ struct llama_context * llama_init_from_model(
     LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
     LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
     LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
-    if (model->arch == LLM_ARCH_DEEPSEEK2) {
+    if (model->arch == LLM_ARCH_DEEPSEEK2 || model->arch == LLM_ARCH_GLM_DSA) {
         LLAMA_LOG_INFO("%s: mla_attn = %d\n", __func__, cparams.mla_attn);
     }
     LLAMA_LOG_INFO("%s: attn_max_b = %d\n", __func__, cparams.attn_max_batch);

@@ -5020,6 +5012,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_ERNIE4_5_MOE:
         case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_GLM_DSA:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2