From 6dc8c79d8157fc6fbc28db2e0110ef33f0ba67d6 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Tue, 14 Oct 2025 16:24:16 +0300
Subject: [PATCH] Adding Ling/Ring (a.k.a., Bailing-MoE2)

---
 src/llama-arch.cpp          |   9 ++-
 src/llama-arch.h            |   3 +
 src/llama-build-context.cpp | 137 ++++++++++++++++++++++++++++++++++++
 src/llama-build-context.h   |   2 +
 src/llama-hparams.cpp       |  25 +++++++
 src/llama-hparams.h         |  39 ++++++----
 src/llama-load-tensors.cpp  |  75 ++++++++++++++++++++
 src/llama-model.cpp         |  34 +++++++++
 src/llama-model.h           |   2 +
 src/llama-vocab.cpp         |   4 +-
 src/llama.cpp               |  70 ++++++++++++++++++
 11 files changed, 381 insertions(+), 19 deletions(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 6d72f959..6bb16a04 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -58,11 +58,12 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GRANITE,        "granite"      },
     { LLM_ARCH_GRANITE_MOE,    "granitemoe"   },
     { LLM_ARCH_COHERE2,        "cohere2"      },
-    { LLM_ARCH_DOTS1,          "dots1"        },
-    { LLM_ARCH_ERNIE4_5,       "ernie4_5"     },
-    { LLM_ARCH_ERNIE4_5_MOE,   "ernie4_5-moe" },
+    { LLM_ARCH_DOTS1,          "dots1"        },
+    { LLM_ARCH_ERNIE4_5,       "ernie4_5"     },
+    { LLM_ARCH_ERNIE4_5_MOE,   "ernie4_5-moe" },
     { LLM_ARCH_HUNYUAN_MOE,    "hunyuan-moe"  },
     { LLM_ARCH_OPENAI_MOE,     "gpt-oss"      },
+    { LLM_ARCH_BAILINGMOE2,    "bailingmoe2"  },
     { LLM_ARCH_UNKNOWN,        "(unknown)"    },
 };
 
@@ -103,6 +104,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_COUNT,            "%s.expert_count"            },
     { LLM_KV_EXPERT_USED_COUNT,       "%s.expert_used_count"       },
     { LLM_KV_EXPERT_SHARED_COUNT,     "%s.expert_shared_count"     },
+    { LLM_KV_EXPERT_GROUP_COUNT,      "%s.expert_group_count"      },
+    { LLM_KV_EXPERT_GROUP_USED_COUNT, "%s.expert_group_used_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE,    "%s.expert_weights_scale"    },
     { LLM_KV_EXPERT_WEIGHTS_NORM,     "%s.expert_weights_norm"     },
     { LLM_KV_EXPERT_GATING_FUNC,      "%s.expert_gating_func"      },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 65e036ca..0069a8ee 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -62,6 +62,7 @@ enum llm_arch {
     LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_OPENAI_MOE,
+    LLM_ARCH_BAILINGMOE2,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -92,6 +93,8 @@ enum llm_kv {
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
+    LLM_KV_EXPERT_GROUP_COUNT,
+    LLM_KV_EXPERT_GROUP_USED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index 8b3cd9a0..04a8a299 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -8192,6 +8192,139 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
     return gf;
 }
 
+ggml_cgraph * llm_build_context::build_bailingmoe2() {
+    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    //auto * inp_attn = build_attn_inp_kv();
+    ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+    //const int64_t n_embd_head = hparams.n_embd_head_v;
+    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
+
+    for (int il = 0; il < n_transformer_layers; ++il) {
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il);
+        cb(cur, "attn_norm", il);
+
+        // self_attention
+        {
+            cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
+            cb(cur, "wqkv", il);
+
+            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head,    n_tokens, n_embd_head*sizeof(float), cur->nb[1], 0*sizeof(float)*(n_embd));
+            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd));
+            //ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head*sizeof(float), cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+            ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa));
+
+            Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
+            cb(Qcur, "Qcur_normed", il);
+
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
+            cb(Kcur, "Kcur_normed", il);
+
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo,
+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
+        }
+
+        if (il == n_transformer_layers - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
+        cb(sa_out, "sa_out", il);
+
+        // MoE branch
+        cur = llm_build_norm(ctx0, sa_out, hparams, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il);
+        cb(cur, "ffn_norm", il);
+
+        if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+        } else {
+            ggml_tensor * moe_out =
+                llm_build_moe_ffn(ctx0, lctx, cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        model.layers[il].ffn_exp_probs_b,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, hparams.expert_weights_norm,
+                        true, hparams.expert_weights_scale,
+                        (llm_expert_gating_func_type) hparams.expert_gating_func,
+                        cb, il, gf);
+            cb(moe_out, "ffn_moe_out", il);
+
+            ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up_shexp,   NULL, NULL,
+                    model.layers[il].ffn_gate_shexp, NULL, NULL,
+                    model.layers[il].ffn_down_shexp, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(ffn_shexp, "ffn_shexp", il);
+
+            cur = ggml_add(ctx0, moe_out, ffn_shexp);
+            cb(cur, "ffn_out", il);
+        }
+
+        cur = ggml_add(ctx0, cur, sa_out);
+
+        cur = lctx.cvec.apply_to(ctx0, cur, il);
+        cb(cur, "l_out", il);
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
+    cb(cur, "result_norm", -1);
+
+    // lm_head
+    cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
"result_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} + ggml_cgraph * llm_build_context::llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { llama_batch dummy; dummy.n_tokens = 0; @@ -8513,6 +8646,10 @@ ggml_cgraph * llm_build_context::llama_build_graph( { result = llm.build_openai_moe(); } break; + case LLM_ARCH_BAILINGMOE2: + { + result = llm.build_bailingmoe2(); + } break; default: GGML_ABORT("fatal error"); } diff --git a/src/llama-build-context.h b/src/llama-build-context.h index c5789ad9..a1f0b8ae 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ -251,6 +251,8 @@ struct llm_build_context { ggml_cgraph * build_openai_moe(); + ggml_cgraph * build_bailingmoe2(); + // static ggml_tensor * llm_build_lora_mm(llama_context & lctx, ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 7f5c80f4..d706c091 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -894,6 +894,31 @@ void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_BAILINGMOE2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups); + ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + + // TODO: when MTP is implemented, this should probably be updated if needed + hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + + switch (hparams.n_layer) { + case 20: model.type = MODEL_16B_A1B; break; + case 21: model.type = MODEL_16B_A1B; break; + case 32: model.type = MODEL_100B_A6B; break; + case 33: model.type = MODEL_100B_A6B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_DOTS1: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index ea1a7562..4235714a 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -24,6 +24,7 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; uint32_t n_layer; + int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache uint32_t n_rot; uint32_t n_swa = 0; // sliding window attention (SWA) uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention @@ -39,22 +40,30 @@ struct llama_hparams { std::array n_ff_arr; uint32_t n_layer_dense_lead = 0; - uint32_t n_lora_q = 0; - uint32_t n_lora_kv = 0; - uint32_t n_ff_exp = 0; - uint32_t n_ff_shexp = 0; - uint32_t n_expert_shared = 0; - float expert_weights_scale = 0.0; - bool expert_weights_norm = false; - uint32_t expert_gating_func = LLM_EXPERT_GATING_FUNC_SOFTMAX; + uint32_t n_lora_q = 0; + uint32_t n_lora_kv = 0; + uint32_t n_ff_exp = 0; + uint32_t n_ff_shexp = 0; + uint32_t n_expert_shared = 0; + uint32_t n_norm_groups 
+    uint32_t n_norm_groups      = 0;
+    uint32_t n_expert_groups    = 0;
+    uint32_t n_group_used       = 0;
+    uint32_t n_group_experts    = 0;
+
+    float    expert_group_scale   = 0.05f;
+    float    expert_weights_scale = 0.0f;
+    bool     expert_weights_norm  = false;
+    uint32_t expert_gating_func   = LLM_EXPERT_GATING_FUNC_SOFTMAX;
+
     uint32_t moe_every_n_layers = 0;
     uint32_t nextn_predict_layers = 0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
+    float f_norm_group_eps;
 
-    float f_attn_logit_softcapping = 50.0f;
+    float f_attn_logit_softcapping   = 50.0f;
     float f_router_logit_softcapping = 30.0f;
-    float f_final_logit_softcapping = 30.0f;
+    float f_final_logit_softcapping  = 30.0f;
 
     float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
@@ -62,12 +71,12 @@ struct llama_hparams {
     float rope_freq_scale_train;
     float rope_freq_scale_train_swa;
     uint32_t n_ctx_orig_yarn;
-    float rope_yarn_log_mul;
+    float rope_yarn_log_mul = 0.0f;
 
-    float yarn_ext_factor = -1.0f;
-    float yarn_attn_factor = 1.0f;
-    float yarn_beta_fast = 32.0f;
-    float yarn_beta_slow = 1.0f;
+    float yarn_ext_factor  = -1.0f;
+    float yarn_attn_factor = 1.0f;
+    float yarn_beta_fast   = 32.0f;
+    float yarn_beta_slow   = 1.0f;
 
     std::array<int, 4> rope_sections;
 
diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp
index c13d95fc..85190c57 100644
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -124,6 +124,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {
 
     bool create_openai_moe_tensors(const LLM_TN & tn);
 
+    bool create_bailingmoe2_tensors(const LLM_TN & tn);
+
     llama_model_loader & ml;
     llama_model & model;
 
@@ -2205,6 +2207,77 @@ bool create_tensors_helper::create_dots1_tensors(const LLM_TN & tn) {
     return use_mmap_buffer;
 }
 
+bool create_tensors_helper::create_bailingmoe2_tensors(const LLM_TN & tn) {
+    LOADING_PRELUDE
+
+    const int64_t n_ff_exp        = hparams.n_ff_exp;
+    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+    model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    model.output_norm = create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    model.output      = create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    GGML_ASSERT(n_expert      > 0 && "n_expert must be > 0 for bailingmoe2");
+    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = model.layers[i];
+        ggml_context * ctx_layer = ctx_for_layer(i);
+        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+        int flags = 0;
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            // skip all tensors in the NextN layers
+            flags |= llama_model_loader::TENSOR_SKIP;
+        }
+
+        layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+
+        layer.wqkv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags);
+        layer.wo   = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
+
+        layer.attn_q_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+        layer.attn_k_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+        layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+        if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) { // MoE layers
+            const int64_t n_ff_shexp = (hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp) * n_expert_shared;
+
+            layer.ffn_gate_inp    = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,    "weight", i), {n_embd, n_expert}, flags);
+            layer.ffn_exp_probs_b = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias",   i), {n_expert},
+                    llama_model_loader::TENSOR_NOT_REQUIRED | flags);
+
+            layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+            layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
+            layer.ffn_up_exps   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+
+            layer.ffn_gate_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+            layer.ffn_down_shexp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+            layer.ffn_up_shexp   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
+        } else { // Dense layers
+            layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+            layer.ffn_down = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, flags);
+            layer.ffn_up   = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
+        }
+
+        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+            layer.nextn.eh_proj      = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_EH_PROJ,      "weight", i), { 2 * n_embd, n_embd }, flags);
+            layer.nextn.embed_tokens = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab },
+                    llama_model_loader::TENSOR_NOT_REQUIRED | flags);
+            layer.nextn.enorm        = create_tensor(ctx_layer, tn(LLM_TENSOR_NEXTN_ENORM,        "weight", i), { n_embd }, flags);
+            layer.nextn.hnorm        = create_tensor(ctx_layer, tn(LLM_TENSOR_NEXTN_HNORM,        "weight", i), { n_embd }, flags);
+            layer.nextn.shared_head_head = create_tensor(ctx_split, tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
+            layer.nextn.shared_head_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
+            layer.layer_out_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags);
+        }
+    }
+
+    return use_mmap_buffer;
+}
+
 bool create_tensors_helper::create_ernie45_tensors(const LLM_TN & tn) {
     LOADING_PRELUDE
 
@@ -2460,6 +2533,8 @@ bool create_tensors_helper::create_tensors() {
             use_mmap_buffer = create_hunyuan_tensors(tn); break;
         case LLM_ARCH_OPENAI_MOE:
            use_mmap_buffer = create_openai_moe_tensors(tn); break;
+        case LLM_ARCH_BAILINGMOE2:
+            use_mmap_buffer = create_bailingmoe2_tensors(tn); break;
        default:
            throw std::runtime_error("unknown architecture");
     }
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 57c7a6a3..d2b92e75 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1157,6 +1157,38 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_BAILINGMOE2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,             "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,            "output_norm" },
+            { LLM_TENSOR_OUTPUT,                 "output" },
+            { LLM_TENSOR_ATTN_NORM,              "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM,            "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,            "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_QKV,               "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,               "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,           "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,        "blk.%d.exp_probs_b" },
+            { LLM_TENSOR_FFN_NORM,               "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,               "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,               "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,                 "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS,          "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,          "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,            "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,         "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,         "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,           "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_NEXTN_EH_PROJ,          "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS,     "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM,            "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM,            "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+            { LLM_TENSOR_LAYER_OUT_NORM,         "blk.%d.layer_output_norm" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1368,6 +1400,8 @@ const char * llama_model_type_name(e_model type) {
         case MODEL_80B_A13B:   return "80B.A13B";
         case MODEL_21B_A3B:    return "21B.A3B";
         case MODEL_300B_A47B:  return "300B.A47B";
+        case MODEL_16B_A1B:    return "16B.A1B";
+        case MODEL_100B_A6B:   return "100B.A6B";
         default: return "?B";
     }
 }
diff --git a/src/llama-model.h b/src/llama-model.h
index 6b6fb47f..8446be98 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -84,6 +84,8 @@ enum e_model {
     MODEL_17B_128E,
     MODEL_80B_A13B,
     MODEL_300B_A47B, // Ernie MoE big
+    MODEL_16B_A1B,
+    MODEL_100B_A6B,
 };
 
 struct llama_layer_nextn {
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 372407a0..9f4f8b08 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1961,7 +1961,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                 clean_spaces = false;
             } else if (
-                tokenizer_pre == "bailingmoe") {
+                tokenizer_pre == "bailingmoe" ||
+                tokenizer_pre == "bailingmoe2" ||
+                tokenizer_pre == "llada-moe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 clean_spaces = false;
             } else if (
diff --git a/src/llama.cpp b/src/llama.cpp
index 53ebb075..5dc11e47 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -194,6 +194,9 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_OPENAI_MOE,
     LLM_CHAT_TEMPLATE_GROK_2,
+    LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_BAILING_THINK,
+    LLM_CHAT_TEMPLATE_BAILING2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
@@ -236,6 +239,10 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "gpt-oss",       LLM_CHAT_TEMPLATE_OPENAI_MOE    },
     { "bitnet",        LLM_CHAT_TEMPLATE_BITNET        },
     { "grok-2",        LLM_CHAT_TEMPLATE_GROK_2        },
+    { "bailing",       LLM_CHAT_TEMPLATE_BAILING       },
+    { "bailing-think", LLM_CHAT_TEMPLATE_BAILING_THINK },
+    { "bailing2",      LLM_CHAT_TEMPLATE_BAILING2      },
+
 };
 
 //
@@ -1200,6 +1207,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
     }
 
+    if (model.arch == LLM_ARCH_BAILINGMOE2) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n", __func__, hparams.n_ff_exp);
%d\n", __func__, hparams.n_ff_shexp); + LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); + LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups); + LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used); + LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); + LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); + LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llm_expert_gating_func_type) hparams.expert_gating_func)); + LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers); + } + vocab.print_info(); } @@ -4444,6 +4464,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_DOTS1: case LLM_ARCH_HUNYUAN_MOE: case LLM_ARCH_OPENAI_MOE: + case LLM_ARCH_BAILINGMOE2: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: @@ -6255,6 +6276,12 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_GIGACHAT; } else if (tmpl_contains("<|role_start|>")) { return LLM_CHAT_TEMPLATE_MEGREZ; + } else if (tmpl_contains("ASSISTANT") && tmpl_contains("'HUMAN'")) { + return LLM_CHAT_TEMPLATE_BAILING; + } else if (tmpl_contains("ASSISTANT") && tmpl_contains("\"HUMAN\"") && tmpl_contains("")) { + return LLM_CHAT_TEMPLATE_BAILING_THINK; + } else if (tmpl_contains("ASSISTANT") && tmpl_contains("HUMAN") && tmpl_contains("<|role_end|>")) { + return LLM_CHAT_TEMPLATE_BAILING2; } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) { return LLM_CHAT_TEMPLATE_LLAMA4; } else if (tmpl_contains("<|endofuserprompt|>")) { @@ -6657,6 +6684,49 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|role_start|>assistant<|role_end|>"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) { + // Bailing (Ling/Ring) template + for (auto message : chat) { + std::string role(message->role); + + if (role == "user") { + role = "HUMAN"; + } else { + std::transform(role.begin(), role.end(), role.begin(), ::toupper); + } + + ss << "" << role << "" << message->content; + } + + if (add_ass) { + ss << "ASSISTANT"; + + if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) { + ss << ""; + } + } + } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) { + // Bailing2 (Ling 2.0) template + bool has_system = !chat.empty() && std::string(chat[0]->role) == "system"; + + if (!has_system) { + ss << "SYSTEMdetailed thinking off<|role_end|>"; + } + + for (auto message : chat) { + std::string role(message->role); + + if (role == "user") { + role = "HUMAN"; + } else { + std::transform(role.begin(), role.end(), role.begin(), ::toupper); + } + + ss << "" << role << "" << message->content << "<|role_end|>"; + } + if (add_ass) { + ss << "ASSISTANT"; + } } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) { // Llama 4 for (auto message : chat) {