diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c4aabbff..1af5b4b6 100644 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -657,7 +657,9 @@ class Model: if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206": # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0 res = "bailingmoe2" - + if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95": + # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2 + res = "minimax-m2" if res is None: logger.warning("\n") logger.warning("**************************************************************************************") @@ -4122,6 +4124,64 @@ class JaisModel(Model): super().prepare_tensors() self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) +@Model.register("MiniMaxM2ForCausalLM") +class MiniMaxM2Model(Model): + model_arch = gguf.MODEL_ARCH.MINIMAXM2 + _experts_cache: dict[int, dict[str, Tensor]] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hparams["num_experts"] = self.hparams["num_local_experts"] + self._experts_cache = {} + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.hparams["scoring_func"] == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif self.hparams["scoring_func"] == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}") + + self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"])) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + + # merge expert weights + if 'experts' in name: + n_experts = self.hparams["num_experts"] + assert bid is not None + + expert_cache 
= self._experts_cache.setdefault(bid, {}) + expert_cache[name] = data_torch + expert_weights = ["w1", "w2", "w3"] + + # not enough expert weights to merge + if len(expert_cache) < n_experts * len(expert_weights): + return [] + + tensors: list[tuple[str, Tensor]] = [] + for w_name in expert_weights: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(expert_cache[ename]) + del expert_cache[ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + new_name = self.map_tensor_name(merged_name) + tensors.append((new_name, data_torch)) + + del self._experts_cache[bid] + return tensors + + return super().modify_tensors(data_torch, name, bid) + @Model.register("Dots1ForCausalLM") class Dots1Model(Qwen2MoeModel): @@ -4150,6 +4209,7 @@ class Dots1Model(Qwen2MoeModel): return [(self.map_tensor_name(name), data_torch)] return super().modify_tensors(data_torch, name, bid) + @Model.register("Glm4MoeForCausalLM") class Glm4MoeModel(Model): model_arch = gguf.MODEL_ARCH.GLM4_MOE diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 02b1cfba..96936717 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -100,6 +100,7 @@ models = [ {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902", }, {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890", }, {"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"}, + {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": 
"https://huggingface.co/MiniMaxAI/MiniMax-M2", }, ] @@ -164,7 +165,7 @@ for model in models: tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False) else: tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") - except OSError as e: + except (OSError, TypeError) as e: logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") continue # Skip to the next model if the tokenizer can't be loaded diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 59bcba52..e1ed604c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -248,7 +248,8 @@ class MODEL_ARCH(IntEnum): ERNIE4_5 = auto() ERNIE4_5_MOE = auto() BAILINGMOE2 = auto() - + MINIMAXM2 = auto() + class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() @@ -384,7 +385,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.CHATGLM: "chatglm", - MODEL_ARCH.GLM4_MOE: "glm4moe", + MODEL_ARCH.GLM4_MOE: "glm4moe", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.BITNET_25: "bitnet-25", MODEL_ARCH.T5: "t5", @@ -394,6 +395,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.ERNIE4_5: "ernie4_5", MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe", MODEL_ARCH.BAILINGMOE2: "bailingmoe2", + MODEL_ARCH.MINIMAXM2: "minimax-m2", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -1324,6 +1326,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.MINIMAXM2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + 
MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index d9af8e32..367f47be 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -267,6 +267,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1 "model.layers.{bid}.mlp.moe_statics.e_score_correction", # ernie4.5-moe "model.layers.{bid}.mlp.gate.expert_bias", # bailingmoe2 + "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 ), # Feed-forward up diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 5966f057..d680673c 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -66,6 +66,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" }, { LLM_ARCH_OPENAI_MOE, "gpt-oss" }, { LLM_ARCH_BAILINGMOE2, "bailingmoe2" }, + { LLM_ARCH_MINIMAX_M2, "minimax-m2" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; diff --git a/src/llama-arch.h b/src/llama-arch.h index 949a946e..1b32463b 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -65,6 +65,7 @@ enum llm_arch { LLM_ARCH_HUNYUAN_MOE, LLM_ARCH_OPENAI_MOE, LLM_ARCH_BAILINGMOE2, + LLM_ARCH_MINIMAX_M2, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index f8219ee1..154870e3 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -7847,7 +7847,6 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); - // self-attention // self-attention { // Q, K, V projections @@ -7899,7 +7898,6 @@ ggml_cgraph * llm_build_context::build_ernie4_5_moe() { } if (il == n_layer - 1 && inp_out_ids) { - ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -8366,6 
+8364,129 @@ ggml_cgraph * llm_build_context::build_bailingmoe2() { return gf; } +ggml_cgraph* llm_build_context::build_minimaxm2() { + ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64 + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + ggml_tensor * inp_pos = build_inp_pos(); + + + //auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor* inpSA = inpL; + + cur = inpL; + + // self_attention + { + cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // Q, K, V projections + ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Kcur, "Kcur_normed", il); + + // reshape for multi-head + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + + // apply RoPE + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + 
ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, + n_tokens, kv_head, n_kv, + 1.0f / sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS,cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_moe_ffn(ctx0, lctx, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0, + (llm_expert_gating_func_type)hparams.expert_gating_func, + cb, il, gf); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, + hparams, model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} + ggml_cgraph * llm_build_context::llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { llama_batch dummy; dummy.n_tokens = 0; @@ -8712,6 +8833,10 @@ ggml_cgraph * llm_build_context::llama_build_graph( { result = llm.build_bailingmoe2(); } break; + case LLM_ARCH_MINIMAX_M2: + { + result = 
llm.build_minimaxm2(); + } break; default: GGML_ABORT("fatal error"); } diff --git a/src/llama-build-context.h b/src/llama-build-context.h index ff1cf7fe..391cf319 100644 --- a/src/llama-build-context.h +++ b/src/llama-build-context.h @@ -268,6 +268,8 @@ struct llm_build_context { ggml_cgraph * build_bailingmoe2(); + ggml_cgraph * build_minimaxm2(); + // static ggml_tensor * llm_build_lora_mm(llama_context & lctx, ggml_context * ctx0, ggml_tensor * w, ggml_tensor * cur); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index d904e90a..2d7cd439 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -1012,6 +1012,17 @@ void llm_load_hparams( // TODO: switch (hparams.n_layer) } break; + case LLM_ARCH_MINIMAX_M2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + + switch (hparams.n_layer) { + case 62: model.type = e_model::MODEL_230B_A10B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index d30da5bf..a2fc5803 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -128,6 +128,8 @@ struct create_tensors_helper : public create_tensors_helper_interface { bool create_bailingmoe2_tensors(const LLM_TN & tn); + bool create_minimaxm2_tensors(const LLM_TN & tn); + llama_model_loader & ml; llama_model & model; @@ -2434,6 +2436,36 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) { return use_mmap_buffer; } +bool create_tensors_helper::create_minimaxm2_tensors(const LLM_TN & tn) { + LOADING_PRELUDE + + create_embd_output(tn, n_embd, n_vocab); + + for (int i = 0; i < n_layer; ++i) { + ggml_context* ctx_layer = ctx_for_layer(i); + ggml_context* ctx_split = ctx_for_layer_split(i); + auto& layer = model.layers[i]; + + layer.wq = 
create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0); + layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0); + layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0); + layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + + layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.attn_q_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k * n_head }, 0); + layer.attn_k_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_k_gqa }, 0); + + layer.ffn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + + layer.ffn_gate_inp = create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); + layer.ffn_gate_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff, n_expert }, 0); + layer.ffn_down_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert }, 0); + layer.ffn_up_exps = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert }, 0); + layer.ffn_exp_probs_b = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, 0); + } + return use_mmap_buffer; +} + bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, int bias) { auto& hparams = model.hparams; const int64_t n_head = hparams.n_head(); @@ -2665,6 +2697,8 @@ bool create_tensors_helper::create_tensors() { use_mmap_buffer = create_openai_moe_tensors(tn); break; case LLM_ARCH_BAILINGMOE2: use_mmap_buffer = create_bailingmoe2_tensors(tn); break; + case LLM_ARCH_MINIMAX_M2: + use_mmap_buffer = create_minimaxm2_tensors(tn); break; default: throw std::runtime_error("unknown architecture"); } diff --git 
a/src/llama-model.cpp b/src/llama-model.cpp index eb5acda2..0f43aa50 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1228,6 +1228,27 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, }, }, + { + LLM_ARCH_MINIMAX_M2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1470,6 +1491,7 @@ const char * llama_model_type_name(e_model type) { case MODEL_80B_A13B: return "80B.A13B"; case MODEL_100B_A6B: return "100B.A6B"; case MODEL_106B_A12B: return "106B.A12B"; + case MODEL_230B_A10B: return "230B.A10B"; case MODEL_235B_A22B: return "235B.A22B"; case MODEL_300B_A47B: return "300B.A47B"; case MODEL_355B_A32B: return "355B.A32B"; diff --git a/src/llama-model.h b/src/llama-model.h index 2df01fdd..81420788 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -110,6 +110,7 @@ enum e_model { MODEL_80B_A13B, MODEL_100B_A6B, MODEL_106B_A12B, + MODEL_230B_A10B, // Minimax M2 MODEL_235B_A22B, MODEL_300B_A47B, // Ernie MoE big MODEL_355B_A32B, diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 9f4f8b08..e80d604d 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -399,6 +399,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { }; break; case 
LLAMA_VOCAB_PRE_TYPE_GPT4O: + case LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2: regex_exprs = { // original regex from tokenizer.json // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", @@ -1987,6 +1988,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "grok-2") { pre_type = LLAMA_VOCAB_PRE_TYPE_GROK_2; clean_spaces = false; + } else if ( + tokenizer_pre == "minimax-m2") { + pre_type = LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2; + clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 17d13ff9..a4c1df15 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -48,6 +48,7 @@ enum llama_vocab_pre_type { LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37, LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38, LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39, + LLAMA_VOCAB_PRE_TYPE_MINIMAX_M2 = 40, }; struct LLM_KV; diff --git a/src/llama.cpp b/src/llama.cpp index b78adf90..ee666fc3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4668,6 +4668,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_HUNYUAN_MOE: case LLM_ARCH_OPENAI_MOE: case LLM_ARCH_BAILINGMOE2: + case LLM_ARCH_MINIMAX_M2: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: