#include "llama-hparams.h"
#include "llama-model-loader.h"
#include "llama-model.h"

#include <map>

#define LLAMA_MAX_EXPERTS 384 // Kimi-K2

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
};

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}

const char * llama_hparams::rope_scaling_type_name(llama_rope_scaling_type type) {
    return LLAMA_ROPE_SCALING_TYPES.at(type);
}

void llm_load_hparams(
        llama_model_loader & ml,
        llama_model & model) {
    auto & hparams = model.hparams;
    const gguf_context * ctx = ml.meta;

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        enum gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        model.gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);

    // get hparams kv
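    // prefer the explicit vocab_size KV; if it is absent, fall back to the length of the tokenizer token list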
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
    }

    // zero-out the per-layer hparams
    std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;

    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
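    // note: the GGUF key stores the context-scaling factor (e.g. 4.0 for 4x extension); internally we keep its inverse, and 0.0 ("not set") falls back to 1.0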
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

    // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
    hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
    hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
    if (hparams.n_head() > 0) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim

        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

        // sanity check for n_rot (optional)
        hparams.n_rot = hparams.n_embd_head_k;

        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON || model.arch == LLM_ARCH_BITNET_25 || model.arch == LLM_ARCH_BITNET_B158 || model.arch == LLM_ARCH_DECI) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
        }
    } else {
        hparams.n_rot = 0;
        hparams.n_embd_head_k = 0;
        hparams.n_embd_head_v = 0;
    }

    // arch-specific KVs
    switch (model.arch) {
case LLM_ARCH_LLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                if (hparams.n_expert == 8) {
                    switch (hparams.n_layer) {
                        case 32: model.type = e_model::MODEL_8x7B; break;
                        case 56: model.type = e_model::MODEL_8x22B; break;
                        default: model.type = e_model::MODEL_UNKNOWN;
                    }
                } else {
                    switch (hparams.n_layer) {
                        case 22: model.type = e_model::MODEL_1B; break;
                        case 26: model.type = e_model::MODEL_3B; break;
                        // granite uses a vocab with len 49152
                        case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
                        case 36: model.type = e_model::MODEL_8B; break; // granite
                        case 40: model.type = e_model::MODEL_13B; break;
                        case 48: model.type = e_model::MODEL_34B; break;
                        case 60: model.type = e_model::MODEL_30B; break;
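                        // 80 layers with no GQA (n_head == n_head_kv) is the original LLaMA 65B; otherwise assume a 70B GQA model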
case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                        default: model.type = e_model::MODEL_UNKNOWN;
                    }
                }
            } break;
case LLM_ARCH_LLAMA4:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
                hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
                hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
                hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later

                switch (hparams.n_expert) {
                    case 16: model.type = MODEL_17B_16E; break;
                    case 128: model.type = MODEL_17B_128E; break;
                    default: model.type = MODEL_UNKNOWN;
                }

                if (model.type == MODEL_17B_128E) {
                    hparams.use_kq_norm = false;
                }
            } break;
case LLM_ARCH_DECI:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 80: model.type = e_model::MODEL_70B; break;
|
|
case 162: model.type = e_model::MODEL_405B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_MINICPM:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 40: model.type = e_model::MODEL_2B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_GROK:
|
|
{
|
|
// defaults for old GGUFs
|
|
hparams.yarn_beta_fast = 8.0f;
|
|
hparams.f_logit_scale = 0.5773502691896257f;
|
|
hparams.f_embedding_scale = 78.38367176906169f;
|
|
hparams.f_attn_out_scale = 0.08838834764831845f;
|
|
hparams.f_attn_logit_softcapping = 30.0f;
|
|
hparams.f_router_logit_softcapping = 30.0f;
|
|
// no final_logit_softcapping in grok-1
|
|
hparams.f_final_logit_softcapping = 0.0f;
|
|
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
|
|
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
|
|
ml.get_key(LLM_KV_ATTENTION_OUTPUT_SCALE, hparams.f_attn_out_scale, false);
|
|
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
|
|
ml.get_key(LLM_KV_ROUTER_LOGIT_SOFTCAPPING, hparams.f_router_logit_softcapping, false);
|
|
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
|
|
|
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.attn_temp_length, false);
|
|
ml.get_key(LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, hparams.yarn_ext_factor, false);
|
|
ml.get_key(LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, hparams.yarn_attn_factor, false);
|
|
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
|
|
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 64: model.type = e_model::MODEL_314B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_FALCON:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 60: model.type = e_model::MODEL_40B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_BAICHUAN:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 40: model.type = e_model::MODEL_13B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
|
|
if (model.type == e_model::MODEL_13B) {
|
|
// TODO: become GGUF KV parameter
|
|
hparams.f_max_alibi_bias = 8.0f;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_STARCODER:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
switch (hparams.n_layer) {
|
|
case 24: model.type = e_model::MODEL_1B; break;
|
|
case 36: model.type = e_model::MODEL_3B; break;
|
|
case 42: model.type = e_model::MODEL_7B; break;
|
|
case 40: model.type = e_model::MODEL_15B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_REFACT:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_1B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
|
|
// TODO: become GGUF KV parameter
|
|
hparams.f_max_alibi_bias = 8.0f;
|
|
} break;
|
|
case LLM_ARCH_BERT:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 3:
|
|
model.type = e_model::MODEL_17M; break; // bge-micro
|
|
case 6:
|
|
model.type = e_model::MODEL_22M; break; // MiniLM-L6
|
|
case 12:
|
|
switch (hparams.n_embd) {
|
|
case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
|
|
case 768: model.type = e_model::MODEL_109M; break; // bge-base
|
|
} break;
|
|
case 24:
|
|
model.type = e_model::MODEL_335M; break; // bge-large
|
|
}
|
|
} break;
|
|
case LLM_ARCH_JINA_BERT_V2:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
hparams.f_max_alibi_bias = 8.0f;
|
|
|
|
switch (hparams.n_layer) {
|
|
case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
|
|
case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
|
|
}
|
|
} break;
|
|
case LLM_ARCH_NOMIC_BERT:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
|
|
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
|
model.type = e_model::MODEL_137M;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_BLOOM:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 24: model.type = e_model::MODEL_1B; break;
|
|
case 30:
|
|
switch (hparams.n_embd) {
|
|
case 2560: model.type = e_model::MODEL_3B; break;
|
|
case 4096: model.type = e_model::MODEL_7B; break;
|
|
} break;
|
|
}
|
|
|
|
// TODO: become GGUF KV parameter
|
|
hparams.f_max_alibi_bias = 8.0f;
|
|
} break;
|
|
case LLM_ARCH_MPT:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
|
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 48: model.type = e_model::MODEL_30B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_STABLELM:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 24: model.type = e_model::MODEL_1B; break;
|
|
case 32: model.type = e_model::MODEL_3B; break;
|
|
case 40: model.type = e_model::MODEL_12B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_QWEN:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 40: model.type = e_model::MODEL_13B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_QWEN2VL:
|
|
{
|
|
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
|
|
}
|
|
// fall through
|
|
case LLM_ARCH_QWEN2:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
|
|
case 80: model.type = e_model::MODEL_70B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_QWEN2MOE:
|
|
{
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
|
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 24: model.type = e_model::MODEL_A2_7B; break;
|
|
case 28: model.type = e_model::MODEL_57B_A14B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
|
|
case LLM_ARCH_QWEN3:
|
|
{
|
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 28: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_6B : e_model::MODEL_1_7B; break;
|
|
case 36: model.type = hparams.n_embd == 2560 ? e_model::MODEL_4B : e_model::MODEL_8B; break;
|
|
case 40: model.type = e_model::MODEL_14B; break;
|
|
case 64: model.type = e_model::MODEL_32B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_QWEN3VL:
|
|
{
|
|
ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
|
|
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 28: model.type = e_model::MODEL_1_7B; break;
|
|
case 36: model.type = hparams.n_embd == 2560 ? e_model::MODEL_4B : e_model::MODEL_8B; break;
|
|
case 64: model.type = e_model::MODEL_32B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
// since vision model stacks deepstack features along feature dim
|
|
// we also create a fake "n_embd" for text model to be the main embd + deepstack embds
|
|
hparams.n_embd *= hparams.n_deepstack_layers + 1;
|
|
} break;
|
|
case LLM_ARCH_QWEN3MOE:
|
|
{
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 48: model.type = e_model::MODEL_30B_A3B; break;
|
|
case 94: model.type = e_model::MODEL_235B_A22B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_QWEN3VLMOE:
|
|
{
|
|
ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false);
|
|
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 48: model.type = e_model::MODEL_30B_A3B; break;
|
|
case 94: model.type = e_model::MODEL_235B_A22B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
// since vision model stacks deepstack features along feature dim
|
|
// we also create a fake "n_embd" for text model to be the main embd + deepstack embds
|
|
hparams.n_embd *= hparams.n_deepstack_layers + 1;
|
|
} break;
|
|
case LLM_ARCH_PHI2:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 24: model.type = e_model::MODEL_1B; break;
|
|
case 32: model.type = e_model::MODEL_3B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_PHI3:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 24: model.type = e_model::MODEL_1B; break;
|
|
case 32: model.type = e_model::MODEL_3B; break;
|
|
case 40: model.type = e_model::MODEL_14B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
|
|
// for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
|
|
if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
|
|
// default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
|
|
hparams.n_swa = 2047;
|
|
} else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
|
|
// default value for Phi-3-mini-128k-instruct
|
|
hparams.n_swa = 262144;
|
|
} else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
|
|
// default value for Phi-3-medium-128k-instruct
|
|
hparams.n_swa = 131072;
|
|
}
|
|
bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
|
if (!found_swa && hparams.n_swa == 0) {
|
|
throw std::runtime_error("invalid value for sliding_window");
|
|
}
|
|
} break;
|
|
case LLM_ARCH_PLAMO:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 40: model.type = e_model::MODEL_13B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_GPT2:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
switch (hparams.n_layer) {
|
|
case 12: model.type = e_model::MODEL_SMALL; break;
|
|
case 24: model.type = e_model::MODEL_MEDIUM; break;
|
|
case 36: model.type = e_model::MODEL_LARGE; break;
|
|
case 48: model.type = e_model::MODEL_XL; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_CODESHELL:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
switch (hparams.n_layer) {
|
|
case 42: model.type = e_model::MODEL_7B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_ORION:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 40: model.type = e_model::MODEL_14B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_INTERNLM2:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 48: model.type = e_model::MODEL_20B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_GEMMA:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 18: model.type = e_model::MODEL_2B; break;
|
|
case 28: model.type = e_model::MODEL_7B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_GEMMA2:
|
|
{
|
|
hparams.n_swa = 4096; // default value of gemma 2
|
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
|
|
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
|
hparams.attn_soft_cap = true;
|
|
|
|
switch (hparams.n_layer) {
|
|
case 26: model.type = e_model::MODEL_2B; break;
|
|
case 42: model.type = e_model::MODEL_9B; break;
|
|
case 46: model.type = e_model::MODEL_27B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_GEMMA3:
|
|
{
|
|
hparams.n_swa_pattern = 6;
|
|
|
|
hparams.rope_freq_base_train_swa = 10000.0f;
|
|
hparams.rope_freq_scale_train_swa = 1.0f;
|
|
|
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 26: model.type = e_model::MODEL_2B; break;
|
|
case 34: model.type = e_model::MODEL_4B; break;
|
|
case 48: model.type = e_model::MODEL_12B; break;
|
|
case 62: model.type = e_model::MODEL_27B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
|
|
hparams.f_attention_scale = model.type == e_model::MODEL_27B
|
|
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
|
|
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
|
|
} break;
|
|
case LLM_ARCH_STARCODER2:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
switch (hparams.n_layer) {
|
|
case 30: model.type = e_model::MODEL_3B; break;
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 40: model.type = e_model::MODEL_15B; break;
|
|
case 52: model.type = e_model::MODEL_20B; break; // granite
|
|
case 88: model.type = e_model::MODEL_34B; break; // granite
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_MAMBA:
|
|
{
|
|
ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
|
ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
|
ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
|
ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
|
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 24:
|
|
switch (hparams.n_embd) {
|
|
case 768: model.type = e_model::MODEL_SMALL; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case 48:
|
|
switch (hparams.n_embd) {
|
|
case 1024: model.type = e_model::MODEL_MEDIUM; break;
|
|
case 1536: model.type = e_model::MODEL_LARGE; break;
|
|
case 2048: model.type = e_model::MODEL_XL; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case 64:
|
|
switch (hparams.n_embd) {
|
|
case 2560: model.type = e_model::MODEL_3B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_XVERSE:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 40: model.type = e_model::MODEL_13B; break;
|
|
case 80: model.type = e_model::MODEL_65B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_COMMAND_R:
|
|
{
|
|
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
switch (hparams.n_layer) {
|
|
case 40: model.type = e_model::MODEL_35B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_DBRX:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 40: model.type = e_model::MODEL_16x12B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_OLMO:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 22: model.type = e_model::MODEL_1B; break;
|
|
case 32: model.type = e_model::MODEL_7B; break;
|
|
case 80: model.type = e_model::MODEL_70B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_OPENELM:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 16: model.type = e_model::MODEL_270M; break;
|
|
case 20: model.type = e_model::MODEL_450M; break;
|
|
case 28: model.type = e_model::MODEL_1B; break;
|
|
case 36: model.type = e_model::MODEL_3B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_GPTNEOX:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
|
|
switch (hparams.n_layer) {
|
|
case 6:
|
|
switch (hparams.n_ff()) {
|
|
case 512: model.type = e_model::MODEL_14M; break;
|
|
case 2048: model.type = e_model::MODEL_70M; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case 12:
|
|
switch (hparams.n_ff()) {
|
|
case 3072: model.type = e_model::MODEL_160M; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case 16:
|
|
switch (hparams.n_ff()) {
|
|
case 8192: model.type = e_model::MODEL_1B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case 24:
|
|
switch (hparams.n_ff()) {
|
|
case 4096: model.type = e_model::MODEL_410M; break;
|
|
case 8192: model.type = e_model::MODEL_1_4B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case 32:
|
|
switch (hparams.n_ff()) {
|
|
case 10240: model.type = e_model::MODEL_2_8B; break;
|
|
case 16384: model.type = e_model::MODEL_6_9B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case 36:
|
|
switch (hparams.n_ff()) {
|
|
case 20480: model.type = e_model::MODEL_12B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case 44:
|
|
switch (hparams.n_ff()) {
|
|
case 24576: model.type = e_model::MODEL_20B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_ARCTIC:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
if (hparams.n_expert == 128) {
|
|
switch (hparams.n_layer) {
|
|
case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} else {
|
|
model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_DEEPSEEK2:
            {
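                // mainline llama.cpp MLA GGUFs store the latent KV as a single "head" with 576/512 K/V dims;
                // detect that layout here and translate it to the per-head layout ik_llama.cpp expects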
if (hparams.n_head_kv() == 1) {
                    int n_head_kv = hparams.n_gqa();
                    if (n_head_kv%16 != 0 || hparams.n_embd_head_k != 576 || hparams.n_embd_head_v != 512 ||
                        hparams.n_rot != 64) {
                        printf("==========================================================================\n");
                        printf("Detected incompatible DeepSeek model without a known way to fix it.\n");
                        printf("Consider making your own ik_llama.cpp compatible model or\n");
                        printf("ask the model provider to make one for you.\n\n");
                        printf("Sorry, unknown model => cannot fix it => bailing out\n");
                        printf("==========================================================================\n");
                        GGML_ABORT("Fatal error");
                    }
                    printf("================= Adjusted mainline llama.cpp MLA tensors to ik_llama.cpp\n");
                    for (auto& item : hparams.n_head_kv_arr) item = n_head_kv;
                    hparams.n_embd_head_k = 192;
                    hparams.n_embd_head_v = 128;
                    ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k);
                    ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v);
                }
bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
if (!is_lite) {
|
|
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
|
}
|
|
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
|
if (hparams.expert_gating_func == 0) {
|
|
// for compatibility with existing DeepSeek V2 and V2.5 GGUFs
|
|
// that have no expert_gating_func model parameter set
|
|
hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SOFTMAX;
|
|
}
|
|
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 27: model.type = e_model::MODEL_16B; break;
|
|
case 60: model.type = e_model::MODEL_236B; break;
|
|
case 61: model.type = e_model::MODEL_671B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_CHATGLM:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 28: model.type = e_model::MODEL_6B; break;
|
|
case 40: model.type = e_model::MODEL_9B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_GLM4:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
switch (hparams.n_layer) {
|
|
case 40: model.type = e_model::MODEL_9B; break;
|
|
case 61: model.type = e_model::MODEL_32B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_GLM4_MOE:
|
|
{
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
// MoE parameters
|
|
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
|
|
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
|
|
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
|
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
|
|
// Expert gating function (GLM4_MOE uses sigmoid)
|
|
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
|
if (hparams.expert_gating_func == 0) {
|
|
hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID;
|
|
}
|
|
|
|
// NextN/MTP parameters
|
|
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 47: model.type = e_model::MODEL_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
|
|
case 93: model.type = e_model::MODEL_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_BITNET:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 26: model.type = e_model::MODEL_3B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_BITNET_B158:
|
|
case LLM_ARCH_BITNET_25:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 30: model.type = e_model::MODEL_2B; break; // bitnet2b_2501
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_T5:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
|
|
|
|
uint32_t dec_start_token_id;
|
|
if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
|
|
hparams.dec_start_token_id = dec_start_token_id;
|
|
}
|
|
|
|
switch (hparams.n_layer) {
|
|
case 6: model.type = e_model::MODEL_60M; break; // t5-small
|
|
case 8: model.type = e_model::MODEL_80M; break; // flan-t5-small
|
|
case 12:
|
|
switch (hparams.n_ff()) {
|
|
case 3072: model.type = e_model::MODEL_220M; break; // t5-base
|
|
case 2048: model.type = e_model::MODEL_250M; break; // flan-t5-base
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case 24:
|
|
switch (hparams.n_ff()) {
|
|
case 4096: model.type = e_model::MODEL_770M; break; // t5-large
|
|
case 2816: model.type = e_model::MODEL_780M; break; // flan-t5-large
|
|
case 16384: model.type = e_model::MODEL_3B; break; // t5-3b
|
|
case 5120: model.type = e_model::MODEL_3B; break; // flan-t5-xl
|
|
case 65536: model.type = e_model::MODEL_11B; break; // t5-11b
|
|
case 10240: model.type = e_model::MODEL_11B; break; // flan-t5-xxl
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_T5ENCODER:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
|
|
model.type = e_model::MODEL_UNKNOWN;
|
|
} break;
|
|
case LLM_ARCH_JAIS:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 24: model.type = e_model::MODEL_1_3B; break;
|
|
case 40: model.type = e_model::MODEL_13B; break;
|
|
/* TODO: add variants */
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_GRANITE:
|
|
case LLM_ARCH_GRANITE_MOE:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
|
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
|
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
|
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_3B; break;
|
|
case 40: model.type = e_model::MODEL_3B; break;
|
|
// Add additional layer/vocab/etc checks here for other model sizes
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_COHERE2:
|
|
{
|
|
hparams.n_swa_pattern = 4;
|
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_8B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_BAILINGMOE2:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
|
|
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups);
|
|
ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used);
|
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
|
|
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
|
|
|
|
// TODO: when MTP is implemented, this should probably be updated if needed
|
|
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
|
|
|
|
switch (hparams.n_layer) {
|
|
case 20: model.type = MODEL_16B_A1B; break;
|
|
case 21: model.type = MODEL_16B_A1B; break;
|
|
case 32: model.type = MODEL_100B_A6B; break;
|
|
case 33: model.type = MODEL_100B_A6B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_DOTS1:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
|
switch (hparams.n_layer) {
|
|
case 62: model.type = e_model::MODEL_142B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_ERNIE4_5:
|
|
case LLM_ARCH_ERNIE4_5_MOE:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
if (model.arch == LLM_ARCH_ERNIE4_5_MOE) {
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
|
|
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
|
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
}
|
|
|
|
switch (hparams.n_layer) {
|
|
case 18: model.type = e_model::MODEL_0_3B; break;
|
|
case 28: model.type = e_model::MODEL_21B_A3B; break;
|
|
case 54: model.type = e_model::MODEL_300B_A47B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_HUNYUAN_MOE:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 32: model.type = e_model::MODEL_80B_A13B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_OPENAI_MOE:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
|
|
|
//TODO OAI_MOE: SWA
|
|
//hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
|
//hparams.set_swa_pattern(2);
|
|
|
|
// TODO: switch (hparams.n_layer)
|
|
|
|
} break;
|
|
case LLM_ARCH_MINIMAX_M2:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
|
|
|
switch (hparams.n_layer) {
|
|
case 62: model.type = e_model::MODEL_230B_A10B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_SMOLLM3:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
hparams.n_no_rope_layer_step = 4;
|
|
|
|
switch (hparams.n_layer) {
|
|
case 36: model.type = e_model::MODEL_3B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_MISTRAL3:
|
|
{
|
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);
|
|
|
|
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false);
|
|
ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false);
|
|
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
|
|
|
|
if (hparams.f_attn_temp_scale != 0.0f) {
|
|
hparams.n_attn_temp_floor_scale = hparams.n_ctx_orig_yarn;
|
|
if (hparams.n_attn_temp_floor_scale == 0) {
|
|
throw std::runtime_error("invalid n_ctx_orig_yarn for attention temperature scaling");
|
|
}
|
|
}
|
|
|
|
// TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
|
|
// but may need further verification with other values
|
|
if (hparams.rope_yarn_log_mul != 0.0f) {
|
|
float factor = 1.0f / hparams.rope_freq_scale_train;
|
|
float mscale = 1.0f;
|
|
float mscale_all_dims = hparams.rope_yarn_log_mul;
|
|
static auto get_mscale = [](float scale, float mscale) {
|
|
return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
|
|
};
|
|
hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
|
|
}
|
|
|
|
switch (hparams.n_layer) {
|
|
case 26: model.type = e_model::MODEL_3B; break;
|
|
case 34: model.type = e_model::MODEL_8B; break;
|
|
case 40: model.type = e_model::MODEL_14B; break;
|
|
default: model.type = e_model::MODEL_UNKNOWN;
|
|
}
|
|
} break;
|
|
case LLM_ARCH_MIMO2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
                //TODO
                //hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; // which is the same as OpenAI
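                // sliding-window pattern: a scalar applies to all layers, an array gives a per-layer value
                // (presumably marking which layers use SWA vs full attention)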
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);

                switch (hparams.n_layer) {
                    case 48: model.type = e_model::MODEL_310B_A15B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }

            } break;

        default: (void)0;
    }

    model.ftype = ml.ftype;

    if (hparams.f_max_alibi_bias > 0.0f) {
        hparams.use_alibi = true;
    }

    hparams.rope_type = llama_rope_type(&model);
}