From 3d5b854aee86c8a17d5fe18285d300bf431b0b16 Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Wed, 21 Jan 2026 07:08:54 +0000
Subject: [PATCH] Make comments more precise when experts gating function is missing

---
 src/llama-hparams.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 7b889c9c..24d07a9a 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -789,10 +789,13 @@ void llm_load_hparams(
         hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_TYPE_NONE;
         ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
         if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
-            // Some models don't have the experts gating function recorded in the GGUF
+            // Older DeepSeek models from the 2.0/2.5 series may not have the experts gating function recorded in the GGUF.
+            // Such models use SOFTMAX as the experts gating function.
+            // GLM-4.7-Flash, new as of this commit, may also be missing the experts gating function.
+            // GLM-4.7-Flash uses SIGMOID as the experts gating function.
             // Hence, we make the LLM_KV_EXPERT_GATING_FUNC entry optional, and set here if missing.
-            // DeepSeek models normally have softmax as gating function, but there is GLM-4.7-Flash now
-            // (identified via number of layers being 47 or 48), which uses sigmoid.
+            // We distinguish between GLM-4.7-Flash and DeepSeek-2/2.5 models by the number of layers.
+            // GLM-4.7-Flash has 47 layers (or 48, if an MTP layer is included in the GGUF).
             hparams.expert_gating_func = hparams.n_layer == 47 || hparams.n_layer == 48
                 ? LLM_EXPERT_GATING_FUNC_SIGMOID : LLM_EXPERT_GATING_FUNC_SOFTMAX;
             printf("================= Missing experts gating function -> set to %s\n",
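
To see the fallback heuristic from the hunk above in isolation, here is a minimal standalone sketch. It is not the code from src/llama-hparams.cpp: the enum values and the helper name default_expert_gating_func are stand-ins invented for this example; only the 47/48-layer rule and the SOFTMAX/SIGMOID defaults come from the patch.

// Minimal standalone sketch of the fallback heuristic described in the comments
// in the patch. The enum values and helper name are illustrative stand-ins, not
// the definitions used by llama-hparams.cpp.
#include <cstdint>
#include <cstdio>

enum llm_expert_gating_func_type : uint32_t {
    LLM_EXPERT_GATING_FUNC_TYPE_NONE = 0,  // values here are assumed for the sketch
    LLM_EXPERT_GATING_FUNC_SOFTMAX   = 1,
    LLM_EXPERT_GATING_FUNC_SIGMOID   = 2,
};

// Gating function to assume when LLM_KV_EXPERT_GATING_FUNC is absent from the GGUF:
// GLM-4.7-Flash (47 layers, or 48 when an MTP layer is included) uses sigmoid;
// the DeepSeek-2/2.5 models that lack the key use softmax.
static llm_expert_gating_func_type default_expert_gating_func(uint32_t n_layer) {
    return (n_layer == 47 || n_layer == 48)
        ? LLM_EXPERT_GATING_FUNC_SIGMOID
        : LLM_EXPERT_GATING_FUNC_SOFTMAX;
}

int main() {
    printf("47 layers -> %d (sigmoid expected)\n", (int) default_expert_gating_func(47));
    printf("60 layers -> %d (softmax expected)\n", (int) default_expert_gating_func(60));
    return 0;
}

In the actual loader the metadata key is read with ml.get_key(LLM_KV_EXPERT_GATING_FUNC, ..., false), i.e. as optional, so this layer-count heuristic only runs when the GGUF genuinely lacks the entry.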