Correct GLM-4.7-Flash gating function

2026-04-20 22:49:31 +00:00 · 2026-01-21 05:38:36 +00:00
parent 6f1a69352f
commit 06bfd8861b
1 changed file with 22 additions and 4 deletions
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -27,6 +27,14 @@ const char * llama_hparams::rope_scaling_type_name(llama_rope_scaling_type type)
    return LLAMA_ROPE_SCALING_TYPES.at(type);
 }

+// Maps an expert gating function type to a short printable name.
+static inline const char * llm_expert_gating_func_name(llm_expert_gating_func_type type) {
+    if (type == LLM_EXPERT_GATING_FUNC_SOFTMAX) return "softmax";
+    if (type == LLM_EXPERT_GATING_FUNC_SIGMOID) return "sigmoid";
+    if (type == LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) return "weight";
+    // Every value not listed above is reported as "none".
+    return "none";
+}


 void llm_load_hparams(
@@ -779,10 +787,20 @@ void llm_load_hparams(
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
-	            if (hparams.expert_gating_func == 0) {
-                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
-                    // that have no expert_gating_func model parameter set
-                    hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SOFTMAX;
+                if (hparams.n_layer == 47 || hparams.n_layer == 48) {
+                    if (hparams.expert_gating_func != LLM_EXPERT_GATING_FUNC_SIGMOID) {
+                        printf("============== Corrected experts gating function from %s to %s\n",
+                                llm_expert_gating_func_name(llm_expert_gating_func_type(hparams.expert_gating_func)),
+                                llm_expert_gating_func_name(LLM_EXPERT_GATING_FUNC_SIGMOID));
+                        hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SIGMOID;
+                    }
+                } else {
+                    if (hparams.expert_gating_func == LLM_EXPERT_GATING_FUNC_TYPE_NONE) {
+                        // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+                        // that have no expert_gating_func model parameter set
+                        printf("============== Missing experts gating function -> set to SOFTMAX\n");
+                        hparams.expert_gating_func = LLM_EXPERT_GATING_FUNC_SOFTMAX;
+                    }
                }
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);