Split mode "graph" for GPT-OSS (#1118)

* Split mode "graph" for GPT-OSS

* Force split_mode_f16 to false

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-02-08 15:30:15 +00:00 · 2026-01-08 09:14:15 +02:00
parent 1b24192873
commit d581d75537
3 changed files with 57 additions and 80 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1734,6 +1734,7 @@ static bool is_model_split_supported(const llama_model & model) {
        LLM_ARCH_QWEN3,
        LLM_ARCH_QWEN3VL,
        LLM_ARCH_HUNYUAN_MOE,
+        LLM_ARCH_OPENAI_MOE,
    };
    auto it =  k_supported.find(model.arch);
    return it != k_supported.end();
@@ -4432,6 +4433,15 @@ struct llama_context * llama_new_context_with_model(
        //LLAMA_LOG_WARN("=====================================================================\n");
        cparams.mla_attn = 0;
    }
+    if (model->arch == LLM_ARCH_OPENAI_MOE && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
+        if (cparams.split_mode_f16) {
+            LLAMA_LOG_WARN("=====================================================================\n");
+            LLAMA_LOG_WARN("GPT-OSS with split mode graph requires f32 precision\n");
+            LLAMA_LOG_WARN("    => changing cparams.split_mode_f16 to 'false'\n");
+            LLAMA_LOG_WARN("=====================================================================\n");
+            cparams.split_mode_f16 = false;
+        }
+    }

    LLAMA_LOG_INFO("%s: n_ctx         = %u\n",     __func__, cparams.n_ctx);
    LLAMA_LOG_INFO("%s: n_batch       = %u\n",     __func__, cparams.n_batch);