Additional graph reduce types for split mode graph (#1154)

* WIP: add Q8_0 and BF16 as possible reduce types

Does not work - there is a bug somewhere

* This finally works
Author: Kawrakow
Date: 2026-01-18 08:02:49 +02:00
Committed by: GitHub
Parent: ee463b079e
Commit: 7024fdbc72
10 changed files with 347 additions and 86 deletions
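
A usage sketch, based only on what this commit adds (the binary name, model path and other flags are placeholders, not part of the change; only -grt / --graph-reduce-type and its accepted values f32, f16, bf16, q8_0 come from the diff below):

    # pick the type used for data exchange between GPUs in split mode graph
    ./llama-cli -m model.gguf -ngl 99 -grt q8_0

As the first hunk below shows, the older -smf16 / -smf32 flags are kept and now simply set the reduce type to f16 or f32.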


@@ -1459,11 +1459,18 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     if (arg == "-smf16" || arg == "--split-mode-f16") {
-        params.split_mode_f16 = true;
+        params.reduce_type = "f16";
+        //params.split_mode_f16 = true;
         return true;
     }
     if (arg == "-smf32" || arg == "--split-mode-f32") {
-        params.split_mode_f16 = false;
+        params.reduce_type = "f32";
+        //params.split_mode_f16 = false;
         return true;
     }
+    if (arg == "-grt" || arg == "--graph-reduce-type") {
+        CHECK_ARG
+        params.reduce_type = argv[i];
+        return true;
+    }
     if (arg == "--numa") {
@@ -2154,8 +2161,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-mqkv, --merge-qkv,", "merge Q,K,V (default: %d)", params.merge_qkv});
     options.push_back({ "*", "-muge, --merge-up-gate-experts,","merge ffn_up/gate_exps (default: %d)", params.merge_up_gate_exps});
     options.push_back({ "*", "-khad, --k-cache-hadamard,", "Use Hadamard transform for K-cache (default: %d)", params.k_cache_hadamard});
-    options.push_back({ "*", "-smf16, --split-mode-f16,", "Use f16 for data exchange between GPUs (default: %d)", params.split_mode_f16});
-    options.push_back({ "*", "-smf32, --split-mode-f32,", "Use f32 for data exchange between GPUs (default: %d)", !params.split_mode_f16});
+    options.push_back({ "*", "-smf16, --split-mode-f16,", "Use f16 for data exchange between GPUs (default: %d)", true});
+    options.push_back({ "*", "-smf32, --split-mode-f32,", "Use f32 for data exchange between GPUs (default: %d)", false});
+    options.push_back({ "*", "-grt, --graph-reduce-type", "Type for data exchange between GPUs (default: %s)", "f32"});
     options.push_back({ "*", "-smgs, --split-mode-graph-scheduling,", "Force Split Mode Graph Scheduling (default: %d)", params.split_mode_graph_scheduling});
     options.push_back({ "*", "-sas, ==scheduler_async,", "Async evaluation of compute graphs: %d)", params.scheduler_async});
     options.push_back({ "*", "-vq, --validate-quants", "validate quantized data while loading the model (default: %d)", params.validate_quants});
@@ -3148,6 +3156,22 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
     throw std::runtime_error("Invalid cache type: " + s);
 }
 
+static ggml_type ggml_type_from_str(const std::string & s) {
+    if (s == "f32") {
+        return GGML_TYPE_F32;
+    }
+    if (s == "f16") {
+        return GGML_TYPE_F16;
+    }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
+    if (s == "q8_0") {
+        return GGML_TYPE_Q8_0;
+    }
+    throw std::runtime_error("Invalid graph reduce type: " + s);
+}
+
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto cparams = llama_context_default_params();
     int n_batch = params.n_batch;
@@ -3194,7 +3218,7 @@ struct llama_context_params llama_context_params_from_gpt_param
     cparams.graph_reuse = params.graph_reuse;
     cparams.k_cache_hadamard = params.k_cache_hadamard;
     cparams.split_mode_graph_scheduling = params.split_mode_graph_scheduling;
-    cparams.split_mode_f16 = params.split_mode_f16;
+    //cparams.split_mode_f16 = params.split_mode_f16;
     cparams.scheduler_async = params.scheduler_async;
     cparams.min_experts = params.min_experts;
     cparams.thresh_experts = params.thresh_experts;
@@ -3203,6 +3227,7 @@ struct llama_context_params llama_context_params_from_gpt_param
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+    cparams.type_reduce = ggml_type_from_str(params.reduce_type);
 
     if (!params.offload_policy.empty()) cparams.offload_policy = (void *)&params.offload_policy;
     if (!params.cuda_params.empty()) cparams.cuda_params = (void *)params.cuda_params.data();
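
For programmatic use, the same setting can be applied directly to the context parameters. A minimal sketch, assuming only what is visible in this commit (llama_context_default_params() and the new type_reduce field) plus the GGML_TYPE_* constants from ggml.h; the surrounding setup is hypothetical:

    #include "llama.h"

    // quantize intermediate results to Q8_0 before they are copied to the other
    // GPUs for the reduce step of a split mode graph run
    llama_context_params cparams = llama_context_default_params();
    cparams.type_reduce = GGML_TYPE_Q8_0;
    // ... fill in the remaining fields and pass cparams when creating the llama context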
@@ -4180,7 +4205,8 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "graph_reuse: %s # default: false\n", params.graph_reuse ? "true" : "false");
     fprintf(stream, "k_cache_hadamard: %s # default: false\n", params.k_cache_hadamard ? "true" : "false");
     fprintf(stream, "split_mode_graph_scheduling: %s # default: false\n", params.split_mode_graph_scheduling ? "true" : "false");
-    fprintf(stream, "split_mode_f16: %s # default: true\n", params.split_mode_f16 ? "true" : "false");
+    //fprintf(stream, "split_mode_f16: %s # default: true\n", params.split_mode_f16 ? "true" : "false");
+    fprintf(stream, "reduce_type: %s # default f16\n", params.reduce_type.c_str());
     fprintf(stream, "scheduler_async: %s # default: false\n", params.scheduler_async ? "true" : "false");
     fprintf(stream, "ser: %d,%g # defaulr: -1,0\n", params.min_experts, params.thresh_experts);
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);


@@ -290,7 +290,7 @@ struct gpt_params {
     bool merge_up_gate_exps= false; // if true, merge ffn_up_exps and ffn_gate_exps into a single, contiguous tensor
     bool k_cache_hadamard = false; // if true, use Hadamard transform for the K-cache (only makes sense with quantized cache)
     bool split_mode_graph_scheduling = false; // if true, force split mode graph scheduling
-    bool split_mode_f16 = true; // if true, intermediate results will be cast to f16 before copying to other GPUs to perform reduce ops
+    //bool split_mode_f16 = true; // if true, intermediate results will be cast to f16 before copying to other GPUs to perform reduce ops
     bool scheduler_async = false; // if true, in split mode graph the scheduler will use multiple threads to evaluate the graph
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
@@ -298,6 +298,8 @@ struct gpt_params {
     std::string cache_type_k_draft = ""; // KV cache data type for K for the draft model
     std::string cache_type_v_draft = ""; // KV cache data type for V for the draft model
+    std::string reduce_type = "f16";
+
     // multimodal models (see examples/mtmd)
     model_paths mmproj;
     bool mmproj_use_gpu = true; // use GPU for multimodal model