Additional graph reduce types for split mode graph (#1154)

* WIP: add Q8_0 and BF16 as possible reduce types

Does not work - there is a bug somewhere

* This finally works
This commit is contained in:
Kawrakow
2026-01-18 08:02:49 +02:00
committed by GitHub
parent ee463b079e
commit 7024fdbc72
10 changed files with 347 additions and 86 deletions

View File

@@ -426,6 +426,7 @@ extern "C" {
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
enum ggml_type type_reduce; // data type for reduce operations
// Keep the booleans together to avoid misalignment during copy-by-value.
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
@@ -445,7 +446,7 @@ extern "C" {
bool only_active_experts;
bool k_cache_hadamard; // if true, apply Hadamard transform to K-cache
bool split_mode_graph_scheduling; // if true, force split mode graph scheduling
bool split_mode_f16; // if true, cast intermediate results to f16 before copying to other GPUs
//bool split_mode_f16; // if true, cast intermediate results to f16 before copying to other GPUs
bool scheduler_async; // if true, with split mode "graph" graph evaluation will be done using multiple threads
// Abort callback