Be able to set reduce op data type for split mode "graph"

2026-03-04 19:10:03 +00:00 · 2025-12-24 10:57:41 +00:00
parent 1d7d0225a0
commit c6a3903571
7 changed files with 23 additions and 6 deletions
--- a/include/llama.h
+++ b/include/llama.h
@@ -444,6 +444,7 @@ extern "C" {
        bool only_active_experts;
        bool k_cache_hadamard;  // if true, apply Hadamard transform to K-cache
        bool split_mode_graph_scheduling; // if true, force split mode graph scheduling
+        bool split_mode_f16;    // if true, cast intermediate results to f16 before copying to other GPUs

        // Abort callback
        // if it returns true, execution of llama_decode() will be aborted