From d7c0104967a653ac70c2bf6f4bb222a23118a223 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Tue, 24 Feb 2026 12:59:07 +0000 Subject: [PATCH] Change meaning of fdn from bool flag to threshold value --- common/common.cpp | 7 ++++--- common/common.h | 2 +- examples/llama-bench/llama-bench.cpp | 16 ++++++++-------- include/llama.h | 2 +- src/llama-cparams.h | 2 +- src/llama-delta-net.cpp | 2 +- src/llama.cpp | 2 +- 7 files changed, 17 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 83c8d562..6486c097 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1532,7 +1532,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-fdn" || arg == "--fused-delta-net") { - params.fused_delta_net = true; + CHECK_ARG + params.fused_delta_net = std::stoi(argv[i]); return true; } if (arg == "-smf16" || arg == "--split-mode-f16") { @@ -2262,7 +2263,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-grt, --graph-reduce-type", "Type for data exchange between GPUs (default: %s)", "f32"}); options.push_back({ "*", "-smgs, --split-mode-graph-scheduling,", "Force Split Mode Graph Scheduling (default: %d)", params.split_mode_graph_scheduling}); options.push_back({ "*", "-sas, --scheduler_async,", "Async evaluation of compute graphs: %d)", params.scheduler_async}); - options.push_back({ "*", "-fdn, --fused-delta-net", "Use fused delta-net for TH with recurrent models: %d)", params.fused_delta_net}); + options.push_back({ "*", "-fdn, --fused-delta-net N", "Use fused delta-net when batch size is <= N with recurrent models: %d)", params.fused_delta_net}); options.push_back({ "*", "-vq, --validate-quants", "validate quantized data while loading the model (default: %d)", params.validate_quants}); options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" "in conversation mode, this will be used as system 
prompt\n" @@ -4352,7 +4353,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l //fprintf(stream, "split_mode_f16: %s # default: true\n", params.split_mode_f16 ? "true" : "false"); fprintf(stream, "reduce_type: %s # default f16\n", params.reduce_type.c_str()); fprintf(stream, "scheduler_async: %s # default: false\n", params.scheduler_async ? "true" : "false"); - fprintf(stream, "fused_delta_net: %s # default: false\n", params.fused_delta_net ? "true" : "false"); + fprintf(stream, "fused_delta_net: %d # default: 0\n", params.fused_delta_net ); fprintf(stream, "ser: %d,%g # defaulr: -1,0\n", params.min_experts, params.thresh_experts); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); diff --git a/common/common.h b/common/common.h index b29c9d97..c14d2e22 100644 --- a/common/common.h +++ b/common/common.h @@ -357,7 +357,7 @@ struct gpt_params { bool split_mode_graph_scheduling = false; // if true, force split mode graph scheduling //bool split_mode_f16 = true; // if true, intermediate results will be cast to f16 before copying to other GPUs to perform reduce ops bool scheduler_async = false; // if true, in split mode graph the scheduler will use multiple threads to evaluate the graph - bool fused_delta_net = false; // if true, use fused delta-net for TG with hybrid/recurrent models + int fused_delta_net = 0; // use fused delta-net if number of tokens in the batch is less than or equal to this value bool has_mtp = false; // enable MTP if supported by the model std::string cache_type_k = "f16"; // KV cache data type for the K diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 0a73dbb2..aeb78f89 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -271,7 +271,7 @@ struct cmd_params { bool muge = false; bool rcache = false; bool sas = false; - bool fdn = false; // fdn = fused delta net + int fdn = 0; // fdn = fused delta net bool print_overrides = false; 
output_formats output_format; output_formats output_format_stderr; @@ -317,7 +317,7 @@ static const cmd_params cmd_params_defaults = { /* muge */ false, /* rcache */ false, /* sas */ false, - /* fdn */ false, + /* fdn */ 0, /* print_overrides */ false, /* output_format */ MARKDOWN, /* output_format_stderr */ NONE, @@ -371,7 +371,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -no-fug, --no-fused-up-gate <0|1> (default: %s)\n", cmd_params_defaults.no_fug? "1" : "0"); printf(" -no-ooae, --no-offload-only-active-experts <0|1> (default: %s)\n", cmd_params_defaults.no_ooae? "1" : "0"); printf(" -sas, --scheduler-async <0|1> (default: %s)\n", cmd_params_defaults.sas ? "1" : "0"); - printf(" -fdn, --fused-delta-net <0|1> (default: %s)\n", cmd_params_defaults.fdn ? "1" : "0"); + printf(" -fdn, --fused-delta-net (default: %d)\n", cmd_params_defaults.fdn); printf(" --print-overrides <0|1> (default: %s)\n", cmd_params_defaults.print_overrides ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); @@ -965,7 +965,7 @@ struct cmd_params_instance { bool muge = false; bool rcache = false; bool sas = false; - bool fdn = false; + int fdn = 0; const llama_model_tensor_buft_override* buft_overrides; llama_model_params to_llama_mparams() const { @@ -1282,7 +1282,7 @@ struct test { bool muge = false; bool rcache = false; bool sas = false; - bool fdn = false; + int fdn = 0; std::string override_tensor; int n_prompt; int n_gen; @@ -1429,14 +1429,14 @@ struct test { field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "mla_attn" || field == "attn_max_batch" || - field == "avg_ns" || field == "stddev_ns") { + field == "avg_ns" || field == "stddev_ns" || field == "fdn") { return INT; } if (field == "cuda" || field == "vulkan" || field == "kompute" || field == 
"metal" || field == "gpu_blas" || field == "blas" || field == "sycl" || field == "no_kv_offload" || field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" || field == "fused_moe" || field == "grouped_er" || field == "no_fused_up_gate" || field == "no_ooae" || field == "mqkv" || - field == "rcache" || field == "reuse" || field == "muge" || field == "sas" || field == "fnd") { + field == "rcache" || field == "reuse" || field == "muge" || field == "sas") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -1692,7 +1692,7 @@ struct markdown_printer : public printer { return 3; } if (field == "fdn") { - return 3; + return 4; } if (field == "use_thp") { return 3; diff --git a/include/llama.h b/include/llama.h index f1b43a5a..01f843b5 100644 --- a/include/llama.h +++ b/include/llama.h @@ -456,7 +456,7 @@ extern "C" { bool split_mode_graph_scheduling; // if true, force split mode graph scheduling //bool split_mode_f16; // if true, cast intermediate results to f16 before copying to other GPUs bool scheduler_async; // if true, with split mode "graph" graph evaluation will be done using multiple threads - bool fused_delta_net; + int fused_delta_net; bool mtp; // Activate MTP if supported enum llama_mtp_op_type mtp_op_type; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 05bda231..6ac0a3a3 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -43,7 +43,7 @@ struct llama_cparams { bool split_mode_graph_scheduling; //bool split_mode_f16; bool scheduler_async; - bool fused_delta_net; + int fused_delta_net; int min_experts; float thresh_experts; bool mtp; diff --git a/src/llama-delta-net.cpp b/src/llama-delta-net.cpp index 41c18752..840bebf4 100644 --- a/src/llama-delta-net.cpp +++ b/src/llama-delta-net.cpp @@ -686,7 +686,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ std::pair attn_out; // The fused delta-net implementation is only faster than 
chunked for n_tok <= 8, so use it only in that case - attn_out = lctx.cparams.fused_delta_net && n_tok <= 8 ? build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb) : + attn_out = n_tok <= lctx.cparams.fused_delta_net ? build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb) : n_tok == 1 ? build_delta_net_autoregressive(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb) : build_delta_net_chunking(ctx0, q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il, cb); diff --git a/src/llama.cpp b/src/llama.cpp index 00f24c2c..2641f988 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4380,7 +4380,7 @@ struct llama_context_params llama_context_default_params() { /*.split_mode_graph_scheduling =*/ false, // /*.split_mode_f16 =*/ true, /*.scheduler_async =*/ false, - /*.fused_delta_net =*/ false, + /*.fused_delta_net =*/ 0, /*.mtp =*/ false, /*.mtp_op_type =*/ MTP_OP_NONE, /*.abort_callback =*/ nullptr,