Change meaning of fdn from bool flag to threshold value

2026-03-01 01:24:08 +00:00 · 2026-02-24 12:59:07 +00:00
parent b184e84480
commit d7c0104967
7 changed files with 17 additions and 16 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1532,7 +1532,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        return true;
    }
    if (arg == "-fdn" || arg == "--fused-delta-net") {
-        params.fused_delta_net = true;
+        CHECK_ARG
+        params.fused_delta_net = std::stoi(argv[i]);
        return true;
    }
    if (arg == "-smf16" || arg == "--split-mode-f16") {
@@ -2262,7 +2263,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",         "-grt, --graph-reduce-type",       "Type for data exchange between GPUs (default: %s)", "f32"});
    options.push_back({ "*",         "-smgs, --split-mode-graph-scheduling,", "Force Split Mode Graph Scheduling (default: %d)", params.split_mode_graph_scheduling});
    options.push_back({ "*",         "-sas,  --scheduler_async,",       "Async evaluation of compute graphs: %d)", params.scheduler_async});
-    options.push_back({ "*",         "-fdn,  --fused-delta-net",        "Use fused delta-net for TH with recurrent models: %d)", params.fused_delta_net});
+    options.push_back({ "*",         "-fdn,  --fused-delta-net N",      "Use fused delta-net when batch size is <= N with recurrent models: %d)", params.fused_delta_net});
    options.push_back({ "*",         "-vq, --validate-quants",          "validate quantized data while loading the model (default: %d)", params.validate_quants});
    options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with\n"
                                                                        "in conversation mode, this will be used as system prompt\n"
@@ -4352,7 +4353,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
    //fprintf(stream, "split_mode_f16: %s # default: true\n", params.split_mode_f16 ? "true" : "false");
    fprintf(stream, "reduce_type: %s # default f16\n", params.reduce_type.c_str());
    fprintf(stream, "scheduler_async: %s # default: false\n", params.scheduler_async ? "true" : "false");
-    fprintf(stream, "fused_delta_net: %s # default: false\n", params.fused_delta_net ? "true" : "false");
+    fprintf(stream, "fused_delta_net: %d # default: 0\n", params.fused_delta_net );
    fprintf(stream, "ser: %d,%g # defaulr: -1,0\n", params.min_experts, params.thresh_experts);
    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

--- a/common/common.h
+++ b/common/common.h
@@ -357,7 +357,7 @@ struct gpt_params {
    bool split_mode_graph_scheduling = false; // if true, force split mode graph scheduling
    //bool split_mode_f16    = true;  // if true, intermediate results will be cast to f16 before copying to other GPUs to perform reduce ops
    bool scheduler_async   = false; // if true, in split mode graph the scheduler will use multiple threads to evaluate the graph
-    bool fused_delta_net   = false; // if true, use fused delta-net for TG with hybrid/recurrent models
+    int  fused_delta_net   = 0;     // use fused delta-net if the number of tokens in the batch is less than or equal to this value
    bool has_mtp           = false; // enable MTP if supported by the model

    std::string cache_type_k = "f16"; // KV cache data type for the K
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -271,7 +271,7 @@ struct cmd_params {
    bool muge = false;
    bool rcache = false;
    bool sas = false;
-    bool fdn = false; // fdn = fused delta net
+    int  fdn = 0; // fdn = fused delta net threshold value (no longer a 0/1 flag)
    bool print_overrides = false;
    output_formats output_format;
    output_formats output_format_stderr;
@@ -317,7 +317,7 @@ static const cmd_params cmd_params_defaults = {
    /* muge                 */ false,
    /* rcache               */ false,
    /* sas                  */ false,
-    /* fdn                  */ false,
+    /* fdn                  */ 0,
    /* print_overrides      */ false,
    /* output_format        */ MARKDOWN,
    /* output_format_stderr */ NONE,
@@ -371,7 +371,7 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -no-fug, --no-fused-up-gate <0|1>   (default: %s)\n", cmd_params_defaults.no_fug? "1" : "0");
    printf("  -no-ooae, --no-offload-only-active-experts <0|1>   (default: %s)\n", cmd_params_defaults.no_ooae? "1" : "0");
    printf("  -sas, --scheduler-async <0|1>       (default: %s)\n", cmd_params_defaults.sas ? "1" : "0");
-    printf("  -fdn, --fused-delta-net <0|1>       (default: %s)\n", cmd_params_defaults.fdn ? "1" : "0");
+    printf("  -fdn, --fused-delta-net <n>         (default: %d)\n", cmd_params_defaults.fdn);
    printf("        --print-overrides <0|1>       (default: %s)\n", cmd_params_defaults.print_overrides ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -965,7 +965,7 @@ struct cmd_params_instance {
    bool muge = false;
    bool rcache = false;
    bool sas = false;
-    bool fdn = false;
+    int fdn = 0;
    const llama_model_tensor_buft_override* buft_overrides;

    llama_model_params to_llama_mparams() const {
@@ -1282,7 +1282,7 @@ struct test {
    bool muge = false;
    bool rcache = false;
    bool sas = false;
-    bool fdn = false;
+    int fdn = 0;
    std::string override_tensor;
    int n_prompt;
    int n_gen;
@@ -1429,14 +1429,14 @@ struct test {
            field == "model_size" || field == "model_n_params" ||
            field == "n_gpu_layers" || field == "main_gpu" ||
            field == "n_prompt" || field == "n_gen" || field == "mla_attn" || field == "attn_max_batch" ||
-            field == "avg_ns" || field == "stddev_ns") {
+            field == "avg_ns" || field == "stddev_ns" || field == "fdn") {
            return INT;
        }
        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" || field == "no_kv_offload" ||
            field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" ||
            field == "fused_moe" || field == "grouped_er" || field == "no_fused_up_gate" || field == "no_ooae" || field == "mqkv" ||
-            field == "rcache" || field == "reuse" || field == "muge" || field == "sas" || field == "fnd") {
+            field == "rcache" || field == "reuse" || field == "muge" || field == "sas") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -1692,7 +1692,7 @@ struct markdown_printer : public printer {
            return 3;
        }
        if (field == "fdn") {
-            return 3;
+            return 4;
        }
        if (field == "use_thp") {
            return 3;
--- a/include/llama.h
+++ b/include/llama.h
@@ -456,7 +456,7 @@ extern "C" {
        bool split_mode_graph_scheduling; // if true, force split mode graph scheduling
        //bool split_mode_f16;    // if true, cast intermediate results to f16 before copying to other GPUs
        bool scheduler_async;   // if true, with split mode "graph" graph evaluation will be done using multiple threads
-        bool fused_delta_net;
+        int  fused_delta_net;
        bool mtp;   // Activate MTP if supported
        enum llama_mtp_op_type mtp_op_type;

--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -43,7 +43,7 @@ struct llama_cparams {
    bool split_mode_graph_scheduling;
    //bool split_mode_f16;
    bool scheduler_async;
-    bool fused_delta_net;
+    int  fused_delta_net;
    int  min_experts;
    float thresh_experts;
    bool mtp;
--- a/src/llama-delta-net.cpp
+++ b/src/llama-delta-net.cpp
@@ -686,7 +686,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_

    std::pair<ggml_tensor *, ggml_tensor *> attn_out;
    // The fused delta-net implementation is only faster than chunked for n_tok <= 8, so use it only in that case
-    attn_out = lctx.cparams.fused_delta_net && n_tok <= 8 ? build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb) :
+    attn_out = n_tok <= lctx.cparams.fused_delta_net ? build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb) :
        n_tok == 1 ? build_delta_net_autoregressive(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb)
                   : build_delta_net_chunking(ctx0, q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il, cb);

--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4380,7 +4380,7 @@ struct llama_context_params llama_context_default_params() {
        /*.split_mode_graph_scheduling =*/ false,
        // /*.split_mode_f16           =*/ true,
        /*.scheduler_async             =*/ false,
-        /*.fused_delta_net             =*/ false,
+        /*.fused_delta_net             =*/ 0,
        /*.mtp                         =*/ false,
        /*.mtp_op_type                 =*/ MTP_OP_NONE,
        /*.abort_callback              =*/ nullptr,