From d7c0104967a653ac70c2bf6f4bb222a23118a223 Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Tue, 24 Feb 2026 12:59:07 +0000 Subject: [PATCH] Change meaning of fdn from bool flag to threshold value --- common/common.cpp | 7 ++++--- common/common.h | 2 +- examples/llama-bench/llama-bench.cpp | 16 ++++++++-------- include/llama.h | 2 +- src/llama-cparams.h | 2 +- src/llama-delta-net.cpp | 2 +- src/llama.cpp | 2 +- 7 files changed, 17 insertions(+), 16 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 83c8d562..6486c097 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1532,7 +1532,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-fdn" || arg == "--fused-delta-net") { - params.fused_delta_net = true; + CHECK_ARG + params.fused_delta_net = std::stoi(argv[i]); return true; } if (arg == "-smf16" || arg == "--split-mode-f16") { @@ -2262,7 +2263,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-grt, --graph-reduce-type", "Type for data exchange between GPUs (default: %s)", "f32"}); options.push_back({ "*", "-smgs, --split-mode-graph-scheduling,", "Force Split Mode Graph Scheduling (default: %d)", params.split_mode_graph_scheduling}); options.push_back({ "*", "-sas, --scheduler_async,", "Async evaluation of compute graphs: %d)", params.scheduler_async}); - options.push_back({ "*", "-fdn, --fused-delta-net", "Use fused delta-net for TH with recurrent models: %d)", params.fused_delta_net}); + options.push_back({ "*", "-fdn, --fused-delta-net N", "Use fused delta-net when batch size is <= N with recurrent models: %d)", params.fused_delta_net}); options.push_back({ "*", "-vq, --validate-quants", "validate quantized data while loading the model (default: %d)", params.validate_quants}); options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n" "in conversation mode, this will be used as system 
prompt\n" @@ -4352,7 +4353,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l //fprintf(stream, "split_mode_f16: %s # default: true\n", params.split_mode_f16 ? "true" : "false"); fprintf(stream, "reduce_type: %s # default f16\n", params.reduce_type.c_str()); fprintf(stream, "scheduler_async: %s # default: false\n", params.scheduler_async ? "true" : "false"); - fprintf(stream, "fused_delta_net: %s # default: false\n", params.fused_delta_net ? "true" : "false"); + fprintf(stream, "fused_delta_net: %d # default: 0\n", params.fused_delta_net ); fprintf(stream, "ser: %d,%g # defaulr: -1,0\n", params.min_experts, params.thresh_experts); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); diff --git a/common/common.h b/common/common.h index b29c9d97..c14d2e22 100644 --- a/common/common.h +++ b/common/common.h @@ -357,7 +357,7 @@ struct gpt_params { bool split_mode_graph_scheduling = false; // if true, force split mode graph scheduling //bool split_mode_f16 = true; // if true, intermediate results will be cast to f16 before copying to other GPUs to perform reduce ops bool scheduler_async = false; // if true, in split mode graph the scheduler will use multiple threads to evaluate the graph - bool fused_delta_net = false; // if true, use fused delta-net for TG with hybrid/recurrent models + int fused_delta_net = 0; // use fused delta-net if number of tokens in the batch is less than or equal to this value bool has_mtp = false; // enable MTP if supported by the model std::string cache_type_k = "f16"; // KV cache data type for the K diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 0a73dbb2..aeb78f89 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -271,7 +271,7 @@ struct cmd_params { bool muge = false; bool rcache = false; bool sas = false; - bool fdn = false; // fdn = fused delta net + int fdn = 0; // fdn = fused delta net bool print_overrides = false; 
output_formats output_format; output_formats output_format_stderr; @@ -317,7 +317,7 @@ static const cmd_params cmd_params_defaults = { /* muge */ false, /* rcache */ false, /* sas */ false, - /* fdn */ false, + /* fdn */ 0, /* print_overrides */ false, /* output_format */ MARKDOWN, /* output_format_stderr */ NONE, @@ -371,7 +371,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -no-fug, --no-fused-up-gate <0|1> (default: %s)\n", cmd_params_defaults.no_fug? "1" : "0"); printf(" -no-ooae, --no-offload-only-active-experts <0|1> (default: %s)\n", cmd_params_defaults.no_ooae? "1" : "0"); printf(" -sas, --scheduler-async <0|1> (default: %s)\n", cmd_params_defaults.sas ? "1" : "0"); - printf(" -fdn, --fused-delta-net <0|1> (default: %s)\n", cmd_params_defaults.fdn ? "1" : "0"); + printf(" -fdn, --fused-delta-net (default: %d)\n", cmd_params_defaults.fdn); printf(" --print-overrides <0|1> (default: %s)\n", cmd_params_defaults.print_overrides ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); @@ -965,7 +965,7 @@ struct cmd_params_instance { bool muge = false; bool rcache = false; bool sas = false; - bool fdn = false; + int fdn = 0; const llama_model_tensor_buft_override* buft_overrides; llama_model_params to_llama_mparams() const { @@ -1282,7 +1282,7 @@ struct test { bool muge = false; bool rcache = false; bool sas = false; - bool fdn = false; + int fdn = 0; std::string override_tensor; int n_prompt; int n_gen; @@ -1429,14 +1429,14 @@ struct test { field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "mla_attn" || field == "attn_max_batch" || - field == "avg_ns" || field == "stddev_ns") { + field == "avg_ns" || field == "stddev_ns" || field == "fdn") { return INT; } if (field == "cuda" || field == "vulkan" || field == "kompute" || field == 
"metal" || field == "gpu_blas" || field == "blas" || field == "sycl" || field == "no_kv_offload" || field == "flash_attn" || field == "use_mmap" || field == "embeddings" || field == "repack" || field == "use_thp" || field == "fused_moe" || field == "grouped_er" || field == "no_fused_up_gate" || field == "no_ooae" || field == "mqkv" || - field == "rcache" || field == "reuse" || field == "muge" || field == "sas" || field == "fnd") { + field == "rcache" || field == "reuse" || field == "muge" || field == "sas") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -1692,7 +1692,7 @@ struct markdown_printer : public printer { return 3; } if (field == "fdn") { - return 3; + return 4; } if (field == "use_thp") { return 3; diff --git a/include/llama.h b/include/llama.h index f1b43a5a..01f843b5 100644 --- a/include/llama.h +++ b/include/llama.h @@ -456,7 +456,7 @@ extern "C" { bool split_mode_graph_scheduling; // if true, force split mode graph scheduling //bool split_mode_f16; // if true, cast intermediate results to f16 before copying to other GPUs bool scheduler_async; // if true, with split mode "graph" graph evaluation will be done using multiple threads - bool fused_delta_net; + int fused_delta_net; bool mtp; // Activate MTP if supported enum llama_mtp_op_type mtp_op_type; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 05bda231..6ac0a3a3 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -43,7 +43,7 @@ struct llama_cparams { bool split_mode_graph_scheduling; //bool split_mode_f16; bool scheduler_async; - bool fused_delta_net; + int fused_delta_net; int min_experts; float thresh_experts; bool mtp; diff --git a/src/llama-delta-net.cpp b/src/llama-delta-net.cpp index 41c18752..840bebf4 100644 --- a/src/llama-delta-net.cpp +++ b/src/llama-delta-net.cpp @@ -686,7 +686,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ std::pair attn_out; // The fused delta-net implementation is only faster than 
chunked for n_tok <= 8, so use it only in that case - attn_out = lctx.cparams.fused_delta_net && n_tok <= 8 ? build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb) : + attn_out = n_tok <= lctx.cparams.fused_delta_net ? build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb) : n_tok == 1 ? build_delta_net_autoregressive(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb) : build_delta_net_chunking(ctx0, q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il, cb); diff --git a/src/llama.cpp b/src/llama.cpp index 00f24c2c..2641f988 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4380,7 +4380,7 @@ struct llama_context_params llama_context_default_params() { /*.split_mode_graph_scheduling =*/ false, // /*.split_mode_f16 =*/ true, /*.scheduler_async =*/ false, - /*.fused_delta_net =*/ false, + /*.fused_delta_net =*/ 0, /*.mtp =*/ false, /*.mtp_op_type =*/ MTP_OP_NONE, /*.abort_callback =*/ nullptr,