qwen3next: keep fused delta on safe path and remove PR artifacts

yurko
2026-02-08 19:48:15 -08:00
parent 69529d3f49
commit 48e0e351ce
8 changed files with 13 additions and 1528 deletions


@@ -1250,8 +1250,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
     if (arg == "--qwen3next-fused-delta") {
         CHECK_ARG
         params.qwen3next_fused_delta = std::stoi(argv[i]);
-        if (params.qwen3next_fused_delta < 0 || params.qwen3next_fused_delta > 2) {
-            fprintf(stderr, "error: Invalid value for --qwen3next-fused-delta: %d (must be 0, 1, or 2)\n",
+        if (params.qwen3next_fused_delta < 0 || params.qwen3next_fused_delta > 1) {
+            fprintf(stderr, "error: Invalid value for --qwen3next-fused-delta: %d (must be 0 or 1)\n",
                 params.qwen3next_fused_delta);
             invalid_param = true;
         }
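
Note that std::stoi throws on non-numeric input, so the range check above only guards values that already parsed as integers. Below is a minimal, self-contained sketch of the narrowed {0,1} validation with the throw handled as well; parse_fused_delta is a hypothetical stand-in, not code from this repository:

#include <cstdio>
#include <stdexcept>
#include <string>

// Hypothetical helper mirroring the validation in the hunk above.
static bool parse_fused_delta(const std::string & arg, int & out) {
    int value = 0;
    try {
        value = std::stoi(arg);  // throws std::invalid_argument on non-numeric input
    } catch (const std::exception &) {
        fprintf(stderr, "error: Invalid value for --qwen3next-fused-delta: %s\n", arg.c_str());
        return false;
    }
    if (value < 0 || value > 1) {  // mode 2 is no longer accepted after this commit
        fprintf(stderr, "error: Invalid value for --qwen3next-fused-delta: %d (must be 0 or 1)\n", value);
        return false;
    }
    out = value;
    return true;
}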
@@ -2181,7 +2181,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
     options.push_back({ "*", "-no-fa, --no-flash-attn", "disable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
     options.push_back({ "*", "-fa, --flash-attn (auto|on|off|0|1)", "set Flash Attention (default: %s)", params.flash_attn ? "on" : "off" });
-    options.push_back({ "*", " --qwen3next-fused-delta {0,1,2}",
+    options.push_back({ "*", " --qwen3next-fused-delta {0,1}",
                              "force LLAMA_QWEN3NEXT_FUSED_DELTA mode for Qwen3Next (default: env/model default)" });
     options.push_back({ "*", "-mla, --mla-use", "enable MLA (default: %d)", params.mla_attn });
     options.push_back({ "*", "-amb, --attention-max-batch", "max batch size for attention computations (default: %d)", params.attn_max_batch });


@@ -259,7 +259,7 @@ struct gpt_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool flash_attn = true; // flash attention
-    int qwen3next_fused_delta = -1; // -1 keep env/default, otherwise force LLAMA_QWEN3NEXT_FUSED_DELTA={0,1,2}
+    int qwen3next_fused_delta = -1; // -1 keep env/default, otherwise force LLAMA_QWEN3NEXT_FUSED_DELTA={0,1}
     int mla_attn = 3; // MLA 0: standard, 1: MLA with K and V^T cache, 2: MLA with just K cache, 3: the best of both worlds
     int attn_max_batch = 0; // Max batch size to use when computing attention (only applicable if flash_attn = false)
     bool fused_moe_up_gate = true; // fused up*unary(gate) op for MoE models
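
The header comment leaves open how the -1 sentinel interacts with the environment variable, and the resolution code is not part of this commit's visible hunks. A plausible sketch, assuming the runtime consults LLAMA_QWEN3NEXT_FUSED_DELTA only when the CLI flag stays at -1; resolve_fused_delta and the fallback default are assumptions, not taken from this diff:

#include <cstdlib>

// Assumed precedence: explicit CLI value > env var > built-in default.
// This mirrors the "-1 keep env/default" comment above.
static int resolve_fused_delta(int cli_value) {
    if (cli_value >= 0) {
        return cli_value;  // forced via --qwen3next-fused-delta {0,1}
    }
    if (const char * env = std::getenv("LLAMA_QWEN3NEXT_FUSED_DELTA")) {
        return std::atoi(env) != 0 ? 1 : 0;  // clamp env input to {0,1}
    }
    return 1;  // placeholder; the actual model default is not shown in this diff
}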