From 343e335ff047dcf406a5326d88944fd086186caa Mon Sep 17 00:00:00 2001 From: yurko Date: Sun, 8 Feb 2026 00:08:33 -0800 Subject: [PATCH] qwen3next: warn when forcing fused decode mode --- src/llama-build-context.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index d1112fcd..fc749a5f 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -4208,6 +4208,13 @@ ggml_cgraph * llm_build_context::build_qwen3next() { return qwen3next_fused_delta_mode::off; } }(); + if (fused_delta_mode == qwen3next_fused_delta_mode::all_tokens) { + static bool warned_all_tokens = false; + if (!warned_all_tokens) { + LLAMA_LOG_WARN("%s: LLAMA_QWEN3NEXT_FUSED_DELTA=2 enables fused single-token decode; quality regression is known in this mode\n", __func__); + warned_all_tokens = true; + } + } auto get_slice_2d = [&](ggml_tensor * t, int64_t c) -> ggml_tensor * { return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],