From 64099e71c0749f7038f28c69af67f4b55faaa2c1 Mon Sep 17 00:00:00 2001 From: yurko Date: Sun, 8 Feb 2026 00:06:29 -0800 Subject: [PATCH] qwen3next: make fused delta safe by default and fix fused tensor layout --- .../qwen3next_bench_16k_pp16384_tg128.md | 38 +++++++++++++++++++ .../development/qwen3next_perf_diff_report.md | 6 ++- src/llama-build-context.cpp | 31 ++++++--------- 3 files changed, 54 insertions(+), 21 deletions(-) diff --git a/docs/development/qwen3next_bench_16k_pp16384_tg128.md b/docs/development/qwen3next_bench_16k_pp16384_tg128.md index cd29065b..8e91e562 100644 --- a/docs/development/qwen3next_bench_16k_pp16384_tg128.md +++ b/docs/development/qwen3next_bench_16k_pp16384_tg128.md @@ -142,3 +142,41 @@ Notes: - Decode-only fused mode preserves prompt-quality metrics in this test. - TG improved significantly in this run; PP variance was higher, so PP delta should be treated as noisy. + +## Fused DeltaNet Safety Update (Superseding) + +Date: 2026-02-08 + +This section supersedes the earlier `LLAMA_QWEN3NEXT_FUSED_DELTA` mode mapping. + +Updated env behavior in `src/llama-build-context.cpp`: + +- `0` / unset: non-fused for all token counts +- `1`: fused only for `n_tok > 1` (prefill/chunking), non-fused for single-token decode +- `2`: fused for all token counts (experimental) + +Reason: + +- Fused path has a known decode-path quality regression when forced on single-token steps. +- The safer default acceleration is therefore prefill-only fused mode (`=1`). 
+ +Validation (CUDA, `qwen3-next-coder.gguf`, `-c 2048 -b 1 -ub 1 -fa on -ngl 47 --n-cpu-moe 40 --chunks 1 --no-warmup`): + +| Mode | PPL | +|---|---:| +| `LLAMA_QWEN3NEXT_FUSED_DELTA=0` | `3.9148 +/- 0.31093` | +| `LLAMA_QWEN3NEXT_FUSED_DELTA=1` | `3.9148 +/- 0.31093` | +| `LLAMA_QWEN3NEXT_FUSED_DELTA=2` | `6.1277 +/- 0.54810` | + +Quick throughput check (`-p 8192 -n 128 -b 2048 -ub 512 -r 1 -rtr 1`, same CUDA settings): + +| Mode | PP 8192 (tok/s) | TG 128 (tok/s) | +|---|---:|---:| +| `0` | `179.30` | `24.69` | +| `1` | `252.12` | `22.99` | +| `2` | `245.71` | `27.94` | + +Interpretation: + +- Use `=1` for production-safe quality with strong PP gain. +- Reserve `=2` for experiments only until decode-path correctness is fixed. diff --git a/docs/development/qwen3next_perf_diff_report.md b/docs/development/qwen3next_perf_diff_report.md index 40c84184..78ea673a 100644 --- a/docs/development/qwen3next_perf_diff_report.md +++ b/docs/development/qwen3next_perf_diff_report.md @@ -35,7 +35,7 @@ Not directly mirrored yet (by design divergence from mainline model layout): ## Required Adjustments (remaining) -1. Keep fused DeltaNet as default, but preserve safe fallback path (`LLAMA_QWEN3NEXT_FUSED_DELTA=0`) for debugging/regression checks. +1. Keep non-fused as the strict safety baseline, and use `LLAMA_QWEN3NEXT_FUSED_DELTA=1` (prefill-only fused) as the practical acceleration mode. 2. Port selective graph-shape optimizations from PR #19375 into `src/llama-build-context.cpp` where they map cleanly (avoid blind copy due to architectural divergence). 3. Add one dedicated Qwen3Next perf regression target in CI/dev docs (single-GPU 8k proxy + 65k fit sanity). 4. Investigate ik CPU Flash-Attn assertion path for Qwen3Next (`iqk_fa_templates.h`, `S > 0`) before enabling `-fa 1` for CPU benchmark profiles. 
@@ -93,3 +93,7 @@ Relative (`ik` vs mainline): - `ik` CPU benchmark with `-fa 1` currently aborts for this model in `iqk_fa_templates.h` (`GGML_ASSERT(S > 0)`), so CPU matrix uses `-fa 0` for both repos. - `ik` benchmark JSON currently includes some non-JSON log lines in stdout around context creation; parsing should tolerate that. +- Fused DeltaNet mode mapping has been updated in code: + - `0` / unset: non-fused + - `1`: fused only for `n_tok > 1` (safe mode) + - `2`: fused on all token counts (experimental; decode-quality regression observed) diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 85c82994..d1112fcd 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -4180,15 +4180,15 @@ ggml_cgraph * llm_build_context::build_qwen3next() { enum class qwen3next_fused_delta_mode { off, - on, - tok1_only, + tok_gt1, + all_tokens, }; // Keep legacy DeltaNet path as default for correctness. // LLAMA_QWEN3NEXT_FUSED_DELTA values: // unset / 0 : off - // 1 : fused for all token counts - // 2 : fused only for single-token decode steps + // 1 : fused only for n_tok > 1 (safer; avoids known decode regression) + // 2 : fused for all token counts (experimental) const qwen3next_fused_delta_mode fused_delta_mode = []() { const char * env = std::getenv("LLAMA_QWEN3NEXT_FUSED_DELTA"); if (env == nullptr || env[0] == '\0') { @@ -4201,14 +4201,13 @@ ggml_cgraph * llm_build_context::build_qwen3next() { case 'Y': case 't': case 'T': - return qwen3next_fused_delta_mode::on; + return qwen3next_fused_delta_mode::tok_gt1; case '2': - return qwen3next_fused_delta_mode::tok1_only; + return qwen3next_fused_delta_mode::all_tokens; default: return qwen3next_fused_delta_mode::off; } }(); - const bool use_fused_delta_net_full = fused_delta_mode == qwen3next_fused_delta_mode::on; auto get_slice_2d = [&](ggml_tensor * t, int64_t c) -> ggml_tensor * { return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3], @@ -4503,14 +4502,6 @@ ggml_cgraph * 
llm_build_context::build_qwen3next() { GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); GGML_ASSERT(H_k == H_v); - const float eps_norm = hparams.f_norm_rms_eps; - q = ggml_l2_norm(ctx0, q, eps_norm); - k = ggml_l2_norm(ctx0, k, eps_norm); - - const float scale = 1.0f / sqrtf(S_v); - q = ggml_scale(ctx0, q, scale); - beta = ggml_sigmoid(ctx0, beta); - cb(q, "q_in", il); cb(k, "k_in", il); cb(v, "v_in", il); @@ -4521,8 +4512,8 @@ ggml_cgraph * llm_build_context::build_qwen3next() { q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); - g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 1, 3, 0, 2), n_tokens, 1, H_k, n_seqs); - beta = ggml_cont_4d(ctx0, ggml_permute(ctx0, beta, 1, 2, 0, 3), 1, n_tokens, H_k, n_seqs); + g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs); + beta = ggml_cont_4d(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3), 1, n_tokens, H_k, n_seqs); ggml_tensor * state_flat = ggml_reshape_4d(ctx0, state, S_v, S_v * H_v, 1, n_seqs); if (!ggml_is_contiguous(state_flat)) { @@ -4853,8 +4844,8 @@ ggml_cgraph * llm_build_context::build_qwen3next() { std::pair attn_out; const bool use_fused_delta_net = - use_fused_delta_net_full || - (fused_delta_mode == qwen3next_fused_delta_mode::tok1_only && n_tok == 1); + (fused_delta_mode == qwen3next_fused_delta_mode::tok_gt1 && n_tok > 1) || + (fused_delta_mode == qwen3next_fused_delta_mode::all_tokens); if (use_fused_delta_net) { attn_out = build_delta_net_fused(q_conv, k_conv, v_conv, gate, beta, state, il); @@ -4938,7 +4929,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() { ggml_tensor * causal_mask = nullptr; ggml_tensor * identity = nullptr; ggml_tensor * diag_mask = nullptr; - if (!use_fused_delta_net_full) { + if 
(fused_delta_mode != qwen3next_fused_delta_mode::all_tokens) { causal_mask = ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f), GGML_TRI_TYPE_LOWER);