diff --git a/common/common.cpp b/common/common.cpp
index 3a4d9a72..39e95531 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1250,8 +1250,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     if (arg == "--qwen3next-fused-delta") {
         CHECK_ARG
         params.qwen3next_fused_delta = std::stoi(argv[i]);
-        if (params.qwen3next_fused_delta < 0 || params.qwen3next_fused_delta > 2) {
-            fprintf(stderr, "error: Invalid value for --qwen3next-fused-delta: %d (must be 0, 1, or 2)\n",
+        if (params.qwen3next_fused_delta < 0 || params.qwen3next_fused_delta > 1) {
+            fprintf(stderr, "error: Invalid value for --qwen3next-fused-delta: %d (must be 0 or 1)\n",
                     params.qwen3next_fused_delta);
             invalid_param = true;
         }
@@ -2181,7 +2181,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "       --chunks N",             "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
     options.push_back({ "*",           "-no-fa, --no-flash-attn",       "disable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
     options.push_back({ "*",           "-fa, --flash-attn (auto|on|off|0|1)", "set Flash Attention (default: %s)", params.flash_attn ? "on" : "off" });
-    options.push_back({ "*",           "       --qwen3next-fused-delta {0,1,2}",
+    options.push_back({ "*",           "       --qwen3next-fused-delta {0,1}",
                                                                         "force LLAMA_QWEN3NEXT_FUSED_DELTA mode for Qwen3Next (default: env/model default)" });
     options.push_back({ "*",           "-mla,  --mla-use",              "enable MLA (default: %d)", params.mla_attn });
     options.push_back({ "*",           "-amb,  --attention-max-batch",  "max batch size for attention computations (default: %d)", params.attn_max_batch});
diff --git a/common/common.h b/common/common.h
index 827fc1bb..a18d45d4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -259,7 +259,7 @@ struct gpt_params {
     bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
     bool flash_attn        = true;  // flash attention
-    int  qwen3next_fused_delta = -1; // -1 keep env/default, otherwise force LLAMA_QWEN3NEXT_FUSED_DELTA={0,1,2}
+    int  qwen3next_fused_delta = -1; // -1: keep env/default; 0 or 1: force LLAMA_QWEN3NEXT_FUSED_DELTA to that value (--qwen3next-fused-delta accepts only 0 or 1)
     int  mla_attn          = 3;     // MLA 0: standard, 1: MLA with K and V^T cache, 2: MLA with just K cache, 3: the best of both worlds
     int  attn_max_batch    = 0;     // Max batch size to use when computing attention (only applicable if flash_attn = false)
     bool fused_moe_up_gate = true;  // fused up*unary(gate) op for MoE models
diff --git a/docs/development/qwen3next_bench_16k_pp16384_tg128.md b/docs/development/qwen3next_bench_16k_pp16384_tg128.md
deleted file mode 100644
index 82ede13c..00000000
--- a/docs/development/qwen3next_bench_16k_pp16384_tg128.md
+++ /dev/null
@@ -1,182 +0,0 @@
-# Qwen3Next Benchmark: PP 16384 / TG 128 (`ik_llama.cpp` vs `llama.cpp`)
-
-Date: 2026-02-08
-
-## Setup
-
-- Container: `iktest2`
-- Model: `/models/qwen3-next-coder.gguf`
-- Prompt processing: `-p 16384`
-- Token generation: `-n 128`
-- Batch settings: `-b 3072 -ub 768`
-- Threads: `-t 8`
-- Repetitions: `-r 1`
-- Mmap: `-mmp 0`
-
-CUDA runs:
-
-- `CUDA_VISIBLE_DEVICES=0`
-- `-fa 1 -ngl 999 --n-cpu-moe 47`
-
-CPU-only runs:
-
-- `-fa 0 -ngl 0 --n-cpu-moe 0`
-
-Hardware note:
-
-- GPU0 (bench target): `NVIDIA GeForce RTX 5060 Ti`, `16311 MiB` total (`CUDA_VISIBLE_DEVICES=0` for CUDA runs).
-- GPU1 (not used for these runs): `NVIDIA GeForce RTX 3060`, `12288 MiB` total.
-- Observed during active `ik` CUDA run (`p=8192,b=2048,ub=512,n-cpu-moe=45`): GPU0 memory used `~12074 MiB` (`~3775 MiB` free), from `nvidia-smi`.
-
-## Results
-
-| Build | Backend | PP 16384 (tok/s) | TG 128 (tok/s) |
-|---|---|---:|---:|
-| `ik_llama.cpp` | CUDA | 207.891304 | 27.263562 |
-| `llama.cpp` | CUDA | 185.764649 | 24.145662 |
-| `ik_llama.cpp` | CPU-only | 45.739881 | 12.172113 |
-| `llama.cpp` | CPU-only | 47.835420 | 6.991398 |
-
-## Relative (`ik` vs `llama.cpp`)
-
-- CUDA PP: `+11.91%`
-- CUDA TG: `+12.91%`
-- CPU PP: `-4.38%`
-- CPU TG: `+74.10%`
-
-## Raw outputs
-
-- `/tmp/ik_cuda_bench_16k.json`
-- `/tmp/mainline_cuda_bench_16k.json`
-- `/tmp/ik_cpu_bench_16k.json`
-- `/tmp/mainline_cpu_bench_16k.json`
-
-## Additional CUDA rerun (requested lower `n-cpu-moe` ballpark)
-
-Adjusted config:
-
-- `-p 8192 -n 128 -b 2048 -ub 512 -t 8 -fa 1 -ngl 999 -mmp 0`
-- single GPU: `CUDA_VISIBLE_DEVICES=0`
-
-Fit checks on `ik`:
-
-- `--n-cpu-moe 25` -> fail to load model
-- `--n-cpu-moe 40` -> fail to create context
-- `--n-cpu-moe 45` -> works
-
-Working comparison at `--n-cpu-moe 45`:
-
-| Build | Backend | PP 8192 (tok/s) | TG 128 (tok/s) |
-|---|---|---:|---:|
-| `ik_llama.cpp` | CUDA | 201.613283 | 24.884600 |
-| `llama.cpp` | CUDA | 145.100895 | 24.595058 |
-
-`ik` rerun with `-rtr 1` at the same config (`--n-cpu-moe 45`):
-
-| Build | Backend | PP 8192 (tok/s) | TG 128 (tok/s) |
-|---|---|---:|---:|
-| `ik_llama.cpp` (`-rtr 1`) | CUDA | 232.340508 | 27.895722 |
-
-## Historical Fused DeltaNet Check (obsolete)
-
-Date: 2026-02-08
-
-Setup:
-
-- Container: `iktest2`
-- Device: `CUDA_VISIBLE_DEVICES=0` (RTX 5060 Ti)
-- Common args: `-c 2048 -b 2048 -ub 512 --chunks 1 --no-warmup -ngl 999 --n-cpu-moe 47 -t 8 -fa on`
-- Switch under test: `LLAMA_QWEN3NEXT_FUSED_DELTA`
-
-Results (Wikitext2 sample file `/tmp/ppl_wikitext2_test.txt`):
-
-| Model | `LLAMA_QWEN3NEXT_FUSED_DELTA=0` | `LLAMA_QWEN3NEXT_FUSED_DELTA=1` |
-|---|---:|---:|
-| `/models/qwen3-next-coder.gguf` | `PPL 3.9378` | `PPL 15.3628` |
-| `/models/qwen-3-coder-next-mxfp4.gguf` | `PPL 3.9860` | `PPL 15.0740` |
-
-Conclusion:
-
-- This run is kept for history only and is superseded by the later `Fused DeltaNet Safety Update (Superseding)` section below.
-- Use the superseding section as source of truth for mode mapping and quality guidance.
-
-## Upstream PR #19375 Trial (Selective Port) Outcome
-
-Date: 2026-02-08
-
-What was tried:
-
-- Ported selected non-fused qwen3next graph changes from `ggml-org/llama.cpp#19375` (broadcast/repeat and autoregressive matmul rewrite), then benchmarked and re-tested perplexity.
-
-Outcome:
-
-- No stable speed win in our setup after repeated runs.
-- Direct autoregressive rewrite attempts from PR #19375 were not compatible with current ik graph-layout/contiguity assumptions and were reverted.
-- Final code keeps only safe chunk-shape fixes plus fused-mode safety controls.
-
-## Decode-Only Fused Mode Trial (`LLAMA_QWEN3NEXT_FUSED_DELTA=2`)
-
-Date: 2026-02-08
-
-Code change:
-
-- Added mode `2` for `LLAMA_QWEN3NEXT_FUSED_DELTA`:
-  - prompt / multi-token path: non-fused
-  - single-token decode path: fused
-
-Perplexity validation (`-c 2048`, GPU config as above):
-
-| Model | `=0` non-fused | `=2` decode-only fused |
-|---|---:|---:|
-| `/models/qwen3-next-coder.gguf` | `3.9378` | `3.9378` |
-| `/models/qwen-3-coder-next-mxfp4.gguf` | `3.9860` | `3.9860` |
-
-`llama-bench` at `-p 8192 -n 128 -b 2048 -ub 512 -r 3 -rtr 1`:
-
-| Mode | PP 8192 (tok/s) | TG 128 (tok/s) |
-|---|---:|---:|
-| `LLAMA_QWEN3NEXT_FUSED_DELTA=0` | `170.090` | `25.465` |
-| `LLAMA_QWEN3NEXT_FUSED_DELTA=2` | `166.212` | `29.599` |
-
-Notes:
-
-- Decode-only fused mode preserves prompt-quality metrics in this test.
-- TG improved significantly in this run; PP variance was higher, so PP delta should be treated as noisy.
-
-## Fused DeltaNet Safety Update (Superseding)
-
-Date: 2026-02-08
-
-This section supersedes the earlier `LLAMA_QWEN3NEXT_FUSED_DELTA` mode mapping.
-
-Updated env behavior in `src/llama-build-context.cpp`:
-
-- `0` / unset: non-fused for all token counts
-- `1`: fused only for `n_tok > 1` (prefill/chunking), non-fused for single-token decode
-- `2`: fused for all token counts (experimental)
-
-Reason:
-
-- Fused path has a known decode-path quality regression when forced on single-token steps.
-- The safer default acceleration is therefore prefill-only fused mode (`=1`).
-
-Validation (CUDA, `qwen3-next-coder.gguf`, `-c 2048 -b 1 -ub 1 -fa on -ngl 47 --n-cpu-moe 40 --chunks 1 --no-warmup`):
-
-| Mode | PPL |
-|---|---:|
-| `LLAMA_QWEN3NEXT_FUSED_DELTA=0` | `3.9148 +/- 0.31093` |
-| `LLAMA_QWEN3NEXT_FUSED_DELTA=1` | `3.9148 +/- 0.31093` |
-| `LLAMA_QWEN3NEXT_FUSED_DELTA=2` | `6.1277 +/- 0.54810` |
-
-Quick throughput check (`-p 8192 -n 128 -b 2048 -ub 512 -r 1 -rtr 1`, same CUDA settings):
-
-| Mode | PP 8192 (tok/s) | TG 128 (tok/s) |
-|---|---:|---:|
-| `0` | `179.30` | `24.69` |
-| `1` | `252.12` | `22.99` |
-| `2` | `245.71` | `27.94` |
-
-Interpretation:
-
-- Use `=1` for production-safe quality with strong PP gain.
-- Reserve `=2` for experiments only until decode-path correctness is fixed.
diff --git a/docs/development/qwen3next_perf_diff_report.md b/docs/development/qwen3next_perf_diff_report.md
deleted file mode 100644
index 919f61fd..00000000
--- a/docs/development/qwen3next_perf_diff_report.md
+++ /dev/null
@@ -1,165 +0,0 @@
-# Qwen3Next Review and Benchmark Summary (`ik_llama.cpp` vs `llama.cpp`)
-
-Date: 2026-02-08
-
-## Scope
-
-This document captures:
-
-- Current upstream PR alignment for Qwen3Next-related work.
-- What is already strong in `ik_llama.cpp` and what still needs adjustment.
-- Recommended runtime settings for this machine (single GPU target, long context).
-- Final apples-to-apples benchmark matrix for `ik_llama.cpp` vs `../llama.cpp`.
-
-## Upstream PR Check (as of 2026-02-08)
-
-Reviewed PRs:
-
-- https://github.com/ggml-org/llama.cpp/pull/18102 (`open`): Delta-Net CUDA op + integration.
-- https://github.com/ggml-org/llama.cpp/pull/18792 (`open`): unified DeltaNet handling (`src/models/delta.cpp`).
-- https://github.com/ggml-org/llama.cpp/pull/19375 (`open`, `draft`): Qwen3Next graph optimization in model builder.
-
-### Current alignment in `ik_llama.cpp`
-
-Already present and/or functionally covered:
-
-- CUDA DeltaNet op path exists in GGML (`ggml/src/ggml-cuda/delta-net.cu`).
-- Solve-tri and backend op support are present for the fused path.
-- Qwen3Next fused DeltaNet builder path exists (and is now runtime-toggleable via env).
-- Existing ik optimizations remain available (`-rtr`, grouped/fused paths, no-offload-only-active-experts switches).
-
-Not directly mirrored yet (by design divergence from mainline model layout):
-
-- Mainline `src/models/delta.cpp` structure from PR #18792.
-- Mainline `src/models/qwen3next.cpp` graph-form from PR #19375.
-
-## Required Adjustments (remaining)
-
-1. Keep non-fused as the strict safety baseline in defaults, and use `LLAMA_QWEN3NEXT_FUSED_DELTA=1` (prefill-only fused) as the explicit acceleration mode.
-2. Continue using `scripts/qwen3next-regression.sh` as the release gate for this model path, and wire it into CI or pre-merge checks.
-3. Treat the remaining PR #19375 autoregressive rewrite as deferred: direct porting into current ik graph builder is not layout-compatible without broader contiguity/reshape refactoring.
-4. Revisit PR #18792 (`src/models/delta.cpp`) only if we need unified GDA/KDA support for additional architectures; for Qwen3Next-only it is optional.
-
-## Strong Points of `ik_llama.cpp` to Preserve
-
-- More runtime controls than mainline for this workload (`-rtr`, backend toggles, MoE/OOAE controls).
-- Strong CUDA path for this model family once offload routing is tuned (`--n-cpu-moe` thresholding).
-- Better TG throughput than current mainline in matched CUDA and CPU tests on this host.
-
-## Best Runtime Configuration (this host)
-
-Model: `/models/qwen3-next-coder.gguf`
-
-Single-GPU long-context finding:
-
-- `-c 65536` on GPU0 (16 GB) requires at least `--n-cpu-moe 47` to fit reliably.
-
-8k sweep proxy (single GPU, tuned path):
-
-- `b=2048,ub=512` -> `pp8192=142.85`, `tg128=24.81`
-- `b=3072,ub=768` -> `pp8192=229.31`, `tg128=27.29` (best)
-- `b=4096,ub=1024` -> `pp8192=211.53`, `tg128=23.85`
-
-Recommended serving baseline:
-
-- `CUDA_VISIBLE_DEVICES=0`
-- `-c 65536 -b 3072 -ub 768 -t 8 -fa on -ngl 999 --n-cpu-moe 47 -rtr --qwen3next-fused-delta 1`
-
-## Final Benchmark Matrix (8k context proxy)
-
-All four builds were benchmarked with matched parameters and explicit `-mmp 0` for fairness.
-
-Common args:
-
-- `-m /models/qwen3-next-coder.gguf -p 8192 -n 128 -b 3072 -ub 768 -t 8 -r 1`
-- CUDA runs: `CUDA_VISIBLE_DEVICES=0 -fa 1 -ngl 999 --n-cpu-moe 47 -mmp 0`
-- CPU runs: `-fa 0 -ngl 0 --n-cpu-moe 0 -mmp 0`
-
-| Build | PP (tok/s) | TG (tok/s) |
-|---|---:|---:|
-| `ik` CUDA | 204.614 | 28.979 |
-| mainline CUDA | 184.521 | 22.012 |
-| `ik` CPU | 49.795 | 12.681 |
-| mainline CPU | 51.674 | 7.299 |
-
-Relative (`ik` vs mainline):
-
-- CUDA PP: `+10.9%`
-- CUDA TG: `+31.7%`
-- CPU PP: `-3.6%`
-- CPU TG: `+73.7%`
-
-## Notes
-
-- CPU-only Qwen3Next with `-fa 1` is now guarded in ik: FA is auto-disabled with a warning for `n_gpu_layers == 0` to avoid the prior `iqk_fa_templates.h` assert path.
-- `ik` benchmark JSON currently includes some non-JSON log lines in stdout around context creation; parsing should tolerate that.
-- Fused DeltaNet mode mapping has been updated in code:
-  - `0` / unset: non-fused
-  - `1`: fused only for `n_tok > 1` (safe mode)
-  - `2`: fused on all token counts (experimental; decode-quality regression observed)
-- Added manual regression runner for fused-mode safety checks:
-  - `scripts/qwen3next-fused-regression.sh`
-  - Example:
-    - `BIN=./build-qwen3next-fix/bin/llama-perplexity scripts/qwen3next-fused-regression.sh --model /models/qwen3-next-coder.gguf --ctx 2048 --decode-b 1 --decode-ub 1 --prefill-b 2048 --prefill-ub 512 --ngl 47 --n-cpu-moe 40`
-- Also integrated into the broader eval harness:
-  - `scripts/qwen3next-eval.sh --with-gpu --with-fused-regression ...`
-  - Results are surfaced in `SUMMARY.md` under `IK Fused Delta Regression`.
-- Fused regression now enforces absolute non-fused sanity too:
-  - mode0 decode/prefill PPL must stay below configurable thresholds (defaults: `10.0` / `10.0`).
-- Added unified Qwen3Next regression entrypoint for ongoing checks:
-  - `scripts/qwen3next-regression.sh --model /path/to/qwen3-next-coder.gguf`
-  - Outputs `SUMMARY.md` + per-step logs under `/tmp/qwen3next-regression/<timestamp>/`.
-- Added CLI plumbing for fused mode control (no raw env required):
-  - `--qwen3next-fused-delta {0|1|2}`
-  - This sets `LLAMA_QWEN3NEXT_FUSED_DELTA` for the current process.
-- Added experimental CUDA DeltaNet dispatch control:
-  - `GGML_CUDA_DELTA_NET_OPT={0|1|2|3|4}`
-  - `0`: baseline dispatch (default)
-  - `1`: force fp16 recurrent kernel (`head_dim=128`)
-  - `2`: force multiblock kernel
-  - `3`: force Blackwell optimized kernel
-  - `4`: conservative auto mode (pre-Blackwell only)
-- RTX 5060 Ti spot checks (`p=2048,n=64,b=1024,ub=256,--n-cpu-moe 47,-rtr 1`) did not show a reliable win from forced kernels:
-  - mode `2` and mode `3` reduced TG in single-run checks versus baseline.
-  - mode `4` tracks baseline on Blackwell (by design, no forced optimized-kernel switch there).
-
-## Decode Quality Diagnosis (Wikitext-2, `--chunks 1`, CUDA)
-
-Real-data perplexity checks on `/tmp/ppl_wikitext2_test.txt` confirm the decode regression source:
-
-- `qwen3-next-coder.gguf`
-  - mode `0`, opt `0`: `PPL=3.9148`
-  - mode `1`, opt `0`: `PPL=3.9148` (parity with mode 0)
-  - mode `2`, opt `0/1/2/4`: `PPL=6.1277` (consistently regressed)
-  - mode `2`, opt `3`: `PPL=302221.3639` (catastrophic instability)
-- `qwen-3-coder-next-mxfp4.gguf`
-  - mode `0`, opt `0`: `PPL=3.9832`
-  - mode `1`, opt `0`: `PPL=3.9832` (parity with mode 0)
-  - mode `2`, opt `0`: `PPL=6.2362` (same regression pattern)
-  - mode `2`, opt `3`: `PPL=795964.1118` (catastrophic instability)
-
-Conclusion:
-
-- Decode-quality regression is tied to fused-all mode (`LLAMA_QWEN3NEXT_FUSED_DELTA=2`), not fixed by kernel dispatch overrides.
-- `GGML_CUDA_DELTA_NET_OPT=3` should not be used on this path.
-
-## Safe Speed Gain (mode 1)
-
-With decode-safe mode (`LLAMA_QWEN3NEXT_FUSED_DELTA=1`), throughput on the serving proxy profile improved while preserving perplexity:
-
-- Profile:
-  - `llama-bench -m /models/qwen3-next-coder.gguf -p 8192 -n 128 -b 3072 -ub 768 -t 8 -fa 1 -ngl 999 --n-cpu-moe 47 -r 3 -rtr 1 -mmp 0`
-- Mode `0` (`r=3`):
-  - `pp8192 = 175.639 +/- 0.221 tok/s`
-  - `tg128  = 26.393 +/- 1.469 tok/s`
-- Mode `1` (`r=3`):
-  - `pp8192 = 237.014 +/- 1.199 tok/s`
-  - `tg128  = 27.111 +/- 1.395 tok/s`
-- Relative (`mode1` vs `mode0`):
-  - PP: `+34.9%`
-  - TG: `+2.7%`
-
-Additional A/B for `GGML_CUDA_DELTA_NET_OPT=2` under mode `1` (`r=3`) did not improve performance:
-
-- opt `0`: `pp8192=238.352`, `tg128=24.709`
-- opt `2`: `pp8192=237.680`, `tg128=24.566`
diff --git a/scripts/qwen3next-eval.sh b/scripts/qwen3next-eval.sh
deleted file mode 100755
index 102699f6..00000000
--- a/scripts/qwen3next-eval.sh
+++ /dev/null
@@ -1,546 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-IMAGE="${IMAGE:-iktest-dev:latest}"
-MAIN_REPO="${MAIN_REPO:-/home/yurko/Code/llama.cpp}"
-IK_REPO="${IK_REPO:-/home/yurko/Code/ik_llama.cpp}"
-MAIN_BUILD_DIR="${MAIN_BUILD_DIR:-build}"
-IK_BUILD_DIR="${IK_BUILD_DIR:-build}"
-MODEL_HOST="${MODEL_HOST:-/home/yurko/.cache/llama.cpp/qwen3-next-coder.gguf}"
-OUT_ROOT="${OUT_ROOT:-/tmp/qwen3next-eval}"
-WITH_GPU=0
-WITH_FUSED_REGRESSION=0
-GPU_DEVICE="${GPU_DEVICE:-0}"
-SWEEP_CTX="${SWEEP_CTX:-2048}"
-SWEEP_N="${SWEEP_N:-32}"
-
-usage() {
-    cat <<'USAGE'
-Usage:
-  scripts/qwen3next-eval.sh [options]
-
-Options:
-  --with-gpu                 Enable GPU checks in addition to CPU checks.
-  --with-fused-regression    Run ik fused-delta regression check and include in summary.
-  --gpu-device ID            CUDA device id to use for GPU sanity checks (default: 0).
-  --image IMAGE              Docker image to run checks in (default: iktest-dev:latest).
-  --main-repo PATH           Mainline repo path (default: /home/yurko/Code/llama.cpp).
-  --ik-repo PATH             ik repo path (default: /home/yurko/Code/ik_llama.cpp).
-  --main-build-dir NAME      Mainline build dir under main repo (default: build).
-  --ik-build-dir NAME        ik build dir under ik repo (default: build).
-  --model PATH               Host path to model GGUF file.
-  --out-root PATH            Output root directory (default: /tmp/qwen3next-eval).
-  --sweep-ctx N              Sweep context size for PP/TG check (default: 2048).
-  --sweep-n N                Sweep generation tokens (default: 32).
-  -h, --help                 Show this help.
-
-What this script runs (in this order):
-  1) CPU perplexity parity (chunks=1)      mainline -> ik
-  2) CPU perplexity parity (chunks=2)      mainline -> ik
-  3) CPU short generation smoke quality    mainline -> ik
-  4) Optional GPU sanity checks            mainline -> ik
-  5) Optional ik fused-delta regression    mode0/mode1/mode2 safety check
-
-Output:
-  A timestamped folder is created under OUT_ROOT with:
-  - SUMMARY.md
-  - run.log
-  - *.out / *.err logs for each command
-USAGE
-}
-
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --with-gpu)
-            WITH_GPU=1
-            shift
-            ;;
-        --with-fused-regression)
-            WITH_FUSED_REGRESSION=1
-            shift
-            ;;
-        --gpu-device)
-            GPU_DEVICE="$2"
-            shift 2
-            ;;
-        --image)
-            IMAGE="$2"
-            shift 2
-            ;;
-        --main-repo)
-            MAIN_REPO="$2"
-            shift 2
-            ;;
-        --ik-repo)
-            IK_REPO="$2"
-            shift 2
-            ;;
-        --main-build-dir)
-            MAIN_BUILD_DIR="$2"
-            shift 2
-            ;;
-        --ik-build-dir)
-            IK_BUILD_DIR="$2"
-            shift 2
-            ;;
-        --model)
-            MODEL_HOST="$2"
-            shift 2
-            ;;
-        --out-root)
-            OUT_ROOT="$2"
-            shift 2
-            ;;
-        --sweep-ctx)
-            SWEEP_CTX="$2"
-            shift 2
-            ;;
-        --sweep-n)
-            SWEEP_N="$2"
-            shift 2
-            ;;
-        -h|--help)
-            usage
-            exit 0
-            ;;
-        *)
-            echo "Unknown option: $1" >&2
-            usage
-            exit 2
-            ;;
-    esac
-done
-
-if [[ ! -d "$MAIN_REPO" ]]; then
-    echo "Mainline repo does not exist: $MAIN_REPO" >&2
-    exit 1
-fi
-if [[ ! -d "$IK_REPO" ]]; then
-    echo "ik repo does not exist: $IK_REPO" >&2
-    exit 1
-fi
-if [[ ! -f "$MODEL_HOST" ]]; then
-    echo "Model file does not exist: $MODEL_HOST" >&2
-    exit 1
-fi
-
-run_id="$(date +%Y%m%d_%H%M%S)"
-out_dir="${OUT_ROOT%/}/${run_id}"
-mkdir -p "$out_dir"
-
-cat > "${out_dir}/ppl_input.txt" <<'TXT'
-Deterministic evaluation text for quick perplexity parity checks.
-The next lines intentionally repeat a simple pattern to reduce variance.
-TXT
-for _ in $(seq 1 400); do
-    echo "the system writes logs and the system reads logs" >> "${out_dir}/ppl_input.txt"
-done
-
-cat > "${out_dir}/gen_prompt.txt" <<'TXT'
-Write a concise Python function that returns the first n Fibonacci numbers iteratively, and then give one sentence explaining time complexity.
-TXT
-
-cat > "${out_dir}/run_inside.sh" <<'BASH'
-#!/usr/bin/env bash
-set -euo pipefail
-
-WITH_GPU="${WITH_GPU:-0}"
-GPU_DEVICE="${GPU_DEVICE:-0}"
-SWEEP_CTX="${SWEEP_CTX:-2048}"
-SWEEP_N="${SWEEP_N:-32}"
-MAIN_BUILD_DIR="${MAIN_BUILD_DIR:-build}"
-IK_BUILD_DIR="${IK_BUILD_DIR:-build}"
-WITH_FUSED_REGRESSION="${WITH_FUSED_REGRESSION:-0}"
-
-MAIN_BIN="/mainline/${MAIN_BUILD_DIR}/bin"
-IK_BIN="/ik/${IK_BUILD_DIR}/bin"
-MAIN_LD="/mainline/${MAIN_BUILD_DIR}/bin:/mainline/${MAIN_BUILD_DIR}/src:/mainline/${MAIN_BUILD_DIR}/ggml/src:/mainline/${MAIN_BUILD_DIR}/examples/mtmd"
-IK_LD="/ik/${IK_BUILD_DIR}/bin:/ik/${IK_BUILD_DIR}/src:/ik/${IK_BUILD_DIR}/ggml/src:/ik/${IK_BUILD_DIR}/examples/mtmd"
-MODEL="/model.gguf"
-
-RUN_LOG="/out/run.log"
-STATUS_FILE="/out/status.tsv"
-
-touch "$RUN_LOG"
-printf "name\tstatus\texit_code\thost_mem_used_before_mib\thost_mem_used_after_mib\tgpu_mem_used_before_mib\tgpu_mem_used_after_mib\tmax_rss_kib\telapsed\n" > "$STATUS_FILE"
-
-log() {
-    local msg="$1"
-    printf "[%s] %s\n" "$(date +%H:%M:%S)" "$msg" | tee -a "$RUN_LOG"
-}
-
-require_bin() {
-    local path="$1"
-    if [[ ! -x "$path" ]]; then
-        log "MISSING: $path"
-        return 1
-    fi
-}
-
-host_mem_used_mib() {
-    awk '
-        /MemTotal:/     { mt = $2 }
-        /MemAvailable:/ { ma = $2 }
-        END {
-            if (mt > 0 && ma >= 0) {
-                printf "%.1f", (mt - ma) / 1024.0
-            } else {
-                print "NA"
-            }
-        }
-    ' /proc/meminfo
-}
-
-gpu_mem_used_mib() {
-    if [[ "$WITH_GPU" != "1" ]]; then
-        echo "NA"
-        return
-    fi
-    if ! command -v nvidia-smi >/dev/null 2>&1; then
-        echo "NA"
-        return
-    fi
-    local used
-    used="$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | tr '\n' ',' | sed 's/,$//' || true)"
-    if [[ -z "$used" ]]; then
-        echo "NA"
-    else
-        echo "$used"
-    fi
-}
-
-extract_max_rss_kib() {
-    local time_file="$1"
-    if [[ ! -f "$time_file" ]]; then
-        echo "NA"
-        return
-    fi
-    local rss
-    rss="$(grep -E '^Maximum resident set size' "$time_file" | awk '{print $6}' | tail -n1 || true)"
-    if [[ -z "$rss" ]]; then
-        echo "NA"
-    else
-        echo "$rss"
-    fi
-}
-
-extract_elapsed() {
-    local time_file="$1"
-    if [[ ! -f "$time_file" ]]; then
-        echo "NA"
-        return
-    fi
-    local elapsed
-    elapsed="$(grep -E '^Elapsed \(wall clock\) time' "$time_file" | sed -E 's/^[^:]+:[[:space:]]*//' | tail -n1 || true)"
-    if [[ -z "$elapsed" ]]; then
-        echo "NA"
-    else
-        echo "$elapsed"
-    fi
-}
-
-run_cmd() {
-    local name="$1"
-    shift
-    local out_file="/out/${name}.out"
-    local err_file="/out/${name}.err"
-    local time_file="/out/${name}.time"
-    local ec
-    local host_before host_after gpu_before gpu_after max_rss elapsed
-
-    host_before="$(host_mem_used_mib)"
-    gpu_before="$(gpu_mem_used_mib)"
-    log "RUN: $name"
-
-    set +e
-    if [[ -x /usr/bin/time ]]; then
-        /usr/bin/time -v -o "$time_file" "$@" >"$out_file" 2>"$err_file"
-        ec=$?
-    else
-        "$@" >"$out_file" 2>"$err_file"
-        ec=$?
-    fi
-    set -e
-
-    host_after="$(host_mem_used_mib)"
-    gpu_after="$(gpu_mem_used_mib)"
-    max_rss="$(extract_max_rss_kib "$time_file")"
-    elapsed="$(extract_elapsed "$time_file")"
-
-    if [[ $ec -eq 0 ]]; then
-        printf "%s\tOK\t0\t%s\t%s\t%s\t%s\t%s\t%s\n" \
-            "$name" "$host_before" "$host_after" "$gpu_before" "$gpu_after" "$max_rss" "$elapsed" >> "$STATUS_FILE"
-        log "OK: $name"
-    else
-        printf "%s\tFAIL\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n" \
-            "$name" "$ec" "$host_before" "$host_after" "$gpu_before" "$gpu_after" "$max_rss" "$elapsed" >> "$STATUS_FILE"
-        log "FAIL($ec): $name"
-    fi
-    return $ec
-}
-
-extract_ppl() {
-    local out_file="$1"
-    local err_file="$2"
-    local line num
-
-    line="$(cat "$out_file" "$err_file" 2>/dev/null | grep -E "Final estimate:" | tail -n1 || true)"
-    if [[ -z "$line" ]]; then
-        echo "NA"
-        return
-    fi
-
-    num="$(echo "$line" | sed -nE 's/.*= ([0-9]+\.[0-9]+).*/\1/p')"
-    if [[ -z "$num" ]]; then
-        num="$(echo "$line" | grep -Eo '[0-9]+\.[0-9]+' | head -n1 || true)"
-    fi
-    if [[ -z "$num" ]]; then
-        echo "NA"
-    else
-        echo "$num"
-    fi
-}
-
-abs_delta() {
-    local a="$1"
-    local b="$2"
-    awk -v a="$a" -v b="$b" 'BEGIN { d = a - b; if (d < 0) d = -d; printf "%.6f", d }'
-}
-
-has_token() {
-    local file="$1"
-    local pattern="$2"
-    if grep -Eiq "$pattern" "$file"; then
-        echo "yes"
-    else
-        echo "no"
-    fi
-}
-
-require_bin "$MAIN_BIN/llama-perplexity"
-require_bin "$MAIN_BIN/llama-cli"
-require_bin "$MAIN_BIN/llama-completion"
-require_bin "$IK_BIN/llama-perplexity"
-require_bin "$IK_BIN/llama-cli"
-
-if [[ "$WITH_GPU" != "1" ]]; then
-    export CUDA_VISIBLE_DEVICES=""
-    log "GPU checks disabled (CPU-only mode)"
-else
-    export CUDA_VISIBLE_DEVICES="$GPU_DEVICE"
-    log "GPU checks enabled on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-fi
-
-if [[ "$WITH_FUSED_REGRESSION" == "1" && "$WITH_GPU" != "1" ]]; then
-    log "Fused regression requested but GPU mode is disabled; this step will be skipped"
-fi
-
-PPL_INPUT="/out/ppl_input.txt"
-GEN_PROMPT="$(cat /out/gen_prompt.txt)"
-
-# CPU perplexity: chunks=1 (mainline -> ik)
-run_cmd "cpu_ppl_chunks1_mainline" \
-    env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-perplexity" \
-        -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 1 --no-warmup -ngl 0 || true
-run_cmd "cpu_ppl_chunks1_ik" \
-    env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-perplexity" \
-        -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 1 --no-warmup -ngl 0 || true
-
-# CPU perplexity: chunks=2 (mainline -> ik)
-run_cmd "cpu_ppl_chunks2_mainline" \
-    env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-perplexity" \
-        -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 2 --no-warmup -ngl 0 || true
-run_cmd "cpu_ppl_chunks2_ik" \
-    env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-perplexity" \
-        -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 2 --no-warmup -ngl 0 || true
-
-# CPU short generation smoke quality (mainline -> ik)
-run_cmd "cpu_gen_mainline" \
-    env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-completion" \
-        -m "$MODEL" --cpu-moe -ngl 0 -c 512 -n 64 --seed 123 --temp 0 --top-k 1 --simple-io --no-display-prompt -p "$GEN_PROMPT" || true
-run_cmd "cpu_gen_ik" \
-    env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-cli" \
-        -m "$MODEL" --cpu-moe -ngl 0 -c 512 -n 64 --seed 123 --temp 0 --top-k 1 --simple-io --no-display-prompt -p "$GEN_PROMPT" || true
-
-if [[ "$WITH_GPU" == "1" ]]; then
-    # CUDA sanity perplexity: chunks=1 (mainline -> ik)
-    run_cmd "gpu_ppl_chunks1_mainline" \
-        env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-perplexity" \
-            -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 1 --no-warmup -ngl 1 || true
-    run_cmd "gpu_ppl_chunks1_ik" \
-        env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-perplexity" \
-            -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 1 --no-warmup -ngl 1 || true
-
-    # Quick sweep sanity (mainline -> ik)
-    if [[ -x "$MAIN_BIN/llama-sweep-bench" ]]; then
-        if env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-sweep-bench" --help >/dev/null 2>&1; then
-            run_cmd "gpu_sweep_mainline" \
-                env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-sweep-bench" \
-                    -m "$MODEL" --cpu-moe -ngl 999 -c "$SWEEP_CTX" -b 1024 -ub 128 -n "$SWEEP_N" -ctk f16 -ctv f16 || true
-        else
-            printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_mainline" >> "$STATUS_FILE"
-            log "SKIP: gpu_sweep_mainline (binary cannot start with current runtime deps)"
-        fi
-    else
-        printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_mainline" >> "$STATUS_FILE"
-        log "SKIP: gpu_sweep_mainline (missing $MAIN_BIN/llama-sweep-bench)"
-    fi
-    if [[ -x "$IK_BIN/llama-sweep-bench" ]]; then
-        if env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-sweep-bench" --help >/dev/null 2>&1; then
-            run_cmd "gpu_sweep_ik" \
-                env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-sweep-bench" \
-                    -m "$MODEL" --cpu-moe -ngl 999 -c "$SWEEP_CTX" -b 1024 -ub 128 -n "$SWEEP_N" -ctk f16 -ctv f16 || true
-        else
-            printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_ik" >> "$STATUS_FILE"
-            log "SKIP: gpu_sweep_ik (binary cannot start with current runtime deps)"
-        fi
-    else
-        printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_ik" >> "$STATUS_FILE"
-        log "SKIP: gpu_sweep_ik (missing $IK_BIN/llama-sweep-bench)"
-    fi
-fi
-
-if [[ "$WITH_FUSED_REGRESSION" == "1" ]]; then
-    if [[ "$WITH_GPU" != "1" ]]; then
-        printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "ik_fused_regression" >> "$STATUS_FILE"
-    elif [[ ! -x "/ik/scripts/qwen3next-fused-regression.sh" ]]; then
-        printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "ik_fused_regression" >> "$STATUS_FILE"
-        log "SKIP: ik_fused_regression (missing /ik/scripts/qwen3next-fused-regression.sh)"
-    else
-        run_cmd "ik_fused_regression" \
-            env LD_LIBRARY_PATH="$IK_LD" /ik/scripts/qwen3next-fused-regression.sh \
-                --model "$MODEL" \
-                --bin "$IK_BIN/llama-perplexity" \
-                --out /out/ik_fused_regression.md \
-                --cuda-device "$GPU_DEVICE" \
-                --threads 8 \
-                --ctx 2048 \
-                --fa on \
-                --ngl 47 \
-                --n-cpu-moe 40 \
-                --chunks 1 \
-                --decode-b 1 \
-                --decode-ub 1 \
-                --prefill-b 2048 \
-                --prefill-ub 512 || true
-    fi
-fi
-
-# Aggregate summary
-cpu_c1_main="$(extract_ppl /out/cpu_ppl_chunks1_mainline.out /out/cpu_ppl_chunks1_mainline.err)"
-cpu_c1_ik="$(extract_ppl /out/cpu_ppl_chunks1_ik.out /out/cpu_ppl_chunks1_ik.err)"
-cpu_c2_main="$(extract_ppl /out/cpu_ppl_chunks2_mainline.out /out/cpu_ppl_chunks2_mainline.err)"
-cpu_c2_ik="$(extract_ppl /out/cpu_ppl_chunks2_ik.out /out/cpu_ppl_chunks2_ik.err)"
-
-cpu_c1_delta="NA"
-cpu_c2_delta="NA"
-if [[ "$cpu_c1_main" != "NA" && "$cpu_c1_ik" != "NA" ]]; then
-    cpu_c1_delta="$(abs_delta "$cpu_c1_main" "$cpu_c1_ik")"
-fi
-if [[ "$cpu_c2_main" != "NA" && "$cpu_c2_ik" != "NA" ]]; then
-    cpu_c2_delta="$(abs_delta "$cpu_c2_main" "$cpu_c2_ik")"
-fi
-
-main_has_fib="$(has_token /out/cpu_gen_mainline.out 'fibonacci|fibs|fib')"
-ik_has_fib="$(has_token /out/cpu_gen_ik.out 'fibonacci|fibs|fib')"
-main_has_complexity="$(has_token /out/cpu_gen_mainline.out 'complexity|O\(')"
-ik_has_complexity="$(has_token /out/cpu_gen_ik.out 'complexity|O\(')"
-fused_decode_safe="NA"
-fused_prefill_safe="NA"
-fused_mode0_decode_sane="NA"
-fused_mode0_prefill_sane="NA"
-if [[ -f /out/ik_fused_regression.md ]]; then
-    fused_decode_safe="$(sed -nE 's/^- decode safety .*: `([^`]+)`.*/\1/p' /out/ik_fused_regression.md | tail -n1 || true)"
-    fused_prefill_safe="$(sed -nE 's/^- prefill safety .*: `([^`]+)`.*/\1/p' /out/ik_fused_regression.md | tail -n1 || true)"
-    fused_mode0_decode_sane="$(sed -nE 's/^- mode0 decode sanity: `([^`]+)`.*/\1/p' /out/ik_fused_regression.md | tail -n1 || true)"
-    fused_mode0_prefill_sane="$(sed -nE 's/^- mode0 prefill sanity: `([^`]+)`.*/\1/p' /out/ik_fused_regression.md | tail -n1 || true)"
-    if [[ -z "$fused_decode_safe" ]]; then fused_decode_safe="NA"; fi
-    if [[ -z "$fused_prefill_safe" ]]; then fused_prefill_safe="NA"; fi
-    if [[ -z "$fused_mode0_decode_sane" ]]; then fused_mode0_decode_sane="NA"; fi
-    if [[ -z "$fused_mode0_prefill_sane" ]]; then fused_mode0_prefill_sane="NA"; fi
-fi
-
-{
-    echo "# Qwen3Next Eval Summary"
-    echo
-    echo "Mode: $( [[ "$WITH_GPU" == "1" ]] && echo "CPU+GPU" || echo "CPU-only" )"
-    echo "- Sweep config: c=\`$SWEEP_CTX\`, n=\`$SWEEP_N\`"
-    echo
-    echo "## CPU Perplexity"
-    echo "- chunks=1 mainline: \`$cpu_c1_main\`"
-    echo "- chunks=1 ik: \`$cpu_c1_ik\`"
-    echo "- chunks=1 |delta|: \`$cpu_c1_delta\`"
-    echo "- chunks=2 mainline: \`$cpu_c2_main\`"
-    echo "- chunks=2 ik: \`$cpu_c2_ik\`"
-    echo "- chunks=2 |delta|: \`$cpu_c2_delta\`"
-    echo
-    echo "## CPU Short Generation Smoke"
-    echo "- mainline has Fibonacci token(s): \`$main_has_fib\`"
-    echo "- ik has Fibonacci token(s): \`$ik_has_fib\`"
-    echo "- mainline has complexity token(s): \`$main_has_complexity\`"
-    echo "- ik has complexity token(s): \`$ik_has_complexity\`"
-    echo
-    echo "## IK Fused Delta Regression"
-    if [[ "$WITH_FUSED_REGRESSION" == "1" ]]; then
-        if [[ -f /out/ik_fused_regression.md ]]; then
-            echo "- decode safety (mode1 ~= mode0): \`$fused_decode_safe\`"
-            echo "- prefill safety (mode1 ~= mode0): \`$fused_prefill_safe\`"
-            echo "- mode0 decode sanity: \`$fused_mode0_decode_sane\`"
-            echo "- mode0 prefill sanity: \`$fused_mode0_prefill_sane\`"
-            echo "- report: \`/out/ik_fused_regression.md\`"
-        else
-            echo "- status: \`requested but no report generated\`"
-        fi
-    else
-        echo "- status: \`not requested\`"
-    fi
-    echo
-    echo "## Command Status + Memory"
-    echo '```'
-    cat "$STATUS_FILE"
-    echo '```'
-    echo
-    echo "## First Non-empty Lines (Generation)"
-    echo "### mainline"
-    awk 'NF { print; c++; if (c == 20) exit }' /out/cpu_gen_mainline.out
-    echo
-    echo "### ik"
-    awk 'NF { print; c++; if (c == 20) exit }' /out/cpu_gen_ik.out
-} > /out/SUMMARY.md
-
-log "Summary written to /out/SUMMARY.md"
-BASH
-
-chmod +x "${out_dir}/run_inside.sh"
-
-docker_cmd=(
-    docker run --rm
-    -e WITH_GPU="${WITH_GPU}"
-    -e GPU_DEVICE="${GPU_DEVICE}"
-    -e SWEEP_CTX="${SWEEP_CTX}"
-    -e SWEEP_N="${SWEEP_N}"
-    -e WITH_FUSED_REGRESSION="${WITH_FUSED_REGRESSION}"
-    -e MAIN_BUILD_DIR="${MAIN_BUILD_DIR}"
-    -e IK_BUILD_DIR="${IK_BUILD_DIR}"
-    -v "${MAIN_REPO}:/mainline"
-    -v "${IK_REPO}:/ik"
-    -v "${MODEL_HOST}:/model.gguf:ro"
-    -v "${out_dir}:/out"
-)
-
-if [[ "$WITH_GPU" -eq 1 ]]; then
-    docker_cmd+=(--gpus all)
-fi
-
-docker_cmd+=("${IMAGE}" /bin/bash /out/run_inside.sh)
-
-echo "Running eval in container: ${IMAGE}"
-echo "Output directory: ${out_dir}"
-"${docker_cmd[@]}"
-
-echo
-echo "Done. Summary:"
-echo "  ${out_dir}/SUMMARY.md"
-echo "Raw logs:"
-echo "  ${out_dir}/*.out"
-echo "  ${out_dir}/*.err"
diff --git a/scripts/qwen3next-fused-regression.sh b/scripts/qwen3next-fused-regression.sh
deleted file mode 100755
index b3a0042b..00000000
--- a/scripts/qwen3next-fused-regression.sh
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-BIN="${BIN:-./build/bin/llama-perplexity}"
-MODEL="${MODEL:-}"
-INPUT_FILE="${INPUT_FILE:-/tmp/qwen3next_fused_regression_input.txt}"
-OUT_FILE="${OUT_FILE:-/tmp/qwen3next_fused_regression_$(date +%Y%m%d_%H%M%S).md}"
-
-CUDA_DEVICE="${CUDA_DEVICE:-0}"
-THREADS="${THREADS:-8}"
-CTX="${CTX:-2048}"
-FA="${FA:-on}"
-NGL="${NGL:-47}"
-N_CPU_MOE="${N_CPU_MOE:-40}"
-CHUNKS="${CHUNKS:-1}"
-
-DECODE_B="${DECODE_B:-1}"
-DECODE_UB="${DECODE_UB:-1}"
-PREFILL_B="${PREFILL_B:-2048}"
-PREFILL_UB="${PREFILL_UB:-512}"
-
-# Mandatory safety checks:
-# 1) mode=1 decode should stay aligned with mode=0 decode.
-# 2) mode=1 prefill should stay aligned with mode=0 prefill.
-MAX_DECODE_DELTA_01="${MAX_DECODE_DELTA_01:-0.10}"
-MAX_PREFILL_DELTA_01="${MAX_PREFILL_DELTA_01:-0.10}"
-# 3) mode=0 absolute perplexity should stay in a sane range.
-MAX_MODE0_DECODE_PPL="${MAX_MODE0_DECODE_PPL:-10.0}"
-MAX_MODE0_PREFILL_PPL="${MAX_MODE0_PREFILL_PPL:-10.0}"
-
-usage() {
-    cat <<'USAGE'
-Usage:
-  scripts/qwen3next-fused-regression.sh --model /path/to/model.gguf [options]
-
-Options:
-  --model PATH             GGUF model path (required)
-  --bin PATH               llama-perplexity binary (default: ./build/bin/llama-perplexity)
-  --input PATH             input text file; auto-generated if missing
-  --out PATH               markdown output file
-  --cuda-device ID         CUDA_VISIBLE_DEVICES value (default: 0)
-  --threads N              -t value (default: 8)
-  --ctx N                  -c value (default: 2048)
-  --fa on|off              -fa value (default: on)
-  --ngl N                  -ngl value (default: 47)
-  --n-cpu-moe N            --n-cpu-moe value (default: 40)
-  --chunks N               --chunks value (default: 1)
-  --decode-b N             decode batch size (default: 1)
-  --decode-ub N            decode ubatch size (default: 1)
-  --prefill-b N            prefill batch size (default: 2048)
-  --prefill-ub N           prefill ubatch size (default: 512)
-  --max-decode-delta-01 X  fail threshold for |PPL(mode1)-PPL(mode0)| in decode (default: 0.10)
-  --max-prefill-delta-01 X fail threshold for |PPL(mode1)-PPL(mode0)| in prefill (default: 0.10)
-  --max-mode0-decode-ppl X fail threshold for PPL(mode0) in decode (default: 10.0)
-  --max-mode0-prefill-ppl X fail threshold for PPL(mode0) in prefill (default: 10.0)
-  -h, --help               show this help
-USAGE
-}
-
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --model) MODEL="$2"; shift 2 ;;
-        --bin) BIN="$2"; shift 2 ;;
-        --input) INPUT_FILE="$2"; shift 2 ;;
-        --out) OUT_FILE="$2"; shift 2 ;;
-        --cuda-device) CUDA_DEVICE="$2"; shift 2 ;;
-        --threads) THREADS="$2"; shift 2 ;;
-        --ctx) CTX="$2"; shift 2 ;;
-        --fa) FA="$2"; shift 2 ;;
-        --ngl) NGL="$2"; shift 2 ;;
-        --n-cpu-moe) N_CPU_MOE="$2"; shift 2 ;;
-        --chunks) CHUNKS="$2"; shift 2 ;;
-        --decode-b) DECODE_B="$2"; shift 2 ;;
-        --decode-ub) DECODE_UB="$2"; shift 2 ;;
-        --prefill-b) PREFILL_B="$2"; shift 2 ;;
-        --prefill-ub) PREFILL_UB="$2"; shift 2 ;;
-        --max-decode-delta-01) MAX_DECODE_DELTA_01="$2"; shift 2 ;;
-        --max-prefill-delta-01) MAX_PREFILL_DELTA_01="$2"; shift 2 ;;
-        --max-mode0-decode-ppl) MAX_MODE0_DECODE_PPL="$2"; shift 2 ;;
-        --max-mode0-prefill-ppl) MAX_MODE0_PREFILL_PPL="$2"; shift 2 ;;
-        -h|--help) usage; exit 0 ;;
-        *)
-            echo "unknown option: $1" >&2
-            usage
-            exit 2
-            ;;
-    esac
-done
-
-if [[ -z "$MODEL" ]]; then
-    echo "--model is required" >&2
-    exit 2
-fi
-if [[ ! -x "$BIN" ]]; then
-    echo "binary not executable: $BIN" >&2
-    exit 1
-fi
-if [[ ! -f "$MODEL" ]]; then
-    echo "model not found: $MODEL" >&2
-    exit 1
-fi
-
-if [[ ! -f "$INPUT_FILE" ]]; then
-    cat > "$INPUT_FILE" <<'TXT'
-Regression text for Qwen3Next fused DeltaNet checks.
-This text is deterministic and intentionally repetitive.
-TXT
-    # Keep this comfortably above 2*ctx tokenization requirements used by llama-perplexity.
-    for _ in $(seq 1 900); do
-        echo "the model should keep stable perplexity under consistent settings" >> "$INPUT_FILE"
-    done
-fi
-
-log_dir="${OUT_FILE}.logs"
-mkdir -p "$log_dir"
-
-extract_ppl() {
-    local file="$1"
-    local line val
-    line="$(grep -E 'Final estimate:' "$file" | tail -n1 || true)"
-    if [[ -z "$line" ]]; then
-        echo "NA"
-        return
-    fi
-    val="$(echo "$line" | sed -nE 's/.*= ([0-9]+\.[0-9]+).*/\1/p')"
-    if [[ -z "$val" ]]; then
-        val="$(echo "$line" | grep -Eo '[0-9]+\.[0-9]+' | head -n1 || true)"
-    fi
-    if [[ -z "$val" ]]; then
-        echo "NA"
-    else
-        echo "$val"
-    fi
-}
-
-abs_delta() {
-    awk -v a="$1" -v b="$2" 'BEGIN { d = a - b; if (d < 0) d = -d; printf "%.6f", d }'
-}
-
-run_ppl() {
-    local mode="$1"
-    local b="$2"
-    local ub="$3"
-    local label="$4"
-    local log="${log_dir}/${label}_m${mode}.log"
-
-    echo "running ${label} mode=${mode} (b=${b} ub=${ub})" >&2
-    CUDA_VISIBLE_DEVICES="$CUDA_DEVICE" \
-    LLAMA_QWEN3NEXT_FUSED_DELTA="$mode" \
-    "$BIN" -m "$MODEL" -f "$INPUT_FILE" \
-        -c "$CTX" -b "$b" -ub "$ub" -t "$THREADS" \
-        -fa "$FA" -ngl "$NGL" --n-cpu-moe "$N_CPU_MOE" \
-        --chunks "$CHUNKS" --no-warmup >"$log" 2>&1
-
-    extract_ppl "$log"
-}
-
-decode_0="$(run_ppl 0 "$DECODE_B" "$DECODE_UB" decode)"
-decode_1="$(run_ppl 1 "$DECODE_B" "$DECODE_UB" decode)"
-decode_2="$(run_ppl 2 "$DECODE_B" "$DECODE_UB" decode)"
-
-prefill_0="$(run_ppl 0 "$PREFILL_B" "$PREFILL_UB" prefill)"
-prefill_1="$(run_ppl 1 "$PREFILL_B" "$PREFILL_UB" prefill)"
-prefill_2="$(run_ppl 2 "$PREFILL_B" "$PREFILL_UB" prefill)"
-
-if [[ "$decode_0" == "NA" || "$decode_1" == "NA" || "$decode_2" == "NA" || \
-      "$prefill_0" == "NA" || "$prefill_1" == "NA" || "$prefill_2" == "NA" ]]; then
-    echo "failed to extract one or more perplexity values; see logs in ${log_dir}" >&2
-    exit 1
-fi
-
-decode_delta_01="$(abs_delta "$decode_0" "$decode_1")"
-decode_delta_02="$(abs_delta "$decode_0" "$decode_2")"
-prefill_delta_01="$(abs_delta "$prefill_0" "$prefill_1")"
-prefill_delta_02="$(abs_delta "$prefill_0" "$prefill_2")"
-
-decode_ok="$(awk -v d="$decode_delta_01" -v t="$MAX_DECODE_DELTA_01" 'BEGIN { print(d <= t ? "yes" : "no") }')"
-prefill_ok="$(awk -v d="$prefill_delta_01" -v t="$MAX_PREFILL_DELTA_01" 'BEGIN { print(d <= t ? "yes" : "no") }')"
-mode0_decode_ok="$(awk -v p="$decode_0" -v t="$MAX_MODE0_DECODE_PPL" 'BEGIN { print(p <= t ? "yes" : "no") }')"
-mode0_prefill_ok="$(awk -v p="$prefill_0" -v t="$MAX_MODE0_PREFILL_PPL" 'BEGIN { print(p <= t ? "yes" : "no") }')"
-
-{
-    echo "# Qwen3Next Fused DeltaNet Regression Report"
-    echo
-    echo "- date: \`$(date -u +%Y-%m-%dT%H:%M:%SZ)\`"
-    echo "- bin: \`$BIN\`"
-    echo "- model: \`$MODEL\`"
-    echo "- input: \`$INPUT_FILE\`"
-    echo "- cuda_device: \`$CUDA_DEVICE\`"
-    echo "- ctx: \`$CTX\`"
-    echo "- fa: \`$FA\`"
-    echo "- ngl: \`$NGL\`"
-    echo "- n_cpu_moe: \`$N_CPU_MOE\`"
-    echo "- chunks: \`$CHUNKS\`"
-    echo
-    echo "## Perplexity"
-    echo
-    echo "| Path | mode=0 | mode=1 | mode=2 | |delta|(1-0) | |delta|(2-0) |"
-    echo "|---|---:|---:|---:|---:|---:|"
-    echo "| decode (b=${DECODE_B},ub=${DECODE_UB}) | ${decode_0} | ${decode_1} | ${decode_2} | ${decode_delta_01} | ${decode_delta_02} |"
-    echo "| prefill (b=${PREFILL_B},ub=${PREFILL_UB}) | ${prefill_0} | ${prefill_1} | ${prefill_2} | ${prefill_delta_01} | ${prefill_delta_02} |"
-    echo
-    echo "## Safety Checks"
-    echo
-    echo "- decode safety (mode1 ~= mode0): \`${decode_ok}\` (threshold \`${MAX_DECODE_DELTA_01}\`)"
-    echo "- prefill safety (mode1 ~= mode0): \`${prefill_ok}\` (threshold \`${MAX_PREFILL_DELTA_01}\`)"
-    echo "- mode0 decode sanity: \`${mode0_decode_ok}\` (PPL \`${decode_0}\`, max \`${MAX_MODE0_DECODE_PPL}\`)"
-    echo "- mode0 prefill sanity: \`${mode0_prefill_ok}\` (PPL \`${prefill_0}\`, max \`${MAX_MODE0_PREFILL_PPL}\`)"
-    echo
-    echo "## Logs"
-    echo
-    echo "- raw logs dir: \`${log_dir}\`"
-    echo "- decode mode0: \`${log_dir}/decode_m0.log\`"
-    echo "- decode mode1: \`${log_dir}/decode_m1.log\`"
-    echo "- decode mode2: \`${log_dir}/decode_m2.log\`"
-    echo "- prefill mode0: \`${log_dir}/prefill_m0.log\`"
-    echo "- prefill mode1: \`${log_dir}/prefill_m1.log\`"
-    echo "- prefill mode2: \`${log_dir}/prefill_m2.log\`"
-} > "$OUT_FILE"
-
-echo "wrote report: $OUT_FILE"
-
-if [[ "$decode_ok" != "yes" || "$prefill_ok" != "yes" || "$mode0_decode_ok" != "yes" || "$mode0_prefill_ok" != "yes" ]]; then
-    echo "regression check failed; see report: $OUT_FILE" >&2
-    exit 1
-fi
-
-echo "regression check passed"
diff --git a/scripts/qwen3next-regression.sh b/scripts/qwen3next-regression.sh
deleted file mode 100755
index d6649248..00000000
--- a/scripts/qwen3next-regression.sh
+++ /dev/null
@@ -1,380 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-IMAGE="${IMAGE:-nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04}"
-IK_REPO="${IK_REPO:-/home/yurko/Code/ik_llama.cpp}"
-IK_BUILD_DIR="${IK_BUILD_DIR:-build}"
-MODEL_HOST="${MODEL_HOST:-/home/yurko/.cache/llama.cpp/qwen3-next-coder.gguf}"
-OUT_ROOT="${OUT_ROOT:-/tmp/qwen3next-regression}"
-GPU_DEVICE="${GPU_DEVICE:-0}"
-
-THREADS="${THREADS:-8}"
-FA="${FA:-on}"
-NGL="${NGL:-999}"
-
-PROXY_CTX="${PROXY_CTX:-8192}"
-PROXY_B="${PROXY_B:-3072}"
-PROXY_UB="${PROXY_UB:-768}"
-PROXY_N="${PROXY_N:-128}"
-PROXY_N_CPU_MOE="${PROXY_N_CPU_MOE:-40}"
-
-REG_CTX="${REG_CTX:-2048}"
-REG_NGL="${REG_NGL:-47}"
-REG_DECODE_B="${REG_DECODE_B:-1}"
-REG_DECODE_UB="${REG_DECODE_UB:-1}"
-REG_PREFILL_B="${REG_PREFILL_B:-2048}"
-REG_PREFILL_UB="${REG_PREFILL_UB:-512}"
-
-WITH_FIT=1
-FIT_CTX="${FIT_CTX:-65536}"
-FIT_N_CPU_MOE="${FIT_N_CPU_MOE:-47}"
-FIT_N="${FIT_N:-1}"
-
-usage() {
-    cat <<'USAGE'
-Usage:
-  scripts/qwen3next-regression.sh [options]
-
-Options:
-  --image IMAGE              Docker image to run checks in (default: nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04)
-  --ik-repo PATH             ik repo path (default: /home/yurko/Code/ik_llama.cpp)
-  --ik-build-dir NAME        Build dir under ik repo (default: build)
-  --model PATH               Host path to model GGUF file
-  --out-root PATH            Output root directory (default: /tmp/qwen3next-regression)
-  --gpu-device ID            CUDA device id (default: 0)
-  --threads N                Threads (default: 8)
-  --fa on|off                Flash attention mode (default: on)
-  --ngl N                    -ngl value (default: 999)
-
-  --proxy-ctx N              Proxy sweep context (default: 8192)
-  --proxy-b N                Proxy sweep batch size (default: 3072)
-  --proxy-ub N               Proxy sweep ubatch size (default: 768)
-  --proxy-n N                Proxy sweep generation tokens (default: 128)
-  --proxy-n-cpu-moe N        Proxy sweep --n-cpu-moe (default: 40)
-
-  --reg-ctx N                Fused regression context (default: 2048)
-  --reg-ngl N                Fused regression -ngl (default: 47)
-  --reg-decode-b N           Fused regression decode b (default: 1)
-  --reg-decode-ub N          Fused regression decode ub (default: 1)
-  --reg-prefill-b N          Fused regression prefill b (default: 2048)
-  --reg-prefill-ub N         Fused regression prefill ub (default: 512)
-
-  --fit-ctx N                Long-context fit sanity context (default: 65536)
-  --fit-n-cpu-moe N          Long-context fit sanity --n-cpu-moe (default: 47)
-  --fit-n N                  Long-context fit sanity generation tokens (default: 1)
-  --no-fit                   Skip long-context fit sanity
-  -h, --help                 Show this help
-
-Runs:
-  1) Fused-delta regression guard (mode0/mode1/mode2 + sanity thresholds)
-  2) Single-GPU proxy sweep benchmark
-  3) Optional long-context fit sanity
-USAGE
-}
-
-while [[ $# -gt 0 ]]; do
-    case "$1" in
-        --image) IMAGE="$2"; shift 2 ;;
-        --ik-repo) IK_REPO="$2"; shift 2 ;;
-        --ik-build-dir) IK_BUILD_DIR="$2"; shift 2 ;;
-        --model) MODEL_HOST="$2"; shift 2 ;;
-        --out-root) OUT_ROOT="$2"; shift 2 ;;
-        --gpu-device) GPU_DEVICE="$2"; shift 2 ;;
-        --threads) THREADS="$2"; shift 2 ;;
-        --fa) FA="$2"; shift 2 ;;
-        --ngl) NGL="$2"; shift 2 ;;
-        --proxy-ctx) PROXY_CTX="$2"; shift 2 ;;
-        --proxy-b) PROXY_B="$2"; shift 2 ;;
-        --proxy-ub) PROXY_UB="$2"; shift 2 ;;
-        --proxy-n) PROXY_N="$2"; shift 2 ;;
-        --proxy-n-cpu-moe) PROXY_N_CPU_MOE="$2"; shift 2 ;;
-        --reg-ctx) REG_CTX="$2"; shift 2 ;;
-        --reg-ngl) REG_NGL="$2"; shift 2 ;;
-        --reg-decode-b) REG_DECODE_B="$2"; shift 2 ;;
-        --reg-decode-ub) REG_DECODE_UB="$2"; shift 2 ;;
-        --reg-prefill-b) REG_PREFILL_B="$2"; shift 2 ;;
-        --reg-prefill-ub) REG_PREFILL_UB="$2"; shift 2 ;;
-        --fit-ctx) FIT_CTX="$2"; shift 2 ;;
-        --fit-n-cpu-moe) FIT_N_CPU_MOE="$2"; shift 2 ;;
-        --fit-n) FIT_N="$2"; shift 2 ;;
-        --no-fit) WITH_FIT=0; shift ;;
-        -h|--help) usage; exit 0 ;;
-        *)
-            echo "Unknown option: $1" >&2
-            usage
-            exit 2
-            ;;
-    esac
-done
-
-if [[ ! -d "$IK_REPO" ]]; then
-    echo "ik repo does not exist: $IK_REPO" >&2
-    exit 1
-fi
-if [[ ! -f "$MODEL_HOST" ]]; then
-    echo "Model file does not exist: $MODEL_HOST" >&2
-    exit 1
-fi
-
-run_id="$(date +%Y%m%d_%H%M%S)"
-out_dir="${OUT_ROOT%/}/${run_id}"
-mkdir -p "$out_dir"
-
-cat > "${out_dir}/run_inside.sh" <<'BASH'
-#!/usr/bin/env bash
-set -euo pipefail
-
-IK_BUILD_DIR="${IK_BUILD_DIR:-build}"
-GPU_DEVICE="${GPU_DEVICE:-0}"
-THREADS="${THREADS:-8}"
-FA="${FA:-on}"
-NGL="${NGL:-999}"
-
-PROXY_CTX="${PROXY_CTX:-8192}"
-PROXY_B="${PROXY_B:-3072}"
-PROXY_UB="${PROXY_UB:-768}"
-PROXY_N="${PROXY_N:-128}"
-PROXY_N_CPU_MOE="${PROXY_N_CPU_MOE:-40}"
-
-REG_CTX="${REG_CTX:-2048}"
-REG_NGL="${REG_NGL:-47}"
-REG_DECODE_B="${REG_DECODE_B:-1}"
-REG_DECODE_UB="${REG_DECODE_UB:-1}"
-REG_PREFILL_B="${REG_PREFILL_B:-2048}"
-REG_PREFILL_UB="${REG_PREFILL_UB:-512}"
-
-WITH_FIT="${WITH_FIT:-1}"
-FIT_CTX="${FIT_CTX:-65536}"
-FIT_N_CPU_MOE="${FIT_N_CPU_MOE:-47}"
-FIT_N="${FIT_N:-1}"
-
-IK_BIN="/ik/${IK_BUILD_DIR}/bin"
-IK_LD="/ik/${IK_BUILD_DIR}/bin:/ik/${IK_BUILD_DIR}/src:/ik/${IK_BUILD_DIR}/ggml/src:/ik/${IK_BUILD_DIR}/examples/mtmd"
-MODEL="/model.gguf"
-
-RUN_LOG="/out/run.log"
-STATUS_FILE="/out/status.tsv"
-
-touch "$RUN_LOG"
-printf "name\tstatus\texit_code\n" > "$STATUS_FILE"
-
-log() {
-    local msg="$1"
-    printf "[%s] %s\n" "$(date +%H:%M:%S)" "$msg" | tee -a "$RUN_LOG"
-}
-
-run_cmd() {
-    local name="$1"
-    shift
-    local out_file="/out/${name}.out"
-    local err_file="/out/${name}.err"
-    local ec
-
-    log "RUN: $name"
-    set +e
-    "$@" >"$out_file" 2>"$err_file"
-    ec=$?
-    set -e
-
-    if [[ $ec -eq 0 ]]; then
-        printf "%s\tOK\t0\n" "$name" >> "$STATUS_FILE"
-        log "OK: $name"
-    else
-        printf "%s\tFAIL\t%d\n" "$name" "$ec" >> "$STATUS_FILE"
-        log "FAIL($ec): $name"
-    fi
-    return $ec
-}
-
-require_bin() {
-    local path="$1"
-    if [[ ! -x "$path" ]]; then
-        log "MISSING: $path"
-        exit 1
-    fi
-}
-
-extract_best_metric() {
-    local out_file="$1"
-    local err_file="$2"
-    local col="$3"
-    awk -F'|' -v c="$col" '
-        /^\|[[:space:]]*[0-9]+[[:space:]]*\|/ {
-            v = $c
-            gsub(/[[:space:]]/, "", v)
-            if ((v + 0) > best) {
-                best = v + 0
-                row = $0
-            }
-        }
-        END {
-            if (best > 0) {
-                printf "%.2f\t%s\n", best, row
-            } else {
-                print "NA\tNA"
-            }
-        }
-    ' < <(cat "$out_file" "$err_file")
-}
-
-require_bin "$IK_BIN/llama-perplexity"
-require_bin "$IK_BIN/llama-sweep-bench"
-require_bin "$IK_BIN/llama-cli"
-require_bin "/ik/scripts/qwen3next-fused-regression.sh"
-
-export CUDA_VISIBLE_DEVICES="$GPU_DEVICE"
-log "GPU checks on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-
-run_cmd "fused_regression" \
-    env LD_LIBRARY_PATH="$IK_LD" /ik/scripts/qwen3next-fused-regression.sh \
-        --model "$MODEL" \
-        --bin "$IK_BIN/llama-perplexity" \
-        --out /out/fused_regression.md \
-        --cuda-device "$GPU_DEVICE" \
-        --threads "$THREADS" \
-        --ctx "$REG_CTX" \
-        --fa "$FA" \
-        --ngl "$REG_NGL" \
-        --n-cpu-moe "$PROXY_N_CPU_MOE" \
-        --chunks 1 \
-        --decode-b "$REG_DECODE_B" \
-        --decode-ub "$REG_DECODE_UB" \
-        --prefill-b "$REG_PREFILL_B" \
-        --prefill-ub "$REG_PREFILL_UB" || true
-
-run_cmd "proxy_sweep" \
-    env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-sweep-bench" \
-        -m "$MODEL" \
-        -c "$PROXY_CTX" \
-        -b "$PROXY_B" \
-        -ub "$PROXY_UB" \
-        -n "$PROXY_N" \
-        -t "$THREADS" \
-        -fa "$FA" \
-        --jinja \
-        -ngl "$NGL" \
-        --n-cpu-moe "$PROXY_N_CPU_MOE" \
-        -rtr \
-        --temp 1 \
-        --top-p 0.95 \
-        --top-k 40 \
-        --min-p 0.01 || true
-
-if [[ "$WITH_FIT" == "1" ]]; then
-    run_cmd "fit_sanity" \
-        env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-cli" \
-            -m "$MODEL" \
-            -c "$FIT_CTX" \
-            -n "$FIT_N" \
-            -t "$THREADS" \
-            -fa "$FA" \
-            -ngl "$NGL" \
-            --n-cpu-moe "$FIT_N_CPU_MOE" \
-            -rtr \
-            --temp 0 \
-            --top-k 1 \
-            --simple-io \
-            --no-display-prompt \
-            -p "ping" || true
-else
-    printf "%s\tSKIP\t0\n" "fit_sanity" >> "$STATUS_FILE"
-    log "SKIP: fit_sanity"
-fi
-
-fused_decode_safe="NA"
-fused_prefill_safe="NA"
-fused_mode0_decode_sane="NA"
-fused_mode0_prefill_sane="NA"
-if [[ -f /out/fused_regression.md ]]; then
-    fused_decode_safe="$(sed -nE 's/^- decode safety .*: `([^`]+)`.*/\1/p' /out/fused_regression.md | tail -n1 || true)"
-    fused_prefill_safe="$(sed -nE 's/^- prefill safety .*: `([^`]+)`.*/\1/p' /out/fused_regression.md | tail -n1 || true)"
-    fused_mode0_decode_sane="$(sed -nE 's/^- mode0 decode sanity: `([^`]+)`.*/\1/p' /out/fused_regression.md | tail -n1 || true)"
-    fused_mode0_prefill_sane="$(sed -nE 's/^- mode0 prefill sanity: `([^`]+)`.*/\1/p' /out/fused_regression.md | tail -n1 || true)"
-    if [[ -z "$fused_decode_safe" ]]; then fused_decode_safe="NA"; fi
-    if [[ -z "$fused_prefill_safe" ]]; then fused_prefill_safe="NA"; fi
-    if [[ -z "$fused_mode0_decode_sane" ]]; then fused_mode0_decode_sane="NA"; fi
-    if [[ -z "$fused_mode0_prefill_sane" ]]; then fused_mode0_prefill_sane="NA"; fi
-fi
-
-best_pp_tsv="$(extract_best_metric /out/proxy_sweep.out /out/proxy_sweep.err 6)"
-best_tg_tsv="$(extract_best_metric /out/proxy_sweep.out /out/proxy_sweep.err 8)"
-best_pp="${best_pp_tsv%%$'\t'*}"
-best_pp_row="${best_pp_tsv#*$'\t'}"
-best_tg="${best_tg_tsv%%$'\t'*}"
-best_tg_row="${best_tg_tsv#*$'\t'}"
-
-{
-    echo "# Qwen3Next Regression Summary"
-    echo
-    echo "## Fused Regression"
-    echo "- config: \`ctx=${REG_CTX}, decode(b=${REG_DECODE_B},ub=${REG_DECODE_UB}), prefill(b=${REG_PREFILL_B},ub=${REG_PREFILL_UB}), n-cpu-moe=${PROXY_N_CPU_MOE}\`"
-    echo "- decode safety: \`$fused_decode_safe\`"
-    echo "- prefill safety: \`$fused_prefill_safe\`"
-    echo "- mode0 decode sanity: \`$fused_mode0_decode_sane\`"
-    echo "- mode0 prefill sanity: \`$fused_mode0_prefill_sane\`"
-    echo "- report: \`/out/fused_regression.md\`"
-    echo
-    echo "## Proxy Sweep"
-    echo "- config: \`c=${PROXY_CTX}, b=${PROXY_B}, ub=${PROXY_UB}, n=${PROXY_N}, n-cpu-moe=${PROXY_N_CPU_MOE}\`"
-    echo "- best PP t/s: \`$best_pp\`"
-    echo "- best TG t/s: \`$best_tg\`"
-    echo "- best PP row: \`$best_pp_row\`"
-    echo "- best TG row: \`$best_tg_row\`"
-    echo
-    echo "## Long-Context Fit"
-    if [[ "$WITH_FIT" == "1" ]]; then
-        echo "- config: \`c=${FIT_CTX}, n-cpu-moe=${FIT_N_CPU_MOE}, n=${FIT_N}\`"
-        echo "- output: \`/out/fit_sanity.out\`"
-    else
-        echo "- skipped"
-    fi
-    echo
-    echo "## Command Status"
-    echo '```'
-    cat "$STATUS_FILE"
-    echo '```'
-} > /out/SUMMARY.md
-
-log "Summary written to /out/SUMMARY.md"
-BASH
-
-chmod +x "${out_dir}/run_inside.sh"
-
-docker_cmd=(
-    docker run --rm --gpus all
-    -e IK_BUILD_DIR="${IK_BUILD_DIR}"
-    -e GPU_DEVICE="${GPU_DEVICE}"
-    -e THREADS="${THREADS}"
-    -e FA="${FA}"
-    -e NGL="${NGL}"
-    -e PROXY_CTX="${PROXY_CTX}"
-    -e PROXY_B="${PROXY_B}"
-    -e PROXY_UB="${PROXY_UB}"
-    -e PROXY_N="${PROXY_N}"
-    -e PROXY_N_CPU_MOE="${PROXY_N_CPU_MOE}"
-    -e REG_CTX="${REG_CTX}"
-    -e REG_NGL="${REG_NGL}"
-    -e REG_DECODE_B="${REG_DECODE_B}"
-    -e REG_DECODE_UB="${REG_DECODE_UB}"
-    -e REG_PREFILL_B="${REG_PREFILL_B}"
-    -e REG_PREFILL_UB="${REG_PREFILL_UB}"
-    -e WITH_FIT="${WITH_FIT}"
-    -e FIT_CTX="${FIT_CTX}"
-    -e FIT_N_CPU_MOE="${FIT_N_CPU_MOE}"
-    -e FIT_N="${FIT_N}"
-    -v "${IK_REPO}:/ik"
-    -v "${MODEL_HOST}:/model.gguf:ro"
-    -v "${out_dir}:/out"
-    "${IMAGE}" /bin/bash /out/run_inside.sh
-)
-
-echo "Running regression in container: ${IMAGE}"
-echo "Output directory: ${out_dir}"
-"${docker_cmd[@]}"
-
-echo
-echo "Done. Summary:"
-echo "  ${out_dir}/SUMMARY.md"
-echo "Raw logs:"
-echo "  ${out_dir}/*.out"
-echo "  ${out_dir}/*.err"
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index b1c2ea5b..2b953a87 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -4181,14 +4181,12 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
     enum class qwen3next_fused_delta_mode {
         off,
         tok_gt1,
-        all_tokens,
     };
 
     // Keep legacy DeltaNet path as default for correctness.
     // LLAMA_QWEN3NEXT_FUSED_DELTA values:
     //   unset / 0 : off
     //   1         : fused only for n_tok > 1 (safer; avoids known decode regression)
-    //   2         : fused for all token counts (experimental)
     const qwen3next_fused_delta_mode fused_delta_mode = []() {
         const char * env = std::getenv("LLAMA_QWEN3NEXT_FUSED_DELTA");
         if (env == nullptr || env[0] == '\0') {
@@ -4202,19 +4200,10 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
             case 't':
             case 'T':
                 return qwen3next_fused_delta_mode::tok_gt1;
-            case '2':
-                return qwen3next_fused_delta_mode::all_tokens;
             default:
                 return qwen3next_fused_delta_mode::off;
         }
     }();
-    if (fused_delta_mode == qwen3next_fused_delta_mode::all_tokens) {
-        static bool warned_all_tokens = false;
-        if (!warned_all_tokens) {
-            LLAMA_LOG_WARN("%s: LLAMA_QWEN3NEXT_FUSED_DELTA=2 enables fused single-token decode; quality regression is known in this mode\n", __func__);
-            warned_all_tokens = true;
-        }
-    }
 
     auto get_slice_2d = [&](ggml_tensor * t, int64_t c) -> ggml_tensor * {
         return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
@@ -4850,8 +4839,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
 
         std::pair<ggml_tensor *, ggml_tensor *> attn_out;
         const bool use_fused_delta_net =
-            (fused_delta_mode == qwen3next_fused_delta_mode::tok_gt1 && n_tok > 1) ||
-            (fused_delta_mode == qwen3next_fused_delta_mode::all_tokens);
+            fused_delta_mode == qwen3next_fused_delta_mode::tok_gt1 && n_tok > 1;
 
         if (use_fused_delta_net) {
             attn_out = build_delta_net_fused(q_conv, k_conv, v_conv, gate, beta, state, il);
@@ -4935,16 +4923,14 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
     ggml_tensor * causal_mask = nullptr;
     ggml_tensor * identity    = nullptr;
     ggml_tensor * diag_mask   = nullptr;
-    if (fused_delta_mode != qwen3next_fused_delta_mode::all_tokens) {
-        causal_mask = ggml_tri(ctx0,
-                ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
-                GGML_TRI_TYPE_LOWER);
-        identity  = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
-        diag_mask = ggml_add(ctx0, causal_mask, identity);
-        ggml_build_forward_expand(gf, causal_mask);
-        ggml_build_forward_expand(gf, identity);
-        ggml_build_forward_expand(gf, diag_mask);
-    }
+    causal_mask = ggml_tri(ctx0,
+            ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
+            GGML_TRI_TYPE_LOWER);
+    identity  = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
+    diag_mask = ggml_add(ctx0, causal_mask, identity);
+    ggml_build_forward_expand(gf, causal_mask);
+    ggml_build_forward_expand(gf, identity);
+    ggml_build_forward_expand(gf, diag_mask);
 
     ggml_tensor * cur = nullptr;