diff --git a/common/common.cpp b/common/common.cpp index 3a4d9a72..39e95531 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1250,8 +1250,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa if (arg == "--qwen3next-fused-delta") { CHECK_ARG params.qwen3next_fused_delta = std::stoi(argv[i]); - if (params.qwen3next_fused_delta < 0 || params.qwen3next_fused_delta > 2) { - fprintf(stderr, "error: Invalid value for --qwen3next-fused-delta: %d (must be 0, 1, or 2)\n", + if (params.qwen3next_fused_delta < 0 || params.qwen3next_fused_delta > 1) { + fprintf(stderr, "error: Invalid value for --qwen3next-fused-delta: %d (must be 0 or 1)\n", params.qwen3next_fused_delta); invalid_param = true; } @@ -2181,7 +2181,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); options.push_back({ "*", "-no-fa, --no-flash-attn", "disable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); options.push_back({ "*", "-fa, --flash-attn (auto|on|off|0|1)", "set Flash Attention (default: %s)", params.flash_attn ? "on" : "off" }); - options.push_back({ "*", " --qwen3next-fused-delta {0,1,2}", + options.push_back({ "*", " --qwen3next-fused-delta {0,1}", "force LLAMA_QWEN3NEXT_FUSED_DELTA mode for Qwen3Next (default: env/model default)" }); options.push_back({ "*", "-mla, --mla-use", "enable MLA (default: %d)", params.mla_attn }); options.push_back({ "*", "-amb, --attention-max-batch", "max batch size for attention computations (default: %d)", params.attn_max_batch}); diff --git a/common/common.h b/common/common.h index 827fc1bb..a18d45d4 100644 --- a/common/common.h +++ b/common/common.h @@ -259,7 +259,7 @@ struct gpt_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = true; // flash attention - int qwen3next_fused_delta = -1; // -1 keep env/default, otherwise force LLAMA_QWEN3NEXT_FUSED_DELTA={0,1,2} + int qwen3next_fused_delta = -1; // -1 keep env/default, otherwise force LLAMA_QWEN3NEXT_FUSED_DELTA={0,1} int mla_attn = 3; // MLA 0: standard, 1: MLA with K and V^T cache, 2: MLA with just K cache, 3: the best of both worlds int attn_max_batch = 0; // Max batch size to use when computing attention (only applicable if flash_attn = false) bool fused_moe_up_gate = true; // fused up*unary(gate) op for MoE models diff --git a/docs/development/qwen3next_bench_16k_pp16384_tg128.md b/docs/development/qwen3next_bench_16k_pp16384_tg128.md deleted file mode 100644 index 82ede13c..00000000 --- a/docs/development/qwen3next_bench_16k_pp16384_tg128.md +++ /dev/null @@ -1,182 +0,0 @@ -# Qwen3Next Benchmark: PP 16384 / TG 128 (`ik_llama.cpp` vs `llama.cpp`) - -Date: 2026-02-08 - -## Setup - -- Container: `iktest2` -- Model: `/models/qwen3-next-coder.gguf` -- Prompt processing: `-p 16384` -- Token generation: `-n 128` -- Batch settings: `-b 3072 -ub 768` -- Threads: `-t 8` -- Repetitions: `-r 1` -- Mmap: `-mmp 0` - -CUDA runs: - -- `CUDA_VISIBLE_DEVICES=0` -- `-fa 1 -ngl 999 --n-cpu-moe 47` - -CPU-only runs: - -- `-fa 0 -ngl 0 --n-cpu-moe 0` - -Hardware note: - -- GPU0 (bench target): `NVIDIA GeForce RTX 5060 Ti`, `16311 MiB` total (`CUDA_VISIBLE_DEVICES=0` for CUDA runs). -- GPU1 (not used for these runs): `NVIDIA GeForce RTX 3060`, `12288 MiB` total. -- Observed during active `ik` CUDA run (`p=8192,b=2048,ub=512,n-cpu-moe=45`): GPU0 memory used `~12074 MiB` (`~3775 MiB` free), from `nvidia-smi`. - -## Results - -| Build | Backend | PP 16384 (tok/s) | TG 128 (tok/s) | -|---|---|---:|---:| -| `ik_llama.cpp` | CUDA | 207.891304 | 27.263562 | -| `llama.cpp` | CUDA | 185.764649 | 24.145662 | -| `ik_llama.cpp` | CPU-only | 45.739881 | 12.172113 | -| `llama.cpp` | CPU-only | 47.835420 | 6.991398 | - -## Relative (`ik` vs `llama.cpp`) - -- CUDA PP: `+11.91%` -- CUDA TG: `+12.91%` -- CPU PP: `-4.38%` -- CPU TG: `+74.10%` - -## Raw outputs - -- `/tmp/ik_cuda_bench_16k.json` -- `/tmp/mainline_cuda_bench_16k.json` -- `/tmp/ik_cpu_bench_16k.json` -- `/tmp/mainline_cpu_bench_16k.json` - -## Additional CUDA rerun (requested lower `n-cpu-moe` ballpark) - -Adjusted config: - -- `-p 8192 -n 128 -b 2048 -ub 512 -t 8 -fa 1 -ngl 999 -mmp 0` -- single GPU: `CUDA_VISIBLE_DEVICES=0` - -Fit checks on `ik`: - -- `--n-cpu-moe 25` -> fail to load model -- `--n-cpu-moe 40` -> fail to create context -- `--n-cpu-moe 45` -> works - -Working comparison at `--n-cpu-moe 45`: - -| Build | Backend | PP 8192 (tok/s) | TG 128 (tok/s) | -|---|---|---:|---:| -| `ik_llama.cpp` | CUDA | 201.613283 | 24.884600 | -| `llama.cpp` | CUDA | 145.100895 | 24.595058 | - -`ik` rerun with `-rtr 1` at the same config (`--n-cpu-moe 45`): - -| Build | Backend | PP 8192 (tok/s) | TG 128 (tok/s) | -|---|---|---:|---:| -| `ik_llama.cpp` (`-rtr 1`) | CUDA | 232.340508 | 27.895722 | - -## Historical Fused DeltaNet Check (obsolete) - -Date: 2026-02-08 - -Setup: - -- Container: `iktest2` -- Device: `CUDA_VISIBLE_DEVICES=0` (RTX 5060 Ti) -- Common args: `-c 2048 -b 2048 -ub 512 --chunks 1 --no-warmup -ngl 999 --n-cpu-moe 47 -t 8 -fa on` -- Switch under test: `LLAMA_QWEN3NEXT_FUSED_DELTA` - -Results (Wikitext2 sample file `/tmp/ppl_wikitext2_test.txt`): - -| Model | `LLAMA_QWEN3NEXT_FUSED_DELTA=0` | `LLAMA_QWEN3NEXT_FUSED_DELTA=1` | -|---|---:|---:| -| `/models/qwen3-next-coder.gguf` | `PPL 3.9378` | `PPL 15.3628` | -| `/models/qwen-3-coder-next-mxfp4.gguf` | `PPL 3.9860` | `PPL 15.0740` | - -Conclusion: - -- This run is kept for history only and is superseded by the later `Fused DeltaNet Safety Update (Superseding)` section below. -- Use the superseding section as source of truth for mode mapping and quality guidance. - -## Upstream PR #19375 Trial (Selective Port) Outcome - -Date: 2026-02-08 - -What was tried: - -- Ported selected non-fused qwen3next graph changes from `ggml-org/llama.cpp#19375` (broadcast/repeat and autoregressive matmul rewrite), then benchmarked and re-tested perplexity. - -Outcome: - -- No stable speed win in our setup after repeated runs. -- Direct autoregressive rewrite attempts from PR #19375 were not compatible with current ik graph-layout/contiguity assumptions and were reverted. -- Final code keeps only safe chunk-shape fixes plus fused-mode safety controls. - -## Decode-Only Fused Mode Trial (`LLAMA_QWEN3NEXT_FUSED_DELTA=2`) - -Date: 2026-02-08 - -Code change: - -- Added mode `2` for `LLAMA_QWEN3NEXT_FUSED_DELTA`: - - prompt / multi-token path: non-fused - - single-token decode path: fused - -Perplexity validation (`-c 2048`, GPU config as above): - -| Model | `=0` non-fused | `=2` decode-only fused | -|---|---:|---:| -| `/models/qwen3-next-coder.gguf` | `3.9378` | `3.9378` | -| `/models/qwen-3-coder-next-mxfp4.gguf` | `3.9860` | `3.9860` | - -`llama-bench` at `-p 8192 -n 128 -b 2048 -ub 512 -r 3 -rtr 1`: - -| Mode | PP 8192 (tok/s) | TG 128 (tok/s) | -|---|---:|---:| -| `LLAMA_QWEN3NEXT_FUSED_DELTA=0` | `170.090` | `25.465` | -| `LLAMA_QWEN3NEXT_FUSED_DELTA=2` | `166.212` | `29.599` | - -Notes: - -- Decode-only fused mode preserves prompt-quality metrics in this test. -- TG improved significantly in this run; PP variance was higher, so PP delta should be treated as noisy. - -## Fused DeltaNet Safety Update (Superseding) - -Date: 2026-02-08 - -This section supersedes the earlier `LLAMA_QWEN3NEXT_FUSED_DELTA` mode mapping. - -Updated env behavior in `src/llama-build-context.cpp`: - -- `0` / unset: non-fused for all token counts -- `1`: fused only for `n_tok > 1` (prefill/chunking), non-fused for single-token decode -- `2`: fused for all token counts (experimental) - -Reason: - -- Fused path has a known decode-path quality regression when forced on single-token steps. -- The safer default acceleration is therefore prefill-only fused mode (`=1`). - -Validation (CUDA, `qwen3-next-coder.gguf`, `-c 2048 -b 1 -ub 1 -fa on -ngl 47 --n-cpu-moe 40 --chunks 1 --no-warmup`): - -| Mode | PPL | -|---|---:| -| `LLAMA_QWEN3NEXT_FUSED_DELTA=0` | `3.9148 +/- 0.31093` | -| `LLAMA_QWEN3NEXT_FUSED_DELTA=1` | `3.9148 +/- 0.31093` | -| `LLAMA_QWEN3NEXT_FUSED_DELTA=2` | `6.1277 +/- 0.54810` | - -Quick throughput check (`-p 8192 -n 128 -b 2048 -ub 512 -r 1 -rtr 1`, same CUDA settings): - -| Mode | PP 8192 (tok/s) | TG 128 (tok/s) | -|---|---:|---:| -| `0` | `179.30` | `24.69` | -| `1` | `252.12` | `22.99` | -| `2` | `245.71` | `27.94` | - -Interpretation: - -- Use `=1` for production-safe quality with strong PP gain. -- Reserve `=2` for experiments only until decode-path correctness is fixed. diff --git a/docs/development/qwen3next_perf_diff_report.md b/docs/development/qwen3next_perf_diff_report.md deleted file mode 100644 index 919f61fd..00000000 --- a/docs/development/qwen3next_perf_diff_report.md +++ /dev/null @@ -1,165 +0,0 @@ -# Qwen3Next Review and Benchmark Summary (`ik_llama.cpp` vs `llama.cpp`) - -Date: 2026-02-08 - -## Scope - -This document captures: - -- Current upstream PR alignment for Qwen3Next-related work. -- What is already strong in `ik_llama.cpp` and what still needs adjustment. -- Recommended runtime settings for this machine (single GPU target, long context). -- Final apples-to-apples benchmark matrix for `ik_llama.cpp` vs `../llama.cpp`. - -## Upstream PR Check (as of 2026-02-08) - -Reviewed PRs: - -- https://github.com/ggml-org/llama.cpp/pull/18102 (`open`): Delta-Net CUDA op + integration. -- https://github.com/ggml-org/llama.cpp/pull/18792 (`open`): unified DeltaNet handling (`src/models/delta.cpp`). -- https://github.com/ggml-org/llama.cpp/pull/19375 (`open`, `draft`): Qwen3Next graph optimization in model builder. - -### Current alignment in `ik_llama.cpp` - -Already present and/or functionally covered: - -- CUDA DeltaNet op path exists in GGML (`ggml/src/ggml-cuda/delta-net.cu`). -- Solve-tri and backend op support are present for the fused path. -- Qwen3Next fused DeltaNet builder path exists (and is now runtime-toggleable via env). -- Existing ik optimizations remain available (`-rtr`, grouped/fused paths, no-offload-only-active-experts switches). - -Not directly mirrored yet (by design divergence from mainline model layout): - -- Mainline `src/models/delta.cpp` structure from PR #18792. -- Mainline `src/models/qwen3next.cpp` graph-form from PR #19375. - -## Required Adjustments (remaining) - -1. Keep non-fused as the strict safety baseline in defaults, and use `LLAMA_QWEN3NEXT_FUSED_DELTA=1` (prefill-only fused) as the explicit acceleration mode. -2. Continue using `scripts/qwen3next-regression.sh` as the release gate for this model path, and wire it into CI or pre-merge checks. -3. Treat the remaining PR #19375 autoregressive rewrite as deferred: direct porting into current ik graph builder is not layout-compatible without broader contiguity/reshape refactoring. -4. Revisit PR #18792 (`src/models/delta.cpp`) only if we need unified GDA/KDA support for additional architectures; for Qwen3Next-only it is optional. - -## Strong Points of `ik_llama.cpp` to Preserve - -- More runtime controls than mainline for this workload (`-rtr`, backend toggles, MoE/OOAE controls). -- Strong CUDA path for this model family once offload routing is tuned (`--n-cpu-moe` thresholding). -- Better TG throughput than current mainline in matched CUDA and CPU tests on this host. - -## Best Runtime Configuration (this host) - -Model: `/models/qwen3-next-coder.gguf` - -Single-GPU long-context finding: - -- `-c 65536` on GPU0 (16 GB) requires at least `--n-cpu-moe 47` to fit reliably. - -8k sweep proxy (single GPU, tuned path): - -- `b=2048,ub=512` -> `pp8192=142.85`, `tg128=24.81` -- `b=3072,ub=768` -> `pp8192=229.31`, `tg128=27.29` (best) -- `b=4096,ub=1024` -> `pp8192=211.53`, `tg128=23.85` - -Recommended serving baseline: - -- `CUDA_VISIBLE_DEVICES=0` -- `-c 65536 -b 3072 -ub 768 -t 8 -fa on -ngl 999 --n-cpu-moe 47 -rtr --qwen3next-fused-delta 1` - -## Final Benchmark Matrix (8k context proxy) - -All four builds were benchmarked with matched parameters and explicit `-mmp 0` for fairness. - -Common args: - -- `-m /models/qwen3-next-coder.gguf -p 8192 -n 128 -b 3072 -ub 768 -t 8 -r 1` -- CUDA runs: `CUDA_VISIBLE_DEVICES=0 -fa 1 -ngl 999 --n-cpu-moe 47 -mmp 0` -- CPU runs: `-fa 0 -ngl 0 --n-cpu-moe 0 -mmp 0` - -| Build | PP (tok/s) | TG (tok/s) | -|---|---:|---:| -| `ik` CUDA | 204.614 | 28.979 | -| mainline CUDA | 184.521 | 22.012 | -| `ik` CPU | 49.795 | 12.681 | -| mainline CPU | 51.674 | 7.299 | - -Relative (`ik` vs mainline): - -- CUDA PP: `+10.9%` -- CUDA TG: `+31.7%` -- CPU PP: `-3.6%` -- CPU TG: `+73.7%` - -## Notes - -- CPU-only Qwen3Next with `-fa 1` is now guarded in ik: FA is auto-disabled with a warning for `n_gpu_layers == 0` to avoid the prior `iqk_fa_templates.h` assert path. -- `ik` benchmark JSON currently includes some non-JSON log lines in stdout around context creation; parsing should tolerate that. -- Fused DeltaNet mode mapping has been updated in code: - - `0` / unset: non-fused - - `1`: fused only for `n_tok > 1` (safe mode) - - `2`: fused on all token counts (experimental; decode-quality regression observed) -- Added manual regression runner for fused-mode safety checks: - - `scripts/qwen3next-fused-regression.sh` - - Example: - - `BIN=./build-qwen3next-fix/bin/llama-perplexity scripts/qwen3next-fused-regression.sh --model /models/qwen3-next-coder.gguf --ctx 2048 --decode-b 1 --decode-ub 1 --prefill-b 2048 --prefill-ub 512 --ngl 47 --n-cpu-moe 40` -- Also integrated into the broader eval harness: - - `scripts/qwen3next-eval.sh --with-gpu --with-fused-regression ...` - - Results are surfaced in `SUMMARY.md` under `IK Fused Delta Regression`. -- Fused regression now enforces absolute non-fused sanity too: - - mode0 decode/prefill PPL must stay below configurable thresholds (defaults: `10.0` / `10.0`). -- Added unified Qwen3Next regression entrypoint for ongoing checks: - - `scripts/qwen3next-regression.sh --model /path/to/qwen3-next-coder.gguf` - - Outputs `SUMMARY.md` + per-step logs under `/tmp/qwen3next-regression//`. -- Added CLI plumbing for fused mode control (no raw env required): - - `--qwen3next-fused-delta {0|1|2}` - - This sets `LLAMA_QWEN3NEXT_FUSED_DELTA` for the current process. -- Added experimental CUDA DeltaNet dispatch control: - - `GGML_CUDA_DELTA_NET_OPT={0|1|2|3|4}` - - `0`: baseline dispatch (default) - - `1`: force fp16 recurrent kernel (`head_dim=128`) - - `2`: force multiblock kernel - - `3`: force Blackwell optimized kernel - - `4`: conservative auto mode (pre-Blackwell only) -- RTX 5060 Ti spot checks (`p=2048,n=64,b=1024,ub=256,--n-cpu-moe 47,-rtr 1`) did not show a reliable win from forced kernels: - - mode `2` and mode `3` reduced TG in single-run checks versus baseline. - - mode `4` tracks baseline on Blackwell (by design, no forced optimized-kernel switch there). - -## Decode Quality Diagnosis (Wikitext-2, `--chunks 1`, CUDA) - -Real-data perplexity checks on `/tmp/ppl_wikitext2_test.txt` confirm the decode regression source: - -- `qwen3-next-coder.gguf` - - mode `0`, opt `0`: `PPL=3.9148` - - mode `1`, opt `0`: `PPL=3.9148` (parity with mode 0) - - mode `2`, opt `0/1/2/4`: `PPL=6.1277` (consistently regressed) - - mode `2`, opt `3`: `PPL=302221.3639` (catastrophic instability) -- `qwen-3-coder-next-mxfp4.gguf` - - mode `0`, opt `0`: `PPL=3.9832` - - mode `1`, opt `0`: `PPL=3.9832` (parity with mode 0) - - mode `2`, opt `0`: `PPL=6.2362` (same regression pattern) - - mode `2`, opt `3`: `PPL=795964.1118` (catastrophic instability) - -Conclusion: - -- Decode-quality regression is tied to fused-all mode (`LLAMA_QWEN3NEXT_FUSED_DELTA=2`), not fixed by kernel dispatch overrides. -- `GGML_CUDA_DELTA_NET_OPT=3` should not be used on this path. - -## Safe Speed Gain (mode 1) - -With decode-safe mode (`LLAMA_QWEN3NEXT_FUSED_DELTA=1`), throughput on the serving proxy profile improved while preserving perplexity: - -- Profile: - - `llama-bench -m /models/qwen3-next-coder.gguf -p 8192 -n 128 -b 3072 -ub 768 -t 8 -fa 1 -ngl 999 --n-cpu-moe 47 -r 3 -rtr 1 -mmp 0` -- Mode `0` (`r=3`): - - `pp8192 = 175.639 +/- 0.221 tok/s` - - `tg128 = 26.393 +/- 1.469 tok/s` -- Mode `1` (`r=3`): - - `pp8192 = 237.014 +/- 1.199 tok/s` - - `tg128 = 27.111 +/- 1.395 tok/s` -- Relative (`mode1` vs `mode0`): - - PP: `+34.9%` - - TG: `+2.7%` - -Additional A/B for `GGML_CUDA_DELTA_NET_OPT=2` under mode `1` (`r=3`) did not improve performance: - -- opt `0`: `pp8192=238.352`, `tg128=24.709` -- opt `2`: `pp8192=237.680`, `tg128=24.566` diff --git a/scripts/qwen3next-eval.sh b/scripts/qwen3next-eval.sh deleted file mode 100755 index 102699f6..00000000 --- a/scripts/qwen3next-eval.sh +++ /dev/null @@ -1,546 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -IMAGE="${IMAGE:-iktest-dev:latest}" -MAIN_REPO="${MAIN_REPO:-/home/yurko/Code/llama.cpp}" -IK_REPO="${IK_REPO:-/home/yurko/Code/ik_llama.cpp}" -MAIN_BUILD_DIR="${MAIN_BUILD_DIR:-build}" -IK_BUILD_DIR="${IK_BUILD_DIR:-build}" -MODEL_HOST="${MODEL_HOST:-/home/yurko/.cache/llama.cpp/qwen3-next-coder.gguf}" -OUT_ROOT="${OUT_ROOT:-/tmp/qwen3next-eval}" -WITH_GPU=0 -WITH_FUSED_REGRESSION=0 -GPU_DEVICE="${GPU_DEVICE:-0}" -SWEEP_CTX="${SWEEP_CTX:-2048}" -SWEEP_N="${SWEEP_N:-32}" - -usage() { - cat <<'USAGE' -Usage: - scripts/qwen3next-eval.sh [options] - -Options: - --with-gpu Enable GPU checks in addition to CPU checks. - --with-fused-regression Run ik fused-delta regression check and include in summary. - --gpu-device ID CUDA device id to use for GPU sanity checks (default: 0). - --image IMAGE Docker image to run checks in (default: iktest-dev:latest). - --main-repo PATH Mainline repo path (default: /home/yurko/Code/llama.cpp). - --ik-repo PATH ik repo path (default: /home/yurko/Code/ik_llama.cpp). - --main-build-dir NAME Mainline build dir under main repo (default: build). - --ik-build-dir NAME ik build dir under ik repo (default: build). - --model PATH Host path to model GGUF file. - --out-root PATH Output root directory (default: /tmp/qwen3next-eval). - --sweep-ctx N Sweep context size for PP/TG check (default: 2048). - --sweep-n N Sweep generation tokens (default: 32). - -h, --help Show this help. - -What this script runs (in this order): - 1) CPU perplexity parity (chunks=1) mainline -> ik - 2) CPU perplexity parity (chunks=2) mainline -> ik - 3) CPU short generation smoke quality mainline -> ik - 4) Optional GPU sanity checks mainline -> ik - 5) Optional ik fused-delta regression mode0/mode1/mode2 safety check - -Output: - A timestamped folder is created under OUT_ROOT with: - - SUMMARY.md - - run.log - - *.out / *.err logs for each command -USAGE -} - -while [[ $# -gt 0 ]]; do - case "$1" in - --with-gpu) - WITH_GPU=1 - shift - ;; - --with-fused-regression) - WITH_FUSED_REGRESSION=1 - shift - ;; - --gpu-device) - GPU_DEVICE="$2" - shift 2 - ;; - --image) - IMAGE="$2" - shift 2 - ;; - --main-repo) - MAIN_REPO="$2" - shift 2 - ;; - --ik-repo) - IK_REPO="$2" - shift 2 - ;; - --main-build-dir) - MAIN_BUILD_DIR="$2" - shift 2 - ;; - --ik-build-dir) - IK_BUILD_DIR="$2" - shift 2 - ;; - --model) - MODEL_HOST="$2" - shift 2 - ;; - --out-root) - OUT_ROOT="$2" - shift 2 - ;; - --sweep-ctx) - SWEEP_CTX="$2" - shift 2 - ;; - --sweep-n) - SWEEP_N="$2" - shift 2 - ;; - -h|--help) - usage - exit 0 - ;; - *) - echo "Unknown option: $1" >&2 - usage - exit 2 - ;; - esac -done - -if [[ ! -d "$MAIN_REPO" ]]; then - echo "Mainline repo does not exist: $MAIN_REPO" >&2 - exit 1 -fi -if [[ ! -d "$IK_REPO" ]]; then - echo "ik repo does not exist: $IK_REPO" >&2 - exit 1 -fi -if [[ ! -f "$MODEL_HOST" ]]; then - echo "Model file does not exist: $MODEL_HOST" >&2 - exit 1 -fi - -run_id="$(date +%Y%m%d_%H%M%S)" -out_dir="${OUT_ROOT%/}/${run_id}" -mkdir -p "$out_dir" - -cat > "${out_dir}/ppl_input.txt" <<'TXT' -Deterministic evaluation text for quick perplexity parity checks. -The next lines intentionally repeat a simple pattern to reduce variance. -TXT -for _ in $(seq 1 400); do - echo "the system writes logs and the system reads logs" >> "${out_dir}/ppl_input.txt" -done - -cat > "${out_dir}/gen_prompt.txt" <<'TXT' -Write a concise Python function that returns the first n Fibonacci numbers iteratively, and then give one sentence explaining time complexity. -TXT - -cat > "${out_dir}/run_inside.sh" <<'BASH' -#!/usr/bin/env bash -set -euo pipefail - -WITH_GPU="${WITH_GPU:-0}" -GPU_DEVICE="${GPU_DEVICE:-0}" -SWEEP_CTX="${SWEEP_CTX:-2048}" -SWEEP_N="${SWEEP_N:-32}" -MAIN_BUILD_DIR="${MAIN_BUILD_DIR:-build}" -IK_BUILD_DIR="${IK_BUILD_DIR:-build}" -WITH_FUSED_REGRESSION="${WITH_FUSED_REGRESSION:-0}" - -MAIN_BIN="/mainline/${MAIN_BUILD_DIR}/bin" -IK_BIN="/ik/${IK_BUILD_DIR}/bin" -MAIN_LD="/mainline/${MAIN_BUILD_DIR}/bin:/mainline/${MAIN_BUILD_DIR}/src:/mainline/${MAIN_BUILD_DIR}/ggml/src:/mainline/${MAIN_BUILD_DIR}/examples/mtmd" -IK_LD="/ik/${IK_BUILD_DIR}/bin:/ik/${IK_BUILD_DIR}/src:/ik/${IK_BUILD_DIR}/ggml/src:/ik/${IK_BUILD_DIR}/examples/mtmd" -MODEL="/model.gguf" - -RUN_LOG="/out/run.log" -STATUS_FILE="/out/status.tsv" - -touch "$RUN_LOG" -printf "name\tstatus\texit_code\thost_mem_used_before_mib\thost_mem_used_after_mib\tgpu_mem_used_before_mib\tgpu_mem_used_after_mib\tmax_rss_kib\telapsed\n" > "$STATUS_FILE" - -log() { - local msg="$1" - printf "[%s] %s\n" "$(date +%H:%M:%S)" "$msg" | tee -a "$RUN_LOG" -} - -require_bin() { - local path="$1" - if [[ ! -x "$path" ]]; then - log "MISSING: $path" - return 1 - fi -} - -host_mem_used_mib() { - awk ' - /MemTotal:/ { mt = $2 } - /MemAvailable:/ { ma = $2 } - END { - if (mt > 0 && ma >= 0) { - printf "%.1f", (mt - ma) / 1024.0 - } else { - print "NA" - } - } - ' /proc/meminfo -} - -gpu_mem_used_mib() { - if [[ "$WITH_GPU" != "1" ]]; then - echo "NA" - return - fi - if ! command -v nvidia-smi >/dev/null 2>&1; then - echo "NA" - return - fi - local used - used="$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | tr '\n' ',' | sed 's/,$//' || true)" - if [[ -z "$used" ]]; then - echo "NA" - else - echo "$used" - fi -} - -extract_max_rss_kib() { - local time_file="$1" - if [[ ! -f "$time_file" ]]; then - echo "NA" - return - fi - local rss - rss="$(grep -E '^Maximum resident set size' "$time_file" | awk '{print $6}' | tail -n1 || true)" - if [[ -z "$rss" ]]; then - echo "NA" - else - echo "$rss" - fi -} - -extract_elapsed() { - local time_file="$1" - if [[ ! -f "$time_file" ]]; then - echo "NA" - return - fi - local elapsed - elapsed="$(grep -E '^Elapsed \(wall clock\) time' "$time_file" | sed -E 's/^[^:]+:[[:space:]]*//' | tail -n1 || true)" - if [[ -z "$elapsed" ]]; then - echo "NA" - else - echo "$elapsed" - fi -} - -run_cmd() { - local name="$1" - shift - local out_file="/out/${name}.out" - local err_file="/out/${name}.err" - local time_file="/out/${name}.time" - local ec - local host_before host_after gpu_before gpu_after max_rss elapsed - - host_before="$(host_mem_used_mib)" - gpu_before="$(gpu_mem_used_mib)" - log "RUN: $name" - - set +e - if [[ -x /usr/bin/time ]]; then - /usr/bin/time -v -o "$time_file" "$@" >"$out_file" 2>"$err_file" - ec=$? - else - "$@" >"$out_file" 2>"$err_file" - ec=$? - fi - set -e - - host_after="$(host_mem_used_mib)" - gpu_after="$(gpu_mem_used_mib)" - max_rss="$(extract_max_rss_kib "$time_file")" - elapsed="$(extract_elapsed "$time_file")" - - if [[ $ec -eq 0 ]]; then - printf "%s\tOK\t0\t%s\t%s\t%s\t%s\t%s\t%s\n" \ - "$name" "$host_before" "$host_after" "$gpu_before" "$gpu_after" "$max_rss" "$elapsed" >> "$STATUS_FILE" - log "OK: $name" - else - printf "%s\tFAIL\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n" \ - "$name" "$ec" "$host_before" "$host_after" "$gpu_before" "$gpu_after" "$max_rss" "$elapsed" >> "$STATUS_FILE" - log "FAIL($ec): $name" - fi - return $ec -} - -extract_ppl() { - local out_file="$1" - local err_file="$2" - local line num - - line="$(cat "$out_file" "$err_file" 2>/dev/null | grep -E "Final estimate:" | tail -n1 || true)" - if [[ -z "$line" ]]; then - echo "NA" - return - fi - - num="$(echo "$line" | sed -nE 's/.*= ([0-9]+\.[0-9]+).*/\1/p')" - if [[ -z "$num" ]]; then - num="$(echo "$line" | grep -Eo '[0-9]+\.[0-9]+' | head -n1 || true)" - fi - if [[ -z "$num" ]]; then - echo "NA" - else - echo "$num" - fi -} - -abs_delta() { - local a="$1" - local b="$2" - awk -v a="$a" -v b="$b" 'BEGIN { d = a - b; if (d < 0) d = -d; printf "%.6f", d }' -} - -has_token() { - local file="$1" - local pattern="$2" - if grep -Eiq "$pattern" "$file"; then - echo "yes" - else - echo "no" - fi -} - -require_bin "$MAIN_BIN/llama-perplexity" -require_bin "$MAIN_BIN/llama-cli" -require_bin "$MAIN_BIN/llama-completion" -require_bin "$IK_BIN/llama-perplexity" -require_bin "$IK_BIN/llama-cli" - -if [[ "$WITH_GPU" != "1" ]]; then - export CUDA_VISIBLE_DEVICES="" - log "GPU checks disabled (CPU-only mode)" -else - export CUDA_VISIBLE_DEVICES="$GPU_DEVICE" - log "GPU checks enabled on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" -fi - -if [[ "$WITH_FUSED_REGRESSION" == "1" && "$WITH_GPU" != "1" ]]; then - log "Fused regression requested but GPU mode is disabled; this step will be skipped" -fi - -PPL_INPUT="/out/ppl_input.txt" -GEN_PROMPT="$(cat /out/gen_prompt.txt)" - -# CPU perplexity: chunks=1 (mainline -> ik) -run_cmd "cpu_ppl_chunks1_mainline" \ - env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-perplexity" \ - -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 1 --no-warmup -ngl 0 || true -run_cmd "cpu_ppl_chunks1_ik" \ - env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-perplexity" \ - -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 1 --no-warmup -ngl 0 || true - -# CPU perplexity: chunks=2 (mainline -> ik) -run_cmd "cpu_ppl_chunks2_mainline" \ - env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-perplexity" \ - -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 2 --no-warmup -ngl 0 || true -run_cmd "cpu_ppl_chunks2_ik" \ - env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-perplexity" \ - -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 2 --no-warmup -ngl 0 || true - -# CPU short generation smoke quality (mainline -> ik) -run_cmd "cpu_gen_mainline" \ - env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-completion" \ - -m "$MODEL" --cpu-moe -ngl 0 -c 512 -n 64 --seed 123 --temp 0 --top-k 1 --simple-io --no-display-prompt -p "$GEN_PROMPT" || true -run_cmd "cpu_gen_ik" \ - env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-cli" \ - -m "$MODEL" --cpu-moe -ngl 0 -c 512 -n 64 --seed 123 --temp 0 --top-k 1 --simple-io --no-display-prompt -p "$GEN_PROMPT" || true - -if [[ "$WITH_GPU" == "1" ]]; then - # CUDA sanity perplexity: chunks=1 (mainline -> ik) - run_cmd "gpu_ppl_chunks1_mainline" \ - env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-perplexity" \ - -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 1 --no-warmup -ngl 1 || true - run_cmd "gpu_ppl_chunks1_ik" \ - env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-perplexity" \ - -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 1 --no-warmup -ngl 1 || true - - # Quick sweep sanity (mainline -> ik) - if [[ -x "$MAIN_BIN/llama-sweep-bench" ]]; then - if env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-sweep-bench" --help >/dev/null 2>&1; then - run_cmd "gpu_sweep_mainline" \ - env LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-sweep-bench" \ - -m "$MODEL" --cpu-moe -ngl 999 -c "$SWEEP_CTX" -b 1024 -ub 128 -n "$SWEEP_N" -ctk f16 -ctv f16 || true - else - printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_mainline" >> "$STATUS_FILE" - log "SKIP: gpu_sweep_mainline (binary cannot start with current runtime deps)" - fi - else - printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_mainline" >> "$STATUS_FILE" - log "SKIP: gpu_sweep_mainline (missing $MAIN_BIN/llama-sweep-bench)" - fi - if [[ -x "$IK_BIN/llama-sweep-bench" ]]; then - if env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-sweep-bench" --help >/dev/null 2>&1; then - run_cmd "gpu_sweep_ik" \ - env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-sweep-bench" \ - -m "$MODEL" --cpu-moe -ngl 999 -c "$SWEEP_CTX" -b 1024 -ub 128 -n "$SWEEP_N" -ctk f16 -ctv f16 || true - else - printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_ik" >> "$STATUS_FILE" - log "SKIP: gpu_sweep_ik (binary cannot start with current runtime deps)" - fi - else - printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_ik" >> "$STATUS_FILE" - log "SKIP: gpu_sweep_ik (missing $IK_BIN/llama-sweep-bench)" - fi -fi - -if [[ "$WITH_FUSED_REGRESSION" == "1" ]]; then - if [[ "$WITH_GPU" != "1" ]]; then - printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "ik_fused_regression" >> "$STATUS_FILE" - elif [[ ! -x "/ik/scripts/qwen3next-fused-regression.sh" ]]; then - printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "ik_fused_regression" >> "$STATUS_FILE" - log "SKIP: ik_fused_regression (missing /ik/scripts/qwen3next-fused-regression.sh)" - else - run_cmd "ik_fused_regression" \ - env LD_LIBRARY_PATH="$IK_LD" /ik/scripts/qwen3next-fused-regression.sh \ - --model "$MODEL" \ - --bin "$IK_BIN/llama-perplexity" \ - --out /out/ik_fused_regression.md \ - --cuda-device "$GPU_DEVICE" \ - --threads 8 \ - --ctx 2048 \ - --fa on \ - --ngl 47 \ - --n-cpu-moe 40 \ - --chunks 1 \ - --decode-b 1 \ - --decode-ub 1 \ - --prefill-b 2048 \ - --prefill-ub 512 || true - fi -fi - -# Aggregate summary -cpu_c1_main="$(extract_ppl /out/cpu_ppl_chunks1_mainline.out /out/cpu_ppl_chunks1_mainline.err)" -cpu_c1_ik="$(extract_ppl /out/cpu_ppl_chunks1_ik.out /out/cpu_ppl_chunks1_ik.err)" -cpu_c2_main="$(extract_ppl /out/cpu_ppl_chunks2_mainline.out /out/cpu_ppl_chunks2_mainline.err)" -cpu_c2_ik="$(extract_ppl /out/cpu_ppl_chunks2_ik.out /out/cpu_ppl_chunks2_ik.err)" - -cpu_c1_delta="NA" -cpu_c2_delta="NA" -if [[ "$cpu_c1_main" != "NA" && "$cpu_c1_ik" != "NA" ]]; then - cpu_c1_delta="$(abs_delta "$cpu_c1_main" "$cpu_c1_ik")" -fi -if [[ "$cpu_c2_main" != "NA" && "$cpu_c2_ik" != "NA" ]]; then - cpu_c2_delta="$(abs_delta "$cpu_c2_main" "$cpu_c2_ik")" -fi - -main_has_fib="$(has_token /out/cpu_gen_mainline.out 'fibonacci|fibs|fib')" -ik_has_fib="$(has_token /out/cpu_gen_ik.out 'fibonacci|fibs|fib')" -main_has_complexity="$(has_token /out/cpu_gen_mainline.out 'complexity|O\(')" -ik_has_complexity="$(has_token /out/cpu_gen_ik.out 'complexity|O\(')" -fused_decode_safe="NA" -fused_prefill_safe="NA" -fused_mode0_decode_sane="NA" -fused_mode0_prefill_sane="NA" -if [[ -f /out/ik_fused_regression.md ]]; then - fused_decode_safe="$(sed -nE 's/^- decode safety .*: `([^`]+)`.*/\1/p' /out/ik_fused_regression.md | tail -n1 || true)" - fused_prefill_safe="$(sed -nE 's/^- prefill safety .*: `([^`]+)`.*/\1/p' /out/ik_fused_regression.md | tail -n1 || true)" - fused_mode0_decode_sane="$(sed -nE 's/^- mode0 decode sanity: `([^`]+)`.*/\1/p' /out/ik_fused_regression.md | tail -n1 || true)" - fused_mode0_prefill_sane="$(sed -nE 's/^- mode0 prefill sanity: `([^`]+)`.*/\1/p' /out/ik_fused_regression.md | tail -n1 || true)" - if [[ -z "$fused_decode_safe" ]]; then fused_decode_safe="NA"; fi - if [[ -z "$fused_prefill_safe" ]]; then fused_prefill_safe="NA"; fi - if [[ -z "$fused_mode0_decode_sane" ]]; then fused_mode0_decode_sane="NA"; fi - if [[ -z "$fused_mode0_prefill_sane" ]]; then fused_mode0_prefill_sane="NA"; fi -fi - -{ - echo "# Qwen3Next Eval Summary" - echo - echo "Mode: $( [[ "$WITH_GPU" == "1" ]] && echo "CPU+GPU" || echo "CPU-only" )" - echo "- Sweep config: c=\`$SWEEP_CTX\`, n=\`$SWEEP_N\`" - echo - echo "## CPU Perplexity" - echo "- chunks=1 mainline: \`$cpu_c1_main\`" - echo "- chunks=1 ik: \`$cpu_c1_ik\`" - echo "- chunks=1 |delta|: \`$cpu_c1_delta\`" - echo "- chunks=2 mainline: \`$cpu_c2_main\`" - echo "- chunks=2 ik: \`$cpu_c2_ik\`" - echo "- chunks=2 |delta|: \`$cpu_c2_delta\`" - echo - echo "## CPU Short Generation Smoke" - echo "- mainline has Fibonacci token(s): \`$main_has_fib\`" - echo "- ik has Fibonacci token(s): \`$ik_has_fib\`" - echo "- mainline has complexity token(s): \`$main_has_complexity\`" - echo "- ik has complexity token(s): \`$ik_has_complexity\`" - echo - echo "## IK Fused Delta Regression" - if [[ "$WITH_FUSED_REGRESSION" == "1" ]]; then - if [[ -f /out/ik_fused_regression.md ]]; then - echo "- decode safety (mode1 ~= mode0): \`$fused_decode_safe\`" - echo "- prefill safety (mode1 ~= mode0): \`$fused_prefill_safe\`" - echo "- mode0 decode sanity: \`$fused_mode0_decode_sane\`" - echo "- mode0 prefill sanity: \`$fused_mode0_prefill_sane\`" - echo "- report: \`/out/ik_fused_regression.md\`" - else - echo "- status: \`requested but no report generated\`" - fi - else - echo "- status: \`not requested\`" - fi - echo - echo "## Command Status + Memory" - echo '```' - cat "$STATUS_FILE" - echo '```' - echo - echo "## First Non-empty Lines (Generation)" - echo "### mainline" - awk 'NF { print; c++; if (c == 20) exit }' /out/cpu_gen_mainline.out - echo - echo "### ik" - awk 'NF { print; c++; if (c == 20) exit }' /out/cpu_gen_ik.out -} > /out/SUMMARY.md - -log "Summary written to /out/SUMMARY.md" -BASH - -chmod +x "${out_dir}/run_inside.sh" - -docker_cmd=( - docker run --rm - -e WITH_GPU="${WITH_GPU}" - -e GPU_DEVICE="${GPU_DEVICE}" - -e SWEEP_CTX="${SWEEP_CTX}" - -e SWEEP_N="${SWEEP_N}" - -e WITH_FUSED_REGRESSION="${WITH_FUSED_REGRESSION}" - -e MAIN_BUILD_DIR="${MAIN_BUILD_DIR}" - -e IK_BUILD_DIR="${IK_BUILD_DIR}" - -v "${MAIN_REPO}:/mainline" - -v "${IK_REPO}:/ik" - -v "${MODEL_HOST}:/model.gguf:ro" - -v "${out_dir}:/out" -) - -if [[ "$WITH_GPU" -eq 1 ]]; then - docker_cmd+=(--gpus all) -fi - -docker_cmd+=("${IMAGE}" /bin/bash /out/run_inside.sh) - -echo "Running eval in container: ${IMAGE}" -echo "Output directory: ${out_dir}" -"${docker_cmd[@]}" - -echo -echo "Done. Summary:" -echo " ${out_dir}/SUMMARY.md" -echo "Raw logs:" -echo " ${out_dir}/*.out" -echo " ${out_dir}/*.err" diff --git a/scripts/qwen3next-fused-regression.sh b/scripts/qwen3next-fused-regression.sh deleted file mode 100755 index b3a0042b..00000000 --- a/scripts/qwen3next-fused-regression.sh +++ /dev/null @@ -1,228 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -BIN="${BIN:-./build/bin/llama-perplexity}" -MODEL="${MODEL:-}" -INPUT_FILE="${INPUT_FILE:-/tmp/qwen3next_fused_regression_input.txt}" -OUT_FILE="${OUT_FILE:-/tmp/qwen3next_fused_regression_$(date +%Y%m%d_%H%M%S).md}" - -CUDA_DEVICE="${CUDA_DEVICE:-0}" -THREADS="${THREADS:-8}" -CTX="${CTX:-2048}" -FA="${FA:-on}" -NGL="${NGL:-47}" -N_CPU_MOE="${N_CPU_MOE:-40}" -CHUNKS="${CHUNKS:-1}" - -DECODE_B="${DECODE_B:-1}" -DECODE_UB="${DECODE_UB:-1}" -PREFILL_B="${PREFILL_B:-2048}" -PREFILL_UB="${PREFILL_UB:-512}" - -# Mandatory safety checks: -# 1) mode=1 decode should stay aligned with mode=0 decode. -# 2) mode=1 prefill should stay aligned with mode=0 prefill. -MAX_DECODE_DELTA_01="${MAX_DECODE_DELTA_01:-0.10}" -MAX_PREFILL_DELTA_01="${MAX_PREFILL_DELTA_01:-0.10}" -# 3) mode=0 absolute perplexity should stay in a sane range. -MAX_MODE0_DECODE_PPL="${MAX_MODE0_DECODE_PPL:-10.0}" -MAX_MODE0_PREFILL_PPL="${MAX_MODE0_PREFILL_PPL:-10.0}" - -usage() { - cat <<'USAGE' -Usage: - scripts/qwen3next-fused-regression.sh --model /path/to/model.gguf [options] - -Options: - --model PATH GGUF model path (required) - --bin PATH llama-perplexity binary (default: ./build/bin/llama-perplexity) - --input PATH input text file; auto-generated if missing - --out PATH markdown output file - --cuda-device ID CUDA_VISIBLE_DEVICES value (default: 0) - --threads N -t value (default: 8) - --ctx N -c value (default: 2048) - --fa on|off -fa value (default: on) - --ngl N -ngl value (default: 47) - --n-cpu-moe N --n-cpu-moe value (default: 40) - --chunks N --chunks value (default: 1) - --decode-b N decode batch size (default: 1) - --decode-ub N decode ubatch size (default: 1) - --prefill-b N prefill batch size (default: 2048) - --prefill-ub N prefill ubatch size (default: 512) - --max-decode-delta-01 X fail threshold for |PPL(mode1)-PPL(mode0)| in decode (default: 0.10) - --max-prefill-delta-01 X fail threshold for |PPL(mode1)-PPL(mode0)| in prefill (default: 0.10) - --max-mode0-decode-ppl X fail threshold for PPL(mode0) in decode (default: 10.0) - --max-mode0-prefill-ppl X fail threshold for PPL(mode0) in prefill (default: 10.0) - -h, --help show this help -USAGE -} - -while [[ $# -gt 0 ]]; do - case "$1" in - --model) MODEL="$2"; shift 2 ;; - --bin) BIN="$2"; shift 2 ;; - --input) INPUT_FILE="$2"; shift 2 ;; - --out) OUT_FILE="$2"; shift 2 ;; - --cuda-device) CUDA_DEVICE="$2"; shift 2 ;; - --threads) THREADS="$2"; shift 2 ;; - --ctx) CTX="$2"; shift 2 ;; - --fa) FA="$2"; shift 2 ;; - --ngl) NGL="$2"; shift 2 ;; - --n-cpu-moe) N_CPU_MOE="$2"; shift 2 ;; - --chunks) CHUNKS="$2"; shift 2 ;; - --decode-b) DECODE_B="$2"; shift 2 ;; - --decode-ub) DECODE_UB="$2"; shift 2 ;; - --prefill-b) PREFILL_B="$2"; shift 2 ;; - --prefill-ub) PREFILL_UB="$2"; shift 2 ;; - --max-decode-delta-01) MAX_DECODE_DELTA_01="$2"; shift 2 ;; - --max-prefill-delta-01) MAX_PREFILL_DELTA_01="$2"; shift 2 ;; - --max-mode0-decode-ppl) MAX_MODE0_DECODE_PPL="$2"; shift 2 ;; - --max-mode0-prefill-ppl) MAX_MODE0_PREFILL_PPL="$2"; shift 2 ;; - -h|--help) usage; exit 0 ;; - *) - echo "unknown option: $1" >&2 - usage - exit 2 - ;; - esac -done - -if [[ -z "$MODEL" ]]; then - echo "--model is required" >&2 - exit 2 -fi -if [[ ! -x "$BIN" ]]; then - echo "binary not executable: $BIN" >&2 - exit 1 -fi -if [[ ! -f "$MODEL" ]]; then - echo "model not found: $MODEL" >&2 - exit 1 -fi - -if [[ ! -f "$INPUT_FILE" ]]; then - cat > "$INPUT_FILE" <<'TXT' -Regression text for Qwen3Next fused DeltaNet checks. -This text is deterministic and intentionally repetitive. -TXT - # Keep this comfortably above 2*ctx tokenization requirements used by llama-perplexity. - for _ in $(seq 1 900); do - echo "the model should keep stable perplexity under consistent settings" >> "$INPUT_FILE" - done -fi - -log_dir="${OUT_FILE}.logs" -mkdir -p "$log_dir" - -extract_ppl() { - local file="$1" - local line val - line="$(grep -E 'Final estimate:' "$file" | tail -n1 || true)" - if [[ -z "$line" ]]; then - echo "NA" - return - fi - val="$(echo "$line" | sed -nE 's/.*= ([0-9]+\.[0-9]+).*/\1/p')" - if [[ -z "$val" ]]; then - val="$(echo "$line" | grep -Eo '[0-9]+\.[0-9]+' | head -n1 || true)" - fi - if [[ -z "$val" ]]; then - echo "NA" - else - echo "$val" - fi -} - -abs_delta() { - awk -v a="$1" -v b="$2" 'BEGIN { d = a - b; if (d < 0) d = -d; printf "%.6f", d }' -} - -run_ppl() { - local mode="$1" - local b="$2" - local ub="$3" - local label="$4" - local log="${log_dir}/${label}_m${mode}.log" - - echo "running ${label} mode=${mode} (b=${b} ub=${ub})" >&2 - CUDA_VISIBLE_DEVICES="$CUDA_DEVICE" \ - LLAMA_QWEN3NEXT_FUSED_DELTA="$mode" \ - "$BIN" -m "$MODEL" -f "$INPUT_FILE" \ - -c "$CTX" -b "$b" -ub "$ub" -t "$THREADS" \ - -fa "$FA" -ngl "$NGL" --n-cpu-moe "$N_CPU_MOE" \ - --chunks "$CHUNKS" --no-warmup >"$log" 2>&1 - - extract_ppl "$log" -} - -decode_0="$(run_ppl 0 "$DECODE_B" "$DECODE_UB" decode)" -decode_1="$(run_ppl 1 "$DECODE_B" "$DECODE_UB" decode)" -decode_2="$(run_ppl 2 "$DECODE_B" "$DECODE_UB" decode)" - -prefill_0="$(run_ppl 0 "$PREFILL_B" "$PREFILL_UB" prefill)" -prefill_1="$(run_ppl 1 "$PREFILL_B" "$PREFILL_UB" prefill)" -prefill_2="$(run_ppl 2 "$PREFILL_B" "$PREFILL_UB" prefill)" - -if [[ "$decode_0" == "NA" || "$decode_1" == "NA" || "$decode_2" == "NA" || \ - "$prefill_0" == "NA" || "$prefill_1" == "NA" || "$prefill_2" == "NA" ]]; then - echo "failed to extract one or more perplexity values; see logs in ${log_dir}" >&2 - exit 1 -fi - -decode_delta_01="$(abs_delta "$decode_0" "$decode_1")" -decode_delta_02="$(abs_delta "$decode_0" "$decode_2")" -prefill_delta_01="$(abs_delta "$prefill_0" "$prefill_1")" -prefill_delta_02="$(abs_delta "$prefill_0" "$prefill_2")" - -decode_ok="$(awk -v d="$decode_delta_01" -v t="$MAX_DECODE_DELTA_01" 'BEGIN { print(d <= t ? "yes" : "no") }')" -prefill_ok="$(awk -v d="$prefill_delta_01" -v t="$MAX_PREFILL_DELTA_01" 'BEGIN { print(d <= t ? "yes" : "no") }')" -mode0_decode_ok="$(awk -v p="$decode_0" -v t="$MAX_MODE0_DECODE_PPL" 'BEGIN { print(p <= t ? "yes" : "no") }')" -mode0_prefill_ok="$(awk -v p="$prefill_0" -v t="$MAX_MODE0_PREFILL_PPL" 'BEGIN { print(p <= t ? "yes" : "no") }')" - -{ - echo "# Qwen3Next Fused DeltaNet Regression Report" - echo - echo "- date: \`$(date -u +%Y-%m-%dT%H:%M:%SZ)\`" - echo "- bin: \`$BIN\`" - echo "- model: \`$MODEL\`" - echo "- input: \`$INPUT_FILE\`" - echo "- cuda_device: \`$CUDA_DEVICE\`" - echo "- ctx: \`$CTX\`" - echo "- fa: \`$FA\`" - echo "- ngl: \`$NGL\`" - echo "- n_cpu_moe: \`$N_CPU_MOE\`" - echo "- chunks: \`$CHUNKS\`" - echo - echo "## Perplexity" - echo - echo "| Path | mode=0 | mode=1 | mode=2 | |delta|(1-0) | |delta|(2-0) |" - echo "|---|---:|---:|---:|---:|---:|" - echo "| decode (b=${DECODE_B},ub=${DECODE_UB}) | ${decode_0} | ${decode_1} | ${decode_2} | ${decode_delta_01} | ${decode_delta_02} |" - echo "| prefill (b=${PREFILL_B},ub=${PREFILL_UB}) | ${prefill_0} | ${prefill_1} | ${prefill_2} | ${prefill_delta_01} | ${prefill_delta_02} |" - echo - echo "## Safety Checks" - echo - echo "- decode safety (mode1 ~= mode0): \`${decode_ok}\` (threshold \`${MAX_DECODE_DELTA_01}\`)" - echo "- prefill safety (mode1 ~= mode0): \`${prefill_ok}\` (threshold \`${MAX_PREFILL_DELTA_01}\`)" - echo "- mode0 decode sanity: \`${mode0_decode_ok}\` (PPL \`${decode_0}\`, max \`${MAX_MODE0_DECODE_PPL}\`)" - echo "- mode0 prefill sanity: \`${mode0_prefill_ok}\` (PPL \`${prefill_0}\`, max \`${MAX_MODE0_PREFILL_PPL}\`)" - echo - echo "## Logs" - echo - echo "- raw logs dir: \`${log_dir}\`" - echo "- decode mode0: \`${log_dir}/decode_m0.log\`" - echo "- decode mode1: \`${log_dir}/decode_m1.log\`" - echo "- decode mode2: \`${log_dir}/decode_m2.log\`" - echo "- prefill mode0: \`${log_dir}/prefill_m0.log\`" - echo "- prefill mode1: \`${log_dir}/prefill_m1.log\`" - echo "- prefill mode2: \`${log_dir}/prefill_m2.log\`" -} > "$OUT_FILE" - -echo "wrote report: $OUT_FILE" - -if [[ "$decode_ok" != "yes" || "$prefill_ok" != "yes" || "$mode0_decode_ok" != "yes" || "$mode0_prefill_ok" != "yes" ]]; then - echo "regression check failed; see report: $OUT_FILE" >&2 - exit 1 -fi - -echo "regression check passed" diff --git a/scripts/qwen3next-regression.sh b/scripts/qwen3next-regression.sh deleted file mode 100755 index d6649248..00000000 --- a/scripts/qwen3next-regression.sh +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -IMAGE="${IMAGE:-nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04}" -IK_REPO="${IK_REPO:-/home/yurko/Code/ik_llama.cpp}" -IK_BUILD_DIR="${IK_BUILD_DIR:-build}" -MODEL_HOST="${MODEL_HOST:-/home/yurko/.cache/llama.cpp/qwen3-next-coder.gguf}" -OUT_ROOT="${OUT_ROOT:-/tmp/qwen3next-regression}" -GPU_DEVICE="${GPU_DEVICE:-0}" - -THREADS="${THREADS:-8}" -FA="${FA:-on}" -NGL="${NGL:-999}" - -PROXY_CTX="${PROXY_CTX:-8192}" -PROXY_B="${PROXY_B:-3072}" -PROXY_UB="${PROXY_UB:-768}" -PROXY_N="${PROXY_N:-128}" -PROXY_N_CPU_MOE="${PROXY_N_CPU_MOE:-40}" - -REG_CTX="${REG_CTX:-2048}" -REG_NGL="${REG_NGL:-47}" -REG_DECODE_B="${REG_DECODE_B:-1}" -REG_DECODE_UB="${REG_DECODE_UB:-1}" -REG_PREFILL_B="${REG_PREFILL_B:-2048}" -REG_PREFILL_UB="${REG_PREFILL_UB:-512}" - -WITH_FIT=1 -FIT_CTX="${FIT_CTX:-65536}" -FIT_N_CPU_MOE="${FIT_N_CPU_MOE:-47}" -FIT_N="${FIT_N:-1}" - -usage() { - cat <<'USAGE' -Usage: - scripts/qwen3next-regression.sh [options] - -Options: - --image IMAGE Docker image to run checks in (default: nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04) - --ik-repo PATH ik repo path (default: /home/yurko/Code/ik_llama.cpp) - --ik-build-dir NAME Build dir under ik repo (default: build) - --model PATH Host path to model GGUF file - --out-root PATH Output root directory (default: /tmp/qwen3next-regression) - --gpu-device ID CUDA device id (default: 0) - --threads N Threads (default: 8) - --fa on|off Flash attention mode (default: on) - --ngl N -ngl value (default: 999) - - --proxy-ctx N Proxy sweep context (default: 8192) - --proxy-b N Proxy sweep batch size (default: 3072) - --proxy-ub N Proxy sweep ubatch size (default: 768) - --proxy-n N Proxy sweep generation tokens (default: 128) - --proxy-n-cpu-moe N Proxy sweep --n-cpu-moe (default: 40) - - --reg-ctx N Fused regression context (default: 2048) - --reg-ngl N Fused regression -ngl (default: 47) - --reg-decode-b N Fused regression decode b (default: 1) - --reg-decode-ub N Fused regression decode ub (default: 1) - --reg-prefill-b N Fused regression prefill b (default: 2048) - --reg-prefill-ub N Fused regression prefill ub (default: 512) - - --fit-ctx N Long-context fit sanity context (default: 65536) - --fit-n-cpu-moe N Long-context fit sanity --n-cpu-moe (default: 47) - --fit-n N Long-context fit sanity generation tokens (default: 1) - --no-fit Skip long-context fit sanity - -h, --help Show this help - -Runs: - 1) Fused-delta regression guard (mode0/mode1/mode2 + sanity thresholds) - 2) Single-GPU proxy sweep benchmark - 3) Optional long-context fit sanity -USAGE -} - -while [[ $# -gt 0 ]]; do - case "$1" in - --image) IMAGE="$2"; shift 2 ;; - --ik-repo) IK_REPO="$2"; shift 2 ;; - --ik-build-dir) IK_BUILD_DIR="$2"; shift 2 ;; - --model) MODEL_HOST="$2"; shift 2 ;; - --out-root) OUT_ROOT="$2"; shift 2 ;; - --gpu-device) GPU_DEVICE="$2"; shift 2 ;; - --threads) THREADS="$2"; shift 2 ;; - --fa) FA="$2"; shift 2 ;; - --ngl) NGL="$2"; shift 2 ;; - --proxy-ctx) PROXY_CTX="$2"; shift 2 ;; - --proxy-b) PROXY_B="$2"; shift 2 ;; - --proxy-ub) PROXY_UB="$2"; shift 2 ;; - --proxy-n) PROXY_N="$2"; shift 2 ;; - --proxy-n-cpu-moe) PROXY_N_CPU_MOE="$2"; shift 2 ;; - --reg-ctx) REG_CTX="$2"; shift 2 ;; - --reg-ngl) REG_NGL="$2"; shift 2 ;; - --reg-decode-b) REG_DECODE_B="$2"; shift 2 ;; - --reg-decode-ub) REG_DECODE_UB="$2"; shift 2 ;; - --reg-prefill-b) REG_PREFILL_B="$2"; shift 2 ;; - --reg-prefill-ub) REG_PREFILL_UB="$2"; shift 2 ;; - --fit-ctx) FIT_CTX="$2"; shift 2 ;; - --fit-n-cpu-moe) FIT_N_CPU_MOE="$2"; shift 2 ;; - --fit-n) FIT_N="$2"; shift 2 ;; - --no-fit) WITH_FIT=0; shift ;; - -h|--help) usage; exit 0 ;; - *) - echo "Unknown option: $1" >&2 - usage - exit 2 - ;; - esac -done - -if [[ ! -d "$IK_REPO" ]]; then - echo "ik repo does not exist: $IK_REPO" >&2 - exit 1 -fi -if [[ ! -f "$MODEL_HOST" ]]; then - echo "Model file does not exist: $MODEL_HOST" >&2 - exit 1 -fi - -run_id="$(date +%Y%m%d_%H%M%S)" -out_dir="${OUT_ROOT%/}/${run_id}" -mkdir -p "$out_dir" - -cat > "${out_dir}/run_inside.sh" <<'BASH' -#!/usr/bin/env bash -set -euo pipefail - -IK_BUILD_DIR="${IK_BUILD_DIR:-build}" -GPU_DEVICE="${GPU_DEVICE:-0}" -THREADS="${THREADS:-8}" -FA="${FA:-on}" -NGL="${NGL:-999}" - -PROXY_CTX="${PROXY_CTX:-8192}" -PROXY_B="${PROXY_B:-3072}" -PROXY_UB="${PROXY_UB:-768}" -PROXY_N="${PROXY_N:-128}" -PROXY_N_CPU_MOE="${PROXY_N_CPU_MOE:-40}" - -REG_CTX="${REG_CTX:-2048}" -REG_NGL="${REG_NGL:-47}" -REG_DECODE_B="${REG_DECODE_B:-1}" -REG_DECODE_UB="${REG_DECODE_UB:-1}" -REG_PREFILL_B="${REG_PREFILL_B:-2048}" -REG_PREFILL_UB="${REG_PREFILL_UB:-512}" - -WITH_FIT="${WITH_FIT:-1}" -FIT_CTX="${FIT_CTX:-65536}" -FIT_N_CPU_MOE="${FIT_N_CPU_MOE:-47}" -FIT_N="${FIT_N:-1}" - -IK_BIN="/ik/${IK_BUILD_DIR}/bin" -IK_LD="/ik/${IK_BUILD_DIR}/bin:/ik/${IK_BUILD_DIR}/src:/ik/${IK_BUILD_DIR}/ggml/src:/ik/${IK_BUILD_DIR}/examples/mtmd" -MODEL="/model.gguf" - -RUN_LOG="/out/run.log" -STATUS_FILE="/out/status.tsv" - -touch "$RUN_LOG" -printf "name\tstatus\texit_code\n" > "$STATUS_FILE" - -log() { - local msg="$1" - printf "[%s] %s\n" "$(date +%H:%M:%S)" "$msg" | tee -a "$RUN_LOG" -} - -run_cmd() { - local name="$1" - shift - local out_file="/out/${name}.out" - local err_file="/out/${name}.err" - local ec - - log "RUN: $name" - set +e - "$@" >"$out_file" 2>"$err_file" - ec=$? - set -e - - if [[ $ec -eq 0 ]]; then - printf "%s\tOK\t0\n" "$name" >> "$STATUS_FILE" - log "OK: $name" - else - printf "%s\tFAIL\t%d\n" "$name" "$ec" >> "$STATUS_FILE" - log "FAIL($ec): $name" - fi - return $ec -} - -require_bin() { - local path="$1" - if [[ ! -x "$path" ]]; then - log "MISSING: $path" - exit 1 - fi -} - -extract_best_metric() { - local out_file="$1" - local err_file="$2" - local col="$3" - awk -F'|' -v c="$col" ' - /^\|[[:space:]]*[0-9]+[[:space:]]*\|/ { - v = $c - gsub(/[[:space:]]/, "", v) - if ((v + 0) > best) { - best = v + 0 - row = $0 - } - } - END { - if (best > 0) { - printf "%.2f\t%s\n", best, row - } else { - print "NA\tNA" - } - } - ' < <(cat "$out_file" "$err_file") -} - -require_bin "$IK_BIN/llama-perplexity" -require_bin "$IK_BIN/llama-sweep-bench" -require_bin "$IK_BIN/llama-cli" -require_bin "/ik/scripts/qwen3next-fused-regression.sh" - -export CUDA_VISIBLE_DEVICES="$GPU_DEVICE" -log "GPU checks on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES" - -run_cmd "fused_regression" \ - env LD_LIBRARY_PATH="$IK_LD" /ik/scripts/qwen3next-fused-regression.sh \ - --model "$MODEL" \ - --bin "$IK_BIN/llama-perplexity" \ - --out /out/fused_regression.md \ - --cuda-device "$GPU_DEVICE" \ - --threads "$THREADS" \ - --ctx "$REG_CTX" \ - --fa "$FA" \ - --ngl "$REG_NGL" \ - --n-cpu-moe "$PROXY_N_CPU_MOE" \ - --chunks 1 \ - --decode-b "$REG_DECODE_B" \ - --decode-ub "$REG_DECODE_UB" \ - --prefill-b "$REG_PREFILL_B" \ - --prefill-ub "$REG_PREFILL_UB" || true - -run_cmd "proxy_sweep" \ - env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-sweep-bench" \ - -m "$MODEL" \ - -c "$PROXY_CTX" \ - -b "$PROXY_B" \ - -ub "$PROXY_UB" \ - -n "$PROXY_N" \ - -t "$THREADS" \ - -fa "$FA" \ - --jinja \ - -ngl "$NGL" \ - --n-cpu-moe "$PROXY_N_CPU_MOE" \ - -rtr \ - --temp 1 \ - --top-p 0.95 \ - --top-k 40 \ - --min-p 0.01 || true - -if [[ "$WITH_FIT" == "1" ]]; then - run_cmd "fit_sanity" \ - env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-cli" \ - -m "$MODEL" \ - -c "$FIT_CTX" \ - -n "$FIT_N" \ - -t "$THREADS" \ - -fa "$FA" \ - -ngl "$NGL" \ - --n-cpu-moe "$FIT_N_CPU_MOE" \ - -rtr \ - --temp 0 \ - --top-k 1 \ - --simple-io \ - --no-display-prompt \ - -p "ping" || true -else - printf "%s\tSKIP\t0\n" "fit_sanity" >> "$STATUS_FILE" - log "SKIP: fit_sanity" -fi - -fused_decode_safe="NA" -fused_prefill_safe="NA" -fused_mode0_decode_sane="NA" -fused_mode0_prefill_sane="NA" -if [[ -f /out/fused_regression.md ]]; then - fused_decode_safe="$(sed -nE 's/^- decode safety .*: `([^`]+)`.*/\1/p' /out/fused_regression.md | tail -n1 || true)" - fused_prefill_safe="$(sed -nE 's/^- prefill safety .*: `([^`]+)`.*/\1/p' /out/fused_regression.md | tail -n1 || true)" - fused_mode0_decode_sane="$(sed -nE 's/^- mode0 decode sanity: `([^`]+)`.*/\1/p' /out/fused_regression.md | tail -n1 || true)" - fused_mode0_prefill_sane="$(sed -nE 's/^- mode0 prefill sanity: `([^`]+)`.*/\1/p' /out/fused_regression.md | tail -n1 || true)" - if [[ -z "$fused_decode_safe" ]]; then fused_decode_safe="NA"; fi - if [[ -z "$fused_prefill_safe" ]]; then fused_prefill_safe="NA"; fi - if [[ -z "$fused_mode0_decode_sane" ]]; then fused_mode0_decode_sane="NA"; fi - if [[ -z "$fused_mode0_prefill_sane" ]]; then fused_mode0_prefill_sane="NA"; fi -fi - -best_pp_tsv="$(extract_best_metric /out/proxy_sweep.out /out/proxy_sweep.err 6)" -best_tg_tsv="$(extract_best_metric /out/proxy_sweep.out /out/proxy_sweep.err 8)" -best_pp="${best_pp_tsv%%$'\t'*}" -best_pp_row="${best_pp_tsv#*$'\t'}" -best_tg="${best_tg_tsv%%$'\t'*}" -best_tg_row="${best_tg_tsv#*$'\t'}" - -{ - echo "# Qwen3Next Regression Summary" - echo - echo "## Fused Regression" - echo "- config: \`ctx=${REG_CTX}, decode(b=${REG_DECODE_B},ub=${REG_DECODE_UB}), prefill(b=${REG_PREFILL_B},ub=${REG_PREFILL_UB}), n-cpu-moe=${PROXY_N_CPU_MOE}\`" - echo "- decode safety: \`$fused_decode_safe\`" - echo "- prefill safety: \`$fused_prefill_safe\`" - echo "- mode0 decode sanity: \`$fused_mode0_decode_sane\`" - echo "- mode0 prefill sanity: \`$fused_mode0_prefill_sane\`" - echo "- report: \`/out/fused_regression.md\`" - echo - echo "## Proxy Sweep" - echo "- config: \`c=${PROXY_CTX}, b=${PROXY_B}, ub=${PROXY_UB}, n=${PROXY_N}, n-cpu-moe=${PROXY_N_CPU_MOE}\`" - echo "- best PP t/s: \`$best_pp\`" - echo "- best TG t/s: \`$best_tg\`" - echo "- best PP row: \`$best_pp_row\`" - echo "- best TG row: \`$best_tg_row\`" - echo - echo "## Long-Context Fit" - if [[ "$WITH_FIT" == "1" ]]; then - echo "- config: \`c=${FIT_CTX}, n-cpu-moe=${FIT_N_CPU_MOE}, n=${FIT_N}\`" - echo "- output: \`/out/fit_sanity.out\`" - else - echo "- skipped" - fi - echo - echo "## Command Status" - echo '```' - cat "$STATUS_FILE" - echo '```' -} > /out/SUMMARY.md - -log "Summary written to /out/SUMMARY.md" -BASH - -chmod +x "${out_dir}/run_inside.sh" - -docker_cmd=( - docker run --rm --gpus all - -e IK_BUILD_DIR="${IK_BUILD_DIR}" - -e GPU_DEVICE="${GPU_DEVICE}" - -e THREADS="${THREADS}" - -e FA="${FA}" - -e NGL="${NGL}" - -e PROXY_CTX="${PROXY_CTX}" - -e PROXY_B="${PROXY_B}" - -e PROXY_UB="${PROXY_UB}" - -e PROXY_N="${PROXY_N}" - -e PROXY_N_CPU_MOE="${PROXY_N_CPU_MOE}" - -e REG_CTX="${REG_CTX}" - -e REG_NGL="${REG_NGL}" - -e REG_DECODE_B="${REG_DECODE_B}" - -e REG_DECODE_UB="${REG_DECODE_UB}" - -e REG_PREFILL_B="${REG_PREFILL_B}" - -e REG_PREFILL_UB="${REG_PREFILL_UB}" - -e WITH_FIT="${WITH_FIT}" - -e FIT_CTX="${FIT_CTX}" - -e FIT_N_CPU_MOE="${FIT_N_CPU_MOE}" - -e FIT_N="${FIT_N}" - -v "${IK_REPO}:/ik" - -v "${MODEL_HOST}:/model.gguf:ro" - -v "${out_dir}:/out" - "${IMAGE}" /bin/bash /out/run_inside.sh -) - -echo "Running regression in container: ${IMAGE}" -echo "Output directory: ${out_dir}" -"${docker_cmd[@]}" - -echo -echo "Done. Summary:" -echo " ${out_dir}/SUMMARY.md" -echo "Raw logs:" -echo " ${out_dir}/*.out" -echo " ${out_dir}/*.err" diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index b1c2ea5b..2b953a87 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -4181,14 +4181,12 @@ ggml_cgraph * llm_build_context::build_qwen3next() { enum class qwen3next_fused_delta_mode { off, tok_gt1, - all_tokens, }; // Keep legacy DeltaNet path as default for correctness. // LLAMA_QWEN3NEXT_FUSED_DELTA values: // unset / 0 : off // 1 : fused only for n_tok > 1 (safer; avoids known decode regression) - // 2 : fused for all token counts (experimental) const qwen3next_fused_delta_mode fused_delta_mode = []() { const char * env = std::getenv("LLAMA_QWEN3NEXT_FUSED_DELTA"); if (env == nullptr || env[0] == '\0') { @@ -4202,19 +4200,10 @@ ggml_cgraph * llm_build_context::build_qwen3next() { case 't': case 'T': return qwen3next_fused_delta_mode::tok_gt1; - case '2': - return qwen3next_fused_delta_mode::all_tokens; default: return qwen3next_fused_delta_mode::off; } }(); - if (fused_delta_mode == qwen3next_fused_delta_mode::all_tokens) { - static bool warned_all_tokens = false; - if (!warned_all_tokens) { - LLAMA_LOG_WARN("%s: LLAMA_QWEN3NEXT_FUSED_DELTA=2 enables fused single-token decode; quality regression is known in this mode\n", __func__); - warned_all_tokens = true; - } - } auto get_slice_2d = [&](ggml_tensor * t, int64_t c) -> ggml_tensor * { return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3], @@ -4850,8 +4839,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() { std::pair attn_out; const bool use_fused_delta_net = - (fused_delta_mode == qwen3next_fused_delta_mode::tok_gt1 && n_tok > 1) || - (fused_delta_mode == qwen3next_fused_delta_mode::all_tokens); + (fused_delta_mode == qwen3next_fused_delta_mode::tok_gt1 && n_tok > 1); if (use_fused_delta_net) { attn_out = build_delta_net_fused(q_conv, k_conv, v_conv, gate, beta, state, il); @@ -4935,16 +4923,14 @@ ggml_cgraph * llm_build_context::build_qwen3next() { ggml_tensor * causal_mask = nullptr; ggml_tensor * identity = nullptr; ggml_tensor * diag_mask = nullptr; - if (fused_delta_mode != qwen3next_fused_delta_mode::all_tokens) { - causal_mask = ggml_tri(ctx0, - ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f), - GGML_TRI_TYPE_LOWER); - identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f)); - diag_mask = ggml_add(ctx0, causal_mask, identity); - ggml_build_forward_expand(gf, causal_mask); - ggml_build_forward_expand(gf, identity); - ggml_build_forward_expand(gf, diag_mask); - } + causal_mask = ggml_tri(ctx0, + ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f), + GGML_TRI_TYPE_LOWER); + identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f)); + diag_mask = ggml_add(ctx0, causal_mask, identity); + ggml_build_forward_expand(gf, causal_mask); + ggml_build_forward_expand(gf, identity); + ggml_build_forward_expand(gf, diag_mask); ggml_tensor * cur = nullptr;