From 6db8dc86caaecea8f84f1ad22600ef71f3830fad Mon Sep 17 00:00:00 2001
From: yurko
Date: Fri, 6 Feb 2026 19:28:17 -0800
Subject: [PATCH] qwen3next: split cpu/cuda eval builds and tune PP scheduling

---
 .../development/qwen3next_perf_diff_report.md | 98 +++++++++++++++++++
 scripts/qwen3next-eval.sh                     | 54 ++++++----
 src/llama.cpp                                 | 14 ++-
 3 files changed, 146 insertions(+), 20 deletions(-)

diff --git a/docs/development/qwen3next_perf_diff_report.md b/docs/development/qwen3next_perf_diff_report.md
index 26446050..07294509 100644
--- a/docs/development/qwen3next_perf_diff_report.md
+++ b/docs/development/qwen3next_perf_diff_report.md
@@ -382,3 +382,101 @@ Run artifact: `/tmp/qwen3next-repeat-20260206_064133`
 - run3: `126.33 / 23.42`
 
 Interpretation: in this setup, `-no-ooae` is currently more stable and generally faster for PP; default OOAE shows large variance and occasional severe PP drops.
+
+## 2026-02-06 Dual-Build Split + Context Sweep + PP Profiling
+
+### Code updates in this pass
+
+1. `src/llama.cpp`
+   - Added a Qwen3Next-specific guard that disables `only_active_experts` for large-batch hybrid MoE prompt paths:
+     - condition: `arch == QWEN3NEXT`, tensor overrides enabled, `n_batch >= 512`
+   - Rationale: avoid extra scheduling/sync/copy overhead in this PP-heavy path.
+2. `scripts/qwen3next-eval.sh`
+   - Added build-dir selection so CPU and CUDA trees can be reused without rebuild toggling:
+     - `--main-build-dir`
+     - `--ik-build-dir`
+   - Fixed runtime loader paths to include both `bin` and `src` shared-library locations.
+
+### Separate build setup (requested)
+
+Built and validated two persistent trees in `ik_llama.cpp`:
+
+- `build-cpu`: `GGML_CUDA=OFF`, `GGML_BLAS=ON`
+- `build-cuda`: `GGML_CUDA=ON`, `GGML_BLAS=OFF`
+
+Command used:
+
+```bash
+docker run --rm --gpus all \
+  -v /home/yurko/Code/ik_llama.cpp:/ik \
+  -w /ik \
+  iktest-dev:latest \
+  bash -lc '
+    cmake -S /ik -B /ik/build-cpu -DGGML_CUDA=OFF -DGGML_BLAS=ON -DCMAKE_BUILD_TYPE=Release
+    cmake --build /ik/build-cpu --config Release -j 56 --target llama-cli llama-sweep-bench llama-perplexity
+    cmake -S /ik -B /ik/build-cuda -DGGML_CUDA=ON -DGGML_BLAS=OFF -DCMAKE_BUILD_TYPE=Release
+    cmake --build /ik/build-cuda --config Release -j 56 --target llama-cli llama-sweep-bench llama-perplexity
+  '
+```
+
+### Parity rerun after this pass
+
+Run artifact: `/tmp/qwen3next-eval/20260206_191050`
+
+- CPU PPL parity:
+  - chunks=1: mainline `1.0009`, ik `1.0009`, delta `0.000000`
+  - chunks=2: mainline `1.0005`, ik `1.0005`, delta `0.000000`
+- CUDA sanity parity:
+  - `gpu_ppl_chunks1_mainline`: `OK`
+  - `gpu_ppl_chunks1_ik`: `OK`
+
+### Requested runs: CPU `c=512`, CUDA up to `c=8192`
+
+Run artifact: `/tmp/qwen3next-dual-build-20260206_191427`
+
+Config:
+
+- CPU: `build-cpu`, `-c 512 -b 1024 -ub 128 -n 16 -ngl 0`
+- CUDA: `build-cuda`, `-c {512,1024,2048,4096,8192} -b 1024 -ub 128 -n 16 -ngl 999 --cpu-moe`
+
+| Case | maxPP (t/s) | maxTG (t/s) | graph splits |
+| --- | ---: | ---: | ---: |
+| `cpu_c512` | 98.31 | 6.58 | 1 |
+| `cuda_c512` | 137.09 | 25.69 | 530 |
+| `cuda_c1024` | 135.74 | 27.68 | 530 |
+| `cuda_c2048` | 134.87 | 26.71 | 530 |
+| `cuda_c4096` | 136.62 | 27.37 | 530 |
+| `cuda_c8192` | 137.50 | 27.53 | 530 |
+
+Observation: PP remains roughly flat (`~135-137 t/s`) from `c=512` through `c=8192`, so this is not primarily a long-context KV-scaling bottleneck.
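+
+For reference, the CUDA context sweep above reduces to a small loop. A minimal sketch, assuming the container layout used by `qwen3next-eval.sh` (`/model.gguf`, `/ik`); the flags mirror the config listed above:
+
+```bash
+# Sketch: CUDA context sweep c=512..8192 against the persistent CUDA tree.
+BIN=/ik/build-cuda/bin
+for c in 512 1024 2048 4096 8192; do
+  LD_LIBRARY_PATH="$BIN" "$BIN/llama-sweep-bench" \
+    -m /model.gguf --cpu-moe -ngl 999 \
+    -c "$c" -b 1024 -ub 128 -n 16
+done
+```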
+
+### Prompt-processing bottleneck profiling
+
+Run artifact: `/tmp/qwen3next-profile-20260206_192018`
+
+| Case | maxPP (t/s) | maxTG (t/s) | splits | threads | offloaded layers |
+| --- | ---: | ---: | ---: | --- | --- |
+| `single_default` | 125.77 | 24.01 | 530 | `t=8,tb=8` | `49/49` |
+| `single_t16_tb16` | 37.00 | 0.85 | 530 | `t=16,tb=16` | `49/49` |
+| `dual_default` | 128.94 | 22.75 | 531 | `t=8,tb=8` | `49/49` |
+| `dual_t16_tb16` | 37.68 | 0.82 | 531 | `t=16,tb=16` | `49/49` |
+
+Key findings:
+
+1. Increasing CPU threads to 16 on this CPU-MoE path is strongly harmful on this machine: PP drops by roughly 3.4x and TG collapses to under 1 t/s.
+2. Dual-GPU (`0,1`) does not materially improve PP over single-GPU for this config.
+3. Main logs still show all expert tensors overridden to CPU and a large CPU expert buffer (`~45.8 GiB`), so PP is dominated by CPU-MoE path behavior rather than GPU-context growth.
+4. Graph splits remain high (`~530`) and stable across contexts, indicating persistent scheduler/backend overhead.
+
+### Additional variance check (`default` vs `-no-ooae`)
+
+Run artifact: `/tmp/qwen3next-ooae-repeat-20260206_192523`
+
+- `default` (with auto Qwen3Next guard): `112.64/23.88`, `135.73/26.40`, `135.30/27.19` (PP/TG)
+- `-no-ooae`: `131.87/25.97`, `113.80/23.77`, `114.25/23.79`
+
+Interpretation: run-to-run variance is still significant in this environment; however, the new auto-guard removes the worst observed OOAE collapse mode in the default path while preserving parity.
+
+### Why this is still below ~400 t/s PP
+
+Given this exact setup, the dominant limiter is CPU-MoE expert execution (large expert tensors on CPU + routing/scheduler overhead), not context length. With `--cpu-moe`, this hardware/config currently lands around `~125-137 t/s` PP in stable runs. Reaching `~400 t/s` PP on this model likely requires reducing or eliminating CPU-MoE dependence (more VRAM / different placement strategy) rather than only kernel micro-tuning.
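+
+One concrete direction for the placement strategy: keep only part of the expert stack on the CPU instead of all of it. A minimal sketch, assuming this tree exposes `--n-cpu-moe` (the per-layer-count variant of `--cpu-moe`, as in mainline); the layer count here is a placeholder to tune against available VRAM:
+
+```bash
+# Sketch: partial expert placement instead of --cpu-moe (all experts on CPU).
+# The value 30 is a placeholder: lower it until VRAM is exhausted, then back off.
+BIN=/ik/build-cuda/bin
+LD_LIBRARY_PATH="$BIN" "$BIN/llama-sweep-bench" \
+  -m /model.gguf -ngl 999 --n-cpu-moe 30 \
+  -c 2048 -b 1024 -ub 128 -n 16
+```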
diff --git a/scripts/qwen3next-eval.sh b/scripts/qwen3next-eval.sh
index ffbe2241..1fe6dfec 100755
--- a/scripts/qwen3next-eval.sh
+++ b/scripts/qwen3next-eval.sh
@@ -4,6 +4,8 @@ set -euo pipefail
 IMAGE="${IMAGE:-iktest-dev:latest}"
 MAIN_REPO="${MAIN_REPO:-/home/yurko/Code/llama.cpp}"
 IK_REPO="${IK_REPO:-/home/yurko/Code/ik_llama.cpp}"
+MAIN_BUILD_DIR="${MAIN_BUILD_DIR:-build}"
+IK_BUILD_DIR="${IK_BUILD_DIR:-build}"
 MODEL_HOST="${MODEL_HOST:-/home/yurko/.cache/llama.cpp/qwen3-next-coder.gguf}"
 OUT_ROOT="${OUT_ROOT:-/tmp/qwen3next-eval}"
 WITH_GPU=0
@@ -22,6 +24,8 @@ Options:
   --image IMAGE          Docker image to run checks in (default: iktest-dev:latest).
   --main-repo PATH       Mainline repo path (default: /home/yurko/Code/llama.cpp).
   --ik-repo PATH         ik repo path (default: /home/yurko/Code/ik_llama.cpp).
+  --main-build-dir NAME  Mainline build dir under main repo (default: build).
+  --ik-build-dir NAME    ik build dir under ik repo (default: build).
   --model PATH           Host path to model GGUF file.
   --out-root PATH        Output root directory (default: /tmp/qwen3next-eval).
   --sweep-ctx N          Sweep context size for PP/TG check (default: 2048).
@@ -64,6 +68,14 @@ while [[ $# -gt 0 ]]; do
       IK_REPO="$2"
       shift 2
       ;;
+    --main-build-dir)
+      MAIN_BUILD_DIR="$2"
+      shift 2
+      ;;
+    --ik-build-dir)
+      IK_BUILD_DIR="$2"
+      shift 2
+      ;;
     --model)
       MODEL_HOST="$2"
       shift 2
@@ -129,9 +141,13 @@ WITH_GPU="${WITH_GPU:-0}"
 GPU_DEVICE="${GPU_DEVICE:-0}"
 SWEEP_CTX="${SWEEP_CTX:-2048}"
 SWEEP_N="${SWEEP_N:-32}"
+MAIN_BUILD_DIR="${MAIN_BUILD_DIR:-build}"
+IK_BUILD_DIR="${IK_BUILD_DIR:-build}"
 
-MAIN_LD="/mainline/build/bin"
-IK_LD="/ik/build/src:/ik/build/ggml/src:/ik/build/examples/mtmd"
+MAIN_BIN="/mainline/${MAIN_BUILD_DIR}/bin"
+IK_BIN="/ik/${IK_BUILD_DIR}/bin"
+MAIN_LD="/mainline/${MAIN_BUILD_DIR}/bin:/mainline/${MAIN_BUILD_DIR}/src:/mainline/${MAIN_BUILD_DIR}/ggml/src:/mainline/${MAIN_BUILD_DIR}/examples/mtmd"
+IK_LD="/ik/${IK_BUILD_DIR}/bin:/ik/${IK_BUILD_DIR}/src:/ik/${IK_BUILD_DIR}/ggml/src:/ik/${IK_BUILD_DIR}/examples/mtmd"
 
 MODEL="/model.gguf"
 RUN_LOG="/out/run.log"
@@ -294,38 +310,38 @@ has_token() {
 }
 
 main_ppl() {
-  LD_LIBRARY_PATH="$MAIN_LD" /mainline/build/bin/llama-perplexity "$@"
+  LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-perplexity" "$@"
 }
 
 ik_ppl() {
-  LD_LIBRARY_PATH="$IK_LD" /ik/build/bin/llama-perplexity "$@"
+  LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-perplexity" "$@"
 }
 
 main_cli() {
-  LD_LIBRARY_PATH="$MAIN_LD" /mainline/build/bin/llama-cli "$@"
+  LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-cli" "$@"
 }
 
 main_completion() {
-  LD_LIBRARY_PATH="$MAIN_LD" /mainline/build/bin/llama-completion "$@"
+  LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-completion" "$@"
 }
 
 ik_cli() {
-  LD_LIBRARY_PATH="$IK_LD" /ik/build/bin/llama-cli "$@"
+  LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-cli" "$@"
 }
 
 main_sweep() {
-  LD_LIBRARY_PATH="$MAIN_LD" /mainline/build/bin/llama-sweep-bench "$@"
+  LD_LIBRARY_PATH="$MAIN_LD" "$MAIN_BIN/llama-sweep-bench" "$@"
 }
 
 ik_sweep() {
-  LD_LIBRARY_PATH="$IK_LD" /ik/build/bin/llama-sweep-bench "$@"
+  LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-sweep-bench" "$@"
 }
 
-require_bin "/mainline/build/bin/llama-perplexity"
-require_bin "/mainline/build/bin/llama-cli"
-require_bin "/mainline/build/bin/llama-completion"
-require_bin "/ik/build/bin/llama-perplexity"
-require_bin "/ik/build/bin/llama-cli"
+require_bin "$MAIN_BIN/llama-perplexity"
+require_bin "$MAIN_BIN/llama-cli"
+require_bin "$MAIN_BIN/llama-completion"
+require_bin "$IK_BIN/llama-perplexity"
+require_bin "$IK_BIN/llama-cli"
 
 if [[ "$WITH_GPU" != "1" ]]; then
   export CUDA_VISIBLE_DEVICES=""
@@ -364,19 +380,19 @@ if [[ "$WITH_GPU" == "1" ]]; then
     ik_ppl -m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks 1 --no-warmup -ngl 1 || true
 
   # Quick sweep sanity (mainline -> ik)
-  if [[ -x /mainline/build/bin/llama-sweep-bench ]]; then
+  if [[ -x "$MAIN_BIN/llama-sweep-bench" ]]; then
     run_cmd "gpu_sweep_mainline" \
       main_sweep -m "$MODEL" --cpu-moe -ngl 999 -c "$SWEEP_CTX" -b 1024 -ub 128 -n "$SWEEP_N" -ctk f16 -ctv f16 || true
   else
     printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_mainline" >> "$STATUS_FILE"
-    log "SKIP: gpu_sweep_mainline (missing /mainline/build/bin/llama-sweep-bench)"
+    log "SKIP: gpu_sweep_mainline (missing $MAIN_BIN/llama-sweep-bench)"
   fi
-  if [[ -x /ik/build/bin/llama-sweep-bench ]]; then
+  if [[ -x "$IK_BIN/llama-sweep-bench" ]]; then
     run_cmd "gpu_sweep_ik" \
       ik_sweep -m "$MODEL" --cpu-moe -ngl 999 -c "$SWEEP_CTX" -b 1024 -ub 128 -n "$SWEEP_N" -ctk f16 -ctv f16 || true
   else
     printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_ik" >> "$STATUS_FILE"
-    log "SKIP: gpu_sweep_ik (missing /ik/build/bin/llama-sweep-bench)"
+    log "SKIP: gpu_sweep_ik (missing $IK_BIN/llama-sweep-bench)"
   fi
 fi
 
@@ -444,6 +460,8 @@ docker_cmd=(
   -e GPU_DEVICE="${GPU_DEVICE}"
   -e SWEEP_CTX="${SWEEP_CTX}"
   -e SWEEP_N="${SWEEP_N}"
+  -e MAIN_BUILD_DIR="${MAIN_BUILD_DIR}"
+  -e IK_BUILD_DIR="${IK_BUILD_DIR}"
  -v "${MAIN_REPO}:/mainline"
   -v "${IK_REPO}:/ik"
   -v "${MODEL_HOST}:/model.gguf:ro"
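
With the new options, the harness can target the persistent trees directly. A usage sketch (host paths are the script defaults; `build-cuda` is the tree built in the report above):

```bash
# Sketch: evaluate against the persistent CUDA tree without rebuilding.
scripts/qwen3next-eval.sh \
  --ik-build-dir build-cuda \
  --main-build-dir build \
  --model /home/yurko/.cache/llama.cpp/qwen3-next-coder.gguf
```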
diff --git a/src/llama.cpp b/src/llama.cpp
index 6e960459..36d0ff51 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4984,8 +4984,18 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-    if (params.only_active_experts) {
-        LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting only active experts offload\n");
+    bool only_active_experts = params.only_active_experts;
+    if (only_active_experts &&
+        model->arch == LLM_ARCH_QWEN3NEXT &&
+        model->has_tensor_overrides() &&
+        cparams.n_batch >= 512) {
+        // In large-batch hybrid CPU/GPU MoE prompt processing, moving only active experts can
+        // add synchronization and copy overhead. Disable this mode for this Qwen3Next path.
+        LLAMA_LOG_INFO("%s: disabling only_active_experts for Qwen3Next large-batch hybrid MoE prompt path\n", __func__);
+        only_active_experts = false;
+    }
+    if (only_active_experts) {
+        LLAMA_LOG_INFO("%s: enabling only_active_experts scheduling\n", __func__);
         ggml_backend_sched_set_only_active_experts(ctx->sched, true);
     }
     if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && (!model->has_tensor_overrides() || cparams.split_mode_graph_scheduling)) {
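
To confirm the guard fires, the log line added above can be checked directly. A minimal sketch, assuming OOAE is enabled by default in this tree (the report's `-no-ooae` flag implies it is) and using the eval container layout:

```bash
# Sketch: the guard should log once at context creation when the model is
# Qwen3Next, tensor overrides are active (--cpu-moe), and n_batch >= 512.
BIN=/ik/build-cuda/bin
LD_LIBRARY_PATH="$BIN" "$BIN/llama-cli" \
  -m /model.gguf --cpu-moe -ngl 999 -b 1024 \
  -p "hello" -n 8 2>&1 | grep "disabling only_active_experts"
```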