cuda: reduce qwen3next moe/ssm sync overhead and refresh eval

2026-02-24 23:24:13 +00:00 · 2026-02-06 14:46:59 +00:00
parent c767cfa1d3
commit e64b43392f
6 changed files with 624 additions and 43 deletions
--- a/scripts/qwen3next-eval.sh
+++ b/scripts/qwen3next-eval.sh
@@ -0,0 +1,468 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Host-side settings. Each value below keeps whatever the environment already
+# provides and otherwise falls back to the default shown; the command-line
+# options parsed further down override both.
+: "${IMAGE:=iktest-dev:latest}"
+: "${MAIN_REPO:=/home/yurko/Code/llama.cpp}"
+: "${IK_REPO:=/home/yurko/Code/ik_llama.cpp}"
+: "${MODEL_HOST:=/home/yurko/.cache/llama.cpp/qwen3-next-coder.gguf}"
+: "${OUT_ROOT:=/tmp/qwen3next-eval}"
+# GPU checks are switched on only by --with-gpu; an inherited WITH_GPU value
+# is deliberately overwritten here.
+WITH_GPU=0
+: "${GPU_DEVICE:=0}"
+: "${SWEEP_CTX:=2048}"
+: "${SWEEP_N:=32}"
+
+# Print the command-line help text to stdout.
+# The heredoc delimiter is quoted, so the text is emitted literally; the
+# defaults shown in it are written out by hand and must be kept in sync with
+# the variable defaults at the top of the script.
+usage() {
+    cat <<'USAGE'
+Usage:
+  scripts/qwen3next-eval.sh [options]
+
+Options:
+  --with-gpu                 Enable GPU checks in addition to CPU checks.
+  --gpu-device ID            CUDA device id to use for GPU sanity checks (default: 0).
+  --image IMAGE              Docker image to run checks in (default: iktest-dev:latest).
+  --main-repo PATH           Mainline repo path (default: /home/yurko/Code/llama.cpp).
+  --ik-repo PATH             ik repo path (default: /home/yurko/Code/ik_llama.cpp).
+  --model PATH               Host path to model GGUF file.
+  --out-root PATH            Output root directory (default: /tmp/qwen3next-eval).
+  --sweep-ctx N              Sweep context size for PP/TG check (default: 2048).
+  --sweep-n N                Sweep generation tokens (default: 32).
+  -h, --help                 Show this help.
+
+What this script runs (in this order):
+  1) CPU perplexity parity (chunks=1)      mainline -> ik
+  2) CPU perplexity parity (chunks=2)      mainline -> ik
+  3) CPU short generation smoke quality    mainline -> ik
+  4) Optional GPU sanity checks            mainline -> ik
+
+Output:
+  A timestamped folder is created under OUT_ROOT with:
+  - SUMMARY.md
+  - run.log
+  - *.out / *.err logs for each command
+USAGE
+}
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --with-gpu)
+            WITH_GPU=1
+            shift
+            ;;
+        --gpu-device)
+            GPU_DEVICE="$2"
+            shift 2
+            ;;
+        --image)
+            IMAGE="$2"
+            shift 2
+            ;;
+        --main-repo)
+            MAIN_REPO="$2"
+            shift 2
+            ;;
+        --ik-repo)
+            IK_REPO="$2"
+            shift 2
+            ;;
+        --model)
+            MODEL_HOST="$2"
+            shift 2
+            ;;
+        --out-root)
+            OUT_ROOT="$2"
+            shift 2
+            ;;
+        --sweep-ctx)
+            SWEEP_CTX="$2"
+            shift 2
+            ;;
+        --sweep-n)
+            SWEEP_N="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1" >&2
+            usage
+            exit 2
+            ;;
+    esac
+done
+
+# Fail fast on the host, before any container is started, when one of the
+# three required inputs is missing.
+[[ -d "$MAIN_REPO" ]] || { echo "Mainline repo does not exist: $MAIN_REPO" >&2; exit 1; }
+[[ -d "$IK_REPO" ]] || { echo "ik repo does not exist: $IK_REPO" >&2; exit 1; }
+[[ -f "$MODEL_HOST" ]] || { echo "Model file does not exist: $MODEL_HOST" >&2; exit 1; }
+
+# Each run gets its own timestamped directory below OUT_ROOT.
+run_id="$(date +%Y%m%d_%H%M%S)"
+out_dir="${OUT_ROOT%/}/${run_id}"
+mkdir -p "$out_dir"
+
+# Perplexity corpus: a two-line preamble followed by 400 copies of one fixed
+# sentence, written through a single redirection.
+{
+    cat <<'TXT'
+Deterministic evaluation text for quick perplexity parity checks.
+The next lines intentionally repeat a simple pattern to reduce variance.
+TXT
+    for _ in {1..400}; do
+        echo "the system writes logs and the system reads logs"
+    done
+} > "${out_dir}/ppl_input.txt"
+
+# Prompt used by the short-generation smoke check.
+printf '%s\n' \
+    'Write a concise Python function that returns the first n Fibonacci numbers iteratively, and then give one sentence explaining time complexity.' \
+    > "${out_dir}/gen_prompt.txt"
+
+cat > "${out_dir}/run_inside.sh" <<'BASH'
+#!/usr/bin/env bash
+set -euo pipefail
+
+WITH_GPU="${WITH_GPU:-0}"
+GPU_DEVICE="${GPU_DEVICE:-0}"
+SWEEP_CTX="${SWEEP_CTX:-2048}"
+SWEEP_N="${SWEEP_N:-32}"
+
+MAIN_LD="/mainline/build/bin"
+IK_LD="/ik/build/src:/ik/build/ggml/src:/ik/build/examples/mtmd"
+MODEL="/model.gguf"
+
+RUN_LOG="/out/run.log"
+STATUS_FILE="/out/status.tsv"
+
+touch "$RUN_LOG"
+printf "name\tstatus\texit_code\thost_mem_used_before_mib\thost_mem_used_after_mib\tgpu_mem_used_before_mib\tgpu_mem_used_after_mib\tmax_rss_kib\telapsed\n" > "$STATUS_FILE"
+
+# Emit one timestamped line to stdout and append the same line to RUN_LOG.
+# Arguments: $1 - message text.
+log() {
+    local text="$1"
+    local stamp
+    stamp="$(date +%H:%M:%S)"
+    printf "[%s] %s\n" "$stamp" "$text" | tee -a "$RUN_LOG"
+}
+
+require_bin() {
+    local path="$1"
+    if [[ ! -x "$path" ]]; then
+        log "MISSING: $path"
+        return 1
+    fi
+}
+
+host_mem_used_mib() {
+    awk '
+        /MemTotal:/     { mt = $2 }
+        /MemAvailable:/ { ma = $2 }
+        END {
+            if (mt > 0 && ma >= 0) {
+                printf "%.1f", (mt - ma) / 1024.0
+            } else {
+                print "NA"
+            }
+        }
+    ' /proc/meminfo
+}
+
+# Print the memory.used value reported by nvidia-smi, one entry per reported
+# GPU joined by commas. Prints "NA" when GPU mode is off, nvidia-smi is not
+# on PATH, or the query produced no output.
+gpu_mem_used_mib() {
+    if [[ "$WITH_GPU" != "1" ]] || ! command -v nvidia-smi >/dev/null 2>&1; then
+        echo "NA"
+        return
+    fi
+    local readings
+    # Best effort: a failing query falls through to the "NA" default.
+    readings="$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | tr '\n' ',' | sed 's/,$//' || true)"
+    echo "${readings:-NA}"
+}
+
+# Print the peak resident set size recorded in a GNU `time -v` report.
+# Arguments: $1 - path to the report written by `/usr/bin/time -v -o`.
+# Outputs:   the value from the "Maximum resident set size (kbytes)" line,
+#            or "NA" when the file or that line is missing.
+extract_max_rss_kib() {
+    local time_file="$1"
+    local rss=""
+    if [[ -f "$time_file" ]]; then
+        # GNU time indents every report line with a tab, so the label must
+        # not be anchored at column 0. The value is the last field on the
+        # line; when several lines match, the last one wins.
+        rss="$(awk '/^[ \t]*Maximum resident set size/ { v = $NF } END { if (v != "") print v }' "$time_file" || true)"
+    fi
+    echo "${rss:-NA}"
+}
+
+extract_elapsed() {
+    local time_file="$1"
+    if [[ ! -f "$time_file" ]]; then
+        echo "NA"
+        return
+    fi
+    local elapsed
+    elapsed="$(grep -E '^Elapsed \(wall clock\) time' "$time_file" | sed -E 's/^[^:]+:[[:space:]]*//' | tail -n1 || true)"
+    if [[ -z "$elapsed" ]]; then
+        echo "NA"
+    else
+        echo "$elapsed"
+    fi
+}
+
+run_cmd() {
+    local name="$1"
+    shift
+    local out_file="/out/${name}.out"
+    local err_file="/out/${name}.err"
+    local time_file="/out/${name}.time"
+    local ec
+    local host_before host_after gpu_before gpu_after max_rss elapsed
+
+    host_before="$(host_mem_used_mib)"
+    gpu_before="$(gpu_mem_used_mib)"
+    log "RUN: $name"
+
+    set +e
+    if [[ -x /usr/bin/time ]]; then
+        /usr/bin/time -v -o "$time_file" "$@" >"$out_file" 2>"$err_file"
+        ec=$?
+    else
+        "$@" >"$out_file" 2>"$err_file"
+        ec=$?
+    fi
+    set -e
+
+    host_after="$(host_mem_used_mib)"
+    gpu_after="$(gpu_mem_used_mib)"
+    max_rss="$(extract_max_rss_kib "$time_file")"
+    elapsed="$(extract_elapsed "$time_file")"
+
+    if [[ $ec -eq 0 ]]; then
+        printf "%s\tOK\t0\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+            "$name" "$host_before" "$host_after" "$gpu_before" "$gpu_after" "$max_rss" "$elapsed" >> "$STATUS_FILE"
+        log "OK: $name"
+    else
+        printf "%s\tFAIL\t%d\t%s\t%s\t%s\t%s\t%s\t%s\n" \
+            "$name" "$ec" "$host_before" "$host_after" "$gpu_before" "$gpu_after" "$max_rss" "$elapsed" >> "$STATUS_FILE"
+        log "FAIL($ec): $name"
+    fi
+    return $ec
+}
+
+extract_ppl() {
+    local out_file="$1"
+    local err_file="$2"
+    local line num
+
+    line="$(cat "$out_file" "$err_file" 2>/dev/null | grep -E "Final estimate:" | tail -n1 || true)"
+    if [[ -z "$line" ]]; then
+        echo "NA"
+        return
+    fi
+
+    num="$(echo "$line" | sed -nE 's/.*= ([0-9]+\.[0-9]+).*/\1/p')"
+    if [[ -z "$num" ]]; then
+        num="$(echo "$line" | grep -Eo '[0-9]+\.[0-9]+' | head -n1 || true)"
+    fi
+    if [[ -z "$num" ]]; then
+        echo "NA"
+    else
+        echo "$num"
+    fi
+}
+
+# Print |$1 - $2| with six decimal places and no trailing newline.
+abs_delta() {
+    local lhs="$1"
+    local rhs="$2"
+    awk -v x="$lhs" -v y="$rhs" 'BEGIN { diff = x - y; printf "%.6f", (diff < 0 ? -diff : diff) }'
+}
+
+# Print "yes" when the file contains a case-insensitive match of the extended
+# regular expression, "no" otherwise.
+# Arguments: $1 - file to search, $2 - extended regular expression.
+has_token() {
+    local file="$1"
+    local pattern="$2"
+    local verdict="no"
+    if grep -Eiq "$pattern" "$file"; then
+        verdict="yes"
+    fi
+    echo "$verdict"
+}
+
+# Run a command with LD_LIBRARY_PATH set for that one invocation only.
+# Arguments: $1 - library search path, $2… - command and its arguments.
+_with_ld_path() {
+    local ld_path="$1"
+    shift
+    LD_LIBRARY_PATH="$ld_path" "$@"
+}
+
+# Thin wrappers: one per binary and build tree. Each forwards its arguments
+# unchanged and selects the library path that belongs to its tree.
+main_ppl()        { _with_ld_path "$MAIN_LD" /mainline/build/bin/llama-perplexity "$@"; }
+ik_ppl()          { _with_ld_path "$IK_LD" /ik/build/bin/llama-perplexity "$@"; }
+main_cli()        { _with_ld_path "$MAIN_LD" /mainline/build/bin/llama-cli "$@"; }
+main_completion() { _with_ld_path "$MAIN_LD" /mainline/build/bin/llama-completion "$@"; }
+ik_cli()          { _with_ld_path "$IK_LD" /ik/build/bin/llama-cli "$@"; }
+main_sweep()      { _with_ld_path "$MAIN_LD" /mainline/build/bin/llama-sweep-bench "$@"; }
+ik_sweep()        { _with_ld_path "$IK_LD" /ik/build/bin/llama-sweep-bench "$@"; }
+
+require_bin "/mainline/build/bin/llama-perplexity"
+require_bin "/mainline/build/bin/llama-cli"
+require_bin "/mainline/build/bin/llama-completion"
+require_bin "/ik/build/bin/llama-perplexity"
+require_bin "/ik/build/bin/llama-cli"
+
+# Restrict CUDA to the requested device in GPU mode, or to no device at all
+# in CPU-only mode.
+if [[ "$WITH_GPU" == "1" ]]; then
+    export CUDA_VISIBLE_DEVICES="$GPU_DEVICE"
+    log "GPU checks enabled on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+else
+    export CUDA_VISIBLE_DEVICES=""
+    log "GPU checks disabled (CPU-only mode)"
+fi
+
+PPL_INPUT="/out/ppl_input.txt"
+GEN_PROMPT="$(cat /out/gen_prompt.txt)"
+
+# Run one perplexity check on mainline and then on ik with identical
+# arguments. A failing command is recorded by run_cmd and does not stop the run.
+# Arguments: $1 - name prefix (cpu|gpu), $2 - chunk count, $3 - value for -ngl.
+ppl_pair() {
+    local prefix="$1" chunks="$2" ngl="$3"
+    local -a ppl_args=(-m "$MODEL" -f "$PPL_INPUT" -c 256 -b 64 -ub 64 --chunks "$chunks" --no-warmup -ngl "$ngl")
+    run_cmd "${prefix}_ppl_chunks${chunks}_mainline" main_ppl "${ppl_args[@]}" || true
+    run_cmd "${prefix}_ppl_chunks${chunks}_ik" ik_ppl "${ppl_args[@]}" || true
+}
+
+# Run the sweep benchmark for one build tree, or record a SKIP row when that
+# tree has no llama-sweep-bench binary.
+# Arguments: $1 - tree name (mainline|ik), $2 - wrapper function to call.
+sweep_or_skip() {
+    local tree="$1" wrapper="$2"
+    local bench="/${tree}/build/bin/llama-sweep-bench"
+    if [[ -x "$bench" ]]; then
+        run_cmd "gpu_sweep_${tree}" \
+            "$wrapper" -m "$MODEL" --cpu-moe -ngl 999 -c "$SWEEP_CTX" -b 1024 -ub 128 -n "$SWEEP_N" -ctk f16 -ctv f16 || true
+    else
+        printf "%s\tSKIP\t0\tNA\tNA\tNA\tNA\tNA\tNA\n" "gpu_sweep_${tree}" >> "$STATUS_FILE"
+        log "SKIP: gpu_sweep_${tree} (missing ${bench})"
+    fi
+}
+
+# CPU perplexity: chunks=1, then chunks=2 (mainline -> ik)
+ppl_pair cpu 1 0
+ppl_pair cpu 2 0
+
+# CPU short generation smoke quality (mainline -> ik)
+gen_args=(-m "$MODEL" --cpu-moe -ngl 0 -c 512 -n 64 --seed 123 --temp 0 --top-k 1 --simple-io --no-display-prompt -p "$GEN_PROMPT")
+run_cmd "cpu_gen_mainline" main_completion "${gen_args[@]}" || true
+run_cmd "cpu_gen_ik" ik_cli "${gen_args[@]}" || true
+
+if [[ "$WITH_GPU" == "1" ]]; then
+    # CUDA sanity perplexity: chunks=1 (mainline -> ik)
+    ppl_pair gpu 1 1
+
+    # Quick sweep sanity (mainline -> ik)
+    sweep_or_skip mainline main_sweep
+    sweep_or_skip ik ik_sweep
+fi
+
+# Aggregate summary
+
+# Print the perplexity found in the captured output of the named check.
+ppl_of() {
+    extract_ppl "/out/$1.out" "/out/$1.err"
+}
+
+# Print |$1 - $2|, or "NA" when either input is "NA".
+delta_or_na() {
+    if [[ "$1" != "NA" && "$2" != "NA" ]]; then
+        abs_delta "$1" "$2"
+    else
+        echo "NA"
+    fi
+}
+
+cpu_c1_main="$(ppl_of cpu_ppl_chunks1_mainline)"
+cpu_c1_ik="$(ppl_of cpu_ppl_chunks1_ik)"
+cpu_c2_main="$(ppl_of cpu_ppl_chunks2_mainline)"
+cpu_c2_ik="$(ppl_of cpu_ppl_chunks2_ik)"
+
+cpu_c1_delta="$(delta_or_na "$cpu_c1_main" "$cpu_c1_ik")"
+cpu_c2_delta="$(delta_or_na "$cpu_c2_main" "$cpu_c2_ik")"
+
+# Keyword presence in the generated text serves as a coarse smoke signal.
+fib_pattern='fibonacci|fibs|fib'
+complexity_pattern='complexity|O\('
+main_has_fib="$(has_token /out/cpu_gen_mainline.out "$fib_pattern")"
+ik_has_fib="$(has_token /out/cpu_gen_ik.out "$fib_pattern")"
+main_has_complexity="$(has_token /out/cpu_gen_mainline.out "$complexity_pattern")"
+ik_has_complexity="$(has_token /out/cpu_gen_ik.out "$complexity_pattern")"
+
+{
+    echo "# Qwen3Next Eval Summary"
+    echo
+    echo "Mode: $( [[ "$WITH_GPU" == "1" ]] && echo "CPU+GPU" || echo "CPU-only" )"
+    echo "- Sweep config: c=\`$SWEEP_CTX\`, n=\`$SWEEP_N\`"
+    echo
+    echo "## CPU Perplexity"
+    echo "- chunks=1 mainline: \`$cpu_c1_main\`"
+    echo "- chunks=1 ik: \`$cpu_c1_ik\`"
+    echo "- chunks=1 |delta|: \`$cpu_c1_delta\`"
+    echo "- chunks=2 mainline: \`$cpu_c2_main\`"
+    echo "- chunks=2 ik: \`$cpu_c2_ik\`"
+    echo "- chunks=2 |delta|: \`$cpu_c2_delta\`"
+    echo
+    echo "## CPU Short Generation Smoke"
+    echo "- mainline has Fibonacci token(s): \`$main_has_fib\`"
+    echo "- ik has Fibonacci token(s): \`$ik_has_fib\`"
+    echo "- mainline has complexity token(s): \`$main_has_complexity\`"
+    echo "- ik has complexity token(s): \`$ik_has_complexity\`"
+    echo
+    echo "## Command Status + Memory"
+    echo '```'
+    cat "$STATUS_FILE"
+    echo '```'
+    echo
+    echo "## First Non-empty Lines (Generation)"
+    echo "### mainline"
+    awk 'NF { print; c++; if (c == 20) exit }' /out/cpu_gen_mainline.out
+    echo
+    echo "### ik"
+    awk 'NF { print; c++; if (c == 20) exit }' /out/cpu_gen_ik.out
+} > /out/SUMMARY.md
+
+log "Summary written to /out/SUMMARY.md"
+BASH
+
+chmod +x "${out_dir}/run_inside.sh"
+
+docker_cmd=(
+    docker run --rm
+    -e WITH_GPU="${WITH_GPU}"
+    -e GPU_DEVICE="${GPU_DEVICE}"
+    -e SWEEP_CTX="${SWEEP_CTX}"
+    -e SWEEP_N="${SWEEP_N}"
+    -v "${MAIN_REPO}:/mainline"
+    -v "${IK_REPO}:/ik"
+    -v "${MODEL_HOST}:/model.gguf:ro"
+    -v "${out_dir}:/out"
+)
+
+if [[ "$WITH_GPU" -eq 1 ]]; then
+    docker_cmd+=(--gpus all)
+fi
+
+docker_cmd+=("${IMAGE}" /bin/bash /out/run_inside.sh)
+
+echo "Running eval in container: ${IMAGE}"
+echo "Output directory: ${out_dir}"
+"${docker_cmd[@]}"
+
+echo
+echo "Done. Summary:"
+echo "  ${out_dir}/SUMMARY.md"
+echo "Raw logs:"
+echo "  ${out_dir}/*.out"
+echo "  ${out_dir}/*.err"