mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-21 13:44:10 +00:00
381 lines
12 KiB
Bash
Executable File
381 lines
12 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Runs the Qwen3Next regression checks inside a Docker container:
#   1) fused-delta regression guard
#   2) single-GPU proxy sweep benchmark
#   3) optional long-context fit sanity
# Every setting below takes its value from the environment when set and
# falls back to the default shown; command-line flags override both.
set -euo pipefail

# Container image and host-side locations.
IMAGE="${IMAGE:-nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04}"
IK_REPO="${IK_REPO:-/home/yurko/Code/ik_llama.cpp}"
IK_BUILD_DIR="${IK_BUILD_DIR:-build}"
MODEL_HOST="${MODEL_HOST:-/home/yurko/.cache/llama.cpp/qwen3-next-coder.gguf}"
OUT_ROOT="${OUT_ROOT:-/tmp/qwen3next-regression}"
GPU_DEVICE="${GPU_DEVICE:-0}"

# Settings shared by the checks.
THREADS="${THREADS:-8}"
FA="${FA:-on}"
NGL="${NGL:-999}"

# Proxy sweep benchmark.
PROXY_CTX="${PROXY_CTX:-8192}"
PROXY_B="${PROXY_B:-3072}"
PROXY_UB="${PROXY_UB:-768}"
PROXY_N="${PROXY_N:-128}"
PROXY_N_CPU_MOE="${PROXY_N_CPU_MOE:-40}"

# Fused regression guard.
REG_CTX="${REG_CTX:-2048}"
REG_NGL="${REG_NGL:-47}"
REG_DECODE_B="${REG_DECODE_B:-1}"
REG_DECODE_UB="${REG_DECODE_UB:-1}"
REG_PREFILL_B="${REG_PREFILL_B:-2048}"
REG_PREFILL_UB="${REG_PREFILL_UB:-512}"

# Long-context fit sanity. WITH_FIT=1 runs it; --no-fit sets it to 0.
# Read from the environment like every other setting here.
WITH_FIT="${WITH_FIT:-1}"
FIT_CTX="${FIT_CTX:-65536}"
FIT_N_CPU_MOE="${FIT_N_CPU_MOE:-47}"
FIT_N="${FIT_N:-1}"

# Print the command-line help text to stdout.
# The heredoc delimiter is quoted, so the text is emitted literally; the
# defaults shown are the built-in ones, not values taken from the environment.
usage() {
  cat <<'USAGE'
Usage:
  scripts/qwen3next-regression.sh [options]

Options:
  --image IMAGE          Docker image to run checks in (default: nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04)
  --ik-repo PATH         ik repo path (default: /home/yurko/Code/ik_llama.cpp)
  --ik-build-dir NAME    Build dir under ik repo (default: build)
  --model PATH           Host path to model GGUF file
  --out-root PATH        Output root directory (default: /tmp/qwen3next-regression)
  --gpu-device ID        CUDA device id (default: 0)
  --threads N            Threads (default: 8)
  --fa on|off            Flash attention mode (default: on)
  --ngl N                -ngl value (default: 999)

  --proxy-ctx N          Proxy sweep context (default: 8192)
  --proxy-b N            Proxy sweep batch size (default: 3072)
  --proxy-ub N           Proxy sweep ubatch size (default: 768)
  --proxy-n N            Proxy sweep generation tokens (default: 128)
  --proxy-n-cpu-moe N    Proxy sweep --n-cpu-moe (default: 40)

  --reg-ctx N            Fused regression context (default: 2048)
  --reg-ngl N            Fused regression -ngl (default: 47)
  --reg-decode-b N       Fused regression decode b (default: 1)
  --reg-decode-ub N      Fused regression decode ub (default: 1)
  --reg-prefill-b N      Fused regression prefill b (default: 2048)
  --reg-prefill-ub N     Fused regression prefill ub (default: 512)

  --fit-ctx N            Long-context fit sanity context (default: 65536)
  --fit-n-cpu-moe N      Long-context fit sanity --n-cpu-moe (default: 47)
  --fit-n N              Long-context fit sanity generation tokens (default: 1)
  --no-fit               Skip long-context fit sanity
  -h, --help             Show this help

Runs:
  1) Fused-delta regression guard (mode0/mode1/mode2 + sanity thresholds)
  2) Single-GPU proxy sweep benchmark
  3) Optional long-context fit sanity
USAGE
}

# Parse command-line options into the configuration globals.
#
# Each value-taking flag is mapped to the variable it sets; the value is then
# checked for presence before it is read, so a flag given as the last argument
# produces a clear message instead of an "unbound variable" abort under
# `set -u`.
# Arguments: the script's command line ("$@").
# Exits:     0 after -h/--help, 2 on an unknown option or a missing value.
parse_args() {
  local target
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h|--help) usage; exit 0 ;;
      --no-fit) WITH_FIT=0; shift; continue ;;
      --image) target=IMAGE ;;
      --ik-repo) target=IK_REPO ;;
      --ik-build-dir) target=IK_BUILD_DIR ;;
      --model) target=MODEL_HOST ;;
      --out-root) target=OUT_ROOT ;;
      --gpu-device) target=GPU_DEVICE ;;
      --threads) target=THREADS ;;
      --fa) target=FA ;;
      --ngl) target=NGL ;;
      --proxy-ctx) target=PROXY_CTX ;;
      --proxy-b) target=PROXY_B ;;
      --proxy-ub) target=PROXY_UB ;;
      --proxy-n) target=PROXY_N ;;
      --proxy-n-cpu-moe) target=PROXY_N_CPU_MOE ;;
      --reg-ctx) target=REG_CTX ;;
      --reg-ngl) target=REG_NGL ;;
      --reg-decode-b) target=REG_DECODE_B ;;
      --reg-decode-ub) target=REG_DECODE_UB ;;
      --reg-prefill-b) target=REG_PREFILL_B ;;
      --reg-prefill-ub) target=REG_PREFILL_UB ;;
      --fit-ctx) target=FIT_CTX ;;
      --fit-n-cpu-moe) target=FIT_N_CPU_MOE ;;
      --fit-n) target=FIT_N ;;
      *)
        echo "Unknown option: $1" >&2
        usage >&2
        exit 2
        ;;
    esac

    if [[ $# -lt 2 ]]; then
      echo "Option $1 requires a value" >&2
      usage >&2
      exit 2
    fi
    # printf -v assigns to the global named by $target (no local shadows it).
    printf -v "$target" '%s' "$2"
    shift 2
  done
}

parse_args "$@"

# Validate the host-side inputs before anything is written or started.
if [[ ! -d "$IK_REPO" ]]; then
  echo "ik repo does not exist: $IK_REPO" >&2
  exit 1
fi
if [[ ! -f "$MODEL_HOST" ]]; then
  echo "Model file does not exist: $MODEL_HOST" >&2
  exit 1
fi

# Each run writes into its own timestamped directory under OUT_ROOT
# (a trailing slash on OUT_ROOT is dropped to avoid a doubled separator).
run_id="$(date +%Y%m%d_%H%M%S)"
out_dir="${OUT_ROOT%/}/${run_id}"
mkdir -p "$out_dir"

# Generate the script that is executed inside the container. The heredoc
# delimiter is quoted, so everything up to the closing BASH line is written
# literally and only expanded when run_inside.sh itself runs.
cat > "${out_dir}/run_inside.sh" <<'BASH'
#!/usr/bin/env bash
set -euo pipefail

# Settings are read from the container environment; the defaults mirror the
# host script's defaults.
IK_BUILD_DIR="${IK_BUILD_DIR:-build}"
GPU_DEVICE="${GPU_DEVICE:-0}"
THREADS="${THREADS:-8}"
FA="${FA:-on}"
NGL="${NGL:-999}"

PROXY_CTX="${PROXY_CTX:-8192}"
PROXY_B="${PROXY_B:-3072}"
PROXY_UB="${PROXY_UB:-768}"
PROXY_N="${PROXY_N:-128}"
PROXY_N_CPU_MOE="${PROXY_N_CPU_MOE:-40}"

REG_CTX="${REG_CTX:-2048}"
REG_NGL="${REG_NGL:-47}"
REG_DECODE_B="${REG_DECODE_B:-1}"
REG_DECODE_UB="${REG_DECODE_UB:-1}"
REG_PREFILL_B="${REG_PREFILL_B:-2048}"
REG_PREFILL_UB="${REG_PREFILL_UB:-512}"

WITH_FIT="${WITH_FIT:-1}"
FIT_CTX="${FIT_CTX:-65536}"
FIT_N_CPU_MOE="${FIT_N_CPU_MOE:-47}"
FIT_N="${FIT_N:-1}"

# In-container paths: the repo is mounted at /ik, the model at /model.gguf
# and the run's output directory at /out.
IK_BIN="/ik/${IK_BUILD_DIR}/bin"
IK_LD="/ik/${IK_BUILD_DIR}/bin:/ik/${IK_BUILD_DIR}/src:/ik/${IK_BUILD_DIR}/ggml/src:/ik/${IK_BUILD_DIR}/examples/mtmd"
MODEL="/model.gguf"

RUN_LOG="/out/run.log"
STATUS_FILE="/out/status.tsv"

# Start the run log and write the status table header.
touch "$RUN_LOG"
printf "name\tstatus\texit_code\n" > "$STATUS_FILE"

# Print a timestamped message to stdout and append the same line to $RUN_LOG.
# Globals:   RUN_LOG (read)
# Arguments: the message; several arguments are joined with spaces, and a
#            call without arguments logs an empty message instead of aborting
#            under `set -u`.
log() {
  local msg="$*"
  printf "[%s] %s\n" "$(date +%H:%M:%S)" "$msg" | tee -a "$RUN_LOG"
}

# Run a command, capture its output, and record the outcome.
#
# stdout goes to /out/<name>.out and stderr to /out/<name>.err; one
# "<name>\tOK|FAIL\t<exit code>" row is appended to $STATUS_FILE.
# Globals:   STATUS_FILE (appended to); uses log()
# Arguments: $1 - check name, used for file names and the status row
#            $2.. - the command and its arguments
# Returns:   the command's exit status
run_cmd() {
  local name="$1"
  shift
  local out_file="/out/${name}.out"
  local err_file="/out/${name}.err"
  local ec=0

  log "RUN: $name"
  # `|| ec=$?` captures the status without tripping errexit and without
  # switching `set -e` off and on, so the caller's shell options are untouched.
  "$@" >"$out_file" 2>"$err_file" || ec=$?

  if [[ $ec -eq 0 ]]; then
    printf "%s\tOK\t0\n" "$name" >> "$STATUS_FILE"
    log "OK: $name"
  else
    printf "%s\tFAIL\t%d\n" "$name" "$ec" >> "$STATUS_FILE"
    log "FAIL($ec): $name"
  fi
  return "$ec"
}

# Abort the run unless the given path is an executable regular file.
# A bare -x test also succeeds for directories, so -f is checked as well.
# Arguments: $1 - path to check
# Exits:     1 (after logging "MISSING: <path>") when the check fails
require_bin() {
  local path="$1"
  if [[ ! -f "$path" || ! -x "$path" ]]; then
    log "MISSING: $path"
    exit 1
  fi
}

# Find the largest value in one column of a pipe-delimited table.
#
# Only rows whose first cell is an integer are considered (this skips the
# header and separator rows). Rows are split on '|', so field 1 is the empty
# text before the leading pipe.
# Arguments: $1 - captured stdout file
#            $2 - captured stderr file (also scanned for table rows)
#            $3 - 1-based field number after splitting on '|'
# Outputs:   "<best, 2 decimals>\t<full row>" or "NA\tNA" when no positive
#            value is found or neither file is readable
extract_best_metric() {
  local out_file="$1"
  local err_file="$2"
  local col="$3"
  local -a inputs=()
  local candidate

  # Hand awk only the files that exist, so a check that produced no output
  # yields NA quietly and each file's last line stays a separate record.
  for candidate in "$out_file" "$err_file"; do
    if [[ -r "$candidate" ]]; then
      inputs+=("$candidate")
    fi
  done
  if [[ ${#inputs[@]} -eq 0 ]]; then
    printf 'NA\tNA\n'
    return 0
  fi

  awk -F'|' -v c="$col" '
    /^\|[[:space:]]*[0-9]+[[:space:]]*\|/ {
      v = $c
      gsub(/[[:space:]]/, "", v)
      if ((v + 0) > best) {
        best = v + 0
        row = $0
      }
    }
    END {
      if (best > 0) {
        printf "%.2f\t%s\n", best, row
      } else {
        print "NA\tNA"
      }
    }
  ' "${inputs[@]}"
}

# Stop early if any tool used by the checks below is missing from the mounts.
require_bin "$IK_BIN/llama-perplexity"
require_bin "$IK_BIN/llama-sweep-bench"
require_bin "$IK_BIN/llama-cli"
require_bin "/ik/scripts/qwen3next-fused-regression.sh"

# Restrict every subsequent command to the selected CUDA device.
export CUDA_VISIBLE_DEVICES="$GPU_DEVICE"
log "GPU checks on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"

# Check 1: fused regression guard; its report is written to
# /out/fused_regression.md. Note that --n-cpu-moe reuses PROXY_N_CPU_MOE.
# `|| true` is deliberate: run_cmd has already recorded a failure in the
# status file, and the remaining checks should still run.
run_cmd "fused_regression" \
  env LD_LIBRARY_PATH="$IK_LD" /ik/scripts/qwen3next-fused-regression.sh \
  --model "$MODEL" \
  --bin "$IK_BIN/llama-perplexity" \
  --out /out/fused_regression.md \
  --cuda-device "$GPU_DEVICE" \
  --threads "$THREADS" \
  --ctx "$REG_CTX" \
  --fa "$FA" \
  --ngl "$REG_NGL" \
  --n-cpu-moe "$PROXY_N_CPU_MOE" \
  --chunks 1 \
  --decode-b "$REG_DECODE_B" \
  --decode-ub "$REG_DECODE_UB" \
  --prefill-b "$REG_PREFILL_B" \
  --prefill-ub "$REG_PREFILL_UB" || true

# Check 2: proxy sweep benchmark; its output lands in /out/proxy_sweep.out
# and /out/proxy_sweep.err.
# `|| true` is deliberate: run_cmd has already recorded a failure in the
# status file, and the remaining steps should still run.
run_cmd "proxy_sweep" \
  env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-sweep-bench" \
  -m "$MODEL" \
  -c "$PROXY_CTX" \
  -b "$PROXY_B" \
  -ub "$PROXY_UB" \
  -n "$PROXY_N" \
  -t "$THREADS" \
  -fa "$FA" \
  --jinja \
  -ngl "$NGL" \
  --n-cpu-moe "$PROXY_N_CPU_MOE" \
  -rtr \
  --temp 1 \
  --top-p 0.95 \
  --top-k 40 \
  --min-p 0.01 || true

# Check 3 (optional): long-context fit sanity with a one-word prompt.
# When disabled, a SKIP row is written so the status table always lists it.
if [[ "$WITH_FIT" == "1" ]]; then
  # `|| true` is deliberate: run_cmd has already recorded a failure, and the
  # summary below must still be produced.
  run_cmd "fit_sanity" \
    env LD_LIBRARY_PATH="$IK_LD" "$IK_BIN/llama-cli" \
    -m "$MODEL" \
    -c "$FIT_CTX" \
    -n "$FIT_N" \
    -t "$THREADS" \
    -fa "$FA" \
    -ngl "$NGL" \
    --n-cpu-moe "$FIT_N_CPU_MOE" \
    -rtr \
    --temp 0 \
    --top-k 1 \
    --simple-io \
    --no-display-prompt \
    -p "ping" || true
else
  printf "%s\tSKIP\t0\n" "fit_sanity" >> "$STATUS_FILE"
  log "SKIP: fit_sanity"
fi

# Read one backtick-quoted value from a "- <label>: `value`" line of the
# fused regression report.
# Arguments: $1 - extended-regex text for the label between "- " and ": "
#            $2 - report path (default: /out/fused_regression.md)
# Outputs:   the value from the last matching line, or "NA" when the report
#            is missing or no line matches
fused_report_field() {
  local label_re="$1"
  local report="${2:-/out/fused_regression.md}"
  local value=""
  if [[ -f "$report" ]]; then
    value="$(sed -nE 's/^- '"$label_re"': `([^`]+)`.*/\1/p' "$report" | tail -n1 || true)"
  fi
  printf '%s\n' "${value:-NA}"
}

fused_decode_safe="$(fused_report_field 'decode safety .*')"
fused_prefill_safe="$(fused_report_field 'prefill safety .*')"
fused_mode0_decode_sane="$(fused_report_field 'mode0 decode sanity')"
fused_mode0_prefill_sane="$(fused_report_field 'mode0 prefill sanity')"

# Pick the best rows of the proxy sweep table. Field 6 feeds the summary's
# "best PP t/s" line and field 8 its "best TG t/s" line. Each result is
# "<value>\t<row>", split at the first tab below.
best_pp_tsv="$(extract_best_metric /out/proxy_sweep.out /out/proxy_sweep.err 6)"
best_tg_tsv="$(extract_best_metric /out/proxy_sweep.out /out/proxy_sweep.err 8)"
best_pp="${best_pp_tsv%%$'\t'*}"
best_pp_row="${best_pp_tsv#*$'\t'}"
best_tg="${best_tg_tsv%%$'\t'*}"
best_tg_row="${best_tg_tsv#*$'\t'}"

# Write the Markdown summary: the configuration and parsed results of each
# check, followed by the raw status table.
{
  echo "# Qwen3Next Regression Summary"
  echo
  echo "## Fused Regression"
  echo "- config: \`ctx=${REG_CTX}, decode(b=${REG_DECODE_B},ub=${REG_DECODE_UB}), prefill(b=${REG_PREFILL_B},ub=${REG_PREFILL_UB}), n-cpu-moe=${PROXY_N_CPU_MOE}\`"
  echo "- decode safety: \`$fused_decode_safe\`"
  echo "- prefill safety: \`$fused_prefill_safe\`"
  echo "- mode0 decode sanity: \`$fused_mode0_decode_sane\`"
  echo "- mode0 prefill sanity: \`$fused_mode0_prefill_sane\`"
  echo "- report: \`/out/fused_regression.md\`"
  echo
  echo "## Proxy Sweep"
  echo "- config: \`c=${PROXY_CTX}, b=${PROXY_B}, ub=${PROXY_UB}, n=${PROXY_N}, n-cpu-moe=${PROXY_N_CPU_MOE}\`"
  echo "- best PP t/s: \`$best_pp\`"
  echo "- best TG t/s: \`$best_tg\`"
  echo "- best PP row: \`$best_pp_row\`"
  echo "- best TG row: \`$best_tg_row\`"
  echo
  echo "## Long-Context Fit"
  if [[ "$WITH_FIT" == "1" ]]; then
    echo "- config: \`c=${FIT_CTX}, n-cpu-moe=${FIT_N_CPU_MOE}, n=${FIT_N}\`"
    echo "- output: \`/out/fit_sanity.out\`"
  else
    echo "- skipped"
  fi
  echo
  echo "## Command Status"
  echo '```'
  cat "$STATUS_FILE"
  echo '```'
} > /out/SUMMARY.md

log "Summary written to /out/SUMMARY.md"
BASH

# The generated script is invoked through /bin/bash, but keep it executable
# so it can also be run directly from the output directory.
chmod +x "${out_dir}/run_inside.sh"

# Assemble the docker invocation as an array so every value stays one word.
docker_cmd=(docker run --rm --gpus all)

# Forward each setting into the container under the same name.
for setting in \
  IK_BUILD_DIR GPU_DEVICE THREADS FA NGL \
  PROXY_CTX PROXY_B PROXY_UB PROXY_N PROXY_N_CPU_MOE \
  REG_CTX REG_NGL REG_DECODE_B REG_DECODE_UB REG_PREFILL_B REG_PREFILL_UB \
  WITH_FIT FIT_CTX FIT_N_CPU_MOE FIT_N; do
  docker_cmd+=(-e "${setting}=${!setting}")
done

# Mounts: the repo at /ik, the model read-only at /model.gguf, and the run's
# output directory (which holds run_inside.sh) at /out.
docker_cmd+=(
  -v "${IK_REPO}:/ik"
  -v "${MODEL_HOST}:/model.gguf:ro"
  -v "${out_dir}:/out"
  "${IMAGE}" /bin/bash /out/run_inside.sh
)

# Run the checks; under `set -e` a non-zero exit from docker stops here.
echo "Running regression in container: ${IMAGE}"
echo "Output directory: ${out_dir}"
"${docker_cmd[@]}"

# Tell the user where to find the results. The globs are printed literally
# as a hint; they are quoted and therefore not expanded.
echo
echo "Done. Summary:"
echo "  ${out_dir}/SUMMARY.md"
echo "Raw logs:"
echo "  ${out_dir}/*.out"
echo "  ${out_dir}/*.err"