Files
ik_llama.cpp/scripts/qwen3next-fused-regression.sh

229 lines
8.4 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail
BIN="${BIN:-./build/bin/llama-perplexity}"
MODEL="${MODEL:-}"
INPUT_FILE="${INPUT_FILE:-/tmp/qwen3next_fused_regression_input.txt}"
OUT_FILE="${OUT_FILE:-/tmp/qwen3next_fused_regression_$(date +%Y%m%d_%H%M%S).md}"
CUDA_DEVICE="${CUDA_DEVICE:-0}"
THREADS="${THREADS:-8}"
CTX="${CTX:-2048}"
FA="${FA:-on}"
NGL="${NGL:-47}"
N_CPU_MOE="${N_CPU_MOE:-40}"
CHUNKS="${CHUNKS:-1}"
DECODE_B="${DECODE_B:-1}"
DECODE_UB="${DECODE_UB:-1}"
PREFILL_B="${PREFILL_B:-2048}"
PREFILL_UB="${PREFILL_UB:-512}"
# Mandatory safety checks:
# 1) mode=1 decode should stay aligned with mode=0 decode.
# 2) mode=1 prefill should stay aligned with mode=0 prefill.
MAX_DECODE_DELTA_01="${MAX_DECODE_DELTA_01:-0.10}"
MAX_PREFILL_DELTA_01="${MAX_PREFILL_DELTA_01:-0.10}"
# 3) mode=0 absolute perplexity should stay in a sane range.
MAX_MODE0_DECODE_PPL="${MAX_MODE0_DECODE_PPL:-10.0}"
MAX_MODE0_PREFILL_PPL="${MAX_MODE0_PREFILL_PPL:-10.0}"
usage() {
cat <<'USAGE'
Usage:
scripts/qwen3next-fused-regression.sh --model /path/to/model.gguf [options]
Options:
--model PATH GGUF model path (required)
--bin PATH llama-perplexity binary (default: ./build/bin/llama-perplexity)
--input PATH input text file; auto-generated if missing
--out PATH markdown output file
--cuda-device ID CUDA_VISIBLE_DEVICES value (default: 0)
--threads N -t value (default: 8)
--ctx N -c value (default: 2048)
--fa on|off -fa value (default: on)
--ngl N -ngl value (default: 47)
--n-cpu-moe N --n-cpu-moe value (default: 40)
--chunks N --chunks value (default: 1)
--decode-b N decode batch size (default: 1)
--decode-ub N decode ubatch size (default: 1)
--prefill-b N prefill batch size (default: 2048)
--prefill-ub N prefill ubatch size (default: 512)
--max-decode-delta-01 X fail threshold for |PPL(mode1)-PPL(mode0)| in decode (default: 0.10)
--max-prefill-delta-01 X fail threshold for |PPL(mode1)-PPL(mode0)| in prefill (default: 0.10)
--max-mode0-decode-ppl X fail threshold for PPL(mode0) in decode (default: 10.0)
--max-mode0-prefill-ppl X fail threshold for PPL(mode0) in prefill (default: 10.0)
-h, --help show this help
USAGE
}
while [[ $# -gt 0 ]]; do
case "$1" in
--model) MODEL="$2"; shift 2 ;;
--bin) BIN="$2"; shift 2 ;;
--input) INPUT_FILE="$2"; shift 2 ;;
--out) OUT_FILE="$2"; shift 2 ;;
--cuda-device) CUDA_DEVICE="$2"; shift 2 ;;
--threads) THREADS="$2"; shift 2 ;;
--ctx) CTX="$2"; shift 2 ;;
--fa) FA="$2"; shift 2 ;;
--ngl) NGL="$2"; shift 2 ;;
--n-cpu-moe) N_CPU_MOE="$2"; shift 2 ;;
--chunks) CHUNKS="$2"; shift 2 ;;
--decode-b) DECODE_B="$2"; shift 2 ;;
--decode-ub) DECODE_UB="$2"; shift 2 ;;
--prefill-b) PREFILL_B="$2"; shift 2 ;;
--prefill-ub) PREFILL_UB="$2"; shift 2 ;;
--max-decode-delta-01) MAX_DECODE_DELTA_01="$2"; shift 2 ;;
--max-prefill-delta-01) MAX_PREFILL_DELTA_01="$2"; shift 2 ;;
--max-mode0-decode-ppl) MAX_MODE0_DECODE_PPL="$2"; shift 2 ;;
--max-mode0-prefill-ppl) MAX_MODE0_PREFILL_PPL="$2"; shift 2 ;;
-h|--help) usage; exit 0 ;;
*)
echo "unknown option: $1" >&2
usage
exit 2
;;
esac
done
if [[ -z "$MODEL" ]]; then
echo "--model is required" >&2
exit 2
fi
if [[ ! -x "$BIN" ]]; then
echo "binary not executable: $BIN" >&2
exit 1
fi
if [[ ! -f "$MODEL" ]]; then
echo "model not found: $MODEL" >&2
exit 1
fi
if [[ ! -f "$INPUT_FILE" ]]; then
cat > "$INPUT_FILE" <<'TXT'
Regression text for Qwen3Next fused DeltaNet checks.
This text is deterministic and intentionally repetitive.
TXT
# Keep this comfortably above 2*ctx tokenization requirements used by llama-perplexity.
for _ in $(seq 1 900); do
echo "the model should keep stable perplexity under consistent settings" >> "$INPUT_FILE"
done
fi
log_dir="${OUT_FILE}.logs"
mkdir -p "$log_dir"
extract_ppl() {
local file="$1"
local line val
line="$(grep -E 'Final estimate:' "$file" | tail -n1 || true)"
if [[ -z "$line" ]]; then
echo "NA"
return
fi
val="$(echo "$line" | sed -nE 's/.*= ([0-9]+\.[0-9]+).*/\1/p')"
if [[ -z "$val" ]]; then
val="$(echo "$line" | grep -Eo '[0-9]+\.[0-9]+' | head -n1 || true)"
fi
if [[ -z "$val" ]]; then
echo "NA"
else
echo "$val"
fi
}
abs_delta() {
awk -v a="$1" -v b="$2" 'BEGIN { d = a - b; if (d < 0) d = -d; printf "%.6f", d }'
}
run_ppl() {
local mode="$1"
local b="$2"
local ub="$3"
local label="$4"
local log="${log_dir}/${label}_m${mode}.log"
echo "running ${label} mode=${mode} (b=${b} ub=${ub})" >&2
CUDA_VISIBLE_DEVICES="$CUDA_DEVICE" \
LLAMA_QWEN3NEXT_FUSED_DELTA="$mode" \
"$BIN" -m "$MODEL" -f "$INPUT_FILE" \
-c "$CTX" -b "$b" -ub "$ub" -t "$THREADS" \
-fa "$FA" -ngl "$NGL" --n-cpu-moe "$N_CPU_MOE" \
--chunks "$CHUNKS" --no-warmup >"$log" 2>&1
extract_ppl "$log"
}
decode_0="$(run_ppl 0 "$DECODE_B" "$DECODE_UB" decode)"
decode_1="$(run_ppl 1 "$DECODE_B" "$DECODE_UB" decode)"
decode_2="$(run_ppl 2 "$DECODE_B" "$DECODE_UB" decode)"
prefill_0="$(run_ppl 0 "$PREFILL_B" "$PREFILL_UB" prefill)"
prefill_1="$(run_ppl 1 "$PREFILL_B" "$PREFILL_UB" prefill)"
prefill_2="$(run_ppl 2 "$PREFILL_B" "$PREFILL_UB" prefill)"
if [[ "$decode_0" == "NA" || "$decode_1" == "NA" || "$decode_2" == "NA" || \
"$prefill_0" == "NA" || "$prefill_1" == "NA" || "$prefill_2" == "NA" ]]; then
echo "failed to extract one or more perplexity values; see logs in ${log_dir}" >&2
exit 1
fi
decode_delta_01="$(abs_delta "$decode_0" "$decode_1")"
decode_delta_02="$(abs_delta "$decode_0" "$decode_2")"
prefill_delta_01="$(abs_delta "$prefill_0" "$prefill_1")"
prefill_delta_02="$(abs_delta "$prefill_0" "$prefill_2")"
decode_ok="$(awk -v d="$decode_delta_01" -v t="$MAX_DECODE_DELTA_01" 'BEGIN { print(d <= t ? "yes" : "no") }')"
prefill_ok="$(awk -v d="$prefill_delta_01" -v t="$MAX_PREFILL_DELTA_01" 'BEGIN { print(d <= t ? "yes" : "no") }')"
mode0_decode_ok="$(awk -v p="$decode_0" -v t="$MAX_MODE0_DECODE_PPL" 'BEGIN { print(p <= t ? "yes" : "no") }')"
mode0_prefill_ok="$(awk -v p="$prefill_0" -v t="$MAX_MODE0_PREFILL_PPL" 'BEGIN { print(p <= t ? "yes" : "no") }')"
{
echo "# Qwen3Next Fused DeltaNet Regression Report"
echo
echo "- date: \`$(date -u +%Y-%m-%dT%H:%M:%SZ)\`"
echo "- bin: \`$BIN\`"
echo "- model: \`$MODEL\`"
echo "- input: \`$INPUT_FILE\`"
echo "- cuda_device: \`$CUDA_DEVICE\`"
echo "- ctx: \`$CTX\`"
echo "- fa: \`$FA\`"
echo "- ngl: \`$NGL\`"
echo "- n_cpu_moe: \`$N_CPU_MOE\`"
echo "- chunks: \`$CHUNKS\`"
echo
echo "## Perplexity"
echo
echo "| Path | mode=0 | mode=1 | mode=2 | |delta|(1-0) | |delta|(2-0) |"
echo "|---|---:|---:|---:|---:|---:|"
echo "| decode (b=${DECODE_B},ub=${DECODE_UB}) | ${decode_0} | ${decode_1} | ${decode_2} | ${decode_delta_01} | ${decode_delta_02} |"
echo "| prefill (b=${PREFILL_B},ub=${PREFILL_UB}) | ${prefill_0} | ${prefill_1} | ${prefill_2} | ${prefill_delta_01} | ${prefill_delta_02} |"
echo
echo "## Safety Checks"
echo
echo "- decode safety (mode1 ~= mode0): \`${decode_ok}\` (threshold \`${MAX_DECODE_DELTA_01}\`)"
echo "- prefill safety (mode1 ~= mode0): \`${prefill_ok}\` (threshold \`${MAX_PREFILL_DELTA_01}\`)"
echo "- mode0 decode sanity: \`${mode0_decode_ok}\` (PPL \`${decode_0}\`, max \`${MAX_MODE0_DECODE_PPL}\`)"
echo "- mode0 prefill sanity: \`${mode0_prefill_ok}\` (PPL \`${prefill_0}\`, max \`${MAX_MODE0_PREFILL_PPL}\`)"
echo
echo "## Logs"
echo
echo "- raw logs dir: \`${log_dir}\`"
echo "- decode mode0: \`${log_dir}/decode_m0.log\`"
echo "- decode mode1: \`${log_dir}/decode_m1.log\`"
echo "- decode mode2: \`${log_dir}/decode_m2.log\`"
echo "- prefill mode0: \`${log_dir}/prefill_m0.log\`"
echo "- prefill mode1: \`${log_dir}/prefill_m1.log\`"
echo "- prefill mode2: \`${log_dir}/prefill_m2.log\`"
} > "$OUT_FILE"
echo "wrote report: $OUT_FILE"
if [[ "$decode_ok" != "yes" || "$prefill_ok" != "yes" || "$mode0_decode_ok" != "yes" || "$mode0_prefill_ok" != "yes" ]]; then
echo "regression check failed; see report: $OUT_FILE" >&2
exit 1
fi
echo "regression check passed"