mirror of
https://github.com/kvcache-ai/sglang.git
synced 2026-06-30 19:57:52 +00:00
90 lines
2.3 KiB
Bash
Executable File
90 lines
2.3 KiB
Bash
Executable File
#!/bin/bash
|
|
set -euo pipefail
|
|
|
|
# Detect GPU family from hostname (e.g., linux-mi35x-gpu-1-xxxxx-runner-zzzzz)
|
|
HOSTNAME_VALUE=$(hostname)
|
|
GPU_FAMILY=""
|
|
|
|
# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
|
|
if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
|
|
GPU_FAMILY="${BASH_REMATCH[1]}"
|
|
echo "Detected GPU family from hostname: ${GPU_FAMILY}"
|
|
else
|
|
echo "Warning: could not parse GPU family from '${HOSTNAME_VALUE}'"
|
|
fi
|
|
|
|
WORKDIR="/sglang-checkout/test/srt"
|
|
declare -A ENV_MAP=(
|
|
[SGLANG_IS_IN_CI_AMD]=1
|
|
[SGLANG_IS_IN_CI]=1
|
|
[SGLANG_USE_AITER]=1
|
|
)
|
|
|
|
# Conditionally add GPU_ARCHS only for mi35x
|
|
if [[ "${GPU_FAMILY}" == "mi35x" ]]; then
|
|
ENV_MAP[GPU_ARCHS]="gfx950"
|
|
fi
|
|
|
|
# Parse -w/--workdir and -e ENV=VAL
|
|
while [[ $# -gt 0 ]]; do
|
|
case "$1" in
|
|
-w|--workdir)
|
|
WORKDIR="$2"
|
|
shift 2
|
|
;;
|
|
-e)
|
|
IFS="=" read -r key val <<< "$2"
|
|
ENV_MAP["$key"]="$val"
|
|
shift 2
|
|
;;
|
|
--)
|
|
shift
|
|
break
|
|
;;
|
|
*)
|
|
break
|
|
;;
|
|
esac
|
|
done
|
|
|
|
# Build final ENV_ARGS
|
|
ENV_ARGS=()
|
|
for key in "${!ENV_MAP[@]}"; do
|
|
ENV_ARGS+=("-e" "$key=${ENV_MAP[$key]}")
|
|
done
|
|
|
|
# Run docker exec with retry logic for HuggingFace network/download issues
|
|
# When HF model downloads fail due to network timeouts or rate limits,
|
|
# retrying with HF_HUB_OFFLINE=1 uses cached models from previous downloads.
|
|
#
|
|
# First attempt: normal mode (allows HF downloads)
|
|
if docker exec \
|
|
-w "$WORKDIR" \
|
|
"${ENV_ARGS[@]}" \
|
|
ci_sglang "$@"; then
|
|
exit 0
|
|
else
|
|
FIRST_EXIT_CODE=$?
|
|
fi
|
|
|
|
echo "First attempt failed with exit code $FIRST_EXIT_CODE"
|
|
|
|
# Skip retry for test failures that won't be fixed by offline mode:
|
|
# - Exit 1: Test assertion failures (accuracy below threshold)
|
|
# - Exit 137 (128+9): Process killed by OOM
|
|
# - Exit 255: Test suite completed with test errors
|
|
# Only retry for other exit codes (e.g., network timeouts, HF download failures)
|
|
if [[ "$FIRST_EXIT_CODE" -eq 1 || "$FIRST_EXIT_CODE" -eq 137 || "$FIRST_EXIT_CODE" -eq 255 ]]; then
|
|
echo "Exit code $FIRST_EXIT_CODE indicates test failure (not network issue), not retrying"
|
|
exit $FIRST_EXIT_CODE
|
|
fi
|
|
|
|
echo "Retrying with HF_HUB_OFFLINE=1 (offline mode to use cached models)..."
|
|
|
|
# Second attempt: force HF offline mode to avoid network timeouts
|
|
docker exec \
|
|
-w "$WORKDIR" \
|
|
"${ENV_ARGS[@]}" \
|
|
-e HF_HUB_OFFLINE=1 \
|
|
ci_sglang "$@"
|