Files
sglang/test/run_suite.py
2026-04-08 22:07:24 +08:00

363 lines
11 KiB
Python

import argparse
import glob
import os
import sys
from typing import List, Tuple

import tabulate

from sglang.test.ci.ci_register import (
    CIRegistry,
    HWBackend,
    auto_partition,
    collect_tests,
)
from sglang.test.ci.ci_utils import run_unittest_files
# Maps the value of the --hw command-line option to the corresponding
# HWBackend member. The keys double as the allowed choices for --hw.
HW_MAPPING = {
    "cpu": HWBackend.CPU,
    "cuda": HWBackend.CUDA,
    "amd": HWBackend.AMD,
    "npu": HWBackend.NPU,
}
# Per-commit test suites (run on every PR)
# Keyed by backend; each value lists the suite names declared for that backend.
PER_COMMIT_SUITES = {
    HWBackend.CPU: ["stage-a-test-cpu"],
    HWBackend.AMD: [
        "stage-a-test-1-gpu-small-amd",
        "stage-b-test-1-gpu-small-amd",
        "stage-b-test-1-gpu-small-amd-nondeterministic",
        "stage-b-test-1-gpu-small-amd-mi35x",
        "stage-b-test-large-8-gpu-35x-disaggregation-amd",
        "stage-b-test-1-gpu-large-amd",
        "stage-b-test-2-gpu-large-amd",
        "stage-c-test-4-gpu-amd",
        "stage-c-test-large-8-gpu-amd",
        "stage-c-test-large-8-gpu-amd-mi35x",
    ],
    HWBackend.CUDA: [
        "stage-a-test-1-gpu-small",
        "stage-b-test-1-gpu-small",
        "stage-b-test-1-gpu-large",
        "stage-b-test-2-gpu-large",
        "stage-b-test-4-gpu-b200",
        "stage-b-kernel-unit-1-gpu-large",
        "stage-b-kernel-unit-1-gpu-b200",
        "stage-b-kernel-unit-8-gpu-h200",
        "stage-b-kernel-benchmark-1-gpu-large",
        "stage-c-test-4-gpu-h100",
        "stage-c-test-4-gpu-b200",
        "stage-c-test-4-gpu-gb200",
        "stage-c-test-8-gpu-h20",
        "stage-c-test-8-gpu-h200",
        "stage-c-test-8-gpu-b200",
        "stage-c-test-deepep-4-gpu-h100",
        "stage-c-test-deepep-8-gpu-h200",
    ],
    # NOTE(review): the first NPU entry, "stage-a-test-1-gpu-small", is also
    # the first CUDA suite name, and NPU is not in _SUITE_CHECKED_BACKENDS so
    # registrations against it are not validated -- confirm this is intentional.
    HWBackend.NPU: [
        "stage-a-test-1-gpu-small",
        "stage-b-test-1-npu-a2",
        "stage-b-test-2-npu-a2",
        "stage-b-test-4-npu-a3",
        "stage-b-test-16-npu-a3",
    ],
}
# Nightly test suites (run nightly, organized by GPU configuration)
# Keyed by backend; each value lists the suite names declared for that backend.
# A backend with no nightly suites maps to an empty list.
NIGHTLY_SUITES = {
    HWBackend.CUDA: [
        "nightly-1-gpu",
        "nightly-2-gpu",
        "nightly-4-gpu",
        "nightly-4-gpu-b200",
        "nightly-8-gpu",
        "nightly-8-gpu-h200",
        "nightly-8-gpu-h20",
        "nightly-8-gpu-b200",
        "nightly-8-gpu-h200-basic",  # Basic tests for large models on H200
        "nightly-8-gpu-b200-basic",  # Basic tests for large models on B200
        "nightly-8-gpu-common",  # Common tests that run on both H200 and B200
        "nightly-kernel-1-gpu",
        "nightly-kernel-8-gpu-h200",
        # Eval and perf suites (2-gpu)
        "nightly-eval-text-2-gpu",
        "nightly-eval-vlm-2-gpu",
        "nightly-perf-text-2-gpu",
        "nightly-perf-vlm-2-gpu",
        # GB300 (4x B200 NVL4) nightly suite
        "nightly-4-gpu-gb300",
    ],
    HWBackend.AMD: [
        "nightly-amd",
        "nightly-amd-1-gpu",
        "nightly-amd-1-gpu-mi35x",
        "nightly-amd-1-gpu-zimage-turbo",
        "nightly-amd-4-gpu",
        "nightly-amd-8-gpu",
        "nightly-amd-vlm",
        # MI35x 8-GPU suite (different model configs)
        "nightly-amd-8-gpu-mi35x",
    ],
    HWBackend.CPU: [],
    HWBackend.NPU: [
        "nightly-1-npu-a3",
        "nightly-2-npu-a3",
        "nightly-4-npu-a3",
        "nightly-8-npu-a3",
        "nightly-16-npu-a3",
        "full-1-npu-a3",
        "full-2-npu-a3",
        "full-4-npu-a3",
        "full-8-npu-a3",
        "full-16-npu-a3",
    ],
}
# Additional suites that appear in neither the per-commit nor the nightly
# table. _valid_suites_by_backend merges them into the set of valid suite
# names for their backend. Only CPU and CUDA have entries here.
OTHER_SUITES = {
    HWBackend.CPU: [
        "default",
    ],
    HWBackend.CUDA: [
        "stress",
        "weekly-8-gpu-h200",
    ],
}
# Backends whose test registrations are checked by validate_all_suites;
# tests registered for any other backend are skipped by that check.
_SUITE_CHECKED_BACKENDS = {HWBackend.CUDA, HWBackend.CPU}
def _valid_suites_by_backend() -> dict:
    """Return a dict mapping each backend to the set of suite names declared for it.

    The per-commit, nightly and other suite tables are merged, so a suite
    listed for a backend in any one of them counts as valid for that backend.
    """
    merged = {}
    for table in (PER_COMMIT_SUITES, NIGHTLY_SUITES, OTHER_SUITES):
        for hw, names in table.items():
            # Create the backend's set on first sight, then fold the names in.
            merged.setdefault(hw, set()).update(names)
    return merged
def validate_all_suites(all_tests: List[CIRegistry]):
    """Reject registrations whose suite is not declared for the test's backend.

    Only tests whose backend is in ``_SUITE_CHECKED_BACKENDS`` are examined;
    tests for every other backend are ignored.

    Raises:
        ValueError: If at least one examined test names a suite that is not
            declared for its backend. The message lists every offending file.
    """
    allowed = _valid_suites_by_backend()
    bad_entries = [
        f" {t.filename}: backend={t.backend.name}, suite='{t.suite}'"
        for t in all_tests
        if t.backend in _SUITE_CHECKED_BACKENDS
        and t.suite not in allowed.get(t.backend, set())
    ]
    if bad_entries:
        raise ValueError(
            "Tests registered to invalid suites:\n" + "\n".join(bad_entries)
        )
def filter_tests(
    ci_tests: List[CIRegistry], hw: HWBackend, suite: str, nightly: bool = False
) -> Tuple[List[CIRegistry], List[CIRegistry]]:
    """Select the tests for one backend/suite and split them by enabled state.

    Args:
        ci_tests: All collected test registrations.
        hw: Backend to keep.
        suite: Suite name to keep.
        nightly: Keep only tests whose ``nightly`` flag equals this value.

    Returns:
        A pair ``(enabled_tests, skipped_tests)``: the matching tests whose
        ``disabled`` attribute is ``None``, and the matching tests where it is
        set.

    A warning is printed (the run is not aborted) when ``suite`` is not
    declared for ``hw`` in this file.
    """
    selected = [
        t
        for t in ci_tests
        if t.backend == hw and t.suite == suite and t.nightly == nightly
    ]

    known_suites = set(
        NIGHTLY_SUITES.get(hw, []) if nightly else PER_COMMIT_SUITES.get(hw, [])
    )
    # Suites listed in OTHER_SUITES are declared in this file as well and are
    # accepted by validate_all_suites, so they must not be reported as unknown.
    known_suites.update(OTHER_SUITES.get(hw, []))

    if suite not in known_suites:
        print(
            f"Warning: Unknown suite {suite} for backend {hw.name}, nightly={nightly}"
        )

    enabled_tests = [t for t in selected if t.disabled is None]
    skipped_tests = [t for t in selected if t.disabled is not None]
    return enabled_tests, skipped_tests
def pretty_print_tests(
    args, ci_tests: List[CIRegistry], skipped_tests: List[CIRegistry]
):
    """Print a summary of the selected run in a single flushed ``print`` call.

    The summary consists of a one-row table (hardware, suite, nightly flag,
    partition), the skipped tests with their reasons if there are any, and
    then either the enabled tests with their estimated times or a notice that
    no tests matched.

    Args:
        args: Parsed CLI namespace; reads ``hw``, ``suite``, ``nightly``,
            ``auto_partition_id`` and ``auto_partition_size``.
        ci_tests: Tests that will be run.
        skipped_tests: Matching tests that are disabled.
    """
    hw = HW_MAPPING[args.hw]
    suite = args.suite
    nightly = args.nightly

    # Without partitioning the whole suite is run.
    partition_info = "full"
    if args.auto_partition_size:
        partition_info = (
            f"{args.auto_partition_id + 1}/{args.auto_partition_size} "
            f"(0-based id={args.auto_partition_id})"
        )

    table = tabulate.tabulate(
        [[hw.name, suite, str(nightly), partition_info]],
        headers=["Hardware", "Suite", "Nightly", "Partition"],
        tablefmt="psql",
    )
    chunks = [table + "\n"]

    if skipped_tests:
        chunks.append(f"⚠️ Skipped {len(skipped_tests)} test(s):\n")
        chunks.extend(
            f" - {t.filename} (reason: {t.disabled or 'disabled'})\n"
            for t in skipped_tests
        )
        chunks.append("\n")

    if not ci_tests:
        chunks.append(
            f"No tests found for hw={hw.name}, suite={suite}, nightly={nightly}\n"
        )
        chunks.append("This is expected during incremental migration. Skipping.\n")
    else:
        total_est_time = sum(t.est_time for t in ci_tests)
        chunks.append(
            f"✅ Enabled {len(ci_tests)} test(s) (est total {total_est_time:.1f}s):\n"
        )
        chunks.extend(f" - {t.filename} (est_time={t.est_time})\n" for t in ci_tests)

    print("".join(chunks), flush=True)
def run_a_suite(args):
    """Discover, validate, select and run the tests of one suite.

    Test files are gathered from three places:
      * every ``*.py`` under ``registered/`` next to this script, except
        ``conftest.py`` and ``__init__.py``;
      * ``test_*.py`` under ``python/sglang/jit_kernel/tests``;
      * ``bench_*.py`` under ``python/sglang/jit_kernel/benchmark``;
    the latter two resolved relative to the parent directory of this script's
    directory.

    Args:
        args: Parsed CLI namespace built by ``main``.

    Returns:
        The value returned by ``run_unittest_files``; ``main`` passes it to
        ``sys.exit``.

    Raises:
        ValueError: Propagated from ``validate_all_suites`` when a test is
            registered to a suite not declared for its backend.
    """
    hw = HW_MAPPING[args.hw]
    suite = args.suite
    nightly = args.nightly
    auto_partition_id = args.auto_partition_id
    auto_partition_size = args.auto_partition_size

    # Use absolute paths so the script works from any working directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(script_dir)

    # Registered tests under test/registered/
    # Compare base names rather than "/name" suffixes so the exclusion does
    # not depend on the platform's path separator.
    excluded_names = {"conftest.py", "__init__.py"}
    files = [
        f
        for f in glob.glob(
            os.path.join(script_dir, "registered", "**", "*.py"), recursive=True
        )
        if os.path.basename(f) not in excluded_names
    ]

    # JIT kernel tests and benchmarks (live alongside kernel source)
    jit_kernel_dir = os.path.join(repo_root, "python", "sglang", "jit_kernel")
    files += glob.glob(
        os.path.join(jit_kernel_dir, "tests", "**", "test_*.py"), recursive=True
    )
    files += glob.glob(
        os.path.join(jit_kernel_dir, "benchmark", "**", "bench_*.py"), recursive=True
    )

    # Strict: all discovered files must have proper registration
    all_tests = collect_tests(files, sanity_check=True)
    validate_all_suites(all_tests)

    ci_tests, skipped_tests = filter_tests(all_tests, hw, suite, nightly)

    if auto_partition_size:
        ci_tests = auto_partition(ci_tests, auto_partition_id, auto_partition_size)

    pretty_print_tests(args, ci_tests, skipped_tests)

    # Add extra timeout when retry is enabled
    timeout = args.timeout_per_file
    if args.enable_retry:
        timeout += args.retry_timeout_increase

    return run_unittest_files(
        ci_tests,
        timeout_per_file=timeout,
        continue_on_error=args.continue_on_error,
        enable_retry=args.enable_retry,
        max_attempts=args.max_attempts,
        retry_wait_seconds=args.retry_wait_seconds,
    )
def _build_arg_parser():
    """Create the argument parser holding every option of this script."""
    ap = argparse.ArgumentParser(
        description="Run CI test suites from test/registered/"
    )
    ap.add_argument(
        "--hw",
        type=str,
        choices=HW_MAPPING.keys(),
        required=True,
        help="Hardware backend to run tests on.",
    )
    ap.add_argument("--suite", type=str, required=True, help="Test suite to run.")
    ap.add_argument(
        "--nightly",
        action="store_true",
        help="Run nightly tests instead of per-commit tests.",
    )
    ap.add_argument(
        "--timeout-per-file",
        type=int,
        default=1200,
        help="The time limit for running one file in seconds (default: 1200).",
    )
    ap.add_argument(
        "--continue-on-error",
        action="store_true",
        default=False,
        help="Continue running remaining tests even if one fails (default: False, useful for nightly tests).",
    )
    ap.add_argument(
        "--auto-partition-id",
        type=int,
        help="Use auto load balancing. The part id.",
    )
    ap.add_argument(
        "--auto-partition-size",
        type=int,
        help="Use auto load balancing. The number of parts.",
    )
    ap.add_argument(
        "--enable-retry",
        action="store_true",
        default=False,
        help="Enable smart retry for accuracy/performance assertion failures (not code errors)",
    )
    ap.add_argument(
        "--max-attempts",
        type=int,
        default=2,
        help="Maximum number of attempts per file including initial run (default: 2)",
    )
    ap.add_argument(
        "--retry-wait-seconds",
        type=int,
        default=60,
        help="Seconds to wait between retries (default: 60)",
    )
    ap.add_argument(
        "--retry-timeout-increase",
        type=int,
        default=600,
        help="Additional timeout in seconds when retry is enabled (default: 600)",
    )
    return ap


def main():
    """Parse the command line, validate the partition options and run the suite.

    The process exits through ``sys.exit`` with the value returned by
    ``run_a_suite``. Inconsistent or out-of-range partition options are
    reported through ``parser.error``.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    part_id = args.auto_partition_id
    part_size = args.auto_partition_size
    id_given = part_id is not None
    size_given = part_size is not None

    # Validate auto-partition arguments: both or neither must be supplied.
    if id_given != size_given:
        parser.error(
            "--auto-partition-id and --auto-partition-size must be specified together."
        )
    if size_given:
        if part_size < 1:
            parser.error("--auto-partition-size must be positive.")
        if part_id < 0 or part_id >= part_size:
            parser.error(
                f"--auto-partition-id must be in range [0, {args.auto_partition_size}), "
                f"but got {args.auto_partition_id}"
            )

    sys.exit(run_a_suite(args))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()