mirror of
https://github.com/kvcache-ai/sglang.git
synced 2026-07-01 04:08:10 +00:00
363 lines
11 KiB
Python
363 lines
11 KiB
Python
import argparse
import glob
import os
import sys
from typing import List, Tuple

import tabulate

from sglang.test.ci.ci_register import (
    CIRegistry,
    HWBackend,
    auto_partition,
    collect_tests,
)
from sglang.test.ci.ci_utils import run_unittest_files
|
|
|
|
# Maps the string accepted by the --hw command-line option to the
# corresponding HWBackend member.
HW_MAPPING = {
    "cpu": HWBackend.CPU,
    "cuda": HWBackend.CUDA,
    "amd": HWBackend.AMD,
    "npu": HWBackend.NPU,
}
|
|
|
|
# Per-commit test suites (run on every PR)
# Keyed by backend; each value lists the suite names accepted for that backend
# when --nightly is not given.
PER_COMMIT_SUITES = {
    HWBackend.CPU: ["stage-a-test-cpu"],
    HWBackend.AMD: [
        "stage-a-test-1-gpu-small-amd",
        "stage-b-test-1-gpu-small-amd",
        "stage-b-test-1-gpu-small-amd-nondeterministic",
        "stage-b-test-1-gpu-small-amd-mi35x",
        "stage-b-test-large-8-gpu-35x-disaggregation-amd",
        "stage-b-test-1-gpu-large-amd",
        "stage-b-test-2-gpu-large-amd",
        "stage-c-test-4-gpu-amd",
        "stage-c-test-large-8-gpu-amd",
        "stage-c-test-large-8-gpu-amd-mi35x",
    ],
    HWBackend.CUDA: [
        "stage-a-test-1-gpu-small",
        "stage-b-test-1-gpu-small",
        "stage-b-test-1-gpu-large",
        "stage-b-test-2-gpu-large",
        "stage-b-test-4-gpu-b200",
        "stage-b-kernel-unit-1-gpu-large",
        "stage-b-kernel-unit-1-gpu-b200",
        "stage-b-kernel-unit-8-gpu-h200",
        "stage-b-kernel-benchmark-1-gpu-large",
        "stage-c-test-4-gpu-h100",
        "stage-c-test-4-gpu-b200",
        "stage-c-test-4-gpu-gb200",
        "stage-c-test-8-gpu-h20",
        "stage-c-test-8-gpu-h200",
        "stage-c-test-8-gpu-b200",
        "stage-c-test-deepep-4-gpu-h100",
        "stage-c-test-deepep-8-gpu-h200",
    ],
    HWBackend.NPU: [
        # NOTE(review): this name is also listed under CUDA above and is the
        # only entry here without "npu" in it -- confirm it is intended.
        "stage-a-test-1-gpu-small",
        "stage-b-test-1-npu-a2",
        "stage-b-test-2-npu-a2",
        "stage-b-test-4-npu-a3",
        "stage-b-test-16-npu-a3",
    ],
}
|
|
|
|
# Nightly test suites (run nightly, organized by GPU configuration)
# Keyed by backend; each value lists the suite names accepted for that backend
# when --nightly is given.
NIGHTLY_SUITES = {
    HWBackend.CUDA: [
        "nightly-1-gpu",
        "nightly-2-gpu",
        "nightly-4-gpu",
        "nightly-4-gpu-b200",
        "nightly-8-gpu",
        "nightly-8-gpu-h200",
        "nightly-8-gpu-h20",
        "nightly-8-gpu-b200",
        "nightly-8-gpu-h200-basic",  # Basic tests for large models on H200
        "nightly-8-gpu-b200-basic",  # Basic tests for large models on B200
        "nightly-8-gpu-common",  # Common tests that run on both H200 and B200
        "nightly-kernel-1-gpu",
        "nightly-kernel-8-gpu-h200",
        # Eval and perf suites (2-gpu)
        "nightly-eval-text-2-gpu",
        "nightly-eval-vlm-2-gpu",
        "nightly-perf-text-2-gpu",
        "nightly-perf-vlm-2-gpu",
        # GB300 (4x B200 NVL4) nightly suite
        "nightly-4-gpu-gb300",
    ],
    HWBackend.AMD: [
        "nightly-amd",
        "nightly-amd-1-gpu",
        "nightly-amd-1-gpu-mi35x",
        "nightly-amd-1-gpu-zimage-turbo",
        "nightly-amd-4-gpu",
        "nightly-amd-8-gpu",
        "nightly-amd-vlm",
        # MI35x 8-GPU suite (different model configs)
        "nightly-amd-8-gpu-mi35x",
    ],
    HWBackend.CPU: [],
    HWBackend.NPU: [
        "nightly-1-npu-a3",
        "nightly-2-npu-a3",
        "nightly-4-npu-a3",
        "nightly-8-npu-a3",
        "nightly-16-npu-a3",
        "full-1-npu-a3",
        "full-2-npu-a3",
        "full-4-npu-a3",
        "full-8-npu-a3",
        "full-16-npu-a3",
    ],
}
|
|
|
|
|
|
# Suites that are listed in neither PER_COMMIT_SUITES nor NIGHTLY_SUITES.
# They count as valid in _valid_suites_by_backend(), but filter_tests() does
# not consult this table when deciding whether to print its "Unknown suite"
# warning.
OTHER_SUITES = {
    HWBackend.CPU: [
        "default",
    ],
    HWBackend.CUDA: [
        "stress",
        "weekly-8-gpu-h200",
    ],
}
|
|
|
|
|
|
# Backends whose tests are checked by validate_all_suites(); tests registered
# for any other backend are skipped by that check.
_SUITE_CHECKED_BACKENDS = {HWBackend.CUDA, HWBackend.CPU}
|
|
|
|
|
|
def _valid_suites_by_backend() -> dict:
|
|
"""Build a mapping from backend to its set of valid suite names."""
|
|
result = {}
|
|
for suite_dict in (PER_COMMIT_SUITES, NIGHTLY_SUITES, OTHER_SUITES):
|
|
for backend, suites in suite_dict.items():
|
|
if backend not in result:
|
|
result[backend] = set()
|
|
result[backend].update(suites)
|
|
return result
|
|
|
|
|
|
def validate_all_suites(all_tests: List[CIRegistry]) -> None:
    """Fail fast if any test is registered to a suite that doesn't belong to its backend.

    Only tests whose backend is in _SUITE_CHECKED_BACKENDS are checked; tests
    for any other backend are ignored here.

    Args:
        all_tests: Registered tests; each entry's ``backend``, ``suite`` and
            ``filename`` attributes are read.

    Raises:
        ValueError: If at least one checked test names a suite that
            _valid_suites_by_backend() does not list for its backend. The
            message contains one line per offending test.
    """
    valid_by_backend = _valid_suites_by_backend()
    errors = [
        f" {t.filename}: backend={t.backend.name}, suite='{t.suite}'"
        for t in all_tests
        if t.backend in _SUITE_CHECKED_BACKENDS
        # A backend with no entry at all has no valid suites.
        and t.suite not in valid_by_backend.get(t.backend, set())
    ]
    if errors:
        raise ValueError("Tests registered to invalid suites:\n" + "\n".join(errors))
|
|
|
|
|
|
def filter_tests(
    ci_tests: List[CIRegistry], hw: HWBackend, suite: str, nightly: bool = False
) -> Tuple[List[CIRegistry], List[CIRegistry]]:
    """Select the tests for one backend / suite / nightly combination.

    Args:
        ci_tests: All registered tests.
        hw: Backend to keep; compared against each test's ``backend``.
        suite: Suite name to keep; compared against each test's ``suite``.
        nightly: Keep only tests whose ``nightly`` flag equals this value.

    Returns:
        A pair ``(enabled_tests, skipped_tests)``. Of the matching tests,
        those whose ``disabled`` attribute is ``None`` are enabled and the
        rest are skipped.

    An unknown suite does not raise: a warning is printed and the (usually
    empty) selection is still returned.
    """
    matching = [
        t
        for t in ci_tests
        if t.backend == hw and t.suite == suite and t.nightly == nightly
    ]

    # NOTE(review): OTHER_SUITES is not consulted here, so a suite listed only
    # there triggers the warning even though validate_all_suites() accepts
    # it -- confirm this is intended.
    valid_suites = (
        NIGHTLY_SUITES.get(hw, []) if nightly else PER_COMMIT_SUITES.get(hw, [])
    )

    if suite not in valid_suites:
        print(
            f"Warning: Unknown suite {suite} for backend {hw.name}, nightly={nightly}"
        )

    enabled_tests = [t for t in matching if t.disabled is None]
    skipped_tests = [t for t in matching if t.disabled is not None]

    return enabled_tests, skipped_tests
|
|
|
|
|
|
def pretty_print_tests(
    args, ci_tests: List[CIRegistry], skipped_tests: List[CIRegistry]
):
    """Print a summary of the run configuration and the selected tests.

    The output is a one-row table (hardware, suite, nightly flag, partition),
    followed by the skipped tests with their reasons (if any), followed by
    either the enabled tests with their estimated times or a notice that no
    tests were found.

    Args:
        args: Parsed command-line arguments; ``hw``, ``suite``, ``nightly``,
            ``auto_partition_id`` and ``auto_partition_size`` are read.
        ci_tests: Tests that will be run.
        skipped_tests: Matching tests that are disabled.
    """
    hw = HW_MAPPING[args.hw]
    suite = args.suite
    nightly = args.nightly

    if args.auto_partition_size:
        # Show the partition 1-based for humans, plus the raw 0-based id.
        partition_info = (
            f"{args.auto_partition_id + 1}/{args.auto_partition_size} "
            f"(0-based id={args.auto_partition_id})"
        )
    else:
        partition_info = "full"

    headers = ["Hardware", "Suite", "Nightly", "Partition"]
    rows = [[hw.name, suite, str(nightly), partition_info]]

    # Collect the pieces and join once instead of growing a string with +=.
    parts = [tabulate.tabulate(rows, headers=headers, tablefmt="psql") + "\n"]

    if skipped_tests:
        parts.append(f"⚠️ Skipped {len(skipped_tests)} test(s):\n")
        for t in skipped_tests:
            # Fall back to a generic reason when ``disabled`` is falsy.
            reason = t.disabled or "disabled"
            parts.append(f" - {t.filename} (reason: {reason})\n")
        parts.append("\n")

    if not ci_tests:
        parts.append(
            f"No tests found for hw={hw.name}, suite={suite}, nightly={nightly}\n"
        )
        parts.append("This is expected during incremental migration. Skipping.\n")
    else:
        total_est_time = sum(t.est_time for t in ci_tests)
        parts.append(
            f"✅ Enabled {len(ci_tests)} test(s) (est total {total_est_time:.1f}s):\n"
        )
        parts.extend(f" - {t.filename} (est_time={t.est_time})\n" for t in ci_tests)

    print("".join(parts), flush=True)
|
|
|
|
|
|
def run_a_suite(args):
    """Discover, validate, filter and run the tests of one suite.

    Test files are discovered under ``registered/`` next to this script and
    under the ``tests``/``benchmark`` directories of the JIT kernel package.
    They are collected with ``collect_tests``, checked with
    ``validate_all_suites``, narrowed with ``filter_tests``, optionally
    partitioned, summarised on stdout and finally run.

    Args:
        args: Parsed command-line arguments as produced by ``main``.

    Returns:
        Whatever ``run_unittest_files`` returns; ``main`` passes it to
        ``sys.exit``.
    """
    hw = HW_MAPPING[args.hw]
    suite = args.suite
    nightly = args.nightly
    auto_partition_id = args.auto_partition_id
    auto_partition_size = args.auto_partition_size

    # Use absolute paths so the script works from any working directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(script_dir)

    # Registered tests under test/registered/
    # Compare basenames instead of a "/"-prefixed suffix so the exclusion does
    # not depend on the platform's path separator.
    excluded_basenames = {"conftest.py", "__init__.py"}
    files = [
        f
        for f in glob.glob(
            os.path.join(script_dir, "registered", "**", "*.py"), recursive=True
        )
        if os.path.basename(f) not in excluded_basenames
    ]

    # JIT kernel tests and benchmarks (live alongside kernel source)
    jit_kernel_dir = os.path.join(repo_root, "python", "sglang", "jit_kernel")
    files += glob.glob(
        os.path.join(jit_kernel_dir, "tests", "**", "test_*.py"), recursive=True
    )
    files += glob.glob(
        os.path.join(jit_kernel_dir, "benchmark", "**", "bench_*.py"), recursive=True
    )

    # Strict: all discovered files must have proper registration
    sanity_check = True

    all_tests = collect_tests(files, sanity_check=sanity_check)
    validate_all_suites(all_tests)
    ci_tests, skipped_tests = filter_tests(all_tests, hw, suite, nightly)

    if auto_partition_size:
        ci_tests = auto_partition(ci_tests, auto_partition_id, auto_partition_size)

    pretty_print_tests(args, ci_tests, skipped_tests)

    # Add extra timeout when retry is enabled
    timeout = args.timeout_per_file
    if args.enable_retry:
        timeout += args.retry_timeout_increase

    return run_unittest_files(
        ci_tests,
        timeout_per_file=timeout,
        continue_on_error=args.continue_on_error,
        enable_retry=args.enable_retry,
        max_attempts=args.max_attempts,
        retry_wait_seconds=args.retry_wait_seconds,
    )
|
|
|
|
|
|
def main():
    """Parse command-line arguments, run the requested suite and exit.

    ``--auto-partition-id`` and ``--auto-partition-size`` must be given
    together; the size must be positive and the id must lie in
    ``[0, size)``. Violations are reported through ``parser.error``. The
    process exits with the value returned by ``run_a_suite``.
    """
    parser = argparse.ArgumentParser(
        description="Run CI test suites from test/registered/"
    )
    parser.add_argument(
        "--hw",
        type=str,
        choices=HW_MAPPING.keys(),
        required=True,
        help="Hardware backend to run tests on.",
    )
    parser.add_argument("--suite", type=str, required=True, help="Test suite to run.")
    parser.add_argument(
        "--nightly",
        action="store_true",
        help="Run nightly tests instead of per-commit tests.",
    )
    parser.add_argument(
        "--timeout-per-file",
        type=int,
        default=1200,
        help="The time limit for running one file in seconds (default: 1200).",
    )
    parser.add_argument(
        "--continue-on-error",
        action="store_true",
        default=False,
        help="Continue running remaining tests even if one fails (default: False, useful for nightly tests).",
    )
    parser.add_argument(
        "--auto-partition-id",
        type=int,
        help="Use auto load balancing. The part id.",
    )
    parser.add_argument(
        "--auto-partition-size",
        type=int,
        help="Use auto load balancing. The number of parts.",
    )
    parser.add_argument(
        "--enable-retry",
        action="store_true",
        default=False,
        help="Enable smart retry for accuracy/performance assertion failures (not code errors)",
    )
    parser.add_argument(
        "--max-attempts",
        type=int,
        default=2,
        help="Maximum number of attempts per file including initial run (default: 2)",
    )
    parser.add_argument(
        "--retry-wait-seconds",
        type=int,
        default=60,
        help="Seconds to wait between retries (default: 60)",
    )
    parser.add_argument(
        "--retry-timeout-increase",
        type=int,
        default=600,
        help="Additional timeout in seconds when retry is enabled (default: 600)",
    )
    args = parser.parse_args()

    # Validate auto-partition arguments
    # The two flags are only meaningful as a pair: reject exactly-one-given.
    if (args.auto_partition_id is not None) != (args.auto_partition_size is not None):
        parser.error(
            "--auto-partition-id and --auto-partition-size must be specified together."
        )
    if args.auto_partition_size is not None:
        if args.auto_partition_size <= 0:
            parser.error("--auto-partition-size must be positive.")
        if not 0 <= args.auto_partition_id < args.auto_partition_size:
            parser.error(
                f"--auto-partition-id must be in range [0, {args.auto_partition_size}), "
                f"but got {args.auto_partition_id}"
            )

    exit_code = run_a_suite(args)
    sys.exit(exit_code)
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|