mirror of
https://github.com/kvcache-ai/sglang.git
synced 2026-06-30 19:57:52 +00:00
1312 lines
44 KiB
Python
Executable File
1312 lines
44 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
SGLang CI Auto Bisect
|
|
|
|
Fetches recent Nvidia scheduled PR Test runs, identifies consistently failing
|
|
tests, and calls Claude to classify each as regression/flaky/hardware/environment.
|
|
|
|
Self-contained: does its own lightweight GitHub API analysis instead of running
|
|
the full ci_failures_analysis.py, keeping API usage to ~30-40 calls.
|
|
|
|
Usage:
|
|
python ci_auto_bisect.py \
|
|
--github-token $GITHUB_TOKEN \
|
|
--anthropic-api-key $ANTHROPIC_API_KEY \
|
|
--output bisect_results.json \
|
|
--max-failures 10
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import datetime
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
import anthropic
|
|
import requests
|
|
|
|
# GitHub repository ("owner/name") whose workflow runs are analyzed.
REPO = "sgl-project/sglang"
# Base URL for all GitHub REST API requests.
GITHUB_API = "https://api.github.com"

# Claude model to use
CLAUDE_MODEL = "claude-sonnet-4-6"

# Path to the bisect skill definition (relative to repo root)
BISECT_SKILL_PATH = ".claude/skills/sglang-bisect-ci-regression/SKILL.md"

# Jobs to exclude from analysis (administrative/setup, not actual tests).
# Entries are matched as name prefixes.
EXCLUDED_JOBS = [
    "check-changes",
    "pr-test-finish",
    "call-gate",
    "pr-gate",
    "check-all-jobs",
]

# Number of recent scheduled runs to analyze
SCHEDULED_RUN_LIMIT = 6

# Compiled regex for stripping ANSI escape codes from CI logs
_ANSI_ESCAPE_RE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
class FailureTarget:
    """One failing test (or whole job) selected for bisection analysis."""

    # CI job in which the failure was observed.
    job_name: str
    # Failing test file name, or "<job-level>" when no specific test was found.
    test_file: str
    # Hardware label for the run (the analysis in this file sets "Nvidia").
    hardware: str
    # Number of consecutive failing runs of the job.
    current_streak: int
    # Commit SHAs (abbreviated) of the first and most recent failing runs.
    first_failure_sha: str
    last_failure_sha: str
    # Creation timestamps of those runs, as reported by the GitHub API.
    first_failure_date: str
    last_failure_date: str
    # Browser URLs of the failing jobs.
    first_failure_job_url: str
    last_failure_job_url: str
    # GitHub job ids of the failing jobs (None when unknown).
    first_failure_job_id: Optional[int]
    last_failure_job_id: Optional[int]
    # Per-run status markers, oldest first.
    recent_run_statuses: List[str] = field(default_factory=list)
    # Test-level streak and failure count.
    test_streak: int = 0
    test_total_failures: int = 0
|
|
|
|
|
|
@dataclass
class BisectionContext:
    """Everything collected about one failure before asking Claude about it."""

    # The failure being analyzed.
    target: FailureTarget
    # One-line `git log` entries for the commit range of the failure.
    commits_between: List[str] = field(default_factory=list)
    # Excerpt of the job log around the error.
    error_signature: str = ""
    # Per-runner failure data; left empty by the context gatherer in this file.
    runner_correlation: Dict = field(default_factory=dict)
    # One-line `git log` entries for commits touching related paths.
    candidate_commits: List[str] = field(default_factory=list)
|
|
|
|
|
|
@dataclass
class BisectionResult:
    """Outcome of Claude's analysis for one failure target."""

    # The failure that was analyzed.
    target: FailureTarget
    # Root-cause class; "unknown" until an analysis is recorded.
    classification: str = "unknown"
    # Confidence label for the classification.
    confidence: str = "low"
    # Commit and PR suspected of causing the failure, if any.
    suspected_commit: Optional[str] = None
    suspected_pr: Optional[int] = None
    # Short explanation and suggested next step.
    evidence_summary: str = ""
    recommended_fix: str = ""
    # Unparsed model output and the token count of the request.
    raw_response: str = ""
    tokens_used: int = 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Focused GitHub API analysis (Nvidia scheduled only)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _gh_headers(token: str) -> Dict[str, str]:
    """Build the HTTP headers sent with every GitHub API request.

    Args:
        token: GitHub token placed in the ``Authorization`` header.

    Returns:
        A dict with the ``Authorization`` and ``Accept`` headers.
    """
    headers: Dict[str, str] = {}
    headers["Authorization"] = f"token {token}"
    headers["Accept"] = "application/vnd.github.v3+json"
    return headers
|
|
|
|
|
|
def _gh_get(url: str, token: str, params: Optional[dict] = None) -> Optional[dict]:
    """Issue one GitHub API GET request and return the decoded JSON body.

    Args:
        url: Full API URL to request.
        token: GitHub token used for authentication.
        params: Optional query-string parameters.

    Returns:
        The parsed JSON response.

    Raises:
        RuntimeError: On HTTP 401/403 (auth/permission problems).
        requests.RequestException: On any other request failure; the error is
            logged before being re-raised.
    """
    try:
        response = requests.get(
            url,
            headers=_gh_headers(token),
            params=params,
            timeout=30,
        )
        auth_rejected = response.status_code in (401, 403)
        if auth_rejected:
            raise RuntimeError(
                f"GitHub API auth/permission error ({response.status_code}) "
                f"for {url}: {response.text[:200]}"
            )
        response.raise_for_status()
        return response.json()
    except requests.RequestException as exc:
        print(f" ERROR: GitHub API request failed for {url}: {exc}")
        raise
|
|
|
|
|
|
def _gh_get_all_pages(
    url: str, token: str, params: Optional[dict] = None
) -> List[dict]:
    """Fetch all pages for a paginated GitHub API endpoint.

    Args:
        url: URL of the first page.
        token: GitHub token used for authentication.
        params: Query parameters for the first request only; later pages use
            the fully-formed URL taken from the ``Link`` response header.

    Returns:
        The concatenated items from every page. For object payloads the items
        are read from the ``jobs`` key, falling back to ``workflow_runs``; a
        payload that is itself a list is used as-is.

    Raises:
        RuntimeError: On HTTP 401/403 (auth/permission problems).
        requests.HTTPError: On any other non-success status.
    """
    all_items: List[dict] = []
    current_params = dict(params or {})
    current_url: Optional[str] = url

    while current_url:
        resp = requests.get(
            current_url,
            headers=_gh_headers(token),
            params=current_params,
            timeout=30,
        )
        if resp.status_code in (401, 403):
            raise RuntimeError(
                f"GitHub API auth/permission error ({resp.status_code}): "
                f"{resp.text[:200]}"
            )
        resp.raise_for_status()
        data = resp.json()
        if isinstance(data, list):
            items = data
        else:
            items = data.get("jobs", data.get("workflow_runs", []))
        all_items.extend(items)

        # Follow pagination. `Response.links` parses the Link header for us,
        # so this does not depend on the exact separator/whitespace GitHub uses.
        next_link = resp.links.get("next")
        current_url = next_link.get("url") if next_link else None
        current_params = {}  # params are in the URL for subsequent pages

    return all_items
|
|
|
|
|
|
def fetch_nvidia_scheduled_runs(token: str) -> List[dict]:
    """Fetch the most recent schedule-triggered runs of the pr-test workflow.

    Makes a single API call requesting ``SCHEDULED_RUN_LIMIT`` runs.

    Args:
        token: GitHub token used for authentication.

    Returns:
        The ``workflow_runs`` list from the response, or an empty list when
        the response body is empty.
    """
    print(f"Fetching {SCHEDULED_RUN_LIMIT} recent scheduled PR Test (Nvidia) runs...")
    endpoint = f"{GITHUB_API}/repos/{REPO}/actions/workflows/pr-test.yml/runs"
    query = {"event": "schedule", "per_page": SCHEDULED_RUN_LIMIT}
    payload = _gh_get(endpoint, token, query)
    if payload:
        workflow_runs = payload.get("workflow_runs", [])
        print(f" Found {len(workflow_runs)} runs")
        return workflow_runs
    return []
|
|
|
|
|
|
def fetch_jobs_for_run(run_id: int, token: str) -> List[dict]:
    """Return every job of one workflow run, following pagination.

    Args:
        run_id: Id of the workflow run.
        token: GitHub token used for authentication.

    Returns:
        All job objects of the run, requested 100 per page.
    """
    jobs_endpoint = f"{GITHUB_API}/repos/{REPO}/actions/runs/{run_id}/jobs"
    page_query = {"per_page": 100}
    return _gh_get_all_pages(jobs_endpoint, token, page_query)
|
|
|
|
|
|
def fetch_job_logs(job_id: int, token: str, max_chars: int = 2000000) -> str:
    """Download the log text of one job (a single API call).

    Args:
        job_id: Id of the job; a falsy id short-circuits to "".
        token: GitHub token used for authentication.
        max_chars: When the log is longer than this, only its last
            ``max_chars`` characters are returned.

    Returns:
        The (possibly tail-truncated) log text, or "" on a non-200 response
        or request failure. Failures are reported with a printed warning.
    """
    if not job_id:
        return ""
    try:
        logs_endpoint = f"{GITHUB_API}/repos/{REPO}/actions/jobs/{job_id}/logs"
        response = requests.get(
            logs_endpoint,
            headers=_gh_headers(token),
            timeout=60,
            allow_redirects=True,
        )
        if response.status_code != 200:
            print(
                f" Warning: Log fetch for job {job_id} returned HTTP {response.status_code}"
            )
        else:
            body = response.text
            if len(body) > max_chars:
                # Keep the tail of the log.
                return body[-max_chars:]
            return body
    except requests.RequestException as exc:
        print(f" Warning: Failed to fetch logs for job {job_id}: {exc}")
    return ""
|
|
|
|
|
|
def parse_test_summary(logs: str) -> Optional[Dict]:
    """Extract pass counts and failed test files from a job log.

    ANSI escape codes are stripped first. When no "Test Summary: P/T passed"
    line exists, the log is assumed to have been cut off and the last running
    test (if one can be found) is reported as the single failure.

    Args:
        logs: Raw job log text.

    Returns:
        ``{"passed": int, "total": int, "failed_tests": [...]}`` where each
        failed test is ``{"test_file": basename, "full_path": path}``, or
        None when neither a summary nor a last running test is found.
    """
    clean = _ANSI_ESCAPE_RE.sub("", logs)

    header = re.search(r"Test Summary:\s*(\d+)/(\d+)\s*passed", clean)
    if header is None:
        # Timeout scenario: no summary was printed.
        running = _find_last_running_test(clean)
        if not running:
            return None
        return {"passed": 0, "total": 0, "failed_tests": [running]}

    try:
        passed, total = (int(header.group(n)) for n in (1, 2))
    except (ValueError, TypeError):
        return None

    failed_block = re.search(
        r".?\s*FAILED:\s*\n(.*?)(?:={10,}|$)", clean, re.DOTALL
    )
    failed_tests = []
    if failed_block:
        for path in re.findall(r"(\S+\.py)", failed_block.group(1)):
            failed_tests.append({"test_file": path.split("/")[-1], "full_path": path})

    return {"passed": passed, "total": total, "failed_tests": failed_tests}
|
|
|
|
|
|
def _find_last_running_test(logs: str) -> Optional[Dict]:
    """Guess which test was running when the log ended (timeout scenarios).

    Locates the last line mentioning ``server_args`` and inspects up to ten
    lines above it, nearest first, for a test file reference.

    Args:
        logs: Job log text.

    Returns:
        ``{"test_file": basename, "full_path": path}`` for the first match,
        or None when no ``server_args`` line or no test reference is found.
    """
    log_lines = logs.split("\n")
    test_patterns = (r"(\S+\.py)::", r"python3?\s+(\S+\.py)")

    # Index of the last "server_args" line, scanning from the end.
    anchor = next(
        (
            idx
            for idx in range(len(log_lines) - 1, -1, -1)
            if "server_args:" in log_lines[idx].lower()
            or "server_args =" in log_lines[idx]
        ),
        None,
    )
    if anchor is None:
        return None

    # Walk upwards over at most ten lines, never past the start of the log.
    for idx in range(anchor - 1, max(anchor - 11, -1), -1):
        for pattern in test_patterns:
            found = re.search(pattern, log_lines[idx])
            if not found:
                continue
            full_path = found.group(1)
            test_file = full_path.split("/")[-1]
            if test_file.endswith(".py"):
                return {"test_file": test_file, "full_path": full_path}
    return None
|
|
|
|
|
|
def analyze_scheduled_failures(
    token: str, min_streak: int = 1, max_failures: int = 10
) -> Tuple[List[FailureTarget], Dict[int, str]]:
    """Turn recent scheduled runs into a prioritized list of failure targets.

    Steps: fetch the scheduled runs, walk them oldest-first to compute each
    job's current failure streak, fetch the log of the most recent failure of
    every job whose streak reaches ``min_streak``, and expand it into one
    target per failed test file (or a single "<job-level>" target when no
    test could be identified).

    Args:
        token: GitHub token used for authentication.
        min_streak: Minimum consecutive failures for a job to be considered.
        max_failures: Maximum number of targets returned.

    Returns:
        ``(targets, logs_cache)`` where ``logs_cache`` maps job id to the log
        text fetched here, so callers can reuse it without re-fetching.
    """
    logs_cache: Dict[int, str] = {}

    runs = fetch_nvidia_scheduled_runs(token)
    if not runs:
        print("No scheduled runs found")
        return [], logs_cache

    # Oldest first, so streak counters end on the most recent run.
    chronological = sorted(runs, key=lambda r: r.get("created_at", ""))

    streaks: Dict[str, int] = {}
    first_failures: Dict[str, dict] = {}
    latest_failures: Dict[str, dict] = {}
    history: Dict[str, List[str]] = {}

    print(f"\nAnalyzing {len(chronological)} runs for job failure streaks...")
    api_calls = 1  # The initial list-runs call

    for run in chronological:
        try:
            run_id: int = run["id"]
        except (KeyError, TypeError):
            print(f" Warning: Skipping malformed run entry: {run}")
            continue

        head_sha = run.get("head_sha", "")[:8]
        created_at = run.get("created_at", "")
        run_url = f"https://github.com/{REPO}/actions/runs/{run_id}"

        jobs = fetch_jobs_for_run(run_id, token)
        api_calls += 1
        time.sleep(0.05)

        for job in jobs:
            name = job.get("name", "")
            if name.startswith(tuple(EXCLUDED_JOBS)):
                continue

            outcome = job.get("conclusion")
            streaks.setdefault(name, 0)
            statuses = history.setdefault(name, [])

            if outcome == "failure":
                record = {
                    "head_sha": head_sha,
                    "created_at": created_at,
                    "job_url": job.get("html_url", run_url),
                    "job_id": job.get("id"),
                }
                streaks[name] += 1
                if streaks[name] == 1:
                    # Start of a new streak.
                    first_failures[name] = dict(record)
                latest_failures[name] = record
                statuses.append("❌")
            elif outcome == "success":
                # A pass ends the streak and discards its bookkeeping.
                streaks[name] = 0
                first_failures.pop(name, None)
                latest_failures.pop(name, None)
                statuses.append("✅")
            else:
                # Any other conclusion leaves the streak untouched.
                statuses.append("⚪")

    # Find jobs with streak >= min_streak
    broken_jobs: Dict[str, dict] = {}
    for name, streak in streaks.items():
        if streak < min_streak:
            continue
        broken_jobs[name] = {
            "streak": streak,
            "first_fail": first_failures.get(name, {}),
            "last_fail": latest_failures.get(name, {}),
            "recent": history.get(name, [])[-10:],
        }

    print(f"Found {len(broken_jobs)} jobs with streak >= {min_streak}")
    if not broken_jobs:
        print(f"Total GitHub API calls: {api_calls}")
        return [], logs_cache

    # Only the MOST RECENT failure of each broken job has its log fetched.
    print("\nFetching logs for broken jobs to identify failing tests...")
    targets: List[FailureTarget] = []

    for job_name, info in broken_jobs.items():
        first_fail = info["first_fail"]
        last_fail = info["last_fail"]
        last_job_id = last_fail.get("job_id")

        failed_tests: List[dict] = []
        if last_job_id:
            logs = fetch_job_logs(last_job_id, token)
            api_calls += 1
            if logs:
                logs_cache[last_job_id] = logs
                summary = parse_test_summary(logs)
                if summary and summary.get("failed_tests"):
                    failed_tests = summary["failed_tests"]

        # One target per failed test, or one job-level placeholder.
        test_names = [entry["test_file"] for entry in failed_tests] or ["<job-level>"]
        for test_name in test_names:
            targets.append(
                FailureTarget(
                    job_name=job_name,
                    test_file=test_name,
                    hardware="Nvidia",
                    current_streak=info["streak"],
                    first_failure_sha=first_fail.get("head_sha", ""),
                    last_failure_sha=last_fail.get("head_sha", ""),
                    first_failure_date=first_fail.get("created_at", ""),
                    last_failure_date=last_fail.get("created_at", ""),
                    first_failure_job_url=first_fail.get("job_url", ""),
                    last_failure_job_url=last_fail.get("job_url", ""),
                    first_failure_job_id=first_fail.get("job_id"),
                    last_failure_job_id=last_fail.get("job_id"),
                    recent_run_statuses=info["recent"],
                    test_streak=info["streak"],
                    test_total_failures=info["streak"],
                )
            )

        time.sleep(0.1)

    print(f"Total GitHub API calls: {api_calls}")

    # Deduplicate: the same test across partitions keeps the highest streak.
    # Job-level targets are keyed by job name so distinct jobs stay distinct.
    best: Dict[str, FailureTarget] = {}
    for candidate in targets:
        if candidate.test_file == "<job-level>":
            key = f"<job-level>:{candidate.job_name}"
        else:
            key = candidate.test_file
        current = best.get(key)
        if current is None or candidate.current_streak > current.current_streak:
            best[key] = candidate

    # Prioritize by streak, descending
    ranked = sorted(
        best.values(),
        key=lambda t: t.current_streak * 10 + t.test_total_failures,
        reverse=True,
    )

    return ranked[:max_failures], logs_cache
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Git helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def resolve_sha(short_sha: str) -> str:
    """Expand an abbreviated SHA with ``git rev-parse``.

    Args:
        short_sha: Abbreviated (or full) commit SHA.

    Returns:
        The stripped output of ``git rev-parse`` on success. The input is
        returned unchanged when git fails, times out, or is not installed,
        and "" is returned for an empty input.
    """
    if not short_sha:
        return ""
    try:
        proc = subprocess.run(
            ["git", "rev-parse", short_sha],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return short_sha
    return proc.stdout.strip() if proc.returncode == 0 else short_sha
|
|
|
|
|
|
def _abbreviate_commit_lines(lines: List[str], keep: int = 25) -> List[str]:
    """Keep the first and last ``keep`` entries, marking the omitted middle."""
    limit = 2 * keep
    if len(lines) <= limit:
        return lines
    return (
        lines[:keep]
        + [f"... ({len(lines) - limit} commits omitted) ..."]
        + lines[-keep:]
    )


def get_commits_between(first_sha: str, last_sha: str) -> List[str]:
    """Get commit list between two SHAs using git log.

    Tries ``first_sha~1..last_sha`` first so the first failing commit itself
    is included, since that commit may be the one that introduced the
    regression. If that git invocation fails (``~1`` may fail on a root commit
    or missing SHA) it falls back to the exclusive range
    ``first_sha..last_sha``.

    Args:
        first_sha: SHA of the first failing run's commit.
        last_sha: SHA of the most recent failing run's commit.

    Returns:
        ``git log --oneline`` lines, newest first. Lists longer than 50
        entries are cut to the first 25 and last 25 with a marker in between.
        Returns [] when either SHA is empty, when both git calls fail, or
        when git times out or is not installed.
    """
    if not first_sha or not last_sha:
        return []

    full_first = resolve_sha(first_sha)
    full_last = resolve_sha(last_sha)

    rev_ranges = (
        f"{full_first}~1..{full_last}",  # inclusive of the first failure commit
        f"{full_first}..{full_last}",  # fallback: exclusive range
    )

    try:
        for attempt, rev_range in enumerate(rev_ranges):
            if attempt == 1:
                print(" Note: first_sha~1 failed, falling back to exclusive range")
            result = subprocess.run(
                ["git", "log", "--oneline", rev_range],
                capture_output=True,
                text=True,
                timeout=30,
            )
            if result.returncode == 0:
                lines = [ln for ln in result.stdout.strip().split("\n") if ln]
                return _abbreviate_commit_lines(lines)
    except (subprocess.TimeoutExpired, FileNotFoundError):
        pass
    return []
|
|
|
|
|
|
def get_candidate_commits(first_sha: str, last_sha: str, test_file: str) -> List[str]:
    """List commits in the failure range that touch paths related to the test.

    Args:
        first_sha: SHA of the first failing run's commit.
        last_sha: SHA of the most recent failing run's commit.
        test_file: Failing test file name; "<job-level>" yields no candidates.

    Returns:
        Up to 15 ``git log --oneline`` lines restricted to the inferred paths,
        or [] when inputs are missing, git fails, times out, or is absent.
    """
    missing_input = not first_sha or not last_sha
    if missing_input or test_file == "<job-level>":
        return []

    range_start = resolve_sha(first_sha)
    range_end = resolve_sha(last_sha)

    pathspecs = _infer_related_paths(test_file)
    if not pathspecs:
        return []

    # first_sha~1 makes the range include the first failure commit itself.
    command = ["git", "log", "--oneline", f"{range_start}~1..{range_end}", "--"]
    command = command + pathspecs
    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return []
    if proc.returncode != 0:
        return []
    entries = [entry for entry in proc.stdout.strip().split("\n") if entry]
    return entries[:15]
|
|
|
|
|
|
def _infer_related_paths(test_file: str) -> List[str]:
    """Heuristically infer source paths related to a test file.

    The test name (without the ``test_`` prefix and ``.py`` suffix) is checked
    against a table of keyword hints; each matching hint contributes its
    source paths.

    Hints of one or two characters ("tp", "dp") only match when they are not
    embedded in a longer word, so names such as "endpoint", "http" or
    "output" no longer pull in the distributed code. Longer hints still match
    as plain substrings, which keeps stem-style hints like "quantiz" and
    "specul" working.

    Args:
        test_file: Test file name, e.g. ``test_lora.py``.

    Returns:
        A duplicate-free list that always starts with ``"test/"``; when no
        hint matches, ``"python/sglang/srt/"`` is added as a generic fallback.
    """
    paths = ["test/"]

    core = test_file
    if core.startswith("test_"):
        core = core[5:]
    if core.endswith(".py"):
        core = core[:-3]

    path_hints = {
        "lora": ["python/sglang/srt/lora/"],
        "moe": ["python/sglang/srt/layers/moe/"],
        "tp": ["python/sglang/srt/distributed/"],
        "dp": ["python/sglang/srt/distributed/"],
        "endpoint": ["python/sglang/srt/entrypoints/"],
        "openai": ["python/sglang/srt/entrypoints/openai/"],
        "anthropic": ["python/sglang/srt/entrypoints/anthropic/"],
        "server": ["python/sglang/srt/entrypoints/"],
        "engine": ["python/sglang/srt/"],
        "sampling": ["python/sglang/srt/sampling/"],
        "tokenizer": ["python/sglang/srt/managers/tokenizer_manager.py"],
        "schedule": ["python/sglang/srt/managers/schedule_batch.py"],
        "radix": ["python/sglang/srt/mem_cache/"],
        "cuda_graph": ["python/sglang/srt/layers/cuda_graph_runner.py"],
        "attention": ["python/sglang/srt/layers/attention/"],
        "quantiz": ["python/sglang/srt/layers/quantization/"],
        "specul": ["python/sglang/srt/speculative/"],
        "vision": ["python/sglang/srt/models/"],
        "embed": ["python/sglang/srt/layers/"],
        "kernel": ["sgl-kernel/", "python/sglang/srt/layers/"],
        "bench": ["benchmark/"],
        "constrained": ["python/sglang/srt/constrained/"],
    }

    for hint, hint_paths in path_hints.items():
        if len(hint) <= 2:
            # Very short hints must stand alone (no adjacent letters).
            matched = (
                re.search(
                    r"(?<![A-Za-z])" + re.escape(hint) + r"(?![A-Za-z])", core
                )
                is not None
            )
        else:
            matched = hint in core
        if not matched:
            continue
        for path in hint_paths:
            # Several hints map to the same path; pass each to git only once.
            if path not in paths:
                paths.append(path)

    if len(paths) == 1:
        paths.append("python/sglang/srt/")

    return paths
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Error extraction
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def extract_error_signature(logs: str, test_file: str) -> str:
    """Extract error-relevant lines from job logs.

    When a specific test_file is given, tries to find the error context
    closest to where that test ran (not just the last error in the log):
    the first ``FAILED: ...<test name>`` line is located and the last
    traceback/error in the 200 lines before it (plus a few after) is used.
    Otherwise, or when that finds nothing, the last error-looking line in the
    whole log is used and the test summary is appended.

    Args:
        logs: Raw job log text (ANSI escape codes are stripped here).
        test_file: Failing test file name, or "<job-level>"/"" for none.

    Returns:
        An excerpt of at most 2000 characters, or "" for empty logs.
    """
    if not logs:
        return ""

    logs = _ANSI_ESCAPE_RE.sub("", logs)
    lines = logs.split("\n")

    test_stem = ""
    if test_file and test_file != "<job-level>":
        # Drop only the trailing ".py", not occurrences inside the name.
        stem = test_file[:-3] if test_file.endswith(".py") else test_file
        test_stem = re.escape(stem)

    if test_stem:
        # Strategy: find the "FAILED: .../test_name.py" line and look backwards
        # for the traceback/error that caused it. CI logs have this pattern:
        #   <traceback>
        #   FAILED (errors=N)
        #   ...
        #   FAILED: /path/to/test_name.py returned exit code 1
        # The lookarounds stop "test_foo" from matching "test_foo_bar.py".
        marker_re = re.compile(r"FAILED:.*(?<!\w)" + test_stem + r"(?!\w)")
        failed_marker = next(
            (i for i, line in enumerate(lines) if marker_re.search(line)),
            None,
        )

        # Compare against None: a marker on the very first line is index 0.
        if failed_marker is not None:
            # CI logs can have ~150 lines between the error and the FAILED
            # marker (metrics, report writing, rate limit messages, etc.)
            search_start = max(0, failed_marker - 200)
            region = lines[search_start : failed_marker + 5]

            error_indices = [
                j
                for j, line in enumerate(region)
                if re.search(
                    r"Traceback|AssertionError|Exception:|FAILED \(errors",
                    line,
                )
            ]

            if error_indices:
                last_err = error_indices[-1]
                ctx_start = max(0, last_err - 5)
                ctx_end = min(len(region), last_err + 20)
                excerpt = "\n".join(region[ctx_start:ctx_end])
                return excerpt[:2000]

    # Fallback: find the last error/traceback anywhere in the log
    error_patterns = [
        r"AssertionError",
        r"FAIL(?:ED)?:",
        r"Error:",
        r"Exception:",
        r"Traceback",
        r"raise ",
    ]
    if test_stem:
        error_patterns.append(test_stem)

    combined = re.compile("|".join(error_patterns), re.IGNORECASE)
    match_indices = [i for i, line in enumerate(lines) if combined.search(line)]

    if not match_indices:
        # Nothing error-like: return the tail of the log.
        return "\n".join(lines[-50:])[:2000]

    last_match = match_indices[-1]
    start = max(0, last_match - 10)
    end = min(len(lines), last_match + 40)
    excerpt = "\n".join(lines[start:end])

    summary_match = re.search(r"Test Summary:.{0,2000}?(?:={10,}|$)", logs, re.DOTALL)
    if summary_match:
        excerpt += "\n\n--- Test Summary ---\n" + summary_match.group(0)[:500]

    return excerpt[:2000]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Context gathering
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def gather_bisection_context(
    target: FailureTarget,
    github_token: str,
    logs_cache: Optional[Dict[int, str]] = None,
) -> BisectionContext:
    """Collect commits and an error excerpt for one failure target.

    Args:
        target: The failure to gather context for.
        github_token: GitHub token, used only if the job log must be fetched.
        logs_cache: Pre-fetched logs keyed by job id (as returned by
            analyze_scheduled_failures) so the same log is not fetched twice.

    Returns:
        A BisectionContext holding the commit range, candidate commits and
        error signature; ``runner_correlation`` is left at its default.
    """
    print(f" Gathering context for {target.test_file} in {target.job_name}...")

    commits = get_commits_between(target.first_failure_sha, target.last_failure_sha)
    print(f" Found {len(commits)} commits in range")

    candidates = get_candidate_commits(
        target.first_failure_sha, target.last_failure_sha, target.test_file
    )
    print(f" Found {len(candidates)} candidate commits")

    # Error logs come from the most recent failure; prefer the cache.
    error_sig = ""
    job_id = target.last_failure_job_id
    if job_id:
        known_logs = logs_cache or {}
        logs = known_logs.get(job_id)
        if logs:
            print(f" Using cached logs for job {job_id}")
        else:
            print(f" Fetching logs for job {job_id}...")
            logs = fetch_job_logs(job_id, github_token)

        if not logs:
            print(" Warning: No logs retrieved")
        else:
            error_sig = extract_error_signature(logs, target.test_file)
            print(f" Extracted {len(error_sig)} chars of error context")

    return BisectionContext(
        target=target,
        commits_between=commits,
        error_signature=error_sig,
        candidate_commits=candidates,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Skill loading
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class SkillLoadError(Exception):
    """Signals that the bisect skill's SKILL.md could not be loaded or parsed."""
|
|
|
|
|
|
# Sections that must be present in SKILL.md, mapped to the regex that captures
# each section's body. If a section disappears after a rename or
# restructuring, the script raises SkillLoadError so the team gets a Slack
# notification instead of silently falling back.
_REQUIRED_SKILL_SECTIONS = {
    "Key Patterns to Recognize": r"## Key Patterns to Recognize\n(.*?)(?=\n## |\Z)",
    "Important Notes": r"## Important Notes\n(.*?)(?=\n## |\Z)",
    "Root Cause Classification": r"### Root Cause Classification\n(.*?)(?=\n### |\Z)",
}
|
|
|
|
|
|
def load_bisect_skill() -> str:
    """Load the bisect skill SKILL.md and extract analysis methodology sections.

    Reads the skill definition and extracts the sections listed in
    ``_REQUIRED_SKILL_SECTIONS`` for use as Claude prompt context. This keeps
    the automated workflow in sync with any updates to the skill definition.

    The file is looked up first relative to the current working directory and
    then two directories above this script.

    Returns:
        The extracted sections, each re-emitted under a ``## <name>`` heading
        and joined by blank lines.

    Raises:
        SkillLoadError: If SKILL.md is not found (or is empty) or required
            sections are missing.
    """
    # Try repo-relative path first, then look in common locations
    candidates = [
        BISECT_SKILL_PATH,
        os.path.join(os.path.dirname(__file__), "..", "..", BISECT_SKILL_PATH),
    ]

    content = ""
    for path in candidates:
        try:
            # Read as UTF-8 explicitly so the result does not depend on the
            # CI runner's locale.
            with open(path, encoding="utf-8") as f:
                content = f.read()
            break
        except FileNotFoundError:
            continue

    if not content:
        raise SkillLoadError(
            f"Could not find {BISECT_SKILL_PATH}. Searched: {candidates}"
        )

    # Extract the required analysis sections
    sections = []
    missing = []

    for section_name, pattern in _REQUIRED_SKILL_SECTIONS.items():
        match = re.search(pattern, content, re.DOTALL)
        if match:
            sections.append(f"## {section_name}\n{match.group(1).strip()}")
        else:
            missing.append(section_name)

    if missing:
        raise SkillLoadError(
            f"SKILL.md is missing required sections: {missing}. "
            f"Was SKILL.md restructured? Update _REQUIRED_SKILL_SECTIONS in "
            f"ci_auto_bisect.py to match the new section names."
        )

    return "\n\n".join(sections)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Claude API integration
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def build_prompt(context: BisectionContext, skill_content: str = "") -> str:
    """Assemble the prompt asking Claude to classify one CI failure.

    Args:
        context: Gathered bisection context (target, commits, error excerpt,
            optional runner correlation).
        skill_content: Extracted sections from SKILL.md, embedded as the
            analysis methodology.

    Returns:
        The full prompt text, ending with the required JSON output format.
    """
    target = context.target

    if target.recent_run_statuses:
        statuses_str = " ".join(target.recent_run_statuses)
    else:
        statuses_str = "N/A"

    if context.commits_between:
        commits_str = "\n".join(context.commits_between)
    else:
        commits_str = "No commits found in range (SHAs may be identical or unresolvable)"

    if context.candidate_commits:
        candidates_str = "\n".join(context.candidate_commits)
    else:
        candidates_str = "No commits found touching related files"

    if context.runner_correlation:
        runner_str = "\n".join(
            f" - {runner_instance} / {info['runner_name']} "
            f"({info['runner_labels']}): {info['count']} failures"
            for runner_instance, info in context.runner_correlation.items()
        )
    else:
        runner_str = "No runner-specific data available"

    error_str = context.error_signature or "No error logs available"

    methodology = f"""## Analysis Methodology (from bisect skill definition)

{skill_content}

## Additional Classification Guidance
Classify as exactly ONE of: code_regression, flaky_test, hardware_issue, environment_change.
- If recent run pattern shows alternating pass/fail -> likely flaky
- If recent run pattern shows solid block of failures -> likely regression or environment
- If commit range is empty (same SHA) -> the failure predates this range, check if flaky
- If candidate commits are empty but failures are consistent -> environment change or hardware"""

    return f"""You are an expert CI regression analyst for the SGLang project (a high-performance LLM serving framework).

## Task
Analyze this CI test failure and classify its root cause. Be precise and evidence-based.

## Failure Details
- **Test**: {target.test_file}
- **Job**: {target.job_name}
- **Hardware**: {target.hardware}
- **Job consecutive failures**: {target.current_streak}
- **Test consecutive failures**: {target.test_streak}
- **First failure**: {target.first_failure_date} (SHA: {target.first_failure_sha})
URL: {target.first_failure_job_url}
- **Last failure**: {target.last_failure_date} (SHA: {target.last_failure_sha})
URL: {target.last_failure_job_url}
- **Recent run pattern** (oldest to newest): {statuses_str}

## Error Signature (from most recent failure)
```
{error_str}
```

## All Commits in Range ({target.first_failure_sha}..{target.last_failure_sha})
```
{commits_str}
```

## Commits Touching Related Files
```
{candidates_str}
```

## Runner Correlation
{runner_str}

Note: PR numbers appear in squash-merged commit messages as (#1234). Extract the PR number from the suspected commit message if possible.

{methodology}

## Required Output
Respond with ONLY a JSON object (no markdown fencing, no extra text):
{{"classification": "code_regression|flaky_test|hardware_issue|environment_change", "confidence": "high|medium|low", "suspected_commit": "short SHA or null", "suspected_pr": PR_NUMBER_or_null, "evidence_summary": "2-3 sentence explanation of your reasoning", "recommended_fix": "1-2 sentence actionable recommendation"}}"""
|
|
|
|
|
|
def call_claude_api(
    prompt: str,
    api_key: str,
    max_retries: int = 3,
) -> Tuple[str, int]:
    """Send the prompt to Claude, retrying on rate limits and API errors.

    Args:
        prompt: Full prompt text sent as a single user message.
        api_key: Anthropic API key.
        max_retries: Total number of attempts; the wait between attempts
            doubles each time (2s, 4s, ...).

    Returns:
        ``(response_text, total_tokens)`` where ``response_text`` is the first
        text block of the reply and ``total_tokens`` is input plus output
        tokens. Returns ``("", 0)`` when the reply is empty or every attempt
        failed.

    Raises:
        RuntimeError: When authentication fails (never retried).
    """
    client = anthropic.Anthropic(api_key=api_key)
    final_attempt = max_retries - 1

    for attempt in range(max_retries):
        backoff = 2 ** (attempt + 1)
        try:
            message = client.messages.create(
                model=CLAUDE_MODEL,
                max_tokens=16000,
                thinking={"type": "adaptive"},
                messages=[{"role": "user", "content": prompt}],
            )
        except anthropic.AuthenticationError as e:
            # Auth errors will never self-resolve -- fail fast
            raise RuntimeError(f"Anthropic API authentication failed: {e}") from e
        except anthropic.RateLimitError:
            if attempt >= final_attempt:
                print(f" Rate limited after {max_retries} retries, giving up")
                return "", 0
            print(f" Rate limited, waiting {backoff}s...")
            time.sleep(backoff)
            continue
        except anthropic.APIError as e:
            if attempt >= final_attempt:
                print(f" API error after {max_retries} retries: {e}")
                return "", 0
            print(f" API error: {e}, retrying in {backoff}s...")
            time.sleep(backoff)
            continue

        if not message.content:
            print(" Warning: Claude returned empty content")
            return "", 0

        # The reply can hold thinking blocks as well as text blocks; take the
        # first text block.
        response_text = next(
            (block.text for block in message.content if block.type == "text"),
            "",
        )
        usage = message.usage
        return response_text, usage.input_tokens + usage.output_tokens

    return "", 0
|
|
|
|
|
|
def _unknown_analysis(evidence_summary: str) -> dict:
    """Build the placeholder result used when no analysis could be obtained."""
    return {
        "classification": "unknown",
        "confidence": "low",
        "suspected_commit": None,
        "suspected_pr": None,
        "evidence_summary": evidence_summary,
        "recommended_fix": "Manual investigation required",
    }


def parse_claude_response(response_text: str) -> dict:
    """Parse Claude's JSON response into a dict.

    The whole response is tried as JSON first. If that fails, or yields
    something other than an object, each ``{`` in the text is tried as the
    start of a JSON object, so surrounding prose or markdown fencing (even
    prose containing braces) does not prevent extraction.

    Args:
        response_text: Raw text returned by the model; may be empty.

    Returns:
        The parsed JSON object. An "unknown"/"low" placeholder dict is
        returned when the text is empty or contains no JSON object, so
        callers always receive a dict.
    """
    if not response_text:
        return _unknown_analysis("Failed to get analysis from Claude API")

    text = response_text.strip()

    # First try: the entire response is JSON
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Second try: decode the first valid JSON object embedded in the text.
    decoder = json.JSONDecoder()
    pos = text.find("{")
    while pos != -1:
        try:
            candidate, _end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        pos = text.find("{", pos + 1)

    return _unknown_analysis(
        f"Could not parse Claude response: {response_text[:200]}"
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# GitHub Actions summary
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def generate_github_summary(results: List[BisectionResult]) -> None:
    """Append a markdown summary of the results to $GITHUB_STEP_SUMMARY.

    Does nothing (apart from printing a notice) when the
    ``GITHUB_STEP_SUMMARY`` environment variable is unset or empty.

    The summary has a per-classification count list followed by a details
    table. Known classifications are listed in a fixed severity order; any
    other classification present in ``results`` (for example ``"dry_run"``)
    is listed after them instead of being silently dropped.

    Args:
        results: Bisection results to report. Each item is read for
            ``classification``, ``confidence``, ``suspected_commit``,
            ``suspected_pr``, ``recommended_fix``, ``tokens_used`` and
            ``target.test_file`` / ``target.job_name``.
    """
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if not summary_path:
        print("Not running in GitHub Actions, skipping step summary")
        return

    def _cell(value, limit=None) -> str:
        # Model-supplied text may contain newlines or "|" characters, both of
        # which would break the markdown table row; None becomes empty text.
        text = "" if value is None else " ".join(str(value).split())
        if limit is not None:
            text = text[:limit]
        return text.replace("|", "\\|")

    def _tail(value, width: int = 30) -> str:
        # Keep the end of long names: the tail is the distinguishing part.
        text = "" if value is None else str(value)
        if len(text) > width:
            text = "..." + text[-(width - 3):]
        return _cell(text)

    by_class: Dict[str, List[BisectionResult]] = {}
    for r in results:
        by_class.setdefault(r.classification, []).append(r)

    class_order = [
        ("code_regression", "🔴"),
        ("hardware_issue", "🟠"),
        ("environment_change", "🟡"),
        ("flaky_test", "🔵"),
        ("unknown", "⚪"),
    ]
    # Report classifications outside the fixed list too, in first-seen order.
    known = {cls for cls, _ in class_order}
    class_order += [(cls, "⚫") for cls in by_class if cls not in known]

    lines = ["# CI Auto Bisect Results\n"]

    lines.append("## Summary\n")
    for cls, emoji in class_order:
        count = len(by_class.get(cls, []))
        if count > 0:
            title = str(cls).replace("_", " ").title()
            lines.append(f"- {emoji} **{title}**: {count}")
    lines.append("")

    if not results:
        lines.append("No failures requiring bisection analysis.\n")
    else:
        lines.append("## Details\n")
        lines.append(
            "| Classification | Test | Job | Confidence "
            "| Suspected Cause | Recommendation |"
        )
        lines.append("|---|---|---|---|---|---|")

        for cls, emoji in class_order:
            for r in by_class.get(cls, []):
                parts = []
                if r.suspected_commit:
                    parts.append(f"`{_cell(r.suspected_commit)}`")
                if r.suspected_pr:
                    parts.append(f"(PR #{_cell(r.suspected_pr)})")
                suspected = " ".join(parts) or "N/A"

                lines.append(
                    f"| {emoji} {_cell(cls)} | `{_tail(r.target.test_file)}` | "
                    f"`{_tail(r.target.job_name)}` | "
                    f"{_cell(r.confidence)} | {suspected} | "
                    f"{_cell(r.recommended_fix, 80)} |"
                )

    total_tokens = sum(r.tokens_used for r in results)
    lines.append(
        f"\n---\n*Analyzed {len(results)} failures using {total_tokens} tokens*"
    )

    # Explicit UTF-8: the summary contains emoji, which a non-UTF-8 default
    # locale encoding could fail to write. The trailing newline keeps content
    # appended later from running into the last line.
    with open(summary_path, "a", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Main orchestration
# ---------------------------------------------------------------------------
|
|
|
|
# Classification labels accepted from Claude's response. "unknown" is left
# out on purpose: run_bisection_analysis maps every label that is not listed
# here to "unknown".
VALID_CLASSIFICATIONS = {
    "code_regression",
    "environment_change",
    "flaky_test",
    "hardware_issue",
}
|
|
|
|
|
|
def _summarize_classifications(results: list) -> dict:
    """Count results per classification bucket for the report's "summary"."""
    buckets = {
        "code_regressions": "code_regression",
        "flaky_tests": "flaky_test",
        "hardware_issues": "hardware_issue",
        "environment_changes": "environment_change",
        "unknown": "unknown",
    }
    return {
        key: sum(1 for r in results if r.classification == cls)
        for key, cls in buckets.items()
    }


def _write_json_report(path: str, payload: dict) -> None:
    """Write the report as UTF-8 JSON (non-ASCII text is kept readable)."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


def _parsed_text(parsed: dict, key: str, default: str) -> str:
    """Return parsed[key] as text, using default for a missing or null value."""
    value = parsed.get(key)
    if value is None:
        return default
    return value if isinstance(value, str) else str(value)


def run_bisection_analysis(
    github_token: str,
    api_key: str,
    max_failures: int = 10,
    min_streak: int = 1,
    output_file: Optional[str] = None,
    dry_run: bool = False,
) -> dict:
    """Main orchestration: fetch failures, gather context, call Claude, report.

    Args:
        github_token: Token passed to the GitHub-facing helpers.
        api_key: Anthropic API key; only used when ``dry_run`` is false.
        max_failures: Forwarded to ``analyze_scheduled_failures``.
        min_streak: Forwarded to ``analyze_scheduled_failures``.
        output_file: If given, the report is also written there as JSON.
        dry_run: Build the prompts but skip the Claude API calls; each target
            is recorded with classification ``"dry_run"``.

    Returns:
        The report dict with keys ``analysis_timestamp``,
        ``total_failures_analyzed``, ``total_tokens_used``, ``results`` and
        ``summary``.
    """
    print("=" * 80)
    print("SGLang CI Auto Bisect")
    print("=" * 80)

    # Load bisect skill methodology for prompt construction
    # Raises SkillLoadError if SKILL.md is missing or sections were renamed
    skill_content = load_bisect_skill()
    print(f"Loaded bisect skill ({len(skill_content)} chars)")

    # Fetch and analyze failures directly (no external report file needed)
    targets, logs_cache = analyze_scheduled_failures(
        github_token, min_streak, max_failures
    )
    print(f"\n{len(targets)} failure targets to analyze")

    if not targets:
        print("No failures requiring bisection analysis.")
        output = {
            "analysis_timestamp": datetime.now().isoformat(),
            "total_failures_analyzed": 0,
            "total_tokens_used": 0,
            "results": [],
            "summary": _summarize_classifications([]),
        }
        if output_file:
            _write_json_report(output_file, output)
        generate_github_summary([])
        return output

    for i, t in enumerate(targets, 1):
        print(f" [{i}] {t.test_file} in {t.job_name} (streak: {t.current_streak})")

    # Process each target
    results: List[BisectionResult] = []
    total_tokens = 0

    for i, target in enumerate(targets, 1):
        print(f"\n{'─' * 60}")
        print(f"[{i}/{len(targets)}] Analyzing: {target.test_file}")
        print(f" Job: {target.job_name}")
        print(f" Streak: {target.current_streak} (test-level: {target.test_streak})")
        print(f" SHA range: {target.first_failure_sha}..{target.last_failure_sha}")

        context = gather_bisection_context(target, github_token, logs_cache)
        # The prompt is needed on both paths: dry runs report its length.
        prompt = build_prompt(context, skill_content)

        if dry_run:
            print(" [DRY RUN] Skipping Claude API call")
            print(f" Prompt length: {len(prompt)} chars")
            results.append(
                BisectionResult(
                    target=target,
                    classification="dry_run",
                    confidence="n/a",
                    evidence_summary="Dry run - no API call made",
                    recommended_fix="N/A",
                )
            )
            continue

        print(f" Calling Claude ({CLAUDE_MODEL})...")
        response_text, tokens = call_claude_api(prompt, api_key)
        total_tokens += tokens
        print(f" Tokens used: {tokens}")

        parsed = parse_claude_response(response_text)
        if not isinstance(parsed, dict):
            # Valid JSON that is not an object carries no usable fields.
            parsed = {}

        # The model's output is untrusted: any label that is not a known
        # string (including unhashable values such as lists) is "unknown".
        classification = parsed.get("classification", "unknown")
        if (
            not isinstance(classification, str)
            or classification not in VALID_CLASSIFICATIONS
        ):
            classification = "unknown"

        result = BisectionResult(
            target=target,
            classification=classification,
            # JSON null would otherwise arrive as None and break the string
            # slicing done here and in the step summary.
            confidence=_parsed_text(parsed, "confidence", "low"),
            suspected_commit=parsed.get("suspected_commit"),
            suspected_pr=parsed.get("suspected_pr"),
            evidence_summary=_parsed_text(parsed, "evidence_summary", ""),
            recommended_fix=_parsed_text(parsed, "recommended_fix", ""),
            raw_response=response_text,
            tokens_used=tokens,
        )
        results.append(result)

        print(f" Classification: {result.classification} ({result.confidence})")
        if result.suspected_commit:
            print(f" Suspected commit: {result.suspected_commit}")
        print(f" Evidence: {result.evidence_summary[:100]}...")

        # Brief pause between consecutive Claude calls (none after the last).
        if i < len(targets):
            time.sleep(1)

    # Aggregate
    summary = _summarize_classifications(results)

    output = {
        "analysis_timestamp": datetime.now().isoformat(),
        "total_failures_analyzed": len(results),
        "total_tokens_used": total_tokens,
        "results": [
            {
                "target": asdict(r.target),
                "classification": r.classification,
                "confidence": r.confidence,
                "suspected_commit": r.suspected_commit,
                "suspected_pr": r.suspected_pr,
                "evidence_summary": r.evidence_summary,
                "recommended_fix": r.recommended_fix,
                "tokens_used": r.tokens_used,
            }
            for r in results
        ],
        "summary": summary,
    }

    if output_file:
        _write_json_report(output_file, output)
        print(f"\nResults saved to {output_file}")

    generate_github_summary(results)

    print(f"\n{'=' * 80}")
    print("BISECTION SUMMARY")
    print(f"{'=' * 80}")
    print(f"Total failures analyzed: {len(results)}")
    print(f"Total tokens used: {total_tokens}")
    for cls, count in summary.items():
        if count > 0:
            print(f" {cls}: {count}")

    return output
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments and run the bisection analysis.

    On any exception from the analysis, the traceback is printed and, when
    ``--output`` was given, a report containing an ``"error"`` field is
    written to that path so downstream steps can surface the failure. The
    process then exits with status 1.
    """
    parser = argparse.ArgumentParser(description="SGLang CI Auto Bisect")
    parser.add_argument(
        "--github-token",
        required=True,
        help="GitHub token for API access",
    )
    parser.add_argument(
        "--anthropic-api-key",
        default="",
        help="Anthropic API key for Claude (not needed with --dry-run)",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Output JSON file path",
    )
    parser.add_argument(
        "--max-failures",
        type=int,
        default=10,
        help="Maximum number of failures to analyze (default: 10)",
    )
    parser.add_argument(
        "--min-streak",
        type=int,
        default=1,
        help="Minimum consecutive failure streak to trigger bisection (default: 1)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Skip Claude API calls, only gather context",
    )
    args = parser.parse_args()

    try:
        # Checked inside the try block (rather than via argparse) so that a
        # missing key also produces the error report written below.
        if not args.dry_run and not args.anthropic_api_key:
            raise ValueError(
                "--anthropic-api-key is required unless --dry-run is given"
            )

        run_bisection_analysis(
            github_token=args.github_token,
            api_key=args.anthropic_api_key,
            max_failures=args.max_failures,
            min_streak=args.min_streak,
            output_file=args.output,
            dry_run=args.dry_run,
        )
    except Exception as e:
        print(f"Error during bisection analysis: {e}")
        import traceback

        traceback.print_exc()

        # Write an error result file so the Slack notification step can
        # report the failure instead of silently skipping
        if args.output:
            error_output = {
                "analysis_timestamp": datetime.now().isoformat(),
                "total_failures_analyzed": 0,
                "total_tokens_used": 0,
                "error": str(e),
                "results": [],
                "summary": {
                    "code_regressions": 0,
                    "flaky_tests": 0,
                    "hardware_issues": 0,
                    "environment_changes": 0,
                    "unknown": 0,
                },
            }
            try:
                with open(args.output, "w", encoding="utf-8") as f:
                    json.dump(error_output, f, indent=2)
                print(f"Error report saved to {args.output}")
            except OSError as write_err:
                # Still best-effort, but say so: a missing report file is
                # otherwise indistinguishable from a run that never started.
                print(
                    f"Could not write error report to {args.output}: {write_err}",
                    file=sys.stderr,
                )

        sys.exit(1)


if __name__ == "__main__":
    main()
|