Files
sglang/scripts/ci/utils/query_job_status.py
2026-04-16 22:03:37 -07:00

1944 lines
67 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Query GitHub Actions job status for specific jobs or generate runner fleet reports.
Usage:
# Per-job reports (original mode)
python scripts/ci/utils/query_job_status.py --job "stage-c-test-large-8-gpu-amd-mi35x"
python scripts/ci/utils/query_job_status.py --job "stage-c-test-large-8-gpu-amd-mi35x" --hours 48
python scripts/ci/utils/query_job_status.py --job "stage-c-test-large-8-gpu-amd-mi35x" --workflow "pr-test-amd.yml" --input-data-file actions-job-snapshot.json --summary
# Runner fleet report (cross-workflow runner analytics)
python scripts/ci/utils/query_job_status.py --runner-report --workflow "pr-test-amd.yml,nightly-test-amd.yml" --hours 24
python scripts/ci/utils/query_job_status.py --runner-report --workflow "pr-test-amd.yml,nightly-test-amd.yml,pr-test-amd-rocm720.yml,nightly-test-amd-rocm720.yml" --summary
python scripts/ci/utils/query_job_status.py --workflow "pr-test-amd.yml,nightly-test-amd.yml,pr-test-amd-rocm720.yml,nightly-test-amd-rocm720.yml" --dump-data-file actions-job-snapshot.json
Requirements:
pip install tabulate
"""
import argparse
import json
import os
import re
import subprocess
import sys
from datetime import datetime, timedelta, timezone
from typing import Any, Optional
try:
from tabulate import tabulate
except ImportError:
print("Please install tabulate: pip install tabulate")
exit(1)
def check_gh_cli_available() -> bool:
    """Report whether the ``gh`` CLI can be executed and is logged in.

    Runs ``gh --version`` followed by ``gh auth status``. A missing binary
    or a failed auth check is reported on stderr; a non-zero exit from
    ``gh --version`` is reported only through the return value.

    Returns:
        True only when both commands exit with status 0.
    """
    try:
        version_proc = subprocess.run(
            ["gh", "--version"], capture_output=True, text=True
        )
        if version_proc.returncode != 0:
            return False

        # The binary runs; now confirm that a login session exists.
        auth_proc = subprocess.run(
            ["gh", "auth", "status"], capture_output=True, text=True
        )
        if auth_proc.returncode == 0:
            return True

        print(
            "Error: gh CLI is not authenticated. Please run 'gh auth login' first.",
            file=sys.stderr,
        )
        print(f"Details: {auth_proc.stderr}", file=sys.stderr)
        return False
    except FileNotFoundError:
        print(
            "Error: gh CLI is not installed. Please install it from https://cli.github.com/",
            file=sys.stderr,
        )
        return False
def run_gh_command(args: list[str]) -> dict:
    """Invoke ``gh api`` with *args* and decode its stdout as JSON.

    Args:
        args: Extra command-line arguments appended after ``gh api``.

    Returns:
        The parsed JSON document printed by the command.

    Raises:
        Exception: If the ``gh`` executable cannot be found, or if the
            command exits with a non-zero status (stderr is included in
            the message).
    """
    command = ["gh", "api"] + args
    try:
        completed = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError:
        raise Exception("gh CLI not found. Please install from https://cli.github.com/")
    if completed.returncode == 0:
        return json.loads(completed.stdout)
    raise Exception(f"gh api failed: {completed.stderr}")
def is_rate_limit_error(error: str) -> bool:
    """Check whether an API error message was caused by GitHub rate limiting.

    The match is a case-insensitive substring search. Two phrasings are
    recognised:

    * ``rate limit exceeded`` -- primary rate limit
      (e.g. "API rate limit exceeded for ...").
    * ``secondary rate limit`` -- secondary rate limit
      (e.g. "You have exceeded a secondary rate limit ...").

    Args:
        error: Error text, typically the stderr of a failed ``gh api`` call.

    Returns:
        True if the text contains one of the known rate-limit phrases.
    """
    markers = ("rate limit exceeded", "secondary rate limit")
    message = error.lower()
    return any(marker in message for marker in markers)
def _new_workflow_fetch_stats(workflow: str) -> dict[str, Any]:
"""Create an empty metadata bucket for a workflow snapshot."""
return {
"workflow": workflow,
"total_runs_seen": 0,
"runs_with_jobs": 0,
"skipped_runs": 0,
"skipped_runs_rate_limit": 0,
"jobs_collected": 0,
}
def _new_fetch_metadata(repo: str, workflows: list[str], hours: int) -> dict[str, Any]:
"""Create the fetch metadata container stored alongside snapshot jobs."""
return {
"repo": repo,
"hours": hours,
"requested_workflows": workflows,
"total_runs_seen": 0,
"runs_with_jobs": 0,
"jobs_collected": 0,
"skipped_runs": [],
"workflow_fetch_failures": [],
"workflow_stats": {
workflow: _new_workflow_fetch_stats(workflow) for workflow in workflows
},
}
def _record_workflow_fetch_failure(
    fetch_metadata: dict[str, Any], workflow: str, error: str
) -> None:
    """Append a workflow-level listing failure to the fetch metadata.

    The entry stores the workflow name, the stripped error text and a
    ``reason`` of ``"rate_limit"`` or ``"api_error"``.
    """
    reason = "rate_limit" if is_rate_limit_error(error) else "api_error"
    entry = {
        "workflow": workflow,
        "error": error.strip(),
        "reason": reason,
    }
    fetch_metadata["workflow_fetch_failures"].append(entry)
def _record_skipped_run(
    fetch_metadata: dict[str, Any], workflow: str, run: dict, error: str
) -> None:
    """Register a run whose job list could not be downloaded.

    Bumps the per-workflow skip counters (creating the workflow bucket if
    it does not exist yet) and appends a descriptive entry to
    ``fetch_metadata["skipped_runs"]``.
    """
    rate_limited = is_rate_limit_error(error)

    stats = fetch_metadata["workflow_stats"].setdefault(
        workflow, _new_workflow_fetch_stats(workflow)
    )
    stats["skipped_runs"] += 1
    if rate_limited:
        stats["skipped_runs_rate_limit"] += 1

    record = {
        "workflow": workflow,
        "run_id": run["id"],
        "created_at": run.get("created_at", ""),
        "status": run.get("status", "unknown"),
        # A missing or null conclusion is shown as a dash.
        "conclusion": run.get("conclusion") or "-",
        "reason": "rate_limit" if rate_limited else "api_error",
        "error": error.strip(),
    }
    fetch_metadata["skipped_runs"].append(record)
def parse_time(time_str: str) -> Optional[datetime]:
    """Convert an ISO-8601 timestamp string into a ``datetime``.

    Any ``Z`` in the string is rewritten to ``+00:00`` before it is handed
    to ``datetime.fromisoformat``. Empty or ``None`` input yields ``None``.
    """
    if time_str:
        iso_text = time_str.replace("Z", "+00:00")
        return datetime.fromisoformat(iso_text)
    return None
def format_time(time_str: str) -> str:
    """Render an ISO timestamp as ``MM-DD HH:MM`` in UTC.

    Returns ``"-"`` when the input is empty or cannot be turned into a
    datetime by ``parse_time``.
    """
    parsed = parse_time(time_str) if time_str else None
    if not parsed:
        return "-"
    # Normalise to UTC so every report column uses the same time zone.
    return parsed.astimezone(timezone.utc).strftime("%m-%d %H:%M")
def get_workflow_runs(repo: str, workflow: str, hours: int = 24) -> list[dict]:
    """Collect workflow runs created within the last *hours* hours.

    Pages through the runs endpoint 100 entries at a time. When *workflow*
    is empty the repository-wide endpoint is used, otherwise the endpoint
    of that workflow file. Paging stops at the first run older than the
    cutoff, at a short page, or after 20 pages. When the page cap is hit a
    warning is written to stderr because the result may be truncated.

    NOTE(review): the early stop assumes the API returns runs newest
    first -- confirm against the GitHub REST API documentation.

    Args:
        repo: Repository in ``owner/name`` form.
        workflow: Workflow file name, or an empty value for all workflows.
        hours: Size of the look-back window.

    Returns:
        The raw run dicts whose ``created_at`` falls inside the window.
        Runs without a parseable ``created_at`` are ignored.

    Raises:
        Exception: Propagated from ``run_gh_command`` when the API call fails.
    """
    per_page = 100
    max_pages = 20
    since = datetime.now(timezone.utc) - timedelta(hours=hours)

    if workflow:
        endpoint = f"repos/{repo}/actions/workflows/{workflow}/runs"
    else:
        endpoint = f"repos/{repo}/actions/runs"

    runs: list[dict] = []
    for page in range(1, max_pages + 1):
        data = run_gh_command([f"{endpoint}?per_page={per_page}&page={page}"])
        page_runs = data.get("workflow_runs", [])
        for run in page_runs:
            created_at = parse_time(run.get("created_at"))
            if not created_at:
                continue
            if created_at < since:
                # Everything after this point is assumed to be older still.
                return runs
            runs.append(run)
        if len(page_runs) < per_page:
            return runs

    # Every fetched page was full, so more in-window runs may exist.
    print(
        f"Warning: Stopped listing runs for {workflow or 'all workflows'} after "
        f"{max_pages} pages; older runs inside the {hours}h window may be missing.",
        file=sys.stderr,
    )
    return runs
def get_jobs_for_run(repo: str, run_id: int) -> list[dict]:
    """Collect the jobs of one workflow run.

    Pages through the run's jobs endpoint 100 entries at a time and stops
    at the first short page or after 5 pages. When the page cap is hit a
    warning is written to stderr because the job list may be truncated.

    Args:
        repo: Repository in ``owner/name`` form.
        run_id: Identifier of the workflow run.

    Returns:
        The raw job dicts returned by the API, in API order.

    Raises:
        Exception: Propagated from ``run_gh_command`` when the API call fails.
    """
    per_page = 100
    max_pages = 5

    jobs: list[dict] = []
    for page in range(1, max_pages + 1):
        data = run_gh_command(
            [f"repos/{repo}/actions/runs/{run_id}/jobs?per_page={per_page}&page={page}"]
        )
        page_jobs = data.get("jobs", [])
        jobs.extend(page_jobs)
        if len(page_jobs) < per_page:
            return jobs

    # Every fetched page was full, so the run may have more jobs.
    print(
        f"Warning: Stopped listing jobs for run {run_id} after {max_pages} pages; "
        "some jobs may be missing.",
        file=sys.stderr,
    )
    return jobs
def get_pr_number_from_run(run: dict) -> Optional[int]:
    """Return the number of the first pull request attached to *run*.

    Returns ``None`` when the run has no ``pull_requests`` entries.
    """
    linked_prs = run.get("pull_requests", [])
    if not linked_prs:
        return None
    # Only the first linked pull request is considered.
    return linked_prs[0].get("number")
def _job_name_matches_filter(job_name: str, job_filter: str) -> bool:
"""Check whether a job name matches the report filter prefix."""
job_name_lower = job_name.lower()
filter_lower = job_filter.lower()
if not job_name_lower.startswith(filter_lower):
return False
if len(job_name_lower) > len(filter_lower):
next_char = job_name_lower[len(filter_lower)]
if next_char not in (" ", "("):
return False
return True
def filter_jobs(
    jobs: list[dict],
    job_filter: str,
    workflow: str = None,
    status_filter: str = None,
) -> list[dict]:
    """Select the prefetched jobs that belong to one report target.

    A job is kept when its ``job_name`` matches *job_filter*, and -- if
    those arguments are given -- its ``workflow`` equals *workflow* and
    its ``status`` equals *status_filter*. Input order is preserved.
    """

    def _keep(job: dict) -> bool:
        if workflow and job.get("workflow") != workflow:
            return False
        if not _job_name_matches_filter(job.get("job_name", ""), job_filter):
            return False
        return not (status_filter and job.get("status") != status_filter)

    return [job for job in jobs if _keep(job)]
def save_snapshot(path: str, snapshot: dict[str, Any]) -> None:
    """Write a prefetched Actions snapshot to *path* as indented JSON."""
    with open(path, "w") as out_file:
        json.dump(snapshot, out_file, indent=2)
def load_snapshot(path: str) -> dict[str, Any]:
    """Read a snapshot previously written to *path*.

    Raises:
        ValueError: If the decoded JSON has no ``jobs`` entry.
    """
    with open(path) as in_file:
        loaded = json.load(in_file)
    if "jobs" in loaded:
        return loaded
    raise ValueError(f"Snapshot file {path} is missing the 'jobs' field")
def fetch_all_jobs_snapshot(
    repo: str,
    workflows: list[str],
    hours: int = 24,
) -> dict[str, Any]:
    """Download runs and jobs once and package them with completeness metadata.

    For every workflow the recent runs are listed, duplicates (by run id)
    are dropped, and the jobs of each remaining run are fetched. Failures
    while listing runs or fetching jobs are logged to stderr and recorded
    in the metadata instead of aborting the snapshot.

    Jobs whose only label is ``ubuntu-latest`` are dropped, and jobs with
    no labels are dropped and counted in ``jobs_excluded_no_label``.

    Returns:
        A dict with ``snapshot_version``, ``repo``, ``hours``,
        ``workflows``, ``generated_at``, the flattened ``jobs`` list and
        the ``fetch_metadata`` record.
    """
    fetch_metadata = _new_fetch_metadata(repo, workflows, hours)

    # Step 1: list the runs of every requested workflow.
    listed_runs = []
    for workflow in workflows:
        print(f"Fetching runs for {workflow}...", file=sys.stderr)
        try:
            runs = get_workflow_runs(repo, workflow, hours)
        except Exception as e:
            error = str(e)
            print(
                f"Warning: Failed to list runs for workflow {workflow}: {error}",
                file=sys.stderr,
            )
            _record_workflow_fetch_failure(fetch_metadata, workflow, error)
            continue
        print(f" Found {len(runs)} runs for {workflow}", file=sys.stderr)
        fetch_metadata["workflow_stats"][workflow]["total_runs_seen"] = len(runs)
        for run in runs:
            # Remember which workflow this run was listed under.
            run["_workflow"] = workflow
        listed_runs.extend(runs)

    # Step 2: drop duplicate run ids, keeping the first occurrence.
    first_by_id: dict = {}
    for run in listed_runs:
        first_by_id.setdefault(run["id"], run)
    unique_runs = list(first_by_id.values())
    total_runs = len(unique_runs)
    fetch_metadata["total_runs_seen"] = total_runs
    print(f"Total unique workflow runs: {total_runs}", file=sys.stderr)

    # Step 3: fetch and flatten the jobs of every unique run.
    results = []
    jobs_excluded_no_label = 0
    for position, run in enumerate(unique_runs, start=1):
        if position % 20 == 0:
            print(f"Processing run {position}/{total_runs}...", file=sys.stderr)
        workflow_name = run.get("_workflow", "-")
        try:
            jobs = get_jobs_for_run(repo, run["id"])
        except Exception as e:
            error = str(e)
            print(
                f"Warning: Failed to get jobs for run {run['id']}: {error}",
                file=sys.stderr,
            )
            _record_skipped_run(fetch_metadata, workflow_name, run, error)
            continue

        workflow_stats = fetch_metadata["workflow_stats"].setdefault(
            workflow_name, _new_workflow_fetch_stats(workflow_name)
        )
        workflow_stats["runs_with_jobs"] += 1
        fetch_metadata["runs_with_jobs"] += 1

        pr_number = get_pr_number_from_run(run)
        branch = run.get("head_branch", "")
        run_status = run.get("status", "unknown")
        run_conclusion = run.get("conclusion") or "-"
        run_ended_badly = run_status == "completed" and run_conclusion in (
            "cancelled",
            "failure",
        )

        collected_before = len(results)
        for job in jobs:
            labels = job.get("labels", [])
            if len(labels) == 1 and labels[0] == "ubuntu-latest":
                continue
            if not labels:
                jobs_excluded_no_label += 1
                continue

            job_status = job.get("status", "unknown")
            runner_name = job.get("runner_name") or "-"
            # An in-progress job is "stuck" when it has no runner, or when
            # its run already finished as cancelled/failure.
            is_stuck = job_status == "in_progress" and (
                runner_name == "-" or run_ended_badly
            )
            results.append(
                {
                    "job_name": job.get("name", ""),
                    "status": job_status,
                    "conclusion": job.get("conclusion") or "-",
                    "created_at": job.get("created_at", ""),
                    "started_at": job.get("started_at", ""),
                    "completed_at": job.get("completed_at", ""),
                    "runner_name": runner_name,
                    "labels": labels,
                    "runner_group_name": job.get("runner_group_name") or "-",
                    "run_id": run["id"],
                    "run_status": run_status,
                    "run_conclusion": run_conclusion,
                    "pr_number": pr_number,
                    "branch": branch,
                    "html_url": job.get("html_url", ""),
                    "is_stuck": is_stuck,
                    "workflow": workflow_name,
                }
            )
        workflow_stats["jobs_collected"] += len(results) - collected_before

    fetch_metadata["jobs_collected"] = len(results)
    fetch_metadata["jobs_excluded_no_label"] = jobs_excluded_no_label

    return {
        "snapshot_version": 1,
        "repo": repo,
        "hours": hours,
        "workflows": workflows,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "jobs": results,
        "fetch_metadata": fetch_metadata,
    }
def query_jobs(
    repo: str,
    job_filter: str,
    workflow: str = None,
    hours: int = 24,
    status_filter: str = None,
) -> list[dict]:
    """Fetch a snapshot for one workflow and return the jobs matching the filter.

    The snapshot is fetched for the single-element list ``[workflow]`` and
    then narrowed with ``filter_jobs``.
    """
    collected = fetch_all_jobs_snapshot(repo, [workflow], hours)["jobs"]
    return filter_jobs(collected, job_filter, workflow, status_filter)
def query_all_jobs(
    repo: str,
    workflows: list[str],
    hours: int = 24,
) -> list[dict]:
    """Return every collected job across *workflows* for fleet-level analysis.

    In contrast to ``query_jobs()`` no job-name filter is applied; the
    result is the ``jobs`` list of a single ``fetch_all_jobs_snapshot``
    pass, which already leaves out jobs labelled only ``ubuntu-latest``.
    """
    snapshot = fetch_all_jobs_snapshot(repo, workflows, hours)
    return snapshot["jobs"]
def calculate_duration(started_at: str, completed_at: str) -> str:
    """Format the elapsed time between two ISO timestamps.

    Returns ``"<m>m<s>s"`` below one hour and ``"<h>h<m>m"`` otherwise.
    Returns ``"-"`` when either timestamp is missing or the end precedes
    the start.
    """
    if not (started_at and completed_at):
        return "-"
    begin = parse_time(started_at)
    finish = parse_time(completed_at)
    if not (begin and finish):
        return "-"

    elapsed = (finish - begin).total_seconds()
    if elapsed < 0:
        # Negative spans indicate inconsistent data; show nothing.
        return "-"

    whole_minutes, leftover = divmod(elapsed, 60)
    minutes = int(whole_minutes)
    seconds = int(leftover)
    if minutes < 60:
        return f"{minutes}m{seconds}s"
    hours, minutes = divmod(minutes, 60)
    return f"{hours}h{minutes}m"
def calculate_queue_time(
    job: dict,
    report_time: datetime = None,
) -> str:
    """
    Format the queue time of a job for display.

    The number of seconds comes from ``_queue_time_seconds`` so that this
    display value and the fleet analytics can never disagree:

    * **Has runner** (job was picked up): ``started_at - created_at``.
    * **No runner + queued/waiting** (still in queue):
      ``report_time - created_at``, suffixed with " (queuing)".
    * **No runner + other status** (skipped / cancelled / stuck):
      returns "-" (never truly queued for a runner).

    Args:
        job: Job record with ``created_at``, ``started_at``,
            ``runner_name`` and ``status`` entries.
        report_time: Reference time for jobs still in the queue; the
            current UTC time is used when omitted.

    Returns:
        ``"<m>m<s>s"`` below one hour, ``"<h>h<m>m"`` otherwise, or "-"
        when no queue time can be determined (missing or inconsistent
        timestamps included).
    """
    queue_seconds = _queue_time_seconds(job, report_time=report_time)
    if queue_seconds is None:
        return "-"

    runner = job.get("runner_name") or ""
    # A job without a runner that still has a queue time is waiting right now.
    still_queuing = not runner or runner == "-"
    suffix = " (queuing)" if still_queuing else ""
    return f"{_format_duration_seconds(queue_seconds)}{suffix}"
# ---------------------------------------------------------------------------
# Runner fleet analytics functions
# ---------------------------------------------------------------------------
def _format_duration_seconds(seconds: Optional[float]) -> str:
"""Format seconds into human-readable duration string."""
if seconds is None or seconds < 0:
return "-"
total_seconds = int(seconds)
minutes = total_seconds // 60
secs = total_seconds % 60
if minutes >= 60:
hours = minutes // 60
minutes = minutes % 60
return f"{hours}h{minutes}m"
return f"{minutes}m{secs}s"
def _get_runner_label(job: dict) -> str:
"""Extract the primary runner label from a job's labels list."""
labels = job.get("labels", [])
if not labels:
return "unknown"
for label in labels:
if label.startswith("linux-mi"):
return label
return labels[0]
_RUNNER_LABEL_RE = re.compile(r"linux-(mi\w+?)-(\d+)gpu")
_RUNNER_LABEL_ALT_RE = re.compile(r"linux-(mi\w+?)-gpu-(\d+)")
def _runner_label_sort_key(label: str) -> tuple:
"""Sort key for natural ordering: GPU type first, then GPU count.
linux-mi325-1gpu-sglang -> ('mi325', 1, 'linux-mi325-1gpu-sglang')
linux-mi35x-8gpu-sglang -> ('mi35x', 8, 'linux-mi35x-8gpu-sglang')
linux-mi35x-gpu-8.fabric -> ('mi35x', 8, 'linux-mi35x-gpu-8.fabric')
"""
m = _RUNNER_LABEL_RE.search(label)
if m:
return (m.group(1), int(m.group(2)), label)
m2 = _RUNNER_LABEL_ALT_RE.search(label)
if m2:
return (m2.group(1), int(m2.group(2)), label)
return ("zzz", 0, label)
def _percentile(data: list[float], p: int) -> Optional[float]:
"""Return a percentile from an already sorted or unsorted numeric list."""
if not data:
return None
sorted_data = sorted(data)
idx = min(int(len(sorted_data) * p / 100), len(sorted_data) - 1)
return sorted_data[idx]
def _average(data: list[float]) -> Optional[float]:
"""Return the average of a numeric list when samples exist."""
if not data:
return None
return sum(data) / len(data)
def _queue_time_seconds(job: dict, report_time: datetime = None) -> Optional[float]:
    """Compute how long a job waited for a runner, in seconds.

    * ``runner_name`` set (picked up by a runner): ``started_at - created_at``.
    * ``runner_name`` empty and status ``queued``/``waiting`` (still in
      queue): ``report_time - created_at``.
    * ``runner_name`` empty and any other status (e.g. skipped or
      cancelled before pickup): ``None``.

    ``None`` is also returned for missing timestamps and negative spans.

    GitHub sets ``started_at`` when a job *enters* the queue, so for jobs
    that have not been picked up yet ``started_at ≈ created_at`` and the
    naive difference would be ~0, which is wrong. A non-empty
    ``runner_name`` is the reliable signal that the job left the queue.
    """
    created = parse_time(job.get("created_at", ""))
    if not created:
        return None

    runner = job.get("runner_name") or ""
    picked_up = bool(runner) and runner != "-"
    if picked_up:
        started = parse_time(job.get("started_at", ""))
        if not started:
            return None
        waited = (started - created).total_seconds()
    else:
        if job.get("status", "") not in ("queued", "waiting"):
            return None
        reference = datetime.now(timezone.utc) if report_time is None else report_time
        waited = (reference - created).total_seconds()

    return waited if waited >= 0 else None
def _build_queue_distribution(queue_times: list[float]) -> dict[str, Any]:
"""Build queue time buckets and percentile stats for one sample set."""
if not queue_times:
return {"buckets": [], "p50": None, "p90": None, "p99": None, "total": 0}
sorted_queue_times = sorted(queue_times)
bucket_defs = [
("< 1 min", 0, 60),
("1-5 min", 60, 300),
("5-15 min", 300, 900),
("15-30 min", 900, 1800),
("30-60 min", 1800, 3600),
("> 60 min", 3600, float("inf")),
]
total = len(sorted_queue_times)
buckets = []
for label, lo, hi in bucket_defs:
count = sum(1 for qt in sorted_queue_times if lo <= qt < hi)
pct = count / total * 100 if total > 0 else 0
buckets.append({"range": label, "count": count, "percentage": round(pct, 1)})
return {
"buckets": buckets,
"p50": _percentile(sorted_queue_times, 50),
"p90": _percentile(sorted_queue_times, 90),
"p99": _percentile(sorted_queue_times, 99),
"total": total,
}
def analyze_concurrency(jobs: list[dict], report_time: datetime = None) -> dict:
    """Measure concurrent runner usage for each runner label.

    Jobs are grouped by runner label. Every job that has a runner and a
    start time contributes a +1 event at ``started_at`` and a -1 event at
    ``completed_at`` (or at *report_time* when not completed). Sweeping
    the sorted events yields the peak concurrency and the time-weighted
    average concurrency over the periods in which at least one job ran.

    Returns:
        Mapping of label to ``peak``, ``avg_concurrent``, ``total_jobs``,
        ``avg_queue_seconds``, ``p50_queue_seconds``,
        ``p99_queue_seconds`` and ``avg_duration_seconds``.
    """
    if report_time is None:
        report_time = datetime.now(timezone.utc)

    jobs_by_label: dict[str, list[dict]] = {}
    for job in jobs:
        jobs_by_label.setdefault(_get_runner_label(job), []).append(job)

    results = {}
    for label in sorted(jobs_by_label):
        pool_jobs = jobs_by_label[label]
        events: list[tuple[datetime, int]] = []
        queue_times: list[float] = []
        durations: list[float] = []

        for job in pool_jobs:
            runner = job.get("runner_name") or ""
            if runner and runner != "-":
                started = parse_time(job.get("started_at", ""))
                completed = parse_time(job.get("completed_at", ""))
                if started:
                    # Unfinished jobs count as running until the report time.
                    ended = completed if completed else report_time
                    events.append((started, +1))
                    events.append((ended, -1))
                    durations.append((ended - started).total_seconds())
            waited = _queue_time_seconds(job, report_time=report_time)
            if waited is not None:
                queue_times.append(waited)

        if events:
            # Tuples order by timestamp, then delta, so a -1 precedes a +1
            # at the same instant.
            events.sort()
            running = 0
            peak = 0
            weighted_seconds = 0.0
            busy_seconds = 0.0
            previous = events[0][0]
            for moment, delta in events:
                if previous and running > 0:
                    span = (moment - previous).total_seconds()
                    weighted_seconds += running * span
                    busy_seconds += span
                running += delta
                peak = max(peak, running)
                previous = moment
            avg_concurrent = round(
                weighted_seconds / busy_seconds if busy_seconds > 0 else 0, 1
            )
        else:
            peak = 0
            avg_concurrent = 0.0

        results[label] = {
            "peak": peak,
            "avg_concurrent": avg_concurrent,
            "total_jobs": len(pool_jobs),
            "avg_queue_seconds": _average(queue_times),
            "p50_queue_seconds": _percentile(queue_times, 50),
            "p99_queue_seconds": _percentile(queue_times, 99),
            "avg_duration_seconds": _average(durations),
        }
    return results
def analyze_busy_periods(jobs: list[dict], report_time: datetime = None) -> list[dict]:
    """Profile job activity for each hour of the day (UTC).

    A job with a runner is counted in the hour it started; a job that is
    still queued or waiting is counted in the hour it was created. Each
    hour gets the average queue time of its jobs and a load class --
    Quiet / Moderate / Busy / Peak -- relative to the busiest hour.

    Returns:
        24 rows, one per hour in order, with ``hour``, ``hour_label``,
        ``jobs_started``, ``avg_queue_seconds`` and ``load``.
    """
    if report_time is None:
        report_time = datetime.now(timezone.utc)

    counts = [0] * 24
    waits: list[list[float]] = [[] for _ in range(24)]

    for job in jobs:
        started = parse_time(job.get("started_at", ""))
        created = parse_time(job.get("created_at", ""))
        runner = job.get("runner_name") or ""
        picked_up = bool(runner and runner != "-")

        if picked_up and started:
            hour = started.astimezone(timezone.utc).hour
            waited = (started - created).total_seconds() if created else None
        elif job.get("status") in ("queued", "waiting") and created:
            hour = created.astimezone(timezone.utc).hour
            waited = (report_time - created).total_seconds()
        else:
            continue

        counts[hour] += 1
        if waited is not None and waited >= 0:
            waits[hour].append(waited)

    # Avoid dividing by zero when no job was counted at all.
    busiest = max(counts) or 1

    results = []
    for hour in range(24):
        started_count = counts[hour]
        samples = waits[hour]
        avg_queue = sum(samples) / len(samples) if samples else 0

        ratio = started_count / busiest
        if ratio >= 0.75:
            load = "Peak"
        elif ratio >= 0.5:
            load = "Busy"
        elif ratio >= 0.25:
            load = "Moderate"
        else:
            load = "Quiet"

        results.append(
            {
                "hour": hour,
                "hour_label": f"{hour:02d}:00-{(hour + 1) % 24:02d}:00",
                "jobs_started": started_count,
                "avg_queue_seconds": avg_queue,
                "load": load,
            }
        )
    return results
def analyze_queue_distribution(jobs: list[dict], report_time: datetime = None) -> dict:
    """Build the queue-time distribution of every runner label.

    Jobs for which no queue time can be determined are ignored. The
    result maps each label, in sorted order, to the summary produced by
    ``_build_queue_distribution``.
    """
    grouped: dict[str, list[float]] = {}
    for job in jobs:
        waited = _queue_time_seconds(job, report_time=report_time)
        if waited is not None:
            grouped.setdefault(_get_runner_label(job), []).append(waited)

    distribution = {}
    for label in sorted(grouped):
        distribution[label] = _build_queue_distribution(grouped[label])
    return distribution
def analyze_utilization_snapshots(
jobs: list[dict],
report_time: datetime = None,
interval_minutes: int = 15,
hours: int = 24,
) -> dict[str, list[dict]]:
"""Point-in-time snapshot at regular intervals per runner label.
At each interval mark over the last *hours* hours, counts:
- running: jobs that have a runner assigned (``runner_name`` set)
and are between ``started_at`` and ``completed_at``
- queued: jobs that have no runner assigned and haven't completed
GitHub's ``started_at`` is unreliable for distinguishing running vs
queued -- it is set when a job enters the queue, not when a runner
picks it up. The reliable signal is ``runner_name`` being non-empty.
"""
if report_time is None:
report_time = datetime.now(timezone.utc)
label_jobs: dict[str, list[dict]] = {}
for job in jobs:
label = _get_runner_label(job)
label_jobs.setdefault(label, []).append(job)
results: dict[str, list[dict]] = {}
for label in sorted(label_jobs, key=_runner_label_sort_key):
pool_jobs = label_jobs[label]
running_events: list[tuple[datetime, int]] = []
queued_events: list[tuple[datetime, int]] = []
for job in pool_jobs:
created = parse_time(job.get("created_at", ""))
started = parse_time(job.get("started_at", ""))
completed = parse_time(job.get("completed_at", ""))
runner = job.get("runner_name") or ""
has_runner = bool(runner and runner != "-")
if has_runner and started:
end = completed if completed else report_time
running_events.append((started, +1))
running_events.append((end, -1))
if created and created < started:
queued_events.append((created, +1))
queued_events.append((started, -1))
elif created and job.get("status") in ("queued", "waiting"):
queued_events.append((created, +1))
queued_events.append((report_time, -1))
sorted_running = sorted(running_events, key=lambda x: (x[0], x[1]))
sorted_queued = sorted(queued_events, key=lambda x: (x[0], x[1]))
window_start = report_time - timedelta(hours=hours)
window_start = window_start.replace(
minute=(window_start.minute // interval_minutes) * interval_minutes,
second=0,
microsecond=0,
)
snapshot_data: list[dict] = []
t = window_start
while t <= report_time:
running = _count_at_time(sorted_running, t)
queued = _count_at_time(sorted_queued, t)
if running > 0 or queued > 0:
snapshot_data.append(
{
"time": t.strftime("%m-%d %H:%M"),
"running": running,
"queued": queued,
}
)
t += timedelta(minutes=interval_minutes)
if snapshot_data:
results[label] = snapshot_data
return results
def _count_at_time(
sorted_events: list[tuple[datetime, int]],
t: datetime,
) -> int:
"""Count concurrent items at an exact point in time using event sweep."""
count = 0
for ts, delta in sorted_events:
if ts > t:
break
count += delta
return max(count, 0)
def process_results(
    results: list[dict], repo: str, report_time: datetime = None
) -> dict:
    """
    Turn raw job records into the structures the report printers consume.

    Returns a dictionary containing:
    - status_summary: job_name -> counters per status/conclusion bucket
    - sorted_results: the input sorted by created_at, newest first
    - active_jobs: in_progress/queued/waiting jobs that are not stuck
    - stuck_jobs: jobs flagged ``is_stuck``
    - failed_jobs: jobs with conclusion "failure" that are not stuck
    - processed_jobs: sorted jobs enriched with display fields
      (formatted times, queue_time, duration, url, pr_info, status_display)
    """
    if report_time is None:
        report_time = datetime.now(timezone.utc)

    if not results:
        return {
            "status_summary": {},
            "sorted_results": [],
            "active_jobs": [],
            "stuck_jobs": [],
            "failed_jobs": [],
            "processed_jobs": [],
        }

    counter_names = (
        "in_progress",
        "queued",
        "waiting",
        "stuck",
        "success",
        "failure",
        "cancelled",
        "skipped",
    )
    # Conclusions of completed jobs and the counter each one feeds.
    conclusion_bucket = {
        "success": "success",
        "failure": "failure",
        "skipped": "skipped",
        "cancelled": "cancelled",
        "timed_out": "cancelled",
        "action_required": "cancelled",
        "neutral": "cancelled",
        "stale": "cancelled",
    }

    status_summary = {}
    active_jobs = []
    stuck_jobs = []
    failed_jobs = []
    for r in results:
        job_name = r["job_name"]
        status = r["status"]
        conclusion = r.get("conclusion", "-")
        is_stuck = r.get("is_stuck", False)

        counts = status_summary.setdefault(job_name, dict.fromkeys(counter_names, 0))
        if is_stuck:
            bucket = "stuck"
        elif status == "completed":
            bucket = conclusion_bucket.get(conclusion)
        elif status in counts:
            bucket = status
        else:
            bucket = None
        if bucket is not None:
            counts[bucket] += 1

        # The three categories are mutually exclusive with respect to "stuck".
        if is_stuck:
            stuck_jobs.append(r)
        else:
            if r.get("status") in ("in_progress", "queued", "waiting"):
                active_jobs.append(r)
            if conclusion == "failure":
                failed_jobs.append(r)

    sorted_results = sorted(results, key=lambda x: x["created_at"], reverse=True)

    processed_jobs = []
    for r in sorted_results:
        processed = dict(r)
        processed["created_formatted"] = format_time(r["created_at"])
        processed["started_formatted"] = format_time(r["started_at"])
        processed["queue_time"] = calculate_queue_time(r, report_time)
        processed["duration"] = calculate_duration(r["started_at"], r["completed_at"])
        # Prefer the direct job link; fall back to the run page.
        processed["url"] = (
            r.get("html_url") or f"https://github.com/{repo}/actions/runs/{r['run_id']}"
        )
        if r["pr_number"]:
            processed["pr_info"] = f"PR#{r['pr_number']}"
        else:
            processed["pr_info"] = r["branch"] if r["branch"] else "-"
        processed["status_display"] = (
            f"STUCK ({r['status']})" if r.get("is_stuck", False) else r["status"]
        )
        processed_jobs.append(processed)

    return {
        "status_summary": status_summary,
        "sorted_results": sorted_results,
        "active_jobs": active_jobs,
        "stuck_jobs": stuck_jobs,
        "failed_jobs": failed_jobs,
        "processed_jobs": processed_jobs,
    }
def summarize_fetch_metadata(
    fetch_metadata: Optional[dict[str, Any]], workflows: list[str] = None
) -> Optional[dict[str, Any]]:
    """Condense snapshot completeness information for a set of workflows.

    The scope is *workflows* when given, otherwise the snapshot's
    ``requested_workflows``, otherwise every workflow that has stats.

    Returns:
        ``None`` when there is no metadata; otherwise a dict with summed
        run/job counters, the in-scope skipped runs and workflow failures,
        their rate-limit counts and an ``incomplete`` flag.
    """
    if not fetch_metadata:
        return None

    if workflows:
        scope = set(workflows)
    else:
        scope = set(fetch_metadata.get("requested_workflows", []))
    stats_by_workflow = fetch_metadata.get("workflow_stats", {})
    if not scope:
        scope = set(stats_by_workflow)

    def _in_scope(entries: list) -> list:
        return [entry for entry in entries if entry.get("workflow") in scope]

    def _rate_limited(entries: list) -> int:
        return sum(1 for entry in entries if entry.get("reason") == "rate_limit")

    def _total(field: str) -> int:
        return sum(
            stats_by_workflow[name].get(field, 0)
            for name in scope
            if name in stats_by_workflow
        )

    skipped = _in_scope(fetch_metadata.get("skipped_runs", []))
    failures = _in_scope(fetch_metadata.get("workflow_fetch_failures", []))

    return {
        "known_runs": _total("total_runs_seen"),
        "runs_with_jobs": _total("runs_with_jobs"),
        "jobs_collected": _total("jobs_collected"),
        "skipped_runs": skipped,
        "workflow_failures": failures,
        "skipped_run_rate_limit": _rate_limited(skipped),
        "workflow_failure_rate_limit": _rate_limited(failures),
        "incomplete": bool(skipped or failures),
    }
def append_fetch_metadata_notice(
    lines: list[str],
    fetch_metadata: Optional[dict[str, Any]],
    workflows: list[str] = None,
) -> None:
    """Add a markdown block-quote to *lines* when the data set is incomplete.

    Nothing is appended when there is no metadata or when no run or
    workflow in scope was skipped. Otherwise the notice states how many
    runs were fetched, which workflows could not be listed, and up to ten
    of the missing run ids, followed by an empty line.
    """
    summary = summarize_fetch_metadata(fetch_metadata, workflows)
    if not (summary and summary["incomplete"]):
        return

    emit = lines.append
    skipped_runs = summary["skipped_runs"]
    workflow_failures = summary["workflow_failures"]
    # Whatever was not rate-limited is reported as "other API errors".
    other_skipped = len(skipped_runs) - summary["skipped_run_rate_limit"]
    other_workflow_failures = (
        len(workflow_failures) - summary["workflow_failure_rate_limit"]
    )

    emit(
        "> **Data completeness:** Incomplete. GitHub API rate limit and/or fetch errors prevented a full dataset."
    )
    if summary["known_runs"] > 0:
        emit(
            f"> Successfully fetched jobs for **{summary['runs_with_jobs']}/{summary['known_runs']}** known runs in scope. Missing runs: **{len(skipped_runs)}** (rate limit: {summary['skipped_run_rate_limit']}, other API errors: {other_skipped})."
        )
    if workflow_failures:
        workflow_names = ", ".join(
            f"`{failure['workflow']}`" for failure in workflow_failures
        )
        emit(
            f"> Could not list workflow runs for {workflow_names}. Missing run count is unknown for those workflows (rate limit: {summary['workflow_failure_rate_limit']}, other API errors: {other_workflow_failures})."
        )
    if skipped_runs:
        shown = skipped_runs[:10]
        skipped_ids = ", ".join(f"`{run['run_id']}`" for run in shown)
        remaining = len(skipped_runs) - 10
        suffix = f", and {remaining} more" if remaining > 0 else ""
        emit(f"> Missing run IDs: {skipped_ids}{suffix}.")
        emit(
            "> Missing job counts inside skipped runs are unknown because GitHub did not return those run job lists."
        )
    emit("")
def print_table(
    results: list[dict], repo: str, generated_time: str, report_time: datetime = None
):
    """Print the per-job report to stdout as plain-text tables.

    Args:
        results: Job records to report on. When empty, only the header and a
            "no jobs" message are printed.
        repo: Repository name, forwarded to ``process_results``.
        generated_time: Preformatted timestamp shown in the report header.
        report_time: Reference time forwarded to ``process_results``.

    Prints, in order: a per-job-name status summary, the detailed job list,
    links for active jobs (if any) and a list of stuck/ghost jobs (if any).
    """
    print("")
    print(f"Report generated: {generated_time} UTC")
    print("Note: All times are in UTC")
    print("")

    if not results:
        print("No jobs found matching the filter.")
        return

    # Process data
    data = process_results(results, repo, report_time)
    status_summary = data["status_summary"]
    processed_jobs = data["processed_jobs"]
    active_jobs = data["active_jobs"]
    stuck_jobs = data["stuck_jobs"]

    # Index processed jobs once by (run_id, job_name) instead of re-scanning
    # the whole list for every active/stuck job. setdefault keeps the FIRST
    # entry per key, matching what a front-to-back linear search would return.
    processed_by_key: dict = {}
    for p in processed_jobs:
        processed_by_key.setdefault((p["run_id"], p["job_name"]), p)

    def print_banner(title: str) -> None:
        print("\n" + "=" * 100)
        print(title)
        print("=" * 100)

    # Print summary table
    print_banner("SUMMARY BY JOB NAME")
    summary_data = [
        [
            job_name,
            counts["in_progress"],
            counts["queued"],
            counts["waiting"],
            counts["stuck"],
            counts["success"],
            counts["failure"],
            counts["cancelled"],
            counts["skipped"],
        ]
        for job_name, counts in sorted(status_summary.items())
    ]
    print(
        tabulate(
            summary_data,
            headers=[
                "Job Name",
                "Running",
                "Queued",
                "Waiting",
                "Stuck",
                "Success",
                "Failure",
                "Cancelled",
                "Skipped",
            ],
            tablefmt="grid",
        )
    )

    # Print detailed table
    print_banner("DETAILED JOB LIST")
    detail_data = [
        [
            p["job_name"],
            p["status_display"],
            p["conclusion"],
            p["created_formatted"],
            p["started_formatted"],
            p["queue_time"],
            p["duration"],
            p["runner_name"] or "-",
            p["pr_info"],
            p["run_id"],
        ]
        for p in processed_jobs
    ]
    print(
        tabulate(
            detail_data,
            headers=[
                "Job Name",
                "Status",
                "Conclusion",
                "Created",
                "Started",
                "Queue",
                "Duration",
                "Runner",
                "PR/Branch",
                "Run ID",
            ],
            tablefmt="grid",
        )
    )

    # Print links for active jobs (use processed_jobs for correct queue_time)
    if active_jobs:
        print_banner("ACTIVE JOB LINKS")
        link_data = []
        for r in active_jobs:
            # Use the processed record so pre-calculated fields are shown.
            p = processed_by_key.get((r["run_id"], r["job_name"]))
            if p:
                link_data.append(
                    [
                        p["job_name"],
                        p["status"],
                        p["queue_time"],
                        p["pr_info"],
                        p["runner_name"] or "-",
                        p["url"],
                    ]
                )
        print(
            tabulate(
                link_data,
                headers=["Job Name", "Status", "Queue", "PR/Branch", "Runner", "URL"],
                tablefmt="simple",
            )
        )

    # Print stuck jobs (use processed_jobs for correct data)
    if stuck_jobs:
        print_banner(
            "STUCK/GHOST JOBS (in_progress but no runner or workflow cancelled)"
        )
        stuck_data = []
        for r in stuck_jobs:
            p = processed_by_key.get((r["run_id"], r["job_name"]))
            if p:
                # Run-level status comes from the raw record, not the processed one.
                run_info = f"{r.get('run_status', '-')}/{r.get('run_conclusion', '-')}"
                stuck_data.append(
                    [
                        p["job_name"],
                        p["status"],
                        run_info,
                        p["pr_info"],
                        p["runner_name"] or "-",
                        p["url"],
                    ]
                )
        print(
            tabulate(
                stuck_data,
                headers=[
                    "Job Name",
                    "Job Status",
                    "Run Status/Conclusion",
                    "PR/Branch",
                    "Runner",
                    "URL",
                ],
                tablefmt="simple",
            )
        )
def format_markdown(
    results: list[dict],
    repo: str,
    job_filter: str,
    hours: int,
    generated_time: str,
    report_time: datetime = None,
    fetch_metadata: dict[str, Any] = None,
    workflow: str = None,
) -> str:
    """Format results as markdown for GitHub Actions summary.

    Args:
        results: Job records to report on.
        repo: Repository name, forwarded to ``process_results``.
        job_filter: Job name filter, shown in the report title.
        hours: Size of the time window, shown in the header.
        generated_time: Preformatted timestamp shown in the header.
        report_time: Reference time forwarded to ``process_results``.
        fetch_metadata: Fetch bookkeeping passed to
            ``append_fetch_metadata_notice`` for the completeness notice.
        workflow: Workflow the completeness notice is scoped to, if given.

    Returns:
        The report as a single newline-joined markdown string. With no
        results, only the header, any completeness notice and a "no jobs"
        note are returned.
    """
    lines = []

    # Header
    lines.append(f"# Job Status Report: `{job_filter}`")
    lines.append("")
    lines.append(f"**Time window:** Last {hours} hours")
    lines.append(f"**Generated:** {generated_time} UTC")
    lines.append(f"**Total jobs found:** {len(results)}")
    lines.append("")
    lines.append("> **Note:** All times are displayed in UTC")
    lines.append("")
    append_fetch_metadata_notice(
        lines, fetch_metadata, [workflow] if workflow else None
    )

    if not results:
        lines.append("> No jobs found matching the filter.")
        return "\n".join(lines)

    # Process data using shared function
    data = process_results(results, repo, report_time)
    status_summary = data["status_summary"]
    processed_jobs = data["processed_jobs"]
    active_jobs = data["active_jobs"]
    stuck_jobs = data["stuck_jobs"]
    failed_jobs = data["failed_jobs"]

    # Index processed jobs once by (run_id, job_name) so the three sections
    # below do not each re-scan the full list per job. setdefault keeps the
    # FIRST entry per key, matching a front-to-back linear search.
    processed_by_key: dict = {}
    for p in processed_jobs:
        processed_by_key.setdefault((p["run_id"], p["job_name"]), p)

    def processed_for(raw_job: dict) -> Optional[dict]:
        return processed_by_key.get((raw_job["run_id"], raw_job["job_name"]))

    def bold_if_positive(count: int) -> str:
        # Non-zero counts that need attention are highlighted in bold.
        return f"**{count}**" if count > 0 else "0"

    # Summary table
    lines.append("## Summary by Job Name")
    lines.append("")
    lines.append(
        "> **Status meanings:** Running = executing, Queued = waiting for runner, Waiting = waiting for dependent jobs, Stuck = ghost job, Cancelled = cancelled/timed_out, Skipped = skipped by workflow conditions"
    )
    lines.append("")
    lines.append(
        "| Job Name | Running | Queued | Waiting | Stuck | Success | Failure | Cancelled | Skipped |"
    )
    lines.append(
        "|----------|---------|--------|---------|-------|---------|---------|-----------|---------|"
    )
    for job_name, counts in sorted(status_summary.items()):
        running = bold_if_positive(counts["in_progress"])
        queued = bold_if_positive(counts["queued"])
        waiting = bold_if_positive(counts["waiting"])
        stuck = bold_if_positive(counts["stuck"])
        success = str(counts["success"])
        failure = bold_if_positive(counts["failure"])
        cancelled = str(counts["cancelled"])
        skipped = str(counts["skipped"])
        lines.append(
            f"| `{job_name}` | {running} | {queued} | {waiting} | {stuck} | {success} | {failure} | {cancelled} | {skipped} |"
        )
    lines.append("")

    # Active jobs section
    if active_jobs:
        lines.append("## Active Jobs")
        lines.append("")
        lines.append(
            "| Status | Job Name | Created | Started | Queue | PR/Branch | Runner | Link |"
        )
        lines.append(
            "|--------|----------|---------|---------|-------|-----------|--------|------|"
        )
        for r in sorted(
            active_jobs, key=lambda x: (x["status"], x["created_at"]), reverse=True
        ):
            p = processed_for(r)
            if p:
                lines.append(
                    f"| {p['status']} | `{p['job_name']}` | {p['created_formatted']} | {p['started_formatted']} | {p['queue_time']} | {p['pr_info']} | `{p['runner_name'] or '-'}` | [View]({p['url']}) |"
                )
        lines.append("")

    # Stuck/Ghost jobs section
    if stuck_jobs:
        lines.append("## Stuck/Ghost Jobs")
        lines.append("")
        lines.append(
            "> Jobs showing `in_progress` but have no runner assigned or workflow run is cancelled"
        )
        lines.append("")
        lines.append(
            "| Job Status | Run Status | Job Name | PR/Branch | Runner | Link |"
        )
        lines.append(
            "|------------|------------|----------|-----------|--------|------|"
        )
        for r in sorted(stuck_jobs, key=lambda x: x["created_at"], reverse=True):
            p = processed_for(r)
            if p:
                # Run-level status is read from the raw record.
                run_info = f"{r.get('run_status', '-')}/{r.get('run_conclusion', '-')}"
                lines.append(
                    f"| {p['status']} | {run_info} | `{p['job_name']}` | {p['pr_info']} | `{p['runner_name'] or '-'}` | [View]({p['url']}) |"
                )
        lines.append("")

    # Failed jobs section (before All Jobs)
    if failed_jobs:
        lines.append(f"## Failed Jobs ({len(failed_jobs)} total)")
        lines.append("")
        lines.append(
            "| Conclusion | Job Name | Created | Started | Queue | Duration | Runner | PR/Branch | Link |"
        )
        lines.append(
            "|------------|----------|---------|---------|-------|----------|--------|-----------|------|"
        )
        for r in sorted(failed_jobs, key=lambda x: x["created_at"], reverse=True):
            p = processed_for(r)
            if p:
                lines.append(
                    f"| {p['conclusion']} | `{p['job_name']}` | {p['created_formatted']} | {p['started_formatted']} | {p['queue_time']} | {p['duration']} | `{p['runner_name'] or '-'}` | {p['pr_info']} | [View]({p['url']}) |"
                )
        lines.append("")

    # Detailed table (all jobs) - collapsible
    lines.append("<details>")
    lines.append(
        f"<summary><strong>All Jobs ({len(results)} total)</strong> - Click to expand</summary>"
    )
    lines.append("")
    lines.append(
        "| Job Name | Status | Conclusion | Created | Started | Queue | Duration | Runner | PR/Branch | Link |"
    )
    lines.append(
        "|----------|--------|------------|---------|---------|-------|----------|--------|-----------|------|"
    )
    for p in processed_jobs:
        # Mark stuck jobs in markdown with bold
        if p.get("is_stuck", False):
            status_display = f"**STUCK** ({p['status']})"
        else:
            status_display = p["status"]
        lines.append(
            f"| `{p['job_name']}` | {status_display} | {p['conclusion']} | {p['created_formatted']} | {p['started_formatted']} | {p['queue_time']} | {p['duration']} | `{p['runner_name'] or '-'}` | {p['pr_info']} | [View]({p['url']}) |"
        )
    lines.append("")
    lines.append("</details>")
    lines.append("")

    return "\n".join(lines)
def format_runner_report_markdown(
    jobs: list[dict],
    workflows: list[str],
    hours: int,
    generated_time: str,
    report_time: datetime = None,
    fetch_metadata: dict[str, Any] = None,
) -> str:
    """Format runner fleet analytics as markdown for GitHub Actions summary.

    Args:
        jobs: Job records to analyze (already filtered by the caller).
        workflows: Workflow names listed in the header and used to scope the
            data-completeness notice.
        hours: Size of the time window; shown in the report and forwarded to
            ``analyze_utilization_snapshots``.
        generated_time: Preformatted timestamp shown in the header.
        report_time: Reference time for the analyses; defaults to the current
            UTC time when omitted.
        fetch_metadata: Fetch bookkeeping. Its ``jobs_excluded_no_label``
            entry, when non-zero, is shown in the fleet overview.

    Returns:
        The report as a single newline-joined markdown string. Each analysis
        section is emitted only when its analysis returns data.
    """
    if report_time is None:
        report_time = datetime.now(timezone.utc)

    lines: list[str] = []

    # Header
    lines.append("# CI Runner Fleet Report")
    lines.append("")
    lines.append(f"**Workflows:** {', '.join(f'`{w}`' for w in workflows)}")
    lines.append(f"**Time window:** Last {hours} hours")
    lines.append(f"**Generated:** {generated_time} UTC")
    excluded_no_label = (
        fetch_metadata.get("jobs_excluded_no_label", 0) if fetch_metadata else 0
    )
    lines.append(f"**Total jobs analyzed:** {len(jobs)}")
    lines.append("")
    lines.append(
        "> All times are in UTC. Jobs on `ubuntu-latest` and jobs with no runner label "
        "(waiting/unassigned) are excluded."
    )
    lines.append("")
    append_fetch_metadata_notice(lines, fetch_metadata, workflows)

    if not jobs:
        lines.append("> No self-hosted runner jobs found in the time window.")
        return "\n".join(lines)

    # --- Fleet Overview ---
    unique_labels = {_get_runner_label(j) for j in jobs}
    completed_jobs = [j for j in jobs if j.get("status") == "completed"]
    lines.append("## Fleet Overview")
    lines.append("")
    lines.append("| Metric | Value |")
    lines.append("|--------|-------|")
    lines.append(f"| Total runner labels seen | {len(unique_labels)} |")
    lines.append(f"| Total jobs analyzed | {len(jobs)} |")
    lines.append(f"| Completed jobs | {len(completed_jobs)} |")
    if excluded_no_label:
        lines.append(f"| Excluded (no runner label) | {excluded_no_label} |")
    lines.append(f"| Time window | {hours}h |")
    lines.append("")

    # --- Concurrency by Runner Label ---
    concurrency = analyze_concurrency(jobs, report_time)
    if concurrency:
        lines.append("## Concurrency by Runner Label")
        lines.append("")
        lines.append(
            "| Runner Label | Peak Concurrent | Avg Concurrent | Total Jobs | Avg Queue | P50 Queue | P99 Queue | Avg Duration |"
        )
        lines.append(
            "|-------------|----------------|---------------|-----------|-----------|-----------|-----------|-------------|"
        )
        for label in sorted(concurrency, key=_runner_label_sort_key):
            c = concurrency[label]
            lines.append(
                f"| `{label}` | **{c['peak']}** | {c['avg_concurrent']} "
                f"| {c['total_jobs']} "
                f"| {_format_duration_seconds(c['avg_queue_seconds'])} "
                f"| {_format_duration_seconds(c['p50_queue_seconds'])} "
                f"| {_format_duration_seconds(c['p99_queue_seconds'])} "
                f"| {_format_duration_seconds(c['avg_duration_seconds'])} |"
            )
        lines.append("")

    # --- Busy Periods ---
    busy_periods = analyze_busy_periods(jobs, report_time=report_time)
    if busy_periods:
        lines.append("## Busy Periods (UTC)")
        lines.append("")
        lines.append("| Hour (UTC) | Jobs Started | Avg Queue Time | Load |")
        lines.append("|-----------|-------------|---------------|------|")
        for bp in busy_periods:
            # Hours in which nothing started are left out of the table.
            if bp["jobs_started"] == 0:
                continue
            load_display = (
                f"**{bp['load']}**" if bp["load"] in ("Peak", "Busy") else bp["load"]
            )
            lines.append(
                f"| {bp['hour_label']} | {bp['jobs_started']} "
                f"| {_format_duration_seconds(bp['avg_queue_seconds'])} "
                f"| {load_display} |"
            )
        lines.append("")

        peak_hours = [bp for bp in busy_periods if bp["load"] == "Peak"]
        quiet_hours = [
            bp
            for bp in busy_periods
            if bp["load"] == "Quiet" and bp["jobs_started"] > 0
        ]
        if peak_hours:
            labels = ", ".join(bp["hour_label"] for bp in peak_hours)
            lines.append(f"> **Peak hours:** {labels}")
            lines.append("")
        if quiet_hours:
            labels = ", ".join(bp["hour_label"] for bp in quiet_hours)
            lines.append(f"> **Quiet hours:** {labels}")
            lines.append("")

    # --- Runner Utilization Snapshots ---
    util_snapshots = analyze_utilization_snapshots(jobs, report_time, hours=hours)
    if util_snapshots:
        lines.append("## Runner Utilization (15-min snapshots)")
        lines.append("")
        lines.append(
            "> Point-in-time snapshot every 15 minutes (UTC). "
            "**Running** = jobs with a runner assigned and executing. "
            "**Queued** = jobs waiting for a runner."
        )
        lines.append("")
        for label in sorted(util_snapshots, key=_runner_label_sort_key):
            snapshots = util_snapshots[label]
            lines.append(f"### `{label}`")
            lines.append("")
            lines.append("| Time (UTC) | Running | Queued |")
            lines.append("|-----------|---------|--------|")
            for s in snapshots:
                lines.append(f"| {s['time']} | **{s['running']}** | {s['queued']} |")
            lines.append("")

    # --- Queue Time Distribution ---
    queue_dist = analyze_queue_distribution(jobs, report_time=report_time)
    if queue_dist:
        lines.append("## Queue Time Distribution by Runner Label")
        lines.append("")
        for label in sorted(queue_dist, key=_runner_label_sort_key):
            dist = queue_dist[label]
            lines.append(f"### `{label}`")
            lines.append("")
            lines.append(
                f"> **Samples:** {dist['total']} | **P50:** {_format_duration_seconds(dist['p50'])} | **P90:** {_format_duration_seconds(dist['p90'])} | **P99:** {_format_duration_seconds(dist['p99'])}"
            )
            lines.append("")
            lines.append("| Queue Time Range | Count | Percentage |")
            lines.append("|-----------------|-------|------------|")
            for b in dist["buckets"]:
                # Text bar chart: one '#' per 3 percentage points.
                bar = "#" * int(b["percentage"] / 3)
                lines.append(
                    f"| {b['range']} | {b['count']} | {b['percentage']}% {bar} |"
                )
            lines.append("")

    # --- Failed Jobs Detail (collapsible) ---
    failed_jobs = [
        j
        for j in jobs
        if j.get("conclusion") == "failure" and not j.get("is_stuck", False)
    ]
    if failed_jobs:
        lines.append("<details>")
        lines.append(
            f"<summary><strong>Failed Jobs ({len(failed_jobs)} total)</strong> - Click to expand</summary>"
        )
        lines.append("")
        lines.append(
            "| Job Name | Runner | Workflow | Queue | Duration | PR/Branch | Link |"
        )
        lines.append(
            "|----------|--------|---------|-------|----------|-----------|------|"
        )
        for j in sorted(failed_jobs, key=lambda x: x["created_at"], reverse=True):
            queue = calculate_queue_time(j, report_time)
            dur = calculate_duration(j["started_at"], j["completed_at"])
            pr_info = (
                f"PR#{j['pr_number']}" if j.get("pr_number") else j.get("branch", "-")
            )
            url = j.get("html_url", "")
            wf = j.get("workflow", "-")
            # Show "-" rather than `None` when no runner picked the job up,
            # matching how the per-job report renders a missing runner.
            runner = j.get("runner_name") or "-"
            lines.append(
                f"| `{j['job_name']}` | `{runner}` | `{wf}` "
                f"| {queue} | {dur} | {pr_info} | [View]({url}) |"
            )
        lines.append("")
        lines.append("</details>")
        lines.append("")

    # --- Stuck Jobs ---
    stuck_jobs = [j for j in jobs if j.get("is_stuck", False)]
    if stuck_jobs:
        lines.append("## Stuck/Ghost Jobs")
        lines.append("")
        lines.append(
            "> Jobs showing `in_progress` but have no runner assigned or workflow run is cancelled"
        )
        lines.append("")
        lines.append(
            "| Job Name | Job Status | Run Status | Runner | Workflow | Link |"
        )
        lines.append("|----------|-----------|-----------|--------|---------|------|")
        for j in sorted(stuck_jobs, key=lambda x: x["created_at"], reverse=True):
            run_info = f"{j.get('run_status', '-')}/{j.get('run_conclusion', '-')}"
            url = j.get("html_url", "")
            wf = j.get("workflow", "-")
            # Stuck jobs are the ones most likely to have no runner assigned,
            # so fall back to "-" instead of printing `None`.
            runner = j.get("runner_name") or "-"
            lines.append(
                f"| `{j['job_name']}` | {j['status']} | {run_info} "
                f"| `{runner}` | `{wf}` | [View]({url}) |"
            )
        lines.append("")

    return "\n".join(lines)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the job status / runner report CLI."""
    parser = argparse.ArgumentParser(description="Query GitHub Actions job status")
    parser.add_argument(
        "--repo",
        default="sgl-project/sglang",
        help="GitHub repo (default: sgl-project/sglang)",
    )
    parser.add_argument(
        "--job",
        required=False,
        default=None,
        help="Job name filter (required unless --runner-report is used)",
    )
    parser.add_argument(
        "--workflow",
        default="pr-test-amd.yml",
        help="Workflow file name, or comma-separated list for --runner-report (default: pr-test-amd.yml)",
    )
    parser.add_argument(
        "--hours",
        type=int,
        default=24,
        help="Time window in hours (default: 24)",
    )
    parser.add_argument(
        "--status",
        choices=["in_progress", "queued", "completed", "waiting"],
        help="Filter by job status",
    )
    parser.add_argument(
        "--output",
        choices=["table", "csv", "json", "markdown"],
        default="table",
        help="Output format (default: table)",
    )
    parser.add_argument(
        "--summary",
        action="store_true",
        help="Write markdown output to GITHUB_STEP_SUMMARY",
    )
    parser.add_argument(
        "--output-file",
        type=str,
        help="Write output to file",
    )
    parser.add_argument(
        "--runner-report",
        action="store_true",
        help="Generate runner fleet analytics report across all jobs (no --job filter needed)",
    )
    parser.add_argument(
        "--input-data-file",
        type=str,
        help="Load a prefetched Actions snapshot JSON instead of calling gh api",
    )
    parser.add_argument(
        "--dump-data-file",
        type=str,
        help="Fetch Actions data once and save it as a snapshot JSON file",
    )
    return parser


def _format_results_csv(results: list[dict], report_time: datetime) -> str:
    """Render job records as CSV text, newest first, without a trailing newline.

    Rows go through ``csv.writer`` so that commas, quotes or newlines inside a
    field (job names, branch names, ...) are quoted/escaped instead of
    silently shifting columns. Fields are quoted only when they need it.
    """
    # Imported locally: these modules are needed only for CSV output.
    import csv
    import io

    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator="\n")
    writer.writerow(
        [
            "job_name",
            "status",
            "is_stuck",
            "conclusion",
            "created_at",
            "started_at",
            "queue_time",
            "duration",
            "runner",
            "run_status",
            "run_conclusion",
            "pr_number",
            "branch",
            "url",
        ]
    )
    for r in sorted(results, key=lambda x: x["created_at"], reverse=True):
        row = [
            r["job_name"],
            r["status"],
            "true" if r.get("is_stuck", False) else "false",
            r["conclusion"],
            r["created_at"],
            r["started_at"],
            calculate_queue_time(r, report_time),
            calculate_duration(r["started_at"], r["completed_at"]),
            r["runner_name"],
            r.get("run_status", "-"),
            r.get("run_conclusion", "-"),
            r["pr_number"] or "",
            r["branch"],
            r["html_url"],
        ]
        # str() keeps the textual form each value had when it was interpolated
        # into an f-string (csv.writer would otherwise blank out None).
        writer.writerow([str(value) for value in row])
    return buffer.getvalue().rstrip("\n")


def _write_output_file(path: str, content: str) -> None:
    """Write ``content`` to ``path`` (UTF-8) and note it on stderr."""
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"\nOutput written to {path}", file=sys.stderr)


def _append_step_summary(md_content: str) -> bool:
    """Append markdown to the file named by ``GITHUB_STEP_SUMMARY``.

    Returns:
        True when the variable was set and the content was appended, False
        when the variable is unset or empty (nothing is written).
    """
    summary_file = os.environ.get("GITHUB_STEP_SUMMARY")
    if not summary_file:
        return False
    with open(summary_file, "a", encoding="utf-8") as f:
        f.write(md_content)
        f.write("\n")
    print("Summary written to GITHUB_STEP_SUMMARY", file=sys.stderr)
    return True


def main():
    """Command-line entry point.

    Runs in one of three modes, chosen from the parsed arguments:

    * ``--dump-data-file``: fetch a snapshot, save it, report completeness on
      stderr and exit.
    * ``--runner-report``: print the runner fleet markdown report.
    * otherwise: print the per-job report for ``--job`` in the ``--output``
      format.

    Data comes from ``--input-data-file`` when given, otherwise it is fetched,
    which requires a working ``gh`` CLI (the process exits with status 1 if
    that check fails).
    """
    # Capture the time when the command is run (both datetime and formatted string)
    report_time = datetime.now(timezone.utc)
    report_generated_time = report_time.strftime("%Y-%m-%d %H:%M:%S")

    parser = _build_arg_parser()
    args = parser.parse_args()

    if args.input_data_file and args.dump_data_file:
        parser.error("--input-data-file and --dump-data-file cannot be used together")
    if not args.runner_report and not args.job and not args.dump_data_file:
        parser.error(
            "--job is required unless --runner-report or --dump-data-file is specified"
        )

    workflows = [w.strip() for w in args.workflow.split(",") if w.strip()]

    # The gh CLI is needed only when data has to be fetched.
    if not args.input_data_file and not check_gh_cli_available():
        sys.exit(1)

    snapshot = None
    repo = args.repo
    fetch_metadata = None
    if args.input_data_file:
        snapshot = load_snapshot(args.input_data_file)
        repo = snapshot.get("repo", args.repo)
        fetch_metadata = snapshot.get("fetch_metadata")
        generated_at = snapshot.get("generated_at")
        if generated_at:
            # Report relative to when the snapshot was taken, not to "now".
            report_time = parse_time(generated_at) or report_time
            report_generated_time = report_time.strftime("%Y-%m-%d %H:%M:%S")

    # --- Snapshot dump mode ---
    if args.dump_data_file:
        snapshot = fetch_all_jobs_snapshot(repo, workflows, args.hours)
        save_snapshot(args.dump_data_file, snapshot)
        summary = summarize_fetch_metadata(snapshot.get("fetch_metadata"), workflows)
        print(f"Snapshot written to {args.dump_data_file}", file=sys.stderr)
        if summary and summary["incomplete"]:
            print(
                "Warning: Snapshot is incomplete due to rate limit/API fetch failures.",
                file=sys.stderr,
            )
            if summary["known_runs"] > 0:
                print(
                    f"Known runs fetched successfully: {summary['runs_with_jobs']}/{summary['known_runs']}",
                    file=sys.stderr,
                )
            print(
                f"Skipped runs with unknown job counts: {len(summary['skipped_runs'])}",
                file=sys.stderr,
            )
        return

    # --- Runner fleet report mode ---
    if args.runner_report:
        if snapshot is None:
            snapshot = fetch_all_jobs_snapshot(repo, workflows, args.hours)
            fetch_metadata = snapshot.get("fetch_metadata")
        workflow_set = set(workflows)
        all_snapshot_jobs = [
            job for job in snapshot["jobs"] if job.get("workflow") in workflow_set
        ]
        # Only jobs that carry runner labels take part in the fleet report.
        jobs = [job for job in all_snapshot_jobs if job.get("labels")]
        if fetch_metadata is None:
            fetch_metadata = {}
        if "jobs_excluded_no_label" not in fetch_metadata:
            fetch_metadata["jobs_excluded_no_label"] = len(all_snapshot_jobs) - len(
                jobs
            )

        md_content = format_runner_report_markdown(
            jobs,
            workflows,
            args.hours,
            report_generated_time,
            report_time,
            fetch_metadata,
        )
        print(md_content)

        if args.output_file:
            _write_output_file(args.output_file, md_content)

        if args.summary and not _append_step_summary(md_content):
            print(
                "Warning: GITHUB_STEP_SUMMARY not set, markdown printed above.",
                file=sys.stderr,
            )
        return

    # --- Original per-job report mode ---
    if snapshot is None:
        snapshot = fetch_all_jobs_snapshot(repo, [args.workflow], args.hours)
        fetch_metadata = snapshot.get("fetch_metadata")

    results = filter_jobs(snapshot["jobs"], args.job, args.workflow, args.status)

    output_content = None
    md_content = None
    if args.output == "table":
        print_table(results, repo, report_generated_time, report_time)
    elif args.output == "csv":
        output_content = _format_results_csv(results, report_time)
        print(output_content)
    elif args.output == "json":
        json_results = []
        for r in sorted(results, key=lambda x: x["created_at"], reverse=True):
            r_copy = r.copy()
            r_copy["queue_time"] = calculate_queue_time(r, report_time)
            r_copy["duration"] = calculate_duration(r["started_at"], r["completed_at"])
            r_copy["created_at_formatted"] = format_time(r["created_at"])
            r_copy["started_at_formatted"] = format_time(r["started_at"])
            json_results.append(r_copy)
        output_content = json.dumps(json_results, indent=2)
        print(output_content)
    elif args.output == "markdown":
        md_content = format_markdown(
            results,
            repo,
            args.job,
            args.hours,
            report_generated_time,
            report_time,
            fetch_metadata,
            args.workflow,
        )
        output_content = md_content
        print(output_content)

    if args.output_file:
        if output_content:
            _write_output_file(args.output_file, output_content)
        else:
            # Table output goes straight to stdout; say so instead of
            # silently skipping the requested file.
            print(
                f"Warning: --output-file is ignored for '{args.output}' output; "
                "use csv, json or markdown to write a file.",
                file=sys.stderr,
            )

    if args.summary:
        # Reuse the markdown rendered for --output markdown, if any.
        if md_content is None:
            md_content = format_markdown(
                results,
                repo,
                args.job,
                args.hours,
                report_generated_time,
                report_time,
                fetch_metadata,
                args.workflow,
            )
        if not _append_step_summary(md_content):
            print(
                "Warning: GITHUB_STEP_SUMMARY not set, printing markdown instead:",
                file=sys.stderr,
            )
            print(md_content)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()