diff --git a/python/scripts/nvbench_compare.py b/python/scripts/nvbench_compare.py
index 4043027..50f5762 100644
--- a/python/scripts/nvbench_compare.py
+++ b/python/scripts/nvbench_compare.py
@@ -9,7 +9,7 @@ import os
 import sys
 import warnings
 from collections import Counter
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from enum import Enum
 from functools import cached_property
 from typing import Any, Callable, Mapping
@@ -128,6 +128,18 @@ class ComparisonStatus(str, Enum):
     SLOW = "SLOW"
 
 
+@dataclass(frozen=True)
+class DecisionReason:
+    code: str
+    message: str
+
+
+@dataclass(frozen=True)
+class TimingDecision:
+    status: ComparisonStatus
+    reason: DecisionReason
+
+
 @dataclass(frozen=True)
 class SummaryComparison:
     ref_estimate: TimeEstimate
@@ -140,6 +152,7 @@ class SummaryComparison:
     frac_diff: float
     max_noise: float | None
     status: ComparisonStatus
+    reason: DecisionReason
 
 
 @dataclass
@@ -150,13 +163,18 @@ class ComparisonStats:
     regression_count: int = 0
     undecided_count: int = 0
     unknown_count: int = 0
+    undecided_reasons: Counter[DecisionReason] = field(default_factory=Counter)
 
-    def record(self, status: ComparisonStatus) -> None:
+    def record(
+        self, status: ComparisonStatus, reason: DecisionReason | None = None
+    ) -> None:
         self.config_count += 1
         if status == ComparisonStatus.UNKNOWN:
             self.unknown_count += 1
         elif status == ComparisonStatus.UNDECIDED:
             self.undecided_count += 1
+            if reason is not None:
+                self.undecided_reasons[reason] += 1
         elif status == ComparisonStatus.SAME:
             self.pass_count += 1
         elif status == ComparisonStatus.FAST:
@@ -553,6 +571,12 @@ def compute_timing_interval(timing):
     return None
 
 
+def make_decision(status, code, message):
+    return TimingDecision(
+        status=status, reason=DecisionReason(code=code, message=message)
+    )
+
+
 def compare_intervals_for_clear_gap(ref_interval, cmp_interval):
     # These ratios are equivalent to log(ref/cmp) >= log(1 + delta), but avoid
     # evaluating logarithms on every comparison.
@@ -624,28 +648,52 @@ def confirm_clear_gap_with_clock_rate(
     status, ref_timing, cmp_timing, ref_interval, cmp_interval
 ):
     if ref_timing.sm_clock_rate_mean is None or cmp_timing.sm_clock_rate_mean is None:
-        return ComparisonStatus.UNDECIDED
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "missing_clock_rate",
+            "clear timing gap was not confirmed because SM clock summaries are unavailable",
+        )
 
     ref_cycles = scale_interval(ref_interval, ref_timing.sm_clock_rate_mean)
     cmp_cycles = scale_interval(cmp_interval, cmp_timing.sm_clock_rate_mean)
     if ref_cycles is None or cmp_cycles is None:
-        return ComparisonStatus.UNDECIDED
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "invalid_clock_rate",
+            "clear timing gap was not confirmed because SM clock summaries are invalid",
+        )
 
     cycle_status = compare_intervals_for_clear_gap(ref_cycles, cmp_cycles)
     if cycle_status == status:
-        return status
-    return ComparisonStatus.UNDECIDED
+        return make_decision(
+            status,
+            "clear_gap_confirmed_by_cycles",
+            "clear timing gap was confirmed by SM-clock-adjusted cycle intervals",
+        )
+    return make_decision(
+        ComparisonStatus.UNDECIDED,
+        "cycle_gap_not_confirmed",
+        "clear timing gap was not confirmed by SM-clock-adjusted cycle intervals",
+    )
 
 
 def compare_timings_for_clear_gap(ref_timing, cmp_timing):
     ref_interval = compute_timing_interval(ref_timing)
     cmp_interval = compute_timing_interval(cmp_timing)
     if ref_interval is None or cmp_interval is None:
-        return ComparisonStatus.UNDECIDED
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "missing_interval",
+            "could not construct comparable timing intervals",
+        )
 
     status = compare_intervals_for_clear_gap(ref_interval, cmp_interval)
     if status is None:
-        return ComparisonStatus.UNDECIDED
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "no_clear_gap",
+            "timing intervals do not have a sufficient clear gap",
+        )
 
     return confirm_clear_gap_with_clock_rate(
         status, ref_timing, cmp_timing, ref_interval, cmp_interval
@@ -654,38 +702,81 @@ def compare_timings_for_clear_gap(ref_timing, cmp_timing):
 
 def compare_intervals_for_same(ref_interval, cmp_interval):
     if not centers_are_close(ref_interval.center, cmp_interval.center):
-        return ComparisonStatus.UNDECIDED
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "centers_not_close",
+            "timing centers are not close enough to declare same",
+        )
     if not intervals_overlap_strongly(ref_interval, cmp_interval):
-        return ComparisonStatus.UNDECIDED
-    return ComparisonStatus.SAME
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "weak_interval_overlap",
+            "timing intervals do not overlap strongly enough to declare same",
+        )
+    return make_decision(
+        ComparisonStatus.SAME,
+        "same_summary",
+        "timing centers are close and intervals overlap strongly",
+    )
 
 
 def confirm_same_with_clock_rate(ref_timing, cmp_timing, ref_interval, cmp_interval):
     if ref_timing.sm_clock_rate_mean is None or cmp_timing.sm_clock_rate_mean is None:
-        return ComparisonStatus.SAME
+        return make_decision(
+            ComparisonStatus.SAME,
+            "same_without_clock_rate",
+            "timing centers are close and intervals overlap strongly; SM clock summaries are unavailable",
+        )
 
     ref_cycles = scale_interval(ref_interval, ref_timing.sm_clock_rate_mean)
     cmp_cycles = scale_interval(cmp_interval, cmp_timing.sm_clock_rate_mean)
     if ref_cycles is None or cmp_cycles is None:
-        return ComparisonStatus.UNDECIDED
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "same_invalid_clock_rate",
+            "same decision was not confirmed because SM clock summaries are invalid",
+        )
 
-    return compare_intervals_for_same(ref_cycles, cmp_cycles)
+    decision = compare_intervals_for_same(ref_cycles, cmp_cycles)
+    if decision.status == ComparisonStatus.SAME:
+        return make_decision(
+            ComparisonStatus.SAME,
+            "same_confirmed_by_cycles",
+            "timing and SM-clock-adjusted cycle intervals both support same",
+        )
+    return make_decision(
+        ComparisonStatus.UNDECIDED,
+        "cycle_same_not_confirmed",
+        "same decision was not confirmed by SM-clock-adjusted cycle intervals",
+    )
 
 
 def compare_timings_for_same(ref_timing, cmp_timing, ref_noise, cmp_noise):
     if not has_finite_noise(ref_noise) or not has_finite_noise(cmp_noise):
-        return ComparisonStatus.UNDECIDED
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "noise_unavailable",
+            "relative dispersion is unavailable or non-finite",
+        )
     if max(ref_noise, cmp_noise) > SAME_RELATIVE_DISPERSION_CEILING:
-        return ComparisonStatus.UNDECIDED
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "noise_too_high",
+            "relative dispersion is too high to declare same",
+        )
 
     ref_interval = compute_timing_interval(ref_timing)
     cmp_interval = compute_timing_interval(cmp_timing)
     if ref_interval is None or cmp_interval is None:
-        return ComparisonStatus.UNDECIDED
+        return make_decision(
+            ComparisonStatus.UNDECIDED,
+            "missing_interval",
+            "could not construct comparable timing intervals",
+        )
 
-    status = compare_intervals_for_same(ref_interval, cmp_interval)
-    if status != ComparisonStatus.SAME:
-        return status
+    decision = compare_intervals_for_same(ref_interval, cmp_interval)
+    if decision.status != ComparisonStatus.SAME:
+        return decision
 
     return confirm_same_with_clock_rate(
         ref_timing, cmp_timing, ref_interval, cmp_interval
@@ -790,9 +881,14 @@ def compare_gpu_timings(ref_timing, cmp_timing):
     else:
         max_noise = max(ref_noise, cmp_noise)
 
-    status = compare_timings_for_clear_gap(ref_timing, cmp_timing)
-    if status == ComparisonStatus.UNDECIDED:
-        status = compare_timings_for_same(ref_timing, cmp_timing, ref_noise, cmp_noise)
+    decision = compare_timings_for_clear_gap(ref_timing, cmp_timing)
+    if decision.status == ComparisonStatus.UNDECIDED and decision.reason.code in {
+        "no_clear_gap",
+        "missing_interval",
+    }:
+        decision = compare_timings_for_same(
+            ref_timing, cmp_timing, ref_noise, cmp_noise
+        )
 
     return SummaryComparison(
         ref_estimate=ref_estimate,
@@ -804,7 +900,8 @@ def compare_gpu_timings(ref_timing, cmp_timing):
         diff=diff,
         frac_diff=frac_diff,
         max_noise=max_noise,
-        status=status,
+        status=decision.status,
+        reason=decision.reason,
     )
 
 
@@ -1295,7 +1392,7 @@ def compare_benches(
                         comparison.ref_noise
                     )
 
-                run_data.stats.record(comparison.status)
+                run_data.stats.record(comparison.status, comparison.reason)
 
                 status = colorize_comparison_status(comparison.status, no_color)
                 if abs(comparison.frac_diff) >= threshold:
@@ -1629,6 +1726,10 @@ def main() -> int:
     print(
         f" - Undecided (comparison requires more evidence): {stats.undecided_count}"
     )
+    if stats.undecided_reasons:
+        print(" - Reasons:")
+        for reason, count in stats.undecided_reasons.most_common():
+            print(f" - {reason.code}: {count} ({reason.message})")
     print(f" - Unknown (infinite or unavailable noise): {stats.unknown_count}")
 
     return 0
diff --git a/python/test/test_nvbench_compare.py b/python/test/test_nvbench_compare.py
index 9f34321..9937042 100644
--- a/python/test/test_nvbench_compare.py
+++ b/python/test/test_nvbench_compare.py
@@ -471,6 +471,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     assert undecided.diff == pytest.approx(0.03)
     assert undecided.frac_diff == pytest.approx(0.03)
     assert undecided.max_noise == pytest.approx(0.05)
+    assert undecided.reason.code == "noise_too_high"
 
     ref_interval_timing = make_gpu_timing_data(
         nvbench_compare,
@@ -499,6 +500,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     )
     assert fast is not None
     assert fast.status == nvbench_compare.ComparisonStatus.FAST
+    assert fast.reason.code == "clear_gap_confirmed_by_cycles"
 
     slow = nvbench_compare.compare_gpu_timings(
         ref_interval_timing,
@@ -515,6 +517,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     )
     assert slow is not None
     assert slow.status == nvbench_compare.ComparisonStatus.SLOW
+    assert slow.reason.code == "clear_gap_confirmed_by_cycles"
 
     same = nvbench_compare.compare_gpu_timings(
         ref_interval_timing,
@@ -531,6 +534,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     )
     assert same is not None
     assert same.status == nvbench_compare.ComparisonStatus.SAME
+    assert same.reason.code == "same_confirmed_by_cycles"
 
     weak_overlap = nvbench_compare.compare_gpu_timings(
         make_gpu_timing_data(
@@ -554,6 +558,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     )
     assert weak_overlap is not None
     assert weak_overlap.status == nvbench_compare.ComparisonStatus.UNDECIDED
+    assert weak_overlap.reason.code == "weak_interval_overlap"
 
     center_too_far = nvbench_compare.compare_gpu_timings(
         ref_interval_timing,
@@ -569,6 +574,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     )
     assert center_too_far is not None
     assert center_too_far.status == nvbench_compare.ComparisonStatus.UNDECIDED
+    assert center_too_far.reason.code == "centers_not_close"
 
     noisy_same = nvbench_compare.compare_gpu_timings(
         ref_interval_timing,
@@ -584,6 +590,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     )
     assert noisy_same is not None
     assert noisy_same.status == nvbench_compare.ComparisonStatus.UNDECIDED
+    assert noisy_same.reason.code == "noise_too_high"
 
     clock_disagreement = nvbench_compare.compare_gpu_timings(
         ref_interval_timing,
@@ -600,6 +607,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     )
     assert clock_disagreement is not None
     assert clock_disagreement.status == nvbench_compare.ComparisonStatus.UNDECIDED
+    assert clock_disagreement.reason.code == "cycle_same_not_confirmed"
 
     missing_clock = nvbench_compare.compare_gpu_timings(
         ref_interval_timing,
@@ -615,6 +623,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     )
     assert missing_clock is not None
     assert missing_clock.status == nvbench_compare.ComparisonStatus.UNDECIDED
+    assert missing_clock.reason.code == "missing_clock_rate"
 
     frequency_shift = nvbench_compare.compare_gpu_timings(
         ref_interval_timing,
@@ -631,6 +640,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     )
     assert frequency_shift is not None
     assert frequency_shift.status == nvbench_compare.ComparisonStatus.UNDECIDED
+    assert frequency_shift.reason.code == "cycle_gap_not_confirmed"
 
     missing_noise = nvbench_compare.compare_gpu_timings(
         ref_timing,
@@ -639,6 +649,7 @@ def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
     assert missing_noise is not None
     assert missing_noise.status == nvbench_compare.ComparisonStatus.UNDECIDED
     assert missing_noise.max_noise is None
+    assert missing_noise.reason.code == "noise_unavailable"
 
 
 def test_comparison_stats_records_undecided_status(nvbench_compare):
@@ -654,6 +665,18 @@ def test_comparison_stats_records_undecided_status(nvbench_compare):
     assert stats.unknown_count == 0
 
 
+def test_comparison_stats_records_undecided_reason(nvbench_compare):
+    stats = nvbench_compare.ComparisonStats()
+    reason = nvbench_compare.DecisionReason(
+        code="test_reason",
+        message="test reason",
+    )
+
+    stats.record(nvbench_compare.ComparisonStatus.UNDECIDED, reason)
+
+    assert stats.undecided_reasons[reason] == 1
+
+
 @pytest.mark.parametrize("ref_time, cmp_time", [(None, 1.0), (1.0, None), (0.0, 1.0)])
 def test_compare_gpu_timings_rejects_unusable_centers(
     nvbench_compare, ref_time, cmp_time
@@ -989,3 +1012,32 @@ def test_main_returns_success_exit_code_when_regressions_are_detected(
 
     assert nvbench_compare.main() == 0
     assert "Regression (clear timing gap, %Diff > 0): 1" in capsys.readouterr().out
+
+
+def test_main_prints_undecided_reason_summary(monkeypatch, capsys, nvbench_compare):
+    devices = [{"id": 0, "name": "Test GPU"}]
+    ref_root = {
+        "devices": devices,
+        "benchmarks": [
+            make_benchmark([make_state(nvbench_compare, "state", noise="0.05")])
+        ],
+    }
+    cmp_root = {
+        "devices": devices,
+        "benchmarks": [
+            make_benchmark(
+                [make_state(nvbench_compare, "state", mean="1.01", noise="0.05")]
+            )
+        ],
+    }
+
+    def read_file(path):
+        return ref_root if path == "ref.json" else cmp_root
+
+    monkeypatch.setattr(nvbench_compare.reader, "read_file", read_file)
+    monkeypatch.setattr(sys, "argv", ["nvbench_compare", "ref.json", "cmp.json"])
+
+    assert nvbench_compare.main() == 0
+    output = capsys.readouterr().out
+    assert "Undecided (comparison requires more evidence): 1" in output
+    assert "noise_too_high: 1" in output