# Smallest relative separation between two timing intervals that is still
# reported as a clear FAST/SLOW gap (0.005 == 0.5 %).
CLEAR_GAP_RELATIVE_THRESHOLD = 0.005


@dataclass(frozen=True)
class TimingInterval:
    """Immutable range ``[lower, upper]`` together with its central estimate."""

    lower: float
    upper: float
    center: float


def is_positive_finite(value):
    """Tell whether *value* is present, strictly greater than zero and finite."""
    if value is None:
        return False
    return value > 0.0 and math.isfinite(value)


def make_timing_interval(lower, upper, center):
    """Build a ``TimingInterval`` or return ``None`` for unusable inputs.

    All three numbers must be positive and finite, and they must satisfy
    ``lower <= center <= upper``.
    """
    bounds = (lower, upper, center)
    if all(is_positive_finite(bound) for bound in bounds) and lower <= center <= upper:
        return TimingInterval(lower=lower, upper=upper, center=center)
    return None


def _all_positive_finite(timing, *field_names):
    # Fields are read lazily and in order; the scan stops at the first
    # value that is missing, non-positive or non-finite.
    return all(is_positive_finite(getattr(timing, name)) for name in field_names)


def compute_timing_interval(timing):
    """Derive a ``TimingInterval`` from the summary statistics of *timing*.

    Preferred source: ``[minimum, third_quartile]`` centred on the median,
    used when minimum, first quartile, median and third quartile are all
    usable and correctly ordered.  Fallback: ``mean +/- stdev`` clamped to
    ``[minimum, maximum]`` and centred on the mean.  Returns ``None`` when
    neither set of statistics is usable.
    """
    if (
        _all_positive_finite(
            timing, "minimum", "first_quartile", "median", "third_quartile"
        )
        and timing.minimum
        <= timing.first_quartile
        <= timing.median
        <= timing.third_quartile
    ):
        return make_timing_interval(
            lower=timing.minimum,
            upper=timing.third_quartile,
            center=timing.median,
        )

    if (
        _all_positive_finite(timing, "minimum", "maximum", "mean", "stdev")
        and timing.minimum <= timing.mean <= timing.maximum
    ):
        spread_low = timing.mean - timing.stdev
        spread_high = timing.mean + timing.stdev
        return make_timing_interval(
            lower=max(timing.minimum, spread_low),
            upper=min(timing.maximum, spread_high),
            center=timing.mean,
        )

    return None


def compare_intervals_for_clear_gap(ref_interval, cmp_interval):
    """Classify two intervals that are separated by a clear relative gap.

    Returns ``ComparisonStatus.FAST`` when the compared interval lies wholly
    below the reference, ``ComparisonStatus.SLOW`` when it lies wholly above,
    and ``None`` when the intervals overlap or the gap is too small.
    """
    threshold = CLEAR_GAP_RELATIVE_THRESHOLD

    # (larger - smaller) / smaller >= delta is the same statement as
    # larger / smaller >= 1 + delta, i.e. a log-ratio test, written
    # without evaluating any logarithm.
    if (
        cmp_interval.upper < ref_interval.lower
        and (ref_interval.lower - cmp_interval.upper) / cmp_interval.upper
        >= threshold
    ):
        return ComparisonStatus.FAST

    if (
        cmp_interval.lower > ref_interval.upper
        and (cmp_interval.lower - ref_interval.upper) / ref_interval.upper
        >= threshold
    ):
        return ComparisonStatus.SLOW

    return None


def scale_interval(interval, scale):
    """Multiply every member of *interval* by *scale*.

    Returns ``None`` when *scale* is not a positive finite number or when the
    scaled values no longer form a valid interval (for example on overflow).
    """
    if is_positive_finite(scale):
        return make_timing_interval(
            lower=interval.lower * scale,
            upper=interval.upper * scale,
            center=interval.center * scale,
        )
    return None


def confirm_clear_gap_with_clock_rate(
    status, ref_timing, cmp_timing, ref_interval, cmp_interval
):
    """Keep *status* only if it survives scaling by each run's SM clock rate.

    Each interval is multiplied by the ``sm_clock_rate_mean`` of its own run
    and the gap test is repeated on the scaled intervals.  A missing or
    unusable clock rate, or a scaled verdict that differs from *status*,
    yields ``ComparisonStatus.UNDECIDED``.
    """
    ref_clock = ref_timing.sm_clock_rate_mean
    if ref_clock is None:
        return ComparisonStatus.UNDECIDED
    cmp_clock = cmp_timing.sm_clock_rate_mean
    if cmp_clock is None:
        return ComparisonStatus.UNDECIDED

    scaled_ref = scale_interval(ref_interval, ref_clock)
    scaled_cmp = scale_interval(cmp_interval, cmp_clock)
    if scaled_ref is None or scaled_cmp is None:
        return ComparisonStatus.UNDECIDED

    confirmed = compare_intervals_for_clear_gap(scaled_ref, scaled_cmp) == status
    return status if confirmed else ComparisonStatus.UNDECIDED


def compare_timings_for_clear_gap(ref_timing, cmp_timing):
    """Return FAST/SLOW only for a clock-confirmed clear gap, else UNDECIDED."""
    intervals = (
        compute_timing_interval(ref_timing),
        compute_timing_interval(cmp_timing),
    )
    if any(interval is None for interval in intervals):
        return ComparisonStatus.UNDECIDED
    ref_interval, cmp_interval = intervals

    verdict = compare_intervals_for_clear_gap(ref_interval, cmp_interval)
    if verdict is None:
        return ComparisonStatus.UNDECIDED

    return confirm_clear_gap_with_clock_rate(
        verdict, ref_timing, cmp_timing, ref_interval, cmp_interval
    )
def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
    """Check the status ``compare_gpu_timings`` reports for typical inputs."""
    compare = nvbench_compare.compare_gpu_timings
    status = nvbench_compare.ComparisonStatus

    def quartile_timing(minimum, q1, median, q3, clock=None):
        # Timing that carries minimum/quartile statistics; the mean mirrors
        # the median and the relative stdev is fixed at 5 %.
        return make_gpu_timing_data(
            nvbench_compare,
            minimum=minimum,
            first_quartile=q1,
            median=median,
            third_quartile=q3,
            mean=median,
            stdev_relative=0.05,
            sm_clock_rate_mean=clock,
        )

    ref_timing = make_gpu_timing_data(nvbench_compare, mean=1.0, stdev_relative=0.05)

    # Mean-only timings: the numeric fields are still filled in, but the
    # status is UNDECIDED.
    undecided = compare(
        ref_timing,
        make_gpu_timing_data(nvbench_compare, mean=1.03, stdev_relative=0.05),
    )
    assert undecided is not None
    assert undecided.status == status.UNDECIDED
    assert undecided.ref_time == pytest.approx(1.0)
    assert undecided.cmp_time == pytest.approx(1.03)
    assert undecided.diff == pytest.approx(0.03)
    assert undecided.frac_diff == pytest.approx(0.03)
    assert undecided.max_noise == pytest.approx(0.05)

    ref_interval_timing = quartile_timing(1.0, 1.1, 1.2, 1.3, clock=100.0)

    # Compared interval entirely below the reference, same clock rate.
    fast = compare(
        ref_interval_timing, quartile_timing(0.8, 0.85, 0.9, 0.95, clock=100.0)
    )
    assert fast is not None
    assert fast.status == status.FAST

    # Compared interval entirely above the reference, same clock rate.
    slow = compare(
        ref_interval_timing, quartile_timing(1.4, 1.45, 1.5, 1.55, clock=100.0)
    )
    assert slow is not None
    assert slow.status == status.SLOW

    # Same gap as the FAST case, but the compared run has no clock rate.
    missing_clock = compare(
        ref_interval_timing, quartile_timing(0.8, 0.85, 0.9, 0.95)
    )
    assert missing_clock is not None
    assert missing_clock.status == status.UNDECIDED

    # Same gap as the FAST case, but the compared run reports twice the
    # clock rate.
    frequency_shift = compare(
        ref_interval_timing, quartile_timing(0.8, 0.85, 0.9, 0.95, clock=200.0)
    )
    assert frequency_shift is not None
    assert frequency_shift.status == status.UNDECIDED

    # Without a relative stdev there is no noise estimate to report.
    missing_noise = compare(
        ref_timing,
        make_gpu_timing_data(nvbench_compare, mean=1.2, stdev_relative=None),
    )
    assert missing_noise is not None
    assert missing_noise.status == status.UNDECIDED
    assert missing_noise.max_noise is None
make_comparison_run_data(nvbench_compare) missing_noise_ref = make_state(nvbench_compare, "missing_noise") @@ -593,7 +670,8 @@ def test_compare_benches_marks_unavailable_noise_unknown(monkeypatch, nvbench_co assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 - assert run_data.stats.unknown_count == 2 + assert run_data.stats.undecided_count == 2 + assert run_data.stats.unknown_count == 0 def test_plot_along_skips_states_without_selected_axis(monkeypatch, nvbench_compare): @@ -629,9 +707,10 @@ def test_plot_along_skips_states_without_selected_axis(monkeypatch, nvbench_comp ) assert run_data.stats.config_count == 2 - assert run_data.stats.pass_count == 2 + assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 + assert run_data.stats.undecided_count == 2 assert run_data.stats.unknown_count == 0 @@ -717,9 +796,10 @@ def test_compare_benches_pairs_filtered_devices_by_position( ) assert run_data.stats.config_count == 1 - assert run_data.stats.pass_count == 1 + assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 + assert run_data.stats.undecided_count == 1 assert run_data.stats.unknown_count == 0 @@ -775,9 +855,10 @@ def test_axis_filter_applies_to_most_recent_benchmark(monkeypatch, nvbench_compa ) assert run_data.stats.config_count == 3 - assert run_data.stats.pass_count == 3 + assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 + assert run_data.stats.undecided_count == 3 assert run_data.stats.unknown_count == 0 @@ -785,17 +866,33 @@ def test_main_returns_success_exit_code_when_regressions_are_detected( monkeypatch, capsys, nvbench_compare ): devices = [{"id": 0, "name": "Test GPU"}] + ref_state = make_state(nvbench_compare, "state", mean="1.0") + ref_state["summaries"].extend( 
+ [ + make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "0.9"), + make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "0.95"), + make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.0"), + make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.05"), + make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), + ] + ) + cmp_state = make_state(nvbench_compare, "state", mean="1.2") + cmp_state["summaries"].extend( + [ + make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "1.15"), + make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.18"), + make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.2"), + make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.25"), + make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), + ] + ) ref_root = { "devices": devices, - "benchmarks": [ - make_benchmark([make_state(nvbench_compare, "state", mean="1.0")]) - ], + "benchmarks": [make_benchmark([ref_state])], } cmp_root = { "devices": devices, - "benchmarks": [ - make_benchmark([make_state(nvbench_compare, "state", mean="1.2")]) - ], + "benchmarks": [make_benchmark([cmp_state])], } def read_file(path):