diff --git a/python/scripts/nvbench_compare.py b/python/scripts/nvbench_compare.py index adb9a10..4398370 100644 --- a/python/scripts/nvbench_compare.py +++ b/python/scripts/nvbench_compare.py @@ -477,6 +477,14 @@ class TimingInterval: center: float +@dataclass(frozen=True) +class TimingComparisonInputs: + ref_estimate: TimeEstimate + cmp_estimate: TimeEstimate + ref_interval: TimingInterval | None + cmp_interval: TimingInterval | None + + class ComparisonStatus(str, Enum): UNKNOWN = "????" UNDECIDED = "AMBG" @@ -1169,7 +1177,7 @@ def make_timing_interval(lower, upper, center): return TimingInterval(lower=lower, upper=upper, center=center) -def compute_timing_interval(timing): +def compute_robust_summary_interval(timing): if ( is_positive_finite(timing.minimum) and is_positive_finite(timing.first_quartile) @@ -1185,6 +1193,10 @@ def compute_timing_interval(timing): center=timing.median, ) + return None + + +def compute_mean_summary_interval(timing): if ( is_positive_finite(timing.minimum) and is_positive_finite(timing.maximum) @@ -1202,17 +1214,39 @@ def compute_timing_interval(timing): return None +def compute_timing_interval(timing): + robust_interval = compute_robust_summary_interval(timing) + if robust_interval is not None: + return robust_interval + return compute_mean_summary_interval(timing) + + def compute_timing_interval_from_samples(samples): + timing_input = compute_robust_timing_input_from_samples(samples) + if timing_input is None: + return None + _, interval = timing_input + return interval + + +def compute_robust_timing_input_from_samples(samples): values = positive_finite_array(samples) if values is None: return None - first_quartile, median, third_quartile = np.quantile(values, [0.25, 0.5, 0.75]) - return make_timing_interval( + interval = make_timing_interval( lower=np.min(values), upper=third_quartile, center=median, ) + relative_dispersion = compute_relative_dispersion( + third_quartile - first_quartile, median + ) + if interval is None or relative_dispersion is None: + return None + return TimeEstimate( + center=median, relative_dispersion=relative_dispersion + ), interval def make_decision(status, code, message, *, severity=0.0): @@ -1605,9 +1639,9 @@ def confirm_clear_gap_with_bulk_cycles(status, ref_timing, cmp_timing, threshold ) -def compare_timings_for_clear_gap(ref_timing, cmp_timing, thresholds): - ref_interval = compute_timing_interval(ref_timing) - cmp_interval = compute_timing_interval(cmp_timing) +def compare_timings_for_clear_gap( + ref_timing, cmp_timing, ref_interval, cmp_interval, thresholds +): if ref_interval is None or cmp_interval is None: return make_decision( ComparisonStatus.UNDECIDED, @@ -1736,7 +1770,9 @@ def compare_timings_for_bulk_same(ref_timing, cmp_timing, thresholds): ) -def compare_timings_for_same(ref_timing, cmp_timing, ref_noise, cmp_noise, thresholds): +def compare_timings_for_same( + ref_timing, cmp_timing, ref_noise, cmp_noise, ref_interval, cmp_interval, thresholds +): if not is_usable_noise(ref_noise) or not is_usable_noise(cmp_noise): return make_decision( ComparisonStatus.UNDECIDED, @@ -1750,8 +1786,6 @@ def compare_timings_for_same(ref_timing, cmp_timing, ref_noise, cmp_noise, thres "relative dispersion is too high to declare same", ) - ref_interval = compute_timing_interval(ref_timing) - cmp_interval = compute_timing_interval(cmp_timing) if ref_interval is None or cmp_interval is None: return make_decision( ComparisonStatus.UNDECIDED, @@ -1796,47 +1830,74 @@ def select_relative_dispersion(relative_dispersion, absolute_dispersion, center) return compute_relative_dispersion(absolute_dispersion, center) -def compute_common_time_estimates(ref_timing, cmp_timing): - if ( - has_robust_estimate(ref_timing) - and has_robust_interval(ref_timing) - and has_robust_estimate(cmp_timing) - and has_robust_interval(cmp_timing) - ): - return ( - TimeEstimate( - center=ref_timing.median, - relative_dispersion=select_relative_dispersion( - ref_timing.interquartile_range_relative, - ref_timing.interquartile_range, - ref_timing.median, - ), - ), - TimeEstimate( - center=cmp_timing.median, - relative_dispersion=select_relative_dispersion( - cmp_timing.interquartile_range_relative, - cmp_timing.interquartile_range, - cmp_timing.median, - ), - ), - ) +def compute_robust_summary_estimate(timing): + if not has_robust_estimate(timing): + return None + return TimeEstimate( + center=timing.median, + relative_dispersion=select_relative_dispersion( + timing.interquartile_range_relative, + timing.interquartile_range, + timing.median, + ), + ) - if has_mean_estimate(ref_timing) and has_mean_estimate(cmp_timing): - return ( - TimeEstimate( - center=ref_timing.mean, - relative_dispersion=select_relative_dispersion( - ref_timing.stdev_relative, ref_timing.stdev, ref_timing.mean - ), - ), - TimeEstimate( - center=cmp_timing.mean, - relative_dispersion=select_relative_dispersion( - cmp_timing.stdev_relative, cmp_timing.stdev, cmp_timing.mean - ), - ), - ) + +def compute_mean_summary_estimate(timing): + if not has_mean_estimate(timing): + return None + return TimeEstimate( + center=timing.mean, + relative_dispersion=select_relative_dispersion( + timing.stdev_relative, timing.stdev, timing.mean + ), + ) + + +def compute_robust_summary_timing_input(timing): + estimate = compute_robust_summary_estimate(timing) + interval = compute_robust_summary_interval(timing) + if estimate is None or interval is None: + return None + return estimate, interval + + +def compute_mean_summary_timing_input(timing): + estimate = compute_mean_summary_estimate(timing) + interval = compute_mean_summary_interval(timing) + if estimate is None or interval is None: + return None + return estimate, interval + + +def make_timing_comparison_inputs(ref_input, cmp_input): + ref_estimate, ref_interval = ref_input + cmp_estimate, cmp_interval = cmp_input + return TimingComparisonInputs( + ref_estimate=ref_estimate, + cmp_estimate=cmp_estimate, + ref_interval=ref_interval, + cmp_interval=cmp_interval, + ) + + +def compute_robust_timing_input(timing): + robust_input = compute_robust_summary_timing_input(timing) + if robust_input is not None: + return robust_input + return compute_robust_timing_input_from_samples(timing.samples) + + +def compute_common_time_estimates(ref_timing, cmp_timing): + ref_robust_estimate = compute_robust_summary_estimate(ref_timing) + cmp_robust_estimate = compute_robust_summary_estimate(cmp_timing) + if ref_robust_estimate is not None and cmp_robust_estimate is not None: + return ref_robust_estimate, cmp_robust_estimate + + ref_mean_estimate = compute_mean_summary_estimate(ref_timing) + cmp_mean_estimate = compute_mean_summary_estimate(cmp_timing) + if ref_mean_estimate is not None and cmp_mean_estimate is not None: + return ref_mean_estimate, cmp_mean_estimate return ( TimeEstimate( @@ -1854,6 +1915,26 @@ def compute_common_time_estimates(ref_timing, cmp_timing): ) +def compute_timing_comparison_inputs(ref_timing, cmp_timing): + ref_robust_input = compute_robust_timing_input(ref_timing) + cmp_robust_input = compute_robust_timing_input(cmp_timing) + if ref_robust_input is not None and cmp_robust_input is not None: + return make_timing_comparison_inputs(ref_robust_input, cmp_robust_input) + + ref_mean_input = compute_mean_summary_timing_input(ref_timing) + cmp_mean_input = compute_mean_summary_timing_input(cmp_timing) + if ref_mean_input is not None and cmp_mean_input is not None: + return make_timing_comparison_inputs(ref_mean_input, cmp_mean_input) + + ref_estimate, cmp_estimate = compute_common_time_estimates(ref_timing, cmp_timing) + return TimingComparisonInputs( + ref_estimate=ref_estimate, + cmp_estimate=cmp_estimate, + ref_interval=None, + cmp_interval=None, + ) + + def unusable_timing_center_decision(ref_time, cmp_time): if ref_time is None or cmp_time is None: return make_decision( @@ -1876,17 +1957,16 @@ def unusable_timing_center_decision(ref_time, cmp_time): return None -def make_unavailable_timing_comparison(decision, ref_timing, cmp_timing): - ref_estimate, cmp_estimate = compute_common_time_estimates(ref_timing, cmp_timing) +def make_unavailable_timing_comparison(decision, timing_inputs): return SummaryComparison( - ref_interval=compute_timing_interval(ref_timing), - cmp_interval=compute_timing_interval(cmp_timing), - ref_estimate=ref_estimate, - cmp_estimate=cmp_estimate, - ref_time=ref_estimate.center, - cmp_time=cmp_estimate.center, - ref_noise=ref_estimate.relative_dispersion, - cmp_noise=cmp_estimate.relative_dispersion, + ref_interval=timing_inputs.ref_interval, + cmp_interval=timing_inputs.cmp_interval, + ref_estimate=timing_inputs.ref_estimate, + cmp_estimate=timing_inputs.cmp_estimate, + ref_time=timing_inputs.ref_estimate.center, + cmp_time=timing_inputs.cmp_estimate.center, + ref_noise=timing_inputs.ref_estimate.relative_dispersion, + cmp_noise=timing_inputs.cmp_estimate.relative_dispersion, diff=None, frac_diff=None, diff_interval=None, @@ -1901,7 +1981,9 @@ def compare_gpu_timings(ref_timing, cmp_timing, comparison_thresholds=None): if comparison_thresholds is None: comparison_thresholds = get_default_thresholds() - ref_estimate, cmp_estimate = compute_common_time_estimates(ref_timing, cmp_timing) + timing_inputs = compute_timing_comparison_inputs(ref_timing, cmp_timing) + ref_estimate = timing_inputs.ref_estimate + cmp_estimate = timing_inputs.cmp_estimate cmp_time = cmp_estimate.center ref_time = ref_estimate.center @@ -1912,11 +1994,11 @@ def compare_gpu_timings(ref_timing, cmp_timing, comparison_thresholds=None): unusable_center_decision = unusable_timing_center_decision(ref_time, cmp_time) if unusable_center_decision is not None: return make_unavailable_timing_comparison( - unusable_center_decision, ref_timing, cmp_timing + unusable_center_decision, timing_inputs ) - ref_interval = compute_timing_interval(ref_timing) - cmp_interval = compute_timing_interval(cmp_timing) + ref_interval = timing_inputs.ref_interval + cmp_interval = timing_inputs.cmp_interval diff = cmp_time - ref_time frac_diff = diff / ref_time diff_interval = None @@ -1931,7 +2013,7 @@ def compare_gpu_timings(ref_timing, cmp_timing, comparison_thresholds=None): max_noise = max(ref_noise, cmp_noise) decision = compare_timings_for_clear_gap( - ref_timing, cmp_timing, comparison_thresholds + ref_timing, cmp_timing, ref_interval, cmp_interval, comparison_thresholds ) if decision.status == ComparisonStatus.UNDECIDED and decision.reason.code in { "no_clear_gap", @@ -1942,7 +2024,13 @@ def compare_gpu_timings(ref_timing, cmp_timing, comparison_thresholds=None): ) if bulk_decision.reason.code == "bulk_data_unavailable": decision = compare_timings_for_same( - ref_timing, cmp_timing, ref_noise, cmp_noise, comparison_thresholds + ref_timing, + cmp_timing, + ref_noise, + cmp_noise, + ref_interval, + cmp_interval, + comparison_thresholds, ) else: decision = bulk_decision @@ -2819,8 +2907,11 @@ def compare_benches( if ref_summaries else make_empty_gpu_timing_data() ) + timing_inputs = compute_timing_comparison_inputs( + ref_gpu_time, cmp_gpu_time + ) comparison = make_unavailable_timing_comparison( - missing_summaries_decision, ref_gpu_time, cmp_gpu_time + missing_summaries_decision, timing_inputs ) if comparison is None: continue diff --git a/python/test/test_nvbench_compare.py b/python/test/test_nvbench_compare.py index 1269f46..57848bc 100644 --- a/python/test/test_nvbench_compare.py +++ b/python/test/test_nvbench_compare.py @@ -177,7 +177,7 @@ def make_gpu_timing_data( median=None, third_quartile=None, interquartile_range=None, - interquartile_range_relative=None, + interquartile_range_relative=0.01, sm_clock_rate_mean=None, sample_values=None, frequency_values=None, @@ -727,6 +727,64 @@ def test_compare_gpu_timings_classifies_common_cases(tmp_path, nvbench_compare): assert partial_robust.ref_interval.center == pytest.approx(1.0) assert partial_robust.cmp_interval.center == pytest.approx(1.05) + mixed_summary_families = nvbench_compare.compare_gpu_timings( + make_gpu_timing_data( + nvbench_compare, + minimum=0.8, + maximum=1.2, + mean=1.0, + stdev=0.1, + stdev_relative=0.1, + first_quartile=9.0, + median=10.0, + third_quartile=11.0, + interquartile_range_relative=0.01, + ), + make_gpu_timing_data( + nvbench_compare, + minimum=0.85, + maximum=1.25, + mean=1.05, + stdev=0.1, + stdev_relative=0.1, + ), + ) + assert mixed_summary_families is not None + assert mixed_summary_families.ref_time == pytest.approx(1.0) + assert mixed_summary_families.cmp_time == pytest.approx(1.05) + assert mixed_summary_families.ref_interval.center == pytest.approx(1.0) + assert mixed_summary_families.cmp_interval.center == pytest.approx(1.05) + + mixed_robust_summary_and_bulk = nvbench_compare.compare_gpu_timings( + make_gpu_timing_data( + nvbench_compare, + minimum=0.8, + maximum=1.2, + mean=1.0, + stdev=0.1, + stdev_relative=0.1, + first_quartile=9.0, + median=10.0, + third_quartile=11.0, + interquartile_range_relative=0.01, + sample_values=[1.0, 2.0, 3.0], + ), + make_gpu_timing_data( + nvbench_compare, + minimum=98.0, + maximum=102.0, + mean=100.0, + stdev=1.0, + stdev_relative=0.01, + sample_values=[4.0, 5.0, 6.0], + ), + ) + assert mixed_robust_summary_and_bulk is not None + assert mixed_robust_summary_and_bulk.ref_time == pytest.approx(10.0) + assert mixed_robust_summary_and_bulk.cmp_time == pytest.approx(5.0) + assert mixed_robust_summary_and_bulk.ref_interval.center == pytest.approx(10.0) + assert mixed_robust_summary_and_bulk.cmp_interval.center == pytest.approx(5.0) + ref_interval_timing = make_gpu_timing_data( nvbench_compare, minimum=1.0, @@ -1995,6 +2053,7 @@ def test_main_returns_success_exit_code_when_regressions_are_detected( make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "0.95"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.0"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.05"), + make_summary(nvbench_compare, "GPU_TIME_IQR_RELATIVE_TAG", "0.01"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] ) @@ -2005,6 +2064,7 @@ def test_main_returns_success_exit_code_when_regressions_are_detected( make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.18"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.2"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.25"), + make_summary(nvbench_compare, "GPU_TIME_IQR_RELATIVE_TAG", "0.01"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] ) @@ -2610,6 +2670,7 @@ def test_compare_benches_explain_display_uses_explicit_intervals( make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.01"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.02"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.03"), + make_summary(nvbench_compare, "GPU_TIME_IQR_RELATIVE_TAG", "0.01"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] ) @@ -2620,6 +2681,7 @@ def test_compare_benches_explain_display_uses_explicit_intervals( make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.02"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.03"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.04"), + make_summary(nvbench_compare, "GPU_TIME_IQR_RELATIVE_TAG", "0.01"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] )