mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-06-30 11:18:12 +00:00
Implement clear-gap comparison for early FAST/SLOW decision
Implemented the clear-gap comparison, with the log-distance-equivalent
algebra and pessimistic SM-clock fallback.
What changed:
- Added TimingInterval and interval construction from summaries:
- robust interval: [min, q3], centered at median
- fallback interval: clipped [mean - stdev, mean + stdev] intersected with [min, max]
- Added CLEAR_GAP_RELATIVE_THRESHOLD = 0.005.
- FAST gap uses:
(ref.lower - cmp.upper) / cmp.upper >= delta
which is equivalent to log(ref.lower / cmp.upper) >= log(1 + delta).
- SLOW gap uses:
(cmp.lower - ref.upper) / ref.upper >= delta
which is equivalent to log(cmp.lower / ref.upper) >= log(1 + delta).
- FAST/SLOW now requires SM clock summaries on both sides and the same clear-gap result after scaling intervals by sm_clock_rate_mean.
- If intervals are missing, overlap, fail the gap threshold, have missing/invalid clock summaries, or time/cycle comparison disagrees, status is UNDECIDED.
- Existing center/noise values are still computed and displayed, but no longer drive FAST/SLOW/SAME classification.
Updated tests to cover:
- center/noise-only comparisons becoming UNDECIDED
- clear FAST/SLOW with matching clock evidence
- missing clock fallback to UNDECIDED
- frequency-shift disagreement becoming UNDECIDED
- regression reporting with robust interval and clock evidence
This commit is contained in:
@@ -45,6 +45,7 @@ GPU_TIME_IR_RELATIVE_TAG = "nv/cold/time/gpu/ir/relative"
|
||||
GPU_SM_CLOCK_RATE_MEAN_TAG = "nv/cold/sm_clock_rate/mean"
|
||||
SAMPLE_TIMES_TAG = "nv/json/bin:nv/cold/sample_times"
|
||||
SAMPLE_FREQUENCIES_TAG = "nv/json/freqs-bin:nv/cold/sample_freqs"
|
||||
# Minimum relative gap between two non-overlapping timing intervals that
# compare_intervals_for_clear_gap requires before it reports FAST or SLOW
# (0.005 == 0.5% of the nearer interval's upper bound).
CLEAR_GAP_RELATIVE_THRESHOLD = 0.005
|
||||
|
||||
# The reader returns an object supporting the buffer protocol. Python 3.10 does
|
||||
# not provide a standard Buffer type annotation.
|
||||
@@ -109,6 +110,13 @@ class TimeEstimate:
|
||||
relative_dispersion: float | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TimingInterval:
    """Immutable [lower, upper] range with a representative center value.

    The dataclass itself performs no validation; make_timing_interval is the
    constructor that checks positivity, finiteness and
    lower <= center <= upper before building an instance.
    """

    # Smallest value covered by the interval.
    lower: float
    # Largest value covered by the interval.
    upper: float
    # Representative point inside the interval (e.g. a median or a mean).
    center: float
|
||||
|
||||
|
||||
class ComparisonStatus(str, Enum):
|
||||
UNKNOWN = "????"
|
||||
UNDECIDED = "UNDECIDED"
|
||||
@@ -493,6 +501,111 @@ def compute_relative_dispersion(dispersion, center):
|
||||
return dispersion / center
|
||||
|
||||
|
||||
def is_positive_finite(value):
    """Tell whether *value* is a usable, strictly positive, finite number.

    ``None`` (a missing value), zero, negatives, NaN and infinities all
    yield a falsy result.
    """
    if value is None:
        return False
    # NaN fails the ``> 0.0`` test, so only infinities reach isfinite().
    return value > 0.0 and math.isfinite(value)
|
||||
|
||||
|
||||
def make_timing_interval(lower, upper, center):
    """Build a validated TimingInterval, or return None if the inputs are unusable.

    All three values must pass is_positive_finite, and they must satisfy
    lower <= center <= upper; otherwise no interval is produced.
    """
    endpoints = (lower, upper, center)
    if not all(is_positive_finite(endpoint) for endpoint in endpoints):
        return None
    if not (lower <= center <= upper):
        return None
    return TimingInterval(lower=lower, upper=upper, center=center)
|
||||
|
||||
|
||||
def compute_timing_interval(timing):
    """Derive a TimingInterval from the summary statistics stored on *timing*.

    Two constructions are tried in order:

    1. Robust interval: [minimum, third_quartile] centered at the median.
       Used when minimum, first_quartile, median and third_quartile are all
       positive and finite and appear in non-decreasing order.
    2. Fallback interval: [mean - stdev, mean + stdev] clipped to
       [minimum, maximum] and centered at the mean. Used when minimum,
       maximum, mean and stdev are all positive and finite and
       minimum <= mean <= maximum.

    Returns None when neither set of statistics is usable.
    """
    # Attributes are read lazily, in the listed order, and the scan stops at
    # the first unusable value.
    quartile_fields = ("minimum", "first_quartile", "median", "third_quartile")
    if all(is_positive_finite(getattr(timing, name)) for name in quartile_fields) and (
        timing.minimum
        <= timing.first_quartile
        <= timing.median
        <= timing.third_quartile
    ):
        return make_timing_interval(
            lower=timing.minimum,
            upper=timing.third_quartile,
            center=timing.median,
        )

    moment_fields = ("minimum", "maximum", "mean", "stdev")
    if not all(is_positive_finite(getattr(timing, name)) for name in moment_fields):
        return None
    if not (timing.minimum <= timing.mean <= timing.maximum):
        return None

    # One standard deviation either side of the mean, never extending past
    # the observed extremes.
    spread_low = timing.mean - timing.stdev
    spread_high = timing.mean + timing.stdev
    return make_timing_interval(
        lower=max(timing.minimum, spread_low),
        upper=min(timing.maximum, spread_high),
        center=timing.mean,
    )
|
||||
|
||||
|
||||
def compare_intervals_for_clear_gap(ref_interval, cmp_interval):
    """Classify two intervals as FAST or SLOW when they are clearly separated.

    Returns ComparisonStatus.FAST when the compared interval lies entirely
    below the reference interval, and ComparisonStatus.SLOW when it lies
    entirely above, provided the gap relative to the nearer interval's upper
    bound is at least CLEAR_GAP_RELATIVE_THRESHOLD. Returns None when the
    intervals overlap or the gap is too small.
    """
    # (a - b) / b >= delta is the same test as log(a / b) >= log(1 + delta);
    # the ratio form is used so no logarithm has to be evaluated.
    if ref_interval.lower > cmp_interval.upper:
        separation = (ref_interval.lower - cmp_interval.upper) / cmp_interval.upper
        if separation >= CLEAR_GAP_RELATIVE_THRESHOLD:
            return ComparisonStatus.FAST

    if ref_interval.upper < cmp_interval.lower:
        separation = (cmp_interval.lower - ref_interval.upper) / ref_interval.upper
        if separation >= CLEAR_GAP_RELATIVE_THRESHOLD:
            return ComparisonStatus.SLOW

    return None
|
||||
|
||||
|
||||
def scale_interval(interval, scale):
    """Multiply every bound of *interval* by *scale* and re-validate the result.

    Returns None when *scale* is None, non-positive or non-finite, or when
    the scaled values no longer form a valid interval according to
    make_timing_interval (for example after overflowing to infinity).
    """
    if is_positive_finite(scale):
        return make_timing_interval(
            lower=interval.lower * scale,
            upper=interval.upper * scale,
            center=interval.center * scale,
        )
    return None
|
||||
|
||||
|
||||
def confirm_clear_gap_with_clock_rate(
    status, ref_timing, cmp_timing, ref_interval, cmp_interval
):
    """Keep a FAST/SLOW verdict only if clock-scaled intervals agree with it.

    Each interval is multiplied by its own side's ``sm_clock_rate_mean`` and
    the clear-gap comparison is repeated on the scaled intervals. *status* is
    returned unchanged when the scaled comparison yields the same verdict.
    ComparisonStatus.UNDECIDED is returned when either clock rate is missing
    or unusable, or when the scaled comparison disagrees.
    """
    undecided = ComparisonStatus.UNDECIDED

    if ref_timing.sm_clock_rate_mean is None:
        return undecided
    if cmp_timing.sm_clock_rate_mean is None:
        return undecided

    ref_cycles = scale_interval(ref_interval, ref_timing.sm_clock_rate_mean)
    cmp_cycles = scale_interval(cmp_interval, cmp_timing.sm_clock_rate_mean)
    if ref_cycles is None or cmp_cycles is None:
        return undecided

    # A differing verdict, or no clear gap at all after scaling, means the
    # difference may be explained by a clock shift, so no decision is made.
    agrees = compare_intervals_for_clear_gap(ref_cycles, cmp_cycles) == status
    return status if agrees else undecided
|
||||
|
||||
|
||||
def compare_timings_for_clear_gap(ref_timing, cmp_timing):
    """Decide FAST, SLOW or UNDECIDED for a reference/compare timing pair.

    An interval is built for each side with compute_timing_interval. A
    FAST/SLOW verdict requires a clear gap between those intervals that is
    then confirmed by confirm_clear_gap_with_clock_rate. In every other
    situation the result is ComparisonStatus.UNDECIDED.
    """
    ref_interval = compute_timing_interval(ref_timing)
    cmp_interval = compute_timing_interval(cmp_timing)

    if ref_interval is not None and cmp_interval is not None:
        time_status = compare_intervals_for_clear_gap(ref_interval, cmp_interval)
        if time_status is not None:
            return confirm_clear_gap_with_clock_rate(
                time_status, ref_timing, cmp_timing, ref_interval, cmp_interval
            )

    return ComparisonStatus.UNDECIDED
|
||||
|
||||
|
||||
def has_robust_estimate(summary):
|
||||
return summary.median is not None and (
|
||||
summary.interquartile_range_relative is not None
|
||||
@@ -588,15 +701,10 @@ def compare_gpu_timings(ref_timing, cmp_timing):
|
||||
|
||||
if not has_finite_noise(ref_noise) or not has_finite_noise(cmp_noise):
|
||||
max_noise = None
|
||||
status = ComparisonStatus.UNKNOWN
|
||||
else:
|
||||
max_noise = max(ref_noise, cmp_noise)
|
||||
if abs(frac_diff) <= max_noise:
|
||||
status = ComparisonStatus.SAME
|
||||
elif diff < 0:
|
||||
status = ComparisonStatus.FAST
|
||||
else:
|
||||
status = ComparisonStatus.SLOW
|
||||
|
||||
status = compare_timings_for_clear_gap(ref_timing, cmp_timing)
|
||||
|
||||
return SummaryComparison(
|
||||
ref_estimate=ref_estimate,
|
||||
|
||||
@@ -113,6 +113,8 @@ def make_binary_summary(nvbench_compare, tag, filename, size):
|
||||
def make_gpu_timing_data(
|
||||
nvbench_compare,
|
||||
*,
|
||||
minimum=None,
|
||||
maximum=None,
|
||||
mean=1.0,
|
||||
stdev=None,
|
||||
stdev_relative=0.01,
|
||||
@@ -124,8 +126,8 @@ def make_gpu_timing_data(
|
||||
sm_clock_rate_mean=None,
|
||||
):
|
||||
return nvbench_compare.GpuTimingData(
|
||||
minimum=None,
|
||||
maximum=None,
|
||||
minimum=minimum,
|
||||
maximum=maximum,
|
||||
mean=mean,
|
||||
stdev=stdev,
|
||||
stdev_relative=stdev_relative,
|
||||
@@ -204,9 +206,10 @@ def test_compare_benches_accepts_matching_duplicate_state_counts(
|
||||
)
|
||||
|
||||
assert run_data.stats.config_count == 3
|
||||
assert run_data.stats.pass_count == 3
|
||||
assert run_data.stats.pass_count == 0
|
||||
assert run_data.stats.improvement_count == 0
|
||||
assert run_data.stats.regression_count == 0
|
||||
assert run_data.stats.undecided_count == 3
|
||||
assert run_data.stats.unknown_count == 0
|
||||
|
||||
|
||||
@@ -287,9 +290,10 @@ def test_compare_benches_matches_duplicate_states_after_axis_filter(
|
||||
)
|
||||
|
||||
assert run_data.stats.config_count == 1
|
||||
assert run_data.stats.pass_count == 1
|
||||
assert run_data.stats.pass_count == 0
|
||||
assert run_data.stats.improvement_count == 0
|
||||
assert run_data.stats.regression_count == 0
|
||||
assert run_data.stats.undecided_count == 1
|
||||
assert run_data.stats.unknown_count == 0
|
||||
|
||||
|
||||
@@ -328,9 +332,10 @@ def test_compare_benches_skips_non_finite_centers(monkeypatch, nvbench_compare):
|
||||
)
|
||||
|
||||
assert run_data.stats.config_count == 1
|
||||
assert run_data.stats.pass_count == 1
|
||||
assert run_data.stats.pass_count == 0
|
||||
assert run_data.stats.improvement_count == 0
|
||||
assert run_data.stats.regression_count == 0
|
||||
assert run_data.stats.undecided_count == 1
|
||||
assert run_data.stats.unknown_count == 0
|
||||
|
||||
|
||||
@@ -455,38 +460,99 @@ def test_gpu_timing_data_warns_when_lazy_sample_read_fails(tmp_path, nvbench_com
|
||||
def test_compare_gpu_timings_classifies_common_cases(nvbench_compare):
|
||||
ref_timing = make_gpu_timing_data(nvbench_compare, mean=1.0, stdev_relative=0.05)
|
||||
|
||||
same = nvbench_compare.compare_gpu_timings(
|
||||
undecided = nvbench_compare.compare_gpu_timings(
|
||||
ref_timing,
|
||||
make_gpu_timing_data(nvbench_compare, mean=1.03, stdev_relative=0.05),
|
||||
)
|
||||
assert same is not None
|
||||
assert same.status == nvbench_compare.ComparisonStatus.SAME
|
||||
assert same.ref_time == pytest.approx(1.0)
|
||||
assert same.cmp_time == pytest.approx(1.03)
|
||||
assert same.diff == pytest.approx(0.03)
|
||||
assert same.frac_diff == pytest.approx(0.03)
|
||||
assert same.max_noise == pytest.approx(0.05)
|
||||
assert undecided is not None
|
||||
assert undecided.status == nvbench_compare.ComparisonStatus.UNDECIDED
|
||||
assert undecided.ref_time == pytest.approx(1.0)
|
||||
assert undecided.cmp_time == pytest.approx(1.03)
|
||||
assert undecided.diff == pytest.approx(0.03)
|
||||
assert undecided.frac_diff == pytest.approx(0.03)
|
||||
assert undecided.max_noise == pytest.approx(0.05)
|
||||
|
||||
ref_interval_timing = make_gpu_timing_data(
|
||||
nvbench_compare,
|
||||
minimum=1.0,
|
||||
first_quartile=1.1,
|
||||
median=1.2,
|
||||
third_quartile=1.3,
|
||||
mean=1.2,
|
||||
stdev_relative=0.05,
|
||||
sm_clock_rate_mean=100.0,
|
||||
)
|
||||
|
||||
fast = nvbench_compare.compare_gpu_timings(
|
||||
ref_timing,
|
||||
make_gpu_timing_data(nvbench_compare, mean=0.8, stdev_relative=0.05),
|
||||
ref_interval_timing,
|
||||
make_gpu_timing_data(
|
||||
nvbench_compare,
|
||||
minimum=0.8,
|
||||
first_quartile=0.85,
|
||||
median=0.9,
|
||||
third_quartile=0.95,
|
||||
mean=0.9,
|
||||
stdev_relative=0.05,
|
||||
sm_clock_rate_mean=100.0,
|
||||
),
|
||||
)
|
||||
assert fast is not None
|
||||
assert fast.status == nvbench_compare.ComparisonStatus.FAST
|
||||
|
||||
slow = nvbench_compare.compare_gpu_timings(
|
||||
ref_timing,
|
||||
make_gpu_timing_data(nvbench_compare, mean=1.2, stdev_relative=0.05),
|
||||
ref_interval_timing,
|
||||
make_gpu_timing_data(
|
||||
nvbench_compare,
|
||||
minimum=1.4,
|
||||
first_quartile=1.45,
|
||||
median=1.5,
|
||||
third_quartile=1.55,
|
||||
mean=1.5,
|
||||
stdev_relative=0.05,
|
||||
sm_clock_rate_mean=100.0,
|
||||
),
|
||||
)
|
||||
assert slow is not None
|
||||
assert slow.status == nvbench_compare.ComparisonStatus.SLOW
|
||||
|
||||
unknown = nvbench_compare.compare_gpu_timings(
|
||||
missing_clock = nvbench_compare.compare_gpu_timings(
|
||||
ref_interval_timing,
|
||||
make_gpu_timing_data(
|
||||
nvbench_compare,
|
||||
minimum=0.8,
|
||||
first_quartile=0.85,
|
||||
median=0.9,
|
||||
third_quartile=0.95,
|
||||
mean=0.9,
|
||||
stdev_relative=0.05,
|
||||
),
|
||||
)
|
||||
assert missing_clock is not None
|
||||
assert missing_clock.status == nvbench_compare.ComparisonStatus.UNDECIDED
|
||||
|
||||
frequency_shift = nvbench_compare.compare_gpu_timings(
|
||||
ref_interval_timing,
|
||||
make_gpu_timing_data(
|
||||
nvbench_compare,
|
||||
minimum=0.8,
|
||||
first_quartile=0.85,
|
||||
median=0.9,
|
||||
third_quartile=0.95,
|
||||
mean=0.9,
|
||||
stdev_relative=0.05,
|
||||
sm_clock_rate_mean=200.0,
|
||||
),
|
||||
)
|
||||
assert frequency_shift is not None
|
||||
assert frequency_shift.status == nvbench_compare.ComparisonStatus.UNDECIDED
|
||||
|
||||
missing_noise = nvbench_compare.compare_gpu_timings(
|
||||
ref_timing,
|
||||
make_gpu_timing_data(nvbench_compare, mean=1.2, stdev_relative=None),
|
||||
)
|
||||
assert unknown is not None
|
||||
assert unknown.status == nvbench_compare.ComparisonStatus.UNKNOWN
|
||||
assert missing_noise is not None
|
||||
assert missing_noise.status == nvbench_compare.ComparisonStatus.UNDECIDED
|
||||
assert missing_noise.max_noise is None
|
||||
|
||||
|
||||
def test_comparison_stats_records_undecided_status(nvbench_compare):
|
||||
@@ -515,7 +581,7 @@ def test_compare_gpu_timings_rejects_unusable_centers(
|
||||
)
|
||||
|
||||
|
||||
def test_compare_benches_prefers_median_and_iqr_when_available(
|
||||
def test_compare_benches_reports_regression_when_robust_intervals_and_clock_confirm(
|
||||
monkeypatch, nvbench_compare
|
||||
):
|
||||
run_data = make_comparison_run_data(nvbench_compare)
|
||||
@@ -523,15 +589,23 @@ def test_compare_benches_prefers_median_and_iqr_when_available(
|
||||
ref_state = make_state(nvbench_compare, "state", mean="1.0", noise="0.01")
|
||||
ref_state["summaries"].extend(
|
||||
[
|
||||
make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "0.9"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "0.95"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.0"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.05"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_IR_RELATIVE_TAG", "0.01"),
|
||||
make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"),
|
||||
]
|
||||
)
|
||||
cmp_state = make_state(nvbench_compare, "state", mean="1.0", noise="0.01")
|
||||
cmp_state["summaries"].extend(
|
||||
[
|
||||
make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "1.15"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.18"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.2"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.25"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_IR_RELATIVE_TAG", "0.01"),
|
||||
make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -551,10 +625,13 @@ def test_compare_benches_prefers_median_and_iqr_when_available(
|
||||
assert run_data.stats.pass_count == 0
|
||||
assert run_data.stats.improvement_count == 0
|
||||
assert run_data.stats.regression_count == 1
|
||||
assert run_data.stats.undecided_count == 0
|
||||
assert run_data.stats.unknown_count == 0
|
||||
|
||||
|
||||
def test_compare_benches_marks_unavailable_noise_unknown(monkeypatch, nvbench_compare):
|
||||
def test_compare_benches_marks_unavailable_noise_undecided(
|
||||
monkeypatch, nvbench_compare
|
||||
):
|
||||
run_data = make_comparison_run_data(nvbench_compare)
|
||||
|
||||
missing_noise_ref = make_state(nvbench_compare, "missing_noise")
|
||||
@@ -593,7 +670,8 @@ def test_compare_benches_marks_unavailable_noise_unknown(monkeypatch, nvbench_co
|
||||
assert run_data.stats.pass_count == 0
|
||||
assert run_data.stats.improvement_count == 0
|
||||
assert run_data.stats.regression_count == 0
|
||||
assert run_data.stats.unknown_count == 2
|
||||
assert run_data.stats.undecided_count == 2
|
||||
assert run_data.stats.unknown_count == 0
|
||||
|
||||
|
||||
def test_plot_along_skips_states_without_selected_axis(monkeypatch, nvbench_compare):
|
||||
@@ -629,9 +707,10 @@ def test_plot_along_skips_states_without_selected_axis(monkeypatch, nvbench_comp
|
||||
)
|
||||
|
||||
assert run_data.stats.config_count == 2
|
||||
assert run_data.stats.pass_count == 2
|
||||
assert run_data.stats.pass_count == 0
|
||||
assert run_data.stats.improvement_count == 0
|
||||
assert run_data.stats.regression_count == 0
|
||||
assert run_data.stats.undecided_count == 2
|
||||
assert run_data.stats.unknown_count == 0
|
||||
|
||||
|
||||
@@ -717,9 +796,10 @@ def test_compare_benches_pairs_filtered_devices_by_position(
|
||||
)
|
||||
|
||||
assert run_data.stats.config_count == 1
|
||||
assert run_data.stats.pass_count == 1
|
||||
assert run_data.stats.pass_count == 0
|
||||
assert run_data.stats.improvement_count == 0
|
||||
assert run_data.stats.regression_count == 0
|
||||
assert run_data.stats.undecided_count == 1
|
||||
assert run_data.stats.unknown_count == 0
|
||||
|
||||
|
||||
@@ -775,9 +855,10 @@ def test_axis_filter_applies_to_most_recent_benchmark(monkeypatch, nvbench_compa
|
||||
)
|
||||
|
||||
assert run_data.stats.config_count == 3
|
||||
assert run_data.stats.pass_count == 3
|
||||
assert run_data.stats.pass_count == 0
|
||||
assert run_data.stats.improvement_count == 0
|
||||
assert run_data.stats.regression_count == 0
|
||||
assert run_data.stats.undecided_count == 3
|
||||
assert run_data.stats.unknown_count == 0
|
||||
|
||||
|
||||
@@ -785,17 +866,33 @@ def test_main_returns_success_exit_code_when_regressions_are_detected(
|
||||
monkeypatch, capsys, nvbench_compare
|
||||
):
|
||||
devices = [{"id": 0, "name": "Test GPU"}]
|
||||
ref_state = make_state(nvbench_compare, "state", mean="1.0")
|
||||
ref_state["summaries"].extend(
|
||||
[
|
||||
make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "0.9"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "0.95"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.0"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.05"),
|
||||
make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"),
|
||||
]
|
||||
)
|
||||
cmp_state = make_state(nvbench_compare, "state", mean="1.2")
|
||||
cmp_state["summaries"].extend(
|
||||
[
|
||||
make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "1.15"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.18"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.2"),
|
||||
make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.25"),
|
||||
make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"),
|
||||
]
|
||||
)
|
||||
ref_root = {
|
||||
"devices": devices,
|
||||
"benchmarks": [
|
||||
make_benchmark([make_state(nvbench_compare, "state", mean="1.0")])
|
||||
],
|
||||
"benchmarks": [make_benchmark([ref_state])],
|
||||
}
|
||||
cmp_root = {
|
||||
"devices": devices,
|
||||
"benchmarks": [
|
||||
make_benchmark([make_state(nvbench_compare, "state", mean="1.2")])
|
||||
],
|
||||
"benchmarks": [make_benchmark([cmp_state])],
|
||||
}
|
||||
|
||||
def read_file(path):
|
||||
|
||||
Reference in New Issue
Block a user