Implement early SAME check

If SLOW/FAST check returned undecided, we attempt conservative
SAME check based on summary data alone (bulk data are not read)

Reference and compare measurements are considered SAME if
   - both centers are positive finite values;
   - abs(ref - cmp) / min(ref, cmp) <= 0.5%.
     This is equivalent to max(ref, cmp) / min(ref, cmp) <= 1 + delta;
   - interval overlap must cover at least 50% of the smaller interval;
   - relative dispersion must be finite on both sides and no more than 2%;
   - if SM clock summaries are available, the same check must also pass in cycle space.

Otherwise UNDECIDED remains working decision, to be refined by further checks
This commit is contained in:
Oleksandr Pavlyk
2026-06-03 07:38:00 -05:00
parent 48b7f61da3
commit 6de54fa07a
2 changed files with 178 additions and 11 deletions

View File

@@ -46,6 +46,9 @@ GPU_SM_CLOCK_RATE_MEAN_TAG = "nv/cold/sm_clock_rate/mean"
SAMPLE_TIMES_TAG = "nv/json/bin:nv/cold/sample_times"
SAMPLE_FREQUENCIES_TAG = "nv/json/freqs-bin:nv/cold/sample_freqs"
CLEAR_GAP_RELATIVE_THRESHOLD = 0.005
SAME_CENTER_RELATIVE_THRESHOLD = 0.005
SAME_OVERLAP_FRACTION_THRESHOLD = 0.5
SAME_RELATIVE_DISPERSION_CEILING = 0.02
# The reader returns an object supporting the buffer protocol. Python 3.10 does
# not provide a standard Buffer type annotation.
@@ -564,6 +567,49 @@ def compare_intervals_for_clear_gap(ref_interval, cmp_interval):
return None
def centers_are_close(ref_center, cmp_center):
if not is_positive_finite(ref_center) or not is_positive_finite(cmp_center):
return False
return (
abs(ref_center - cmp_center) / min(ref_center, cmp_center)
<= SAME_CENTER_RELATIVE_THRESHOLD
)
def interval_overlap_fraction(ref_interval, cmp_interval):
intersection_lower = max(ref_interval.lower, cmp_interval.lower)
intersection_upper = min(ref_interval.upper, cmp_interval.upper)
if intersection_upper < intersection_lower:
return 0.0
ref_width = ref_interval.upper - ref_interval.lower
cmp_width = cmp_interval.upper - cmp_interval.lower
min_width = min(ref_width, cmp_width)
if min_width > 0.0:
return (intersection_upper - intersection_lower) / min_width
if ref_width == 0.0 and cmp_width == 0.0:
return 1.0 if ref_interval.lower == cmp_interval.lower else 0.0
if ref_width == 0.0:
return (
1.0
if cmp_interval.lower <= ref_interval.lower <= cmp_interval.upper
else 0.0
)
return (
1.0 if ref_interval.lower <= cmp_interval.lower <= ref_interval.upper else 0.0
)
def intervals_overlap_strongly(ref_interval, cmp_interval):
return (
interval_overlap_fraction(ref_interval, cmp_interval)
>= SAME_OVERLAP_FRACTION_THRESHOLD
)
def scale_interval(interval, scale):
if not is_positive_finite(scale):
return None
@@ -606,6 +652,46 @@ def compare_timings_for_clear_gap(ref_timing, cmp_timing):
)
def compare_intervals_for_same(ref_interval, cmp_interval):
if not centers_are_close(ref_interval.center, cmp_interval.center):
return ComparisonStatus.UNDECIDED
if not intervals_overlap_strongly(ref_interval, cmp_interval):
return ComparisonStatus.UNDECIDED
return ComparisonStatus.SAME
def confirm_same_with_clock_rate(ref_timing, cmp_timing, ref_interval, cmp_interval):
if ref_timing.sm_clock_rate_mean is None or cmp_timing.sm_clock_rate_mean is None:
return ComparisonStatus.SAME
ref_cycles = scale_interval(ref_interval, ref_timing.sm_clock_rate_mean)
cmp_cycles = scale_interval(cmp_interval, cmp_timing.sm_clock_rate_mean)
if ref_cycles is None or cmp_cycles is None:
return ComparisonStatus.UNDECIDED
return compare_intervals_for_same(ref_cycles, cmp_cycles)
def compare_timings_for_same(ref_timing, cmp_timing, ref_noise, cmp_noise):
if not has_finite_noise(ref_noise) or not has_finite_noise(cmp_noise):
return ComparisonStatus.UNDECIDED
if max(ref_noise, cmp_noise) > SAME_RELATIVE_DISPERSION_CEILING:
return ComparisonStatus.UNDECIDED
ref_interval = compute_timing_interval(ref_timing)
cmp_interval = compute_timing_interval(cmp_timing)
if ref_interval is None or cmp_interval is None:
return ComparisonStatus.UNDECIDED
status = compare_intervals_for_same(ref_interval, cmp_interval)
if status != ComparisonStatus.SAME:
return status
return confirm_same_with_clock_rate(
ref_timing, cmp_timing, ref_interval, cmp_interval
)
def has_robust_estimate(summary):
return summary.median is not None and (
summary.interquartile_range_relative is not None
@@ -705,6 +791,8 @@ def compare_gpu_timings(ref_timing, cmp_timing):
max_noise = max(ref_noise, cmp_noise)
status = compare_timings_for_clear_gap(ref_timing, cmp_timing)
if status == ComparisonStatus.UNDECIDED:
status = compare_timings_for_same(ref_timing, cmp_timing, ref_noise, cmp_noise)
return SummaryComparison(
ref_estimate=ref_estimate,
@@ -1535,14 +1623,9 @@ def main() -> int:
print("# Summary\n")
print(f"- Total Matches: {stats.config_count}")
print(f" - Pass (abs(%Diff) <= max_noise): {stats.pass_count}")
print(
" - Improvement (abs(%Diff) > max_noise, %Diff < 0): "
f"{stats.improvement_count}"
)
print(
f" - Regression (abs(%Diff) > max_noise, %Diff > 0): {stats.regression_count}"
)
print(f" - Pass (centers close and intervals overlap): {stats.pass_count}")
print(f" - Improvement (clear timing gap, %Diff < 0): {stats.improvement_count}")
print(f" - Regression (clear timing gap, %Diff > 0): {stats.regression_count}")
print(
f" - Undecided (comparison requires more evidence): {stats.undecided_count}"
)