# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import importlib.util import sys import types from pathlib import Path import numpy as np import pytest @pytest.fixture def nvbench_compare(monkeypatch): class DummyLine: def get_color(self): return "black" pyplot = types.ModuleType("matplotlib.pyplot") pyplot.figure = lambda *args, **kwargs: None pyplot.xscale = lambda *args, **kwargs: None pyplot.yscale = lambda *args, **kwargs: None pyplot.xlabel = lambda *args, **kwargs: None pyplot.ylabel = lambda *args, **kwargs: None pyplot.title = lambda *args, **kwargs: None pyplot.plot = lambda *args, **kwargs: [DummyLine()] pyplot.fill_between = lambda *args, **kwargs: None pyplot.legend = lambda *args, **kwargs: None pyplot.show = lambda *args, **kwargs: None pyplot.close = lambda *args, **kwargs: None matplotlib = types.ModuleType("matplotlib") matplotlib.pyplot = pyplot monkeypatch.setitem(sys.modules, "matplotlib", matplotlib) monkeypatch.setitem(sys.modules, "matplotlib.pyplot", pyplot) monkeypatch.setitem( sys.modules, "seaborn", types.SimpleNamespace(set_theme=lambda *args, **kwargs: None), ) monkeypatch.setitem( sys.modules, "jsondiff", types.SimpleNamespace(diff=lambda *args, **kwargs: {}) ) monkeypatch.setitem( sys.modules, "tabulate", types.SimpleNamespace( __version__="0.8.10", tabulate=lambda *args, **kwargs: "" ), ) monkeypatch.setitem( sys.modules, "colorama", types.SimpleNamespace( Fore=types.SimpleNamespace( BLUE="", GREEN="", LIGHTBLACK_EX="", RED="", RESET="", YELLOW="", ) ), ) module_path = Path(__file__).resolve().parents[1] / "scripts" / "nvbench_compare.py" spec = importlib.util.spec_from_file_location("nvbench_compare", module_path) assert spec is not None assert spec.loader is not None module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) return module def make_state( nvbench_compare, name, *, mean="1.0", noise="0.01", axis_value=None, device=0 ): return { "name": name, "device": device, "axis_values": [] if axis_value is None else [{"name": "A", "type": "int64", "value": axis_value}], "summaries": [ { "tag": nvbench_compare.GPU_TIME_MEAN_TAG, "data": [{"name": "value", "type": "float64", "value": mean}], }, { "tag": nvbench_compare.GPU_TIME_STDEV_RELATIVE_TAG, "data": [{"name": "value", "type": "float64", "value": noise}], }, ], } def make_summary(nvbench_compare, tag, value): return { "tag": getattr(nvbench_compare, tag), "data": [{"name": "value", "type": "float64", "value": value}], } def make_binary_summary(nvbench_compare, tag, filename, size): return { "tag": getattr(nvbench_compare, tag), "data": [ {"name": "filename", "type": "string", "value": filename}, {"name": "size", "type": "int64", "value": str(size)}, ], } def make_gpu_timing_data( nvbench_compare, *, minimum=None, maximum=None, mean=1.0, stdev=None, stdev_relative=0.01, first_quartile=None, median=None, third_quartile=None, interquartile_range=None, interquartile_range_relative=None, sm_clock_rate_mean=None, sample_values=None, frequency_values=None, ): return nvbench_compare.GpuTimingData( minimum=minimum, maximum=maximum, mean=mean, stdev=stdev, stdev_relative=stdev_relative, first_quartile=first_quartile, median=median, third_quartile=third_quartile, interquartile_range=interquartile_range, interquartile_range_relative=interquartile_range_relative, sm_clock_rate_mean=sm_clock_rate_mean, sample_source=None if sample_values is None else types.SimpleNamespace(values=np.asarray(sample_values, dtype=np.float32)), frequency_source=None if frequency_values is None else types.SimpleNamespace( values=np.asarray(frequency_values, dtype=np.float32) ), ) def make_benchmark(states, *, name="bench"): devices = [] for state in states: if state["device"] not in devices: devices.append(state["device"]) return { "name": name, "devices": devices, "axes": [{"name": "A", "type": "int64", "flags": ""}] if any(state["axis_values"] for state in states) else [], "states": states, } def make_comparison_run_data(nvbench_compare, ref_devices=None, cmp_devices=None): devices = [{"id": 0, "name": "Test GPU"}] return nvbench_compare.ComparisonRunData( stats=nvbench_compare.ComparisonStats(), ref_devices=tuple(devices if ref_devices is None else ref_devices), cmp_devices=tuple(devices if cmp_devices is None else cmp_devices), ) def make_filter_plan(nvbench_compare, filter_actions=None): return nvbench_compare.build_benchmark_filter_plan(filter_actions or []) def test_compare_benches_accepts_matching_duplicate_state_counts( monkeypatch, nvbench_compare ): run_data = make_comparison_run_data(nvbench_compare) ref_benches = [ make_benchmark( [ make_state(nvbench_compare, "state1"), make_state(nvbench_compare, "state1"), make_state(nvbench_compare, "state2"), ] ) ] cmp_benches = [ make_benchmark( [ make_state(nvbench_compare, "state1", mean="1.005"), make_state(nvbench_compare, "state1", mean="1.005"), make_state(nvbench_compare, "state2", mean="1.005"), ] ) ] nvbench_compare.compare_benches( run_data, ref_benches, cmp_benches, threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, ) assert run_data.stats.config_count == 3 assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 assert run_data.stats.undecided_count == 3 assert run_data.stats.unknown_count == 0 def test_compare_benches_rejects_swapped_duplicate_state_counts( monkeypatch, nvbench_compare ): run_data = make_comparison_run_data(nvbench_compare) ref_benches = [ make_benchmark( [ make_state(nvbench_compare, "state1"), make_state(nvbench_compare, "state1"), make_state(nvbench_compare, "state1"), make_state(nvbench_compare, "state2"), make_state(nvbench_compare, "state2"), ] ) ] cmp_benches = [ make_benchmark( [ make_state(nvbench_compare, "state1"), make_state(nvbench_compare, "state1"), make_state(nvbench_compare, "state2"), make_state(nvbench_compare, "state2"), make_state(nvbench_compare, "state2"), ] ) ] with pytest.raises(ValueError, match="mismatched state occurrences"): nvbench_compare.compare_benches( run_data, ref_benches, cmp_benches, threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, ) def test_compare_benches_matches_duplicate_states_after_axis_filter( monkeypatch, nvbench_compare ): run_data = make_comparison_run_data(nvbench_compare) ref_benches = [ make_benchmark( [ make_state(nvbench_compare, "state", mean="1.0", axis_value=1), make_state(nvbench_compare, "state", mean="2.0", axis_value=2), ] ) ] cmp_benches = [ make_benchmark( [ make_state(nvbench_compare, "state", mean="2.0", axis_value=2), make_state(nvbench_compare, "state", mean="1.0", axis_value=1), ] ) ] nvbench_compare.compare_benches( run_data, ref_benches, cmp_benches, threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare, [("axis", "A=2")]), no_color=True, ) assert run_data.stats.config_count == 1 assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 assert run_data.stats.undecided_count == 1 assert run_data.stats.unknown_count == 0 def test_compare_benches_skips_non_finite_centers(monkeypatch, nvbench_compare): run_data = make_comparison_run_data(nvbench_compare) ref_benches = [ make_benchmark( [ make_state(nvbench_compare, "finite", mean="1.0"), make_state(nvbench_compare, "nan", mean="nan"), make_state(nvbench_compare, "inf", mean="inf"), ] ) ] cmp_benches = [ make_benchmark( [ make_state(nvbench_compare, "finite", mean="1.0"), make_state(nvbench_compare, "nan", mean="1.0"), make_state(nvbench_compare, "inf", mean="1.0"), ] ) ] nvbench_compare.compare_benches( run_data, ref_benches, cmp_benches, threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, ) assert run_data.stats.config_count == 1 assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 assert run_data.stats.undecided_count == 1 assert run_data.stats.unknown_count == 0 def test_gpu_timing_data_loads_samples_and_frequencies_lazily( tmp_path, nvbench_compare ): samples_dir = tmp_path / "result.json-bin" freqs_dir = tmp_path / "result.json-freqs-bin" samples_dir.mkdir() freqs_dir.mkdir() samples_file = samples_dir / "0.bin" freqs_file = freqs_dir / "0.bin" np.array([1.0, 2.0, 4.0], dtype="= +7.7%" ) def test_format_change_only_reports_fast_and_slow_rows(nvbench_compare): fast = types.SimpleNamespace( status=nvbench_compare.ComparisonStatus.FAST, frac_diff_interval=(-0.3, -0.05), ) slow = types.SimpleNamespace( status=nvbench_compare.ComparisonStatus.SLOW, frac_diff_interval=(0.07, 0.55), ) same = types.SimpleNamespace( status=nvbench_compare.ComparisonStatus.SAME, frac_diff_interval=(-0.01, 0.01), ) undecided = types.SimpleNamespace( status=nvbench_compare.ComparisonStatus.UNDECIDED, frac_diff_interval=(-0.01, 0.01), ) assert nvbench_compare.format_change(fast) == "<= -5.0%" assert nvbench_compare.format_change(slow) == ">= +7.0%" assert nvbench_compare.format_change(same) == "" assert nvbench_compare.format_change(undecided) == "" def test_ambiguous_status_uses_shrug_marker(nvbench_compare): assert ( nvbench_compare.colorize_comparison_status( nvbench_compare.ComparisonStatus.UNDECIDED, no_color=True ) == "\U0001f937 AMBG" ) def test_format_timing_with_interval(nvbench_compare): interval = nvbench_compare.TimingInterval( lower=0.002237, upper=0.002389, center=0.0023 ) assert ( nvbench_compare.format_timing_with_interval(0.0023, interval) == "2.300 ms [-63, +89] us" ) interval = nvbench_compare.TimingInterval( lower=19.380e-6, upper=20.508e-6, center=19.944e-6 ) assert ( nvbench_compare.format_timing_with_interval(19.944e-6, interval) == "19.944 [-0.564, +0.564] us" ) def test_format_timing_with_explicit_interval(nvbench_compare): interval = nvbench_compare.TimingInterval( lower=0.001434, upper=0.001458, center=0.001446 ) assert ( nvbench_compare.format_timing_with_explicit_interval(0.001446, interval) == "1.4[34 | 46 | 58] ms" ) interval = nvbench_compare.TimingInterval( lower=18.400e-6, upper=19.464e-6, center=18.736e-6 ) assert ( nvbench_compare.format_timing_with_explicit_interval(18.736e-6, interval) == "[18.400 | 18.736 | 19.464] us" ) interval = nvbench_compare.TimingInterval( lower=19.380e-6, upper=20.508e-6, center=19.944e-6 ) assert ( nvbench_compare.format_timing_with_explicit_interval(19.944e-6, interval) == "[19.380 | 19.944 | 20.508] us" ) interval = nvbench_compare.TimingInterval( lower=99.094e-6, upper=100.882e-6, center=99.988e-6 ) assert ( nvbench_compare.format_timing_with_explicit_interval(99.988e-6, interval) == "[ 99.094 | 99.988 | 100.882] us" ) def test_align_explain_interval_columns_pads_values_across_rows(nvbench_compare): rows = [["", ""], ["", ""]] comparisons = [ types.SimpleNamespace( ref_time=19.944e-6, ref_interval=nvbench_compare.TimingInterval( lower=19.380e-6, center=19.944e-6, upper=20.508e-6 ), cmp_time=97.712e-6, cmp_interval=nvbench_compare.TimingInterval( lower=96.849e-6, center=97.712e-6, upper=98.574e-6 ), ), types.SimpleNamespace( ref_time=103.466e-6, ref_interval=nvbench_compare.TimingInterval( lower=102.739e-6, center=103.466e-6, upper=104.193e-6 ), cmp_time=101.868e-6, cmp_interval=nvbench_compare.TimingInterval( lower=100.916e-6, center=101.868e-6, upper=102.819e-6 ), ), ] nvbench_compare.align_explain_interval_columns(rows, comparisons, axis_count=0) assert rows[0][0] == "[ 19.380 | 19.944 | 20.508] us" assert rows[1][0] == "[102.739 | 103.466 | 104.193] us" assert rows[0][1] == "[ 96.849 | 97.712 | 98.574] us" assert rows[1][1] == "[100.916 | 101.868 | 102.819] us" def test_align_timing_interval_columns_reserves_missing_interval_slot(nvbench_compare): rows = [["", ""], ["", ""]] comparisons = [ types.SimpleNamespace( ref_time=19.944e-6, ref_interval=nvbench_compare.TimingInterval( lower=19.380e-6, center=19.944e-6, upper=20.508e-6 ), cmp_time=18.736e-6, cmp_interval=nvbench_compare.TimingInterval( lower=18.400e-6, center=18.736e-6, upper=19.464e-6 ), ), types.SimpleNamespace( ref_time=20.390e-6, ref_interval=nvbench_compare.TimingInterval( lower=19.659e-6, center=20.390e-6, upper=21.121e-6 ), cmp_time=20.480e-6, cmp_interval=None, ), ] nvbench_compare.align_timing_interval_columns(rows, comparisons, axis_count=0) cmp_interval_slot = len("[-0.336, +0.728]") assert rows[0][1] == "18.736 [-0.336, +0.728] us" assert rows[1][1] == f"20.480 {' ' * cmp_interval_slot} us" def test_compare_gpu_timings_keeps_bulk_mismatch_undecided(nvbench_compare): ref_timing = make_gpu_timing_data( nvbench_compare, minimum=1.0, first_quartile=1.1, median=1.2, third_quartile=1.3, mean=1.2, interquartile_range_relative=0.01, sample_values=[1.0, 1.0, 1.004, 1.004], frequency_values=[100.0] * 4, ) cmp_timing = make_gpu_timing_data( nvbench_compare, minimum=1.02, first_quartile=1.1, median=1.204, third_quartile=1.28, mean=1.204, interquartile_range_relative=0.01, sample_values=[1.02, 1.02, 1.024, 1.024], frequency_values=[100.0] * 4, ) comparison = nvbench_compare.compare_gpu_timings(ref_timing, cmp_timing) assert comparison is not None assert comparison.status == nvbench_compare.ComparisonStatus.UNDECIDED assert comparison.reason.code == "bulk_time_support_mismatch" assert "sample: min(ref=0.0%, cmp=0.0%) >= 99.0%" in comparison.reason.message assert "support: min(ref=0.0%, cmp=0.0%) >= 80.0%" in comparison.reason.message assert "99.0%" in comparison.reason.message assert "80.0%" in comparison.reason.message def test_compare_gpu_timings_requires_bulk_cycle_coverage(nvbench_compare): ref_timing = make_gpu_timing_data( nvbench_compare, mean=1.0, stdev_relative=0.01, sample_values=[1.0, 1.0, 1.004, 1.004], frequency_values=[100.0] * 4, ) cmp_timing = make_gpu_timing_data( nvbench_compare, mean=1.0, stdev_relative=0.01, sample_values=[1.0, 1.0, 1.004, 1.004], frequency_values=[200.0] * 4, ) comparison = nvbench_compare.compare_gpu_timings(ref_timing, cmp_timing) assert comparison is not None assert comparison.status == nvbench_compare.ComparisonStatus.UNDECIDED assert comparison.reason.code == "bulk_cycle_support_mismatch" def test_bulk_same_reports_sample_weight_coverage_mismatch(nvbench_compare): ref_values = [1.0, 1.001, 1.002, 1.003] + [1.02] * 100 cmp_values = [1.0, 1.001, 1.002, 1.003] decision = nvbench_compare.compare_values_for_bulk_same( ref_values, cmp_values, label="time", thresholds=nvbench_compare.ComparisonThresholds(), ) assert decision.status == nvbench_compare.ComparisonStatus.UNDECIDED assert decision.reason.code == "bulk_time_support_mismatch" assert "sample: min(ref=3.8%, cmp=100.0%) >= 99.0%" in decision.reason.message assert "support: min(ref=80.0%, cmp=100.0%) >= 80.0%" in decision.reason.message def test_bulk_same_filters_rare_values_from_support_coverage(nvbench_compare): ref_values = [1.0] * 1000 + [1.02 + 0.01 * i for i in range(10)] cmp_values = [1.0] decision = nvbench_compare.compare_values_for_bulk_same( ref_values, cmp_values, label="time", thresholds=nvbench_compare.ComparisonThresholds(), ) assert decision.status == nvbench_compare.ComparisonStatus.SAME assert decision.reason.code == "bulk_time_same" def test_bulk_same_reports_unique_support_coverage_mismatch(nvbench_compare): ref_values = [1.0] * 1000 + [1.02 + 0.01 * i for i in range(10)] cmp_values = [1.0] decision = nvbench_compare.compare_values_for_bulk_same( ref_values, cmp_values, label="time", thresholds=nvbench_compare.ComparisonThresholds( bulk_support_max_removed_sample_fraction=0.005 ), ) assert decision.status == nvbench_compare.ComparisonStatus.UNDECIDED assert decision.reason.code == "bulk_time_support_mismatch" assert "sample: min(ref=99.0%, cmp=100.0%) >= 99.0%" in decision.reason.message assert "support: min(ref=9.1%, cmp=100.0%) >= 80.0%" in decision.reason.message def test_bulk_same_retains_full_support_when_all_values_are_unique(nvbench_compare): coverages = nvbench_compare.compute_nearest_neighbor_coverages( [1.0, 1.02], [1.0], thresholds=nvbench_compare.ComparisonThresholds( bulk_support_rare_sample_fraction=1.0, bulk_support_max_removed_sample_fraction=1.0, ), ) assert coverages is not None assert coverages["ref_sample"] == 0.5 assert coverages["ref_support"] == 0.5 assert coverages["ref_support_filter"] == nvbench_compare.SupportFilterInfo( activated=False, reason="all_values_unique", removed_sample_fraction=0.0, ) def test_comparison_stats_records_undecided_status(nvbench_compare): stats = nvbench_compare.ComparisonStats() stats.record(nvbench_compare.ComparisonStatus.UNDECIDED) assert stats.config_count == 1 assert stats.pass_count == 0 assert stats.improvement_count == 0 assert stats.regression_count == 0 assert stats.undecided_count == 1 assert stats.unknown_count == 0 def test_comparison_stats_records_undecided_reason(nvbench_compare): stats = nvbench_compare.ComparisonStats() less_severe_reason = nvbench_compare.DecisionReason( code="test_reason", message="less severe reason", severity=1.0, ) more_severe_reason = nvbench_compare.DecisionReason( code="test_reason", message="more severe reason", severity=2.0, ) stats.record(nvbench_compare.ComparisonStatus.UNDECIDED, less_severe_reason) stats.record(nvbench_compare.ComparisonStatus.UNDECIDED, more_severe_reason) summary = stats.undecided_reasons["test_reason"] assert summary.count == 2 assert summary.message == "more severe reason" def test_reason_legend_omits_trivial_aliases(nvbench_compare): reason_legend = { "bulk-same": nvbench_compare.DecisionReasonSummary(canonical_code="bulk_same"), "bt-sup-miss": nvbench_compare.DecisionReasonSummary( canonical_code="bulk_time_support_mismatch" ), } assert nvbench_compare.format_reason_legend_entries(reason_legend) == [ "bt-sup-miss = bulk_time_support_mismatch" ] @pytest.mark.parametrize("ref_time, cmp_time", [(None, 1.0), (1.0, None), (0.0, 1.0)]) def test_compare_gpu_timings_rejects_unusable_centers( nvbench_compare, ref_time, cmp_time ): assert ( nvbench_compare.compare_gpu_timings( make_gpu_timing_data(nvbench_compare, mean=ref_time), make_gpu_timing_data(nvbench_compare, mean=cmp_time), ) is None ) def test_compare_benches_reports_regression_when_robust_intervals_and_clock_confirm( monkeypatch, nvbench_compare ): run_data = make_comparison_run_data(nvbench_compare) ref_state = make_state(nvbench_compare, "state", mean="1.0", noise="0.01") ref_state["summaries"].extend( [ make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "0.9"), make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "0.95"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.0"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.05"), make_summary(nvbench_compare, "GPU_TIME_IQR_RELATIVE_TAG", "0.01"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] ) cmp_state = make_state(nvbench_compare, "state", mean="1.0", noise="0.01") cmp_state["summaries"].extend( [ make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "1.15"), make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.18"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.2"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.25"), make_summary(nvbench_compare, "GPU_TIME_IQR_RELATIVE_TAG", "0.01"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] ) nvbench_compare.compare_benches( run_data, [make_benchmark([ref_state])], [make_benchmark([cmp_state])], threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, ) assert run_data.stats.config_count == 1 assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 1 assert run_data.stats.undecided_count == 0 assert run_data.stats.unknown_count == 0 def test_compare_benches_accepts_custom_comparison_thresholds( monkeypatch, nvbench_compare ): run_data = make_comparison_run_data(nvbench_compare) ref_state = make_state(nvbench_compare, "state", mean="1.0", noise="0.01") ref_state["summaries"].extend( [ make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "0.99"), make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "0.995"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.0"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.01"), make_summary(nvbench_compare, "GPU_TIME_IQR_RELATIVE_TAG", "0.01"), ] ) cmp_state = make_state(nvbench_compare, "state", mean="1.01", noise="0.01") cmp_state["summaries"].extend( [ make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "1.0"), make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.005"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.01"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.02"), make_summary(nvbench_compare, "GPU_TIME_IQR_RELATIVE_TAG", "0.01"), ] ) nvbench_compare.compare_benches( run_data, [make_benchmark([ref_state])], [make_benchmark([cmp_state])], threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, comparison_thresholds=nvbench_compare.ComparisonThresholds( same_center_relative=0.02 ), ) assert run_data.stats.config_count == 1 assert run_data.stats.pass_count == 1 assert run_data.stats.undecided_count == 0 def test_compare_benches_marks_unavailable_noise_undecided( monkeypatch, nvbench_compare ): run_data = make_comparison_run_data(nvbench_compare) missing_noise_ref = make_state(nvbench_compare, "missing_noise") missing_noise_ref["summaries"] = [ make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", "1.0") ] missing_noise_cmp = make_state(nvbench_compare, "missing_noise") missing_noise_cmp["summaries"] = [ make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", "1.001") ] null_noise_ref = make_state(nvbench_compare, "null_noise") null_noise_ref["summaries"] = [ make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", "1.0"), make_summary(nvbench_compare, "GPU_TIME_STDEV_RELATIVE_TAG", None), ] null_noise_cmp = make_state(nvbench_compare, "null_noise") null_noise_cmp["summaries"] = [ make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", "1.001"), make_summary(nvbench_compare, "GPU_TIME_STDEV_RELATIVE_TAG", None), ] nvbench_compare.compare_benches( run_data, [make_benchmark([missing_noise_ref, null_noise_ref])], [make_benchmark([missing_noise_cmp, null_noise_cmp])], threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, ) assert run_data.stats.config_count == 2 assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 assert run_data.stats.undecided_count == 2 assert run_data.stats.unknown_count == 0 def test_plot_along_skips_states_without_selected_axis(monkeypatch, nvbench_compare): run_data = make_comparison_run_data(nvbench_compare) ref_benches = [ make_benchmark( [ make_state(nvbench_compare, "with_axis", axis_value=1), make_state(nvbench_compare, "without_axis"), ] ) ] cmp_benches = [ make_benchmark( [ make_state(nvbench_compare, "with_axis", axis_value=1), make_state(nvbench_compare, "without_axis"), ] ) ] nvbench_compare.compare_benches( run_data, ref_benches, cmp_benches, threshold=0.0, plot_along="A", plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, ) assert run_data.stats.config_count == 2 assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 assert run_data.stats.undecided_count == 2 assert run_data.stats.unknown_count == 0 def test_device_filter_parser_accepts_all_and_duplicate_ids(nvbench_compare): assert nvbench_compare.parse_device_filter(" all ", "--reference-devices") is None assert nvbench_compare.parse_device_filter("0", "--reference-devices") == [0] assert nvbench_compare.parse_device_filter("0, 2,0", "--reference-devices") == [ 0, 2, 0, ] @pytest.mark.parametrize( "device_arg", [ "", " ", "gpu", "-1", "0,gpu", "0,-1", "0,", ",0", ], ) def test_device_filter_parser_rejects_invalid_values(nvbench_compare, device_arg): with pytest.raises(ValueError, match="must be 'all'"): nvbench_compare.parse_device_filter(device_arg, "--reference-devices") def test_explicit_device_filters_downgrade_device_mismatch_to_warning(nvbench_compare): assert nvbench_compare.require_matching_device_sections(None, None) assert not nvbench_compare.require_matching_device_sections([0], None) assert not nvbench_compare.require_matching_device_sections(None, [1]) assert not nvbench_compare.require_matching_device_sections([0], [1]) def test_compare_benches_pairs_filtered_devices_by_position( monkeypatch, nvbench_compare ): run_data = make_comparison_run_data( nvbench_compare, ref_devices=[ {"id": 0, "name": "Reference GPU 0"}, {"id": 1, "name": "Reference GPU 1"}, ], cmp_devices=[ {"id": 0, "name": "Compare GPU 0"}, {"id": 1, "name": "Compare GPU 1"}, ], ) ref_benches = [ make_benchmark( [ make_state(nvbench_compare, "Device=0", mean="1.0", device=0), make_state(nvbench_compare, "Device=1", mean="9.0", device=1), ] ) ] cmp_benches = [ make_benchmark( [ make_state(nvbench_compare, "Device=0", mean="9.0", device=0), make_state(nvbench_compare, "Device=1", mean="1.0", device=1), ] ) ] nvbench_compare.compare_benches( run_data, ref_benches, cmp_benches, threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, reference_device_filter=[0], compare_device_filter=[1], ) assert run_data.stats.config_count == 1 assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 assert run_data.stats.undecided_count == 1 assert run_data.stats.unknown_count == 0 def test_axis_filter_applies_to_most_recent_benchmark(monkeypatch, nvbench_compare): run_data = make_comparison_run_data(nvbench_compare) ref_benches = [ make_benchmark( [ make_state(nvbench_compare, "state", mean="1.0", axis_value=1), make_state(nvbench_compare, "state", mean="2.0", axis_value=2), ], name="bench1", ), make_benchmark( [ make_state(nvbench_compare, "state", mean="3.0", axis_value=1), make_state(nvbench_compare, "state", mean="4.0", axis_value=2), ], name="bench2", ), ] cmp_benches = [ make_benchmark( [ make_state(nvbench_compare, "state", mean="1.0", axis_value=1), make_state(nvbench_compare, "state", mean="2.0", axis_value=2), ], name="bench1", ), make_benchmark( [ make_state(nvbench_compare, "state", mean="3.0", axis_value=1), make_state(nvbench_compare, "state", mean="4.0", axis_value=2), ], name="bench2", ), ] nvbench_compare.compare_benches( run_data, ref_benches, cmp_benches, threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan( nvbench_compare, [("benchmark", "bench1"), ("axis", "A=2"), ("benchmark", "bench2")], ), no_color=True, ) assert run_data.stats.config_count == 3 assert run_data.stats.pass_count == 0 assert run_data.stats.improvement_count == 0 assert run_data.stats.regression_count == 0 assert run_data.stats.undecided_count == 3 assert run_data.stats.unknown_count == 0 def test_main_returns_success_exit_code_when_regressions_are_detected( monkeypatch, capsys, nvbench_compare ): devices = [{"id": 0, "name": "Test GPU"}] ref_state = make_state(nvbench_compare, "state", mean="1.0") ref_state["summaries"].extend( [ make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "0.9"), make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "0.95"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.0"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.05"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] ) cmp_state = make_state(nvbench_compare, "state", mean="1.2") cmp_state["summaries"].extend( [ make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "1.15"), make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.18"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.2"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.25"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] ) ref_root = { "devices": devices, "benchmarks": [make_benchmark([ref_state])], } cmp_root = { "devices": devices, "benchmarks": [make_benchmark([cmp_state])], } def read_file(path): return ref_root if path == "ref.json" else cmp_root monkeypatch.setattr(nvbench_compare.reader, "read_file", read_file) monkeypatch.setattr(sys, "argv", ["nvbench_compare", "ref.json", "cmp.json"]) assert nvbench_compare.main() == 0 assert "Regression (clear timing gap, %Diff > 0): 1" in capsys.readouterr().out def test_main_prints_undecided_reason_summary(monkeypatch, capsys, nvbench_compare): devices = [{"id": 0, "name": "Test GPU"}] ref_root = { "devices": devices, "benchmarks": [ make_benchmark([make_state(nvbench_compare, "state", noise="0.05")]) ], } cmp_root = { "devices": devices, "benchmarks": [ make_benchmark( [make_state(nvbench_compare, "state", mean="1.01", noise="0.05")] ) ], } def read_file(path): return ref_root if path == "ref.json" else cmp_root monkeypatch.setattr(nvbench_compare.reader, "read_file", read_file) monkeypatch.setattr( sys, "argv", ["nvbench_compare", "--display", "explain", "ref.json", "cmp.json"] ) assert nvbench_compare.main() == 0 output = capsys.readouterr().out assert "Ambiguous (comparison requires more evidence): 1" in output assert "noise_too_high: 1" in output assert "Reason legend: noise-high = noise_too_high" in output def test_get_comparison_thresholds_returns_named_presets(nvbench_compare): default = nvbench_compare.get_comparison_thresholds("default") strict = nvbench_compare.get_comparison_thresholds("strict") permissive = nvbench_compare.get_comparison_thresholds("permissive") assert default == nvbench_compare.ComparisonThresholds( **nvbench_compare.COMPARISON_THRESHOLD_PRESET_VALUES["default"] ) assert strict.clear_gap_relative > default.clear_gap_relative assert strict.same_center_relative < default.same_center_relative assert strict.bulk_same_sample_coverage > default.bulk_same_sample_coverage assert permissive.clear_gap_relative < default.clear_gap_relative assert permissive.same_center_relative > default.same_center_relative assert permissive.bulk_same_support_coverage < default.bulk_same_support_coverage def test_dump_comparison_config_uses_grouped_toml(nvbench_compare): config = nvbench_compare.dump_comparison_config( "default", nvbench_compare.get_comparison_thresholds("default") ) assert "version = 1\n" in config assert '[preset]\nname = "default"\n' in config assert "[clear_gap]\nrelative = 0.005\n" in config assert "[same]\n" in config assert "[bulk]\n" in config assert "sample_coverage = 0.97\n" in config assert "[bulk.rare_support]\n" in config def test_resolve_comparison_thresholds_applies_config_overrides( monkeypatch, nvbench_compare ): def read_config(_): return ( "strict", { "bulk_same_sample_coverage": 0.93, "bulk_support_max_removed_sample_fraction": 0.02, }, ) monkeypatch.setattr(nvbench_compare, "read_comparison_config_file", read_config) preset, thresholds = nvbench_compare.resolve_comparison_thresholds( None, "settings.toml" ) assert preset == "strict" assert thresholds.clear_gap_relative == pytest.approx( nvbench_compare.get_comparison_thresholds("strict").clear_gap_relative ) assert thresholds.bulk_same_sample_coverage == pytest.approx(0.93) assert thresholds.bulk_support_max_removed_sample_fraction == pytest.approx(0.02) preset, thresholds = nvbench_compare.resolve_comparison_thresholds( "permissive", "settings.toml" ) assert preset == "permissive" assert thresholds.clear_gap_relative == pytest.approx( nvbench_compare.get_comparison_thresholds("permissive").clear_gap_relative ) assert thresholds.bulk_same_sample_coverage == pytest.approx(0.93) assert thresholds.bulk_support_max_removed_sample_fraction == pytest.approx(0.02) def test_parse_comparison_config_data_validates_grouped_thresholds(nvbench_compare): preset, overrides = nvbench_compare.parse_comparison_config_data( { "version": 1, "preset": {"name": "strict"}, "clear_gap": {"relative": 0.01}, "same": { "center_relative": 0.002, "overlap_fraction": 0.75, "relative_dispersion_ceiling": 0.02, }, "bulk": { "sample_coverage": 0.99, "support_coverage": 0.8, "rare_support": { "sample_fraction": 0.001, "max_removed_sample_fraction": 0.01, }, }, } ) assert preset == "strict" assert overrides == { "clear_gap_relative": 0.01, "same_center_relative": 0.002, "same_overlap_fraction": 0.75, "same_relative_dispersion_ceiling": 0.02, "bulk_same_sample_coverage": 0.99, "bulk_same_support_coverage": 0.8, "bulk_support_rare_sample_fraction": 0.001, "bulk_support_max_removed_sample_fraction": 0.01, } @pytest.mark.parametrize( "config_data, match", [ ({}, "version"), ({"version": 2}, "unsupported"), ({"version": 1, "rare_support": {}}, "unknown top-level"), ({"version": 1, "bulk": {"unknown": 0.1}}, r"\[bulk\]"), ({"version": 1, "clear_gap": {"rare_support": {}}}, r"\[clear_gap\]"), ({"version": 1, "bulk": {"sample_coverage": 1.5}}, "<= 1"), ({"version": 1, "same": {"center_relative": "tight"}}, "finite number"), ({"version": 1, "preset": {"name": "aggressive"}}, "unknown comparison preset"), ], ) def test_parse_comparison_config_data_rejects_invalid_config( nvbench_compare, config_data, match ): with pytest.raises(ValueError, match=match): nvbench_compare.parse_comparison_config_data(config_data) def test_read_comparison_config_file_parses_toml_when_parser_is_available( tmp_path, nvbench_compare ): parser_module = "tomllib" if sys.version_info >= (3, 11) else "tomli" pytest.importorskip(parser_module) config_path = tmp_path / "settings.toml" config_path.write_text( """ version = 1 [preset] name = "strict" [bulk] sample_coverage = 0.93 """, encoding="utf-8", ) preset, overrides = nvbench_compare.read_comparison_config_file(config_path) assert preset == "strict" assert overrides == {"bulk_same_sample_coverage": 0.93} def test_main_dump_config_does_not_require_input_files( monkeypatch, capsys, nvbench_compare ): def read_file(_): raise AssertionError("dump-config should not read JSON files") monkeypatch.setattr(nvbench_compare.reader, "read_file", read_file) monkeypatch.setattr( sys, "argv", ["nvbench_compare", "--preset", "strict", "--dump-config"], ) assert nvbench_compare.main() == 0 output = capsys.readouterr().out assert 'name = "strict"' in output assert "[bulk.rare_support]" in output def test_main_dump_config_merges_config_and_cli_preset( monkeypatch, capsys, nvbench_compare ): def read_config(_): return ("strict", {"bulk_same_sample_coverage": 0.93}) monkeypatch.setattr(nvbench_compare, "read_comparison_config_file", read_config) monkeypatch.setattr( sys, "argv", [ "nvbench_compare", "--config", "settings.toml", "--preset", "permissive", "--dump-config", ], ) assert nvbench_compare.main() == 0 output = capsys.readouterr().out assert 'name = "permissive"' in output assert "relative = 0.0025" in output assert "sample_coverage = 0.93" in output def test_main_prints_bulk_debug_python_to_stdout(monkeypatch, capsys, nvbench_compare): devices = [{"id": 0, "name": "Test GPU"}] root = { "devices": devices, "benchmarks": [], } monkeypatch.setattr(nvbench_compare.reader, "read_file", lambda _: root) def fake_compare_benches(*args, **kwargs): kwargs["bulk_debug_rows"].append( { "row_index": 0, "status": "AMBG", "reference_sample_filename": None, "reference_sample_count": None, "reference_frequency_filename": None, "reference_frequency_count": None, "compare_sample_filename": None, "compare_sample_count": None, "compare_frequency_filename": None, "compare_frequency_count": None, } ) monkeypatch.setattr(nvbench_compare, "compare_benches", fake_compare_benches) monkeypatch.setattr( sys, "argv", [ "nvbench_compare", "--bulk-debug-python", "STDOUT", "ref.json", "cmp.json", ], ) assert nvbench_compare.main() == 0 output = capsys.readouterr().out assert "# NVB-BULK-BEGIN" in output assert "bulk_rows = [" in output assert "'status': 'AMBG'" in output assert "def load_bulk_data(row):" in output assert "# NVB-BULK-END" in output def test_compare_benches_defaults_to_interval_display(monkeypatch, nvbench_compare): run_data = make_comparison_run_data(nvbench_compare) captured = {} def fake_tabulate(rows, headers, *args, **kwargs): captured["rows"] = rows captured["headers"] = headers return "" monkeypatch.setattr(nvbench_compare.tabulate, "tabulate", fake_tabulate) ref_benches = [make_benchmark([make_state(nvbench_compare, "state", mean="1.0")])] cmp_benches = [make_benchmark([make_state(nvbench_compare, "state", mean="1.01")])] nvbench_compare.compare_benches( run_data, ref_benches, cmp_benches, threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, ) assert captured["headers"][-4:] == ["Ref", "Cmp", "Change", "Status"] row = captured["rows"][0] assert row[-4].startswith("1.000 s") assert row[-3].startswith("1.010 s") assert row[-2] == "" def test_compare_benches_legacy_display_uses_scalar_diff(monkeypatch, nvbench_compare): run_data = make_comparison_run_data(nvbench_compare) captured = {} def fake_tabulate(rows, headers, *args, **kwargs): captured["rows"] = rows captured["headers"] = headers return "" monkeypatch.setattr(nvbench_compare.tabulate, "tabulate", fake_tabulate) ref_benches = [make_benchmark([make_state(nvbench_compare, "state", mean="1.0")])] cmp_benches = [make_benchmark([make_state(nvbench_compare, "state", mean="1.01")])] nvbench_compare.compare_benches( run_data, ref_benches, cmp_benches, threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, display="legacy", ) assert captured["headers"][-7:] == [ "Ref Time", "Ref Noise", "Cmp Time", "Cmp Noise", "Diff", "%Diff", "Status", ] row = captured["rows"][0] assert row[-7] == "1.000 s" assert row[-5] == "1.010 s" assert row[-3] == "10.000 ms" assert row[-2] == "1.00%" def test_compare_benches_explain_display_uses_explicit_intervals( monkeypatch, nvbench_compare ): run_data = make_comparison_run_data(nvbench_compare) captured = {} def fake_tabulate(rows, headers, *args, **kwargs): captured["rows"] = rows captured["headers"] = headers return "" monkeypatch.setattr(nvbench_compare.tabulate, "tabulate", fake_tabulate) ref_state = make_state(nvbench_compare, "state", mean="1.0") ref_state["summaries"].extend( [ make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "1.0"), make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.01"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.02"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.03"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] ) cmp_state = make_state(nvbench_compare, "state", mean="1.01") cmp_state["summaries"].extend( [ make_summary(nvbench_compare, "GPU_TIME_MIN_TAG", "1.01"), make_summary(nvbench_compare, "GPU_TIME_Q1_TAG", "1.02"), make_summary(nvbench_compare, "GPU_TIME_MEDIAN_TAG", "1.03"), make_summary(nvbench_compare, "GPU_TIME_Q3_TAG", "1.04"), make_summary(nvbench_compare, "GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"), ] ) nvbench_compare.compare_benches( run_data, [make_benchmark([ref_state])], [make_benchmark([cmp_state])], threshold=0.0, plot_along=None, plot=False, dark=False, filter_plan=make_filter_plan(nvbench_compare), no_color=True, display="explain", ) assert captured["headers"][-7:] == [ "Ref [Lo | Ce | Hi]", "Cmp [Lo | Ce | Hi]", "Ref Noise", "Cmp Noise", "Reason", "Change", "Status", ] row = captured["rows"][0] assert row[-7] == "1.0[00 | 20 | 30] s" assert row[-6] == "1.0[10 | 30 | 40] s" assert row[-3] == "centers-far" assert row[-2] == "" def test_main_passes_selected_preset_to_compare_benches(monkeypatch, nvbench_compare): devices = [{"id": 0, "name": "Test GPU"}] root = { "devices": devices, "benchmarks": [], } captured = {} monkeypatch.setattr(nvbench_compare.reader, "read_file", lambda _: root) def fake_compare_benches(*args, **kwargs): captured["comparison_thresholds"] = kwargs["comparison_thresholds"] captured["display"] = kwargs["display"] monkeypatch.setattr(nvbench_compare, "compare_benches", fake_compare_benches) monkeypatch.setattr( sys, "argv", [ "nvbench_compare", "--preset", "strict", "--display", "explain", "ref.json", "cmp.json", ], ) assert nvbench_compare.main() == 0 assert captured[ "comparison_thresholds" ] == nvbench_compare.get_comparison_thresholds("strict") assert captured["display"] == "explain"