mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-07-01 19:57:41 +00:00
Let argparse derive the program name from the actual invocation instead of hardcoding nvbench_compare, so help and error output match the installed nvbench-compare entry point. Declare comparison inputs as explicit positional arguments and use parse_args() instead of parse_known_args(). This preserves --dump-config without input files while rejecting unknown options through argparse rather than treating typoed flags as JSON paths. Add regression coverage for rejecting an unknown CLI option.
2302 lines
75 KiB
Python
2302 lines
75 KiB
Python
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
|
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
import importlib.util
|
|
import math
|
|
import sys
|
|
import types
|
|
from dataclasses import replace
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture
def nvbench_compare(monkeypatch):
    """Load ``scripts/nvbench_compare.py`` with stubbed third-party modules.

    Stand-ins for matplotlib, seaborn, jsondiff, tabulate and colorama are
    placed in ``sys.modules`` through ``monkeypatch`` before the script is
    executed.  The freshly executed module is registered under its spec name
    and returned to the test.
    """

    def _ignore(*args, **kwargs):
        # Shared no-op used for every plotting call the tests do not inspect.
        return None

    class DummyLine:
        # Minimal object returned (inside a list) by the fake ``pyplot.plot``.
        def get_color(self):
            return "black"

    fake_pyplot = types.ModuleType("matplotlib.pyplot")
    for attr_name in (
        "figure",
        "xscale",
        "yscale",
        "xlabel",
        "ylabel",
        "title",
        "fill_between",
        "legend",
        "show",
        "close",
    ):
        setattr(fake_pyplot, attr_name, _ignore)
    fake_pyplot.plot = lambda *args, **kwargs: [DummyLine()]

    fake_matplotlib = types.ModuleType("matplotlib")
    fake_matplotlib.pyplot = fake_pyplot

    color_names = ("BLUE", "GREEN", "LIGHTBLACK_EX", "RED", "RESET", "YELLOW")
    stub_modules = {
        "matplotlib": fake_matplotlib,
        "matplotlib.pyplot": fake_pyplot,
        "seaborn": types.SimpleNamespace(set_theme=_ignore),
        "jsondiff": types.SimpleNamespace(diff=lambda *args, **kwargs: {}),
        "tabulate": types.SimpleNamespace(
            __version__="0.8.10", tabulate=lambda *args, **kwargs: ""
        ),
        # Every colour code is the empty string.
        "colorama": types.SimpleNamespace(
            Fore=types.SimpleNamespace(**dict.fromkeys(color_names, ""))
        ),
    }
    for module_name, stub in stub_modules.items():
        monkeypatch.setitem(sys.modules, module_name, stub)

    # The script lives in ``scripts/`` one directory above this file's folder.
    script_path = Path(__file__).resolve().parents[1] / "scripts" / "nvbench_compare.py"
    spec = importlib.util.spec_from_file_location("nvbench_compare", script_path)
    assert spec is not None
    assert spec.loader is not None

    loaded = importlib.util.module_from_spec(spec)
    # Register before executing so the module is visible in sys.modules
    # while its own top-level code runs.
    monkeypatch.setitem(sys.modules, spec.name, loaded)
    spec.loader.exec_module(loaded)
    return loaded
|
|
|
|
|
|
def make_state(
    nvbench_compare, name, *, mean="1.0", noise="0.01", axis_value=None, device=0
):
    """Build a benchmark state dict with mean and relative-stdev summaries.

    Args:
        nvbench_compare: Object providing ``GPU_TIME_MEAN_TAG`` and
            ``GPU_TIME_STDEV_RELATIVE_TAG``.
        name: Value stored under ``"name"``.
        mean: Value stored in the mean summary.
        noise: Value stored in the relative-stdev summary.
        axis_value: When not ``None``, a single int64 axis ``"A"`` with this
            value is attached; otherwise ``"axis_values"`` is empty.
        device: Value stored under ``"device"``.

    Returns:
        The state dictionary.
    """

    def _float64_summary(tag, value):
        return {
            "tag": tag,
            "data": [{"name": "value", "type": "float64", "value": value}],
        }

    if axis_value is None:
        axis_values = []
    else:
        axis_values = [{"name": "A", "type": "int64", "value": axis_value}]

    summaries = [
        _float64_summary(nvbench_compare.GPU_TIME_MEAN_TAG, mean),
        _float64_summary(nvbench_compare.GPU_TIME_STDEV_RELATIVE_TAG, noise),
    ]
    return {
        "name": name,
        "device": device,
        "axis_values": axis_values,
        "summaries": summaries,
    }
|
|
|
|
|
|
def make_summary(nvbench_compare, tag, value):
    """Build a float64 summary dict.

    ``tag`` is the *name* of an attribute on ``nvbench_compare``; the
    attribute's value becomes the summary's ``"tag"``.
    """
    resolved_tag = getattr(nvbench_compare, tag)
    value_entry = {"name": "value", "type": "float64", "value": value}
    return {"tag": resolved_tag, "data": [value_entry]}
|
|
|
|
|
|
def make_binary_summary(nvbench_compare, tag, filename, size):
    """Build a summary dict that points at a binary file.

    ``tag`` names an attribute on ``nvbench_compare`` whose value becomes the
    summary's ``"tag"``.  ``size`` is stored as a string in an int64 entry.
    """
    filename_entry = {"name": "filename", "type": "string", "value": filename}
    size_entry = {"name": "size", "type": "int64", "value": str(size)}
    return {
        "tag": getattr(nvbench_compare, tag),
        "data": [filename_entry, size_entry],
    }
|
|
|
|
|
|
def capture_tabulate_calls(monkeypatch, nvbench_compare):
    """Replace ``nvbench_compare.tabulate.tabulate`` with a recording stand-in.

    Returns:
        A list that receives one ``{"rows": ..., "headers": ...}`` dict per
        call made to the patched function.  The stand-in returns ``""``.
    """
    recorded = []

    def fake_tabulate(rows, headers, *args, **kwargs):
        # Only rows and headers are kept; other arguments are ignored.
        recorded.append(dict(rows=rows, headers=headers))
        return ""

    monkeypatch.setattr(nvbench_compare.tabulate, "tabulate", fake_tabulate)
    return recorded
|
|
|
|
|
|
def find_tabulate_call(calls, expected_header_suffix):
    """Return the first recorded tabulate call whose headers end with a suffix.

    Args:
        calls: Iterable of ``{"rows": ..., "headers": ...}`` records, such as
            the list returned by ``capture_tabulate_calls``.
        expected_header_suffix: Sequence of header names that must equal the
            tail of a call's ``headers``.  An empty suffix matches any call.

    Returns:
        The first matching call record.

    Raises:
        StopIteration: If no recorded call matches.  The message lists the
            header rows that were inspected.
    """
    suffix_len = len(expected_header_suffix)
    inspected_headers = []
    for call in calls:
        headers = call["headers"]
        # ``headers[-0:]`` is the whole sequence rather than an empty tail, so
        # an empty suffix must be special-cased to match every call.
        if suffix_len == 0 or headers[-suffix_len:] == expected_header_suffix:
            return call
        inspected_headers.append(headers)
    # Same exception type as an exhausted ``next()``, but with enough context
    # to see which tables were actually rendered.
    raise StopIteration(
        f"no tabulate call has headers ending with {expected_header_suffix!r}; "
        f"recorded headers: {inspected_headers!r}"
    )
|
|
|
|
|
|
# Trailing table-header sequences, used as the ``expected_header_suffix``
# argument when looking up a recorded tabulate call.
INTERVAL_DISPLAY_HEADERS = [
    "Ref",
    "Cmp",
    "Change",
    "Status",
]
LEGACY_DISPLAY_HEADERS = [
    # Reference / compare measurements.
    "Ref Time", "Ref Noise",
    "Cmp Time", "Cmp Noise",
    # Difference columns and verdict.
    "Diff", "%Diff",
    "Status",
]  # fmt: skip
EXPLAIN_DISPLAY_HEADERS = [
    # Interval columns.
    "Ref [Lo | Ce | Hi]", "Cmp [Lo | Ce | Hi]",
    # Noise columns.
    "Ref Noise", "Cmp Noise",
    # Explanation and verdict.
    "Reason", "Change", "Status",
]  # fmt: skip
|
|
|
|
|
|
def make_gpu_timing_data(
    nvbench_compare,
    *,
    minimum=None,
    maximum=None,
    mean=1.0,
    stdev=None,
    stdev_relative=0.01,
    first_quartile=None,
    median=None,
    third_quartile=None,
    interquartile_range=None,
    interquartile_range_relative=None,
    sm_clock_rate_mean=None,
    sample_values=None,
    frequency_values=None,
):
    """Construct ``nvbench_compare.GpuTimingData`` from keyword summaries.

    The summary keywords are forwarded unchanged.  ``sample_values`` and
    ``frequency_values`` are each wrapped in an object exposing a float32
    ``values`` array and passed as ``sample_source`` / ``frequency_source``;
    ``None`` is passed through as ``None``.
    """

    def _in_memory_source(values):
        if values is None:
            return None
        return types.SimpleNamespace(values=np.asarray(values, dtype=np.float32))

    summary_fields = dict(
        minimum=minimum,
        maximum=maximum,
        mean=mean,
        stdev=stdev,
        stdev_relative=stdev_relative,
        first_quartile=first_quartile,
        median=median,
        third_quartile=third_quartile,
        interquartile_range=interquartile_range,
        interquartile_range_relative=interquartile_range_relative,
        sm_clock_rate_mean=sm_clock_rate_mean,
    )
    return nvbench_compare.GpuTimingData(
        **summary_fields,
        sample_source=_in_memory_source(sample_values),
        frequency_source=_in_memory_source(frequency_values),
    )
|
|
|
|
|
|
def make_benchmark(states, *, name="bench"):
    """Wrap state dicts in a benchmark dict.

    ``"devices"`` lists each state's ``"device"`` once, in first-seen order.
    ``"axes"`` holds a single int64 axis ``"A"`` when any state has a
    non-empty ``"axis_values"``, and is empty otherwise.
    """
    seen_devices = []
    for device in (state["device"] for state in states):
        if device in seen_devices:
            continue
        seen_devices.append(device)

    has_axis_values = any(state["axis_values"] for state in states)
    if has_axis_values:
        axes = [{"name": "A", "type": "int64", "flags": ""}]
    else:
        axes = []

    return {"name": name, "devices": seen_devices, "axes": axes, "states": states}
|
|
|
|
|
|
def make_comparison_run_data(nvbench_compare, ref_devices=None, cmp_devices=None):
    """Create ``nvbench_compare.ComparisonRunData`` with fresh stats.

    Either device list falls back to a single ``"Test GPU"`` entry with id 0
    when it is ``None``; both are passed on as tuples.
    """
    fallback_devices = [{"id": 0, "name": "Test GPU"}]

    def _as_device_tuple(devices):
        if devices is None:
            return tuple(fallback_devices)
        return tuple(devices)

    return nvbench_compare.ComparisonRunData(
        stats=nvbench_compare.ComparisonStats(),
        ref_devices=_as_device_tuple(ref_devices),
        cmp_devices=_as_device_tuple(cmp_devices),
    )
|
|
|
|
|
|
def make_filter_plan(nvbench_compare, filter_actions=None):
    """Build a filter plan via ``nvbench_compare.build_benchmark_filter_plan``.

    A falsy ``filter_actions`` (including ``None``) is replaced by a new
    empty list.
    """
    actions = filter_actions if filter_actions else []
    return nvbench_compare.build_benchmark_filter_plan(actions)
|
|
|
|
|
|
def test_compare_benches_accepts_matching_duplicate_state_counts(
    monkeypatch, nvbench_compare
):
    """Repeated state names are compared when both runs repeat them equally."""
    state_names = ("state1", "state1", "state2")
    run_data = make_comparison_run_data(nvbench_compare)

    ref_benches = [
        make_benchmark([make_state(nvbench_compare, name) for name in state_names])
    ]
    cmp_benches = [
        make_benchmark(
            [make_state(nvbench_compare, name, mean="1.005") for name in state_names]
        )
    ]

    compare_options = dict(
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )
    nvbench_compare.compare_benches(
        run_data, ref_benches, cmp_benches, **compare_options
    )

    expected_counts = {
        "config_count": 3,
        "pass_count": 0,
        "improvement_count": 0,
        "regression_count": 0,
        "undecided_count": 3,
        "unknown_count": 0,
    }
    observed_counts = {
        field: getattr(run_data.stats, field) for field in expected_counts
    }
    assert observed_counts == expected_counts
|
|
|
|
|
|
def test_compare_benches_rejects_swapped_duplicate_state_counts(
    monkeypatch, nvbench_compare
):
    """Runs with the same total but different per-name state counts fail."""
    # Five states on each side; the 3/2 split is swapped between the runs.
    ref_names = ["state1"] * 3 + ["state2"] * 2
    cmp_names = ["state1"] * 2 + ["state2"] * 3

    run_data = make_comparison_run_data(nvbench_compare)
    ref_benches = [
        make_benchmark([make_state(nvbench_compare, name) for name in ref_names])
    ]
    cmp_benches = [
        make_benchmark([make_state(nvbench_compare, name) for name in cmp_names])
    ]

    compare_options = dict(
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )
    with pytest.raises(ValueError, match="mismatched state occurrences"):
        nvbench_compare.compare_benches(
            run_data, ref_benches, cmp_benches, **compare_options
        )
|
|
|
|
|
|
def test_compare_benches_matches_duplicate_states_after_axis_filter(
    monkeypatch, nvbench_compare
):
    """An ``A=2`` axis filter leaves one comparable state per run."""
    mean_by_axis_value = {1: "1.0", 2: "2.0"}

    def states_in_order(axis_values):
        return [
            make_state(
                nvbench_compare,
                "state",
                mean=mean_by_axis_value[axis_value],
                axis_value=axis_value,
            )
            for axis_value in axis_values
        ]

    run_data = make_comparison_run_data(nvbench_compare)
    # The compare run lists the same two states in the opposite order.
    ref_benches = [make_benchmark(states_in_order((1, 2)))]
    cmp_benches = [make_benchmark(states_in_order((2, 1)))]

    axis_filter = make_filter_plan(nvbench_compare, [("axis", "A=2")])
    nvbench_compare.compare_benches(
        run_data,
        ref_benches,
        cmp_benches,
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=axis_filter,
        no_color=True,
    )

    expected_counts = {
        "config_count": 1,
        "pass_count": 0,
        "improvement_count": 0,
        "regression_count": 0,
        "undecided_count": 1,
        "unknown_count": 0,
    }
    observed_counts = {
        field: getattr(run_data.stats, field) for field in expected_counts
    }
    assert observed_counts == expected_counts
|
|
|
|
|
|
def test_compare_benches_counts_non_finite_centers_as_unknown(
    monkeypatch, nvbench_compare
):
    """Reference means of ``nan`` / ``inf`` are tallied as unknown."""
    # (state name, reference mean); every compare-side mean is "1.0".
    reference_means = (("finite", "1.0"), ("nan", "nan"), ("inf", "inf"))

    run_data = make_comparison_run_data(nvbench_compare)
    ref_benches = [
        make_benchmark(
            [
                make_state(nvbench_compare, name, mean=mean)
                for name, mean in reference_means
            ]
        )
    ]
    cmp_benches = [
        make_benchmark(
            [
                make_state(nvbench_compare, name, mean="1.0")
                for name, _ in reference_means
            ]
        )
    ]

    compare_options = dict(
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )
    nvbench_compare.compare_benches(
        run_data, ref_benches, cmp_benches, **compare_options
    )

    expected_counts = {
        "config_count": 3,
        "pass_count": 0,
        "improvement_count": 0,
        "regression_count": 0,
        "undecided_count": 1,
        "unknown_count": 2,
    }
    observed_counts = {
        field: getattr(run_data.stats, field) for field in expected_counts
    }
    assert observed_counts == expected_counts
|
|
|
|
|
|
def test_gpu_timing_data_loads_samples_and_frequencies_lazily(
    tmp_path, nvbench_compare
):
    """Binary data is read on first attribute access and not re-read."""
    sample_data = np.array([1.0, 2.0, 4.0], dtype="<f4")
    frequency_data = np.array([100.0, 200.0, 400.0], dtype="<f4")

    samples_dir = tmp_path / "result.json-bin"
    freqs_dir = tmp_path / "result.json-freqs-bin"
    samples_dir.mkdir()
    freqs_dir.mkdir()
    samples_file = samples_dir / "0.bin"
    freqs_file = freqs_dir / "0.bin"
    sample_data.tofile(samples_file)
    frequency_data.tofile(freqs_file)

    # The injected reader serves these buffers and logs each requested path.
    buffers = {
        str(samples_file): sample_data.tobytes(),
        str(freqs_file): frequency_data.tobytes(),
    }
    reader_calls = []

    def tracking_reader(filename):
        reader_calls.append(filename)
        return buffers[filename]

    summaries = [
        make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", "2.0"),
        make_binary_summary(
            nvbench_compare,
            "SAMPLE_TIMES_TAG",
            str(samples_file.relative_to(tmp_path)),
            3,
        ),
        make_binary_summary(
            nvbench_compare,
            "SAMPLE_FREQUENCIES_TAG",
            str(freqs_file.relative_to(tmp_path)),
            3,
        ),
    ]
    timing = nvbench_compare.extract_gpu_timing_data(
        summaries, str(tmp_path), float32_reader=tracking_reader
    )

    # Nothing is read while the timing object is being built.
    assert reader_calls == []

    # First access reads the sample file.
    assert timing.samples is not None
    assert list(timing.samples) == pytest.approx([1.0, 2.0, 4.0])
    assert reader_calls == [str(samples_file)]

    # A second access does not call the reader again.
    assert list(timing.samples) == pytest.approx([1.0, 2.0, 4.0])
    assert reader_calls == [str(samples_file)]

    # Frequencies are read separately, on their own first access.
    assert timing.frequencies is not None
    assert list(timing.frequencies) == pytest.approx([100.0, 200.0, 400.0])
    assert reader_calls == [str(samples_file), str(freqs_file)]
|
|
|
|
|
|
def test_compare_benches_collects_bulk_debug_rows(tmp_path, nvbench_compare):
    """``compare_benches`` appends one debug row describing the compared pair."""
    run_data = make_comparison_run_data(nvbench_compare)

    ref_samples_file = tmp_path / "ref-samples.bin"
    ref_freqs_file = tmp_path / "ref-freqs.bin"
    cmp_samples_file = tmp_path / "cmp-samples.bin"
    cmp_freqs_file = tmp_path / "cmp-freqs.bin"
    file_contents = (
        (ref_samples_file, [1.0, 1.0]),
        (ref_freqs_file, [100.0, 100.0]),
        (cmp_samples_file, [1.0, 1.0]),
        (cmp_freqs_file, [100.0, 100.0]),
    )
    for path, values in file_contents:
        np.array(values, dtype="<f4").tofile(path)

    def state_with_bulk_files(mean, samples_file, freqs_file):
        # A plain state plus summaries pointing at its two binary files.
        state = make_state(nvbench_compare, "state", mean=mean)
        state["summaries"].extend(
            [
                make_binary_summary(
                    nvbench_compare, "SAMPLE_TIMES_TAG", str(samples_file), 2
                ),
                make_binary_summary(
                    nvbench_compare, "SAMPLE_FREQUENCIES_TAG", str(freqs_file), 2
                ),
            ]
        )
        return state

    ref_state = state_with_bulk_files("1.0", ref_samples_file, ref_freqs_file)
    cmp_state = state_with_bulk_files("1.01", cmp_samples_file, cmp_freqs_file)
    bulk_debug_rows = []

    nvbench_compare.compare_benches(
        run_data,
        [make_benchmark([ref_state])],
        [make_benchmark([cmp_state])],
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
        ref_json_dir=str(tmp_path),
        cmp_json_dir=str(tmp_path),
        ref_json_path="ref.json",
        cmp_json_path="cmp.json",
        bulk_debug_rows=bulk_debug_rows,
    )

    assert len(bulk_debug_rows) == 1
    row = bulk_debug_rows[0]
    expected_fields = {
        "row_index": 0,
        "table_row_index": 0,
        "benchmark": "bench",
        "reference_json": "ref.json",
        "compare_json": "cmp.json",
        "status": nvbench_compare.ComparisonStatus.SAME.value,
        "occurrence": 0,
        "occurrence_count": 1,
        "reference_sample_filename": str(ref_samples_file),
        "reference_sample_count": 2,
        "reference_frequency_filename": str(ref_freqs_file),
        "compare_sample_filename": str(cmp_samples_file),
        "compare_frequency_filename": str(cmp_freqs_file),
    }
    for field, expected in expected_fields.items():
        assert row[field] == expected, field
|
|
|
|
|
|
def test_format_bulk_debug_python_loads_arrays(tmp_path, nvbench_compare):
    """The generated script is executable and loads the referenced arrays."""
    samples_file = tmp_path / "samples.bin"
    np.array([1.0, 2.0], dtype="<f4").tofile(samples_file)

    # Start with every filename/count field unset, then fill in the reference
    # samples; assigning to an existing key keeps the original key order.
    row = {
        f"{side}_{kind}_{field}": None
        for side in ("reference", "compare")
        for kind in ("sample", "frequency")
        for field in ("filename", "count")
    }
    row["reference_sample_filename"] = str(samples_file)
    row["reference_sample_count"] = 2

    script = nvbench_compare.format_bulk_debug_python([row])
    assert script.startswith("# NVB-BULK-BEGIN\n")
    assert script.endswith("# NVB-BULK-END\n")

    # The script is produced by the code under test, so running it is safe.
    namespace = {}
    exec(script, namespace)

    first_row = namespace["bulk_rows"][0]
    arrays = namespace["load_bulk_data"](first_row)
    assert list(arrays["reference_samples"]) == pytest.approx([1.0, 2.0])
    assert arrays["reference_frequencies"] is None
|
|
|
|
|
|
def test_format_bulk_debug_python_handles_nonfinite_values(nvbench_compare):
    """NaN and infinities survive a round trip through the generated script."""
    nonfinite_row = {
        "reference_time": math.nan,
        "compare_time": math.inf,
        "fractional_difference": -math.inf,
    }
    script = nvbench_compare.format_bulk_debug_python([nonfinite_row])

    # The script defines ``nan`` / ``inf`` names and uses them in the row.
    expected_fragments = (
        'nan = float("nan")',
        'inf = float("inf")',
        "'reference_time': nan",
        "'compare_time': inf",
        "'fractional_difference': -inf",
    )
    for fragment in expected_fragments:
        assert fragment in script

    namespace = {}
    exec(script, namespace)

    loaded_row = namespace["bulk_rows"][0]
    assert math.isnan(loaded_row["reference_time"])
    assert loaded_row["compare_time"] == math.inf
    assert loaded_row["fractional_difference"] == -math.inf
|
|
|
|
|
|
def test_gpu_timing_data_parses_quartiles_and_sm_clock_rate_mean(nvbench_compare):
    """Quartile and SM clock summaries land on the matching attributes."""
    tagged_values = (
        ("GPU_TIME_MEAN_TAG", "2.0"),
        ("GPU_TIME_Q1_TAG", "1.5"),
        ("GPU_TIME_MEDIAN_TAG", "2.0"),
        ("GPU_TIME_Q3_TAG", "2.5"),
        ("GPU_SM_CLOCK_RATE_MEAN_TAG", "1.5e9"),
    )
    summaries = [
        make_summary(nvbench_compare, tag, value) for tag, value in tagged_values
    ]

    timing = nvbench_compare.extract_gpu_timing_data(summaries)

    assert timing.first_quartile == pytest.approx(1.5)
    assert timing.median == pytest.approx(2.0)
    assert timing.third_quartile == pytest.approx(2.5)
    assert timing.sm_clock_rate_mean == pytest.approx(1.5e9)
    # No frequency summary was supplied.
    assert timing.frequencies is None
|
|
|
|
|
|
def test_gpu_timing_data_accepts_legacy_ir_tags(nvbench_compare):
    """Legacy IR tags populate the interquartile-range attributes."""
    legacy_summaries = [
        make_summary(nvbench_compare, tag, value)
        for tag, value in (
            ("LEGACY_GPU_TIME_IR_TAG", "0.5"),
            ("LEGACY_GPU_TIME_IR_RELATIVE_TAG", "0.25"),
        )
    ]

    timing = nvbench_compare.extract_gpu_timing_data(legacy_summaries)

    assert timing.interquartile_range == pytest.approx(0.5)
    assert timing.interquartile_range_relative == pytest.approx(0.25)
|
|
|
|
|
|
def test_gpu_timing_data_treats_mismatched_sample_and_frequency_counts_as_unavailable(
    tmp_path, nvbench_compare
):
    """Differing sample/frequency counts warn and expose neither array."""
    samples_file = tmp_path / "samples.bin"
    freqs_file = tmp_path / "freqs.bin"
    # Two samples but three frequencies.
    for path, values in (
        (samples_file, [1.0, 2.0]),
        (freqs_file, [100.0, 200.0, 300.0]),
    ):
        np.array(values, dtype="<f4").tofile(path)

    mismatched_summaries = [
        make_binary_summary(nvbench_compare, "SAMPLE_TIMES_TAG", str(samples_file), 2),
        make_binary_summary(
            nvbench_compare, "SAMPLE_FREQUENCIES_TAG", str(freqs_file), 3
        ),
    ]

    with pytest.warns(RuntimeWarning, match="sample count .* frequency count"):
        timing = nvbench_compare.extract_gpu_timing_data(
            mismatched_summaries, str(tmp_path)
        )

    assert timing.samples is None
    assert timing.frequencies is None
|
|
|
|
|
|
def test_gpu_timing_data_warns_when_lazy_sample_read_fails(tmp_path, nvbench_compare):
    """Accessing samples backed by a missing file warns and yields ``None``."""
    # This path is never created on disk.
    missing_file = tmp_path / "missing.bin"
    sample_summary = make_binary_summary(
        nvbench_compare, "SAMPLE_TIMES_TAG", str(missing_file), 3
    )

    timing = nvbench_compare.extract_gpu_timing_data([sample_summary], str(tmp_path))

    with pytest.warns(RuntimeWarning, match="failed to read"):
        assert timing.samples is None

    # Still ``None`` on a later access.
    assert timing.samples is None
|
|
|
|
|
|
def test_compare_gpu_timings_classifies_common_cases(tmp_path, nvbench_compare):
    """Check status and reason code for a series of timing pairs.

    Each scenario builds a reference and a compare ``GpuTimingData`` and
    asserts the status and ``reason.code`` of the comparison; some also check
    the reported differences.
    """
    status = nvbench_compare.ComparisonStatus

    def timing(**fields):
        return make_gpu_timing_data(nvbench_compare, **fields)

    def classify(ref, cmp):
        return nvbench_compare.compare_gpu_timings(ref, cmp)

    def expect(result, expected_status, expected_reason):
        assert result is not None
        assert result.status == expected_status
        assert result.reason.code == expected_reason

    # Keyword sets shared by several scenarios below.
    baseline = dict(
        minimum=1.0,
        first_quartile=1.1,
        median=1.2,
        third_quartile=1.3,
        mean=1.2,
        stdev_relative=0.05,
    )
    faster = dict(
        minimum=0.8,
        first_quartile=0.85,
        median=0.9,
        third_quartile=0.95,
        mean=0.9,
        stdev_relative=0.05,
    )
    # Deliberately has no ``stdev_relative`` so the helper default applies.
    near_baseline = dict(
        minimum=1.02,
        first_quartile=1.1,
        median=1.204,
        third_quartile=1.28,
        mean=1.204,
    )
    baseline_samples = [1.0, 1.1, 1.2, 1.3]
    faster_samples = [0.8, 0.85, 0.9, 0.95]

    # --- Mean-only timings -------------------------------------------------
    ref_timing = timing(mean=1.0, stdev_relative=0.05)

    undecided = classify(ref_timing, timing(mean=1.03, stdev_relative=0.05))
    expect(undecided, status.UNDECIDED, "noise_too_high")
    assert undecided.ref_time == pytest.approx(1.0)
    assert undecided.cmp_time == pytest.approx(1.03)
    assert undecided.diff == pytest.approx(0.03)
    assert undecided.frac_diff == pytest.approx(0.03)
    assert undecided.max_noise == pytest.approx(0.05)

    # --- Quartile summaries with a mean SM clock rate ------------------------
    ref_interval_timing = timing(
        **baseline,
        interquartile_range_relative=0.01,
        sm_clock_rate_mean=100.0,
    )

    fast = classify(ref_interval_timing, timing(**faster, sm_clock_rate_mean=100.0))
    expect(fast, status.FAST, "clear_gap_confirmed_by_summary_cycles")
    assert fast.diff_interval == pytest.approx((-0.5, -0.05))
    assert fast.frac_diff_interval == pytest.approx((-0.3846153846, -0.05))

    slow = classify(
        ref_interval_timing,
        timing(
            minimum=1.4,
            first_quartile=1.45,
            median=1.5,
            third_quartile=1.55,
            mean=1.5,
            stdev_relative=0.05,
            sm_clock_rate_mean=100.0,
        ),
    )
    expect(slow, status.SLOW, "clear_gap_confirmed_by_summary_cycles")
    assert slow.diff_interval == pytest.approx((0.1, 0.55))
    assert slow.frac_diff_interval == pytest.approx((0.0769230769, 0.55))

    same = classify(
        ref_interval_timing,
        timing(
            **near_baseline,
            interquartile_range_relative=0.01,
            sm_clock_rate_mean=100.0,
        ),
    )
    expect(same, status.SAME, "same_confirmed_by_cycles")
    assert same.diff_interval == pytest.approx((-0.28, 0.28))
    assert same.frac_diff_interval == pytest.approx((-0.2153846154, 0.28))

    weak_overlap = classify(
        timing(
            minimum=1.0,
            first_quartile=1.19,
            median=1.195,
            third_quartile=1.2,
            mean=1.195,
            interquartile_range_relative=0.01,
        ),
        timing(
            minimum=1.2,
            first_quartile=1.2,
            median=1.2,
            third_quartile=1.4,
            mean=1.2,
            interquartile_range_relative=0.01,
        ),
    )
    expect(weak_overlap, status.UNDECIDED, "weak_interval_overlap")

    center_too_far = classify(
        ref_interval_timing,
        timing(
            minimum=1.0,
            first_quartile=1.1,
            median=1.21,
            third_quartile=1.3,
            mean=1.21,
            interquartile_range_relative=0.01,
        ),
    )
    expect(center_too_far, status.UNDECIDED, "centers_not_close")

    noisy_same = classify(
        ref_interval_timing,
        timing(**near_baseline, interquartile_range_relative=0.03),
    )
    expect(noisy_same, status.UNDECIDED, "noise_too_high")

    clock_disagreement = classify(
        ref_interval_timing,
        timing(
            **near_baseline,
            interquartile_range_relative=0.01,
            sm_clock_rate_mean=200.0,
        ),
    )
    expect(clock_disagreement, status.UNDECIDED, "cycle_same_not_confirmed")

    missing_clock = classify(ref_interval_timing, timing(**faster))
    expect(missing_clock, status.UNDECIDED, "missing_clock_rate")

    frequency_shift = classify(
        ref_interval_timing, timing(**faster, sm_clock_rate_mean=200.0)
    )
    expect(frequency_shift, status.UNDECIDED, "summary_cycle_gap_not_confirmed")

    # --- Bulk samples and per-sample frequencies -----------------------------
    bulk_cycle_fast = classify(
        timing(
            **baseline,
            sample_values=baseline_samples,
            frequency_values=[100.0] * 4,
        ),
        timing(
            **faster,
            sample_values=faster_samples,
            frequency_values=[100.0] * 4,
        ),
    )
    expect(bulk_cycle_fast, status.FAST, "clear_gap_confirmed_by_bulk_cycles")

    bulk_cycle_shift = classify(
        timing(
            **baseline,
            sample_values=baseline_samples,
            frequency_values=[100.0] * 4,
        ),
        timing(
            **faster,
            sample_values=faster_samples,
            frequency_values=[200.0] * 4,
        ),
    )
    expect(bulk_cycle_shift, status.UNDECIDED, "bulk_cycle_gap_not_confirmed")

    # Bulk sources that point at a file which does not exist under tmp_path.
    missing_source = nvbench_compare.Float32BinarySource(
        count=4,
        filename="missing.bin",
        json_dir=str(tmp_path),
        description="test sample",
    )
    missing_bulk_files = classify(
        replace(
            ref_interval_timing,
            sample_source=missing_source,
            frequency_source=missing_source,
        ),
        timing(**faster, sm_clock_rate_mean=100.0),
    )
    expect(missing_bulk_files, status.FAST, "clear_gap_confirmed_by_summary_cycles")

    # The reference samples start with 0.0.
    unusable_bulk_cycles = classify(
        timing(
            **baseline,
            sm_clock_rate_mean=100.0,
            sample_values=[0.0, 1.1, 1.2, 1.3],
            frequency_values=[100.0] * 4,
        ),
        timing(
            **faster,
            sm_clock_rate_mean=100.0,
            sample_values=faster_samples,
            frequency_values=[100.0] * 4,
        ),
    )
    expect(unusable_bulk_cycles, status.UNDECIDED, "bulk_cycle_data_unusable")

    # --- Compare side without a relative stdev -------------------------------
    missing_noise = classify(ref_timing, timing(mean=1.2, stdev_relative=None))
    expect(missing_noise, status.UNDECIDED, "noise_unavailable")
    assert missing_noise.max_noise is None
|
|
|
|
|
|
def test_compare_gpu_timings_uses_bulk_data_to_confirm_same(nvbench_compare):
    """Equal means with bulk samples and frequencies classify as ``bulk_same``."""
    shared_fields = dict(
        mean=1.0,
        stdev_relative=0.05,
        frequency_values=[100.0] * 10,
    )
    # Both runs mix 1.0 and 1.004 samples, in opposite proportions.
    ref_timing = make_gpu_timing_data(
        nvbench_compare,
        sample_values=[1.0] * 8 + [1.004] * 2,
        **shared_fields,
    )
    cmp_timing = make_gpu_timing_data(
        nvbench_compare,
        sample_values=[1.0] * 2 + [1.004] * 8,
        **shared_fields,
    )

    comparison = nvbench_compare.compare_gpu_timings(ref_timing, cmp_timing)

    assert comparison is not None
    assert comparison.status == nvbench_compare.ComparisonStatus.SAME
    assert comparison.reason.code == "bulk_same"
|
|
|
|
|
|
def test_format_diff_and_percent_ranges(nvbench_compare):
    """Cover duration, duration-range and percentage-bound formatting."""
    format_duration = nvbench_compare.format_duration

    # By default, missing, non-finite, negative and zero durations are "n/a".
    for unusable in (None, math.nan, math.inf, -1.0, 0.0):
        assert format_duration(unusable) == "n/a"

    # The opt-in flags make negative and zero values printable.
    assert format_duration(-1.0, allow_negative=True) == "-1000000.000 us"
    assert format_duration(0.0, allow_zero=True) == "0.000 us"

    assert nvbench_compare.format_duration_range((-12e-6, 8e-6)) == "[-12.00, 8.00] us"

    status = nvbench_compare.ComparisonStatus
    bounds_cases = (
        ((-0.2153846154, 0.28), status.UNDECIDED, "in [-21.5%, +28.0%]"),
        ((-0.3076923077, -0.05), status.FAST, "<= -5.0%"),
        ((0.0769230769, 0.55), status.SLOW, ">= +7.7%"),
    )
    for bounds, row_status, expected_text in bounds_cases:
        formatted = nvbench_compare.format_percentage_bounds(bounds, row_status)
        assert formatted == expected_text
|
|
|
|
|
|
def test_format_change_only_reports_fast_and_slow_rows(nvbench_compare):
    """``format_change`` returns text for FAST/SLOW and "" for other rows."""
    status = nvbench_compare.ComparisonStatus

    def row(row_status, frac_diff_interval):
        # Stand-in exposing only the two attributes set here.
        return types.SimpleNamespace(
            status=row_status, frac_diff_interval=frac_diff_interval
        )

    cases = (
        (row(status.FAST, (-0.3, -0.05)), "<= -5.0%"),
        (row(status.SLOW, (0.07, 0.55)), ">= +7.0%"),
        (row(status.SAME, (-0.01, 0.01)), ""),
        (row(status.UNDECIDED, (-0.01, 0.01)), ""),
    )
    for comparison, expected_text in cases:
        assert nvbench_compare.format_change(comparison) == expected_text
|
|
|
|
|
|
def test_ambiguous_status_uses_shrug_marker(nvbench_compare):
    """With colour disabled, UNDECIDED renders as U+1F937 followed by AMBG."""
    undecided = nvbench_compare.ComparisonStatus.UNDECIDED
    rendered = nvbench_compare.colorize_comparison_status(undecided, no_color=True)
    assert rendered == "\U0001f937 AMBG"
|
|
|
|
|
|
def test_format_timing_with_interval(nvbench_compare):
    """Check ``format_timing_with_interval`` output for two timing intervals.

    Each case builds a ``TimingInterval`` and passes the interval center as
    the timing value, then compares the rendered text with the expected one.
    """
    # (lower, center, upper, expected text)
    cases = [
        (0.002237, 0.0023, 0.002389, "2.300 ms [-63, +89] us"),
        (19.380e-6, 19.944e-6, 20.508e-6, "19.944 [-0.564, +0.564] us"),
    ]
    for lower, center, upper, expected in cases:
        interval = nvbench_compare.TimingInterval(
            lower=lower, upper=upper, center=center
        )
        rendered = nvbench_compare.format_timing_with_interval(center, interval)
        assert rendered == expected
|
|
|
|
|
|
def test_format_timing_with_explicit_interval(nvbench_compare):
    """Check ``format_timing_with_explicit_interval`` output for four intervals.

    Each case builds a ``TimingInterval`` and passes the interval center as
    the timing value, then compares the rendered text with the expected one.
    """
    # (lower, center, upper, expected text)
    # NOTE(review): the last expected string pads only its first value; runs
    # of spaces may have been collapsed in the copy this was taken from --
    # confirm the exact padding against the formatter before relying on it.
    cases = [
        (0.001434, 0.001446, 0.001458, "1.4[34 | 46 | 58] ms"),
        (18.400e-6, 18.736e-6, 19.464e-6, "[18.400 | 18.736 | 19.464] us"),
        (19.380e-6, 19.944e-6, 20.508e-6, "[19.380 | 19.944 | 20.508] us"),
        (99.094e-6, 99.988e-6, 100.882e-6, "[ 99.094 | 99.988 | 100.882] us"),
    ]
    for lower, center, upper, expected in cases:
        interval = nvbench_compare.TimingInterval(
            lower=lower, upper=upper, center=center
        )
        rendered = nvbench_compare.format_timing_with_explicit_interval(
            center, interval
        )
        assert rendered == expected
|
|
|
|
|
|
def test_align_explain_interval_columns_pads_values_across_rows(nvbench_compare):
    """Check the cells written by ``align_explain_interval_columns``.

    Two comparison stand-ins with reference and compare intervals are aligned
    into a 2x2 table of initially empty strings (``axis_count=0``), and every
    cell is compared with its expected text.
    """

    def interval(lower, center, upper):
        return nvbench_compare.TimingInterval(lower=lower, center=center, upper=upper)

    first = types.SimpleNamespace(
        ref_time=19.944e-6,
        ref_interval=interval(19.380e-6, 19.944e-6, 20.508e-6),
        cmp_time=97.712e-6,
        cmp_interval=interval(96.849e-6, 97.712e-6, 98.574e-6),
    )
    second = types.SimpleNamespace(
        ref_time=103.466e-6,
        ref_interval=interval(102.739e-6, 103.466e-6, 104.193e-6),
        cmp_time=101.868e-6,
        cmp_interval=interval(100.916e-6, 101.868e-6, 102.819e-6),
    )
    table = [["", ""], ["", ""]]

    nvbench_compare.align_explain_interval_columns(
        table, [first, second], axis_count=0
    )

    # NOTE(review): only the first value of the shorter rows is padded in
    # these strings; runs of spaces may have been collapsed in the copy this
    # was taken from -- confirm the exact padding against the formatter.
    expected_cells = [
        ((0, 0), "[ 19.380 | 19.944 | 20.508] us"),
        ((1, 0), "[102.739 | 103.466 | 104.193] us"),
        ((0, 1), "[ 96.849 | 97.712 | 98.574] us"),
        ((1, 1), "[100.916 | 101.868 | 102.819] us"),
    ]
    for (row, column), text in expected_cells:
        assert table[row][column] == text
|
|
|
|
|
|
def test_align_timing_interval_columns_reserves_missing_interval_slot(nvbench_compare):
    """Check that a row without a compare interval keeps a blank slot.

    The second comparison has ``cmp_interval=None``; its compare cell is
    expected to contain as many spaces as the first row's rendered interval.
    """

    def interval(lower, center, upper):
        return nvbench_compare.TimingInterval(lower=lower, center=center, upper=upper)

    with_cmp_interval = types.SimpleNamespace(
        ref_time=19.944e-6,
        ref_interval=interval(19.380e-6, 19.944e-6, 20.508e-6),
        cmp_time=18.736e-6,
        cmp_interval=interval(18.400e-6, 18.736e-6, 19.464e-6),
    )
    without_cmp_interval = types.SimpleNamespace(
        ref_time=20.390e-6,
        ref_interval=interval(19.659e-6, 20.390e-6, 21.121e-6),
        cmp_time=20.480e-6,
        cmp_interval=None,
    )
    table = [["", ""], ["", ""]]

    nvbench_compare.align_timing_interval_columns(
        table, [with_cmp_interval, without_cmp_interval], axis_count=0
    )

    # The blank slot is exactly as wide as the interval text of the first row.
    blank_slot = " " * len("[-0.336, +0.728]")
    assert table[0][1] == "18.736 [-0.336, +0.728] us"
    assert table[1][1] == f"20.480 {blank_slot} us"
|
|
|
|
|
|
def test_compare_gpu_timings_keeps_bulk_mismatch_undecided(nvbench_compare):
    """Check that disjoint bulk samples leave the comparison UNDECIDED.

    The reference and compare timings carry sample values that do not
    coincide, and the expected reason message reports 0.0% coverage on both
    sides for the sample and the support checks.

    Both thresholds quoted in the message are derived from the default
    ``ComparisonThresholds`` rather than hard-coded, so the test keeps
    checking the message format if the default values are retuned.
    """
    ref_timing = make_gpu_timing_data(
        nvbench_compare,
        minimum=1.0,
        first_quartile=1.1,
        median=1.2,
        third_quartile=1.3,
        mean=1.2,
        interquartile_range_relative=0.01,
        sample_values=[1.0, 1.0, 1.004, 1.004],
        frequency_values=[100.0] * 4,
    )
    cmp_timing = make_gpu_timing_data(
        nvbench_compare,
        minimum=1.02,
        first_quartile=1.1,
        median=1.204,
        third_quartile=1.28,
        mean=1.204,
        interquartile_range_relative=0.01,
        sample_values=[1.02, 1.02, 1.024, 1.024],
        frequency_values=[100.0] * 4,
    )

    comparison = nvbench_compare.compare_gpu_timings(ref_timing, cmp_timing)

    assert comparison is not None
    assert comparison.status == nvbench_compare.ComparisonStatus.UNDECIDED
    assert comparison.reason.code == "bulk_time_support_mismatch"

    defaults = nvbench_compare.get_default_thresholds()
    sample_threshold = defaults.bulk_same_sample_coverage * 100.0
    support_threshold = defaults.bulk_same_support_coverage * 100.0
    message = comparison.reason.message
    # These two substring checks also cover the bare "<threshold>%" fragments,
    # so separate assertions for those fragments would be redundant.
    assert f"sample: min(ref=0.0%, cmp=0.0%) >= {sample_threshold:0.1f}%" in message
    assert f"support: min(ref=0.0%, cmp=0.0%) >= {support_threshold:0.1f}%" in message
|
|
|
|
|
|
def test_compare_gpu_timings_requires_bulk_cycle_coverage(nvbench_compare):
    """Check that differing frequency values yield a cycle-support mismatch.

    Both timings share the same mean, relative stdev, and sample values; only
    the frequency values differ (100.0 versus 200.0).
    """

    def timing_at(frequency):
        return make_gpu_timing_data(
            nvbench_compare,
            mean=1.0,
            stdev_relative=0.01,
            sample_values=[1.0, 1.0, 1.004, 1.004],
            frequency_values=[frequency] * 4,
        )

    result = nvbench_compare.compare_gpu_timings(timing_at(100.0), timing_at(200.0))

    assert result is not None
    assert result.status == nvbench_compare.ComparisonStatus.UNDECIDED
    assert result.reason.code == "bulk_cycle_support_mismatch"
|
|
|
|
|
|
def test_bulk_same_reports_sample_weight_coverage_mismatch(nvbench_compare):
    """Check the mismatch message when most reference samples are uncovered.

    The reference adds 100 copies of 1.02 to the four values it shares with
    the compare side; the decision must be UNDECIDED and its message must
    contain the expected sample and support coverage lines.
    """
    ref_values = [1.0, 1.001, 1.002, 1.003]
    ref_values += [1.02] * 100
    cmp_values = [1.0, 1.001, 1.002, 1.003]

    outcome = nvbench_compare.compare_values_for_bulk_same(
        ref_values,
        cmp_values,
        label="time",
        thresholds=nvbench_compare.get_default_thresholds(),
    )

    assert outcome.status == nvbench_compare.ComparisonStatus.UNDECIDED
    assert outcome.reason.code == "bulk_time_support_mismatch"
    expected_fragments = (
        "sample: min(ref=3.8%, cmp=100.0%) >= 97.0%",
        "support: min(ref=80.0%, cmp=100.0%) >= 80.0%",
    )
    for fragment in expected_fragments:
        assert fragment in outcome.reason.message
|
|
|
|
|
|
def test_bulk_same_filters_rare_values_from_support_coverage(nvbench_compare):
    """Check that ten rare reference values do not prevent a SAME decision.

    The reference holds 1000 copies of 1.0 plus ten distinct larger values;
    the compare side holds a single 1.0. Default thresholds are used.
    """
    rare_values = [1.02 + 0.01 * i for i in range(10)]
    ref_values = [1.0] * 1000
    ref_values.extend(rare_values)
    cmp_values = [1.0]

    outcome = nvbench_compare.compare_values_for_bulk_same(
        ref_values,
        cmp_values,
        label="time",
        thresholds=nvbench_compare.get_default_thresholds(),
    )

    assert outcome.status == nvbench_compare.ComparisonStatus.SAME
    assert outcome.reason.code == "bulk_time_same"
|
|
|
|
|
|
def test_bulk_same_reports_unique_support_coverage_mismatch(nvbench_compare):
    """Check the mismatch message with a tighter removed-sample limit.

    Uses 1000 copies of 1.0 plus ten distinct larger reference values against
    a single compare value, with
    ``bulk_support_max_removed_sample_fraction`` overridden to 0.005.
    """
    rare_values = [1.02 + 0.01 * i for i in range(10)]
    ref_values = [1.0] * 1000
    ref_values.extend(rare_values)
    cmp_values = [1.0]

    outcome = nvbench_compare.compare_values_for_bulk_same(
        ref_values,
        cmp_values,
        label="time",
        thresholds=replace(
            nvbench_compare.get_default_thresholds(),
            bulk_support_max_removed_sample_fraction=0.005,
        ),
    )

    assert outcome.status == nvbench_compare.ComparisonStatus.UNDECIDED
    assert outcome.reason.code == "bulk_time_support_mismatch"
    expected_fragments = (
        "sample: min(ref=99.0%, cmp=100.0%) >= 97.0%",
        "support: min(ref=9.1%, cmp=100.0%) >= 80.0%",
    )
    for fragment in expected_fragments:
        assert fragment in outcome.reason.message
|
|
|
|
|
|
def test_bulk_same_retains_full_support_when_all_values_are_unique(nvbench_compare):
    """Check the reference coverages when every reference value is unique.

    Both rare-support thresholds are raised to 1.0; the expected support
    filter info is inactive with reason ``all_values_unique`` and no removed
    samples.
    """
    permissive_filter = replace(
        nvbench_compare.get_default_thresholds(),
        bulk_support_rare_sample_fraction=1.0,
        bulk_support_max_removed_sample_fraction=1.0,
    )

    result = nvbench_compare.compute_nearest_neighbor_coverages(
        [1.0, 1.02],
        [1.0],
        thresholds=permissive_filter,
    )

    assert result is not None
    for key in ("ref_sample", "ref_support"):
        assert result[key] == 0.5
    assert result["ref_support_filter"] == nvbench_compare.SupportFilterInfo(
        activated=False,
        reason="all_values_unique",
        removed_sample_fraction=0.0,
    )
|
|
|
|
|
|
def test_comparison_stats_records_undecided_status(nvbench_compare):
    """Check the counters after recording a single UNDECIDED status."""
    tally = nvbench_compare.ComparisonStats()

    tally.record(nvbench_compare.ComparisonStatus.UNDECIDED)

    expected_counts = {
        "config_count": 1,
        "pass_count": 0,
        "improvement_count": 0,
        "regression_count": 0,
        "undecided_count": 1,
        "unknown_count": 0,
    }
    for counter, expected in expected_counts.items():
        assert getattr(tally, counter) == expected
|
|
|
|
|
|
def test_comparison_stats_records_undecided_reason(nvbench_compare):
    """Check the reason summary after two UNDECIDED records with one code.

    Two reasons sharing the code ``test_reason`` are recorded, the second
    with the higher severity; the summary must count both and carry the
    message of the more severe one.
    """
    tally = nvbench_compare.ComparisonStats()
    reasons = [
        nvbench_compare.DecisionReason(
            code="test_reason", message=message, severity=severity
        )
        for message, severity in (
            ("less severe reason", 1.0),
            ("more severe reason", 2.0),
        )
    ]

    for reason in reasons:
        tally.record(nvbench_compare.ComparisonStatus.UNDECIDED, reason)

    summary = tally.undecided_reasons["test_reason"]
    assert summary.count == 2
    assert summary.message == "more severe reason"
|
|
|
|
|
|
def test_reason_legend_omits_trivial_aliases(nvbench_compare):
    """Check which legend entries ``format_reason_legend_entries`` returns.

    Of the two aliases supplied, only ``bt-sup-miss`` is expected in the
    output; ``bulk-same`` is expected to be left out.
    """
    canonical_by_alias = {
        "bulk-same": "bulk_same",
        "bt-sup-miss": "bulk_time_support_mismatch",
    }
    legend = {
        alias: nvbench_compare.DecisionReasonSummary(canonical_code=code)
        for alias, code in canonical_by_alias.items()
    }

    entries = nvbench_compare.format_reason_legend_entries(legend)

    assert entries == ["bt-sup-miss = bulk_time_support_mismatch"]
|
|
|
|
|
|
@pytest.mark.parametrize(
    "ref_time, cmp_time, reason_code",
    [
        (None, 1.0, "timing_center_missing"),
        (1.0, None, "timing_center_missing"),
        (math.nan, 1.0, "timing_center_nonfinite"),
        (math.inf, 1.0, "timing_center_nonfinite"),
        (0.0, 1.0, "timing_center_nonpositive"),
        (-1.0, 1.0, "timing_center_nonpositive"),
    ],
)
def test_compare_gpu_timings_reports_unusable_centers_as_unknown(
    nvbench_compare, ref_time, cmp_time, reason_code
):
    """Check that missing, non-finite, or non-positive means give UNKNOWN.

    The comparison must still be returned, carry the parametrized reason
    code, and leave both ``diff`` and ``frac_diff`` unset.
    """
    ref_timing = make_gpu_timing_data(nvbench_compare, mean=ref_time)
    cmp_timing = make_gpu_timing_data(nvbench_compare, mean=cmp_time)

    result = nvbench_compare.compare_gpu_timings(ref_timing, cmp_timing)

    assert result is not None
    assert result.status == nvbench_compare.ComparisonStatus.UNKNOWN
    assert result.reason.code == reason_code
    assert result.diff is None
    assert result.frac_diff is None
|
|
|
|
|
|
def test_compare_benches_reports_regression_when_robust_intervals_and_clock_confirm(
    monkeypatch, nvbench_compare
):
    """Check that a single state pair is counted as one regression.

    Both states share the same mean and noise, but the compare state's
    minimum/quartile/median summaries are all higher than the reference's,
    and both report the same SM clock rate summary.
    """

    def build_state(tag_values):
        state = make_state(nvbench_compare, "state", mean="1.0", noise="0.01")
        state["summaries"].extend(
            [make_summary(nvbench_compare, tag, value) for tag, value in tag_values]
        )
        return state

    run_data = make_comparison_run_data(nvbench_compare)
    ref_state = build_state(
        [
            ("GPU_TIME_MIN_TAG", "0.9"),
            ("GPU_TIME_Q1_TAG", "0.95"),
            ("GPU_TIME_MEDIAN_TAG", "1.0"),
            ("GPU_TIME_Q3_TAG", "1.05"),
            ("GPU_TIME_IQR_RELATIVE_TAG", "0.01"),
            ("GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"),
        ]
    )
    cmp_state = build_state(
        [
            ("GPU_TIME_MIN_TAG", "1.15"),
            ("GPU_TIME_Q1_TAG", "1.18"),
            ("GPU_TIME_MEDIAN_TAG", "1.2"),
            ("GPU_TIME_Q3_TAG", "1.25"),
            ("GPU_TIME_IQR_RELATIVE_TAG", "0.01"),
            ("GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"),
        ]
    )

    nvbench_compare.compare_benches(
        run_data,
        [make_benchmark([ref_state])],
        [make_benchmark([cmp_state])],
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )

    expected_counts = {
        "config_count": 1,
        "pass_count": 0,
        "improvement_count": 0,
        "regression_count": 1,
        "undecided_count": 0,
        "unknown_count": 0,
    }
    for counter, expected in expected_counts.items():
        assert getattr(run_data.stats, counter) == expected
|
|
|
|
|
|
def test_compare_benches_accepts_custom_comparison_thresholds(
    monkeypatch, nvbench_compare
):
    """Check that ``comparison_thresholds`` is honored by ``compare_benches``.

    With ``same_center_relative`` overridden to 0.02, the single state pair
    must be counted as a pass and not as undecided.
    """

    def build_state(mean, tag_values):
        state = make_state(nvbench_compare, "state", mean=mean, noise="0.01")
        state["summaries"].extend(
            [make_summary(nvbench_compare, tag, value) for tag, value in tag_values]
        )
        return state

    run_data = make_comparison_run_data(nvbench_compare)
    ref_state = build_state(
        "1.0",
        [
            ("GPU_TIME_MIN_TAG", "0.99"),
            ("GPU_TIME_Q1_TAG", "0.995"),
            ("GPU_TIME_MEDIAN_TAG", "1.0"),
            ("GPU_TIME_Q3_TAG", "1.01"),
            ("GPU_TIME_IQR_RELATIVE_TAG", "0.01"),
        ],
    )
    cmp_state = build_state(
        "1.01",
        [
            ("GPU_TIME_MIN_TAG", "1.0"),
            ("GPU_TIME_Q1_TAG", "1.005"),
            ("GPU_TIME_MEDIAN_TAG", "1.01"),
            ("GPU_TIME_Q3_TAG", "1.02"),
            ("GPU_TIME_IQR_RELATIVE_TAG", "0.01"),
        ],
    )

    nvbench_compare.compare_benches(
        run_data,
        [make_benchmark([ref_state])],
        [make_benchmark([cmp_state])],
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
        comparison_thresholds=replace(
            nvbench_compare.get_default_thresholds(), same_center_relative=0.02
        ),
    )

    stats = run_data.stats
    assert stats.config_count == 1
    assert stats.pass_count == 1
    assert stats.undecided_count == 0
|
|
|
|
|
|
def test_compare_benches_marks_unavailable_noise_undecided(
    monkeypatch, nvbench_compare
):
    """Check that states without a usable noise summary count as undecided.

    One state pair has no relative-stdev summary at all; the other has one
    whose value is ``None``. Both pairs must be counted as undecided.
    """

    def build_state(name, mean, with_null_noise):
        state = make_state(nvbench_compare, name)
        summaries = [make_summary(nvbench_compare, "GPU_TIME_MEAN_TAG", mean)]
        if with_null_noise:
            summaries.append(
                make_summary(nvbench_compare, "GPU_TIME_STDEV_RELATIVE_TAG", None)
            )
        state["summaries"] = summaries
        return state

    run_data = make_comparison_run_data(nvbench_compare)

    missing_noise_ref = build_state("missing_noise", "1.0", False)
    missing_noise_cmp = build_state("missing_noise", "1.001", False)
    null_noise_ref = build_state("null_noise", "1.0", True)
    null_noise_cmp = build_state("null_noise", "1.001", True)

    nvbench_compare.compare_benches(
        run_data,
        [make_benchmark([missing_noise_ref, null_noise_ref])],
        [make_benchmark([missing_noise_cmp, null_noise_cmp])],
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )

    expected_counts = {
        "config_count": 2,
        "pass_count": 0,
        "improvement_count": 0,
        "regression_count": 0,
        "undecided_count": 2,
        "unknown_count": 0,
    }
    for counter, expected in expected_counts.items():
        assert getattr(run_data.stats, counter) == expected
|
|
|
|
|
|
def test_plot_along_skips_states_without_selected_axis(monkeypatch, nvbench_compare):
    """Check the statistics when ``plot_along`` names an axis one state lacks.

    Each side holds one state created with ``axis_value=1`` and one created
    without an axis value; with ``plot_along="A"`` and plotting disabled,
    both configurations must still be counted (as undecided).
    """

    def build_benches():
        return [
            make_benchmark(
                [
                    make_state(nvbench_compare, "with_axis", axis_value=1),
                    make_state(nvbench_compare, "without_axis"),
                ]
            )
        ]

    run_data = make_comparison_run_data(nvbench_compare)
    ref_benches = build_benches()
    cmp_benches = build_benches()

    nvbench_compare.compare_benches(
        run_data,
        ref_benches,
        cmp_benches,
        threshold=0.0,
        plot_along="A",
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )

    expected_counts = {
        "config_count": 2,
        "pass_count": 0,
        "improvement_count": 0,
        "regression_count": 0,
        "undecided_count": 2,
        "unknown_count": 0,
    }
    for counter, expected in expected_counts.items():
        assert getattr(run_data.stats, counter) == expected
|
|
|
|
|
|
def test_device_filter_parser_accepts_all_and_duplicate_ids(nvbench_compare):
    """Check ``parse_device_filter`` results for accepted inputs.

    ``all`` surrounded by whitespace yields ``None``; id lists are returned
    in the given order with duplicates kept.
    """
    parse = nvbench_compare.parse_device_filter
    option = "--reference-devices"

    assert parse(" all ", option) is None
    assert parse("0", option) == [0]
    assert parse("0, 2,0", option) == [0, 2, 0]
|
|
|
|
|
|
@pytest.mark.parametrize(
    "device_arg",
    [
        "",
        " ",
        "gpu",
        "-1",
        "0,gpu",
        "0,-1",
        "0,",
        ",0",
    ],
)
def test_device_filter_parser_rejects_invalid_values(nvbench_compare, device_arg):
    """Check that each invalid device argument raises ``ValueError``.

    The error text must match ``must be 'all'``.
    """
    parse = nvbench_compare.parse_device_filter
    with pytest.raises(ValueError, match="must be 'all'"):
        parse(device_arg, "--reference-devices")
|
|
|
|
|
|
def test_explicit_device_filters_downgrade_device_mismatch_to_warning(nvbench_compare):
    """Check ``require_matching_device_sections`` for each filter combination.

    The result must be truthy only when neither filter is given, and falsy as
    soon as either side supplies an explicit device list.
    """
    require_matching = nvbench_compare.require_matching_device_sections

    assert require_matching(None, None)
    for reference_filter, compare_filter in (([0], None), (None, [1]), ([0], [1])):
        assert not require_matching(reference_filter, compare_filter)
|
|
|
|
|
|
def test_compare_benches_pairs_filtered_devices_by_position(
    monkeypatch, nvbench_compare
):
    """Check the statistics when each side is filtered to a different device.

    The reference is filtered to device 0 and the compare side to device 1;
    exactly one configuration must be counted, and it must be undecided.
    """

    def build_benches(mean_on_device_0, mean_on_device_1):
        return [
            make_benchmark(
                [
                    make_state(
                        nvbench_compare, "Device=0", mean=mean_on_device_0, device=0
                    ),
                    make_state(
                        nvbench_compare, "Device=1", mean=mean_on_device_1, device=1
                    ),
                ]
            )
        ]

    run_data = make_comparison_run_data(
        nvbench_compare,
        ref_devices=[
            {"id": 0, "name": "Reference GPU 0"},
            {"id": 1, "name": "Reference GPU 1"},
        ],
        cmp_devices=[
            {"id": 0, "name": "Compare GPU 0"},
            {"id": 1, "name": "Compare GPU 1"},
        ],
    )
    ref_benches = build_benches("1.0", "9.0")
    cmp_benches = build_benches("9.0", "1.0")

    nvbench_compare.compare_benches(
        run_data,
        ref_benches,
        cmp_benches,
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
        reference_device_filter=[0],
        compare_device_filter=[1],
    )

    expected_counts = {
        "config_count": 1,
        "pass_count": 0,
        "improvement_count": 0,
        "regression_count": 0,
        "undecided_count": 1,
        "unknown_count": 0,
    }
    for counter, expected in expected_counts.items():
        assert getattr(run_data.stats, counter) == expected
|
|
|
|
|
|
def test_axis_filter_applies_to_most_recent_benchmark(monkeypatch, nvbench_compare):
    """Check the configuration count for an axis filter between two benchmarks.

    The filter plan lists ``bench1``, then the axis filter ``A=2``, then
    ``bench2``. Three configurations are expected in total, all undecided.
    """

    def build_benches():
        return [
            make_benchmark(
                [
                    make_state(nvbench_compare, "state", mean="1.0", axis_value=1),
                    make_state(nvbench_compare, "state", mean="2.0", axis_value=2),
                ],
                name="bench1",
            ),
            make_benchmark(
                [
                    make_state(nvbench_compare, "state", mean="3.0", axis_value=1),
                    make_state(nvbench_compare, "state", mean="4.0", axis_value=2),
                ],
                name="bench2",
            ),
        ]

    run_data = make_comparison_run_data(nvbench_compare)
    ref_benches = build_benches()
    cmp_benches = build_benches()

    nvbench_compare.compare_benches(
        run_data,
        ref_benches,
        cmp_benches,
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(
            nvbench_compare,
            [("benchmark", "bench1"), ("axis", "A=2"), ("benchmark", "bench2")],
        ),
        no_color=True,
    )

    expected_counts = {
        "config_count": 3,
        "pass_count": 0,
        "improvement_count": 0,
        "regression_count": 0,
        "undecided_count": 3,
        "unknown_count": 0,
    }
    for counter, expected in expected_counts.items():
        assert getattr(run_data.stats, counter) == expected
|
|
|
|
|
|
def test_main_returns_success_exit_code_when_regressions_are_detected(
    monkeypatch, capsys, nvbench_compare
):
    """Check that ``main`` returns 0 while its output reports one regression.

    The JSON reader is replaced so that ``ref.json`` and ``cmp.json`` resolve
    to in-memory roots, and ``sys.argv`` is patched to name those two files.
    """

    def build_state(mean, tag_values):
        state = make_state(nvbench_compare, "state", mean=mean)
        state["summaries"].extend(
            [make_summary(nvbench_compare, tag, value) for tag, value in tag_values]
        )
        return state

    devices = [{"id": 0, "name": "Test GPU"}]
    ref_state = build_state(
        "1.0",
        [
            ("GPU_TIME_MIN_TAG", "0.9"),
            ("GPU_TIME_Q1_TAG", "0.95"),
            ("GPU_TIME_MEDIAN_TAG", "1.0"),
            ("GPU_TIME_Q3_TAG", "1.05"),
            ("GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"),
        ],
    )
    cmp_state = build_state(
        "1.2",
        [
            ("GPU_TIME_MIN_TAG", "1.15"),
            ("GPU_TIME_Q1_TAG", "1.18"),
            ("GPU_TIME_MEDIAN_TAG", "1.2"),
            ("GPU_TIME_Q3_TAG", "1.25"),
            ("GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"),
        ],
    )
    ref_root = {"devices": devices, "benchmarks": [make_benchmark([ref_state])]}
    cmp_root = {"devices": devices, "benchmarks": [make_benchmark([cmp_state])]}

    def read_file(path):
        # Any path other than the reference file resolves to the compare root.
        if path == "ref.json":
            return ref_root
        return cmp_root

    monkeypatch.setattr(nvbench_compare.reader, "read_file", read_file)
    monkeypatch.setattr(sys, "argv", ["nvbench_compare", "ref.json", "cmp.json"])

    exit_code = nvbench_compare.main()

    assert exit_code == 0
    assert "Regression (clear timing gap, %Diff > 0): 1" in capsys.readouterr().out
|
|
|
|
|
|
def test_main_prints_undecided_reason_summary(monkeypatch, capsys, nvbench_compare):
    """Check the summary lines ``main`` prints with ``--display explain``.

    Both states are created with noise ``"0.05"``; the output must report one
    ambiguous configuration, the ``noise_too_high`` count, and the legend.
    """
    devices = [{"id": 0, "name": "Test GPU"}]
    ref_state = make_state(nvbench_compare, "state", noise="0.05")
    ref_root = {"devices": devices, "benchmarks": [make_benchmark([ref_state])]}
    cmp_state = make_state(nvbench_compare, "state", mean="1.01", noise="0.05")
    cmp_root = {"devices": devices, "benchmarks": [make_benchmark([cmp_state])]}

    def read_file(path):
        # Any path other than the reference file resolves to the compare root.
        if path == "ref.json":
            return ref_root
        return cmp_root

    monkeypatch.setattr(nvbench_compare.reader, "read_file", read_file)
    monkeypatch.setattr(
        sys, "argv", ["nvbench_compare", "--display", "explain", "ref.json", "cmp.json"]
    )

    exit_code = nvbench_compare.main()

    assert exit_code == 0
    printed = capsys.readouterr().out
    expected_fragments = (
        "Ambiguous (comparison requires more evidence): 1",
        "noise_too_high: 1",
        "Reason legend: noise-high = noise_too_high",
    )
    for fragment in expected_fragments:
        assert fragment in printed
|
|
|
|
|
|
def test_get_comparison_thresholds_returns_named_presets(nvbench_compare):
    """Check the ordering of the default, strict, and permissive presets.

    The default preset must equal the thresholds built from the module's
    ``default`` preset values; strict and permissive are compared field by
    field against it.
    """
    default, strict, permissive = (
        nvbench_compare.get_comparison_thresholds(name)
        for name in ("default", "strict", "permissive")
    )
    default_values = nvbench_compare.COMPARISON_THRESHOLD_PRESET_VALUES["default"]

    assert default == nvbench_compare.ComparisonThresholds(**default_values)

    assert strict.clear_gap_relative > default.clear_gap_relative
    assert strict.same_center_relative < default.same_center_relative
    assert strict.bulk_same_sample_coverage > default.bulk_same_sample_coverage

    assert permissive.clear_gap_relative < default.clear_gap_relative
    assert permissive.same_center_relative > default.same_center_relative
    assert permissive.bulk_same_support_coverage < default.bulk_same_support_coverage
|
|
|
|
|
|
def test_dump_comparison_config_uses_grouped_toml(nvbench_compare):
    """Check that the dumped default config contains the expected fragments."""
    default_thresholds = nvbench_compare.get_comparison_thresholds("default")
    dumped = nvbench_compare.dump_comparison_config("default", default_thresholds)

    expected_fragments = (
        "version = 1\n",
        '[preset]\nname = "default"\n',
        "[clear_gap]\nrelative = 0.005\n",
        "[same]\n",
        "[bulk]\n",
        "sample_coverage = 0.97\n",
        "[bulk.rare_support]\n",
    )
    for fragment in expected_fragments:
        assert fragment in dumped
|
|
|
|
|
|
def test_resolve_comparison_thresholds_applies_config_overrides(
    monkeypatch, nvbench_compare
):
    """Check how a config file and an explicit preset argument are combined.

    The config reader is stubbed to return the ``strict`` preset with two
    overrides. Without a preset argument the config's preset is used; with
    ``permissive`` that preset is used instead. The overrides apply in both
    cases.
    """

    def read_config(_):
        overrides = {
            "bulk_same_sample_coverage": 0.93,
            "bulk_support_max_removed_sample_fraction": 0.02,
        }
        return ("strict", overrides)

    monkeypatch.setattr(nvbench_compare, "read_comparison_config_file", read_config)

    # (preset argument, preset expected in the result)
    for requested, expected_preset in ((None, "strict"), ("permissive", "permissive")):
        preset, thresholds = nvbench_compare.resolve_comparison_thresholds(
            requested, "settings.toml"
        )
        assert preset == expected_preset
        assert thresholds.clear_gap_relative == pytest.approx(
            nvbench_compare.get_comparison_thresholds(expected_preset).clear_gap_relative
        )
        assert thresholds.bulk_same_sample_coverage == pytest.approx(0.93)
        assert thresholds.bulk_support_max_removed_sample_fraction == pytest.approx(
            0.02
        )
|
|
|
|
|
|
def test_parse_comparison_config_data_validates_grouped_thresholds(nvbench_compare):
    """Check that grouped config data is flattened into threshold overrides.

    A complete version-1 config is parsed; the result must carry the preset
    name and one flat override key per grouped setting.
    """
    config_data = {
        "version": 1,
        "preset": {"name": "strict"},
        "clear_gap": {"relative": 0.01},
        "same": {
            "center_relative": 0.002,
            "overlap_fraction": 0.75,
            "relative_dispersion_ceiling": 0.02,
        },
        "bulk": {
            "sample_coverage": 0.99,
            "support_coverage": 0.8,
            "rare_support": {
                "sample_fraction": 0.001,
                "max_removed_sample_fraction": 0.01,
            },
        },
    }
    expected_overrides = {
        "clear_gap_relative": 0.01,
        "same_center_relative": 0.002,
        "same_overlap_fraction": 0.75,
        "same_relative_dispersion_ceiling": 0.02,
        "bulk_same_sample_coverage": 0.99,
        "bulk_same_support_coverage": 0.8,
        "bulk_support_rare_sample_fraction": 0.001,
        "bulk_support_max_removed_sample_fraction": 0.01,
    }

    preset, overrides = nvbench_compare.parse_comparison_config_data(config_data)

    assert preset == "strict"
    assert overrides == expected_overrides
|
|
|
|
|
|
@pytest.mark.parametrize(
    "config_data, match",
    [
        ({}, "version"),
        ({"version": 2}, "unsupported"),
        ({"version": 1, "rare_support": {}}, "unknown top-level"),
        ({"version": 1, "bulk": {"unknown": 0.1}}, r"\[bulk\]"),
        ({"version": 1, "clear_gap": {"rare_support": {}}}, r"\[clear_gap\]"),
        ({"version": 1, "bulk": {"sample_coverage": 1.5}}, "<= 1"),
        ({"version": 1, "same": {"center_relative": "tight"}}, "finite number"),
        ({"version": 1, "preset": {"name": "aggressive"}}, "unknown comparison preset"),
    ],
)
def test_parse_comparison_config_data_rejects_invalid_config(
    nvbench_compare, config_data, match
):
    """Check that each invalid config raises ``ValueError`` matching ``match``."""
    parse = nvbench_compare.parse_comparison_config_data
    with pytest.raises(ValueError, match=match):
        parse(config_data)
|
|
|
|
|
|
def test_read_comparison_config_file_parses_toml_with_available_parser(
    tmp_path, nvbench_compare
):
    """Check that a TOML config file on disk is read into preset and overrides.

    The test is skipped when the TOML parser module for the running Python
    version cannot be imported.
    """
    # TOML config support is optional on Python 3.10 unless tomli is installed.
    # Skip the test if the selected parser module is not available.
    if sys.version_info >= (3, 11):
        parser_module = "tomllib"
    else:
        parser_module = "tomli"
    pytest.importorskip(parser_module)

    # NOTE(review): any leading indentation inside this literal is not
    # recoverable from the copy this was taken from; the content is kept at
    # column 0 as shown there.
    toml_text = """
version = 1

[preset]
name = "strict"

[bulk]
sample_coverage = 0.93
"""
    config_path = tmp_path / "settings.toml"
    config_path.write_text(toml_text, encoding="utf-8")

    preset, overrides = nvbench_compare.read_comparison_config_file(config_path)

    assert preset == "strict"
    assert overrides == {"bulk_same_sample_coverage": 0.93}
|
|
|
|
|
|
def test_main_dump_config_does_not_require_input_files(
    monkeypatch, capsys, nvbench_compare
):
    """Check that ``--dump-config`` succeeds without any JSON input paths.

    The JSON reader is replaced with a function that raises, so the test
    fails if ``main`` tries to read an input file.
    """

    def fail_on_read(_):
        raise AssertionError("dump-config should not read JSON files")

    argv = ["nvbench_compare", "--preset", "strict", "--dump-config"]
    monkeypatch.setattr(nvbench_compare.reader, "read_file", fail_on_read)
    monkeypatch.setattr(sys, "argv", argv)

    exit_code = nvbench_compare.main()

    assert exit_code == 0
    printed = capsys.readouterr().out
    assert 'name = "strict"' in printed
    assert "[bulk.rare_support]" in printed
|
|
|
|
|
|
def test_main_dump_config_merges_config_and_cli_preset(
    monkeypatch, capsys, nvbench_compare
):
    """Check the dumped config when both ``--config`` and ``--preset`` are given.

    The config reader is stubbed to return the ``strict`` preset with one
    override; the dump must name the ``permissive`` preset from the command
    line while still containing the override from the config.
    """

    def read_config(_):
        return ("strict", {"bulk_same_sample_coverage": 0.93})

    argv = [
        "nvbench_compare",
        "--config",
        "settings.toml",
        "--preset",
        "permissive",
        "--dump-config",
    ]
    monkeypatch.setattr(nvbench_compare, "read_comparison_config_file", read_config)
    monkeypatch.setattr(sys, "argv", argv)

    exit_code = nvbench_compare.main()

    assert exit_code == 0
    printed = capsys.readouterr().out
    expected_fragments = (
        'name = "permissive"',
        "relative = 0.0025",
        "sample_coverage = 0.93",
    )
    for fragment in expected_fragments:
        assert fragment in printed
|
|
|
|
|
|
def test_main_rejects_unknown_options(monkeypatch, nvbench_compare):
    """Check that a misspelled option makes ``main`` exit with code 2."""
    # "--dispaly" is a deliberate misspelling of "--display".
    argv = ["nvbench_compare", "--dispaly", "explain", "ref.json", "cmp.json"]
    monkeypatch.setattr(sys, "argv", argv)

    with pytest.raises(SystemExit) as raised:
        nvbench_compare.main()

    assert raised.value.code == 2
|
|
|
|
|
|
def test_main_prints_bulk_debug_python_to_stdout(monkeypatch, capsys, nvbench_compare):
    """Check the text ``main`` prints for ``--bulk-debug-python STDOUT``.

    ``compare_benches`` is replaced with a stub that appends one debug row to
    the ``bulk_debug_rows`` keyword argument, and the JSON reader returns a
    root with no benchmarks for every path.
    """
    empty_root = {
        "devices": [{"id": 0, "name": "Test GPU"}],
        "benchmarks": [],
    }
    # Row fields the stub leaves unset, in the order they are inserted.
    unset_fields = (
        "reference_sample_filename",
        "reference_sample_count",
        "reference_frequency_filename",
        "reference_frequency_count",
        "compare_sample_filename",
        "compare_sample_count",
        "compare_frequency_filename",
        "compare_frequency_count",
    )

    def fake_compare_benches(*args, **kwargs):
        debug_row = {"row_index": 0, "status": "AMBG"}
        debug_row.update(dict.fromkeys(unset_fields))
        kwargs["bulk_debug_rows"].append(debug_row)

    argv = [
        "nvbench_compare",
        "--bulk-debug-python",
        "STDOUT",
        "ref.json",
        "cmp.json",
    ]
    monkeypatch.setattr(nvbench_compare.reader, "read_file", lambda _: empty_root)
    monkeypatch.setattr(nvbench_compare, "compare_benches", fake_compare_benches)
    monkeypatch.setattr(sys, "argv", argv)

    exit_code = nvbench_compare.main()

    assert exit_code == 0
    printed = capsys.readouterr().out
    expected_fragments = (
        "# NVB-BULK-BEGIN",
        "bulk_rows = [",
        "'status': 'AMBG'",
        "def load_bulk_data(row):",
        "# NVB-BULK-END",
    )
    for fragment in expected_fragments:
        assert fragment in printed
|
|
|
|
|
|
def test_compare_benches_counts_unusable_timing_as_unknown(
    monkeypatch, nvbench_compare
):
    """A reference state built with ``mean="nan"`` is tallied as unknown."""
    run_data = make_comparison_run_data(nvbench_compare)
    tabulate_calls = capture_tabulate_calls(monkeypatch, nvbench_compare)

    nan_reference = make_state(nvbench_compare, "state", mean="nan")
    reference_benches = [make_benchmark([nan_reference])]
    valid_compare = make_state(nvbench_compare, "state", mean="1.0")
    compare_benches = [make_benchmark([valid_compare])]

    nvbench_compare.compare_benches(
        run_data,
        reference_benches,
        compare_benches,
        threshold=1.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )

    stats = run_data.stats
    assert stats.config_count == 1
    assert stats.unknown_count == 1

    row = find_tabulate_call(tabulate_calls, INTERVAL_DISPLAY_HEADERS)["rows"][0]
    # Check the last four cells of the first table row, from right to left.
    assert row[-4] == "n/a"
    assert row[-3] == "1.000 s"
    assert row[-2] == ""
    assert row[-1] == "\U0001f7e1 ????"
|
|
|
|
|
|
def test_compare_benches_counts_skipped_state_as_unknown(monkeypatch, nvbench_compare):
    """A skipped reference state is tallied as unknown under "state-skip"."""
    run_data = make_comparison_run_data(nvbench_compare)
    tabulate_calls = capture_tabulate_calls(monkeypatch, nvbench_compare)

    skipped_reference = make_state(nvbench_compare, "state")
    # Mark the reference state as skipped and leave it without summaries.
    skipped_reference.update(
        {
            "summaries": None,
            "is_skipped": True,
            "skip_reason": "requested by benchmark",
        }
    )
    valid_compare = make_state(nvbench_compare, "state", mean="1.0")

    nvbench_compare.compare_benches(
        run_data,
        [make_benchmark([skipped_reference])],
        [make_benchmark([valid_compare])],
        threshold=1.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )

    stats = run_data.stats
    assert stats.config_count == 1
    assert stats.unknown_count == 1

    skip_reason = stats.reason_legend["state-skip"]
    assert skip_reason.canonical_code == "state_skipped"
    assert skip_reason.message == "reference state skipped: requested by benchmark"

    row = find_tabulate_call(tabulate_calls, INTERVAL_DISPLAY_HEADERS)["rows"][0]
    assert row[-4] == "n/a"
    assert row[-3] == "1.000 s"
    assert row[-2] == ""
    assert row[-1] == "\U0001f7e1 ????"
|
|
|
|
|
|
def test_compare_benches_counts_missing_summaries_as_unknown(
    monkeypatch, nvbench_compare
):
    """A reference state lacking "summaries" is tallied as unknown ("summ-miss")."""
    run_data = make_comparison_run_data(nvbench_compare)
    tabulate_calls = capture_tabulate_calls(monkeypatch, nvbench_compare)

    bare_reference = make_state(nvbench_compare, "state")
    # Remove the key entirely (rather than setting it to None).
    bare_reference.pop("summaries")
    valid_compare = make_state(nvbench_compare, "state", mean="1.0")

    nvbench_compare.compare_benches(
        run_data,
        [make_benchmark([bare_reference])],
        [make_benchmark([valid_compare])],
        threshold=1.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )

    stats = run_data.stats
    assert stats.config_count == 1
    assert stats.unknown_count == 1

    missing_reason = stats.reason_legend["summ-miss"]
    assert missing_reason.canonical_code == "gpu_timing_summaries_missing"
    assert missing_reason.message == "reference GPU timing summaries are missing"

    row = find_tabulate_call(tabulate_calls, INTERVAL_DISPLAY_HEADERS)["rows"][0]
    assert row[-4] == "n/a"
    assert row[-3] == "1.000 s"
    assert row[-2] == ""
    assert row[-1] == "\U0001f7e1 ????"
|
|
|
|
|
|
def test_compare_benches_plot_skips_unknown_rows(monkeypatch, nvbench_compare):
    """With ``plot=True``, an unknown comparison passes no entries to the plotter."""
    plotted_entries = []

    def fake_plot_comparison_entries(entries, *args, **kwargs):
        # Record whatever would have been plotted instead of drawing it.
        plotted_entries.extend(entries)
        return 0

    monkeypatch.setattr(
        nvbench_compare, "plot_comparison_entries", fake_plot_comparison_entries
    )

    run_data = make_comparison_run_data(nvbench_compare)
    bare_reference = make_state(nvbench_compare, "state")
    bare_reference.pop("summaries")
    valid_compare = make_state(nvbench_compare, "state", mean="1.0")

    nvbench_compare.compare_benches(
        run_data,
        [make_benchmark([bare_reference])],
        [make_benchmark([valid_compare])],
        threshold=1.0,
        plot_along=None,
        plot=True,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )

    stats = run_data.stats
    assert stats.config_count == 1
    assert stats.unknown_count == 1
    assert plotted_entries == []
|
|
|
|
|
|
def test_compare_benches_defaults_to_interval_display(monkeypatch, nvbench_compare):
    """Omitting ``display`` yields a table with INTERVAL_DISPLAY_HEADERS."""
    run_data = make_comparison_run_data(nvbench_compare)
    tabulate_calls = capture_tabulate_calls(monkeypatch, nvbench_compare)

    reference_state = make_state(nvbench_compare, "state", mean="1.0")
    reference_benches = [make_benchmark([reference_state])]
    compare_state = make_state(nvbench_compare, "state", mean="1.01")
    compare_benches = [make_benchmark([compare_state])]

    # No ``display`` keyword is passed: the default mode is under test.
    nvbench_compare.compare_benches(
        run_data,
        reference_benches,
        compare_benches,
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
    )

    row = find_tabulate_call(tabulate_calls, INTERVAL_DISPLAY_HEADERS)["rows"][0]
    assert row[-4].startswith("1.000 s")
    assert row[-3].startswith("1.010 s")
    assert row[-2] == ""
|
|
|
|
|
|
def test_compare_benches_legacy_display_uses_scalar_diff(monkeypatch, nvbench_compare):
    """``display="legacy"`` yields a LEGACY_DISPLAY_HEADERS table with diff cells."""
    run_data = make_comparison_run_data(nvbench_compare)
    tabulate_calls = capture_tabulate_calls(monkeypatch, nvbench_compare)

    reference_state = make_state(nvbench_compare, "state", mean="1.0")
    reference_benches = [make_benchmark([reference_state])]
    compare_state = make_state(nvbench_compare, "state", mean="1.01")
    compare_benches = [make_benchmark([compare_state])]

    nvbench_compare.compare_benches(
        run_data,
        reference_benches,
        compare_benches,
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
        display="legacy",
    )

    row = find_tabulate_call(tabulate_calls, LEGACY_DISPLAY_HEADERS)["rows"][0]
    # Expected cell text keyed by its (negative) position in the first row.
    expected_cells = {
        -7: "1.000 s",
        -5: "1.010 s",
        -3: "10.000 ms",
        -2: "1.00%",
    }
    for position, expected in expected_cells.items():
        assert row[position] == expected
|
|
|
|
|
|
def test_compare_benches_explain_display_uses_explicit_intervals(
    monkeypatch, nvbench_compare
):
    """``display="explain"`` yields an EXPLAIN_DISPLAY_HEADERS table row."""
    run_data = make_comparison_run_data(nvbench_compare)
    tabulate_calls = capture_tabulate_calls(monkeypatch, nvbench_compare)

    def add_summaries(state, minimum, q1, median, q3):
        # Append min/Q1/median/Q3 timing summaries plus a fixed SM clock rate.
        tagged_values = (
            ("GPU_TIME_MIN_TAG", minimum),
            ("GPU_TIME_Q1_TAG", q1),
            ("GPU_TIME_MEDIAN_TAG", median),
            ("GPU_TIME_Q3_TAG", q3),
            ("GPU_SM_CLOCK_RATE_MEAN_TAG", "100.0"),
        )
        state["summaries"].extend(
            [make_summary(nvbench_compare, tag, value) for tag, value in tagged_values]
        )

    ref_state = make_state(nvbench_compare, "state", mean="1.0")
    add_summaries(ref_state, "1.0", "1.01", "1.02", "1.03")
    cmp_state = make_state(nvbench_compare, "state", mean="1.01")
    add_summaries(cmp_state, "1.01", "1.02", "1.03", "1.04")

    nvbench_compare.compare_benches(
        run_data,
        [make_benchmark([ref_state])],
        [make_benchmark([cmp_state])],
        threshold=0.0,
        plot_along=None,
        plot=False,
        dark=False,
        filter_plan=make_filter_plan(nvbench_compare),
        no_color=True,
        display="explain",
    )

    row = find_tabulate_call(tabulate_calls, EXPLAIN_DISPLAY_HEADERS)["rows"][0]
    assert row[-7] == "1.0[00 | 20 | 30] s"
    assert row[-6] == "1.0[10 | 30 | 40] s"
    assert row[-3] == "centers-far"
    assert row[-2] == ""
|
|
|
|
|
|
def test_main_passes_selected_preset_to_compare_benches(monkeypatch, nvbench_compare):
    """main() forwards the ``--preset`` thresholds and ``--display`` mode."""
    parsed_root = {"devices": [{"id": 0, "name": "Test GPU"}], "benchmarks": []}
    captured = {}

    def fake_read_file(_):
        # Every input path resolves to the same in-memory document.
        return parsed_root

    def fake_compare_benches(*args, **kwargs):
        # Keep only the keyword arguments this test inspects.
        for key in ("comparison_thresholds", "display"):
            captured[key] = kwargs[key]

    argv = [
        "nvbench_compare",
        "--preset",
        "strict",
        "--display",
        "explain",
        "ref.json",
        "cmp.json",
    ]
    monkeypatch.setattr(nvbench_compare.reader, "read_file", fake_read_file)
    monkeypatch.setattr(nvbench_compare, "compare_benches", fake_compare_benches)
    monkeypatch.setattr(sys, "argv", argv)

    assert nvbench_compare.main() == 0

    strict_thresholds = nvbench_compare.get_comparison_thresholds("strict")
    assert captured["comparison_thresholds"] == strict_thresholds
    assert captured["display"] == "explain"
|