nvbench/python/test/test_benchmark_result.py
Oleksandr Pavlyk 338936b6fe Provide BenchmarkResult class for parsing JSON output of NVBench-instrumented benchmarks (#356)
Implements the `cuda.bench.results.BenchmarkResult` class to represent data from the JSON output of benchmark execution.

The class provides two class methods: `BenchmarkResult.from_json(filename: str | os.PathLike, *, metadata: Any = None)`, which expects the name of a well-formed JSON file, and `BenchmarkResult.empty(*, metadata: Any = None)`, intended to represent a failed result whose causes can be recorded in the metadata at the user's discretion.
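For instance, a small runner sketch that records run metadata and falls back to an empty result on failure might look like this (the `run_and_collect` helper, its command handling, and the metadata keys are illustrative, not part of the package):

```
import subprocess
import time

import cuda.bench.results as results


def run_and_collect(cmd, json_path):
    # Hypothetical helper: run an NVBench-instrumented benchmark binary and
    # parse the JSON it wrote, recording how the run went in metadata.
    start = time.monotonic()
    proc = subprocess.run(cmd)
    metadata = {
        "returncode": proc.returncode,
        "elapsed_seconds": time.monotonic() - start,
    }
    if proc.returncode != 0:
        # No JSON is read on this branch; metadata captures the failure.
        return results.BenchmarkResult.empty(metadata=metadata)
    return results.BenchmarkResult.from_json(json_path, metadata=metadata)
```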

`BenchmarkResult` implements a mapping interface, supporting the `.keys()`, `.values()`, and `.items()` methods as well as the `__len__`, `__contains__`, `__getitem__`, and `__iter__` special methods.

Values in `BenchmarkResult` have type `cuda.bench.results.SubBenchmarkResult`, which implements a list-like interface, i.e., the `__len__`, `__getitem__`, and `__iter__` special methods. Values in this list-like structure correspond to measurements of individual states of a particular benchmark (the key in `BenchmarkResult`).

Elements of the `SubBenchmarkResult` structure have type `SubBenchmarkState`; they support the mapping protocol with axis values as keys and represent data corresponding to measurements for a particular state (a combination of settings for each axis).

The state provides `.samples` and `.frequencies` attributes storing raw execution-duration values and estimates of average GPU frequencies.
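Putting these pieces together, a minimal sketch (assuming a result file like the ones in the examples below, and using the `.point` and `.name()` state accessors exercised in the tests):

```
import numpy as np
import cuda.bench.results as results

r = results.BenchmarkResult.from_json("perf_data/axes_run1.json")

# Iterate the list-like SubBenchmarkResult for one benchmark and inspect
# each state's axis values and raw measurements.
for state in r["copy_sweep_grid_shape"]:
    if state.samples is None:
        continue  # skipped or sample-less states carry no raw data
    times = np.asarray(state.samples)
    freqs = np.asarray(state.frequencies)
    print(state.name(), state.point, np.median(times), freqs.mean())
```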

Example usage:

```
import numpy as np, cuda.bench.results

r = cuda.bench.results.BenchmarkResult.from_json("perf_data/axes_run1.json")

r["copy_sweep_grid_shape"].centers_with_frequencies(
    lambda t, f: np.median(np.asarray(t) * np.asarray(f)))
```

```
In [1]: import array, numpy as np, cuda.bench.results

In [2]: r = cuda.bench.results.BenchmarkResult.from_json("temp_data/axes_run1.json")

In [3]: list(r)
Out[3]:
['simple',
 'single_float64_axis',
 'copy_sweep_grid_shape',
 'copy_type_sweep',
 'copy_type_conversion_sweep',
 'copy_type_and_block_size_sweep']

In [4]: r["simple"].centers(lambda t: np.percentile(t, [25,75]))
Out[4]: {'Device=0': array([0.00100966, 0.00101299])}

In [5]: r.centers(lambda t: np.percentile(t, [25,75]))["simple"]
Out[5]: {'Device=0': array([0.00100966, 0.00101299])}

In [6]: len(r)
Out[6]: 6

In [7]: "fake" in r
Out[7]: False
```

Each `SubBenchmarkState` also exposes a `.summaries` attribute: a mapping from summary tag to a rich object that retains tag/name/hint/hide/description metadata.

* Add nvbench-json-summary to render NVBench JSON output as an NVBench-style
markdown summary table, including axis formatting, device sections, hidden
summary filtering, and summary hint formatting.

Update packaging, type stubs, and tests for the new namespace, renamed
classes, Python 3.10-compatible annotations, and summary-table generation.

* Split tests in test_benchmark_result into smaller tests

* Fix break due to file name change

* Add python/examples/benchmark_result_autotune.py

This example uses cuda.bench and cuda.bench.results to implement simple auto-tuning, demonstrated by selecting the tile-shape hyperparameter for a naive stencil kernel implemented in numba-cuda.

* Resolve ruff PLE0604

* Fix format_axis_value in the JSON format script to handle None values

Add tests to cover such input.

* Address code rabbit review feedback

* Fix license header, add validation

* Addressed both issues raised in review

Malformed values are now represented in the result as None.

Skipped benchmarks are no longer dropped, i.e., they are present in the BenchmarkResult data, but they are not reflected in the summary table, in line with what NVBench-instrumented benchmarks do.
2026-05-13 13:23:58 -05:00


# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

import json
import struct
from dataclasses import dataclass

import cuda.bench
import cuda.bench.results as results
import pytest


def write_json(path, data):
    path.write_text(json.dumps(data), encoding="utf-8")


def block_size_axis(*values):
    return {
        "name": "BlockSize",
        "type": "int64",
        "flags": "pow2",
        "values": [
            {
                "input_string": str(value),
                "description": f"2^{value} = {2**value}",
                "value": 2**value,
            }
            for value in values
        ],
    }


def sample_file_summary(tag, filename, size):
    return {
        "tag": tag,
        "data": [
            {
                "name": "filename",
                "type": "string",
                "value": filename,
            },
            {
                "name": "size",
                "type": "int64",
                "value": str(size),
            },
        ],
    }


def sample_times_summary(filename, size):
    return sample_file_summary(
        "nv/json/bin:nv/cold/sample_times",
        filename,
        size,
    )


def sample_frequencies_summary(filename, size):
    return sample_file_summary(
        "nv/json/freqs-bin:nv/cold/sample_freqs",
        filename,
        size,
    )


def bwutil_summary(value):
    return {
        "tag": "nv/cold/bw/global/utilization",
        "name": "BWUtil",
        "hint": "percentage",
        "description": "Global memory utilization",
        "data": [
            {
                "name": "value",
                "type": "float64",
                "value": str(value),
            }
        ],
    }


@pytest.fixture
def sample_result_path(tmp_path):
    bin_dir = tmp_path / "result.json-bin"
    bin_dir.mkdir()
    (bin_dir / "0.bin").write_bytes(struct.pack("<3f", 1.0, 2.0, 4.0))
    freq_bin_dir = tmp_path / "result.json-freqs-bin"
    freq_bin_dir.mkdir()
    (freq_bin_dir / "0.bin").write_bytes(struct.pack("<3f", 100.0, 200.0, 400.0))
    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [block_size_axis(8)],
                    "states": [
                        {
                            "name": "Device=0 BlockSize=2^8",
                            "axis_values": [
                                {
                                    "name": "BlockSize",
                                    "type": "int64",
                                    "value": "256",
                                }
                            ],
                            "summaries": [
                                sample_times_summary("result.json-bin/0.bin", 3),
                                bwutil_summary(0.75),
                                sample_frequencies_summary(
                                    "result.json-freqs-bin/0.bin",
                                    3,
                                ),
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )
    return json_fn


@pytest.fixture
def sample_result(sample_result_path):
    return results.BenchmarkResult.from_json(sample_result_path)


@pytest.fixture
def sample_subbenchmark(sample_result):
    return sample_result["copy"]


@pytest.fixture
def sample_state(sample_subbenchmark):
    return sample_subbenchmark[0]


def test_result_classes_are_exposed_from_results_namespace():
    assert results.BenchmarkResult.__module__ == results.__name__
    assert results.BenchmarkResultSummary.__module__ == results.__name__
    assert not hasattr(cuda.bench, "BenchmarkResult")


def test_from_json_preserves_optional_metadata(sample_result_path):
    metadata = {"returncode": 0, "elapsed_seconds": 0.25}
    default_result = results.BenchmarkResult.from_json(sample_result_path)
    result = results.BenchmarkResult.from_json(sample_result_path, metadata=metadata)
    assert default_result.metadata is None
    assert result.metadata is metadata


def test_benchmark_result_implements_mapping_protocol(sample_result):
    subbenchmark = sample_result["copy"]
    assert len(sample_result) == 1
    assert list(sample_result) == ["copy"]
    assert list(sample_result.keys()) == ["copy"]
    assert list(sample_result.values()) == [subbenchmark]
    assert list(sample_result.items()) == [("copy", subbenchmark)]
    assert "copy" in sample_result
    assert "missing" not in sample_result
    assert subbenchmark is sample_result.subbenches["copy"]
    with pytest.raises(KeyError):
        sample_result["missing"]


def test_subbenchmark_result_implements_sequence_protocol(sample_subbenchmark):
    state = sample_subbenchmark[0]
    assert len(sample_subbenchmark) == 1
    assert sample_subbenchmark[-1] is state
    assert sample_subbenchmark[:] == sample_subbenchmark.states
    assert list(sample_subbenchmark) == sample_subbenchmark.states
    with pytest.raises(IndexError):
        sample_subbenchmark[1]


def test_state_parses_axis_name_and_bandwidth(sample_state):
    assert sample_state.name() == "BlockSize[pow2]=8"
    assert sample_state.bw == 0.75


def test_state_stores_rich_summary_metadata(sample_state):
    bw_summary = sample_state.summaries["nv/cold/bw/global/utilization"]
    assert bw_summary.tag == "nv/cold/bw/global/utilization"
    assert bw_summary.name == "BWUtil"
    assert bw_summary.hint == "percentage"
    assert bw_summary.hide is None
    assert bw_summary.description == "Global memory utilization"
    assert bw_summary.value == pytest.approx(0.75)
    assert bw_summary["value"] == pytest.approx(0.75)
    assert sample_state.summaries["nv/json/bin:nv/cold/sample_times"].data == {
        "filename": "result.json-bin/0.bin",
        "size": 3,
    }
    assert sample_state.summaries["nv/json/freqs-bin:nv/cold/sample_freqs"].data == {
        "filename": "result.json-freqs-bin/0.bin",
        "size": 3,
    }


def test_state_preserves_null_summary_values(tmp_path):
    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [],
                    "states": [
                        {
                            "name": "Device=0",
                            "axis_values": [],
                            "summaries": [
                                {
                                    "tag": "nv/cold/time/gpu/stdev/relative",
                                    "name": "Noise",
                                    "hint": "percentage",
                                    "data": [
                                        {
                                            "name": "value",
                                            "type": "float64",
                                            "value": None,
                                        }
                                    ],
                                }
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )
    summary = results.BenchmarkResult.from_json(json_fn)["copy"][0].summaries[
        "nv/cold/time/gpu/stdev/relative"
    ]
    assert summary.value is None
    assert summary["value"] is None


def test_state_reports_malformed_numeric_summary_values(tmp_path):
    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [],
                    "states": [
                        {
                            "name": "Device=0",
                            "axis_values": [],
                            "summaries": [
                                {
                                    "tag": "nv/cold/time/gpu/mean",
                                    "name": "GPU Time",
                                    "hint": "duration",
                                    "data": [
                                        {
                                            "name": "value",
                                            "type": "float64",
                                            "value": "not-a-number",
                                        }
                                    ],
                                }
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )
    with pytest.raises(
        ValueError,
        match=(
            "summary 'nv/cold/time/gpu/mean' field 'value' "
            "value 'not-a-number' is not a float64"
        ),
    ):
        results.BenchmarkResult.from_json(json_fn)


def test_state_loads_samples_and_frequencies(sample_state):
    assert sample_state.samples is not None
    assert list(sample_state.samples) == pytest.approx([1.0, 2.0, 4.0])
    assert sample_state.frequencies is not None
    assert list(sample_state.frequencies) == pytest.approx([100.0, 200.0, 400.0])


def test_centers_apply_estimators_to_samples(sample_result):
    centers = sample_result.centers(lambda samples: sum(samples) / len(samples))
    assert centers == {"copy": {"BlockSize[pow2]=8": pytest.approx(7.0 / 3.0)}}


def test_centers_with_frequencies_apply_estimators(sample_result, sample_subbenchmark):
    def weighted_mean(samples, frequencies):
        return sum(
            sample * frequency for sample, frequency in zip(samples, frequencies)
        ) / sum(frequencies)

    weighted_centers = sample_result.centers_with_frequencies(weighted_mean)
    assert weighted_centers == {"copy": {"BlockSize[pow2]=8": pytest.approx(3.0)}}
    assert (
        sample_subbenchmark.centers_with_frequencies(weighted_mean)
        == weighted_centers["copy"]
    )


def test_benchmark_result_constructor_is_private():
    with pytest.raises(TypeError, match="from_json\\(\\).*empty\\(\\)"):
        results.BenchmarkResult()
    with pytest.raises(TypeError, match="from_json\\(\\).*empty\\(\\)"):
        results.BenchmarkResult("result.json")
    with pytest.raises(TypeError):
        results.BenchmarkResult(metadata=None)
    with pytest.raises(TypeError):
        results.BenchmarkResult(json_path="result.json", parse=False)


def test_benchmark_result_empty_does_not_read_json(tmp_path):
    @dataclass
    class RunMetadata:
        returncode: int
        elapsed_seconds: float

    metadata = RunMetadata(returncode=1, elapsed_seconds=0.25)
    missing_json = tmp_path / "missing.json"
    result = results.BenchmarkResult.empty(metadata=metadata)
    assert result.metadata is metadata
    assert result.subbenches == {}
    with pytest.raises(FileNotFoundError):
        results.BenchmarkResult.from_json(missing_json, metadata=metadata)
    with pytest.raises(FileNotFoundError):
        results.BenchmarkResult.from_json(json_path=missing_json, metadata=metadata)


def test_benchmark_result_accepts_no_axis_benchmark_with_recorded_binary_path(
    tmp_path, monkeypatch
):
    data_dir = tmp_path / "temp_data"
    data_dir.mkdir()
    bin_dir = data_dir / "axes_run1.json-bin"
    bin_dir.mkdir()
    (bin_dir / "0.bin").write_bytes(struct.pack("<2f", 1.0, 4.0))
    freq_bin_dir = data_dir / "axes_run1.json-freqs-bin"
    freq_bin_dir.mkdir()
    (freq_bin_dir / "0.bin").write_bytes(struct.pack("<2f", 100.0, 400.0))
    json_fn = data_dir / "axes_run1.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "simple",
                    "axes": None,
                    "states": [
                        {
                            "name": "Device=0",
                            "axis_values": None,
                            "summaries": [
                                sample_times_summary(
                                    "temp_data/axes_run1.json-bin/0.bin",
                                    2,
                                ),
                                sample_frequencies_summary(
                                    "temp_data/axes_run1.json-freqs-bin/0.bin",
                                    2,
                                ),
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )
    monkeypatch.chdir(tmp_path)
    result = results.BenchmarkResult.from_json("temp_data/axes_run1.json")
    state = result.subbenches["simple"].states[0]
    assert state.name() == "Device=0"
    assert state.point == {}
    assert state.samples is not None
    assert list(state.samples) == pytest.approx([1.0, 4.0])
    assert state.frequencies is not None
    assert list(state.frequencies) == pytest.approx([100.0, 400.0])


def test_benchmark_result_accepts_axis_value_input_string():
    result = results.SubBenchmarkResult(
        {
            "name": "single_float64_axis",
            "axes": [
                {
                    "name": "Duration",
                    "type": "float64",
                    "flags": "",
                    "values": [
                        {
                            "input_string": "0",
                            "description": "",
                            "value": 0.0,
                        }
                    ],
                }
            ],
            "states": [
                {
                    "name": "Device=0 Duration=0",
                    "axis_values": [
                        {
                            "name": "Duration",
                            "type": "float64",
                            "value": "0",
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                }
            ],
        },
        "",
    )
    state = result.states[0]
    assert state.name() == "Duration=0"
    assert state.point == {"Duration": "0"}


def test_benchmark_result_normalizes_axis_value_lookup_key():
    result = results.SubBenchmarkResult(
        {
            "name": "num_blocks",
            "axes": [
                {
                    "name": "NumBlocks",
                    "type": "int64",
                    "flags": "",
                    "values": [
                        {
                            "input_string": "64",
                            "description": "",
                            "value": 64,
                        },
                        {
                            "input_string": "default",
                            "description": "",
                            "value": None,
                        },
                    ],
                }
            ],
            "states": [
                {
                    "name": "Device=0 NumBlocks=64",
                    "axis_values": [
                        {
                            "name": "NumBlocks",
                            "type": "int64",
                            "value": 64,
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                },
                {
                    "name": "Device=0 NumBlocks=default",
                    "axis_values": [
                        {
                            "name": "NumBlocks",
                            "type": "int64",
                            "value": None,
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                },
                {
                    "name": "Device=0 NumBlocks=64",
                    "axis_values": [
                        {
                            "name": "NumBlocks",
                            "type": "int64",
                            "input_string": "64",
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                },
            ],
        },
        "",
    )
    assert result.states[0].point == {"NumBlocks": "64"}
    assert result.states[1].point == {"NumBlocks": "default"}
    assert result.states[2].point == {"NumBlocks": "64"}


def test_benchmark_result_preserves_skipped_state_with_no_summaries():
    result = results.SubBenchmarkResult(
        {
            "name": "copy_sweep_grid_shape",
            "axes": [block_size_axis(6, 8)],
            "states": [
                {
                    "name": "Device=0 BlockSize=2^8",
                    "axis_values": [
                        {
                            "name": "BlockSize",
                            "type": "int64",
                            "value": "256",
                        }
                    ],
                    "summaries": None,
                    "is_skipped": True,
                },
                {
                    "name": "Device=0 BlockSize=2^6",
                    "axis_values": [
                        {
                            "name": "BlockSize",
                            "type": "int64",
                            "value": "64",
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                },
            ],
        },
        "",
    )
    assert len(result.states) == 2
    assert result.states[0].name() == "BlockSize[pow2]=8"
    assert result.states[0].is_skipped is True
    assert result.states[0].summaries == {}
    assert result.states[0].samples is None
    assert result.states[0].frequencies is None
    assert result.states[1].name() == "BlockSize[pow2]=6"
    assert result.states[1].is_skipped is False


def test_benchmark_result_uses_empty_summaries_when_field_is_missing():
    result = results.SubBenchmarkResult(
        {
            "name": "copy_sweep_grid_shape",
            "axes": [block_size_axis(8)],
            "states": [
                {
                    "name": "Device=0 BlockSize=2^8",
                    "axis_values": [
                        {
                            "name": "BlockSize",
                            "type": "int64",
                            "value": "256",
                        }
                    ],
                    "is_skipped": False,
                },
            ],
        },
        "",
    )
    state = result.states[0]
    assert state.name() == "BlockSize[pow2]=8"
    assert state.summaries == {}
    assert state.samples is None
    assert state.frequencies is None
    assert state.bw is None


@pytest.mark.parametrize(
    "field_name,bad_type,expected_type",
    [
        ("filename", "int64", "string"),
        ("size", "string", "int64"),
    ],
)
def test_benchmark_result_validates_binary_summary_field_types(
    field_name, bad_type, expected_type
):
    summary = sample_times_summary("result.json-bin/0.bin", 3)
    for value_data in summary["data"]:
        if value_data["name"] == field_name:
            value_data["type"] = bad_type
            if field_name == "filename":
                value_data["value"] = "123"
    with pytest.raises(
        ValueError,
        match=rf"field '{field_name}' has type '{bad_type}'; expected '{expected_type}'",
    ):
        results.SubBenchmarkResult(
            {
                "name": "copy",
                "axes": [],
                "states": [
                    {
                        "name": "Device=0",
                        "axis_values": [],
                        "summaries": [summary],
                        "is_skipped": False,
                    }
                ],
            },
            "",
        )


def test_benchmark_result_uses_none_for_unavailable_samples(tmp_path):
    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [block_size_axis(8, 9)],
                    "states": [
                        {
                            "name": "Device=0 BlockSize=2^8",
                            "axis_values": [
                                {
                                    "name": "BlockSize",
                                    "type": "int64",
                                    "value": "256",
                                }
                            ],
                            "summaries": [],
                            "is_skipped": False,
                        },
                        {
                            "name": "Device=0 BlockSize=2^9",
                            "axis_values": [
                                {
                                    "name": "BlockSize",
                                    "type": "int64",
                                    "value": "512",
                                }
                            ],
                            "summaries": [
                                sample_times_summary(
                                    "result.json-bin/missing.bin",
                                    3,
                                ),
                                sample_frequencies_summary(
                                    "result.json-freqs-bin/missing.bin",
                                    3,
                                ),
                            ],
                            "is_skipped": False,
                        },
                    ],
                }
            ]
        },
    )
    result = results.BenchmarkResult.from_json(json_fn)
    states = result.subbenches["copy"].states
    assert states[0].samples is None
    assert states[1].samples is None
    assert states[0].frequencies is None
    assert states[1].frequencies is None
    assert result.centers(lambda samples: pytest.fail("estimator should not run")) == {
        "copy": {
            "BlockSize[pow2]=8": None,
            "BlockSize[pow2]=9": None,
        }
    }
    assert result.centers_with_frequencies(
        lambda samples, frequencies: pytest.fail("estimator should not run")
    ) == {
        "copy": {
            "BlockSize[pow2]=8": None,
            "BlockSize[pow2]=9": None,
        }
    }


def test_benchmark_result_rejects_mismatched_sample_and_frequency_counts(tmp_path):
    bin_dir = tmp_path / "result.json-bin"
    bin_dir.mkdir()
    (bin_dir / "0.bin").write_bytes(struct.pack("<3f", 1.0, 2.0, 4.0))
    freq_bin_dir = tmp_path / "result.json-freqs-bin"
    freq_bin_dir.mkdir()
    (freq_bin_dir / "0.bin").write_bytes(struct.pack("<2f", 100.0, 200.0))
    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [block_size_axis(8)],
                    "states": [
                        {
                            "name": "Device=0 BlockSize=2^8",
                            "axis_values": [
                                {
                                    "name": "BlockSize",
                                    "type": "int64",
                                    "value": "256",
                                }
                            ],
                            "summaries": [
                                sample_times_summary("result.json-bin/0.bin", 3),
                                sample_frequencies_summary(
                                    "result.json-freqs-bin/0.bin",
                                    2,
                                ),
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )
    with pytest.raises(ValueError, match="sample count .* frequency count"):
        results.BenchmarkResult.from_json(json_fn)