mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-05-14 10:07:25 +00:00
Implements `cuda.bench.results.BenchmarkResult` class to represent data from JSON output of benchmark execution.
The module implements two class methods: `BenchmarkResult.from_json(filename : str | os.PathLike, *, metadata : Any = None)`, which expects the name of a well-formed JSON file, and `BenchmarkResult.empty(*, metadata : Any = None)`, intended to represent a failed result with reasons that can be recorded in metadata at the user's discretion.
The `BenchmarkResult` implements mapping interface, supporting `.keys()`, `.values()`, `.items()` methods, `__len__`, `__contains__`, `__getitem__` and `__iter__` special methods.
Values in `BenchmarkResult` have type `cuda.bench.results.SubBenchmarkResult`, which implements a list-like interface, i.e. implements `__len__`, `__getitem__`, and `__iter__` special methods. Values in this list-like structure correspond to measurements of individual states of a particular benchmark (the key in `BenchmarkResult`).
Elements of `SubBenchmarkResult` structure have type `SubBenchmarkState` that supports mapping protocol with axis_values as a key and represent data corresponding to measurements for a particular state (combination of settings for each axis).
The state provides `.samples` and `.frequencies` attributes storing raw execution duration values and estimates for average GPU frequencies.
Example usage:
```
import array, numpy as np, cuda.bench.results
r = cuda.bench.results.BenchmarkResult("perf_data/axes_run1.json")
r["copy_sweep_grid_shape"].centers_with_frequencies(
lambda t, f: np.median(np.asarray(t)*np.asarray(f)))
```
```
In [1]: import array, numpy as np, cuda.bench.results
In [2]: r = cuda.bench.results.BenchmarkResult("temp_data/axes_run1.json")
In [3]: list(r)
Out[3]:
['simple',
'single_float64_axis',
'copy_sweep_grid_shape',
'copy_type_sweep',
'copy_type_conversion_sweep',
'copy_type_and_block_size_sweep']
In [4]: r["simple"].centers(lambda t: np.percentile(t, [25,75]))
Out[4]: {'Device=0': array([0.00100966, 0.00101299])}
In [5]: r.centers(lambda t: np.percentile(t, [25,75]))["simple"]
Out[5]: {'Device=0': array([0.00100966, 0.00101299])}
In [6]: len(r)
Out[6]: 6
In [7]: "fake" in r
Out[7]: False
```
Each `SubBenchmarkState` implements
`.summaries` attribute - rich object that retains tag/name/hint/hide/description metadata.
* Add nvbench-json-summary to render NVBench JSON output as an NVBench-style
markdown summary table, including axis formatting, device sections, hidden
summary filtering, and summary hint formatting.
Update packaging, type stubs, and tests for the new namespace, renamed
classes, Python 3.10-compatible annotations, and summary-table generation.
* Split tests in test_benchmark_result into smaller tests
* Fix break due to file name change
* Add python/examples/benchmark_result_autotune.py
This example demonstrates using cuda.bench and cuda.bench.results
to implement simple auto-tuning, demonstrated on selecting of
tile shape hyperparameter for naive stencil kernel implemented
in numba-cuda.
* Resolve ruff PLE0604
* Fix for format_axis_value in json format script to handle None value
Add tests to cover such input.
* Address code rabbit review feedback
* Fix license header, add validation
* Addressed both issues raised in review
Malformed values are now represented in result as None.
Skipped benchmarks are no longer dropped, i.e., they are present
in BenchmarkResult data, but they are not reflected in summary
table in line with what NVBench-instrumented benchmarks do.
377 lines
14 KiB
Python
377 lines
14 KiB
Python
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
|
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
import importlib.util
|
|
import json
|
|
from pathlib import Path
|
|
|
|
|
|
def load_nvbench_json_summary():
    """Import the ``nvbench_json_summary`` script as a module object.

    The script lives under ``scripts/`` next to this test directory and is
    not an installed package, so it is loaded directly from its file path
    via :mod:`importlib.util`.
    """
    script_path = (
        Path(__file__).resolve().parents[1] / "scripts" / "nvbench_json_summary.py"
    )
    spec = importlib.util.spec_from_file_location("nvbench_json_summary", script_path)
    assert spec is not None
    assert spec.loader is not None
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded
|
|
|
|
|
|
# Load the script once at import time so all tests share a single module instance.
nvbench_json_summary = load_nvbench_json_summary()
|
|
|
|
|
|
def write_result_json(path):
    """Write a canonical NVBench result JSON fixture to *path*.

    The document contains one device and a single "copy" benchmark with one
    pow2 ``BlockSize`` axis and one measured state.  The state carries a mix
    of always-visible summaries, one with an explicit ``"hide": False``, and
    one hidden-by-default summary ("Min GPU Time").
    """
    _unset = object()  # sentinel: distinguishes "no hide key" from hide=None/False

    def summary(tag, name, hint, value_type, value, hide=_unset):
        # Build one summary entry; key order matches NVBench output
        # (tag, name, hint, [hide], data).
        entry = {"tag": tag, "name": name, "hint": hint}
        if hide is not _unset:
            entry["hide"] = hide
        entry["data"] = [{"name": "value", "type": value_type, "value": value}]
        return entry

    state = {
        "name": "Device=0 BlockSize=2^8",
        "device": 0,
        "type_config_index": 0,
        "axis_values": [{"name": "BlockSize", "type": "int64", "value": "256"}],
        "summaries": [
            summary("nv/cold/time/gpu/sample_size", "Samples", "sample_size", "int64", "12"),
            summary("nv/cold/time/gpu/mean", "GPU Time", "duration", "float64", "1.25e-6"),
            summary("nv/cold/time/gpu/stdev/relative", "Noise", "percentage", "float64", "0.015"),
            summary("nv/cold/bw/global/bytes_per_second", "GlobalMem BW", "byte_rate", "float64", "2.5e9"),
            summary("nv/cold/bw/global/utilization", "BWUtil", "percentage", "float64", "0.625", hide=False),
            summary("nv/cold/time/gpu/min", "Min GPU Time", "duration", "float64", "1.0e-6", hide="Hidden by default."),
        ],
        "is_skipped": False,
    }
    document = {
        "devices": [
            {"id": 0, "name": "Test GPU"},
        ],
        "benchmarks": [
            {
                "name": "copy",
                "devices": [0],
                "axes": [
                    {
                        "name": "BlockSize",
                        "type": "int64",
                        "flags": "pow2",
                        "values": [
                            {"input_string": "8", "description": "2^8 = 256", "value": 256},
                        ],
                    }
                ],
                "states": [state],
            }
        ],
    }
    path.write_text(json.dumps(document), encoding="utf-8")
|
|
|
|
|
|
def test_json_summary_formats_nvbench_style_markdown(tmp_path):
    """End-to-end: a loaded result renders as an NVBench-style markdown report."""
    json_path = tmp_path / "result.json"
    write_result_json(json_path)

    report = nvbench_json_summary.format_result(
        nvbench_json_summary.BenchmarkResult.from_json(json_path)
    )

    expected_fragments = (
        "# Benchmark Results",
        "## copy",
        "### [0] Test GPU",
        "| BlockSize | Samples | GPU Time | Noise | GlobalMem BW | BWUtil |",
        "| 2^8 = 256 | 12x | 1.250 us | 1.50% | 2.500 GB/s | 62.50% |",
    )
    for fragment in expected_fragments:
        assert fragment in report
    # Hidden-by-default summaries must not surface in the table.
    assert "Min GPU Time" not in report
|
|
|
|
|
|
def test_json_summary_formats_null_summary_value_as_blank():
    """A summary whose value is None is rendered as an empty cell."""
    null_noise = nvbench_json_summary.BenchmarkResultSummary(
        tag="nv/cold/time/gpu/stdev/relative",
        name="Noise",
        hint="percentage",
        hide=None,
        description=None,
        data={"value": None},
    )

    rendered = nvbench_json_summary.format_summary(null_noise)
    assert rendered == ""
|
|
|
|
|
|
def test_json_summary_formats_axis_values_like_markdown_printer():
    """format_axis_value mirrors NVBench's markdown printer per axis flavor."""

    def axis(name, axis_type, flags):
        # Minimal axis record as found in the "axes" section of result JSON.
        return {"name": name, "type": axis_type, "flags": flags}

    axes_by_name = {
        "BlockSize": axis("BlockSize", "int64", "pow2"),
        "NumBlocks": axis("NumBlocks", "int64", ""),
        "Duration": axis("Duration", "float64", ""),
        "Nullable": axis("Nullable", "int64", ""),
    }

    # (axis_value record, expected (name, rendered-value) pair)
    cases = [
        (
            {"name": "BlockSize", "type": "int64", "value": "256"},
            ("BlockSize", "2^8 = 256"),  # pow2 flag renders as power-of-two
        ),
        (
            {"name": "NumBlocks", "type": "int64", "value": "64"},
            ("NumBlocks", "64"),  # plain int64 passes through
        ),
        (
            {"name": "Duration", "type": "float64", "value": "0.123456789"},
            ("Duration", "0.12346"),  # float64 rounded to five significant digits
        ),
        (
            {"name": "Nullable", "type": "int64", "value": None},
            ("Nullable", ""),  # null value renders blank
        ),
    ]
    for axis_value, expected in cases:
        assert nvbench_json_summary.format_axis_value(axis_value, axes_by_name) == expected
|
|
|
|
|
|
def test_json_summary_formats_state_with_null_axis_values(tmp_path):
    """A benchmark with null ``axes``/``axis_values`` still renders its summaries."""
    payload = {
        "devices": [
            {"id": 0, "name": "Test GPU"},
        ],
        "benchmarks": [
            {
                "name": "no_axes",
                "devices": [0],
                "axes": None,
                "states": [
                    {
                        "name": "Device=0",
                        "device": 0,
                        "axis_values": None,
                        "summaries": [
                            {
                                "tag": "nv/cold/time/gpu/sample_size",
                                "name": "Samples",
                                "hint": "sample_size",
                                "data": [
                                    {"name": "value", "type": "int64", "value": "7"},
                                ],
                            }
                        ],
                        "is_skipped": False,
                    }
                ],
            }
        ],
    }
    json_path = tmp_path / "result.json"
    json_path.write_text(json.dumps(payload), encoding="utf-8")

    report = nvbench_json_summary.format_result(
        nvbench_json_summary.BenchmarkResult.from_json(json_path)
    )

    assert "## no_axes" in report
    assert "| Samples |" in report
    assert "| 7x |" in report
|
|
|
|
|
|
def test_json_summary_omits_skipped_states(tmp_path):
    """Skipped states are dropped from the summary table; measured states remain."""
    skipped_state = {
        "name": "Device=0 BlockSize=2^8",
        "device": 0,
        "axis_values": [
            {"name": "BlockSize", "type": "int64", "value": "256"},
        ],
        "summaries": None,
        "is_skipped": True,
        "skip_reason": "Deadlock detected",
    }
    measured_state = {
        "name": "Device=0 BlockSize=2^9",
        "device": 0,
        "axis_values": [
            {"name": "BlockSize", "type": "int64", "value": "512"},
        ],
        "summaries": [
            {
                "tag": "nv/cold/time/gpu/sample_size",
                "name": "Samples",
                "hint": "sample_size",
                "data": [
                    {"name": "value", "type": "int64", "value": "3"},
                ],
            }
        ],
        "is_skipped": False,
    }
    payload = {
        "devices": [
            {"id": 0, "name": "Test GPU"},
        ],
        "benchmarks": [
            {
                "name": "copy",
                "devices": [0],
                "axes": [
                    {
                        "name": "BlockSize",
                        "type": "int64",
                        "flags": "pow2",
                        "values": [
                            {"input_string": "8", "description": "2^8 = 256", "value": 256},
                            {"input_string": "9", "description": "2^9 = 512", "value": 512},
                        ],
                    }
                ],
                "states": [skipped_state, measured_state],
            }
        ],
    }
    json_path = tmp_path / "result.json"
    json_path.write_text(json.dumps(payload), encoding="utf-8")

    report = nvbench_json_summary.format_result(
        nvbench_json_summary.BenchmarkResult.from_json(json_path)
    )

    # Neither the skip reason nor the skipped state's axis value may appear.
    for absent in ("Skip Reason", "Deadlock detected", "2^8 = 256"):
        assert absent not in report
    # The measured state is still rendered.
    assert "2^9 = 512" in report
    assert "3x" in report
|
|
|
|
|
|
def test_json_summary_cli_writes_output_file(tmp_path):
    """The CLI entry point exits 0 and writes the markdown report to --output."""
    json_path = tmp_path / "result.json"
    summary_path = tmp_path / "summary.md"
    write_result_json(json_path)

    exit_code = nvbench_json_summary.main(
        [str(json_path), "--output", str(summary_path)]
    )

    assert exit_code == 0
    written = summary_path.read_text(encoding="utf-8")
    assert "GlobalMem BW" in written
|