mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-05-14 02:02:16 +00:00
Provide BenchmarkResult class for parsing JSON output of NVBench-instrumented benchmarks (#356)
Implements `cuda.bench.results.BenchmarkResult` class to represent data from JSON output of benchmark execution.
The class implements two class methods: `BenchmarkResult.from_json(filename : str | os.PathLike, *, metadata : Any = None)`, which expects a well-formed JSON filename, and `BenchmarkResult.empty(*, metadata : Any = None)`, intended to represent a failed result with reasons that can be recorded in metadata at the user's discretion.
The `BenchmarkResult` implements mapping interface, supporting `.keys()`, `.values()`, `.items()` methods, `__len__`, `__contains__`, `__getitem__` and `__iter__` special methods.
Values in `BenchmarkResult` have type `cuda.bench.results.SubBenchmarkResult`, which implements a list-like interface, i.e. implements `__len__`, `__getitem__`, and `__iter__` special methods. Values in this list-like structure correspond to measurements of individual states of a particular benchmark (the key in `BenchmarkResult`).
Elements of `SubBenchmarkResult` structure have type `SubBenchmarkState` that supports mapping protocol with axis_values as a key and represent data corresponding to measurements for a particular state (combination of settings for each axis).
The state provides `.samples` and `.frequencies` attributes storing raw execution duration values and estimates for average GPU frequencies.
Example usage:
```
import array, numpy as np, cuda.bench.results
r = cuda.bench.results.BenchmarkResult.from_json("perf_data/axes_run1.json")
r["copy_sweep_grid_shape"].centers_with_frequencies(
lambda t, f: np.median(np.asarray(t)*np.asarray(f)))
```
```
In [1]: import array, numpy as np, cuda.bench.results
In [2]: r = cuda.bench.results.BenchmarkResult.from_json("temp_data/axes_run1.json")
In [3]: list(r)
Out[3]:
['simple',
'single_float64_axis',
'copy_sweep_grid_shape',
'copy_type_sweep',
'copy_type_conversion_sweep',
'copy_type_and_block_size_sweep']
In [4]: r["simple"].centers(lambda t: np.percentile(t, [25,75]))
Out[4]: {'Device=0': array([0.00100966, 0.00101299])}
In [5]: r.centers(lambda t: np.percentile(t, [25,75]))["simple"]
Out[5]: {'Device=0': array([0.00100966, 0.00101299])}
In [6]: len(r)
Out[6]: 6
In [7]: "fake" in r
Out[7]: False
```
Each `SubBenchmarkState` implements
`.summaries` attribute - rich object that retains tag/name/hint/hide/description metadata.
* Add nvbench-json-summary to render NVBench JSON output as an NVBench-style
markdown summary table, including axis formatting, device sections, hidden
summary filtering, and summary hint formatting.
Update packaging, type stubs, and tests for the new namespace, renamed
classes, Python 3.10-compatible annotations, and summary-table generation.
* Split tests in test_benchmark_result into smaller tests
* Fix break due to file name change
* Add python/examples/benchmark_result_autotune.py
This example demonstrates using cuda.bench and cuda.bench.results
to implement simple auto-tuning, demonstrated by selecting the
tile shape hyperparameter for naive stencil kernel implemented
in numba-cuda.
* Resolve ruff PLE0604
* Fix for format_axis_value in json format script to handle None value
Add tests to cover such input.
* Address code rabbit review feedback
* Fix license header, add validation
* Addressed both issues raised in review
Malformed values are now represented in result as None.
Skipped benchmarks are no longer dropped, i.e., they are present
in BenchmarkResult data, but they are not reflected in summary
table in line with what NVBench-instrumented benchmarks do.
This commit is contained in:
750
python/test/test_benchmark_result.py
Normal file
750
python/test/test_benchmark_result.py
Normal file
@@ -0,0 +1,750 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
import json
|
||||
import struct
|
||||
from dataclasses import dataclass
|
||||
|
||||
import cuda.bench
|
||||
import cuda.bench.results as results
|
||||
import pytest
|
||||
|
||||
|
||||
def write_json(path, data):
    """Serialize *data* as JSON and write it to *path* as UTF-8 text."""
    payload = json.dumps(data)
    path.write_text(payload, encoding="utf-8")
|
||||
|
||||
|
||||
def block_size_axis(*values):
    """Build a pow2 "BlockSize" int64 axis entry for the given exponents.

    Each exponent produces one axis value whose ``value`` is ``2**exponent``
    and whose description reads like ``2^8 = 256``.
    """
    entries = []
    for exponent in values:
        resolved = 2**exponent
        entries.append(
            {
                "input_string": str(exponent),
                "description": f"2^{exponent} = {resolved}",
                "value": resolved,
            }
        )
    return {
        "name": "BlockSize",
        "type": "int64",
        "flags": "pow2",
        "values": entries,
    }
|
||||
|
||||
|
||||
def sample_file_summary(tag, filename, size):
    """Build a summary entry that records a binary sample file.

    Mirrors NVBench's JSON layout: ``size`` is stored as a *string*,
    matching how NVBench serializes int64 summary fields.
    """
    filename_field = {"name": "filename", "type": "string", "value": filename}
    size_field = {"name": "size", "type": "int64", "value": str(size)}
    return {"tag": tag, "data": [filename_field, size_field]}
|
||||
|
||||
|
||||
def sample_times_summary(filename, size):
    """File summary carrying the cold-run sample-times tag."""
    tag = "nv/json/bin:nv/cold/sample_times"
    return sample_file_summary(tag, filename, size)
|
||||
|
||||
|
||||
def sample_frequencies_summary(filename, size):
    """File summary carrying the cold-run sample-frequencies tag."""
    tag = "nv/json/freqs-bin:nv/cold/sample_freqs"
    return sample_file_summary(tag, filename, size)
|
||||
|
||||
|
||||
def bwutil_summary(value):
    """Build a global-memory bandwidth-utilization summary.

    The numeric ``value`` is serialized as a string, matching NVBench's
    float64 summary encoding.
    """
    payload = {"name": "value", "type": "float64", "value": str(value)}
    return {
        "tag": "nv/cold/bw/global/utilization",
        "name": "BWUtil",
        "hint": "percentage",
        "description": "Global memory utilization",
        "data": [payload],
    }
|
||||
|
||||
|
||||
@pytest.fixture
def sample_result_path(tmp_path):
    """Write a minimal NVBench JSON result and return its path.

    Creates the two sidecar directories NVBench uses for raw data
    (``<json>-bin`` for sample times, ``<json>-freqs-bin`` for frequencies),
    each holding three little-endian float32 values, plus a JSON file with a
    single "copy" benchmark whose one state references those binaries.
    """
    bin_dir = tmp_path / "result.json-bin"
    bin_dir.mkdir()
    # Raw sample times: mean is 7/3, used by the centers() tests below.
    (bin_dir / "0.bin").write_bytes(struct.pack("<3f", 1.0, 2.0, 4.0))
    freq_bin_dir = tmp_path / "result.json-freqs-bin"
    freq_bin_dir.mkdir()
    # Matching per-sample GPU frequencies (100x the sample values).
    (freq_bin_dir / "0.bin").write_bytes(struct.pack("<3f", 100.0, 200.0, 400.0))

    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [block_size_axis(8)],
                    "states": [
                        {
                            "name": "Device=0 BlockSize=2^8",
                            "axis_values": [
                                {
                                    "name": "BlockSize",
                                    "type": "int64",
                                    "value": "256",
                                }
                            ],
                            "summaries": [
                                sample_times_summary("result.json-bin/0.bin", 3),
                                bwutil_summary(0.75),
                                sample_frequencies_summary(
                                    "result.json-freqs-bin/0.bin",
                                    3,
                                ),
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )
    return json_fn
|
||||
|
||||
|
||||
@pytest.fixture
def sample_result(sample_result_path):
    """BenchmarkResult parsed from the sample JSON fixture file."""
    return results.BenchmarkResult.from_json(sample_result_path)


@pytest.fixture
def sample_subbenchmark(sample_result):
    """The single "copy" sub-benchmark of the sample result."""
    return sample_result["copy"]


@pytest.fixture
def sample_state(sample_subbenchmark):
    """The single measured state of the "copy" sub-benchmark."""
    return sample_subbenchmark[0]
|
||||
|
||||
|
||||
def test_result_classes_are_exposed_from_results_namespace():
    """Result classes live in cuda.bench.results, not the cuda.bench root."""
    assert results.BenchmarkResult.__module__ == results.__name__
    assert results.BenchmarkResultSummary.__module__ == results.__name__
    assert not hasattr(cuda.bench, "BenchmarkResult")


def test_from_json_preserves_optional_metadata(sample_result_path):
    """from_json stores the metadata object by identity; default is None."""
    metadata = {"returncode": 0, "elapsed_seconds": 0.25}

    default_result = results.BenchmarkResult.from_json(sample_result_path)
    result = results.BenchmarkResult.from_json(sample_result_path, metadata=metadata)

    assert default_result.metadata is None
    assert result.metadata is metadata


def test_benchmark_result_implements_mapping_protocol(sample_result):
    """BenchmarkResult behaves like a read-only mapping keyed by benchmark name."""
    subbenchmark = sample_result["copy"]

    assert len(sample_result) == 1
    assert list(sample_result) == ["copy"]
    assert list(sample_result.keys()) == ["copy"]
    assert list(sample_result.values()) == [subbenchmark]
    assert list(sample_result.items()) == [("copy", subbenchmark)]
    assert "copy" in sample_result
    assert "missing" not in sample_result
    # __getitem__ returns the same object exposed via the .subbenches dict.
    assert subbenchmark is sample_result.subbenches["copy"]
    with pytest.raises(KeyError):
        sample_result["missing"]


def test_subbenchmark_result_implements_sequence_protocol(sample_subbenchmark):
    """SubBenchmarkResult supports len/indexing/slicing/iteration over states."""
    state = sample_subbenchmark[0]

    assert len(sample_subbenchmark) == 1
    assert sample_subbenchmark[-1] is state
    assert sample_subbenchmark[:] == sample_subbenchmark.states
    assert list(sample_subbenchmark) == sample_subbenchmark.states
    with pytest.raises(IndexError):
        sample_subbenchmark[1]


def test_state_parses_axis_name_and_bandwidth(sample_state):
    """State name reflects axis flags; bw comes from the BWUtil summary."""
    assert sample_state.name() == "BlockSize[pow2]=8"
    assert sample_state.bw == 0.75


def test_state_stores_rich_summary_metadata(sample_state):
    """Summaries retain tag/name/hint/hide/description and expose typed data."""
    bw_summary = sample_state.summaries["nv/cold/bw/global/utilization"]

    assert bw_summary.tag == "nv/cold/bw/global/utilization"
    assert bw_summary.name == "BWUtil"
    assert bw_summary.hint == "percentage"
    assert bw_summary.hide is None
    assert bw_summary.description == "Global memory utilization"
    assert bw_summary.value == pytest.approx(0.75)
    assert bw_summary["value"] == pytest.approx(0.75)
    # Binary-file summaries parse size back to int and keep the file path.
    assert sample_state.summaries["nv/json/bin:nv/cold/sample_times"].data == {
        "filename": "result.json-bin/0.bin",
        "size": 3,
    }
    assert sample_state.summaries["nv/json/freqs-bin:nv/cold/sample_freqs"].data == {
        "filename": "result.json-freqs-bin/0.bin",
        "size": 3,
    }
|
||||
|
||||
|
||||
def test_state_preserves_null_summary_values(tmp_path):
    """A JSON null summary value must survive parsing as Python None."""
    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [],
                    "states": [
                        {
                            "name": "Device=0",
                            "axis_values": [],
                            "summaries": [
                                {
                                    "tag": "nv/cold/time/gpu/stdev/relative",
                                    "name": "Noise",
                                    "hint": "percentage",
                                    "data": [
                                        {
                                            "name": "value",
                                            "type": "float64",
                                            # null in JSON -> None after parsing
                                            "value": None,
                                        }
                                    ],
                                }
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )

    summary = results.BenchmarkResult.from_json(json_fn)["copy"][0].summaries[
        "nv/cold/time/gpu/stdev/relative"
    ]

    assert summary.value is None
    assert summary["value"] is None


def test_state_reports_malformed_numeric_summary_values(tmp_path):
    """A non-numeric float64 summary value raises a descriptive ValueError."""
    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [],
                    "states": [
                        {
                            "name": "Device=0",
                            "axis_values": [],
                            "summaries": [
                                {
                                    "tag": "nv/cold/time/gpu/mean",
                                    "name": "GPU Time",
                                    "hint": "duration",
                                    "data": [
                                        {
                                            "name": "value",
                                            "type": "float64",
                                            # Declared float64 but not parseable.
                                            "value": "not-a-number",
                                        }
                                    ],
                                }
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )

    # Error message identifies the summary tag, field, and offending value.
    with pytest.raises(
        ValueError,
        match=(
            "summary 'nv/cold/time/gpu/mean' field 'value' "
            "value 'not-a-number' is not a float64"
        ),
    ):
        results.BenchmarkResult.from_json(json_fn)
|
||||
|
||||
|
||||
def test_state_loads_samples_and_frequencies(sample_state):
    """Raw samples and frequencies are decoded from the sidecar .bin files."""
    assert sample_state.samples is not None
    assert list(sample_state.samples) == pytest.approx([1.0, 2.0, 4.0])
    assert sample_state.frequencies is not None
    assert list(sample_state.frequencies) == pytest.approx([100.0, 200.0, 400.0])


def test_centers_apply_estimators_to_samples(sample_result):
    """centers() maps benchmark -> state-name -> estimator(samples)."""
    centers = sample_result.centers(lambda samples: sum(samples) / len(samples))

    # Mean of the fixture samples [1.0, 2.0, 4.0].
    assert centers == {"copy": {"BlockSize[pow2]=8": pytest.approx(7.0 / 3.0)}}


def test_centers_with_frequencies_apply_estimators(sample_result, sample_subbenchmark):
    """centers_with_frequencies() feeds samples and frequencies to the estimator."""

    def weighted_mean(samples, frequencies):
        return sum(
            sample * frequency for sample, frequency in zip(samples, frequencies)
        ) / sum(frequencies)

    weighted_centers = sample_result.centers_with_frequencies(weighted_mean)

    assert weighted_centers == {"copy": {"BlockSize[pow2]=8": pytest.approx(3.0)}}
    # The sub-benchmark-level method yields the inner mapping directly.
    assert (
        sample_subbenchmark.centers_with_frequencies(weighted_mean)
        == weighted_centers["copy"]
    )


def test_benchmark_result_constructor_is_private():
    """Direct construction is rejected; users must go through the factories."""
    with pytest.raises(TypeError, match="from_json\\(\\).*empty\\(\\)"):
        results.BenchmarkResult()
    with pytest.raises(TypeError, match="from_json\\(\\).*empty\\(\\)"):
        results.BenchmarkResult("result.json")
    with pytest.raises(TypeError):
        results.BenchmarkResult(metadata=None)
    with pytest.raises(TypeError):
        results.BenchmarkResult(json_path="result.json", parse=False)
|
||||
|
||||
|
||||
def test_benchmark_result_empty_does_not_read_json(tmp_path):
    """empty() builds a metadata-only result; from_json still requires the file."""

    @dataclass
    class RunMetadata:
        returncode: int
        elapsed_seconds: float

    metadata = RunMetadata(returncode=1, elapsed_seconds=0.25)
    missing_json = tmp_path / "missing.json"

    # empty() never touches the filesystem.
    result = results.BenchmarkResult.empty(metadata=metadata)

    assert result.metadata is metadata
    assert result.subbenches == {}

    # from_json raises for a missing file, positionally or by keyword.
    with pytest.raises(FileNotFoundError):
        results.BenchmarkResult.from_json(missing_json, metadata=metadata)
    with pytest.raises(FileNotFoundError):
        results.BenchmarkResult.from_json(json_path=missing_json, metadata=metadata)
|
||||
|
||||
|
||||
def test_benchmark_result_accepts_no_axis_benchmark_with_recorded_binary_path(
    tmp_path, monkeypatch
):
    """Null axes/axis_values parse fine, and binary paths resolve relative to CWD."""
    data_dir = tmp_path / "temp_data"
    data_dir.mkdir()
    bin_dir = data_dir / "axes_run1.json-bin"
    bin_dir.mkdir()
    (bin_dir / "0.bin").write_bytes(struct.pack("<2f", 1.0, 4.0))
    freq_bin_dir = data_dir / "axes_run1.json-freqs-bin"
    freq_bin_dir.mkdir()
    (freq_bin_dir / "0.bin").write_bytes(struct.pack("<2f", 100.0, 400.0))

    json_fn = data_dir / "axes_run1.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "simple",
                    # NVBench emits null (not []) for benchmarks without axes.
                    "axes": None,
                    "states": [
                        {
                            "name": "Device=0",
                            "axis_values": None,
                            "summaries": [
                                # Paths recorded relative to the invocation dir.
                                sample_times_summary(
                                    "temp_data/axes_run1.json-bin/0.bin",
                                    2,
                                ),
                                sample_frequencies_summary(
                                    "temp_data/axes_run1.json-freqs-bin/0.bin",
                                    2,
                                ),
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )

    # Parse from the directory the recorded relative paths assume.
    monkeypatch.chdir(tmp_path)

    result = results.BenchmarkResult.from_json("temp_data/axes_run1.json")

    state = result.subbenches["simple"].states[0]
    assert state.name() == "Device=0"
    assert state.point == {}
    assert state.samples is not None
    assert list(state.samples) == pytest.approx([1.0, 4.0])
    assert state.frequencies is not None
    assert list(state.frequencies) == pytest.approx([100.0, 400.0])
|
||||
|
||||
|
||||
def test_benchmark_result_accepts_axis_value_input_string():
    """A float64 axis value matched via its string form populates the point."""
    result = results.SubBenchmarkResult(
        {
            "name": "single_float64_axis",
            "axes": [
                {
                    "name": "Duration",
                    "type": "float64",
                    "flags": "",
                    "values": [
                        {
                            "input_string": "0",
                            "description": "",
                            "value": 0.0,
                        }
                    ],
                }
            ],
            "states": [
                {
                    "name": "Device=0 Duration=0",
                    "axis_values": [
                        {
                            "name": "Duration",
                            "type": "float64",
                            # Value serialized as the axis input string "0".
                            "value": "0",
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                }
            ],
        },
        "",
    )

    state = result.states[0]
    assert state.name() == "Duration=0"
    assert state.point == {"Duration": "0"}
|
||||
|
||||
|
||||
def test_benchmark_result_normalizes_axis_value_lookup_key():
    """Axis values given as int, null, or input_string all normalize to strings."""
    result = results.SubBenchmarkResult(
        {
            "name": "num_blocks",
            "axes": [
                {
                    "name": "NumBlocks",
                    "type": "int64",
                    "flags": "",
                    "values": [
                        {
                            "input_string": "64",
                            "description": "",
                            "value": 64,
                        },
                        {
                            # A null value maps back to its input string.
                            "input_string": "default",
                            "description": "",
                            "value": None,
                        },
                    ],
                }
            ],
            "states": [
                {
                    "name": "Device=0 NumBlocks=64",
                    "axis_values": [
                        {
                            "name": "NumBlocks",
                            "type": "int64",
                            # Numeric (non-string) JSON value.
                            "value": 64,
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                },
                {
                    "name": "Device=0 NumBlocks=default",
                    "axis_values": [
                        {
                            "name": "NumBlocks",
                            "type": "int64",
                            "value": None,
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                },
                {
                    "name": "Device=0 NumBlocks=64",
                    "axis_values": [
                        {
                            "name": "NumBlocks",
                            "type": "int64",
                            # No "value" key at all; only input_string.
                            "input_string": "64",
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                },
            ],
        },
        "",
    )

    assert result.states[0].point == {"NumBlocks": "64"}
    assert result.states[1].point == {"NumBlocks": "default"}
    assert result.states[2].point == {"NumBlocks": "64"}
|
||||
|
||||
|
||||
def test_benchmark_result_preserves_skipped_state_with_no_summaries():
    """Skipped states stay in the result with empty summaries and no raw data."""
    result = results.SubBenchmarkResult(
        {
            "name": "copy_sweep_grid_shape",
            "axes": [block_size_axis(6, 8)],
            "states": [
                {
                    "name": "Device=0 BlockSize=2^8",
                    "axis_values": [
                        {
                            "name": "BlockSize",
                            "type": "int64",
                            "value": "256",
                        }
                    ],
                    # Skipped states carry null summaries in NVBench output.
                    "summaries": None,
                    "is_skipped": True,
                },
                {
                    "name": "Device=0 BlockSize=2^6",
                    "axis_values": [
                        {
                            "name": "BlockSize",
                            "type": "int64",
                            "value": "64",
                        }
                    ],
                    "summaries": [],
                    "is_skipped": False,
                },
            ],
        },
        "",
    )

    # The skipped state is retained, not dropped.
    assert len(result.states) == 2
    assert result.states[0].name() == "BlockSize[pow2]=8"
    assert result.states[0].is_skipped is True
    assert result.states[0].summaries == {}
    assert result.states[0].samples is None
    assert result.states[0].frequencies is None
    assert result.states[1].name() == "BlockSize[pow2]=6"
    assert result.states[1].is_skipped is False


def test_benchmark_result_uses_empty_summaries_when_field_is_missing():
    """A state with no "summaries" key parses with empty/None derived fields."""
    result = results.SubBenchmarkResult(
        {
            "name": "copy_sweep_grid_shape",
            "axes": [block_size_axis(8)],
            "states": [
                {
                    "name": "Device=0 BlockSize=2^8",
                    "axis_values": [
                        {
                            "name": "BlockSize",
                            "type": "int64",
                            "value": "256",
                        }
                    ],
                    # Note: no "summaries" key at all.
                    "is_skipped": False,
                },
            ],
        },
        "",
    )

    state = result.states[0]
    assert state.name() == "BlockSize[pow2]=8"
    assert state.summaries == {}
    assert state.samples is None
    assert state.frequencies is None
    assert state.bw is None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "field_name,bad_type,expected_type",
    [
        ("filename", "int64", "string"),
        ("size", "string", "int64"),
    ],
)
def test_benchmark_result_validates_binary_summary_field_types(
    field_name, bad_type, expected_type
):
    """A binary-file summary field with the wrong declared type is rejected."""
    summary = sample_times_summary("result.json-bin/0.bin", 3)
    # Corrupt the declared type of the targeted field.
    for value_data in summary["data"]:
        if value_data["name"] == field_name:
            value_data["type"] = bad_type
            if field_name == "filename":
                # Make the payload plausible for the bogus int64 type.
                value_data["value"] = "123"

    with pytest.raises(
        ValueError,
        match=rf"field '{field_name}' has type '{bad_type}'; expected '{expected_type}'",
    ):
        results.SubBenchmarkResult(
            {
                "name": "copy",
                "axes": [],
                "states": [
                    {
                        "name": "Device=0",
                        "axis_values": [],
                        "summaries": [summary],
                        "is_skipped": False,
                    }
                ],
            },
            "",
        )
|
||||
|
||||
|
||||
def test_benchmark_result_uses_none_for_unavailable_samples(tmp_path):
    """Missing .bin files yield None samples, and estimators are never invoked."""
    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [block_size_axis(8, 9)],
                    "states": [
                        {
                            # State with no recorded samples at all.
                            "name": "Device=0 BlockSize=2^8",
                            "axis_values": [
                                {
                                    "name": "BlockSize",
                                    "type": "int64",
                                    "value": "256",
                                }
                            ],
                            "summaries": [],
                            "is_skipped": False,
                        },
                        {
                            # State whose recorded binary files do not exist.
                            "name": "Device=0 BlockSize=2^9",
                            "axis_values": [
                                {
                                    "name": "BlockSize",
                                    "type": "int64",
                                    "value": "512",
                                }
                            ],
                            "summaries": [
                                sample_times_summary(
                                    "result.json-bin/missing.bin",
                                    3,
                                ),
                                sample_frequencies_summary(
                                    "result.json-freqs-bin/missing.bin",
                                    3,
                                ),
                            ],
                            "is_skipped": False,
                        },
                    ],
                }
            ]
        },
    )

    result = results.BenchmarkResult.from_json(json_fn)

    states = result.subbenches["copy"].states
    assert states[0].samples is None
    assert states[1].samples is None
    assert states[0].frequencies is None
    assert states[1].frequencies is None
    # Estimators must be skipped (not called with None) for unavailable data.
    assert result.centers(lambda samples: pytest.fail("estimator should not run")) == {
        "copy": {
            "BlockSize[pow2]=8": None,
            "BlockSize[pow2]=9": None,
        }
    }
    assert result.centers_with_frequencies(
        lambda samples, frequencies: pytest.fail("estimator should not run")
    ) == {
        "copy": {
            "BlockSize[pow2]=8": None,
            "BlockSize[pow2]=9": None,
        }
    }
|
||||
|
||||
|
||||
def test_benchmark_result_rejects_mismatched_sample_and_frequency_counts(tmp_path):
    """Parsing fails when sample and frequency binaries hold different counts."""
    bin_dir = tmp_path / "result.json-bin"
    bin_dir.mkdir()
    # Three samples ...
    (bin_dir / "0.bin").write_bytes(struct.pack("<3f", 1.0, 2.0, 4.0))
    freq_bin_dir = tmp_path / "result.json-freqs-bin"
    freq_bin_dir.mkdir()
    # ... but only two frequencies.
    (freq_bin_dir / "0.bin").write_bytes(struct.pack("<2f", 100.0, 200.0))

    json_fn = tmp_path / "result.json"
    write_json(
        json_fn,
        {
            "benchmarks": [
                {
                    "name": "copy",
                    "axes": [block_size_axis(8)],
                    "states": [
                        {
                            "name": "Device=0 BlockSize=2^8",
                            "axis_values": [
                                {
                                    "name": "BlockSize",
                                    "type": "int64",
                                    "value": "256",
                                }
                            ],
                            "summaries": [
                                sample_times_summary("result.json-bin/0.bin", 3),
                                sample_frequencies_summary(
                                    "result.json-freqs-bin/0.bin",
                                    2,
                                ),
                            ],
                            "is_skipped": False,
                        }
                    ],
                }
            ]
        },
    )

    with pytest.raises(ValueError, match="sample count .* frequency count"):
        results.BenchmarkResult.from_json(json_fn)
|
||||
376
python/test/test_nvbench_json_summary.py
Normal file
376
python/test/test_nvbench_json_summary.py
Normal file
@@ -0,0 +1,376 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_nvbench_json_summary():
    """Import python/scripts/nvbench_json_summary.py as a standalone module.

    The script lives outside the installed package, so it is loaded directly
    from its file path via importlib.
    """
    module_path = (
        Path(__file__).resolve().parents[1] / "scripts" / "nvbench_json_summary.py"
    )
    spec = importlib.util.spec_from_file_location("nvbench_json_summary", module_path)
    assert spec is not None
    assert spec.loader is not None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# Loaded once at import time; every test below uses this module object.
nvbench_json_summary = load_nvbench_json_summary()
|
||||
|
||||
|
||||
def write_result_json(path):
    """Write a representative NVBench JSON result with one device and benchmark.

    The single state carries visible summaries (Samples, GPU Time, Noise,
    GlobalMem BW, BWUtil) plus one summary hidden via its "hide" field, so the
    formatter's hidden-summary filtering can be exercised.
    """
    path.write_text(
        json.dumps(
            {
                "devices": [
                    {
                        "id": 0,
                        "name": "Test GPU",
                    }
                ],
                "benchmarks": [
                    {
                        "name": "copy",
                        "devices": [0],
                        "axes": [
                            {
                                "name": "BlockSize",
                                "type": "int64",
                                "flags": "pow2",
                                "values": [
                                    {
                                        "input_string": "8",
                                        "description": "2^8 = 256",
                                        "value": 256,
                                    }
                                ],
                            }
                        ],
                        "states": [
                            {
                                "name": "Device=0 BlockSize=2^8",
                                "device": 0,
                                "type_config_index": 0,
                                "axis_values": [
                                    {
                                        "name": "BlockSize",
                                        "type": "int64",
                                        "value": "256",
                                    }
                                ],
                                "summaries": [
                                    {
                                        "tag": "nv/cold/time/gpu/sample_size",
                                        "name": "Samples",
                                        "hint": "sample_size",
                                        "data": [
                                            {
                                                "name": "value",
                                                "type": "int64",
                                                "value": "12",
                                            }
                                        ],
                                    },
                                    {
                                        "tag": "nv/cold/time/gpu/mean",
                                        "name": "GPU Time",
                                        "hint": "duration",
                                        "data": [
                                            {
                                                "name": "value",
                                                "type": "float64",
                                                "value": "1.25e-6",
                                            }
                                        ],
                                    },
                                    {
                                        "tag": "nv/cold/time/gpu/stdev/relative",
                                        "name": "Noise",
                                        "hint": "percentage",
                                        "data": [
                                            {
                                                "name": "value",
                                                "type": "float64",
                                                "value": "0.015",
                                            }
                                        ],
                                    },
                                    {
                                        "tag": "nv/cold/bw/global/bytes_per_second",
                                        "name": "GlobalMem BW",
                                        "hint": "byte_rate",
                                        "data": [
                                            {
                                                "name": "value",
                                                "type": "float64",
                                                "value": "2.5e9",
                                            }
                                        ],
                                    },
                                    {
                                        "tag": "nv/cold/bw/global/utilization",
                                        "name": "BWUtil",
                                        "hint": "percentage",
                                        # Explicit hide=False must still render.
                                        "hide": False,
                                        "data": [
                                            {
                                                "name": "value",
                                                "type": "float64",
                                                "value": "0.625",
                                            }
                                        ],
                                    },
                                    {
                                        "tag": "nv/cold/time/gpu/min",
                                        "name": "Min GPU Time",
                                        "hint": "duration",
                                        # Truthy hide string excludes this column.
                                        "hide": "Hidden by default.",
                                        "data": [
                                            {
                                                "name": "value",
                                                "type": "float64",
                                                "value": "1.0e-6",
                                            }
                                        ],
                                    },
                                ],
                                "is_skipped": False,
                            }
                        ],
                    }
                ],
            }
        ),
        encoding="utf-8",
    )
|
||||
|
||||
|
||||
def test_json_summary_formats_nvbench_style_markdown(tmp_path):
    """The rendered report matches NVBench's markdown layout and hides summaries."""
    json_path = tmp_path / "result.json"
    write_result_json(json_path)

    result = nvbench_json_summary.BenchmarkResult.from_json(json_path)
    report = nvbench_json_summary.format_result(result)

    assert "# Benchmark Results" in report
    assert "## copy" in report
    assert "### [0] Test GPU" in report
    assert (
        "| BlockSize | Samples | GPU Time | Noise | GlobalMem BW | BWUtil |" in report
    )
    # Values formatted per their hints: sample_size, duration, percentage, byte_rate.
    assert (
        "| 2^8 = 256 | 12x | 1.250 us | 1.50% | 2.500 GB/s | 62.50% |" in report
    )
    # The summary marked hidden must not appear.
    assert "Min GPU Time" not in report
|
||||
|
||||
|
||||
def test_json_summary_formats_null_summary_value_as_blank():
    """A None summary value renders as an empty cell, not an error."""
    summary = nvbench_json_summary.BenchmarkResultSummary(
        tag="nv/cold/time/gpu/stdev/relative",
        name="Noise",
        hint="percentage",
        hide=None,
        description=None,
        data={"value": None},
    )

    assert nvbench_json_summary.format_summary(summary) == ""
|
||||
|
||||
|
||||
def test_json_summary_formats_axis_values_like_markdown_printer():
    """Axis values format per axis type/flags; None renders as an empty string."""
    axes_by_name = {
        "BlockSize": {
            "name": "BlockSize",
            "type": "int64",
            "flags": "pow2",
        },
        "NumBlocks": {
            "name": "NumBlocks",
            "type": "int64",
            "flags": "",
        },
        "Duration": {
            "name": "Duration",
            "type": "float64",
            "flags": "",
        },
        "Nullable": {
            "name": "Nullable",
            "type": "int64",
            "flags": "",
        },
    }

    # pow2 int64 axes show the exponent expansion.
    assert nvbench_json_summary.format_axis_value(
        {"name": "BlockSize", "type": "int64", "value": "256"}, axes_by_name
    ) == ("BlockSize", "2^8 = 256")
    assert nvbench_json_summary.format_axis_value(
        {"name": "NumBlocks", "type": "int64", "value": "64"}, axes_by_name
    ) == ("NumBlocks", "64")
    # float64 axes round to five significant decimals.
    assert nvbench_json_summary.format_axis_value(
        {"name": "Duration", "type": "float64", "value": "0.123456789"},
        axes_by_name,
    ) == ("Duration", "0.12346")
    # A null value formats as blank.
    assert nvbench_json_summary.format_axis_value(
        {"name": "Nullable", "type": "int64", "value": None}, axes_by_name
    ) == ("Nullable", "")
|
||||
|
||||
|
||||
def test_json_summary_formats_state_with_null_axis_values(tmp_path):
    """A benchmark with null axes/axis_values still renders a summary table."""
    json_path = tmp_path / "result.json"
    json_path.write_text(
        json.dumps(
            {
                "devices": [
                    {
                        "id": 0,
                        "name": "Test GPU",
                    }
                ],
                "benchmarks": [
                    {
                        "name": "no_axes",
                        "devices": [0],
                        "axes": None,
                        "states": [
                            {
                                "name": "Device=0",
                                "device": 0,
                                "axis_values": None,
                                "summaries": [
                                    {
                                        "tag": "nv/cold/time/gpu/sample_size",
                                        "name": "Samples",
                                        "hint": "sample_size",
                                        "data": [
                                            {
                                                "name": "value",
                                                "type": "int64",
                                                "value": "7",
                                            }
                                        ],
                                    }
                                ],
                                "is_skipped": False,
                            }
                        ],
                    }
                ],
            }
        ),
        encoding="utf-8",
    )

    result = nvbench_json_summary.BenchmarkResult.from_json(json_path)
    report = nvbench_json_summary.format_result(result)

    assert "## no_axes" in report
    assert "| Samples |" in report
    assert "| 7x |" in report
|
||||
|
||||
|
||||
def test_json_summary_omits_skipped_states(tmp_path):
    """Skipped states (and their skip reasons) are excluded from the table."""
    json_path = tmp_path / "result.json"
    json_path.write_text(
        json.dumps(
            {
                "devices": [
                    {
                        "id": 0,
                        "name": "Test GPU",
                    }
                ],
                "benchmarks": [
                    {
                        "name": "copy",
                        "devices": [0],
                        "axes": [
                            {
                                "name": "BlockSize",
                                "type": "int64",
                                "flags": "pow2",
                                "values": [
                                    {
                                        "input_string": "8",
                                        "description": "2^8 = 256",
                                        "value": 256,
                                    },
                                    {
                                        "input_string": "9",
                                        "description": "2^9 = 512",
                                        "value": 512,
                                    },
                                ],
                            }
                        ],
                        "states": [
                            {
                                # Skipped state: must leave no trace in the report.
                                "name": "Device=0 BlockSize=2^8",
                                "device": 0,
                                "axis_values": [
                                    {
                                        "name": "BlockSize",
                                        "type": "int64",
                                        "value": "256",
                                    }
                                ],
                                "summaries": None,
                                "is_skipped": True,
                                "skip_reason": "Deadlock detected",
                            },
                            {
                                "name": "Device=0 BlockSize=2^9",
                                "device": 0,
                                "axis_values": [
                                    {
                                        "name": "BlockSize",
                                        "type": "int64",
                                        "value": "512",
                                    }
                                ],
                                "summaries": [
                                    {
                                        "tag": "nv/cold/time/gpu/sample_size",
                                        "name": "Samples",
                                        "hint": "sample_size",
                                        "data": [
                                            {
                                                "name": "value",
                                                "type": "int64",
                                                "value": "3",
                                            }
                                        ],
                                    }
                                ],
                                "is_skipped": False,
                            },
                        ],
                    }
                ],
            }
        ),
        encoding="utf-8",
    )

    result = nvbench_json_summary.BenchmarkResult.from_json(json_path)
    report = nvbench_json_summary.format_result(result)

    assert "Skip Reason" not in report
    assert "Deadlock detected" not in report
    assert "2^8 = 256" not in report
    assert "2^9 = 512" in report
    assert "3x" in report
|
||||
|
||||
|
||||
def test_json_summary_cli_writes_output_file(tmp_path):
    """main() with --output returns 0 and writes the rendered report to the file."""
    json_path = tmp_path / "result.json"
    output_path = tmp_path / "summary.md"
    write_result_json(json_path)

    rc = nvbench_json_summary.main([str(json_path), "--output", str(output_path)])

    assert rc == 0
    assert "GlobalMem BW" in output_path.read_text(encoding="utf-8")
|
||||
Reference in New Issue
Block a user