diff --git a/ci/test_cuda_bench_inner.sh b/ci/test_cuda_bench_inner.sh index fca2147..ca6ab47 100755 --- a/ci/test_cuda_bench_inner.sh +++ b/ci/test_cuda_bench_inner.sh @@ -47,4 +47,4 @@ python -m pip install "${CUDA_BENCH_WHEEL_PATH}[${TEST_EXTRA}]" # Run tests cd "/workspace/python/test/" -python -m pytest -v test_cuda_bench.py +python -m pytest -v . diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index 4d2f496..3bb0284 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -29,6 +29,26 @@ except Exception as e: ) +_NVBENCH_EXPORTS = ( + "Benchmark", + "CudaStream", + "Launch", + "NVBenchRuntimeError", + "State", + "register", + "run_all_benchmarks", +) + +_NVBENCH_TEST_EXPORTS = ( + "_test_cpp_exception", + "_test_py_exception", +) + +__all__ = list(_NVBENCH_EXPORTS) + +_nvbench_module = None + + # Detect CUDA runtime version and load appropriate extension def _get_cuda_major_version(): """Detect the CUDA runtime major version.""" @@ -47,51 +67,63 @@ def _get_cuda_major_version(): ) -_cuda_major = _get_cuda_major_version() -_extra_name = f"cu{_cuda_major}" -_module_fullname = f"cuda.bench.{_extra_name}._nvbench" +def _bind_nvbench_module(module): + for name in _NVBENCH_EXPORTS: + globals()[name] = getattr(module, name) + # Set module of exposed objects + globals()[name].__module__ = __name__ -try: - _nvbench_module = importlib.import_module(_module_fullname) -except ImportError as e: - raise ImportError( - f"No cuda-bench extension found for CUDA {_cuda_major}.x. " - f"This wheel may not include support for your CUDA version. " - f"Supported CUDA versions: 12, 13. " - f"Original error: {e}" + for name in _NVBENCH_TEST_EXPORTS: + globals()[name] = getattr(module, name) + + # Expose the module as _nvbench for backward compatibility (e.g., for tests) + globals()["_nvbench"] = module + + +def _load_nvbench_module(): + global _nvbench_module + + if _nvbench_module is not None: + return _nvbench_module + + cuda_major = _get_cuda_major_version() + extra_name = f"cu{cuda_major}" + module_fullname = f"cuda.bench.{extra_name}._nvbench" + + try: + module = importlib.import_module(module_fullname) + except ImportError as e: + raise ImportError( + f"No cuda-bench extension found for CUDA {cuda_major}.x. " + f"This wheel may not include support for your CUDA version. " + f"Supported CUDA versions: 12, 13. 
" + f"Original error: {e}" + ) from e + + _bind_nvbench_module(module) + _nvbench_module = module + return module + + +def __getattr__(name): + if name == "_nvbench": + return _load_nvbench_module() + + if name in _NVBENCH_EXPORTS + _NVBENCH_TEST_EXPORTS: + _load_nvbench_module() + return globals()[name] + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__(): + return sorted( + set(globals()) + | set(_NVBENCH_EXPORTS) + | set(_NVBENCH_TEST_EXPORTS) + | {"_nvbench"} ) -# Import and expose all public symbols from the CUDA-specific extension -Benchmark = _nvbench_module.Benchmark -CudaStream = _nvbench_module.CudaStream -Launch = _nvbench_module.Launch -NVBenchRuntimeError = _nvbench_module.NVBenchRuntimeError -State = _nvbench_module.State -register = _nvbench_module.register -run_all_benchmarks = _nvbench_module.run_all_benchmarks -_test_cpp_exception = _nvbench_module._test_cpp_exception -_test_py_exception = _nvbench_module._test_py_exception - -# Expose the module as _nvbench for backward compatibility (e.g., for tests) -_nvbench = _nvbench_module - -# Set module of exposed objects -Benchmark.__module__ = __name__ -CudaStream.__module__ = __name__ -Launch.__module__ = __name__ -NVBenchRuntimeError.__module__ = __name__ -State.__module__ = __name__ -register.__module__ = __name__ -run_all_benchmarks.__module__ = __name__ - -# Clean up internal symbols -del ( - _nvbench_module, - _cuda_major, - _extra_name, - _module_fullname, - _get_cuda_major_version, -) __doc__ = """ CUDA Kernel Benchmarking Library Python API diff --git a/python/cuda/bench/__init__.pyi b/python/cuda/bench/__init__.pyi index ae5d4bd..9e0d264 100644 --- a/python/cuda/bench/__init__.pyi +++ b/python/cuda/bench/__init__.pyi @@ -25,8 +25,17 @@ # stubs in generated out/cuda/nvbench/_nvbench.pyi # with definitions given here. -from collections.abc import Callable, Sequence -from typing import Optional, Self, SupportsFloat, SupportsInt, Union +from collections.abc import ( + Callable, + Sequence, +) +from typing import ( + Optional, + Self, + SupportsFloat, + SupportsInt, + Union, +) class CudaStream: def __cuda_stream__(self) -> tuple[int, int]: ... diff --git a/python/cuda/bench/results/__init__.py b/python/cuda/bench/results/__init__.py new file mode 100644 index 0000000..0c5498c --- /dev/null +++ b/python/cuda/bench/results/__init__.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +"""Utilities for reading NVBench JSON benchmark result files.""" + +from ._benchmark_result import ( + BenchmarkResult, + BenchmarkResultDevice, + BenchmarkResultSummary, + SubBenchmarkResult, + SubBenchmarkState, +) + +BenchmarkResult.__module__ = __name__ +BenchmarkResultDevice.__module__ = __name__ +BenchmarkResultSummary.__module__ = __name__ +SubBenchmarkResult.__module__ = __name__ +SubBenchmarkState.__module__ = __name__ + +__all__ = [ + "BenchmarkResult", + "BenchmarkResultDevice", + "BenchmarkResultSummary", + "SubBenchmarkResult", + "SubBenchmarkState", +] diff --git a/python/cuda/bench/results/__init__.pyi b/python/cuda/bench/results/__init__.pyi new file mode 100644 index 0000000..8698229 --- /dev/null +++ b/python/cuda/bench/results/__init__.pyi @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from array import array +from collections.abc import Callable, ItemsView, Iterator, KeysView, ValuesView +from os import PathLike +from typing import Any, TypeVar, overload + +ResultT = TypeVar("ResultT") +BenchmarkResultT = TypeVar("BenchmarkResultT", bound="BenchmarkResult") +_SummaryValue = int | float | str | None + +class BenchmarkResultDevice: + id: int + name: str + data: dict[str, Any] + +class BenchmarkResultSummary: + tag: str + name: str | None + hint: str | None + hide: str | None + description: str | None + data: dict[str, _SummaryValue] + @property + def value(self) -> _SummaryValue | None: ... + def __getitem__(self, key: str) -> _SummaryValue: ... + def get( + self, key: str, default: _SummaryValue | None = None + ) -> _SummaryValue | None: ... + +class SubBenchmarkState: + state_name: str + device: int | None + type_config_index: int | None + axis_values: list[dict[str, Any]] + is_skipped: bool + skip_reason: str | None + summaries: dict[str, BenchmarkResultSummary] + samples: array | None + frequencies: array | None + bw: float | None + point: dict[str, str] + def name(self) -> str: ... + def center(self, estimator: Callable[[array], ResultT]) -> ResultT | None: ... + def center_with_frequencies( + self, estimator: Callable[[array, array], ResultT] + ) -> ResultT | None: ... + +class SubBenchmarkResult: + name: str + devices: list[int] + axes: list[dict[str, Any]] + states: list[SubBenchmarkState] + def __len__(self) -> int: ... + @overload + def __getitem__(self, state_index: int) -> SubBenchmarkState: ... + @overload + def __getitem__(self, state_index: slice) -> list[SubBenchmarkState]: ... + def __iter__(self) -> Iterator[SubBenchmarkState]: ... + def centers( + self, estimator: Callable[[array], ResultT] + ) -> dict[str, ResultT | None]: ... + def centers_with_frequencies( + self, estimator: Callable[[array, array], ResultT] + ) -> dict[str, ResultT | None]: ... + +class BenchmarkResult: + metadata: Any + devices: dict[int, BenchmarkResultDevice] + subbenches: dict[str, SubBenchmarkResult] + def __init__(self, token: object | None = None) -> None: ... + @classmethod + def empty( + cls: type[BenchmarkResultT], *, metadata: Any = None + ) -> BenchmarkResultT: ... + @classmethod + def from_json( + cls: type[BenchmarkResultT], + json_path: str | PathLike[str], + *, + metadata: Any = None, + ) -> BenchmarkResultT: ... + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[str]: ... + def __contains__(self, subbench_name: object) -> bool: ... + def __getitem__(self, subbench_name: str) -> SubBenchmarkResult: ... + def keys(self) -> KeysView[str]: ... + def values(self) -> ValuesView[SubBenchmarkResult]: ... + def items(self) -> ItemsView[str, SubBenchmarkResult]: ... + def centers( + self, estimator: Callable[[array], ResultT] + ) -> dict[str, dict[str, ResultT | None]]: ... + def centers_with_frequencies( + self, estimator: Callable[[array, array], ResultT] + ) -> dict[str, dict[str, ResultT | None]]: ... diff --git a/python/cuda/bench/results/_benchmark_result.py b/python/cuda/bench/results/_benchmark_result.py new file mode 100644 index 0000000..5b7a49f --- /dev/null +++ b/python/cuda/bench/results/_benchmark_result.py @@ -0,0 +1,511 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
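
Before the implementation below, a minimal usage sketch of the reader described by the stubs above; the `result.json` path and the choice of `statistics.median` as the estimator are illustrative only, assuming the file was produced by an NVBench run with JSON (and JSON-bin sample) output enabled:

```python
import statistics

from cuda.bench.results import BenchmarkResult

result = BenchmarkResult.from_json("result.json")

for bench_name, subbench in result.items():
    for state in subbench:
        # center() feeds the recorded sample times (an array("f")) to the
        # estimator; it returns None when no binary sample data was written.
        median_seconds = state.center(statistics.median)
        print(bench_name, state.name(), median_seconds, state.bw)

# Or collapse everything at once: {subbench name: {state name: value or None}}.
medians = result.centers(statistics.median)
```
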
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import array +import json +import os +import sys +from collections.abc import ItemsView, Iterator, KeysView, ValuesView +from dataclasses import dataclass +from typing import Any, Callable, TypeVar + +__all__ = [ + "BenchmarkResult", + "BenchmarkResultDevice", + "BenchmarkResultSummary", + "SubBenchmarkResult", + "SubBenchmarkState", +] + +ResultT = TypeVar("ResultT") +BenchmarkResultT = TypeVar("BenchmarkResultT", bound="BenchmarkResult") +_SummaryValue = int | float | str | None + + +@dataclass(frozen=True) +class BenchmarkResultDevice: + """Device metadata parsed from an NVBench JSON result file.""" + + id: int + name: str + data: dict[str, Any] + + +def read_json(filename: str | os.PathLike[str]) -> dict: + with open(filename, "r", encoding="utf-8") as f: + file_root = json.load(f) + return file_root + + +def extract_summary_data_value(summary: dict, name: str, expected_type: str) -> Any: + summary_tag = summary.get("tag", "") + for value_data in summary.get("data", []): + if value_data.get("name") != name: + continue + + value_type = value_data.get("type") + if value_type != expected_type: + raise ValueError( + f"summary {summary_tag!r} field {name!r} has type " + f"{value_type!r}; expected {expected_type!r}" + ) + if "value" not in value_data: + raise ValueError(f"summary {summary_tag!r} field {name!r} is missing value") + return value_data["value"] + + raise ValueError(f"summary {summary_tag!r} is missing field {name!r}") + + +def extract_filename(summary: dict) -> str: + value = extract_summary_data_value(summary, "filename", "string") + if not isinstance(value, str): + raise ValueError( + f"summary {summary.get('tag', '')!r} field 'filename' " + "value must be a string" + ) + return value + + +def extract_size(summary: dict) -> int: + value = extract_summary_data_value(summary, "size", "int64") + try: + return int(value) + except (TypeError, ValueError) as e: + raise ValueError( + f"summary {summary.get('tag', '')!r} field 'size' " + f"value {value!r} is not an int64" + ) from e + + +def parse_summary_value( + value_data: dict, + *, + summary_tag: str, + field_name: str, +) -> _SummaryValue: + value_type = value_data.get("type") + if "value" not in value_data: + raise ValueError( + f"summary {summary_tag!r} field {field_name!r} is missing value" + ) + + value = value_data["value"] + if value is None: + return None + + if value_type == "int64": + try: + return int(value) + except (TypeError, ValueError) as e: + raise ValueError( + f"summary {summary_tag!r} field {field_name!r} value {value!r} " + "is not an int64" + ) from e + if value_type == "float64": + try: + return float(value) + except (TypeError, ValueError) as e: + raise ValueError( + f"summary {summary_tag!r} field {field_name!r} value {value!r} " + "is not a float64" + ) from e + if value_type == "string": + if not isinstance(value, str): + raise ValueError( + f"summary {summary_tag!r} field {field_name!r} value {value!r} " + "is not a string" + ) + return value + raise ValueError( + f"summary {summary_tag!r} field {field_name!r} has unsupported " + f"value type {value_type!r}" + ) + + +@dataclass(frozen=True) +class BenchmarkResultSummary: + """Summary record parsed from one NVBench benchmark state.""" + + tag: str + name: str | None + hint: str | None + hide: str | None + description: str | None + data: dict[str, _SummaryValue] + + @property + def value(self) -> _SummaryValue | None: + return self.data.get("value") + + 
def __getitem__(self, key: str) -> _SummaryValue:
+        return self.data[key]
+
+    def get(
+        self, key: str, default: _SummaryValue | None = None
+    ) -> _SummaryValue | None:
+        return self.data.get(key, default)
+
+
+def parse_summary(summary: dict) -> BenchmarkResultSummary:
+    summary_tag = summary["tag"]
+    data = {}
+    for value_data in summary.get("data", []):
+        field_name = value_data.get("name")
+        if not isinstance(field_name, str):
+            raise ValueError(
+                f"summary {summary_tag!r} has a data entry with a missing "
+                "or non-string name"
+            )
+        data[field_name] = parse_summary_value(
+            value_data,
+            summary_tag=summary_tag,
+            field_name=field_name,
+        )
+    return BenchmarkResultSummary(
+        tag=summary_tag,
+        name=summary.get("name"),
+        hint=summary.get("hint"),
+        hide=summary.get("hide"),
+        description=summary.get("description"),
+        data=data,
+    )
+
+
+def get_state_summaries(state: dict) -> list[dict]:
+    return state.get("summaries") or []
+
+
+def parse_summaries(state: dict) -> dict[str, BenchmarkResultSummary]:
+    return {
+        summary["tag"]: parse_summary(summary) for summary in get_state_summaries(state)
+    }
+
+
+def parse_binary_meta(state: dict, tag: str) -> tuple[int | None, str | None]:
+    summaries = get_state_summaries(state)
+    if not summaries:
+        return None, None
+
+    summary = next(
+        filter(lambda s: s["tag"] == tag, summaries),
+        None,
+    )
+    if not summary:
+        return None, None
+
+    sample_filename = extract_filename(summary)
+    sample_count = extract_size(summary)
+    return sample_count, sample_filename
+
+
+def parse_samples_meta(state: dict) -> tuple[int | None, str | None]:
+    return parse_binary_meta(state, "nv/json/bin:nv/cold/sample_times")
+
+
+def parse_frequencies_meta(state: dict) -> tuple[int | None, str | None]:
+    return parse_binary_meta(state, "nv/json/freqs-bin:nv/cold/sample_freqs")
+
+
+def resolve_binary_filename(json_dir: str, binary_filename: str) -> str:
+    if os.path.isabs(binary_filename):
+        return binary_filename
+
+    json_relative_filename = os.path.join(json_dir, binary_filename)
+    if os.path.exists(json_relative_filename):
+        return json_relative_filename
+
+    parent_relative_filename = os.path.join(os.path.dirname(json_dir), binary_filename)
+    if os.path.exists(parent_relative_filename):
+        return parent_relative_filename
+
+    if os.path.exists(binary_filename):
+        return binary_filename
+
+    return json_relative_filename
+
+
+def parse_float32_binary(
+    count: int | None, filename: str | None, json_dir: str
+) -> array.array | None:
+    if count is None or filename is None:
+        return None
+
+    values = array.array("f")
+    if values.itemsize != 4:
+        raise RuntimeError("array('f') is not a 32-bit float on this platform")
+
+    filename = resolve_binary_filename(json_dir, filename)
+    try:
+        with open(filename, "rb") as f:
+            size = os.fstat(f.fileno()).st_size
+            if size % values.itemsize:
+                raise ValueError("file size is not a multiple of float size")
+
+            values.fromfile(f, size // values.itemsize)
+    except FileNotFoundError:
+        return None
+
+    # Match np.fromfile(fn, "<f4"): the binary data is little-endian float32,
+    # so byteswap the values when running on a big-endian host.
+    if sys.byteorder == "big":
+        values.byteswap()
+    return values
+
+
+def parse_samples(state: dict, json_dir: str) -> array.array | None:
+    """Return the state's sample times, or None if sample data is unavailable."""
+    sample_count, samples_filename = parse_samples_meta(state)
+    return parse_float32_binary(sample_count, samples_filename, json_dir)
+
+
+def parse_frequencies(state: dict, json_dir: str) -> array.array | None:
+    """Return the state's sample frequencies, or None if data is unavailable."""
+    frequency_count, frequencies_filename = parse_frequencies_meta(state)
+    return parse_float32_binary(frequency_count,
frequencies_filename, json_dir) + + +def parse_bw(summaries: dict[str, BenchmarkResultSummary]) -> float | None: + bwutil = summaries.get("nv/cold/bw/global/utilization") + if bwutil is None or bwutil.value is None: + return None + + return float(bwutil.value) + + +def get_axis_name(axis: dict) -> str: + name = axis["name"] + if af := axis.get("flags"): + name = name + f"[{af}]" + return name + + +class SubBenchmarkState: + """Result data for one executed state of an NVBench benchmark.""" + + def __init__(self, state: dict, axes_names: dict, axes_values: dict, json_dir: str): + self.state_name = state["name"] + self.device = state.get("device") + self.type_config_index = state.get("type_config_index") + self.axis_values = state.get("axis_values") or [] + self.is_skipped = state.get("is_skipped", False) + self.skip_reason = state.get("skip_reason") + self.summaries = parse_summaries(state) + self.samples = parse_samples(state, json_dir) + self.frequencies = parse_frequencies(state, json_dir) + if ( + self.samples is not None + and self.frequencies is not None + and len(self.samples) != len(self.frequencies) + ): + raise ValueError( + f"sample count ({len(self.samples)}) does not match " + f"frequency count ({len(self.frequencies)})" + ) + self.bw = parse_bw(self.summaries) + + self.point = {} + for axis in self.axis_values: + axis_name = axis["name"] + name = axes_names[axis_name] + axis_value_map = axes_values[axis_name] + if "value" in axis: + key = str(axis["value"]) + value = axis_value_map.get(key, key) + else: + input_string = axis.get("input_string") + value = ( + axis_value_map.get(input_string, input_string) + if input_string is not None + else "" + ) + self.point[name] = value + + def __repr__(self) -> str: + return str(self.__dict__) + + def name(self) -> str: + if not self.point: + return self.state_name + return " ".join(f"{k}={v}" for k, v in self.point.items()) + + def center(self, estimator: Callable[[array.array], ResultT]) -> ResultT | None: + if self.samples is None: + return None + return estimator(self.samples) + + def center_with_frequencies( + self, estimator: Callable[[array.array, array.array], ResultT] + ) -> ResultT | None: + if self.samples is None or self.frequencies is None: + return None + return estimator(self.samples, self.frequencies) + + +class SubBenchmarkResult: + """Result data for one NVBench benchmark and its executed states.""" + + def __init__(self, bench: dict, json_dir: str): + self.name = bench["name"] + self.devices = bench.get("devices") or [] + self.axes = bench.get("axes") or [] + + axes_names = {} + axes_values = {} + for axis in self.axes: + short_name = axis["name"] + full_name = get_axis_name(axis) + this_axis_values = {} + for value in axis["values"]: + input_string = value["input_string"] + this_axis_values[input_string] = input_string + if "value" in value: + this_axis_values[str(value["value"])] = input_string + axes_names[short_name] = full_name + axes_values[short_name] = this_axis_values + + self.states = [ + SubBenchmarkState(state, axes_names, axes_values, json_dir) + for state in bench["states"] + ] + + def __repr__(self) -> str: + return str(self.__dict__) + + def __len__(self) -> int: + return len(self.states) + + def __getitem__( + self, state_index: int | slice + ) -> SubBenchmarkState | list[SubBenchmarkState]: + return self.states[state_index] + + def __iter__(self) -> Iterator[SubBenchmarkState]: + return iter(self.states) + + def centers( + self, estimator: Callable[[array.array], ResultT] + ) -> dict[str, ResultT | 
None]: + result = {} + for state in self.states: + result[state.name()] = state.center(estimator) + return result + + def centers_with_frequencies( + self, estimator: Callable[[array.array, array.array], ResultT] + ) -> dict[str, ResultT | None]: + result = {} + for state in self.states: + result[state.name()] = state.center_with_frequencies(estimator) + return result + + +class BenchmarkResult: + """Container for benchmark result data parsed from NVBench JSON output. + + Instances are created with :meth:`from_json` or :meth:`empty`. Direct + construction is intentionally disabled to keep creation paths explicit. + """ + + _construction_token = object() + + def __init__( + self, + token=None, + ): + """Initialize an instance created by a BenchmarkResult class method. + + Users should call :meth:`from_json` or :meth:`empty` instead. The token + argument is an implementation detail used to prevent direct + construction. + """ + if token is not self._construction_token: + raise TypeError( + "BenchmarkResult cannot be constructed directly; " + "use BenchmarkResult.from_json() or BenchmarkResult.empty()" + ) + + self.metadata: Any = None + self.devices: dict[int, BenchmarkResultDevice] = {} + self.subbenches: dict[str, SubBenchmarkResult] = {} + + @classmethod + def empty(cls: type[BenchmarkResultT], *, metadata: Any = None) -> BenchmarkResultT: + """Create an empty result container with optional user metadata.""" + result = cls(cls._construction_token) + result.metadata = metadata + return result + + @classmethod + def from_json( + cls: type[BenchmarkResultT], + json_path: str | os.PathLike[str], + *, + metadata: Any = None, + ) -> BenchmarkResultT: + """Read benchmark result data from an NVBench JSON output file.""" + result = cls.empty(metadata=metadata) + result._parse_json(json_path) + return result + + def _parse_json(self, json_path: str | os.PathLike[str]) -> None: + """Populate this instance from an NVBench JSON output file.""" + json_path = os.fspath(json_path) + json_dir = os.path.dirname(os.path.abspath(json_path)) + result_json = read_json(json_path) + self.devices = { + int(device["id"]): BenchmarkResultDevice( + id=int(device["id"]), + name=device["name"], + data=device, + ) + for device in result_json.get("devices", []) + } + for bench in result_json["benchmarks"]: + bench_name: str = bench["name"] + self.subbenches[bench_name] = SubBenchmarkResult(bench, json_dir) + + def __repr__(self) -> str: + return str(self.__dict__) + + def __len__(self) -> int: + return len(self.subbenches) + + def __iter__(self) -> Iterator[str]: + return iter(self.subbenches) + + def __contains__(self, subbench_name: object) -> bool: + return subbench_name in self.subbenches + + def __getitem__(self, subbench_name: str) -> SubBenchmarkResult: + return self.subbenches[subbench_name] + + def keys(self) -> KeysView[str]: + return self.subbenches.keys() + + def values(self) -> ValuesView[SubBenchmarkResult]: + return self.subbenches.values() + + def items(self) -> ItemsView[str, SubBenchmarkResult]: + return self.subbenches.items() + + def centers( + self, estimator: Callable[[array.array], ResultT] + ) -> dict[str, dict[str, ResultT | None]]: + result = {} + for subbench in self.subbenches: + result[subbench] = self.subbenches[subbench].centers(estimator) + return result + + def centers_with_frequencies( + self, estimator: Callable[[array.array, array.array], ResultT] + ) -> dict[str, dict[str, ResultT | None]]: + result = {} + for subbench in self.subbenches: + result[subbench] = 
self.subbenches[subbench].centers_with_frequencies( + estimator + ) + return result diff --git a/python/examples/benchmark_result_autotune.py b/python/examples/benchmark_result_autotune.py new file mode 100644 index 0000000..d5b5c1c --- /dev/null +++ b/python/examples/benchmark_result_autotune.py @@ -0,0 +1,361 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import argparse +import statistics +import subprocess +import sys +import tempfile +import time +from pathlib import Path +from typing import Any + +from cuda.bench.results import BenchmarkResult, BenchmarkResultSummary +from tabulate import tabulate + +TILE_SHAPES = ("4x32", "8x16", "16x16", "32x8", "16x8", "8x8") +BENCHMARK_NAME = "stencil_autotune" +MEDIAN_TIE_RELATIVE_TOLERANCE = 0.01 +MIN_RECOMMENDED_INTERIOR_PIXELS = 1_000_000 + + +def parse_tile_shape(tile_shape: str) -> tuple[int, int]: + block_x, block_y = tile_shape.split("x", maxsplit=1) + return int(block_x), int(block_y) + + +def format_duration(seconds: float) -> str: + if seconds >= 1.0: + return f"{seconds:.3f} s" + if seconds >= 1e-3: + return f"{seconds * 1e3:.3f} ms" + if seconds >= 1e-6: + return f"{seconds * 1e6:.3f} us" + return f"{seconds * 1e9:.3f} ns" + + +def format_optional_duration(seconds: float | None) -> str: + if seconds is None: + return "-" + return format_duration(seconds) + + +def format_byte_rate(summary: BenchmarkResultSummary | None) -> str: + if summary is None or summary.value is None: + return "-" + + bytes_per_second = float(summary.value) + if bytes_per_second >= 1e12: + return f"{bytes_per_second * 1e-12:.3f} TB/s" + if bytes_per_second >= 1e9: + return f"{bytes_per_second * 1e-9:.3f} GB/s" + if bytes_per_second >= 1e6: + return f"{bytes_per_second * 1e-6:.3f} MB/s" + if bytes_per_second >= 1e3: + return f"{bytes_per_second * 1e-3:.3f} KB/s" + return f"{bytes_per_second:.3f} B/s" + + +def state_tile_shape(state_name: str) -> str: + prefix = "TileShape=" + for field in state_name.split(): + if field.startswith(prefix): + return field.removeprefix(prefix) + return state_name + + +def interior_pixel_count(width: int, height: int) -> int: + return max(width - 2, 0) * max(height - 2, 0) + + +def median_ties_best(row: dict[str, Any], best_median_seconds: float) -> bool: + tolerance = abs(best_median_seconds) * MEDIAN_TIE_RELATIVE_TOLERANCE + return abs(row["median_seconds"] - best_median_seconds) <= tolerance + + +def summarize_result(result: BenchmarkResult) -> list[dict[str, Any]]: + subbenchmark = result[BENCHMARK_NAME] + medians = subbenchmark.centers(statistics.median) + metadata = result.metadata if isinstance(result.metadata, dict) else {} + rows = [] + + for state in subbenchmark: + median_seconds = medians[state.name()] + if median_seconds is None: + continue + + bandwidth = state.summaries.get("nv/cold/bw/global/bytes_per_second") + mean_summary = state.summaries.get("nv/cold/time/gpu/mean") + mean_seconds = ( + None + if mean_summary is None or mean_summary.value is None + else float(mean_summary.value) + ) + rows.append( + { + "tile_shape": state_tile_shape(state.name()), + "median_seconds": median_seconds, + "mean_seconds": mean_seconds, + "sample_count": len(state.samples) if state.samples is not None else 0, + "bandwidth": format_byte_rate(bandwidth), + "subprocess_seconds": metadata.get("elapsed_seconds", 0.0), + } + ) + + return sorted(rows, key=lambda row: row["median_seconds"]) + + +def print_summary(rows: 
list[dict[str, Any]]) -> None: + if not rows: + raise RuntimeError("No benchmark states with sample data were found.") + + total_subprocess_seconds = sum(row["subprocess_seconds"] for row in rows) + print() + print(f"Total benchmark subprocess wall time: {total_subprocess_seconds:.3f} s") + print() + + best_median_seconds = rows[0]["median_seconds"] + tied_rows = [row for row in rows if median_ties_best(row, best_median_seconds)] + table = [ + [ + "*" if row in tied_rows else "", + row["tile_shape"], + format_duration(row["median_seconds"]), + format_optional_duration(row["mean_seconds"]), + row["sample_count"], + row["bandwidth"], + f"{row['subprocess_seconds']:.3f} s", + ] + for row in rows + ] + print( + tabulate( + table, + headers=[ + "Best", + "TileShape", + "Median GPU Time", + "Mean GPU Time", + "Samples", + "GlobalMem BW", + "Subprocess", + ], + tablefmt="simple", + disable_numparse=True, + ) + ) + + print() + if len(tied_rows) == 1: + best = tied_rows[0] + print( + "Best tile shape by median isolated GPU time: " + f"{best['tile_shape']} ({format_duration(best['median_seconds'])})" + ) + else: + tile_shapes = ", ".join(row["tile_shape"] for row in tied_rows) + print( + "No unique best tile shape by median isolated GPU time: " + f"{len(tied_rows)} states are within " + f"{MEDIAN_TIE_RELATIVE_TOLERANCE:.1%} of " + f"{format_duration(best_median_seconds)} ({tile_shapes})." + ) + + +def run_driver(args: argparse.Namespace, nvbench_args: list[str]) -> int: + with tempfile.TemporaryDirectory(prefix="nvbench-autotune-") as tmp_dir: + rows = [] + total = len(TILE_SHAPES) + interior_pixels = interior_pixel_count(args.image_width, args.image_height) + print( + f"Image size: {args.image_width}x{args.image_height} " + f"({interior_pixels} interior stencil points)" + ) + print(f"Sampling {total} tile shapes:") + if interior_pixels < MIN_RECOMMENDED_INTERIOR_PIXELS: + print( + "Warning: this problem has only " + f"{interior_pixels} interior stencil points. " + "Small problems are usually dominated by kernel launch overhead, " + "so median timings may tie across tile shapes." + ) + + for index, tile_shape in enumerate(TILE_SHAPES, start=1): + json_path = Path(tmp_dir) / f"stencil_autotune_{tile_shape}.json" + command = [ + sys.executable, + str(Path(__file__).resolve()), + "--run-benchmark", + "--stopping-criterion", + "entropy", + "--tile-shape", + tile_shape, + "--image-width", + str(args.image_width), + "--image-height", + str(args.image_height), + "--jsonbin", + str(json_path), + ] + if nvbench_args: + command.extend(["--", *nvbench_args]) + + print(f"[{index}/{total}] TileShape={tile_shape} ... 
", end="", flush=True) + start = time.perf_counter() + completed = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + check=False, + ) + elapsed_seconds = time.perf_counter() - start + + if completed.returncode != 0: + print(f"failed after {elapsed_seconds:.3f} s") + print(completed.stdout, end="") + return completed.returncode + + metadata = { + "command": command, + "returncode": completed.returncode, + "elapsed_seconds": elapsed_seconds, + "tile_shape": tile_shape, + } + result = BenchmarkResult.from_json(json_path, metadata=metadata) + tile_rows = summarize_result(result) + rows.extend(tile_rows) + + if tile_rows: + row = tile_rows[0] + print( + f"done in {elapsed_seconds:.3f} s, " + f"median {format_duration(row['median_seconds'])}, " + f"{row['bandwidth']}" + ) + else: + print(f"done in {elapsed_seconds:.3f} s, no samples") + + print_summary(sorted(rows, key=lambda row: row["median_seconds"])) + return 0 + + +def run_benchmark(args: argparse.Namespace, nvbench_args: list[str]) -> None: + import cuda.bench as bench + import numpy as np + from numba import cuda + + def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: + return cuda.external_stream(cs.addressof()) + + @cuda.jit + def stencil_kernel(inp, out, width, height): + x, y = cuda.grid(2) + if 0 < x < width - 1 and 0 < y < height - 1: + idx = y * width + x + out[idx] = 0.2 * ( + inp[idx] + + inp[idx - 1] + + inp[idx + 1] + + inp[idx - width] + + inp[idx + width] + ) + + def stencil_autotune(state: bench.State) -> None: + tile_shape = state.get_string("TileShape") + block_x, block_y = parse_tile_shape(tile_shape) + width = args.image_width + height = args.image_height + interior_pixels = (width - 2) * (height - 2) + + state.add_element_count(interior_pixels, column_name="Pixels") + state.add_global_memory_reads( + interior_pixels * 5 * np.dtype(np.float32).itemsize + ) + state.add_global_memory_writes(interior_pixels * np.dtype(np.float32).itemsize) + + host_input = np.ones(width * height, dtype=np.float32) + dev_input = cuda.to_device(host_input) + dev_output = cuda.device_array_like(dev_input) + + block_shape = (block_x, block_y) + grid_shape = ( + (width + block_x - 1) // block_x, + (height + block_y - 1) // block_y, + ) + + # Compile the Numba kernel outside NVBench measurement. + stencil_kernel[grid_shape, block_shape](dev_input, dev_output, width, height) + cuda.synchronize() + + def launcher(launch: bench.Launch) -> None: + stream = as_cuda_stream(launch.get_stream()) + stencil_kernel[grid_shape, block_shape, stream, 0]( + dev_input, + dev_output, + width, + height, + ) + + state.exec(launcher) + + benchmark = bench.register(stencil_autotune) + benchmark.set_name(BENCHMARK_NAME) + tile_shapes = [args.tile_shape] if args.tile_shape is not None else TILE_SHAPES + benchmark.add_string_axis("TileShape", tile_shapes) + bench.run_all_benchmarks([sys.argv[0], *nvbench_args]) + + +def parse_args(argv: list[str] | None = None) -> tuple[argparse.Namespace, list[str]]: + parser = argparse.ArgumentParser( + description=( + "Autotune a simple stencil benchmark and select the best state " + "from NVBench JSON-bin output." + ), + epilog=( + "Additional NVBench options may be passed after '--'. 
" + "For example: benchmark_result_autotune.py -- --timeout 30" + ), + ) + parser.add_argument( + "--run-benchmark", + action="store_true", + help=argparse.SUPPRESS, + ) + parser.add_argument( + "--tile-shape", + choices=TILE_SHAPES, + default=None, + help=argparse.SUPPRESS, + ) + parser.add_argument( + "--image-width", + type=int, + default=4096, + help="Stencil input width used by the subprocess benchmark.", + ) + parser.add_argument( + "--image-height", + type=int, + default=4096, + help="Stencil input height used by the subprocess benchmark.", + ) + args, nvbench_args = parser.parse_known_args(argv) + if args.image_width < 3 or args.image_height < 3: + parser.error("--image-width and --image-height must both be at least 3") + nvbench_args = [arg for arg in nvbench_args if arg != "--"] + return args, nvbench_args + + +def main(argv: list[str] | None = None) -> int: + args, nvbench_args = parse_args(argv) + if args.run_benchmark: + run_benchmark(args, nvbench_args) + return 0 + return run_driver(args, nvbench_args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/examples/requirements.txt b/python/examples/requirements.txt index bc17464..adb554a 100644 --- a/python/examples/requirements.txt +++ b/python/examples/requirements.txt @@ -6,4 +6,5 @@ numba-cuda cuda-cccl cupy nvidia-cute-dsl[cu13] +tabulate torch[cu13] diff --git a/python/pyproject.toml b/python/pyproject.toml index f7ddf78..6a23ff9 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,3 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + [build-system] requires = ["scikit-build-core>=0.10", "setuptools_scm"] build-backend = "scikit_build_core.build" @@ -52,6 +55,7 @@ tools = [ [project.scripts] nvbench-compare = "scripts.nvbench_compare:main" nvbench-histogram = "scripts.nvbench_histogram:main" +nvbench-json-summary = "scripts.nvbench_json_summary:main" nvbench-walltime = "scripts.nvbench_walltime:main" [project.urls] @@ -85,4 +89,5 @@ fallback_version = "0.0.0" [tool.scikit-build.wheel.packages] "cuda" = "cuda" "cuda/bench" = "cuda/bench" +"cuda/bench/results" = "cuda/bench/results" "scripts" = "scripts" diff --git a/python/scripts/nvbench_json_summary.py b/python/scripts/nvbench_json_summary.py new file mode 100644 index 0000000..2252b66 --- /dev/null +++ b/python/scripts/nvbench_json_summary.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from cuda.bench.results import ( + BenchmarkResult, + BenchmarkResultSummary, + SubBenchmarkResult, + SubBenchmarkState, +) + + +class MarkdownTable: + def __init__(self): + self.columns = [] + + def add_cell(self, row: int, key: str, header: str, value: str) -> None: + column = next((col for col in self.columns if col["key"] == key), None) + if column is None: + column = { + "key": key, + "header": header, + "rows": [], + "max_width": len(header), + } + self.columns.append(column) + + column["max_width"] = max(column["max_width"], len(value)) + while len(column["rows"]) <= row: + column["rows"].append("") + column["rows"][row] = value + + def to_string(self) -> str: + if not self.columns: + return "" + + num_rows = max(len(column["rows"]) for column in self.columns) + for column in self.columns: + while len(column["rows"]) < num_rows: + column["rows"].append("") + + header = "|" + divider = "|" + for column in self.columns: + width = column["max_width"] + header += f" {column['header']:^{width}} |" + divider += f"{'':-^{width + 2}}|" + + rows = [] + for row in range(num_rows): + row_text = "|" + for column in self.columns: + row_text += f" {column['rows'][row]:>{column['max_width']}} |" + rows.append(row_text) + + return "\n".join([header, divider, *rows]) + "\n" + + +def format_default(summary: BenchmarkResultSummary) -> str: + value = summary.value + if isinstance(value, float): + return f"{value:.5g}" + if value is None: + return "" + return str(value) + + +def format_duration(summary: BenchmarkResultSummary) -> str: + seconds = float(summary["value"]) + if seconds >= 1.0: + return f"{seconds:0.3f} s" + if seconds >= 1e-3: + return f"{seconds * 1e3:0.3f} ms" + if seconds >= 1e-6: + return f"{seconds * 1e6:0.3f} us" + return f"{seconds * 1e9:0.3f} ns" + + +def format_item_rate(summary: BenchmarkResultSummary) -> str: + items_per_second = float(summary["value"]) + if items_per_second >= 1e15: + return f"{items_per_second * 1e-15:0.3f}P" + if items_per_second >= 1e12: + return f"{items_per_second * 1e-12:0.3f}T" + if items_per_second >= 1e9: + return f"{items_per_second * 1e-9:0.3f}G" + if items_per_second >= 1e6: + return f"{items_per_second * 1e-6:0.3f}M" + if items_per_second >= 1e3: + return f"{items_per_second * 1e-3:0.3f}K" + return f"{items_per_second:0.3f}" + + +def format_frequency(summary: BenchmarkResultSummary) -> str: + frequency_hz = float(summary["value"]) + if frequency_hz >= 1e9: + return f"{frequency_hz * 1e-9:0.3f} GHz" + if frequency_hz >= 1e6: + return f"{frequency_hz * 1e-6:0.3f} MHz" + if frequency_hz >= 1e3: + return f"{frequency_hz * 1e-3:0.3f} KHz" + return f"{frequency_hz:0.3f} Hz" + + +def format_bytes(summary: BenchmarkResultSummary) -> str: + nbytes = float(summary["value"]) + if nbytes >= 1024.0 * 1024.0 * 1024.0: + return f"{nbytes / (1024.0 * 1024.0 * 1024.0):0.3f} GiB" + if nbytes >= 1024.0 * 1024.0: + return f"{nbytes / (1024.0 * 1024.0):0.3f} MiB" + if nbytes >= 1024.0: + return f"{nbytes / 1024.0:0.3f} KiB" + return f"{nbytes:0.3f} B" + + +def format_byte_rate(summary: BenchmarkResultSummary) -> str: + bytes_per_second = float(summary["value"]) + if bytes_per_second >= 1e15: + return f"{bytes_per_second * 1e-15:0.3f} PB/s" + if bytes_per_second >= 1e12: + return f"{bytes_per_second * 1e-12:0.3f} TB/s" + if bytes_per_second >= 1e9: + return f"{bytes_per_second * 1e-9:0.3f} GB/s" + if 
bytes_per_second >= 1e6: + return f"{bytes_per_second * 1e-6:0.3f} MB/s" + if bytes_per_second >= 1e3: + return f"{bytes_per_second * 1e-3:0.3f} KB/s" + return f"{bytes_per_second:0.3f} B/s" + + +def format_sample_size(summary: BenchmarkResultSummary) -> str: + return f"{int(summary['value'])}x" + + +def format_percentage(summary: BenchmarkResultSummary) -> str: + return f"{float(summary['value']) * 100.0:.2f}%" + + +def format_summary(summary: BenchmarkResultSummary) -> str: + if summary.value is None: + return "" + if summary.hint == "duration": + return format_duration(summary) + if summary.hint == "item_rate": + return format_item_rate(summary) + if summary.hint == "frequency": + return format_frequency(summary) + if summary.hint == "bytes": + return format_bytes(summary) + if summary.hint == "byte_rate": + return format_byte_rate(summary) + if summary.hint == "sample_size": + return format_sample_size(summary) + if summary.hint == "percentage": + return format_percentage(summary) + return format_default(summary) + + +def format_axis_value( + axis_value: dict, axes_by_name: dict[str, dict] +) -> tuple[str, str]: + name = axis_value["name"] + axis = axes_by_name.get(name, {}) + value = axis_value["value"] + if value is None: + return name, "" + + if axis.get("type") == "int64" and axis.get("flags") == "pow2": + int_value = int(value) + exponent = int_value.bit_length() - 1 + return name, f"2^{exponent} = {int_value}" + + value_type = axis_value.get("type", axis.get("type")) + if value_type == "int64": + return name, str(int(value)) + if value_type == "float64": + return name, f"{float(value):.5g}" + + return name, str(value) + + +def add_state_row( + table: MarkdownTable, + row: int, + state: SubBenchmarkState, + bench: SubBenchmarkResult, +) -> None: + axes_by_name = {axis["name"]: axis for axis in bench.axes} + + for axis_value in state.axis_values: + header, value = format_axis_value(axis_value, axes_by_name) + table.add_cell(row, f"axis:{header}", header, value) + + for summary in state.summaries.values(): + if summary.hide: + continue + header = summary.name if summary.name is not None else summary.tag + table.add_cell(row, summary.tag, header, format_summary(summary)) + + +def format_benchmark(result: BenchmarkResult, bench: SubBenchmarkResult) -> str: + parts = [f"## {bench.name}\n\n"] + device_ids: list[int | None] = list(bench.devices) if bench.devices else [None] + + for device_id in device_ids: + if device_id is not None: + device = result.devices.get(device_id) + device_name = device.name if device is not None else f"Device {device_id}" + parts.append(f"### [{device_id}] {device_name}\n\n") + + table = MarkdownTable() + row = 0 + for state in bench.states: + if state.is_skipped: + continue + if device_id is not None and state.device != device_id: + continue + add_state_row(table, row, state, bench) + row += 1 + + table_text = table.to_string() + parts.append(table_text if table_text else "No data -- check log.\n") + + return "".join(parts) + + +def format_result(result: BenchmarkResult) -> str: + parts = ["# Benchmark Results\n"] + for bench in result.values(): + parts.append(f"\n{format_benchmark(result, bench)}") + return "".join(parts) + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + prog="nvbench-json-summary", + description="Print an NVBench-style markdown summary table from NVBench JSON output.", + ) + parser.add_argument("json_path", help="Path to an NVBench JSON output file.") + parser.add_argument( + 
"-o", + "--output", + type=Path, + help="Write markdown output to this file instead of stdout.", + ) + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + result = BenchmarkResult.from_json(args.json_path) + report = format_result(result) + + if args.output is not None: + args.output.write_text(report, encoding="utf-8") + else: + print(report) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/test/test_benchmark_result.py b/python/test/test_benchmark_result.py new file mode 100644 index 0000000..e123730 --- /dev/null +++ b/python/test/test_benchmark_result.py @@ -0,0 +1,750 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import json +import struct +from dataclasses import dataclass + +import cuda.bench +import cuda.bench.results as results +import pytest + + +def write_json(path, data): + path.write_text(json.dumps(data), encoding="utf-8") + + +def block_size_axis(*values): + return { + "name": "BlockSize", + "type": "int64", + "flags": "pow2", + "values": [ + { + "input_string": str(value), + "description": f"2^{value} = {2**value}", + "value": 2**value, + } + for value in values + ], + } + + +def sample_file_summary(tag, filename, size): + return { + "tag": tag, + "data": [ + { + "name": "filename", + "type": "string", + "value": filename, + }, + { + "name": "size", + "type": "int64", + "value": str(size), + }, + ], + } + + +def sample_times_summary(filename, size): + return sample_file_summary( + "nv/json/bin:nv/cold/sample_times", + filename, + size, + ) + + +def sample_frequencies_summary(filename, size): + return sample_file_summary( + "nv/json/freqs-bin:nv/cold/sample_freqs", + filename, + size, + ) + + +def bwutil_summary(value): + return { + "tag": "nv/cold/bw/global/utilization", + "name": "BWUtil", + "hint": "percentage", + "description": "Global memory utilization", + "data": [ + { + "name": "value", + "type": "float64", + "value": str(value), + } + ], + } + + +@pytest.fixture +def sample_result_path(tmp_path): + bin_dir = tmp_path / "result.json-bin" + bin_dir.mkdir() + (bin_dir / "0.bin").write_bytes(struct.pack("<3f", 1.0, 2.0, 4.0)) + freq_bin_dir = tmp_path / "result.json-freqs-bin" + freq_bin_dir.mkdir() + (freq_bin_dir / "0.bin").write_bytes(struct.pack("<3f", 100.0, 200.0, 400.0)) + + json_fn = tmp_path / "result.json" + write_json( + json_fn, + { + "benchmarks": [ + { + "name": "copy", + "axes": [block_size_axis(8)], + "states": [ + { + "name": "Device=0 BlockSize=2^8", + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "256", + } + ], + "summaries": [ + sample_times_summary("result.json-bin/0.bin", 3), + bwutil_summary(0.75), + sample_frequencies_summary( + "result.json-freqs-bin/0.bin", + 3, + ), + ], + "is_skipped": False, + } + ], + } + ] + }, + ) + return json_fn + + +@pytest.fixture +def sample_result(sample_result_path): + return results.BenchmarkResult.from_json(sample_result_path) + + +@pytest.fixture +def sample_subbenchmark(sample_result): + return sample_result["copy"] + + +@pytest.fixture +def sample_state(sample_subbenchmark): + return sample_subbenchmark[0] + + +def test_result_classes_are_exposed_from_results_namespace(): + assert results.BenchmarkResult.__module__ == results.__name__ + assert results.BenchmarkResultSummary.__module__ == results.__name__ + assert not hasattr(cuda.bench, "BenchmarkResult") + + +def 
test_from_json_preserves_optional_metadata(sample_result_path): + metadata = {"returncode": 0, "elapsed_seconds": 0.25} + + default_result = results.BenchmarkResult.from_json(sample_result_path) + result = results.BenchmarkResult.from_json(sample_result_path, metadata=metadata) + + assert default_result.metadata is None + assert result.metadata is metadata + + +def test_benchmark_result_implements_mapping_protocol(sample_result): + subbenchmark = sample_result["copy"] + + assert len(sample_result) == 1 + assert list(sample_result) == ["copy"] + assert list(sample_result.keys()) == ["copy"] + assert list(sample_result.values()) == [subbenchmark] + assert list(sample_result.items()) == [("copy", subbenchmark)] + assert "copy" in sample_result + assert "missing" not in sample_result + assert subbenchmark is sample_result.subbenches["copy"] + with pytest.raises(KeyError): + sample_result["missing"] + + +def test_subbenchmark_result_implements_sequence_protocol(sample_subbenchmark): + state = sample_subbenchmark[0] + + assert len(sample_subbenchmark) == 1 + assert sample_subbenchmark[-1] is state + assert sample_subbenchmark[:] == sample_subbenchmark.states + assert list(sample_subbenchmark) == sample_subbenchmark.states + with pytest.raises(IndexError): + sample_subbenchmark[1] + + +def test_state_parses_axis_name_and_bandwidth(sample_state): + assert sample_state.name() == "BlockSize[pow2]=8" + assert sample_state.bw == 0.75 + + +def test_state_stores_rich_summary_metadata(sample_state): + bw_summary = sample_state.summaries["nv/cold/bw/global/utilization"] + + assert bw_summary.tag == "nv/cold/bw/global/utilization" + assert bw_summary.name == "BWUtil" + assert bw_summary.hint == "percentage" + assert bw_summary.hide is None + assert bw_summary.description == "Global memory utilization" + assert bw_summary.value == pytest.approx(0.75) + assert bw_summary["value"] == pytest.approx(0.75) + assert sample_state.summaries["nv/json/bin:nv/cold/sample_times"].data == { + "filename": "result.json-bin/0.bin", + "size": 3, + } + assert sample_state.summaries["nv/json/freqs-bin:nv/cold/sample_freqs"].data == { + "filename": "result.json-freqs-bin/0.bin", + "size": 3, + } + + +def test_state_preserves_null_summary_values(tmp_path): + json_fn = tmp_path / "result.json" + write_json( + json_fn, + { + "benchmarks": [ + { + "name": "copy", + "axes": [], + "states": [ + { + "name": "Device=0", + "axis_values": [], + "summaries": [ + { + "tag": "nv/cold/time/gpu/stdev/relative", + "name": "Noise", + "hint": "percentage", + "data": [ + { + "name": "value", + "type": "float64", + "value": None, + } + ], + } + ], + "is_skipped": False, + } + ], + } + ] + }, + ) + + summary = results.BenchmarkResult.from_json(json_fn)["copy"][0].summaries[ + "nv/cold/time/gpu/stdev/relative" + ] + + assert summary.value is None + assert summary["value"] is None + + +def test_state_reports_malformed_numeric_summary_values(tmp_path): + json_fn = tmp_path / "result.json" + write_json( + json_fn, + { + "benchmarks": [ + { + "name": "copy", + "axes": [], + "states": [ + { + "name": "Device=0", + "axis_values": [], + "summaries": [ + { + "tag": "nv/cold/time/gpu/mean", + "name": "GPU Time", + "hint": "duration", + "data": [ + { + "name": "value", + "type": "float64", + "value": "not-a-number", + } + ], + } + ], + "is_skipped": False, + } + ], + } + ] + }, + ) + + with pytest.raises( + ValueError, + match=( + "summary 'nv/cold/time/gpu/mean' field 'value' " + "value 'not-a-number' is not a float64" + ), + ): + 
results.BenchmarkResult.from_json(json_fn) + + +def test_state_loads_samples_and_frequencies(sample_state): + assert sample_state.samples is not None + assert list(sample_state.samples) == pytest.approx([1.0, 2.0, 4.0]) + assert sample_state.frequencies is not None + assert list(sample_state.frequencies) == pytest.approx([100.0, 200.0, 400.0]) + + +def test_centers_apply_estimators_to_samples(sample_result): + centers = sample_result.centers(lambda samples: sum(samples) / len(samples)) + + assert centers == {"copy": {"BlockSize[pow2]=8": pytest.approx(7.0 / 3.0)}} + + +def test_centers_with_frequencies_apply_estimators(sample_result, sample_subbenchmark): + def weighted_mean(samples, frequencies): + return sum( + sample * frequency for sample, frequency in zip(samples, frequencies) + ) / sum(frequencies) + + weighted_centers = sample_result.centers_with_frequencies(weighted_mean) + + assert weighted_centers == {"copy": {"BlockSize[pow2]=8": pytest.approx(3.0)}} + assert ( + sample_subbenchmark.centers_with_frequencies(weighted_mean) + == weighted_centers["copy"] + ) + + +def test_benchmark_result_constructor_is_private(): + with pytest.raises(TypeError, match="from_json\\(\\).*empty\\(\\)"): + results.BenchmarkResult() + with pytest.raises(TypeError, match="from_json\\(\\).*empty\\(\\)"): + results.BenchmarkResult("result.json") + with pytest.raises(TypeError): + results.BenchmarkResult(metadata=None) + with pytest.raises(TypeError): + results.BenchmarkResult(json_path="result.json", parse=False) + + +def test_benchmark_result_empty_does_not_read_json(tmp_path): + @dataclass + class RunMetadata: + returncode: int + elapsed_seconds: float + + metadata = RunMetadata(returncode=1, elapsed_seconds=0.25) + missing_json = tmp_path / "missing.json" + + result = results.BenchmarkResult.empty(metadata=metadata) + + assert result.metadata is metadata + assert result.subbenches == {} + + with pytest.raises(FileNotFoundError): + results.BenchmarkResult.from_json(missing_json, metadata=metadata) + with pytest.raises(FileNotFoundError): + results.BenchmarkResult.from_json(json_path=missing_json, metadata=metadata) + + +def test_benchmark_result_accepts_no_axis_benchmark_with_recorded_binary_path( + tmp_path, monkeypatch +): + data_dir = tmp_path / "temp_data" + data_dir.mkdir() + bin_dir = data_dir / "axes_run1.json-bin" + bin_dir.mkdir() + (bin_dir / "0.bin").write_bytes(struct.pack("<2f", 1.0, 4.0)) + freq_bin_dir = data_dir / "axes_run1.json-freqs-bin" + freq_bin_dir.mkdir() + (freq_bin_dir / "0.bin").write_bytes(struct.pack("<2f", 100.0, 400.0)) + + json_fn = data_dir / "axes_run1.json" + write_json( + json_fn, + { + "benchmarks": [ + { + "name": "simple", + "axes": None, + "states": [ + { + "name": "Device=0", + "axis_values": None, + "summaries": [ + sample_times_summary( + "temp_data/axes_run1.json-bin/0.bin", + 2, + ), + sample_frequencies_summary( + "temp_data/axes_run1.json-freqs-bin/0.bin", + 2, + ), + ], + "is_skipped": False, + } + ], + } + ] + }, + ) + + monkeypatch.chdir(tmp_path) + + result = results.BenchmarkResult.from_json("temp_data/axes_run1.json") + + state = result.subbenches["simple"].states[0] + assert state.name() == "Device=0" + assert state.point == {} + assert state.samples is not None + assert list(state.samples) == pytest.approx([1.0, 4.0]) + assert state.frequencies is not None + assert list(state.frequencies) == pytest.approx([100.0, 400.0]) + + +def test_benchmark_result_accepts_axis_value_input_string(): + result = results.SubBenchmarkResult( + { + "name": 
"single_float64_axis", + "axes": [ + { + "name": "Duration", + "type": "float64", + "flags": "", + "values": [ + { + "input_string": "0", + "description": "", + "value": 0.0, + } + ], + } + ], + "states": [ + { + "name": "Device=0 Duration=0", + "axis_values": [ + { + "name": "Duration", + "type": "float64", + "value": "0", + } + ], + "summaries": [], + "is_skipped": False, + } + ], + }, + "", + ) + + state = result.states[0] + assert state.name() == "Duration=0" + assert state.point == {"Duration": "0"} + + +def test_benchmark_result_normalizes_axis_value_lookup_key(): + result = results.SubBenchmarkResult( + { + "name": "num_blocks", + "axes": [ + { + "name": "NumBlocks", + "type": "int64", + "flags": "", + "values": [ + { + "input_string": "64", + "description": "", + "value": 64, + }, + { + "input_string": "default", + "description": "", + "value": None, + }, + ], + } + ], + "states": [ + { + "name": "Device=0 NumBlocks=64", + "axis_values": [ + { + "name": "NumBlocks", + "type": "int64", + "value": 64, + } + ], + "summaries": [], + "is_skipped": False, + }, + { + "name": "Device=0 NumBlocks=default", + "axis_values": [ + { + "name": "NumBlocks", + "type": "int64", + "value": None, + } + ], + "summaries": [], + "is_skipped": False, + }, + { + "name": "Device=0 NumBlocks=64", + "axis_values": [ + { + "name": "NumBlocks", + "type": "int64", + "input_string": "64", + } + ], + "summaries": [], + "is_skipped": False, + }, + ], + }, + "", + ) + + assert result.states[0].point == {"NumBlocks": "64"} + assert result.states[1].point == {"NumBlocks": "default"} + assert result.states[2].point == {"NumBlocks": "64"} + + +def test_benchmark_result_preserves_skipped_state_with_no_summaries(): + result = results.SubBenchmarkResult( + { + "name": "copy_sweep_grid_shape", + "axes": [block_size_axis(6, 8)], + "states": [ + { + "name": "Device=0 BlockSize=2^8", + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "256", + } + ], + "summaries": None, + "is_skipped": True, + }, + { + "name": "Device=0 BlockSize=2^6", + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "64", + } + ], + "summaries": [], + "is_skipped": False, + }, + ], + }, + "", + ) + + assert len(result.states) == 2 + assert result.states[0].name() == "BlockSize[pow2]=8" + assert result.states[0].is_skipped is True + assert result.states[0].summaries == {} + assert result.states[0].samples is None + assert result.states[0].frequencies is None + assert result.states[1].name() == "BlockSize[pow2]=6" + assert result.states[1].is_skipped is False + + +def test_benchmark_result_uses_empty_summaries_when_field_is_missing(): + result = results.SubBenchmarkResult( + { + "name": "copy_sweep_grid_shape", + "axes": [block_size_axis(8)], + "states": [ + { + "name": "Device=0 BlockSize=2^8", + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "256", + } + ], + "is_skipped": False, + }, + ], + }, + "", + ) + + state = result.states[0] + assert state.name() == "BlockSize[pow2]=8" + assert state.summaries == {} + assert state.samples is None + assert state.frequencies is None + assert state.bw is None + + +@pytest.mark.parametrize( + "field_name,bad_type,expected_type", + [ + ("filename", "int64", "string"), + ("size", "string", "int64"), + ], +) +def test_benchmark_result_validates_binary_summary_field_types( + field_name, bad_type, expected_type +): + summary = sample_times_summary("result.json-bin/0.bin", 3) + for value_data in summary["data"]: + if value_data["name"] == 
field_name: + value_data["type"] = bad_type + if field_name == "filename": + value_data["value"] = "123" + + with pytest.raises( + ValueError, + match=rf"field '{field_name}' has type '{bad_type}'; expected '{expected_type}'", + ): + results.SubBenchmarkResult( + { + "name": "copy", + "axes": [], + "states": [ + { + "name": "Device=0", + "axis_values": [], + "summaries": [summary], + "is_skipped": False, + } + ], + }, + "", + ) + + +def test_benchmark_result_uses_none_for_unavailable_samples(tmp_path): + json_fn = tmp_path / "result.json" + write_json( + json_fn, + { + "benchmarks": [ + { + "name": "copy", + "axes": [block_size_axis(8, 9)], + "states": [ + { + "name": "Device=0 BlockSize=2^8", + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "256", + } + ], + "summaries": [], + "is_skipped": False, + }, + { + "name": "Device=0 BlockSize=2^9", + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "512", + } + ], + "summaries": [ + sample_times_summary( + "result.json-bin/missing.bin", + 3, + ), + sample_frequencies_summary( + "result.json-freqs-bin/missing.bin", + 3, + ), + ], + "is_skipped": False, + }, + ], + } + ] + }, + ) + + result = results.BenchmarkResult.from_json(json_fn) + + states = result.subbenches["copy"].states + assert states[0].samples is None + assert states[1].samples is None + assert states[0].frequencies is None + assert states[1].frequencies is None + assert result.centers(lambda samples: pytest.fail("estimator should not run")) == { + "copy": { + "BlockSize[pow2]=8": None, + "BlockSize[pow2]=9": None, + } + } + assert result.centers_with_frequencies( + lambda samples, frequencies: pytest.fail("estimator should not run") + ) == { + "copy": { + "BlockSize[pow2]=8": None, + "BlockSize[pow2]=9": None, + } + } + + +def test_benchmark_result_rejects_mismatched_sample_and_frequency_counts(tmp_path): + bin_dir = tmp_path / "result.json-bin" + bin_dir.mkdir() + (bin_dir / "0.bin").write_bytes(struct.pack("<3f", 1.0, 2.0, 4.0)) + freq_bin_dir = tmp_path / "result.json-freqs-bin" + freq_bin_dir.mkdir() + (freq_bin_dir / "0.bin").write_bytes(struct.pack("<2f", 100.0, 200.0)) + + json_fn = tmp_path / "result.json" + write_json( + json_fn, + { + "benchmarks": [ + { + "name": "copy", + "axes": [block_size_axis(8)], + "states": [ + { + "name": "Device=0 BlockSize=2^8", + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "256", + } + ], + "summaries": [ + sample_times_summary("result.json-bin/0.bin", 3), + sample_frequencies_summary( + "result.json-freqs-bin/0.bin", + 2, + ), + ], + "is_skipped": False, + } + ], + } + ] + }, + ) + + with pytest.raises(ValueError, match="sample count .* frequency count"): + results.BenchmarkResult.from_json(json_fn) diff --git a/python/test/test_nvbench_json_summary.py b/python/test/test_nvbench_json_summary.py new file mode 100644 index 0000000..3bba73b --- /dev/null +++ b/python/test/test_nvbench_json_summary.py @@ -0,0 +1,376 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
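
As a usage note for the `nvbench-json-summary` entry point registered in `pyproject.toml` (and exercised by the tests below): `main()` takes an explicit argv list, so the script can also be driven programmatically. A small sketch, with `result.json` and `summary.md` as placeholder paths:

```python
from pathlib import Path

from scripts.nvbench_json_summary import main

# Equivalent to: nvbench-json-summary result.json --output summary.md
exit_code = main(["result.json", "--output", "summary.md"])
print(exit_code)
print(Path("summary.md").read_text(encoding="utf-8").splitlines()[0])
```
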
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import importlib.util +import json +from pathlib import Path + + +def load_nvbench_json_summary(): + module_path = ( + Path(__file__).resolve().parents[1] / "scripts" / "nvbench_json_summary.py" + ) + spec = importlib.util.spec_from_file_location("nvbench_json_summary", module_path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +nvbench_json_summary = load_nvbench_json_summary() + + +def write_result_json(path): + path.write_text( + json.dumps( + { + "devices": [ + { + "id": 0, + "name": "Test GPU", + } + ], + "benchmarks": [ + { + "name": "copy", + "devices": [0], + "axes": [ + { + "name": "BlockSize", + "type": "int64", + "flags": "pow2", + "values": [ + { + "input_string": "8", + "description": "2^8 = 256", + "value": 256, + } + ], + } + ], + "states": [ + { + "name": "Device=0 BlockSize=2^8", + "device": 0, + "type_config_index": 0, + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "256", + } + ], + "summaries": [ + { + "tag": "nv/cold/time/gpu/sample_size", + "name": "Samples", + "hint": "sample_size", + "data": [ + { + "name": "value", + "type": "int64", + "value": "12", + } + ], + }, + { + "tag": "nv/cold/time/gpu/mean", + "name": "GPU Time", + "hint": "duration", + "data": [ + { + "name": "value", + "type": "float64", + "value": "1.25e-6", + } + ], + }, + { + "tag": "nv/cold/time/gpu/stdev/relative", + "name": "Noise", + "hint": "percentage", + "data": [ + { + "name": "value", + "type": "float64", + "value": "0.015", + } + ], + }, + { + "tag": "nv/cold/bw/global/bytes_per_second", + "name": "GlobalMem BW", + "hint": "byte_rate", + "data": [ + { + "name": "value", + "type": "float64", + "value": "2.5e9", + } + ], + }, + { + "tag": "nv/cold/bw/global/utilization", + "name": "BWUtil", + "hint": "percentage", + "hide": False, + "data": [ + { + "name": "value", + "type": "float64", + "value": "0.625", + } + ], + }, + { + "tag": "nv/cold/time/gpu/min", + "name": "Min GPU Time", + "hint": "duration", + "hide": "Hidden by default.", + "data": [ + { + "name": "value", + "type": "float64", + "value": "1.0e-6", + } + ], + }, + ], + "is_skipped": False, + } + ], + } + ], + } + ), + encoding="utf-8", + ) + + +def test_json_summary_formats_nvbench_style_markdown(tmp_path): + json_path = tmp_path / "result.json" + write_result_json(json_path) + + result = nvbench_json_summary.BenchmarkResult.from_json(json_path) + report = nvbench_json_summary.format_result(result) + + assert "# Benchmark Results" in report + assert "## copy" in report + assert "### [0] Test GPU" in report + assert ( + "| BlockSize | Samples | GPU Time | Noise | GlobalMem BW | BWUtil |" in report + ) + assert ( + "| 2^8 = 256 | 12x | 1.250 us | 1.50% | 2.500 GB/s | 62.50% |" in report + ) + assert "Min GPU Time" not in report + + +def test_json_summary_formats_null_summary_value_as_blank(): + summary = nvbench_json_summary.BenchmarkResultSummary( + tag="nv/cold/time/gpu/stdev/relative", + name="Noise", + hint="percentage", + hide=None, + description=None, + data={"value": None}, + ) + + assert nvbench_json_summary.format_summary(summary) == "" + + +def test_json_summary_formats_axis_values_like_markdown_printer(): + axes_by_name = { + "BlockSize": { + "name": "BlockSize", + "type": "int64", + "flags": "pow2", + }, + "NumBlocks": { + "name": "NumBlocks", + "type": "int64", + "flags": "", + }, + "Duration": { + "name": "Duration", 
+ "type": "float64", + "flags": "", + }, + "Nullable": { + "name": "Nullable", + "type": "int64", + "flags": "", + }, + } + + assert nvbench_json_summary.format_axis_value( + {"name": "BlockSize", "type": "int64", "value": "256"}, axes_by_name + ) == ("BlockSize", "2^8 = 256") + assert nvbench_json_summary.format_axis_value( + {"name": "NumBlocks", "type": "int64", "value": "64"}, axes_by_name + ) == ("NumBlocks", "64") + assert nvbench_json_summary.format_axis_value( + {"name": "Duration", "type": "float64", "value": "0.123456789"}, + axes_by_name, + ) == ("Duration", "0.12346") + assert nvbench_json_summary.format_axis_value( + {"name": "Nullable", "type": "int64", "value": None}, axes_by_name + ) == ("Nullable", "") + + +def test_json_summary_formats_state_with_null_axis_values(tmp_path): + json_path = tmp_path / "result.json" + json_path.write_text( + json.dumps( + { + "devices": [ + { + "id": 0, + "name": "Test GPU", + } + ], + "benchmarks": [ + { + "name": "no_axes", + "devices": [0], + "axes": None, + "states": [ + { + "name": "Device=0", + "device": 0, + "axis_values": None, + "summaries": [ + { + "tag": "nv/cold/time/gpu/sample_size", + "name": "Samples", + "hint": "sample_size", + "data": [ + { + "name": "value", + "type": "int64", + "value": "7", + } + ], + } + ], + "is_skipped": False, + } + ], + } + ], + } + ), + encoding="utf-8", + ) + + result = nvbench_json_summary.BenchmarkResult.from_json(json_path) + report = nvbench_json_summary.format_result(result) + + assert "## no_axes" in report + assert "| Samples |" in report + assert "| 7x |" in report + + +def test_json_summary_omits_skipped_states(tmp_path): + json_path = tmp_path / "result.json" + json_path.write_text( + json.dumps( + { + "devices": [ + { + "id": 0, + "name": "Test GPU", + } + ], + "benchmarks": [ + { + "name": "copy", + "devices": [0], + "axes": [ + { + "name": "BlockSize", + "type": "int64", + "flags": "pow2", + "values": [ + { + "input_string": "8", + "description": "2^8 = 256", + "value": 256, + }, + { + "input_string": "9", + "description": "2^9 = 512", + "value": 512, + }, + ], + } + ], + "states": [ + { + "name": "Device=0 BlockSize=2^8", + "device": 0, + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "256", + } + ], + "summaries": None, + "is_skipped": True, + "skip_reason": "Deadlock detected", + }, + { + "name": "Device=0 BlockSize=2^9", + "device": 0, + "axis_values": [ + { + "name": "BlockSize", + "type": "int64", + "value": "512", + } + ], + "summaries": [ + { + "tag": "nv/cold/time/gpu/sample_size", + "name": "Samples", + "hint": "sample_size", + "data": [ + { + "name": "value", + "type": "int64", + "value": "3", + } + ], + } + ], + "is_skipped": False, + }, + ], + } + ], + } + ), + encoding="utf-8", + ) + + result = nvbench_json_summary.BenchmarkResult.from_json(json_path) + report = nvbench_json_summary.format_result(result) + + assert "Skip Reason" not in report + assert "Deadlock detected" not in report + assert "2^8 = 256" not in report + assert "2^9 = 512" in report + assert "3x" in report + + +def test_json_summary_cli_writes_output_file(tmp_path): + json_path = tmp_path / "result.json" + output_path = tmp_path / "summary.md" + write_result_json(json_path) + + rc = nvbench_json_summary.main([str(json_path), "--output", str(output_path)]) + + assert rc == 0 + assert "GlobalMem BW" in output_path.read_text(encoding="utf-8")
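
Reviewer note (outside the patch itself): the sketch below shows one way the results API exercised by these new tests could be consumed. It is based only on behaviors the tests above assert; the "result.json" path, the median estimator, and the example CLI invocation are illustrative assumptions, not part of this change.

    from statistics import median

    from cuda.bench import results

    # Load an NVBench JSON report produced by a benchmark run.
    # ("result.json" is an illustrative path; this patch does not create it.)
    result = results.BenchmarkResult.from_json("result.json")

    # Each sub-benchmark exposes its measured states; skipped states keep
    # empty summaries and no sample data, as the tests above assert.
    for bench_name, sub in result.subbenches.items():
        for state in sub.states:
            if state.is_skipped:
                continue
            print(bench_name, state.name(), state.point)

    # Reduce per-state sample times with a caller-supplied estimator
    # (median is an arbitrary choice here). States whose binary sample
    # files are unavailable map to None instead of invoking the estimator.
    centers = result.centers(lambda samples: median(samples))

    # The companion script covered by test_nvbench_json_summary.py can be
    # driven the same way its CLI test does, e.g.:
    #   python python/scripts/nvbench_json_summary.py result.json --output summary.md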