diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py
index 7eb4fb0..8214f5b 100644
--- a/python/cuda/bench/__init__.py
+++ b/python/cuda/bench/__init__.py
@@ -18,8 +18,6 @@
 import importlib
 import importlib.metadata
 import warnings
 
-from ._bench_result import BenchmarkResult, SubBenchResult, SubBenchState
-
 try:
     __version__ = importlib.metadata.version("cuda-bench")
 except Exception as e:
@@ -31,10 +29,6 @@ except Exception as e:
     )
 
 
-BenchmarkResult.__module__ = __name__
-SubBenchResult.__module__ = __name__
-SubBenchState.__module__ = __name__
-
 _NVBENCH_EXPORTS = (
     "Benchmark",
     "CudaStream",
@@ -51,9 +45,6 @@ _NVBENCH_TEST_EXPORTS = (
 )
 
 __all__ = [
-    "BenchmarkResult",
-    "SubBenchResult",
-    "SubBenchState",
     *_NVBENCH_EXPORTS,
 ]
diff --git a/python/cuda/bench/__init__.pyi b/python/cuda/bench/__init__.pyi
index 8773d1a..9e0d264 100644
--- a/python/cuda/bench/__init__.pyi
+++ b/python/cuda/bench/__init__.pyi
@@ -25,31 +25,18 @@
 # stubs in generated out/cuda/nvbench/_nvbench.pyi
 # with definitions given here.
 
-from array import array
 from collections.abc import (
     Callable,
-    ItemsView,
-    Iterator,
-    KeysView,
     Sequence,
-    ValuesView,
 )
-from os import PathLike
 from typing import (
-    Any,
     Optional,
     Self,
     SupportsFloat,
     SupportsInt,
-    TypeVar,
     Union,
-    overload,
 )
 
-ResultT = TypeVar("ResultT")
-_SummaryValue = int | float | str
-_SummaryData = _SummaryValue | dict[str, _SummaryValue]
-
 class CudaStream:
     def __cuda_stream__(self) -> tuple[int, int]: ...
     def addressof(self) -> int: ...
@@ -138,60 +125,3 @@ def register(fn: Callable[[State], None]) -> Benchmark: ...
 def run_all_benchmarks(argv: Sequence[str]) -> None: ...
 
 class NVBenchRuntimeError(RuntimeError): ...
-
-class SubBenchState:
-    state_name: str
-    summaries: dict[str, _SummaryData]
-    samples: array | None
-    frequencies: array | None
-    bw: float | None
-    point: dict[str, str]
-    def name(self) -> str: ...
-    def center(self, estimator: Callable[[array], ResultT]) -> ResultT | None: ...
-    def center_with_frequencies(
-        self, estimator: Callable[[array, array], ResultT]
-    ) -> ResultT | None: ...
-
-class SubBenchResult:
-    states: list[SubBenchState]
-    def __len__(self) -> int: ...
-    @overload
-    def __getitem__(self, state_index: int) -> SubBenchState: ...
-    @overload
-    def __getitem__(self, state_index: slice) -> list[SubBenchState]: ...
-    def __iter__(self) -> Iterator[SubBenchState]: ...
-    def centers(
-        self, estimator: Callable[[array], ResultT]
-    ) -> dict[str, ResultT | None]: ...
-    def centers_with_frequencies(
-        self, estimator: Callable[[array, array], ResultT]
-    ) -> dict[str, ResultT | None]: ...
-
-class BenchmarkResult:
-    metadata: Any
-    subbenches: dict[str, SubBenchResult]
-    def __init__(
-        self,
-        *,
-        json_path: str | PathLike[str],
-        metadata: Any = None,
-    ) -> None: ...
-    @classmethod
-    def empty(cls, *, metadata: Any = None) -> Self: ...
-    @classmethod
-    def from_json(
-        cls, json_path: str | PathLike[str], *, metadata: Any = None
-    ) -> Self: ...
-    def __len__(self) -> int: ...
-    def __iter__(self) -> Iterator[str]: ...
-    def __contains__(self, subbench_name: object) -> bool: ...
-    def __getitem__(self, subbench_name: str) -> SubBenchResult: ...
-    def keys(self) -> KeysView[str]: ...
-    def values(self) -> ValuesView[SubBenchResult]: ...
-    def items(self) -> ItemsView[str, SubBenchResult]: ...
-    def centers(
-        self, estimator: Callable[[array], ResultT]
-    ) -> dict[str, dict[str, ResultT | None]]: ...
-    def centers_with_frequencies(
-        self, estimator: Callable[[array, array], ResultT]
-    ) -> dict[str, dict[str, ResultT | None]]: ...
diff --git a/python/cuda/bench/results/__init__.py b/python/cuda/bench/results/__init__.py
new file mode 100644
index 0000000..955d975
--- /dev/null
+++ b/python/cuda/bench/results/__init__.py
@@ -0,0 +1,39 @@
+# Copyright 2026 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 with the LLVM exception
+# (the "License"); you may not use this file except in compliance with
+# the License.
+#
+# You may obtain a copy of the License at
+#
+#     http://llvm.org/foundation/relicensing/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for reading NVBench JSON benchmark result files."""
+
+from ._benchmark_result import (
+    BenchmarkResult,
+    BenchmarkResultDevice,
+    BenchmarkResultSummary,
+    SubBenchmarkResult,
+    SubBenchmarkState,
+)
+
+BenchmarkResult.__module__ = __name__
+BenchmarkResultDevice.__module__ = __name__
+BenchmarkResultSummary.__module__ = __name__
+SubBenchmarkResult.__module__ = __name__
+SubBenchmarkState.__module__ = __name__
+
+__all__ = [
+    "BenchmarkResult",
+    "BenchmarkResultDevice",
+    "BenchmarkResultSummary",
+    "SubBenchmarkResult",
+    "SubBenchmarkState",
+]
diff --git a/python/cuda/bench/results/__init__.pyi b/python/cuda/bench/results/__init__.pyi
new file mode 100644
index 0000000..3435895
--- /dev/null
+++ b/python/cuda/bench/results/__init__.pyi
@@ -0,0 +1,109 @@
+# Copyright 2026 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 with the LLVM exception
+# (the "License"); you may not use this file except in compliance with
+# the License.
+#
+# You may obtain a copy of the License at
+#
+#     http://llvm.org/foundation/relicensing/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from array import array
+from collections.abc import Callable, ItemsView, Iterator, KeysView, ValuesView
+from os import PathLike
+from typing import Any, TypeVar, overload
+
+ResultT = TypeVar("ResultT")
+BenchmarkResultT = TypeVar("BenchmarkResultT", bound="BenchmarkResult")
+_SummaryValue = int | float | str
+
+class BenchmarkResultDevice:
+    id: int
+    name: str
+    data: dict[str, Any]
+
+class BenchmarkResultSummary:
+    tag: str
+    name: str | None
+    hint: str | None
+    hide: str | None
+    description: str | None
+    data: dict[str, _SummaryValue]
+    @property
+    def value(self) -> _SummaryValue | None: ...
+    def __getitem__(self, key: str) -> _SummaryValue: ...
+    def get(
+        self, key: str, default: _SummaryValue | None = None
+    ) -> _SummaryValue | None: ...
+
+class SubBenchmarkState:
+    state_name: str
+    device: int | None
+    type_config_index: int | None
+    axis_values: list[dict[str, Any]]
+    is_skipped: bool
+    skip_reason: str | None
+    summaries: dict[str, BenchmarkResultSummary]
+    samples: array | None
+    frequencies: array | None
+    bw: float | None
+    point: dict[str, str]
+    def name(self) -> str: ...
+    def center(self, estimator: Callable[[array], ResultT]) -> ResultT | None: ...
+    def center_with_frequencies(
+        self, estimator: Callable[[array, array], ResultT]
+    ) -> ResultT | None: ...
+
+class SubBenchmarkResult:
+    name: str
+    devices: list[int]
+    axes: list[dict[str, Any]]
+    states: list[SubBenchmarkState]
+    def __len__(self) -> int: ...
+    @overload
+    def __getitem__(self, state_index: int) -> SubBenchmarkState: ...
+    @overload
+    def __getitem__(self, state_index: slice) -> list[SubBenchmarkState]: ...
+    def __iter__(self) -> Iterator[SubBenchmarkState]: ...
+    def centers(
+        self, estimator: Callable[[array], ResultT]
+    ) -> dict[str, ResultT | None]: ...
+    def centers_with_frequencies(
+        self, estimator: Callable[[array, array], ResultT]
+    ) -> dict[str, ResultT | None]: ...
+
+class BenchmarkResult:
+    metadata: Any
+    devices: dict[int, BenchmarkResultDevice]
+    subbenches: dict[str, SubBenchmarkResult]
+    def __init__(self, token: object | None = None) -> None: ...
+    @classmethod
+    def empty(
+        cls: type[BenchmarkResultT], *, metadata: Any = None
+    ) -> BenchmarkResultT: ...
+    @classmethod
+    def from_json(
+        cls: type[BenchmarkResultT],
+        json_path: str | PathLike[str],
+        *,
+        metadata: Any = None,
+    ) -> BenchmarkResultT: ...
+    def __len__(self) -> int: ...
+    def __iter__(self) -> Iterator[str]: ...
+    def __contains__(self, subbench_name: object) -> bool: ...
+    def __getitem__(self, subbench_name: str) -> SubBenchmarkResult: ...
+    def keys(self) -> KeysView[str]: ...
+    def values(self) -> ValuesView[SubBenchmarkResult]: ...
+    def items(self) -> ItemsView[str, SubBenchmarkResult]: ...
+    def centers(
+        self, estimator: Callable[[array], ResultT]
+    ) -> dict[str, dict[str, ResultT | None]]: ...
+    def centers_with_frequencies(
+        self, estimator: Callable[[array, array], ResultT]
+    ) -> dict[str, dict[str, ResultT | None]]: ...
diff --git a/python/cuda/bench/_bench_result.py b/python/cuda/bench/results/_benchmark_result.py
similarity index 68%
rename from python/cuda/bench/_bench_result.py
rename to python/cuda/bench/results/_benchmark_result.py
index 6072041..d0806c7 100644
--- a/python/cuda/bench/_bench_result.py
+++ b/python/cuda/bench/results/_benchmark_result.py
@@ -14,19 +14,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from __future__ import annotations
+
 import array
 import json
 import os
 import sys
 from collections.abc import ItemsView, Iterator, KeysView, ValuesView
+from dataclasses import dataclass
 from typing import Any, Callable, TypeVar
 
-__all__ = ["BenchmarkResult", "SubBenchResult", "SubBenchState"]
+__all__ = [
+    "BenchmarkResult",
+    "BenchmarkResultDevice",
+    "BenchmarkResultSummary",
+    "SubBenchmarkResult",
+    "SubBenchmarkState",
+]
 
 ResultT = TypeVar("ResultT")
 BenchmarkResultT = TypeVar("BenchmarkResultT", bound="BenchmarkResult")
 _SummaryValue = int | float | str
-_SummaryData = _SummaryValue | dict[str, _SummaryValue]
+
+
+@dataclass(frozen=True)
+class BenchmarkResultDevice:
+    """Device metadata parsed from an NVBench JSON result file."""
+
+    id: int
+    name: str
+    data: dict[str, Any]
 
 
 def read_json(filename: str | os.PathLike[str]) -> dict:
@@ -49,13 +66,6 @@ def extract_size(summary: dict) -> int:
     return int(value_data["value"])
 
 
-def extract_bw(summary: dict) -> float:
-    summary_data = summary["data"]
-    value_data = next(filter(lambda v: v["name"] == "value", summary_data))
-    assert value_data["type"] == "float64"
-    return float(value_data["value"])
-
-
 def parse_summary_value(value_data: dict) -> _SummaryValue:
     value_type = value_data["type"]
     value = value_data["value"]
@@ -68,19 +78,48 @@ def parse_summary_value(value_data: dict) -> _SummaryValue:
     raise ValueError(f"unsupported summary value type: {value_type}")
 
 
-def parse_summary_data(summary: dict) -> _SummaryData:
-    summary_values = {
+@dataclass(frozen=True)
+class BenchmarkResultSummary:
+    """Summary record parsed from one NVBench benchmark state."""
+
+    tag: str
+    name: str | None
+    hint: str | None
+    hide: str | None
+    description: str | None
+    data: dict[str, _SummaryValue]
+
+    @property
+    def value(self) -> _SummaryValue | None:
+        return self.data.get("value")
+
+    def __getitem__(self, key: str) -> _SummaryValue:
+        return self.data[key]
+
+    def get(
+        self, key: str, default: _SummaryValue | None = None
+    ) -> _SummaryValue | None:
+        return self.data.get(key, default)
+
+
+def parse_summary(summary: dict) -> BenchmarkResultSummary:
+    data = {
         value_data["name"]: parse_summary_value(value_data)
-        for value_data in summary["data"]
+        for value_data in summary.get("data", [])
     }
-    if len(summary_values) == 1 and "value" in summary_values:
-        return summary_values["value"]
-    return summary_values
+    return BenchmarkResultSummary(
+        tag=summary["tag"],
+        name=summary.get("name"),
+        hint=summary.get("hint"),
+        hide=summary.get("hide"),
+        description=summary.get("description"),
+        data=data,
+    )
 
 
-def parse_summaries(state: dict) -> dict[str, _SummaryData]:
+def parse_summaries(state: dict) -> dict[str, BenchmarkResultSummary]:
     return {
-        summary["tag"]: parse_summary_data(summary) for summary in state["summaries"]
+        summary["tag"]: parse_summary(summary) for summary in state["summaries"] or []
     }
 
 
@@ -169,17 +208,12 @@ def parse_frequencies(state: dict, json_dir: str) -> array.array | None:
     return parse_float32_binary(frequency_count, frequencies_filename, json_dir)
 
 
-def parse_bw(state: dict) -> float | None:
-    bwutil = next(
-        filter(
-            lambda s: s["tag"] == "nv/cold/bw/global/utilization", state["summaries"]
-        ),
-        None,
-    )
-    if not bwutil:
+def parse_bw(summaries: dict[str, BenchmarkResultSummary]) -> float | None:
+    bwutil = summaries.get("nv/cold/bw/global/utilization")
+    if bwutil is None or bwutil.value is None:
         return None
-    return extract_bw(bwutil)
+    return float(bwutil.value)
 
 
 def get_axis_name(axis: dict) -> str:
@@ -189,9 +223,16 @@
     return name
 
 
-class SubBenchState:
+class SubBenchmarkState:
+    """Result data for one executed state of an NVBench benchmark."""
+
     def __init__(self, state: dict, axes_names: dict, axes_values: dict, json_dir: str):
         self.state_name = state["name"]
+        self.device = state.get("device")
+        self.type_config_index = state.get("type_config_index")
+        self.axis_values = state.get("axis_values") or []
+        self.is_skipped = state.get("is_skipped", False)
+        self.skip_reason = state.get("skip_reason")
         self.summaries = parse_summaries(state)
         self.samples = parse_samples(state, json_dir)
         self.frequencies = parse_frequencies(state, json_dir)
@@ -204,10 +245,10 @@ class SubBenchState:
                 f"sample count ({len(self.samples)}) does not match "
                 f"frequency count ({len(self.frequencies)})"
             )
-        self.bw = parse_bw(state)
+        self.bw = parse_bw(self.summaries)
 
         self.point = {}
-        for axis in state["axis_values"] or []:
+        for axis in self.axis_values:
             axis_name = axis["name"]
             name = axes_names[axis_name]
             value = axes_values[axis_name][axis["value"]]
@@ -234,11 +275,17 @@ class SubBenchState:
         return estimator(self.samples, self.frequencies)
 
 
-class SubBenchResult:
+class SubBenchmarkResult:
+    """Result data for one NVBench benchmark and its executed states."""
+
     def __init__(self, bench: dict, json_dir: str):
+        self.name = bench["name"]
+        self.devices = bench.get("devices") or []
+        self.axes = bench.get("axes") or []
+
        axes_names = {}
         axes_values = {}
-        for axis in bench["axes"] or []:
+        for axis in self.axes:
             short_name = axis["name"]
             full_name = get_axis_name(axis)
             this_axis_values = {}
@@ -252,9 +299,9 @@ class SubBenchResult:
 
         self.states = []
         for state in bench["states"]:
-            if not state["is_skipped"]:
+            if not state.get("is_skipped", False):
                 self.states.append(
-                    SubBenchState(state, axes_names, axes_values, json_dir)
+                    SubBenchmarkState(state, axes_names, axes_values, json_dir)
                 )
 
     def __repr__(self) -> str:
@@ -265,10 +312,10 @@ class SubBenchResult:
 
     def __getitem__(
         self, state_index: int | slice
-    ) -> SubBenchState | list[SubBenchState]:
+    ) -> SubBenchmarkState | list[SubBenchmarkState]:
         return self.states[state_index]
 
-    def __iter__(self) -> Iterator[SubBenchState]:
+    def __iter__(self) -> Iterator[SubBenchmarkState]:
         return iter(self.states)
 
     def centers(
@@ -289,23 +336,39 @@
 
 
 class BenchmarkResult:
-    """Parsed result data from an NVBench JSON output file."""
+    """Container for benchmark result data parsed from NVBench JSON output.
+
+    Instances are created with :meth:`from_json` or :meth:`empty`. Direct
+    construction is intentionally disabled to keep creation paths explicit.
+    """
+
+    _construction_token = object()
 
     def __init__(
         self,
-        *,
-        json_path: str | os.PathLike[str],
-        metadata: Any = None,
+        token=None,
     ):
-        self.metadata = metadata
-        self.subbenches: dict[str, SubBenchResult] = {}
-        self._parse_json(json_path)
+        """Initialize an instance created by a BenchmarkResult class method.
+
+        Users should call :meth:`from_json` or :meth:`empty` instead. The token
+        argument is an implementation detail used to prevent direct
+        construction.
+        """
+        if token is not self._construction_token:
+            raise TypeError(
+                "BenchmarkResult cannot be constructed directly; "
+                "use BenchmarkResult.from_json() or BenchmarkResult.empty()"
+            )
+
+        self.metadata: Any = None
+        self.devices: dict[int, BenchmarkResultDevice] = {}
+        self.subbenches: dict[str, SubBenchmarkResult] = {}
 
     @classmethod
     def empty(cls: type[BenchmarkResultT], *, metadata: Any = None) -> BenchmarkResultT:
-        result = cls.__new__(cls)
+        """Create an empty result container with optional user metadata."""
+        result = cls(cls._construction_token)
         result.metadata = metadata
-        result.subbenches = {}
         return result
 
     @classmethod
@@ -315,14 +378,27 @@ class BenchmarkResult:
         *,
         metadata: Any = None,
     ) -> BenchmarkResultT:
-        return cls(json_path=json_path, metadata=metadata)
+        """Read benchmark result data from an NVBench JSON output file."""
+        result = cls.empty(metadata=metadata)
+        result._parse_json(json_path)
+        return result
 
     def _parse_json(self, json_path: str | os.PathLike[str]) -> None:
+        """Populate this instance from an NVBench JSON output file."""
         json_path = os.fspath(json_path)
         json_dir = os.path.dirname(os.path.abspath(json_path))
-        for bench in read_json(json_path)["benchmarks"]:
+        result_json = read_json(json_path)
+        self.devices = {
+            int(device["id"]): BenchmarkResultDevice(
+                id=int(device["id"]),
+                name=device["name"],
+                data=device,
+            )
+            for device in result_json.get("devices", [])
+        }
+        for bench in result_json["benchmarks"]:
             bench_name: str = bench["name"]
-            self.subbenches[bench_name] = SubBenchResult(bench, json_dir)
+            self.subbenches[bench_name] = SubBenchmarkResult(bench, json_dir)
 
     def __repr__(self) -> str:
         return str(self.__dict__)
@@ -336,16 +412,16 @@ class BenchmarkResult:
     def __contains__(self, subbench_name: object) -> bool:
         return subbench_name in self.subbenches
 
-    def __getitem__(self, subbench_name: str) -> SubBenchResult:
+    def __getitem__(self, subbench_name: str) -> SubBenchmarkResult:
         return self.subbenches[subbench_name]
 
     def keys(self) -> KeysView[str]:
         return self.subbenches.keys()
 
-    def values(self) -> ValuesView[SubBenchResult]:
+    def values(self) -> ValuesView[SubBenchmarkResult]:
         return self.subbenches.values()
 
-    def items(self) -> ItemsView[str, SubBenchResult]:
+    def items(self) -> ItemsView[str, SubBenchmarkResult]:
         return self.subbenches.items()
 
     def centers(
diff --git a/python/pyproject.toml b/python/pyproject.toml
index f7ddf78..6a23ff9 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,3 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 [build-system]
 requires = ["scikit-build-core>=0.10", "setuptools_scm"]
 build-backend = "scikit_build_core.build"
@@ -52,6 +55,7 @@ tools = [
 [project.scripts]
 nvbench-compare = "scripts.nvbench_compare:main"
 nvbench-histogram = "scripts.nvbench_histogram:main"
+nvbench-json-summary = "scripts.nvbench_json_summary:main"
 nvbench-walltime = "scripts.nvbench_walltime:main"
 
 [project.urls]
@@ -85,4 +89,5 @@ fallback_version = "0.0.0"
 [tool.scikit-build.wheel.packages]
 "cuda" = "cuda"
 "cuda/bench" = "cuda/bench"
+"cuda/bench/results" = "cuda/bench/results"
 "scripts" = "scripts"
diff --git a/python/scripts/nvbench_json_summary.py b/python/scripts/nvbench_json_summary.py
new file mode 100644
index 0000000..813b57d
--- /dev/null
+++ b/python/scripts/nvbench_json_summary.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python
+#
+# Copyright 2026 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 with the LLVM exception
+# (the "License"); you may not use this file except in compliance with
+# the License.
+#
+# You may obtain a copy of the License at
+#
+#     http://llvm.org/foundation/relicensing/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+
+from cuda.bench.results import (
+    BenchmarkResult,
+    BenchmarkResultSummary,
+    SubBenchmarkResult,
+    SubBenchmarkState,
+)
+
+
+class MarkdownTable:
+    def __init__(self):
+        self.columns = []
+
+    def add_cell(self, row: int, key: str, header: str, value: str) -> None:
+        column = next((col for col in self.columns if col["key"] == key), None)
+        if column is None:
+            column = {
+                "key": key,
+                "header": header,
+                "rows": [],
+                "max_width": len(header),
+            }
+            self.columns.append(column)
+
+        column["max_width"] = max(column["max_width"], len(value))
+        while len(column["rows"]) <= row:
+            column["rows"].append("")
+        column["rows"][row] = value
+
+    def to_string(self) -> str:
+        if not self.columns:
+            return ""
+
+        num_rows = max(len(column["rows"]) for column in self.columns)
+        for column in self.columns:
+            while len(column["rows"]) < num_rows:
+                column["rows"].append("")
+
+        header = "|"
+        divider = "|"
+        for column in self.columns:
+            width = column["max_width"]
+            header += f" {column['header']:^{width}} |"
+            divider += f"{'':-^{width + 2}}|"
+
+        rows = []
+        for row in range(num_rows):
+            row_text = "|"
+            for column in self.columns:
+                row_text += f" {column['rows'][row]:>{column['max_width']}} |"
+            rows.append(row_text)
+
+        return "\n".join([header, divider, *rows]) + "\n"
+
+
+def format_default(summary: BenchmarkResultSummary) -> str:
+    value = summary.value
+    if isinstance(value, float):
+        return f"{value:.5g}"
+    if value is None:
+        return ""
+    return str(value)
+
+
+def format_duration(summary: BenchmarkResultSummary) -> str:
+    seconds = float(summary["value"])
+    if seconds >= 1.0:
+        return f"{seconds:0.3f} s"
+    if seconds >= 1e-3:
+        return f"{seconds * 1e3:0.3f} ms"
+    if seconds >= 1e-6:
+        return f"{seconds * 1e6:0.3f} us"
+    return f"{seconds * 1e9:0.3f} ns"
+
+
+def format_item_rate(summary: BenchmarkResultSummary) -> str:
+    items_per_second = float(summary["value"])
+    if items_per_second >= 1e15:
+        return f"{items_per_second * 1e-15:0.3f}P"
+    if items_per_second >= 1e12:
+        return f"{items_per_second * 1e-12:0.3f}T"
+    if items_per_second >= 1e9:
+        return f"{items_per_second * 1e-9:0.3f}G"
+    if items_per_second >= 1e6:
+        return f"{items_per_second * 1e-6:0.3f}M"
+    if items_per_second >= 1e3:
+        return f"{items_per_second * 1e-3:0.3f}K"
+    return f"{items_per_second:0.3f}"
+
+
+def format_frequency(summary: BenchmarkResultSummary) -> str:
+    frequency_hz = float(summary["value"])
+    if frequency_hz >= 1e9:
+        return f"{frequency_hz * 1e-9:0.3f} GHz"
+    if frequency_hz >= 1e6:
+        return f"{frequency_hz * 1e-6:0.3f} MHz"
+    if frequency_hz >= 1e3:
+        return f"{frequency_hz * 1e-3:0.3f} KHz"
+    return f"{frequency_hz:0.3f} Hz"
+
+
+def format_bytes(summary: BenchmarkResultSummary) -> str:
+    nbytes = float(summary["value"])
+    if nbytes >= 1024.0 * 1024.0 * 1024.0:
+        return f"{nbytes / (1024.0 * 1024.0 * 1024.0):0.3f} GiB"
+    if nbytes >= 1024.0 * 1024.0:
+        return f"{nbytes / (1024.0 * 1024.0):0.3f} MiB"
+    if nbytes >= 1024.0:
+        return f"{nbytes / 1024.0:0.3f} KiB"
+    return f"{nbytes:0.3f} B"
+
+
+def format_byte_rate(summary: BenchmarkResultSummary) -> str:
+    bytes_per_second = float(summary["value"])
+    if bytes_per_second >= 1e15:
+        return f"{bytes_per_second * 1e-15:0.3f} PB/s"
+    if bytes_per_second >= 1e12:
+        return f"{bytes_per_second * 1e-12:0.3f} TB/s"
+    if bytes_per_second >= 1e9:
+        return f"{bytes_per_second * 1e-9:0.3f} GB/s"
+    if bytes_per_second >= 1e6:
+        return f"{bytes_per_second * 1e-6:0.3f} MB/s"
+    if bytes_per_second >= 1e3:
+        return f"{bytes_per_second * 1e-3:0.3f} KB/s"
+    return f"{bytes_per_second:0.3f} B/s"
+
+
+def format_sample_size(summary: BenchmarkResultSummary) -> str:
+    return f"{int(summary['value'])}x"
+
+
+def format_percentage(summary: BenchmarkResultSummary) -> str:
+    return f"{float(summary['value']) * 100.0:.2f}%"
+
+
+def format_summary(summary: BenchmarkResultSummary) -> str:
+    if summary.hint == "duration":
+        return format_duration(summary)
+    if summary.hint == "item_rate":
+        return format_item_rate(summary)
+    if summary.hint == "frequency":
+        return format_frequency(summary)
+    if summary.hint == "bytes":
+        return format_bytes(summary)
+    if summary.hint == "byte_rate":
+        return format_byte_rate(summary)
+    if summary.hint == "sample_size":
+        return format_sample_size(summary)
+    if summary.hint == "percentage":
+        return format_percentage(summary)
+    return format_default(summary)
+
+
+def format_axis_value(
+    axis_value: dict, axes_by_name: dict[str, dict]
+) -> tuple[str, str]:
+    name = axis_value["name"]
+    axis = axes_by_name.get(name, {})
+    value = axis_value["value"]
+    if axis.get("type") == "int64" and axis.get("flags") == "pow2":
+        int_value = int(value)
+        exponent = int_value.bit_length() - 1
+        return name, f"2^{exponent} = {int_value}"
+
+    value_type = axis_value.get("type", axis.get("type"))
+    if value_type == "int64":
+        return name, str(int(value))
+    if value_type == "float64":
+        return name, f"{float(value):.5g}"
+
+    return name, str(value)
+
+
+def add_state_row(
+    table: MarkdownTable,
+    row: int,
+    state: SubBenchmarkState,
+    bench: SubBenchmarkResult,
+) -> None:
+    axes_by_name = {axis["name"]: axis for axis in bench.axes}
+
+    for axis_value in state.axis_values:
+        header, value = format_axis_value(axis_value, axes_by_name)
+        table.add_cell(row, f"axis:{header}", header, value)
+
+    for summary in state.summaries.values():
+        if summary.hide is not None:
+            continue
+        header = summary.name if summary.name is not None else summary.tag
+        table.add_cell(row, summary.tag, header, format_summary(summary))
+
+
+def format_benchmark(result: BenchmarkResult, bench: SubBenchmarkResult) -> str:
+    parts = [f"## {bench.name}\n\n"]
+    device_ids: list[int | None] = list(bench.devices) if bench.devices else [None]
+
+    for device_id in device_ids:
+        if device_id is not None:
+            device = result.devices.get(device_id)
+            device_name = device.name if device is not None else f"Device {device_id}"
+            parts.append(f"### [{device_id}] {device_name}\n\n")
+
+        table = MarkdownTable()
+        row = 0
+        for state in bench.states:
+            if device_id is not None and state.device != device_id:
+                continue
+            add_state_row(table, row, state, bench)
+            row += 1
+
+        table_text = table.to_string()
+        parts.append(table_text if table_text else "No data -- check log.\n")
+
+    return "".join(parts)
+
+
+def format_result(result: BenchmarkResult) -> str:
+    parts = ["# Benchmark Results\n"]
+    for bench in result.values():
+        parts.append(f"\n{format_benchmark(result, bench)}")
+    return "".join(parts)
+
+
+def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="nvbench-json-summary",
+        description="Print an NVBench-style markdown summary table from NVBench JSON output.",
+    )
+    parser.add_argument("json_path", help="Path to an NVBench JSON output file.")
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=Path,
+        help="Write markdown output to this file instead of stdout.",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = parse_args(argv)
+    result = BenchmarkResult.from_json(args.json_path)
+    report = format_result(result)
+
+    if args.output is not None:
+        args.output.write_text(report, encoding="utf-8")
+    else:
+        print(report)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/python/test/test_bench_result.py b/python/test/test_benchmark_result.py
similarity index 89%
rename from python/test/test_bench_result.py
rename to python/test/test_benchmark_result.py
index a01b413..944e3e2 100644
--- a/python/test/test_bench_result.py
+++ b/python/test/test_benchmark_result.py
@@ -1,8 +1,25 @@
+# Copyright 2026 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 with the LLVM exception
+# (the "License"); you may not use this file except in compliance with
+# the License.
+#
+# You may obtain a copy of the License at
+#
+#     http://llvm.org/foundation/relicensing/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import json
 import struct
 from dataclasses import dataclass
 
-import cuda.bench as bench
+import cuda.bench
+import cuda.bench.results as results
 import pytest
@@ -63,6 +80,9 @@ def test_benchmark_result_reads_jsonbin_relative_to_json_path(tmp_path):
                     },
                     {
                         "tag": "nv/cold/bw/global/utilization",
+                        "name": "BWUtil",
+                        "hint": "percentage",
+                        "description": "Global memory utilization",
                         "data": [
                             {
                                 "name": "value",
@@ -98,10 +118,12 @@ def test_benchmark_result_reads_jsonbin_relative_to_json_path(tmp_path):
     )
 
     metadata = {"returncode": 0, "elapsed_seconds": 0.25}
-    default_result = bench.BenchmarkResult.from_json(json_fn)
-    result = bench.BenchmarkResult(json_path=json_fn, metadata=metadata)
+    default_result = results.BenchmarkResult.from_json(json_fn)
+    result = results.BenchmarkResult.from_json(json_fn, metadata=metadata)
 
-    assert bench.BenchmarkResult.__module__ == bench.__name__
+    assert results.BenchmarkResult.__module__ == results.__name__
+    assert results.BenchmarkResultSummary.__module__ == results.__name__
+    assert not hasattr(cuda.bench, "BenchmarkResult")
     assert default_result.metadata is None
     assert result.metadata is metadata
     subbench = result["copy"]
@@ -121,12 +143,19 @@ def test_benchmark_result_reads_jsonbin_relative_to_json_path(tmp_path):
         subbench[1]
     assert state.name() == "BlockSize[pow2]=8"
     assert state.bw == 0.75
-    assert state.summaries["nv/cold/bw/global/utilization"] == pytest.approx(0.75)
-    assert state.summaries["nv/json/bin:nv/cold/sample_times"] == {
+    bw_summary = state.summaries["nv/cold/bw/global/utilization"]
+    assert bw_summary.tag == "nv/cold/bw/global/utilization"
+    assert bw_summary.name == "BWUtil"
+    assert bw_summary.hint == "percentage"
+    assert bw_summary.hide is None
+    assert bw_summary.description == "Global memory utilization"
+    assert bw_summary.value == pytest.approx(0.75)
+    assert bw_summary["value"] == pytest.approx(0.75)
+    assert state.summaries["nv/json/bin:nv/cold/sample_times"].data == {
         "filename": "result.json-bin/0.bin",
         "size": 3,
     }
-    assert state.summaries["nv/json/freqs-bin:nv/cold/sample_freqs"] == {
+    assert state.summaries["nv/json/freqs-bin:nv/cold/sample_freqs"].data == {
         "filename": "result.json-freqs-bin/0.bin",
         "size": 3,
     }
@@ -154,13 +183,15 @@ def test_benchmark_result_reads_jsonbin_relative_to_json_path(tmp_path):
         result["missing"]
 
 
-def test_benchmark_result_json_path_is_required_keyword():
+def test_benchmark_result_constructor_is_private():
+    with pytest.raises(TypeError, match="from_json\\(\\).*empty\\(\\)"):
+        results.BenchmarkResult()
+    with pytest.raises(TypeError, match="from_json\\(\\).*empty\\(\\)"):
+        results.BenchmarkResult("result.json")
     with pytest.raises(TypeError):
-        bench.BenchmarkResult("result.json")
+        results.BenchmarkResult(metadata=None)
     with pytest.raises(TypeError):
-        bench.BenchmarkResult(metadata=None)
-    with pytest.raises(TypeError):
-        bench.BenchmarkResult(json_path="result.json", parse=False)
+        results.BenchmarkResult(json_path="result.json", parse=False)
@@ -172,15 +203,15 @@ def test_benchmark_result_empty_does_not_read_json(tmp_path):
     metadata = RunMetadata(returncode=1, elapsed_seconds=0.25)
     missing_json = tmp_path / "missing.json"
 
-    result = bench.BenchmarkResult.empty(metadata=metadata)
+    result = results.BenchmarkResult.empty(metadata=metadata)
 
     assert result.metadata is metadata
     assert result.subbenches == {}
 
     with pytest.raises(FileNotFoundError):
-        bench.BenchmarkResult(json_path=missing_json, metadata=metadata)
+        results.BenchmarkResult.from_json(missing_json, metadata=metadata)
     with pytest.raises(FileNotFoundError):
-        bench.BenchmarkResult.from_json(json_path=missing_json, metadata=metadata)
+        results.BenchmarkResult.from_json(json_path=missing_json, metadata=metadata)


 def test_benchmark_result_accepts_no_axis_benchmark_with_recorded_binary_path(
@@ -251,7 +282,7 @@
 
     monkeypatch.chdir(tmp_path)
 
-    result = bench.BenchmarkResult(json_path="temp_data/axes_run1.json")
+    result = results.BenchmarkResult.from_json("temp_data/axes_run1.json")
 
     state = result.subbenches["simple"].states[0]
     assert state.name() == "Device=0"
@@ -263,7 +294,7 @@
 
 
 def test_benchmark_result_accepts_axis_value_input_string():
-    result = bench.SubBenchResult(
+    result = results.SubBenchmarkResult(
         {
             "name": "single_float64_axis",
             "axes": [
@@ -304,7 +335,7 @@


 def test_benchmark_result_ignores_skipped_state_with_no_summaries():
-    result = bench.SubBenchResult(
+    result = results.SubBenchmarkResult(
         {
             "name": "copy_sweep_grid_shape",
             "axes": [
@@ -451,7 +482,7 @@
         encoding="utf-8",
     )
 
-    result = bench.BenchmarkResult(json_path=json_fn)
+    result = results.BenchmarkResult.from_json(json_fn)
 
     states = result.subbenches["copy"].states
     assert states[0].samples is None
@@ -556,4 +587,4 @@ def test_benchmark_result_rejects_mismatched_sample_and_frequency_counts(tmp_pat
     )
 
     with pytest.raises(ValueError, match="sample count .* frequency count"):
-        bench.BenchmarkResult(json_path=json_fn)
+        results.BenchmarkResult.from_json(json_fn)
diff --git a/python/test/test_nvbench_json_summary.py b/python/test/test_nvbench_json_summary.py
new file mode 100644
index 0000000..f8b3340
--- /dev/null
+++ b/python/test/test_nvbench_json_summary.py
@@ -0,0 +1,221 @@
+# Copyright 2026 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 with the LLVM exception
+# (the "License"); you may not use this file except in compliance with
+# the License.
+#
+# You may obtain a copy of the License at
+#
+#     http://llvm.org/foundation/relicensing/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.util
+import json
+from pathlib import Path
+
+
+def load_nvbench_json_summary():
+    module_path = (
+        Path(__file__).resolve().parents[1] / "scripts" / "nvbench_json_summary.py"
+    )
+    spec = importlib.util.spec_from_file_location("nvbench_json_summary", module_path)
+    assert spec is not None
+    assert spec.loader is not None
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+nvbench_json_summary = load_nvbench_json_summary()
+
+
+def write_result_json(path):
+    path.write_text(
+        json.dumps(
+            {
+                "devices": [
+                    {
+                        "id": 0,
+                        "name": "Test GPU",
+                    }
+                ],
+                "benchmarks": [
+                    {
+                        "name": "copy",
+                        "devices": [0],
+                        "axes": [
+                            {
+                                "name": "BlockSize",
+                                "type": "int64",
+                                "flags": "pow2",
+                                "values": [
+                                    {
+                                        "input_string": "8",
+                                        "description": "2^8 = 256",
+                                        "value": 256,
+                                    }
+                                ],
+                            }
+                        ],
+                        "states": [
+                            {
+                                "name": "Device=0 BlockSize=2^8",
+                                "device": 0,
+                                "type_config_index": 0,
+                                "axis_values": [
+                                    {
+                                        "name": "BlockSize",
+                                        "type": "int64",
+                                        "value": "256",
+                                    }
+                                ],
+                                "summaries": [
+                                    {
+                                        "tag": "nv/cold/time/gpu/sample_size",
+                                        "name": "Samples",
+                                        "hint": "sample_size",
+                                        "data": [
+                                            {
+                                                "name": "value",
+                                                "type": "int64",
+                                                "value": "12",
+                                            }
+                                        ],
+                                    },
+                                    {
+                                        "tag": "nv/cold/time/gpu/mean",
+                                        "name": "GPU Time",
+                                        "hint": "duration",
+                                        "data": [
+                                            {
+                                                "name": "value",
+                                                "type": "float64",
+                                                "value": "1.25e-6",
+                                            }
+                                        ],
+                                    },
+                                    {
+                                        "tag": "nv/cold/time/gpu/stdev/relative",
+                                        "name": "Noise",
+                                        "hint": "percentage",
+                                        "data": [
+                                            {
+                                                "name": "value",
+                                                "type": "float64",
+                                                "value": "0.015",
+                                            }
+                                        ],
+                                    },
+                                    {
+                                        "tag": "nv/cold/bw/global/bytes_per_second",
+                                        "name": "GlobalMem BW",
+                                        "hint": "byte_rate",
+                                        "data": [
+                                            {
+                                                "name": "value",
+                                                "type": "float64",
+                                                "value": "2.5e9",
+                                            }
+                                        ],
+                                    },
+                                    {
+                                        "tag": "nv/cold/bw/global/utilization",
+                                        "name": "BWUtil",
+                                        "hint": "percentage",
+                                        "data": [
+                                            {
+                                                "name": "value",
+                                                "type": "float64",
+                                                "value": "0.625",
+                                            }
+                                        ],
+                                    },
+                                    {
+                                        "tag": "nv/cold/time/gpu/min",
+                                        "name": "Min GPU Time",
+                                        "hint": "duration",
+                                        "hide": "Hidden by default.",
+                                        "data": [
+                                            {
+                                                "name": "value",
+                                                "type": "float64",
+                                                "value": "1.0e-6",
+                                            }
+                                        ],
+                                    },
+                                ],
+                                "is_skipped": False,
+                            }
+                        ],
+                    }
+                ],
+            }
+        ),
+        encoding="utf-8",
+    )
+
+
+def test_json_summary_formats_nvbench_style_markdown(tmp_path):
+    json_path = tmp_path / "result.json"
+    write_result_json(json_path)
+
+    result = nvbench_json_summary.BenchmarkResult.from_json(json_path)
+    report = nvbench_json_summary.format_result(result)
+
+    assert "# Benchmark Results" in report
+    assert "## copy" in report
+    assert "### [0] Test GPU" in report
+    assert (
+        "| BlockSize | Samples | GPU Time | Noise | GlobalMem BW | BWUtil |" in report
+    )
+    assert (
+        "| 2^8 = 256 |     12x | 1.250 us | 1.50% |   2.500 GB/s | 62.50% |" in report
+    )
+    assert "Min GPU Time" not in report
+
+
+def test_json_summary_formats_axis_values_like_markdown_printer():
+    axes_by_name = {
+        "BlockSize": {
+            "name": "BlockSize",
+            "type": "int64",
+            "flags": "pow2",
+        },
+        "NumBlocks": {
+            "name": "NumBlocks",
+            "type": "int64",
+            "flags": "",
+        },
+        "Duration": {
+            "name": "Duration",
+            "type": "float64",
+            "flags": "",
+        },
+    }
+
+    assert nvbench_json_summary.format_axis_value(
+        {"name": "BlockSize", "type": "int64", "value": "256"}, axes_by_name
+    ) == ("BlockSize", "2^8 = 256")
+    assert nvbench_json_summary.format_axis_value(
+        {"name": "NumBlocks", "type": "int64", "value": "64"}, axes_by_name
+    ) == ("NumBlocks", "64")
+    assert nvbench_json_summary.format_axis_value(
+        {"name": "Duration", "type": "float64", "value": "0.123456789"},
+        axes_by_name,
+    ) == ("Duration", "0.12346")
+
+
+def test_json_summary_cli_writes_output_file(tmp_path):
+    json_path = tmp_path / "result.json"
+    output_path = tmp_path / "summary.md"
+    write_result_json(json_path)
+
+    rc = nvbench_json_summary.main([str(json_path), "--output", str(output_path)])
+
+    assert rc == 0
+    assert "GlobalMem BW" in output_path.read_text(encoding="utf-8")