Provide BenchmarkResult class for parsing JSON output of NVBench-instrumented benchmarks (#356)

Implements the `cuda.bench.results.BenchmarkResult` class to represent data from the JSON output of a benchmark execution.

The class provides two class methods: `BenchmarkResult.from_json(filename: str | os.PathLike, *, metadata: Any = None)`, which expects the name of a well-formed JSON file, and `BenchmarkResult.empty(*, metadata: Any = None)`, intended to represent a failed result, with the reasons recorded in metadata at the user's discretion.
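
A minimal sketch of both constructors (the file name `run.json` and the metadata contents are illustrative, not part of the API):

```python
import cuda.bench.results as results

# Parse a well-formed NVBench JSON output file, attaching user-chosen metadata.
r = results.BenchmarkResult.from_json("run.json", metadata={"run": 1})

# Represent a failed execution; the reason is recorded at the user's discretion.
failed = results.BenchmarkResult.empty(metadata={"reason": "benchmark crashed"})
```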

`BenchmarkResult` implements the mapping interface, supporting the `.keys()`, `.values()`, and `.items()` methods as well as the `__len__`, `__contains__`, `__getitem__`, and `__iter__` special methods.

Values in `BenchmarkResult` have type `cuda.bench.results.SubBenchmarkResult`, which implements a list-like interface, i.e., the `__len__`, `__getitem__`, and `__iter__` special methods. Values in this list-like structure correspond to measurements of the individual states of a particular benchmark (the key in `BenchmarkResult`).
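
Continuing the sketch above (and assuming the benchmark name exists in the loaded file), each key maps to a list-like collection of per-state measurements:

```python
sub = r["copy_sweep_grid_shape"]  # a SubBenchmarkResult

print(len(sub))        # number of measured states
for state in sub:      # iterate over SubBenchmarkState entries
    print(state.name())
```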

Elements of the `SubBenchmarkResult` structure have type `SubBenchmarkState`, which supports the mapping protocol with `axis_values` as keys and represents the data corresponding to measurements for a particular state (a combination of settings for each axis).

The state provides `.samples` and `.frequencies` attributes storing the raw execution durations and estimates of the average GPU frequency, respectively.
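
A sketch of inspecting the raw data directly (assuming the state recorded at least one sample; NumPy is used only for illustration):

```python
import numpy as np

state = r["copy_sweep_grid_shape"][0]  # first measured state
if state.samples is not None:
    times = np.asarray(state.samples)      # raw execution durations
    freqs = np.asarray(state.frequencies)  # average GPU frequency estimates
    print(np.median(times), np.median(freqs))
```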

Example usage:

```python
import numpy as np

import cuda.bench.results

r = cuda.bench.results.BenchmarkResult.from_json("perf_data/axes_run1.json")

r["copy_sweep_grid_shape"].centers_with_frequencies(
    lambda t, f: np.median(np.asarray(t) * np.asarray(f))
)
```

```
In [1]: import numpy as np, cuda.bench.results

In [2]: r = cuda.bench.results.BenchmarkResult.from_json("temp_data/axes_run1.json")

In [3]: list(r)
Out[3]:
['simple',
 'single_float64_axis',
 'copy_sweep_grid_shape',
 'copy_type_sweep',
 'copy_type_conversion_sweep',
 'copy_type_and_block_size_sweep']

In [4]: r["simple"].centers(lambda t: np.percentile(t, [25,75]))
Out[4]: {'Device=0': array([0.00100966, 0.00101299])}

In [5]: r.centers(lambda t: np.percentile(t, [25,75]))["simple"]
Out[5]: {'Device=0': array([0.00100966, 0.00101299])}

In [6]: len(r)
Out[6]: 6

In [7]: "fake" in r
Out[7]: False
```

Each `SubBenchmarkState` also exposes a `.summaries` attribute, a rich object that retains tag/name/hint/hide/description metadata.
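
For instance (a sketch: the summary tag below is taken from the autotuning example further down, and the attribute names other than `.value` are assumptions based on the metadata listed above):

```python
state = r["simple"][0]
summary = state.summaries.get("nv/cold/time/gpu/mean")
if summary is not None and summary.value is not None:
    # tag/name/hint/hide/description metadata travels with the value
    print(summary.name, summary.hint, float(summary.value))
```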

* Add `nvbench-json-summary` to render NVBench JSON output as an NVBench-style markdown summary table, including axis formatting, device sections, hidden-summary filtering, and summary hint formatting.

Update packaging, type stubs, and tests for the new namespace, renamed
classes, Python 3.10-compatible annotations, and summary-table generation.

* Split tests in test_benchmark_result into smaller tests

* Fix breakage due to a file name change

* Add python/examples/benchmark_result_autotune.py

This example demonstrates using `cuda.bench` and `cuda.bench.results` to implement simple auto-tuning, illustrated by selecting the tile-shape hyperparameter for a naive stencil kernel implemented with numba-cuda.

* Resolve ruff PLE0604

* Fix `format_axis_value` in the JSON formatting script to handle `None` values

Add tests to cover such input.

* Address CodeRabbit review feedback

* Fix license header, add validation

* Addressed both issues raised in review

Malformed values are now represented in the result as `None`.

Skipped benchmarks are no longer dropped, i.e., they are present in the `BenchmarkResult` data, but they are not reflected in the summary table, in line with the behavior of NVBench-instrumented benchmarks.

`python/examples/benchmark_result_autotune.py` (new file, +361 lines):

```python
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
from __future__ import annotations

import argparse
import statistics
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from typing import Any

from cuda.bench.results import BenchmarkResult, BenchmarkResultSummary
from tabulate import tabulate

TILE_SHAPES = ("4x32", "8x16", "16x16", "32x8", "16x8", "8x8")
BENCHMARK_NAME = "stencil_autotune"
MEDIAN_TIE_RELATIVE_TOLERANCE = 0.01
MIN_RECOMMENDED_INTERIOR_PIXELS = 1_000_000


def parse_tile_shape(tile_shape: str) -> tuple[int, int]:
    block_x, block_y = tile_shape.split("x", maxsplit=1)
    return int(block_x), int(block_y)


def format_duration(seconds: float) -> str:
    if seconds >= 1.0:
        return f"{seconds:.3f} s"
    if seconds >= 1e-3:
        return f"{seconds * 1e3:.3f} ms"
    if seconds >= 1e-6:
        return f"{seconds * 1e6:.3f} us"
    return f"{seconds * 1e9:.3f} ns"


def format_optional_duration(seconds: float | None) -> str:
    if seconds is None:
        return "-"
    return format_duration(seconds)


def format_byte_rate(summary: BenchmarkResultSummary | None) -> str:
    if summary is None or summary.value is None:
        return "-"
    bytes_per_second = float(summary.value)
    if bytes_per_second >= 1e12:
        return f"{bytes_per_second * 1e-12:.3f} TB/s"
    if bytes_per_second >= 1e9:
        return f"{bytes_per_second * 1e-9:.3f} GB/s"
    if bytes_per_second >= 1e6:
        return f"{bytes_per_second * 1e-6:.3f} MB/s"
    if bytes_per_second >= 1e3:
        return f"{bytes_per_second * 1e-3:.3f} KB/s"
    return f"{bytes_per_second:.3f} B/s"


def state_tile_shape(state_name: str) -> str:
    prefix = "TileShape="
    for field in state_name.split():
        if field.startswith(prefix):
            return field.removeprefix(prefix)
    return state_name


def interior_pixel_count(width: int, height: int) -> int:
    return max(width - 2, 0) * max(height - 2, 0)


def median_ties_best(row: dict[str, Any], best_median_seconds: float) -> bool:
    tolerance = abs(best_median_seconds) * MEDIAN_TIE_RELATIVE_TOLERANCE
    return abs(row["median_seconds"] - best_median_seconds) <= tolerance


def summarize_result(result: BenchmarkResult) -> list[dict[str, Any]]:
    subbenchmark = result[BENCHMARK_NAME]
    medians = subbenchmark.centers(statistics.median)
    metadata = result.metadata if isinstance(result.metadata, dict) else {}
    rows = []
    for state in subbenchmark:
        median_seconds = medians[state.name()]
        if median_seconds is None:
            continue
        bandwidth = state.summaries.get("nv/cold/bw/global/bytes_per_second")
        mean_summary = state.summaries.get("nv/cold/time/gpu/mean")
        mean_seconds = (
            None
            if mean_summary is None or mean_summary.value is None
            else float(mean_summary.value)
        )
        rows.append(
            {
                "tile_shape": state_tile_shape(state.name()),
                "median_seconds": median_seconds,
                "mean_seconds": mean_seconds,
                "sample_count": len(state.samples) if state.samples is not None else 0,
                "bandwidth": format_byte_rate(bandwidth),
                "subprocess_seconds": metadata.get("elapsed_seconds", 0.0),
            }
        )
    return sorted(rows, key=lambda row: row["median_seconds"])


def print_summary(rows: list[dict[str, Any]]) -> None:
    if not rows:
        raise RuntimeError("No benchmark states with sample data were found.")
    total_subprocess_seconds = sum(row["subprocess_seconds"] for row in rows)
    print()
    print(f"Total benchmark subprocess wall time: {total_subprocess_seconds:.3f} s")
    print()
    best_median_seconds = rows[0]["median_seconds"]
    tied_rows = [row for row in rows if median_ties_best(row, best_median_seconds)]
    table = [
        [
            "*" if row in tied_rows else "",
            row["tile_shape"],
            format_duration(row["median_seconds"]),
            format_optional_duration(row["mean_seconds"]),
            row["sample_count"],
            row["bandwidth"],
            f"{row['subprocess_seconds']:.3f} s",
        ]
        for row in rows
    ]
    print(
        tabulate(
            table,
            headers=[
                "Best",
                "TileShape",
                "Median GPU Time",
                "Mean GPU Time",
                "Samples",
                "GlobalMem BW",
                "Subprocess",
            ],
            tablefmt="simple",
            disable_numparse=True,
        )
    )
    print()
    if len(tied_rows) == 1:
        best = tied_rows[0]
        print(
            "Best tile shape by median isolated GPU time: "
            f"{best['tile_shape']} ({format_duration(best['median_seconds'])})"
        )
    else:
        tile_shapes = ", ".join(row["tile_shape"] for row in tied_rows)
        print(
            "No unique best tile shape by median isolated GPU time: "
            f"{len(tied_rows)} states are within "
            f"{MEDIAN_TIE_RELATIVE_TOLERANCE:.1%} of "
            f"{format_duration(best_median_seconds)} ({tile_shapes})."
        )


def run_driver(args: argparse.Namespace, nvbench_args: list[str]) -> int:
    with tempfile.TemporaryDirectory(prefix="nvbench-autotune-") as tmp_dir:
        rows = []
        total = len(TILE_SHAPES)
        interior_pixels = interior_pixel_count(args.image_width, args.image_height)
        print(
            f"Image size: {args.image_width}x{args.image_height} "
            f"({interior_pixels} interior stencil points)"
        )
        print(f"Sampling {total} tile shapes:")
        if interior_pixels < MIN_RECOMMENDED_INTERIOR_PIXELS:
            print(
                "Warning: this problem has only "
                f"{interior_pixels} interior stencil points. "
                "Small problems are usually dominated by kernel launch overhead, "
                "so median timings may tie across tile shapes."
            )
        for index, tile_shape in enumerate(TILE_SHAPES, start=1):
            json_path = Path(tmp_dir) / f"stencil_autotune_{tile_shape}.json"
            command = [
                sys.executable,
                str(Path(__file__).resolve()),
                "--run-benchmark",
                "--stopping-criterion",
                "entropy",
                "--tile-shape",
                tile_shape,
                "--image-width",
                str(args.image_width),
                "--image-height",
                str(args.image_height),
                "--jsonbin",
                str(json_path),
            ]
            if nvbench_args:
                command.extend(["--", *nvbench_args])
            print(f"[{index}/{total}] TileShape={tile_shape} ... ", end="", flush=True)
            start = time.perf_counter()
            completed = subprocess.run(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                check=False,
            )
            elapsed_seconds = time.perf_counter() - start
            if completed.returncode != 0:
                print(f"failed after {elapsed_seconds:.3f} s")
                print(completed.stdout, end="")
                return completed.returncode
            metadata = {
                "command": command,
                "returncode": completed.returncode,
                "elapsed_seconds": elapsed_seconds,
                "tile_shape": tile_shape,
            }
            result = BenchmarkResult.from_json(json_path, metadata=metadata)
            tile_rows = summarize_result(result)
            rows.extend(tile_rows)
            if tile_rows:
                row = tile_rows[0]
                print(
                    f"done in {elapsed_seconds:.3f} s, "
                    f"median {format_duration(row['median_seconds'])}, "
                    f"{row['bandwidth']}"
                )
            else:
                print(f"done in {elapsed_seconds:.3f} s, no samples")
        print_summary(sorted(rows, key=lambda row: row["median_seconds"]))
    return 0


def run_benchmark(args: argparse.Namespace, nvbench_args: list[str]) -> None:
    import cuda.bench as bench
    import numpy as np
    from numba import cuda

    def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
        return cuda.external_stream(cs.addressof())

    @cuda.jit
    def stencil_kernel(inp, out, width, height):
        x, y = cuda.grid(2)
        if 0 < x < width - 1 and 0 < y < height - 1:
            idx = y * width + x
            out[idx] = 0.2 * (
                inp[idx]
                + inp[idx - 1]
                + inp[idx + 1]
                + inp[idx - width]
                + inp[idx + width]
            )

    def stencil_autotune(state: bench.State) -> None:
        tile_shape = state.get_string("TileShape")
        block_x, block_y = parse_tile_shape(tile_shape)
        width = args.image_width
        height = args.image_height
        interior_pixels = (width - 2) * (height - 2)
        state.add_element_count(interior_pixels, column_name="Pixels")
        state.add_global_memory_reads(
            interior_pixels * 5 * np.dtype(np.float32).itemsize
        )
        state.add_global_memory_writes(interior_pixels * np.dtype(np.float32).itemsize)
        host_input = np.ones(width * height, dtype=np.float32)
        dev_input = cuda.to_device(host_input)
        dev_output = cuda.device_array_like(dev_input)
        block_shape = (block_x, block_y)
        grid_shape = (
            (width + block_x - 1) // block_x,
            (height + block_y - 1) // block_y,
        )
        # Compile the Numba kernel outside NVBench measurement.
        stencil_kernel[grid_shape, block_shape](dev_input, dev_output, width, height)
        cuda.synchronize()

        def launcher(launch: bench.Launch) -> None:
            stream = as_cuda_stream(launch.get_stream())
            stencil_kernel[grid_shape, block_shape, stream, 0](
                dev_input,
                dev_output,
                width,
                height,
            )

        state.exec(launcher)

    benchmark = bench.register(stencil_autotune)
    benchmark.set_name(BENCHMARK_NAME)
    tile_shapes = [args.tile_shape] if args.tile_shape is not None else TILE_SHAPES
    benchmark.add_string_axis("TileShape", tile_shapes)
    bench.run_all_benchmarks([sys.argv[0], *nvbench_args])


def parse_args(argv: list[str] | None = None) -> tuple[argparse.Namespace, list[str]]:
    parser = argparse.ArgumentParser(
        description=(
            "Autotune a simple stencil benchmark and select the best state "
            "from NVBench JSON-bin output."
        ),
        epilog=(
            "Additional NVBench options may be passed after '--'. "
            "For example: benchmark_result_autotune.py -- --timeout 30"
        ),
    )
    parser.add_argument(
        "--run-benchmark",
        action="store_true",
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        "--tile-shape",
        choices=TILE_SHAPES,
        default=None,
        help=argparse.SUPPRESS,
    )
    parser.add_argument(
        "--image-width",
        type=int,
        default=4096,
        help="Stencil input width used by the subprocess benchmark.",
    )
    parser.add_argument(
        "--image-height",
        type=int,
        default=4096,
        help="Stencil input height used by the subprocess benchmark.",
    )
    args, nvbench_args = parser.parse_known_args(argv)
    if args.image_width < 3 or args.image_height < 3:
        parser.error("--image-width and --image-height must both be at least 3")
    nvbench_args = [arg for arg in nvbench_args if arg != "--"]
    return args, nvbench_args


def main(argv: list[str] | None = None) -> int:
    args, nvbench_args = parse_args(argv)
    if args.run_benchmark:
        run_benchmark(args, nvbench_args)
        return 0
    return run_driver(args, nvbench_args)


if __name__ == "__main__":
    sys.exit(main())
```

The accompanying requirements file adds the `tabulate` dependency used by the example:

```diff
@@ -6,4 +6,5 @@ numba-cuda
 cuda-cccl
 cupy
 nvidia-cute-dsl[cu13]
+tabulate
 torch[cu13]
```