From e07f87910a1a487dbc2d31860b61ce32fedce5b1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 May 2026 08:21:41 -0500 Subject: [PATCH] Add decorators for registering benchmarks and adding axes cuda.bench.register(fn) continues to return a Benchmark, so legacy usage is still supported. New call form added: cuda.bench.register() returns a decorator that registers the function and leaves the decorated name unchanged. Axis and option decorators must be placed below @bench.register(): ``` @bench.register() @bench.axis.float64("Duration (s)", [7e-5, 1e-4, 5e-4]) @bench.option.min_samples(120) def single_float64_axis(state: bench.State): ... ``` --- python/cuda/bench/__init__.py | 22 +- python/cuda/bench/__init__.pyi | 88 +++++- python/cuda/bench/_decorators.py | 285 ++++++++++++++++++ python/examples/auto_throughput.py | 9 +- python/examples/axes.py | 30 +- python/examples/cpu_activity.py | 15 +- .../examples/cuda_compute_segmented_reduce.py | 9 +- python/examples/cuda_coop_block_reduce.py | 9 +- python/examples/cupy_extract.py | 9 +- python/examples/cute_dsl_sgemm.py | 9 +- python/examples/exec_tag_sync.py | 4 +- python/examples/pytorch_bench.py | 22 +- python/examples/skip.py | 9 +- python/examples/throughput.py | 9 +- python/test/test_cuda_bench.py | 135 ++++++++- 15 files changed, 585 insertions(+), 79 deletions(-) create mode 100644 python/cuda/bench/_decorators.py diff --git a/python/cuda/bench/__init__.py b/python/cuda/bench/__init__.py index 4d2f496..f584104 100644 --- a/python/cuda/bench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -14,10 +14,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+"""CUDA Kernel Benchmarking Library Python API.""" + import importlib import importlib.metadata import warnings +from ._decorators import axis as axis +from ._decorators import make_register as _make_register +from ._decorators import option as option + try: __version__ = importlib.metadata.version("cuda-bench") except Exception as e: @@ -67,11 +73,18 @@ CudaStream = _nvbench_module.CudaStream Launch = _nvbench_module.Launch NVBenchRuntimeError = _nvbench_module.NVBenchRuntimeError State = _nvbench_module.State -register = _nvbench_module.register +_register = _nvbench_module.register run_all_benchmarks = _nvbench_module.run_all_benchmarks _test_cpp_exception = _nvbench_module._test_cpp_exception _test_py_exception = _nvbench_module._test_py_exception + +def _get_register(): + return _register + + +register = _make_register(_get_register) + # Expose the module as _nvbench for backward compatibility (e.g., for tests) _nvbench = _nvbench_module @@ -90,9 +103,6 @@ del ( _cuda_major, _extra_name, _module_fullname, + _make_register, _get_cuda_major_version, ) - -__doc__ = """ -CUDA Kernel Benchmarking Library Python API -""" diff --git a/python/cuda/bench/__init__.pyi b/python/cuda/bench/__init__.pyi index ae5d4bd..f23f051 100644 --- a/python/cuda/bench/__init__.pyi +++ b/python/cuda/bench/__init__.pyi @@ -26,7 +26,18 @@ # with definitions given here. from collections.abc import Callable, Sequence -from typing import Optional, Self, SupportsFloat, SupportsInt, Union +from typing import ( + Any, + Optional, + Self, + SupportsFloat, + SupportsInt, + TypeVar, + Union, + overload, +) + +_F = TypeVar("_F", bound=Callable[..., Any]) class CudaStream: def __cuda_stream__(self) -> tuple[int, int]: ... @@ -112,7 +123,80 @@ class State: def get_axis_values_as_string(self, color: bool = ...) -> str: ... def get_stopping_criterion(self) -> str: ... -def register(fn: Callable[[State], None]) -> Benchmark: ... 
+class _AxisDecorators: + def int64(self, name: str, values: Sequence[SupportsInt]) -> Callable[[_F], _F]: ... + def add_int64_axis( + self, name: str, values: Sequence[SupportsInt] + ) -> Callable[[_F], _F]: ... + def int64_power_of_two( + self, name: str, values: Sequence[SupportsInt] + ) -> Callable[[_F], _F]: ... + def power_of_two( + self, name: str, values: Sequence[SupportsInt] + ) -> Callable[[_F], _F]: ... + def add_int64_power_of_two_axis( + self, name: str, values: Sequence[SupportsInt] + ) -> Callable[[_F], _F]: ... + def float64( + self, name: str, values: Sequence[SupportsFloat] + ) -> Callable[[_F], _F]: ... + def add_float64_axis( + self, name: str, values: Sequence[SupportsFloat] + ) -> Callable[[_F], _F]: ... + def string(self, name: str, values: Sequence[str]) -> Callable[[_F], _F]: ... + def add_string_axis( + self, name: str, values: Sequence[str] + ) -> Callable[[_F], _F]: ... + +class _OptionDecorators: + def name(self, value: str) -> Callable[[_F], _F]: ... + def set_name(self, value: str) -> Callable[[_F], _F]: ... + def run_once(self, value: bool = True) -> Callable[[_F], _F]: ... + def set_run_once(self, value: bool) -> Callable[[_F], _F]: ... + def skip_time(self, duration_seconds: SupportsFloat) -> Callable[[_F], _F]: ... + def set_skip_time(self, duration_seconds: SupportsFloat) -> Callable[[_F], _F]: ... + def throttle_recovery_delay( + self, delay_seconds: SupportsFloat + ) -> Callable[[_F], _F]: ... + def set_throttle_recovery_delay( + self, delay_seconds: SupportsFloat + ) -> Callable[[_F], _F]: ... + def throttle_threshold(self, threshold: SupportsFloat) -> Callable[[_F], _F]: ... + def set_throttle_threshold( + self, threshold: SupportsFloat + ) -> Callable[[_F], _F]: ... + def timeout(self, duration_seconds: SupportsFloat) -> Callable[[_F], _F]: ... + def set_timeout(self, duration_seconds: SupportsFloat) -> Callable[[_F], _F]: ... + def stopping_criterion(self, criterion: str) -> Callable[[_F], _F]: ... 
+ def set_stopping_criterion(self, criterion: str) -> Callable[[_F], _F]: ... + def criterion_param_float64( + self, name: str, value: SupportsFloat + ) -> Callable[[_F], _F]: ... + def set_criterion_param_float64( + self, name: str, value: SupportsFloat + ) -> Callable[[_F], _F]: ... + def criterion_param_int64( + self, name: str, value: SupportsInt + ) -> Callable[[_F], _F]: ... + def set_criterion_param_int64( + self, name: str, value: SupportsInt + ) -> Callable[[_F], _F]: ... + def criterion_param_string(self, name: str, value: str) -> Callable[[_F], _F]: ... + def set_criterion_param_string( + self, name: str, value: str + ) -> Callable[[_F], _F]: ... + def min_samples(self, count: SupportsInt) -> Callable[[_F], _F]: ... + def set_min_samples(self, count: SupportsInt) -> Callable[[_F], _F]: ... + def is_cpu_only(self, value: bool = True) -> Callable[[_F], _F]: ... + def set_is_cpu_only(self, value: bool) -> Callable[[_F], _F]: ... + +axis: _AxisDecorators +option: _OptionDecorators + +@overload +def register(fn: Callable[[State], None], /) -> Benchmark: ... +@overload +def register(fn: None = ..., /) -> Callable[[_F], _F]: ... def run_all_benchmarks(argv: Sequence[str]) -> None: ... class NVBenchRuntimeError(RuntimeError): ... diff --git a/python/cuda/bench/_decorators.py b/python/cuda/bench/_decorators.py new file mode 100644 index 0000000..a61b155 --- /dev/null +++ b/python/cuda/bench/_decorators.py @@ -0,0 +1,285 @@ +# Copyright 2026 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Function decorators for registering and configuring NVBench benchmarks.""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from typing import Any, TypeVar + +_F = TypeVar("_F", bound=Callable[..., Any]) +_Benchmark = Any +_RawRegister = Callable[[Callable[..., Any]], _Benchmark] +_RegisterGetter = Callable[[], _RawRegister] +_BenchmarkAction = Callable[[_Benchmark], None] +_BENCHMARK_ACTIONS_ATTR = "__cuda_bench_actions__" +_BENCHMARK_REGISTERED_ATTR = "__cuda_bench_registered__" + + +def _append_benchmark_action(action: _BenchmarkAction) -> Callable[[_F], _F]: + """Return a function-preserving decorator that records a benchmark action.""" + + def decorator(fn: _F) -> _F: + """Attach a delayed benchmark action to a benchmark function.""" + if getattr(fn, _BENCHMARK_REGISTERED_ATTR, False): + raise RuntimeError( + "NVBench axis and option decorators must be placed below " + "@bench.register()" + ) + + actions = getattr(fn, _BENCHMARK_ACTIONS_ATTR, None) + if actions is None: + actions = [] + try: + setattr(fn, _BENCHMARK_ACTIONS_ATTR, actions) + except AttributeError as e: + raise TypeError( + "NVBench benchmark decorators require a callable object" + ) from e + + actions.append(action) + return fn + + return decorator + + +def _apply_benchmark_actions( + benchmark: _Benchmark, fn: Callable[..., Any] +) -> _Benchmark: + """Apply delayed benchmark actions to a registered benchmark.""" + for action in reversed(getattr(fn, _BENCHMARK_ACTIONS_ATTR, ())): + action(benchmark) + + return benchmark + + +def _mark_registered(fn: Callable[..., Any]) -> None: + """Mark a callable as registered when it supports attribute assignment.""" + try: + setattr(fn, _BENCHMARK_REGISTERED_ATTR, True) + except AttributeError: + pass + + +def make_register(get_register: _RegisterGetter) -> Callable[..., Any]: + """Create the public 
``register`` function around a raw register function.""" + + def register(fn=None, /): + """Register a Python benchmark function with NVBench. + + Called as ``bench.register(fn)``, this returns the registered + ``Benchmark``. Called as ``@bench.register()``, this returns a decorator + that registers the function and leaves the decorated symbol unchanged. + """ + if fn is None: + + def decorator(benchmark_fn): + benchmark = get_register()(benchmark_fn) + _apply_benchmark_actions(benchmark, benchmark_fn) + _mark_registered(benchmark_fn) + return benchmark_fn + + return decorator + + benchmark = get_register()(fn) + _apply_benchmark_actions(benchmark, fn) + _mark_registered(fn) + return benchmark + + register.__name__ = "register" + register.__qualname__ = "register" + return register + + +class _AxisDecorators: + """Namespace for decorators that add axes to a benchmark.""" + + def int64(self, name: str, values: Sequence[int]) -> Callable[[_F], _F]: + """Add an ``int64`` axis to the decorated benchmark.""" + return _append_benchmark_action( + lambda benchmark: benchmark.add_int64_axis(name, values) + ) + + def add_int64_axis(self, name: str, values: Sequence[int]) -> Callable[[_F], _F]: + """Alias for :meth:`int64`.""" + return self.int64(name, values) + + def int64_power_of_two( + self, name: str, values: Sequence[int] + ) -> Callable[[_F], _F]: + """Add a power-of-two ``int64`` axis to the decorated benchmark.""" + return _append_benchmark_action( + lambda benchmark: benchmark.add_int64_power_of_two_axis(name, values) + ) + + def power_of_two(self, name: str, values: Sequence[int]) -> Callable[[_F], _F]: + """Alias for :meth:`int64_power_of_two`.""" + return self.int64_power_of_two(name, values) + + def add_int64_power_of_two_axis( + self, name: str, values: Sequence[int] + ) -> Callable[[_F], _F]: + """Alias for :meth:`int64_power_of_two`.""" + return self.int64_power_of_two(name, values) + + def float64(self, name: str, values: Sequence[float]) -> Callable[[_F], 
_F]: + """Add a ``float64`` axis to the decorated benchmark.""" + return _append_benchmark_action( + lambda benchmark: benchmark.add_float64_axis(name, values) + ) + + def add_float64_axis( + self, name: str, values: Sequence[float] + ) -> Callable[[_F], _F]: + """Alias for :meth:`float64`.""" + return self.float64(name, values) + + def string(self, name: str, values: Sequence[str]) -> Callable[[_F], _F]: + """Add a string axis to the decorated benchmark.""" + return _append_benchmark_action( + lambda benchmark: benchmark.add_string_axis(name, values) + ) + + def add_string_axis(self, name: str, values: Sequence[str]) -> Callable[[_F], _F]: + """Alias for :meth:`string`.""" + return self.string(name, values) + + +class _OptionDecorators: + """Namespace for decorators that set benchmark options.""" + + def name(self, value: str) -> Callable[[_F], _F]: + """Set the benchmark name.""" + return self.set_name(value) + + def set_name(self, value: str) -> Callable[[_F], _F]: + """Set the benchmark name.""" + return _append_benchmark_action(lambda benchmark: benchmark.set_name(value)) + + def run_once(self, value: bool = True) -> Callable[[_F], _F]: + """Set whether each benchmark configuration runs only once.""" + return self.set_run_once(value) + + def set_run_once(self, value: bool) -> Callable[[_F], _F]: + """Set whether each benchmark configuration runs only once.""" + return _append_benchmark_action(lambda benchmark: benchmark.set_run_once(value)) + + def skip_time(self, duration_seconds: float) -> Callable[[_F], _F]: + """Set the threshold below which benchmark runs are skipped.""" + return self.set_skip_time(duration_seconds) + + def set_skip_time(self, duration_seconds: float) -> Callable[[_F], _F]: + """Set the threshold below which benchmark runs are skipped.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_skip_time(duration_seconds) + ) + + def throttle_recovery_delay(self, delay_seconds: float) -> Callable[[_F], _F]: + """Set the delay 
after GPU clock throttling is detected.""" + return self.set_throttle_recovery_delay(delay_seconds) + + def set_throttle_recovery_delay(self, delay_seconds: float) -> Callable[[_F], _F]: + """Set the delay after GPU clock throttling is detected.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_throttle_recovery_delay(delay_seconds) + ) + + def throttle_threshold(self, threshold: float) -> Callable[[_F], _F]: + """Set the GPU clock throttle threshold.""" + return self.set_throttle_threshold(threshold) + + def set_throttle_threshold(self, threshold: float) -> Callable[[_F], _F]: + """Set the GPU clock throttle threshold.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_throttle_threshold(threshold) + ) + + def timeout(self, duration_seconds: float) -> Callable[[_F], _F]: + """Set the benchmark timeout in seconds.""" + return self.set_timeout(duration_seconds) + + def set_timeout(self, duration_seconds: float) -> Callable[[_F], _F]: + """Set the benchmark timeout in seconds.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_timeout(duration_seconds) + ) + + def stopping_criterion(self, criterion: str) -> Callable[[_F], _F]: + """Set the benchmark stopping criterion.""" + return self.set_stopping_criterion(criterion) + + def set_stopping_criterion(self, criterion: str) -> Callable[[_F], _F]: + """Set the benchmark stopping criterion.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_stopping_criterion(criterion) + ) + + def criterion_param_float64(self, name: str, value: float) -> Callable[[_F], _F]: + """Set a floating-point parameter for the stopping criterion.""" + return self.set_criterion_param_float64(name, value) + + def set_criterion_param_float64( + self, name: str, value: float + ) -> Callable[[_F], _F]: + """Set a floating-point parameter for the stopping criterion.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_criterion_param_float64(name, 
value) + ) + + def criterion_param_int64(self, name: str, value: int) -> Callable[[_F], _F]: + """Set an integer parameter for the stopping criterion.""" + return self.set_criterion_param_int64(name, value) + + def set_criterion_param_int64(self, name: str, value: int) -> Callable[[_F], _F]: + """Set an integer parameter for the stopping criterion.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_criterion_param_int64(name, value) + ) + + def criterion_param_string(self, name: str, value: str) -> Callable[[_F], _F]: + """Set a string parameter for the stopping criterion.""" + return self.set_criterion_param_string(name, value) + + def set_criterion_param_string(self, name: str, value: str) -> Callable[[_F], _F]: + """Set a string parameter for the stopping criterion.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_criterion_param_string(name, value) + ) + + def min_samples(self, count: int) -> Callable[[_F], _F]: + """Set the minimum number of samples to collect.""" + return self.set_min_samples(count) + + def set_min_samples(self, count: int) -> Callable[[_F], _F]: + """Set the minimum number of samples to collect.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_min_samples(count) + ) + + def is_cpu_only(self, value: bool = True) -> Callable[[_F], _F]: + """Set whether the benchmark only performs CPU work.""" + return self.set_is_cpu_only(value) + + def set_is_cpu_only(self, value: bool) -> Callable[[_F], _F]: + """Set whether the benchmark only performs CPU work.""" + return _append_benchmark_action( + lambda benchmark: benchmark.set_is_cpu_only(value) + ) + + +axis = _AxisDecorators() +option = _OptionDecorators() diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 5d41b09..7066944 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA 
Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -39,6 +39,9 @@ def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatc return kernel +@bench.register() +@bench.axis.int64("Stride", [1, 2, 4]) +@bench.axis.int64("ItemsPerThread", [1, 2, 3, 4]) def throughput_bench(state: bench.State) -> None: stride = state.get_int64("Stride") ipt = state.get_int64("ItemsPerThread") @@ -67,8 +70,4 @@ def throughput_bench(state: bench.State) -> None: if __name__ == "__main__": - b = bench.register(throughput_bench) - b.add_int64_axis("Stride", [1, 2, 4]) - b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/axes.py b/python/examples/axes.py index fa7c6c4..6afcedb 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -58,8 +58,9 @@ __global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") +@bench.register() +@bench.option.min_samples(1000) def simple(state: bench.State): - state.set_min_samples(1000) sleep_dur = 1e-3 krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) @@ -71,6 +72,8 @@ def simple(state: bench.State): state.exec(launcher) +@bench.register() +@bench.axis.float64("Duration (s)", [7e-5, 1e-4, 5e-4]) def single_float64_axis(state: bench.State): # get axis value, or default default_sleep_dur = 3.14e-4 @@ -85,6 +88,7 @@ def single_float64_axis(state: bench.State): state.exec(launcher) +@bench.register() def default_value(state: bench.State): single_float64_axis(state) @@ -120,6 +124,9 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n) 
return mod.get_kernel(instance_name) +@bench.register() +@bench.axis.int64("BlockSize", [2**x for x in range(6, 10, 2)]) +@bench.axis.int64("NumBlocks", [2**x for x in range(6, 10, 2)]) def copy_sweep_grid_shape(state: bench.State): block_size = state.get_int64("BlockSize") num_blocks = state.get_int64("NumBlocks") @@ -147,6 +154,8 @@ def copy_sweep_grid_shape(state: bench.State): state.exec(launcher) +@bench.register() +@bench.axis.int64("TypeID", range(0, 6)) def copy_type_sweep(state: bench.State): type_id = state.get_int64("TypeID") @@ -186,21 +195,4 @@ def copy_type_sweep(state: bench.State): if __name__ == "__main__": - # Benchmark without axes - bench.register(simple) - - # benchmark with no axes, that uses default value - bench.register(default_value) - # specify axis - bench.register(single_float64_axis).add_float64_axis( - "Duration (s)", [7e-5, 1e-4, 5e-4] - ) - - copy1_bench = bench.register(copy_sweep_grid_shape) - copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)]) - copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)]) - - copy2_bench = bench.register(copy_type_sweep) - copy2_bench.add_int64_axis("TypeID", range(0, 6)) - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cpu_activity.py b/python/examples/cpu_activity.py index df1efae..c724501 100644 --- a/python/examples/cpu_activity.py +++ b/python/examples/cpu_activity.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -24,6 +24,8 @@ import cuda.core as core host_sleep_duration = 0.1 +@bench.register() +@bench.option.is_cpu_only() def cpu_only_sleep_bench(state: bench.State) -> None: def launcher(launch: bench.Launch): time.sleep(host_sleep_duration) @@ -66,6 +68,8 @@ __global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") 
+@bench.register() +@bench.axis.string("Sync", ["Do not sync", "Do sync"]) def mixed_sleep_bench(state: bench.State) -> None: sync = state.get_string("Sync") sync_flag = sync == "Do sync" @@ -85,13 +89,4 @@ def mixed_sleep_bench(state: bench.State) -> None: if __name__ == "__main__": - # time function only doing work (sleeping) on the host - # using CPU timer only - b = bench.register(cpu_only_sleep_bench) - b.set_is_cpu_only(True) - - # time the function that does work on both GPU and CPU - b2 = bench.register(mixed_sleep_bench) - b2.add_string_axis("Sync", ["Do not sync", "Do sync"]) - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cuda_compute_segmented_reduce.py b/python/examples/cuda_compute_segmented_reduce.py index d1b53f2..17a01aa 100644 --- a/python/examples/cuda_compute_segmented_reduce.py +++ b/python/examples/cuda_compute_segmented_reduce.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -33,6 +33,9 @@ def as_cp_ExternalStream(cs: bench.CudaStream) -> cp.cuda.ExternalStream: return cp.cuda.Stream.from_external(cs) +@bench.register() +@bench.axis.int64("numElems", [2**20, 2**22, 2**24]) +@bench.axis.int64("numCols", [1024, 2048, 4096, 8192]) def segmented_reduce(state: bench.State): "Benchmark segmented_reduce example" n_elems = state.get_int64("numElems") @@ -111,8 +114,4 @@ def segmented_reduce(state: bench.State): if __name__ == "__main__": - b = bench.register(segmented_reduce) - b.add_int64_axis("numElems", [2**20, 2**22, 2**24]) - b.add_int64_axis("numCols", [1024, 2048, 4096, 8192]) - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cuda_coop_block_reduce.py b/python/examples/cuda_coop_block_reduce.py index 349db82..3789e28 100644 --- a/python/examples/cuda_coop_block_reduce.py +++ 
b/python/examples/cuda_coop_block_reduce.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -48,6 +48,9 @@ def as_cuda_Stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) +@bench.register() +@bench.axis.int64("ThreadsPerBlock", [64, 128, 192, 256]) +@bench.axis.power_of_two("NumBlocks", [10, 11, 12, 14, 16]) def multi_block_bench(state: bench.State): threads_per_block = state.get_int64("ThreadsPerBlock") num_blocks = state.get_int64("NumBlocks") @@ -91,8 +94,4 @@ def multi_block_bench(state: bench.State): if __name__ == "__main__": - b = bench.register(multi_block_bench) - b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256]) - b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16]) - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index d7f2a01..808fe05 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -24,6 +24,9 @@ def as_cp_ExternalStream(cs: bench.CudaStream): return cp.cuda.Stream.from_external(cs) +@bench.register() +@bench.axis.int64("numCols", [1024, 2048, 4096, 2 * 4096]) +@bench.axis.int64("numRows", [1024, 2048, 4096, 2 * 4096]) def cupy_extract_by_mask(state: bench.State): n_cols = state.get_int64("numCols") n_rows = state.get_int64("numRows") @@ -51,8 +54,4 @@ def cupy_extract_by_mask(state: bench.State): if __name__ == "__main__": - b = bench.register(cupy_extract_by_mask) - b.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096]) - 
b.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096]) - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cute_dsl_sgemm.py b/python/examples/cute_dsl_sgemm.py index eae3758..073f10e 100644 --- a/python/examples/cute_dsl_sgemm.py +++ b/python/examples/cute_dsl_sgemm.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -602,6 +602,9 @@ class SGemm: return +@bench.register() +@bench.axis.int64("R", [16, 64, 256]) +@bench.axis.int64("N", [256, 512, 1024, 2048]) def cutlass_gemm(state: bench.State) -> None: n = state.get_int64("N") r = state.get_int64("R") @@ -660,8 +663,4 @@ if __name__ == "__main__": # see https://github.com/NVIDIA/cutlass/issues/3142 patch_cute_dsl() - gemm_b = bench.register(cutlass_gemm) - gemm_b.add_int64_axis("R", [16, 64, 256]) - gemm_b.add_int64_axis("N", [256, 512, 1024, 2048]) - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py index 8d57311..cf4ec29 100644 --- a/python/examples/exec_tag_sync.py +++ b/python/examples/exec_tag_sync.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -57,6 +57,7 @@ __global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n) return mod.get_kernel(instance_name) +@bench.register() def synchronizing_bench(state: bench.State): n_values = 64 * 1024 * 1024 n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0)) @@ -81,5 +82,4 @@ def synchronizing_bench(state: bench.State): if __name__ == "__main__": - bench.register(synchronizing_bench) bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/pytorch_bench.py b/python/examples/pytorch_bench.py 
index f49a543..6be92d7 100644 --- a/python/examples/pytorch_bench.py +++ b/python/examples/pytorch_bench.py @@ -1,3 +1,19 @@ +# Copyright 2026 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import cuda.bench as bench @@ -12,9 +28,9 @@ def as_torch_cuda_Stream( ) +@bench.register() +@bench.option.throttle_threshold(0.25) def torch_bench(state: bench.State) -> None: - state.set_throttle_threshold(0.25) - dev_id = state.get_device() tc_s = as_torch_cuda_Stream(state.get_stream(), dev_id) @@ -53,6 +69,4 @@ def torch_bench(state: bench.State) -> None: if __name__ == "__main__": - bench.register(torch_bench) - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/skip.py b/python/examples/skip.py index f720003..25d01d1 100644 --- a/python/examples/skip.py +++ b/python/examples/skip.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -57,6 +57,9 @@ __global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") +@bench.register() +@bench.axis.float64("Duration", [1e-4 + k * 0.25e-3 for k in range(5)]) +@bench.axis.string("Kramble", ["Foo", "Bar", "Baz"]) def runtime_skip(state: bench.State): duration = state.get_float64("Duration") kramble = state.get_string("Kramble") @@ -82,8 +85,4 
@@ def runtime_skip(state: bench.State): if __name__ == "__main__": - b = bench.register(runtime_skip) - b.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)]) - b.add_string_axis("Kramble", ["Foo", "Bar", "Baz"]) - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/throughput.py b/python/examples/throughput.py index ff02bd3..06a6cf4 100644 --- a/python/examples/throughput.py +++ b/python/examples/throughput.py @@ -1,4 +1,4 @@ -# Copyright 2025 NVIDIA Corporation +# Copyright 2025-2026 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 with the LLVM exception # (the "License"); you may not use this file except in compliance with @@ -39,6 +39,9 @@ def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatc return kernel +@bench.register() +@bench.axis.int64("Stride", [1, 2, 4]) +@bench.axis.int64("ItemsPerThread", [1, 2, 3, 4]) def throughput_bench(state: bench.State) -> None: stride = state.get_int64("Stride") ipt = state.get_int64("ItemsPerThread") @@ -69,8 +72,4 @@ def throughput_bench(state: bench.State) -> None: if __name__ == "__main__": - b = bench.register(throughput_bench) - b.add_int64_axis("Stride", [1, 2, 4]) - b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - bench.run_all_benchmarks(sys.argv) diff --git a/python/test/test_cuda_bench.py b/python/test/test_cuda_bench.py index b63d24d..f626f85 100644 --- a/python/test/test_cuda_bench.py +++ b/python/test/test_cuda_bench.py @@ -1,4 +1,21 @@ +# Copyright 2026 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. 
+# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json +from typing import Union import cuda.bench as bench import pytest @@ -39,7 +56,7 @@ def test_cpu_only(): bench.run_all_benchmarks(["-q", "--profile"]) -def docstring_check(doc_str: str) -> None: +def docstring_check(doc_str: Union[str, None]) -> None: assert isinstance(doc_str, str) assert len(doc_str) > 0 @@ -56,6 +73,122 @@ def test_register_doc(): obj_has_docstring_check(bench.register) +def test_decorator_docstrings(): + obj_has_docstring_check(bench.axis) + obj_has_docstring_check(bench.axis.int64) + obj_has_docstring_check(bench.axis.add_int64_axis) + obj_has_docstring_check(bench.axis.int64_power_of_two) + obj_has_docstring_check(bench.axis.power_of_two) + obj_has_docstring_check(bench.axis.add_int64_power_of_two_axis) + obj_has_docstring_check(bench.axis.float64) + obj_has_docstring_check(bench.axis.add_float64_axis) + obj_has_docstring_check(bench.axis.string) + obj_has_docstring_check(bench.axis.add_string_axis) + + obj_has_docstring_check(bench.option) + obj_has_docstring_check(bench.option.name) + obj_has_docstring_check(bench.option.set_name) + obj_has_docstring_check(bench.option.run_once) + obj_has_docstring_check(bench.option.set_run_once) + obj_has_docstring_check(bench.option.skip_time) + obj_has_docstring_check(bench.option.set_skip_time) + obj_has_docstring_check(bench.option.throttle_recovery_delay) + obj_has_docstring_check(bench.option.set_throttle_recovery_delay) + obj_has_docstring_check(bench.option.throttle_threshold) + obj_has_docstring_check(bench.option.set_throttle_threshold) + 
obj_has_docstring_check(bench.option.timeout) + obj_has_docstring_check(bench.option.set_timeout) + obj_has_docstring_check(bench.option.stopping_criterion) + obj_has_docstring_check(bench.option.set_stopping_criterion) + obj_has_docstring_check(bench.option.criterion_param_float64) + obj_has_docstring_check(bench.option.set_criterion_param_float64) + obj_has_docstring_check(bench.option.criterion_param_int64) + obj_has_docstring_check(bench.option.set_criterion_param_int64) + obj_has_docstring_check(bench.option.criterion_param_string) + obj_has_docstring_check(bench.option.set_criterion_param_string) + obj_has_docstring_check(bench.option.min_samples) + obj_has_docstring_check(bench.option.set_min_samples) + obj_has_docstring_check(bench.option.is_cpu_only) + obj_has_docstring_check(bench.option.set_is_cpu_only) + + +def test_register_decorator_preserves_function_and_applies_options(monkeypatch): + class FakeBenchmark: + def __init__(self): + self.calls = [] + + def add_int64_axis(self, name, values): + self.calls.append(("int64", name, list(values))) + return self + + def set_min_samples(self, count): + self.calls.append(("min_samples", count)) + return self + + fake_benchmark = FakeBenchmark() + registered_functions = [] + + def fake_register(fn): + registered_functions.append(fn) + return fake_benchmark + + monkeypatch.setattr(bench, "_register", fake_register) + + @bench.register() + @bench.axis.int64("Elements", [1, 2, 3]) + @bench.option.min_samples(11) + def decorated(state: bench.State): + pass + + assert registered_functions == [decorated] + assert fake_benchmark.calls == [ + ("int64", "Elements", [1, 2, 3]), + ("min_samples", 11), + ] + assert callable(decorated) + + +def test_register_function_form_applies_decorated_options(monkeypatch): + class FakeBenchmark: + def __init__(self): + self.calls = [] + + def add_float64_axis(self, name, values): + self.calls.append(("float64", name, list(values))) + return self + + fake_benchmark = FakeBenchmark() + + 
def fake_register(fn): + return fake_benchmark + + monkeypatch.setattr(bench, "_register", fake_register) + + @bench.axis.float64("Duration", [0.1, 0.2]) + def decorated(state: bench.State): + pass + + assert bench.register(decorated) is fake_benchmark + assert fake_benchmark.calls == [("float64", "Duration", [0.1, 0.2])] + + +def test_option_decorators_reject_wrong_order(monkeypatch): + class FakeBenchmark: + pass + + def fake_register(fn): + return FakeBenchmark() + + monkeypatch.setattr(bench, "_register", fake_register) + + @bench.register() + def decorated(state: bench.State): + pass + + with pytest.raises(RuntimeError, match="must be placed below"): + bench.option.min_samples(3)(decorated) + + def test_run_all_benchmarks_doc(): obj_has_docstring_check(bench.run_all_benchmarks)