Merge pull request #237 from oleksandr-pavlyk/add-pynvbench

Python package pynvbench introduced that exposes `cuda.bench` namespace. Repository provides a set of examples.
This commit is contained in:
Oleksandr Pavlyk
2025-08-06 12:22:55 -05:00
committed by GitHub
24 changed files with 2780 additions and 0 deletions

4
python/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
build
nvbench_build
nvbench_install
__pycache__

41
python/CMakeLists.txt Normal file
View File

@@ -0,0 +1,41 @@
# Build script for the pynvbench extension module, driven by scikit-build-core
# (which defines SKBUILD_PROJECT_NAME at configure time).
cmake_minimum_required(VERSION 3.30...4.0)
# CUDA is transitive dependency of nvbench
project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX CUDA)

find_package(Python REQUIRED COMPONENTS Development.Module)
find_package(CUDAToolkit REQUIRED)

# Get CMake package manager
# CPM.cmake is downloaded at configure time, pinned by version tag and
# verified against a SHA256 checksum.
set(_cpm_download_location ${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake)
file(
  DOWNLOAD
  https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.42.0/CPM.cmake
  ${_cpm_download_location}
  EXPECTED_HASH SHA256=2020b4fc42dba44817983e06342e682ecfc3d2f484a581f11cc5731fbe4dce8a
)
include(${_cpm_download_location})

# Build nvbench from the parent directory of this python/ folder, unless an
# installed nvbench config package is found first (FIND_PACKAGE_ARGS).
CPMAddPackage(
  NAME nvbench
  SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/..
  OPTIONS "NVBench_INSTALL_RULES ON"
  FIND_PACKAGE_ARGS CONFIG REQUIRED
)
CPMAddPackage("gh:pybind/pybind11@3.0.0")

pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp)
target_link_libraries(_nvbench PUBLIC nvbench::nvbench)
target_link_libraries(_nvbench PRIVATE CUDA::cudart_static)
# $ORIGIN rpath lets the extension locate the nvbench shared library that is
# installed next to it (see the IMPORTED_RUNTIME_ARTIFACTS install below).
set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN")
set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON)

install(TARGETS _nvbench DESTINATION cuda/bench)
# Determine target that nvbench::nvbench is an alias of,
# necessary because ALIAS targets cannot be installed
get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET)
install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench)

61
python/README.md Normal file
View File

@@ -0,0 +1,61 @@
# CUDA Kernel Benchmarking Package
This package provides Python API to CUDA Kernel Benchmarking Library `NVBench`.
## Building
### Ensure recent version of CMake
Since `nvbench` requires a rather new version of CMake (>=3.30.4), either build CMake from sources, or create a conda environment with a recent version of CMake, using
```
conda create -n build_env --yes cmake ninja
conda activate build_env
```
### Ensure CUDA compiler
Since building `NVBench` library requires CUDA compiler, ensure that appropriate environment variables
are set. For example, assuming CUDA toolkit is installed system-wide, and assuming Ampere GPU architecture:
```bash
export CUDACXX=/usr/local/cuda/bin/nvcc
export CUDAARCHS=86
```
### Build Python project
Now switch to python folder, configure and install NVBench library, and install the package in editable mode:
```bash
cd nvbench/python
pip install -e .
```
### Verify that package works
```bash
python test/run_1.py
```
### Run examples
```bash
# Example benchmarking numba.cuda kernel
python examples/throughput.py
```
```bash
# Example benchmarking kernels authored using cuda.core
python examples/axes.py
```
```bash
# Example benchmarking algorithms from cuda.cccl.parallel
python examples/cccl_parallel_segmented_reduce.py
```
```bash
# Example benchmarking CuPy function
python examples/cupy_extract.py
```

View File

@@ -0,0 +1,59 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib.metadata
import warnings

from cuda.pathfinder import (  # type: ignore[import-not-found]
    load_nvidia_dynamic_lib,
)

try:
    __version__ = importlib.metadata.version("pynvbench")
except Exception as e:
    __version__ = "0.0.0dev"
    warnings.warn(
        "Could not retrieve version of pynvbench package dynamically from its metadata. "
        f"Exception {e} was raised. "
        f"Version is set to fall-back value '{__version__}' instead.",
        # Point the warning at the importer of cuda.bench, not at this line.
        stacklevel=2,
    )

# Preload NVIDIA profiling libraries so that the compiled extension module
# imported below can resolve them at load time.
for libname in ("cupti", "nvperf_target", "nvperf_host"):
    load_nvidia_dynamic_lib(libname)

# Re-export the public API of the compiled extension.  The "X as X" form marks
# the names as intentional re-exports for type checkers; the import must come
# after the library preloading above, hence the E402 suppression.
from cuda.bench._nvbench import (  # noqa: E402
    Benchmark as Benchmark,
    CudaStream as CudaStream,
    Launch as Launch,
    NVBenchRuntimeError as NVBenchRuntimeError,
    State as State,
    register as register,
    run_all_benchmarks as run_all_benchmarks,
)

del load_nvidia_dynamic_lib

View File

@@ -0,0 +1,312 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================
# PLEASE KEEP IN SYNC WITH py_nvbench.cpp FILE
# ============================================
# Please be sure to keep these type hints and docstring in sync
# with the pybind11 bindings in ``../../src/py_nvbench.cpp``
# Use mypy's stubgen to auto-generate stubs using
# ``stubgen -m cuda.nvbench._nvbench`` and compare
# stubs in generated out/cuda/nvbench/_nvbench.pyi
# with definitions given here.
from collections.abc import Callable, Sequence
from typing import Optional, Self, SupportsFloat, SupportsInt, Union
class CudaStream:
    """Represents CUDA stream

    Note
    ----
    The class is not user-constructible.
    """

    def __cuda_stream__(self) -> tuple[int, int]:
        """
        Special method implementing the CUDA stream protocol
        from `cuda.core`. Returns a pair of integers:
        (protocol_version, integral_value_of_cudaStream_t pointer)

        Example
        -------
        import cuda.core.experimental as core
        import cuda.bench as bench

        def my_bench(state: bench.State):
            dev = core.Device(state.get_device())
            dev.set_current()
            # converts CudaStream to core.Stream
            # using __cuda_stream__ protocol
            dev.create_stream(state.get_stream())
        """
        ...

    def addressof(self) -> int:
        "Integral value of address of driver's CUDA stream struct"
        ...
class Benchmark:
    """Represents NVBench benchmark.

    Note
    ----
    The class is not user-constructible.
    Use `~register` function to create Benchmark and register
    it with NVBench.
    """

    def get_name(self) -> str:
        "Get benchmark name"
        ...

    def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self:
        "Add integral type parameter axis with given name and values to sweep over"
        ...

    def add_int64_power_of_two_axis(
        self, name: str, values: Sequence[SupportsInt]
    ) -> Self:
        # NOTE(review): presumably the given values are exponents and the swept
        # axis values are 2**v, per nvbench's power-of-two axes -- confirm
        # against py_nvbench.cpp.
        "Add integral type parameter axis with given name and values to sweep over"
        ...

    def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self:
        "Add floating-point type parameter axis with given name and values to sweep over"
        ...

    def add_string_axis(self, name: str, values: Sequence[str]) -> Self:
        "Add string type parameter axis with given name and values to sweep over"
        ...

    def set_name(self, name: str) -> Self:
        "Set benchmark name"
        ...

    def set_is_cpu_only(self, is_cpu_only: bool) -> Self:
        "Set whether this benchmark only executes on CPU"
        ...

    def set_run_once(self, v: bool) -> Self:
        "Set whether all benchmark configurations are executed only once"
        ...

    def set_skip_time(self, duration_seconds: SupportsFloat) -> Self:
        "Set run durations, in seconds, that should be skipped"
        ...

    def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self:
        "Set throttle recovery delay, in seconds"
        ...

    def set_throttle_threshold(self, threshold: SupportsFloat) -> Self:
        "Set throttle threshold, as a fraction of maximal GPU frequency"
        ...

    def set_timeout(self, duration_seconds: SupportsFloat) -> Self:
        "Set benchmark run duration timeout value, in seconds"
        ...

    def set_stopping_criterion(self, criterion: str) -> Self:
        "Set stopping criterion to be used"
        ...

    def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self:
        "Set stopping criterion floating point parameter value"
        ...

    def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self:
        "Set stopping criterion integer parameter value"
        ...

    def set_criterion_param_string(self, name: str, value: str) -> Self:
        "Set stopping criterion string parameter value"
        ...

    def set_min_samples(self, count: SupportsInt) -> Self:
        "Set minimal samples count before stopping criterion applies"
        ...
class Launch:
    """Configuration object for function launch.

    Note
    ----
    The class is not user-constructible.
    It is passed to the callable given to `State.exec`.
    """

    def get_stream(self) -> CudaStream:
        "Get CUDA stream of this configuration"
        ...
class State:
    """Represent benchmark configuration state.

    Note
    ----
    The class is not user-constructible.
    """

    def has_device(self) -> bool:
        "True if configuration has a device"
        ...

    def has_printers(self) -> bool:
        "True if configuration has a printer"
        ...

    def get_device(self) -> Union[int, None]:
        "Get device_id of the device from this configuration, or None if it has no device"
        ...

    def get_stream(self) -> CudaStream:
        "CudaStream object from this configuration"
        ...

    def get_int64(self, name: str) -> int:
        "Get value for given Int64 axis from this configuration"
        ...

    def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int:
        "Get value for given Int64 axis from this configuration, or default_value if the axis is not present"
        ...

    def get_float64(self, name: str) -> float:
        "Get value for given Float64 axis from this configuration"
        ...

    def get_float64_or_default(self, name: str, default_value: SupportsFloat) -> float:
        "Get value for given Float64 axis from this configuration, or default_value if the axis is not present"
        ...

    def get_string(self, name: str) -> str:
        "Get value for given String axis from this configuration"
        ...

    def get_string_or_default(self, name: str, default_value: str) -> str:
        "Get value for given String axis from this configuration, or default_value if the axis is not present"
        ...

    def add_element_count(
        self, count: SupportsInt, column_name: Optional[str] = None
    ) -> None:
        "Add element count"
        ...

    def set_element_count(self, count: SupportsInt) -> None:
        "Set element count"
        ...

    def get_element_count(self) -> int:
        "Get element count"
        ...

    def skip(self, reason: str) -> None:
        "Skip this configuration"
        ...

    def is_skipped(self) -> bool:
        "Has this configuration been skipped"
        ...

    def get_skip_reason(self) -> str:
        "Get reason provided for skipping this configuration"
        ...

    def add_global_memory_reads(
        self, nbytes: SupportsInt, /, column_name: str = ""
    ) -> None:
        "Inform NVBench that given amount of bytes is being read by the benchmark from global memory"
        ...

    def add_global_memory_writes(
        self, nbytes: SupportsInt, /, column_name: str = ""
    ) -> None:
        "Inform NVBench that given amount of bytes is being written by the benchmark into global memory"
        ...

    def get_benchmark(self) -> Benchmark:
        "Get Benchmark this configuration is a part of"
        ...

    def get_throttle_threshold(self) -> float:
        "Get throttle threshold value, as fraction of maximal frequency"
        ...

    def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None:
        "Set throttle threshold fraction to specified value, expected to be between 0 and 1"
        ...

    def get_min_samples(self) -> int:
        "Get the number of benchmark timings NVBench performs before stopping criterion begins being used"
        ...

    def set_min_samples(self, min_samples_count: SupportsInt) -> None:
        "Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used"
        ...

    def get_disable_blocking_kernel(self) -> bool:
        "True if use of blocking kernel by NVBench is disabled, False otherwise"
        ...

    def set_disable_blocking_kernel(self, flag: bool) -> None:
        "Use flag = True to disable use of blocking kernel by NVBench"
        ...

    def get_run_once(self) -> bool:
        "Boolean flag whether configuration should only run once"
        ...

    def set_run_once(self, run_once_flag: bool) -> None:
        "Set run-once flag for this configuration"
        ...

    def get_timeout(self) -> float:
        "Get time-out value for benchmark execution of this configuration, in seconds"
        ...

    def set_timeout(self, duration: SupportsFloat) -> None:
        "Set time-out value for benchmark execution of this configuration, in seconds"
        ...

    def get_blocking_kernel_timeout(self) -> float:
        "Get time-out value for execution of blocking kernel, in seconds"
        ...

    def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None:
        "Set time-out value for execution of blocking kernel, in seconds"
        ...

    def collect_cupti_metrics(self) -> None:
        "Request NVBench to record CUPTI metrics while running benchmark for this configuration"
        ...

    def is_cupti_required(self) -> bool:
        "True if (some) CUPTI metrics are being collected"
        ...

    def exec(
        self,
        fn: Callable[[Launch], None],
        /,
        *,
        batched: Optional[bool] = True,
        sync: Optional[bool] = False,
    ) -> None:
        """Execute callable running the benchmark.

        The callable may be executed multiple times.

        Parameters
        ----------
        fn: Callable
            Python callable with signature fn(Launch) -> None that executes the benchmark.
        batched: bool, optional
            If `True`, no cache flushing is performed between callable invocations.
            Default: `True`.
        sync: bool, optional
            True value indicates that callable performs device synchronization.
            NVBench disables use of blocking kernel in this case.
            Default: `False`.
        """
        ...

    def get_short_description(self) -> str:
        "Get short description for this configuration"
        ...

    def add_summary(
        self, column_name: str, value: Union[SupportsInt, SupportsFloat, str]
    ) -> None:
        "Add summary column with a value"
        ...

    def get_axis_values(self) -> dict[str, int | float | str]:
        "Get dictionary with axis values for this configuration"
        ...

    def get_axis_values_as_string(self) -> str:
        "Get string of space-separated name=value pairs for this configuration"
        ...

    def get_stopping_criterion(self) -> str:
        "Get string name of stopping criterion used"
        ...
def register(fn: Callable[[State], None]) -> Benchmark:
    """
    Register given benchmarking function with NVBench.

    Parameters
    ----------
    fn: Callable
        Python callable with signature fn(State) -> None implementing the benchmark.

    Returns
    -------
    Benchmark
        The newly created benchmark; use it to add parameter axes and settings.
    """
    ...
def run_all_benchmarks(argv: Sequence[str]) -> None:
    """
    Run all benchmarks registered with NVBench.

    Parameters
    ----------
    argv: Sequence[str]
        Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`.
    """
    ...
class NVBenchRuntimeError(RuntimeError):
    """An exception raised if running benchmarks encounters an error.

    Subclasses ``RuntimeError``, so existing ``except RuntimeError`` handlers
    also catch it.
    """

    ...

View File

View File

@@ -0,0 +1,75 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import numpy as np
from numba import cuda
def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a numba.cuda
    # external stream object.
    return cuda.external_stream(cs.addressof())
def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
    """JIT a copy kernel specialized for the given items_per_thread.

    items_per_thread is captured as a compile-time constant in the closure so
    numba can unroll/specialize the inner loop.
    """

    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        tid = cuda.grid(1)
        # Total number of threads in the grid; each thread strides by this.
        step = cuda.gridDim.x * cuda.blockDim.x
        for i in range(stride * tid, stride * elements, stride * step):
            for j in range(items_per_thread):
                # Read index wraps modulo the input length.
                read_id = (items_per_thread * i + j) % elements
                # NOTE(review): write_id uses tid rather than i, so for
                # stride > 1 successive outer iterations of the same thread
                # overwrite the same output slots -- confirm this is intended
                # for a pure-throughput measurement.
                write_id = tid + j * elements
                out_arr[write_id] = in_arr[read_id]

    return kernel
def throughput_bench(state: bench.State) -> None:
    """Benchmark a numba.cuda copy kernel, sweeping Stride and ItemsPerThread."""
    stride = state.get_int64("Stride")
    ipt = state.get_int64("ItemsPerThread")
    # 128 MiB of int32 input data.
    nbytes = 128 * 1024 * 1024
    elements = nbytes // np.dtype(np.int32).itemsize
    # Allocate on the stream NVBench associates with this configuration.
    alloc_stream = as_cuda_stream(state.get_stream())
    inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
    # Output holds ItemsPerThread values per input element.
    out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)
    state.add_element_count(elements, column_name="Elements")
    state.collect_cupti_metrics()
    threads_per_block = 256
    # Ceiling division so every element is covered by a thread.
    blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block
    krn = make_throughput_kernel(ipt)

    def launcher(launch: bench.Launch):
        # Launch on the per-measurement stream provided by NVBench.
        exec_stream = as_cuda_stream(launch.get_stream())
        krn[blocks_in_grid, threads_per_block, exec_stream, 0](
            stride, elements, inp_arr, out_arr
        )

    state.exec(launcher)
if __name__ == "__main__":
    # Register the benchmark and define the two-dimensional parameter sweep.
    b = bench.register(throughput_bench)
    b.add_int64_axis("Stride", [1, 2, 4])
    b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
    bench.run_all_benchmarks(sys.argv)

206
python/examples/axes.py Normal file
View File

@@ -0,0 +1,206 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ctypes
import sys
from typing import Dict, Optional, Tuple
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a cuda.core Stream.
    return core.Stream.from_handle(cs.addressof())
def make_sleep_kernel():
    """JITs sleep_kernel(seconds)

    Compiles a CUDA C++ kernel that busy-waits for the requested wall-clock
    duration, using the libcudacxx headers shipped with cuda.cccl.
    """
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/chrono>

    // Each launched thread just sleeps for `seconds`.
    __global__ void sleep_kernel(double seconds) {
      namespace chrono = ::cuda::std::chrono;
      using hr_clock = chrono::high_resolution_clock;

      auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
      const auto ns = chrono::nanoseconds(duration);
      const auto start = hr_clock::now();
      const auto finish = start + ns;

      auto now = hr_clock::now();
      while (now < finish)
      {
        now = hr_clock::now();
      }
    }
    """
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
def simple(state: bench.State):
    """Benchmark a fixed-duration GPU sleep kernel; no parameter axes."""
    state.set_min_samples(1000)
    sleep_dur = 1e-3
    krn = make_sleep_kernel()
    # Single-thread launch: the kernel body only spins on the clock.
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, sleep_dur)

    state.exec(launcher)
def single_float64_axis(state: bench.State):
    """Benchmark a GPU sleep kernel whose duration comes from a Float64 axis,
    falling back to a default when the axis is absent."""
    # get axis value, or default
    default_sleep_dur = 3.14e-4
    sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur)
    krn = make_sleep_kernel()
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, sleep_dur)

    state.exec(launcher)
def default_value(state: bench.State):
    # Registered without any axis, so single_float64_axis's
    # get_float64_or_default falls back to its default duration.
    single_float64_axis(state)
def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None):
    """JIT copy_kernel<T, U> for the given CUDA type names.

    Parameters
    ----------
    in_type, out_type: str, optional
        CUDA C++ type names for the template instantiation; both default to
        ``::cuda::std::int32_t`` when None.
    """
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/cstddef>

    /*!
     * Naive copy of `n` values from `in` -> `out`.
     */
    template <typename T, typename U>
    __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
    {
      const auto init = blockIdx.x * blockDim.x + threadIdx.x;
      const auto step = blockDim.x * gridDim.x;

      for (auto i = init; i < n; i += step)
      {
        out[i] = static_cast<U>(in[i]);
      }
    }
    """
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    if in_type is None:
        in_type = "::cuda::std::int32_t"
    if out_type is None:
        out_type = "::cuda::std::int32_t"
    instance_name = f"copy_kernel<{in_type}, {out_type}>"
    mod = prog.compile("cubin", name_expressions=(instance_name,))
    return mod.get_kernel(instance_name)
def copy_sweep_grid_shape(state: bench.State):
    """Copy 256 MiB of int32 values, sweeping block size and grid size."""
    block_size = state.get_int64("BlockSize")
    num_blocks = state.get_int64("NumBlocks")
    # Number of int32 elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))
    state.add_element_count(num_values)
    # One read and one write of the whole buffer per kernel invocation.
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)
    dev_id = state.get_device()
    alloc_s = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    krn = make_copy_kernel()
    launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)

    def launcher(launch: bench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
def copy_type_sweep(state: bench.State):
    """Copy 256 MiB, sweeping the element type via an integer TypeID axis."""
    type_id = state.get_int64("TypeID")
    # Map axis value -> (ctypes type for sizing, CUDA C++ type name for JIT).
    types_map: Dict[int, Tuple[type, str]] = {
        0: (ctypes.c_uint8, "cuda::std::uint8_t"),
        1: (ctypes.c_uint16, "cuda::std::uint16_t"),
        2: (ctypes.c_uint32, "cuda::std::uint32_t"),
        3: (ctypes.c_uint64, "cuda::std::uint64_t"),
        4: (ctypes.c_float, "float"),
        5: (ctypes.c_double, "double"),
    }
    value_ctype, value_cuda_t = types_map[type_id]
    # Show the human-readable type name in the report.
    state.add_summary("Type", value_cuda_t)
    # Number of elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(value_ctype)
    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)
    dev_id = state.get_device()
    alloc_s = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    krn = make_copy_kernel(value_cuda_t, value_cuda_t)
    launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: bench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
if __name__ == "__main__":
    # Benchmark without axes
    bench.register(simple)
    # benchmark with no axes, that uses default value
    bench.register(default_value)
    # specify axis -- the axis name must match the name looked up by
    # single_float64_axis via get_float64_or_default("Duration", ...);
    # a mismatched name would make every configuration silently fall back
    # to the default duration instead of sweeping these values.
    bench.register(single_float64_axis).add_float64_axis(
        "Duration", [7e-5, 1e-4, 5e-4]
    )
    copy1_bench = bench.register(copy_sweep_grid_shape)
    copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)])
    copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
    copy2_bench = bench.register(copy_type_sweep)
    copy2_bench.add_int64_axis("TypeID", range(0, 6))
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,99 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cuda.cccl.cooperative.experimental as coop
import numba
import numpy as np
from numba import cuda
from pynvjitlink import patch
class BitsetRing:
    """
    Addition operation over ring fixed width unsigned integers
    with ring_plus = bitwise_or and ring_mul = bitwise_and,
    ring_zero = 0, ring_one = -1
    """

    def __init__(self):
        # Ring elements are 64-bit unsigned integers.
        self.dt = np.uint64
        # Additive identity: all bits cleared.
        self.zero = self.dt(0)
        # Multiplicative identity: all bits set (uint64 max == ~0).
        self.one = self.dt(np.iinfo(self.dt).max)

    @staticmethod
    def add(op1, op2):
        # Ring addition is bitwise OR.
        return op1 | op2

    @staticmethod
    def mul(op1, op2):
        # Ring multiplication is bitwise AND.
        return op1 & op2
def as_cuda_Stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a numba.cuda
    # external stream object.
    return cuda.external_stream(cs.addressof())
def multi_block_bench(state: bench.State):
    """Benchmark a block-wide reduction (bitwise OR) built with
    cuda.cccl.cooperative, one result per block."""
    threads_per_block = state.get_int64("ThreadsPerBlock")
    num_blocks = state.get_int64("NumBlocks")
    total_elements = threads_per_block * num_blocks
    # Skip configurations whose input would exceed 2**26 elements.
    if total_elements > 2**26:
        state.skip(reason="Memory footprint over threshold")
        return
    ring = BitsetRing()
    # Block-wide reduce specialized for this block size and the ring's add op.
    block_reduce = coop.block.reduce(numba.uint64, threads_per_block, BitsetRing.add)

    @cuda.jit(link=block_reduce.files)
    def kernel(inp_arr, out_arr):
        # Each thread contributes one element
        block_idx = cuda.blockIdx.x
        thread_idx = cuda.threadIdx.x
        global_idx = block_idx * threads_per_block + thread_idx
        block_output = block_reduce(inp_arr[global_idx])
        # Only thread 0 of each block writes the result
        if thread_idx == 0:
            out_arr[block_idx] = block_output

    h_inp = np.arange(1, total_elements + 1, dtype=ring.dt)
    d_inp = cuda.to_device(h_inp)
    d_out = cuda.device_array(num_blocks, dtype=ring.dt)
    state.add_element_count(total_elements)
    state.add_global_memory_reads(total_elements * h_inp.itemsize)
    state.add_global_memory_writes(num_blocks * h_inp.itemsize)

    def launcher(launch: bench.Launch):
        cuda_s = as_cuda_Stream(launch.get_stream())
        kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)

    state.exec(launcher)
if __name__ == "__main__":
    # Enable LTO-capable linking in numba via pynvjitlink before JIT-ing,
    # required for linking the cooperative block_reduce files.
    patch.patch_numba_linker(lto=True)
    b = bench.register(multi_block_bench)
    b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256])
    b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,124 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cuda.cccl.parallel.experimental.algorithms as algorithms
import cuda.cccl.parallel.experimental.iterators as iterators
import cuda.core.experimental as core
import cupy as cp
import numpy as np
class CCCLStream:
    """Minimal stream-protocol wrapper around a raw cudaStream_t handle.

    Class to work around https://github.com/NVIDIA/cccl/issues/5144
    """

    def __init__(self, ptr):
        # Integral cudaStream_t handle value.
        self._ptr = ptr

    def __cuda_stream__(self):
        # CUDA stream protocol: (protocol_version, stream_handle).
        return 0, self._ptr
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a cuda.core Stream.
    return core.Stream.from_handle(cs.addressof())
def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
    # Wrap the raw cudaStream_t address in the local workaround class.
    return CCCLStream(cs.addressof())
def as_cp_ExternalStream(
    cs: bench.CudaStream, dev_id: int | None = -1
) -> cp.cuda.ExternalStream:
    # Wrap the raw cudaStream_t address as a CuPy external stream bound
    # to the given device id.
    h = cs.addressof()
    return cp.cuda.ExternalStream(h, dev_id)
def segmented_reduce(state: bench.State):
    "Benchmark segmented_reduce example"
    n_elems = state.get_int64("numElems")
    n_cols = state.get_int64("numCols")
    n_rows = n_elems // n_cols
    state.add_summary("numRows", n_rows)
    state.collect_cupti_metrics()
    dev_id = state.get_device()
    cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)

    # Binary reduction operator applied within each segment.
    def add_op(a, b):
        return a + b

    def make_scaler(step):
        # Builds row_id -> row_id * step, mapping a row index to its
        # starting offset in the flattened row-major matrix.
        def scale(row_id):
            return row_id * step

        return scale

    zero = np.int32(0)
    row_offset = make_scaler(np.int32(n_cols))
    start_offsets = iterators.TransformIterator(
        iterators.CountingIterator(zero), row_offset
    )
    # End offset of segment i is the start offset of segment i + 1.
    end_offsets = start_offsets + 1
    h_init = np.zeros(tuple(), dtype=np.int32)
    with cp_stream:
        rng = cp.random.default_rng()
        mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols))
        d_input = mat
        d_output = cp.empty(n_rows, dtype=d_input.dtype)
    alg = algorithms.segmented_reduce(
        d_input, d_output, start_offsets, end_offsets, add_op, h_init
    )
    cccl_stream = as_cccl_Stream(state.get_stream())
    # query size of temporary storage and allocate
    temp_nbytes = alg(
        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
    )
    # NOTE(review): h_init is re-created here with the same value as above;
    # presumably redundant -- confirm and drop one of the two assignments.
    h_init = np.zeros(tuple(), dtype=np.int32)
    with cp_stream:
        temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)

    def launcher(launch: bench.Launch):
        s = as_cccl_Stream(launch.get_stream())
        alg(
            temp_storage,
            d_input,
            d_output,
            n_rows,
            start_offsets,
            end_offsets,
            h_init,
            s,
        )

    state.exec(launcher)
if __name__ == "__main__":
    # Sweep total element count and segment width (columns per row).
    b = bench.register(segmented_reduce)
    b.add_int64_axis("numElems", [2**20, 2**22, 2**24])
    b.add_int64_axis("numCols", [1024, 2048, 4096, 8192])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,97 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
host_sleep_duration = 0.1
def cpu_only_sleep_bench(state: bench.State) -> None:
    """Benchmark that only sleeps on the host; registered as CPU-only."""

    def launcher(launch: bench.Launch):
        time.sleep(host_sleep_duration)

    state.exec(launcher)
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a cuda.core Stream.
    return core.Stream.from_handle(cs.addressof())
def make_sleep_kernel():
    """JITs sleep_kernel(seconds)

    Compiles a CUDA C++ kernel that busy-waits for the requested wall-clock
    duration, using the libcudacxx headers shipped with cuda.cccl.
    """
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/chrono>

    // Each launched thread just sleeps for `seconds`.
    __global__ void sleep_kernel(double seconds) {
      namespace chrono = ::cuda::std::chrono;
      using hr_clock = chrono::high_resolution_clock;

      auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
      const auto ns = chrono::nanoseconds(duration);
      const auto start = hr_clock::now();
      const auto finish = start + ns;

      auto now = hr_clock::now();
      while (now < finish)
      {
        now = hr_clock::now();
      }
    }
    """
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
def mixed_sleep_bench(state: bench.State) -> None:
    """Benchmark combining host-side sleep and a GPU sleep kernel.

    The "Sync" string axis selects whether exec() is told the callable
    performs device synchronization.
    """
    sync = state.get_string("Sync")
    sync_flag = sync == "Do sync"
    gpu_sleep_dur = 225e-3
    krn = make_sleep_kernel()
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        # host overhead
        time.sleep(host_sleep_duration)
        # GPU computation
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, gpu_sleep_dur)

    state.exec(launcher, sync=sync_flag)
if __name__ == "__main__":
    # time function only doing work (sleeping) on the host
    # using CPU timer only
    b = bench.register(cpu_only_sleep_bench)
    b.set_is_cpu_only(True)
    # time the function that does work on both GPU and CPU,
    # once per value of the "Sync" axis
    b2 = bench.register(mixed_sleep_bench)
    b2.add_string_axis("Sync", ["Do not sync", "Do sync"])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,63 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cupy as cp
def as_cp_ExternalStream(
    cs: "bench.CudaStream", dev_id: "int | None" = -1
) -> "cp.cuda.ExternalStream":
    """Wrap NVBench's native CUDA stream in a non-owning CuPy ExternalStream.

    :param cs: stream object exposed by NVBench (must provide ``addressof()``).
    :param dev_id: CUDA device id for the stream; -1 (CuPy's default) means
        the device is looked up on first use.

    Annotations are quoted deliberately: the package declares
    ``requires-python >= 3.9`` and PEP 604 unions (``int | None``) are only
    evaluable at definition time on Python >= 3.10.
    """
    h = cs.addressof()
    return cp.cuda.ExternalStream(h, dev_id)
def cupy_extract_by_mask(state: bench.State):
    """Benchmark CuPy boolean-mask extraction ``X[mask]`` on int32 data.

    Axes "numCols"/"numRows" select the array shape; note the arrays are
    allocated as ``(numCols, numRows)``.
    """
    n_cols = state.get_int64("numCols")
    n_rows = state.get_int64("numRows")
    dev_id = state.get_device()
    # View NVBench's stream as a CuPy stream for allocation and warm-up.
    cp_s = as_cp_ExternalStream(state.get_stream(), dev_id)
    # Request CUPTI metric collection for this benchmark.
    state.collect_cupti_metrics()
    state.add_element_count(n_rows * n_cols, "# Elements")
    int32_dt = cp.dtype(cp.int32)
    bool_dt = cp.dtype(cp.bool_)
    # Reads: the int32 payload plus the boolean mask.
    state.add_global_memory_reads(
        n_rows * n_cols * (int32_dt.itemsize + bool_dt.itemsize)
    )
    # Writes: every element is extracted, since the mask is all-ones.
    state.add_global_memory_writes(n_rows * n_cols * (int32_dt.itemsize))
    with cp_s:
        X = cp.full((n_cols, n_rows), fill_value=3, dtype=int32_dt)
        mask = cp.ones((n_cols, n_rows), dtype=bool_dt)
        # Warm-up run outside of the timed region.
        _ = X[mask]

    def launcher(launch: bench.Launch):
        # Run the extraction on the per-launch stream NVBench provides.
        with as_cp_ExternalStream(launch.get_stream(), dev_id):
            _ = X[mask]

    # NOTE(review): sync=True requests synchronization around each launch;
    # presumably needed because the CuPy call may synchronize internally —
    # confirm.
    state.exec(launcher, sync=True)
if __name__ == "__main__":
    benchmark = bench.register(cupy_extract_by_mask)
    # Same size sweep for both dimensions.
    benchmark.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096])
    benchmark.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,113 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cuda.bindings.driver as driver
import cuda.core.experimental as core
import cupy as cp
import cutlass
import numpy as np
def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
    """View NVBench's native stream as a cuda.bindings CUstream handle."""
    handle = cs.addressof()
    return driver.CUstream(handle)
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    """View NVBench's native stream as a non-owning cuda.core Stream."""
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
def make_cp_array(
    arr_h: "np.ndarray", dev_buf: "core.Buffer", dev_id: "int | None"
) -> "cp.ndarray":
    """Create a CuPy ndarray view over a cuda.core device buffer.

    The returned array mirrors ``arr_h``'s shape and dtype; passing
    ``dev_buf`` as the UnownedMemory owner keeps the buffer alive for the
    lifetime of the view.

    Annotations are quoted deliberately: the package declares
    ``requires-python >= 3.9`` and PEP 604 unions (``int | None``) are only
    evaluable at definition time on Python >= 3.10.
    """
    # CuPy uses -1 to mean "current device" when dev_id is None.
    cp_memview = cp.cuda.UnownedMemory(
        int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id
    )
    zero_offset = 0
    return cp.ndarray(
        arr_h.shape,
        dtype=arr_h.dtype,
        memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
    )
def cutlass_gemm(state: bench.State) -> None:
    """Benchmark ``D = alpha * (A @ B) + C`` with a CUTLASS GEMM plan.

    Axes: "N" (output size), "R" (inner/reduction size), "alpha" (scale).
    Operands: A is (N, R) random, B = A^T, C = I, D starts at zero; float64.
    """
    n = state.get_int64("N")
    r = state.get_int64("R")
    alpha = state.get_float64("alpha")
    dt = np.float64
    A_h = np.random.randn(n, r).astype(dt)
    # .mT is the matrix-transpose property (on ndarray requires NumPy >= 2.0).
    B_h = np.copy(A_h.mT)
    C_h = np.eye(n, dtype=dt)
    D_h = np.zeros_like(C_h)
    if n >= 1024:
        # allow more time for large inputs
        state.set_timeout(360)
    dev_id = state.get_device()
    cs = state.get_stream()
    # Same NVBench stream viewed through two APIs: a raw CUstream for the
    # driver memcpys / cutlass, and a cuda.core Stream for allocations.
    s = as_bindings_Stream(cs)
    core_s = as_core_Stream(cs)
    # NOTE(review): a fresh DeviceMemoryResource is constructed per
    # allocation — presumably fine, but confirm a shared one isn't intended.
    A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s)
    B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s)
    C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s)
    D_d = core.DeviceMemoryResource(dev_id).allocate(D_h.nbytes, core_s)
    # Asynchronous host->device copies of all operands on NVBench's stream.
    driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s)
    driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s)
    driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s)
    driver.cuMemcpyAsync(D_d.handle, D_h.ctypes.data, D_h.nbytes, s)
    # CuPy views over the device buffers, as expected by the cutlass API.
    A_cp = make_cp_array(A_h, A_d, dev_id)
    B_cp = make_cp_array(B_h, B_d, dev_id)
    C_cp = make_cp_array(C_h, C_d, dev_id)
    D_cp = make_cp_array(D_h, D_d, dev_id)
    plan = cutlass.op.Gemm(
        A=A_cp,
        B=B_cp,
        C=C_cp,
        D=D_cp,
        element=dt,
        alpha=alpha,
        beta=1,
        layout=cutlass.LayoutType.RowMajor,
    )
    # warm-up to ensure compilation is not timed
    plan.run(stream=s)

    def launcher(launch: bench.Launch) -> None:
        # Run the prepared plan on the per-launch stream, without syncing.
        s = as_bindings_Stream(launch.get_stream())
        plan.run(stream=s, sync=False)

    state.exec(launcher)
if __name__ == "__main__":
    gemm_bench = bench.register(cutlass_gemm)
    # Sweep reduction size, output size, and the scaling factor.
    gemm_bench.add_int64_axis("R", [16, 64, 256])
    gemm_bench.add_int64_axis("N", [256, 512, 1024, 2048])
    gemm_bench.add_float64_axis("alpha", [1e-2])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,85 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ctypes
import sys
from typing import Optional
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    "Create view of native stream used by NVBench"
    native_handle = cs.addressof()
    return core.Stream.from_handle(native_handle)
def make_fill_kernel(data_type: Optional[str] = None):
    """JIT-compile ``fill_kernel<data_type>`` and return the kernel object.

    :param data_type: C++ element type to instantiate the template with;
        defaults to ``::cuda::std::int32_t``.
    """
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/cstddef>
/*!
 * Naive setting of values in buffer
 */
template <typename T>
__global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n)
{
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;
  for (auto i = init; i < n; i += step)
  {
    buf[i] = v;
  }
}
"""
    # libcudacxx include path is needed for the <cuda/std/...> headers.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    if data_type is None:
        data_type = "::cuda::std::int32_t"
    # Request the concrete template instantiation by name.
    instance_name = f"fill_kernel<{data_type}>"
    mod = prog.compile("cubin", name_expressions=(instance_name,))
    return mod.get_kernel(instance_name)
def synchronizing_bench(state: bench.State):
    """Fill a 64M-element int32 device buffer; the launcher syncs its stream."""
    n_values = 64 * 1024 * 1024
    n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0))

    alloc_stream = as_core_Stream(state.get_stream())
    dev_buffer = core.DeviceMemoryResource(state.get_device()).allocate(
        n_bytes, alloc_stream
    )

    state.add_element_count(n_values, "Items")
    state.add_global_memory_writes(n_bytes, "Size")

    kernel = make_fill_kernel()
    cfg = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, kernel, dev_buffer, 0, n_values)
        stream.sync()

    # The launcher contains a synchronization point, so sync=True is
    # required to avoid a deadlock.
    state.exec(launcher, sync=True)
if __name__ == "__main__":
    # No axes: a single configuration of the synchronizing benchmark.
    bench.register(synchronizing_bench)
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,58 @@
import sys
import cuda.bench as bench
import torch
def as_torch_cuda_Stream(
    cs: "bench.CudaStream", dev: "int | None"
) -> torch.cuda.ExternalStream:
    """Wrap NVBench's native stream as a non-owning torch.cuda.ExternalStream.

    :param cs: stream object exposed by NVBench (must provide ``addressof()``).
    :param dev: CUDA device ordinal the stream belongs to, or None.

    Parameter annotations are quoted deliberately: the package declares
    ``requires-python >= 3.9`` and PEP 604 unions (``int | None``) are only
    evaluable at definition time on Python >= 3.10.
    """
    return torch.cuda.ExternalStream(
        stream_ptr=cs.addressof(), device=torch.cuda.device(dev)
    )
def torch_bench(state: bench.State) -> None:
    """One step of degree-3 polynomial fitting (forward pass, manual
    gradients, SGD-style updates) over 2**28 points, timed by NVBench.

    NOTE(review): tensors are created without an explicit ``device=`` and so
    land on PyTorch's current default device (CPU unless configured), even
    though a CUDA stream context is active — confirm this is intended.
    """
    # Lower the GPU throttle threshold to 0.25.
    state.set_throttle_threshold(0.25)
    dev_id = state.get_device()
    tc_s = as_torch_cuda_Stream(state.get_stream(), dev_id)
    dt = torch.float32
    scalar_shape: tuple = tuple()
    n = 2**28
    with torch.cuda.stream(tc_s):
        # Random scalar coefficients of y_pred = a3 + a2*x + a1*x^2 + a0*x^3.
        a3 = torch.randn(scalar_shape, dtype=dt)
        a2 = torch.randn(scalar_shape, dtype=dt)
        a1 = torch.randn(scalar_shape, dtype=dt)
        a0 = torch.randn(scalar_shape, dtype=dt)
        x = torch.linspace(-3, 3, n, dtype=dt)
        y = torch.sin(x)
    learning_rate = 1e-4

    def launcher(launch: bench.Launch) -> None:
        tc_s = as_torch_cuda_Stream(launch.get_stream(), dev_id)
        with torch.cuda.stream(tc_s):
            x2 = torch.square(x)
            # Evaluate the cubic polynomial.
            y_pred = (a3 + x2 * a1) + x * (a2 + a0 * x2)
            # Sum-of-squares loss; value discarded, kept for the workload.
            _ = torch.square(y_pred - y).sum()
            # Manual gradients w.r.t. each coefficient.
            grad_y_pred = 2 * (y_pred - y)
            grad_a3 = grad_y_pred.sum()
            grad_a2 = (grad_y_pred * x).sum()
            grad_a1 = (grad_y_pred * x2).sum()
            grad_a0 = (grad_y_pred * x2 * x).sum()
            # Gradient-descent updates; results discarded each launch.
            _ = a3 - grad_a3 * learning_rate
            _ = a2 - grad_a2 * learning_rate
            _ = a1 - grad_a1 * learning_rate
            _ = a0 - grad_a0 * learning_rate

    state.exec(launcher, sync=True)
if __name__ == "__main__":
    # Single configuration; no axes are attached.
    bench.register(torch_bench)
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,7 @@
numpy
numba
cupy
nvidia-cutlass
cuda-cccl
cuda-core
cuda-bindings

89
python/examples/skip.py Normal file
View File

@@ -0,0 +1,89 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    "Create view into native stream provided by NVBench"
    stream_addr = cs.addressof()
    return core.Stream.from_handle(stream_addr)
def make_sleep_kernel():
    """JIT-compile and return the CUDA kernel ``sleep_kernel(seconds)``.

    Each launched thread busy-waits until ``seconds`` have elapsed, as
    measured with cuda::std::chrono's high-resolution clock.
    """
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/chrono>
// Each launched thread just sleeps for `seconds`.
__global__ void sleep_kernel(double seconds) {
  namespace chrono = ::cuda::std::chrono;
  using hr_clock = chrono::high_resolution_clock;
  auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
  const auto ns = chrono::nanoseconds(duration);
  const auto start = hr_clock::now();
  const auto finish = start + ns;
  auto now = hr_clock::now();
  while (now < finish)
  {
    now = hr_clock::now();
  }
}
"""
    # libcudacxx include path is needed for the <cuda/std/...> headers.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    # Compile to cubin and look the kernel up by name.
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
def runtime_skip(state: bench.State):
    """Demonstrate skipping benchmark configurations at run time."""
    duration = state.get_float64("Duration")
    kramble = state.get_string("Kramble")

    # Decide whether this axis combination should be skipped.
    skip_reason = None
    if kramble == "Baz" and duration < 0.8e-3:
        # Skip Baz benchmarks with 0.8 ms duration
        skip_reason = "Short 'Baz' benchmarks are skipped"
    elif kramble == "Foo" and duration > 0.3e-3:
        # Skip Foo benchmark with > 0.3 ms duration
        skip_reason = "Long 'Foo' benchmarks are skipped"
    if skip_reason is not None:
        state.skip(skip_reason)
        return

    kernel = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, kernel, duration)

    state.exec(launcher)
if __name__ == "__main__":
    skip_bench = bench.register(runtime_skip)
    # Durations: 0.1 ms, 0.35 ms, 0.6 ms, 0.85 ms, 1.1 ms.
    skip_bench.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
    skip_bench.add_string_axis("Kramble", ["Foo", "Bar", "Baz"])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,76 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import numpy as np
from numba import cuda
def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
    """View NVBench's native stream as a Numba external stream."""
    handle = cs.addressof()
    return cuda.external_stream(handle)
def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
    """Build a Numba CUDA kernel that copies elements with a given stride.

    ``items_per_thread`` is closed over and baked into the JIT-compiled
    kernel, so each distinct value yields a distinct specialization.
    """

    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        # Global thread index and total number of launched threads.
        tid = cuda.grid(1)
        step = cuda.gridDim.x * cuda.blockDim.x
        for i in range(stride * tid, stride * elements, stride * step):
            for j in range(items_per_thread):
                # Strided read; wrap the read index into the input array.
                read_id = (items_per_thread * i + j) % elements
                # NOTE(review): write_id does not depend on i, so successive
                # outer-loop iterations overwrite the same slots — confirm
                # this is the intended access pattern for the benchmark.
                write_id = tid + j * elements
                out_arr[write_id] = in_arr[read_id]

    return kernel
def throughput_bench(state: bench.State) -> None:
    """Measure memory throughput of the strided-copy kernel."""
    stride = state.get_int64("Stride")
    ipt = state.get_int64("ItemsPerThread")

    nbytes = 128 * 1024 * 1024
    elements = nbytes // np.dtype(np.int32).itemsize

    alloc_stream = as_cuda_stream(state.get_stream())
    inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
    out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)

    state.add_element_count(elements, column_name="Elements")
    state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize")
    state.add_global_memory_writes(inp_arr.nbytes)

    tpb = 256
    # Ceiling division: enough blocks to cover every element.
    n_blocks = -(-elements // tpb)
    kernel = make_throughput_kernel(ipt)

    def launcher(launch: bench.Launch):
        run_stream = as_cuda_stream(launch.get_stream())
        kernel[n_blocks, tpb, run_stream, 0](stride, elements, inp_arr, out_arr)

    state.exec(launcher)
if __name__ == "__main__":
    tp_bench = bench.register(throughput_bench)
    # Sweep both the access stride and the per-thread workload.
    tp_bench.add_int64_axis("Stride", [1, 2, 4])
    tp_bench.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
    bench.run_all_benchmarks(sys.argv)

59
python/pyproject.toml Normal file
View File

@@ -0,0 +1,59 @@
[build-system]
requires = ["scikit-build-core>=0.10", "setuptools_scm"]
build-backend = "scikit_build_core.build"
[project]
name = "pynvbench"
description = "CUDA Kernel Benchmarking Package"
authors = [{ name = "NVIDIA Corporation" }]
classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Environment :: GPU :: NVIDIA CUDA",
"License :: OSI Approved :: Apache Software License",
]
requires-python = ">=3.9"
dependencies = [
# pathfinder
"cuda-pathfinder",
# The library expects to find the shared libraries
# libcupti, libnvperf_target, and libnvperf_host;
# pathfinder is used to locate them in the Python layout
"nvidia-cuda-cupti-cu12",
# The shared library
# libnvidia-ml must be installed system-wide
# (Debian package provider: libnvidia-compute)
]
dynamic = ["version"]
readme = { file = "README.md", content-type = "text/markdown" }
[project.optional-dependencies]
test = ["pytest", "cupy-cuda12x", "numba"]
[project.urls]
Homepage = "https://developer.nvidia.com/"
[tool.scikit-build]
minimum-version = "build-system.requires"
build-dir = "build/{wheel_tag}"
[tool.scikit-build.cmake]
version = ">=3.30.4"
args = []
build-type = "Release"
source-dir = "."
[tool.scikit-build.ninja]
version = ">=1.11"
make-fallback = true
[tool.scikit-build.metadata.version]
provider = "scikit_build_core.metadata.setuptools_scm"
[tool.setuptools_scm]
root = ".."
[tool.scikit-build.wheel.packages]
"cuda" = "cuda"
"cuda/bench" = "cuda/bench"

View File

@@ -0,0 +1,24 @@
# Building extension module locally
This may be useful for debugging purposes.
Assuming NVBench is installed into `NVBENCH_PREFIX`
and the pybind11 repo is cloned to `PYBIND11_DIR`,
the Python extension can be built locally, using the host compiler,
as follows:
```bash
g++ py_nvbench.cpp \
-shared -fPIC \
-I ${PYBIND11_DIR}/include \
-I ${NVBENCH_PREFIX}/include \
-I /usr/local/cuda/include \
$(python3-config --includes) \
$(python3-config --libs) \
-L ${NVBENCH_PREFIX}/lib/ \
-lnvbench \
-Wl,-rpath,${NVBENCH_PREFIX}/lib \
-L /usr/local/cuda/lib64/ \
-lcudart \
-Wl,-rpath,/usr/local/cuda/lib64 \
-o _nvbench$(python3-config --extension-suffix)
```

734
python/src/py_nvbench.cpp Normal file
View File

@@ -0,0 +1,734 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// clang-format off
// Include Pybind11 headers first thing
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
// clang-format on
#include <nvbench/nvbench.cuh>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace py = pybind11;
namespace
{
// Deleter for heap-allocated py::object instances that may outlive the
// Python interpreter (e.g. objects held by static/global C++ state).
struct PyObjectDeleter
{
  void operator()(py::object *p)
  {
    const bool initialized = Py_IsInitialized();
#if PY_VERSION_HEX < 0x30d0000
    // Private spelling used prior to CPython 3.13.
    const bool finalizing = _Py_IsFinalizing();
#else
    const bool finalizing = Py_IsFinalizing();
#endif
    const bool guard = initialized && !finalizing;
    // Only run ~object (and hence touch refcounts) while the interpreter is
    // active and not shutting down; otherwise intentionally leak and let the
    // OS clean up resources after interpreter tear-down.
    if (guard)
    {
      delete p;
    }
  }
};
// Adapts a Python callable so that nvbench::benchmark<> can invoke it as a
// kernel generator; the callable receives the nvbench::state by reference.
struct benchmark_wrapper_t
{
  benchmark_wrapper_t() = default;

  // Takes ownership of the callable; throws py::value_error if it is not
  // callable. The py::object is heap-allocated and guarded by
  // PyObjectDeleter so destruction is safe at interpreter shutdown.
  explicit benchmark_wrapper_t(py::object o)
      : m_fn{std::shared_ptr<py::object>(new py::object(std::move(o)), PyObjectDeleter{})}
  {
    if (!PyCallable_Check(m_fn->ptr()))
    {
      throw py::value_error("Argument must be a callable");
    }
  }

  // Only copy constructor is used, delete copy-assign, and moves
  benchmark_wrapper_t(const benchmark_wrapper_t &other) = default;
  benchmark_wrapper_t &operator=(const benchmark_wrapper_t &other) = delete;
  benchmark_wrapper_t(benchmark_wrapper_t &&) noexcept = delete;
  benchmark_wrapper_t &operator=(benchmark_wrapper_t &&) noexcept = delete;

  // Invoked by NVBench for every benchmark configuration.
  void operator()(nvbench::state &state, nvbench::type_list<>)
  {
    if (!m_fn)
    {
      throw std::runtime_error("No function to execute");
    }
    // box as Python object, using reference semantics
    auto arg = py::cast(std::ref(state), py::return_value_policy::reference);
    // Execute Python callable
    (*m_fn)(arg);
  }

private:
  // Important to use shared pointer here rather than py::object directly,
  // since copy constructor must be const (benchmark::do_clone is const member method)
  std::shared_ptr<py::object> m_fn;
};
// Use struct to ensure public inheritance
struct nvbench_run_error : std::runtime_error
{
// ask compiler to generate all constructor signatures
// that are defined for the base class
using std::runtime_error::runtime_error;
};
py::handle benchmark_exc{};
// Registry that mediates between Python and nvbench::benchmark_manager.
// Benchmarks may only be registered before run() executes; run() may only
// execute once.
class GlobalBenchmarkRegistry
{
  // Set to true once run() executes; further registration is rejected.
  bool m_finalized;

public:
  GlobalBenchmarkRegistry()
      : m_finalized(false) {};

  // Non-copyable, non-movable: a single instance backs the module.
  GlobalBenchmarkRegistry(const GlobalBenchmarkRegistry &) = delete;
  GlobalBenchmarkRegistry &operator=(const GlobalBenchmarkRegistry &) = delete;
  GlobalBenchmarkRegistry(GlobalBenchmarkRegistry &&) = delete;
  GlobalBenchmarkRegistry &operator=(GlobalBenchmarkRegistry &&) = delete;

  bool is_finalized() const { return m_finalized; }

  // Register a Python callable as an NVBench benchmark. The benchmark name
  // is taken from fn.__name__ when present, otherwise from repr(fn).
  // Returns a reference to the added benchmark for further configuration.
  // Throws if run() has already executed or fn is not callable.
  nvbench::benchmark_base &add_bench(py::object fn)
  {
    if (m_finalized)
    {
      throw std::runtime_error("Can not register more benchmarks after benchmark was run");
    }
    if (!PyCallable_Check(fn.ptr()))
    {
      throw py::value_error("Benchmark should be a callable object");
    }
    std::string name;
    if (py::hasattr(fn, "__name__"))
    {
      py::str py_name = fn.attr("__name__");
      name = py::cast<std::string>(py_name);
    }
    else
    {
      py::str py_name = py::repr(fn);
      name = py::cast<std::string>(py_name);
    }
    benchmark_wrapper_t executor(fn);
    return nvbench::benchmark_manager::get()
      .add(std::make_unique<nvbench::benchmark<benchmark_wrapper_t>>(executor))
      .set_name(std::move(name));
  }

  // Parse CLI-style arguments and run all registered benchmarks once.
  // C++ exceptions are translated into the module's Python exception type
  // (benchmark_exc); Python errors raised inside benchmark callables are
  // chained onto it via py::raise_from.
  void run(const std::vector<std::string> &argv)
  {
    if (nvbench::benchmark_manager::get().get_benchmarks().empty())
    {
      throw std::runtime_error("No benchmarks had been registered yet");
    }
    if (m_finalized)
    {
      throw std::runtime_error("Benchmarks were already executed");
    }
    m_finalized = true;
    try
    {
      // This line is mandatory for correctness to populate
      // benchmark with devices requested by user via CLI
      nvbench::benchmark_manager::get().initialize();
      {
        nvbench::option_parser parser{};
        parser.parse(argv);
        NVBENCH_MAIN_PRINT_PREAMBLE(parser);
        NVBENCH_MAIN_RUN_BENCHMARKS(parser);
        NVBENCH_MAIN_PRINT_EPILOGUE(parser);
        NVBENCH_MAIN_PRINT_RESULTS(parser);
      } /* Tear down parser before finalization */
    }
    catch (py::error_already_set &e)
    {
      py::raise_from(e, benchmark_exc.ptr(), "Python error raised ");
      throw py::error_already_set();
    }
    catch (const std::exception &e)
    {
      const std::string &exc_message = e.what();
      py::set_error(benchmark_exc, exc_message.c_str());
      throw py::error_already_set();
    }
    catch (...)
    {
      py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main");
      throw py::error_already_set();
    }
  }
};
// Convert the state's axis name/value map into a Python dict.
py::dict py_get_axis_values(const nvbench::state &state)
{
  py::dict result;
  auto axis_values = state.get_axis_values();
  for (const auto &axis_name : axis_values.get_names())
  {
    if (!axis_values.has_value(axis_name))
    {
      continue;
    }
    result[axis_name.c_str()] = py::cast(axis_values.get_value(axis_name));
  }
  return result;
}
// essentially a global variable, but allocated on the heap during module
// initialization; py::nodelete means it is never destroyed, sidestepping
// static-destruction-order issues at interpreter shutdown
std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete> global_registry{};

} // end of anonymous namespace
// ==========================================
// PLEASE KEEP IN SYNC WITH __init__.pyi FILE
// ==========================================
// If you modify these bindings, please be sure to update the
// corresponding type hints in ``../cuda/nvbench/__init__.pyi``
PYBIND11_MODULE(_nvbench, m)
{
// == STEP 1
// Set environment variable CUDA_MODULE_LOADING=EAGER
NVBENCH_DRIVER_API_CALL(cuInit(0));
// This line ensures that benchmark_manager has been created during module init
// It is reinitialized before running all benchmarks to set devices to use
nvbench::benchmark_manager::get().initialize();
// == STEP 2
// Define CudaStream class
// ATTN: nvbench::cuda_stream is move-only class
// Methods:
// Constructors, based on device, or on existing stream
// nvbench::cuda_stream::get_stream
auto py_cuda_stream_cls = py::class_<nvbench::cuda_stream>(m, "CudaStream");
py_cuda_stream_cls.def("__cuda_stream__",
[](const nvbench::cuda_stream &s) -> std::pair<std::size_t, std::size_t> {
return std::make_pair(std::size_t{0},
reinterpret_cast<std::size_t>(s.get_stream()));
});
py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t {
return reinterpret_cast<std::size_t>(s.get_stream());
});
// == STEP 3
// Define Launch class
// ATTN: nvbench::launch is move-only class
// Methods:
// nvbench::launch::get_stream -> nvbench::cuda_stream
auto py_launch_cls = py::class_<nvbench::launch>(m, "Launch");
py_launch_cls.def(
"get_stream",
[](nvbench::launch &launch) { return std::ref(launch.get_stream()); },
py::return_value_policy::reference);
// == STEP 4
// Define Benchmark class
// ATTN: nvbench::benchmark_base is move-only class
// Methods:
// nvbench::benchmark_base::get_name
// nvbench::benchmark_base::add_int64_axis
// nvbench::benchmark_base::add_int64_power_of_two_axis
// nvbench::benchmark_base::add_float64_axis
// nvbench::benchmark_base::add_string_axis
// nvbench::benchmark_base::set_name
// nvbench::benchmark_base::set_is_cpu_only
// nvbench::benchmark_base::set_skip_time
// nvbench::benchmark_base::set_timeout
// nvbench::benchmark_base::set_throttle_threshold
// nvbench::benchmark_base::set_throttle_recovery_delay
// nvbench::benchmark_base::set_stopping_criterion
// nvbench::benchmark_base::set_criterion_param_int64
// nvbench::benchmark_base::set_criterion_param_float64
// nvbench::benchmark_base::set_criterion_param_string
// nvbench::benchmark_base::set_min_samples
auto py_benchmark_cls = py::class_<nvbench::benchmark_base>(m, "Benchmark");
py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name);
py_benchmark_cls.def(
"add_int64_axis",
[](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::int64_t> data) {
self.add_int64_axis(std::move(name), std::move(data));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("values"));
py_benchmark_cls.def(
"add_int64_power_of_two_axis",
[](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::int64_t> data) {
self.add_int64_axis(std::move(name),
std::move(data),
nvbench::int64_axis_flags::power_of_two);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("values"));
py_benchmark_cls.def(
"add_float64_axis",
[](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::float64_t> data) {
self.add_float64_axis(std::move(name), std::move(data));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("values"));
py_benchmark_cls.def(
"add_string_axis",
[](nvbench::benchmark_base &self, std::string name, std::vector<std::string> data) {
self.add_string_axis(std::move(name), std::move(data));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("values"));
py_benchmark_cls.def(
"set_name",
[](nvbench::benchmark_base &self, std::string name) {
self.set_name(std::move(name));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"));
py_benchmark_cls.def(
"set_is_cpu_only",
[](nvbench::benchmark_base &self, bool is_cpu_only) {
self.set_is_cpu_only(is_cpu_only);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("is_cpu_only"));
// TODO: should this be exposed?
py_benchmark_cls.def(
"set_run_once",
[](nvbench::benchmark_base &self, bool run_once) {
self.set_run_once(run_once);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("run_once"));
py_benchmark_cls.def(
"set_skip_time",
[](nvbench::benchmark_base &self, nvbench::float64_t skip_duration_seconds) {
self.set_skip_time(skip_duration_seconds);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("duration_seconds"));
py_benchmark_cls.def(
"set_timeout",
[](nvbench::benchmark_base &self, nvbench::float64_t duration_seconds) {
self.set_timeout(duration_seconds);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("duration_seconds"));
py_benchmark_cls.def(
"set_throttle_threshold",
[](nvbench::benchmark_base &self, nvbench::float32_t threshold) {
self.set_throttle_threshold(threshold);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("threshold"));
py_benchmark_cls.def(
"set_throttle_recovery_delay",
[](nvbench::benchmark_base &self, nvbench::float32_t delay) {
self.set_throttle_recovery_delay(delay);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("delay_seconds"));
py_benchmark_cls.def(
"set_stopping_criterion",
[](nvbench::benchmark_base &self, std::string criterion) {
self.set_stopping_criterion(std::move(criterion));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("criterion"));
py_benchmark_cls.def(
"set_criterion_param_int64",
[](nvbench::benchmark_base &self, std::string name, nvbench::int64_t value) {
self.set_criterion_param_int64(std::move(name), value);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("value"));
py_benchmark_cls.def(
"set_criterion_param_float64",
[](nvbench::benchmark_base &self, std::string name, nvbench::float64_t value) {
self.set_criterion_param_float64(std::move(name), value);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("value"));
py_benchmark_cls.def(
"set_criterion_param_string",
[](nvbench::benchmark_base &self, std::string name, std::string value) {
self.set_criterion_param_string(std::move(name), std::move(value));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("value"));
py_benchmark_cls.def(
"set_min_samples",
[](nvbench::benchmark_base &self, nvbench::int64_t count) {
self.set_min_samples(count);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("min_samples_count"));
// == STEP 5
// Define PyState class
// ATTN: nvbench::state is move-only class
// Methods:
// nvbench::state::get_cuda_stream
// nvbench::state::get_cuda_stream_optional
// nvbench::state::set_cuda_stream
// nvbench::state::get_device
// nvbench::state::get_is_cpu_only
// nvbench::state::get_type_config_index
// nvbench::state::get_int64
// nvbench::state::get_int64_or_default
// nvbench::state::get_float64
// nvbench::state::get_float64_or_default
// nvbench::state::get_string
// nvbench::state::get_string_or_default
// nvbench::state::add_element_count
// nvbench::state::set_element_count
// nvbench::state::get_element_count
// nvbench::state::add_global_memory_reads
// nvbench::state::add_global_memory_writes
// nvbench::state::add_buffer_size
// nvbench::state::set_global_memory_rw_bytes
// nvbench::state::get_global_memory_rw_bytes
// nvbench::state::skip
// nvbench::state::is_skipped
// nvbench::state::get_skip_reason
// Inventory of nvbench::state members considered for binding:
// nvbench::state::get_min_samples
// nvbench::state::set_min_samples
// nvbench::state::get_criterion_params
// nvbench::state::get_stopping_criterion
// nvbench::state::get_run_once
// nvbench::state::set_run_once
// nvbench::state::get_disable_blocking_kernel
// nvbench::state::set_disable_blocking_kernel
// nvbench::state::set_skip_time
// nvbench::state::get_skip_time
// nvbench::state::set_timeout
// nvbench::state::get_timeout
// nvbench::state::set_throttle_threshold
// nvbench::state::get_throttle_threshold
// nvbench::state::set_throttle_recovery_delay
// nvbench::state::get_throttle_recovery_delay
// nvbench::state::get_blocking_kernel_timeout
// nvbench::state::set_blocking_kernel_timeout
// nvbench::state::get_axis_values
// nvbench::state::get_axis_values_as_string
// nvbench::state::get_benchmark
// nvbench::state::collect_l1_hit_rates
// nvbench::state::collect_l2_hit_rates
// nvbench::state::collect_stores_efficiency
// nvbench::state::collect_loads_efficiency
// nvbench::state::collect_dram_throughput
// nvbench::state::collect_cupti_metrics
// nvbench::state::is_l1_hit_rate_collected
// nvbench::state::is_l2_hit_rate_collected
// nvbench::state::is_stores_efficiency_collected
// nvbench::state::is_loads_efficiency_collected
// nvbench::state::is_dram_throughput_collected
// nvbench::state::is_cupti_required
// nvbench::state::add_summary
// nvbench::state::get_summary
// nvbench::state::get_summaries
// nvbench::state::get_short_description
// nvbench::state::exec
// NOTE:
// State wraps std::reference_wrapper<nvbench::state>
using state_ref_t = std::reference_wrapper<nvbench::state>;
// Python-facing "State" class. No __init__ is bound, so Python can only
// receive State instances handed out by the C++ side.
auto pystate_cls = py::class_<nvbench::state>(m, "State");
// True when this state is bound to a CUDA device (get_device() holds a value).
pystate_cls.def("has_device", [](const nvbench::state &state) -> bool {
  return static_cast<bool>(state.get_device());
});
// True when the owning benchmark has a printer attached.
pystate_cls.def("has_printers", [](const nvbench::state &state) -> bool {
  return state.get_benchmark().get_printer().has_value();
});
// Returns the CUDA device id as an int, or None when no device is bound.
pystate_cls.def("get_device", [](const nvbench::state &state) {
  auto dev = state.get_device();
  if (dev.has_value())
  {
    return py::cast(dev.value().get_id());
  }
  return py::object(py::none());
});
// Borrowed reference: the stream is owned by the state, so
// return_value_policy::reference avoids any Python-side lifetime management.
pystate_cls.def(
  "get_stream",
  [](nvbench::state &state) { return std::ref(state.get_cuda_stream()); },
  py::return_value_policy::reference);
// Axis-value getters: fetch the current value of a named axis by type.
pystate_cls.def("get_int64", &nvbench::state::get_int64, py::arg("name"));
// *_or_default variants return `default_value` when the axis is absent.
// py::pos_only{} makes both arguments positional-only on the Python side.
pystate_cls.def("get_int64_or_default",
                &nvbench::state::get_int64_or_default,
                py::arg("name"),
                py::pos_only{},
                py::arg("default_value"));
pystate_cls.def("get_float64", &nvbench::state::get_float64, py::arg("name"));
pystate_cls.def("get_float64_or_default",
                &nvbench::state::get_float64_or_default,
                py::arg("name"),
                py::pos_only{},
                py::arg("default_value"));
pystate_cls.def("get_string", &nvbench::state::get_string, py::arg("name"));
pystate_cls.def("get_string_or_default",
                &nvbench::state::get_string_or_default,
                py::arg("name"),
                py::pos_only{},
                py::arg("default_value"));
// Element-count accessors (optional column name selects the report column).
pystate_cls.def("add_element_count",
                &nvbench::state::add_element_count,
                py::arg("count"),
                py::arg("column_name") = py::str(""));
pystate_cls.def("set_element_count", &nvbench::state::set_element_count, py::arg("count"));
pystate_cls.def("get_element_count", &nvbench::state::get_element_count);
// Skipping: mark the current configuration as skipped with a reason string.
pystate_cls.def("skip", &nvbench::state::skip, py::arg("reason"));
pystate_cls.def("is_skipped", &nvbench::state::is_skipped);
pystate_cls.def("get_skip_reason", &nvbench::state::get_skip_reason);
// Record global-memory traffic. The lambdas pin the byte-count overload so the
// Python signature is (nbytes, /, column_name="").
pystate_cls.def(
  "add_global_memory_reads",
  [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
    state.add_global_memory_reads(nbytes, column_name);
  },
  "Add size, in bytes, of global memory reads",
  py::arg("nbytes"),
  py::pos_only{},
  py::arg("column_name") = py::str(""));
pystate_cls.def(
  "add_global_memory_writes",
  [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
    state.add_global_memory_writes(nbytes, column_name);
  },
  "Add size, in bytes, of global memory writes",
  py::arg("nbytes"),
  py::pos_only{},
  py::arg("column_name") = py::str(""));
// Borrowed reference to the owning benchmark (owned by the C++ side).
pystate_cls.def(
  "get_benchmark",
  [](const nvbench::state &state) { return std::ref(state.get_benchmark()); },
  py::return_value_policy::reference);
// Measurement-control knobs, forwarded 1:1 to nvbench::state.
pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold);
pystate_cls.def("set_throttle_threshold",
                &nvbench::state::set_throttle_threshold,
                py::arg("throttle_fraction"));
pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples);
pystate_cls.def("set_min_samples",
                &nvbench::state::set_min_samples,
                py::arg("min_samples_count"));
pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel);
pystate_cls.def("set_disable_blocking_kernel",
                &nvbench::state::set_disable_blocking_kernel,
                py::arg("disable_blocking_kernel"));
pystate_cls.def("get_run_once", &nvbench::state::get_run_once);
pystate_cls.def("set_run_once", &nvbench::state::set_run_once, py::arg("run_once"));
pystate_cls.def("get_timeout", &nvbench::state::get_timeout);
pystate_cls.def("set_timeout", &nvbench::state::set_timeout, py::arg("duration"));
pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout);
pystate_cls.def("set_blocking_kernel_timeout",
                &nvbench::state::set_blocking_kernel_timeout,
                py::arg("duration"));
// CUPTI metric collection toggles.
pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics);
pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required);
// State.exec(launcher_fn, /, batched=True, sync=False)
// Bridges nvbench's compile-time exec tags to runtime (batched, sync) flags:
// the four constexpr branches below select the matching tag combination.
pystate_cls.def(
  "exec",
  [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) {
    // Validate eagerly so the error surfaces before any measurement starts.
    if (!PyCallable_Check(py_launcher_fn.ptr()))
    {
      throw py::type_error("Argument of exec method must be a callable object");
    }
    // wrapper to invoke Python callable
    auto cpp_launcher_fn = [py_launcher_fn](nvbench::launch &launch_descr) -> void {
      // cast C++ object to python object; reference policy means Python only
      // borrows launch_descr for the duration of the callback
      auto launch_pyarg = py::cast(std::ref(launch_descr), py::return_value_policy::reference);
      // call Python callable
      py_launcher_fn(launch_pyarg);
    };
    if (sync)
    {
      if (batched)
      {
        constexpr auto tag = nvbench::exec_tag::sync;
        state.exec(tag, cpp_launcher_fn);
      }
      else
      {
        // sync + non-batched: additionally suppress batch measurements
        constexpr auto tag = nvbench::exec_tag::sync | nvbench::exec_tag::no_batch;
        state.exec(tag, cpp_launcher_fn);
      }
    }
    else
    {
      if (batched)
      {
        constexpr auto tag = nvbench::exec_tag::none;
        state.exec(tag, cpp_launcher_fn);
      }
      else
      {
        constexpr auto tag = nvbench::exec_tag::no_batch;
        state.exec(tag, cpp_launcher_fn);
      }
    }
  },
  "Executor for given launcher callable fn(state : Launch)",
  py::arg("launcher_fn"),
  py::pos_only{},
  py::arg("batched") = true,
  py::arg("sync") = false);
pystate_cls.def("get_short_description",
                [](const nvbench::state &state) { return state.get_short_description(); });
// add_summary overloads: pybind11 tries them in registration order
// (str, then int64, then float64). Each stores the value under the
// "nv/python/<name>" summary tag with a generated description.
pystate_cls.def(
  "add_summary",
  [](nvbench::state &state, std::string column_name, std::string value) {
    auto &summ = state.add_summary("nv/python/" + column_name);
    // "description" is built before column_name is moved below.
    summ.set_string("description", "User tag: " + column_name);
    summ.set_string("name", std::move(column_name));
    summ.set_string("value", std::move(value));
  },
  py::arg("name"),
  py::arg("value"));
pystate_cls.def(
  "add_summary",
  [](nvbench::state &state, std::string column_name, std::int64_t value) {
    auto &summ = state.add_summary("nv/python/" + column_name);
    summ.set_string("description", "User tag: " + column_name);
    summ.set_string("name", std::move(column_name));
    summ.set_int64("value", value);
  },
  py::arg("name"),
  py::arg("value"));
pystate_cls.def(
  "add_summary",
  [](nvbench::state &state, std::string column_name, double value) {
    auto &summ = state.add_summary("nv/python/" + column_name);
    summ.set_string("description", "User tag: " + column_name);
    summ.set_string("name", std::move(column_name));
    summ.set_float64("value", value);
  },
  py::arg("name"),
  py::arg("value"));
pystate_cls.def("get_axis_values_as_string",
                [](const nvbench::state &state) { return state.get_axis_values_as_string(); });
// py_get_axis_values is a helper defined earlier in this file.
pystate_cls.def("get_axis_values", &py_get_axis_values);
pystate_cls.def("get_stopping_criterion", &nvbench::state::get_stopping_criterion);
// Use handle to take a memory leak here, since this object's destructor may be called after
// interpreter has shut down
benchmark_exc =
py::exception<nvbench_run_error>(m, "NVBenchRuntimeError", PyExc_RuntimeError).release();
// == STEP 6
// ATTN: nvbench::benchmark_manager is a singleton
global_registry =
std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete>(new GlobalBenchmarkRegistry(),
py::nodelete{});
m.def(
"register",
[&](py::object fn) { return std::ref(global_registry->add_bench(fn)); },
"Register benchmark function of type Callable[[nvbench.State], None]",
py::return_value_policy::reference,
py::arg("benchmark_fn"));
m.def(
"run_all_benchmarks",
[&](py::object argv) -> void {
if (!py::isinstance<py::list>(argv))
{
throw py::type_error("run_all_benchmarks expects a list of command-line arguments");
}
std::vector<std::string> args = py::cast<std::vector<std::string>>(argv);
global_registry->run(args);
},
"Run all registered benchmarks",
py::arg("argv") = py::list());
m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); });
m.def("test_py_exception", []() {
py::set_error(benchmark_exc, "Test");
throw py::error_already_set();
});
}

119
python/test/run_1.py Normal file
View File

@@ -0,0 +1,119 @@
import sys
import cuda.bench as bench
import numpy as np
from numba import cuda
@cuda.jit()
def kernel(a, b, c):
    """Elementwise device addition: c[i] = a[i] + b[i] for every valid i."""
    idx = cuda.grid(1)
    # Guard against the trailing threads of the last block.
    if idx < len(a):
        c[idx] = a[idx] + b[idx]
def get_numba_stream(launch: bench.Launch):
    """Wrap the nvbench launch's CUDA stream as a Numba external stream."""
    handle = launch.get_stream().addressof()
    return cuda.external_stream(handle)
def skipit(state: bench.State) -> None:
    # Always-skipped benchmark; exercises State.skip / skip-reason reporting.
    state.skip("Skipping this benchmark for no reason")
def add_two(state: bench.State):
    """Benchmark a + a on an `elements`-sized float64 array (one int64 axis)."""
    n = state.get_int64("elements")
    a = cuda.to_device(np.random.random(n))
    c = cuda.device_array_like(a)
    # Sanity-check axis introspection from inside a benchmark body.
    assert "elements" in state.get_axis_values()
    assert "elements=" in state.get_axis_values_as_string()
    state.add_global_memory_reads(a.nbytes)
    state.add_global_memory_writes(c.nbytes)
    threads = 256
    blocks = (len(a) + threads - 1) // threads
    # First call locks, can't use async benchmarks until sync tag is supported
    kernel[blocks, threads](a, a, c)
    cuda.synchronize()

    def kernel_launcher(launch):
        stream = get_numba_stream(launch)
        kernel[blocks, threads, stream](a, a, c)

    state.exec(kernel_launcher, batched=True, sync=True)
def add_float(state: bench.State):
    """Benchmark float32 a + b over a 3-D sweep (elements x v x name)."""
    n = state.get_int64("elements")
    v = state.get_float64("v")
    name = state.get_string("name")
    a = cuda.to_device(np.random.random(n).astype(np.float32))
    b = cuda.to_device(np.random.random(n).astype(np.float32))
    c = cuda.device_array_like(a)
    state.add_global_memory_reads(a.nbytes + b.nbytes)
    state.add_global_memory_writes(c.nbytes)
    threads = 64
    blocks = (len(a) + threads - 1) // threads
    # Axis values reported by the state must agree with the per-axis getters.
    values = state.get_axis_values()
    for key in ("elements", "v", "name"):
        assert key in values
    assert values["elements"] == n
    assert values["v"] == v
    assert values["name"] == name

    def kernel_launcher(launch):
        _ = v
        _ = name
        stream = get_numba_stream(launch)
        kernel[blocks, threads, stream](a, b, c)

    state.exec(kernel_launcher, batched=True, sync=True)
def add_three(state: bench.State):
    """Benchmark float32 a + b over a power-of-two `elements` axis."""
    n = state.get_int64("elements")
    a = cuda.to_device(np.random.random(n).astype(np.float32))
    b = cuda.to_device(np.random.random(n).astype(np.float32))
    c = cuda.device_array_like(a)
    state.add_global_memory_reads(a.nbytes + b.nbytes)
    state.add_global_memory_writes(c.nbytes)
    threads = 256
    blocks = (len(a) + threads - 1) // threads

    def kernel_launcher(launch):
        stream = get_numba_stream(launch)
        kernel[blocks, threads, stream](a, b, c)

    state.exec(kernel_launcher, batched=True, sync=True)
    cuda.synchronize()
def register_benchmarks():
    """Register every benchmark in this module and configure its axes."""
    two = bench.register(add_two)
    two.add_int64_axis("elements", [2**pow2 - 1 for pow2 in range(20, 23)])

    flt = bench.register(add_float)
    flt.add_float64_axis("v", [0.1, 0.3])
    flt.add_string_axis("name", ["Anne", "Lynda"])
    flt.add_int64_power_of_two_axis("elements", range(20, 23))

    three = bench.register(add_three)
    three.add_int64_power_of_two_axis("elements", range(20, 22))

    bench.register(skipit)
if __name__ == "__main__":
    # Register everything, then hand control (and the CLI args) to nvbench.
    register_benchmarks()
    bench.run_all_benchmarks(sys.argv)

236
python/test/stubs.py Normal file
View File

@@ -0,0 +1,236 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ctypes
import sys
from typing import Dict, Optional, Tuple
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    """Adapt an nvbench CUDA stream into a cuda.core Stream (same handle)."""
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
def make_sleep_kernel():
    """JITs sleep_kernel(seconds): a kernel whose threads busy-wait for
    `seconds` using libcudacxx's device-side chrono clock. Returns the
    compiled kernel object."""
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/chrono>

// Each launched thread just sleeps for `seconds`.
__global__ void sleep_kernel(double seconds) {
  namespace chrono = ::cuda::std::chrono;
  using hr_clock = chrono::high_resolution_clock;

  auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
  const auto ns = chrono::nanoseconds(duration);
  const auto start = hr_clock::now();
  const auto finish = start + ns;

  auto now = hr_clock::now();
  while (now < finish)
  {
    now = hr_clock::now();
  }
}
"""
    # Compile against the bundled libcudacxx headers from cuda-cccl.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
def no_axes(state: bench.State):
    """Benchmark with no axes: launch a ~1 ms device-side sleep kernel."""
    state.set_min_samples(1000)
    duration_s = 1e-3
    krn = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)
    print(f"Stopping criterion used: {state.get_stopping_criterion()}")

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, duration_s)

    state.exec(launcher)
def tags(state: bench.State):
    """Exercise exec's sync/batched tags, driven by two int64 axes."""
    state.set_min_samples(1000)
    duration_s = 1e-3
    krn = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)
    do_sync = bool(state.get_int64("Sync"))
    do_batch = bool(state.get_int64("Batched"))

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, duration_s)

    state.exec(launcher, sync=do_sync, batched=do_batch)
def single_float64_axis(state: bench.State):
    """Sleep for `Duration` seconds per launch, with a fallback default."""
    fallback = 3.14e-4
    duration_s = state.get_float64_or_default("Duration", fallback)
    krn = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, duration_s)

    state.exec(launcher)
def default_value(state: bench.State):
    # Registered without a "Duration" axis, so the default-value path of
    # get_float64_or_default is exercised.
    single_float64_axis(state)
def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None):
    """JIT-compile copy_kernel<T, U>, copying/converting n values in -> out.

    in_type / out_type are CUDA C++ type names; both default to
    ::cuda::std::int32_t when omitted. Returns the compiled kernel for the
    requested instantiation."""
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/cstddef>

/*!
 * Naive copy of `n` values from `in` -> `out`.
 */
template <typename T, typename U>
__global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
{
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;

  for (auto i = init; i < n; i += step)
  {
    out[i] = static_cast<U>(in[i]);
  }
}
"""
    # Compile against the bundled libcudacxx headers from cuda-cccl.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    if in_type is None:
        in_type = "::cuda::std::int32_t"
    if out_type is None:
        out_type = "::cuda::std::int32_t"
    # The explicit instantiation must be named so it can be looked up by name.
    instance_name = f"copy_kernel<{in_type}, {out_type}>"
    mod = prog.compile("cubin", name_expressions=(instance_name,))
    return mod.get_kernel(instance_name)
def copy_sweep_grid_shape(state: bench.State):
    """Copy 256 MiB of int32 values, sweeping block size and grid size."""
    block_size = state.get_int64("BlockSize")
    num_blocks = state.get_int64("NumBlocks")
    # Number of int32 elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))
    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)
    dev_id = state.get_device()
    alloc_stream = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_stream)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_stream)
    krn = make_copy_kernel()
    cfg = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
def copy_type_sweep(state: bench.State):
    """Copy 256 MiB using several element types, selected by the TypeID axis."""
    type_id = state.get_int64("TypeID")
    # TypeID -> (host ctypes type, matching CUDA C++ type name)
    types_map: Dict[int, Tuple[type, str]] = {
        0: (ctypes.c_uint8, "cuda::std::uint8_t"),
        1: (ctypes.c_uint16, "cuda::std::uint16_t"),
        2: (ctypes.c_uint32, "cuda::std::uint32_t"),
        3: (ctypes.c_uint64, "cuda::std::uint64_t"),
        4: (ctypes.c_float, "float"),
        5: (ctypes.c_double, "double"),
    }
    ctype, cuda_type_name = types_map[type_id]
    state.add_summary("Type", cuda_type_name)
    # Number of elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(ctype)
    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)
    dev_id = state.get_device()
    alloc_stream = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_stream)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_stream)
    krn = make_copy_kernel(cuda_type_name, cuda_type_name)
    cfg = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
if __name__ == "__main__":
    # Benchmark without axes
    simple_b = bench.register(no_axes)
    simple_b.set_stopping_criterion("entropy")
    simple_b.set_criterion_param_int64("unused_int", 100)
    # 2x2 Sync/Batched tag matrix driven by int64 axes.
    tags_b = bench.register(tags)
    tags_b.add_int64_axis("Sync", [0, 1])
    tags_b.add_int64_axis("Batched", [0, 1])
    # benchmark with no axes, that uses default value
    default_b = bench.register(default_value)
    default_b.set_min_samples(7)
    # specify axis
    axes_b = bench.register(single_float64_axis).add_float64_axis(
        "Duration", [7e-5, 1e-4, 5e-4]
    )
    axes_b.set_timeout(20)
    axes_b.set_skip_time(1e-5)
    axes_b.set_throttle_threshold(0.2)
    axes_b.set_throttle_recovery_delay(0.1)
    # Grid-shape and element-type sweeps for the copy kernel.
    copy1_bench = bench.register(copy_sweep_grid_shape)
    copy1_bench.add_int64_power_of_two_axis("BlockSize", range(6, 10, 2))
    copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
    copy2_bench = bench.register(copy_type_sweep)
    copy2_bench.add_int64_axis("TypeID", range(0, 6))
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,39 @@
import json
import cuda.bench as bench
import pytest
def test_cpp_exception():
    # A C++ nvbench_run_error must surface in Python as RuntimeError
    # (NVBenchRuntimeError derives from it).
    with pytest.raises(RuntimeError, match="Test"):
        bench._nvbench.test_cpp_exception()
def test_py_exception():
    # The error set via py::set_error must arrive as the bound
    # NVBenchRuntimeError type.
    with pytest.raises(bench.NVBenchRuntimeError, match="Test"):
        bench._nvbench.test_py_exception()
@pytest.mark.parametrize(
    "cls", [bench.CudaStream, bench.State, bench.Launch, bench.Benchmark]
)
def test_api_ctor(cls):
    # None of the bound classes expose a Python constructor; instances are
    # only handed out by the C++ side.
    with pytest.raises(TypeError, match="No constructor defined!"):
        cls()
def t_bench(state: bench.State):
    """CPU-only benchmark body: repeatedly JSON-serialize a small dict."""
    payload = {"a": 1, "b": 0.5, "c": "test", "d": {"a": 1}}

    def launcher(launch: bench.Launch):
        for _ in range(10000):
            _ = json.dumps(payload)

    state.exec(launcher)
def test_cpu_only():
    # Register a CPU-only benchmark and run it quietly with profiling enabled.
    b = bench.register(t_bench)
    b.set_is_cpu_only(True)
    bench.run_all_benchmarks(["-q", "--profile"])