Merge pull request #237 from oleksandr-pavlyk/add-pynvbench

Python package pynvbench introduced that exposes `cuda.bench` namespace. Repository provides a set of examples.
This commit is contained in:
Oleksandr Pavlyk
2025-08-06 12:22:55 -05:00
committed by GitHub
24 changed files with 2780 additions and 0 deletions

4
python/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
build
nvbench_build
nvbench_install
__pycache__

41
python/CMakeLists.txt Normal file
View File

@@ -0,0 +1,41 @@
# Build script for the pynvbench extension module, driven by scikit-build-core
# (which defines SKBUILD_PROJECT_NAME at configure time).
cmake_minimum_required(VERSION 3.30...4.0)
# CUDA is transitive dependency of nvbench
project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX CUDA)

find_package(Python REQUIRED COMPONENTS Development.Module)
find_package(CUDAToolkit REQUIRED)

# Get CMake package manager
# CPM.cmake is downloaded at configure time, pinned by version tag and
# verified against a SHA256 checksum.
set(_cpm_download_location ${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake)
file(
  DOWNLOAD
  https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.42.0/CPM.cmake
  ${_cpm_download_location}
  EXPECTED_HASH SHA256=2020b4fc42dba44817983e06342e682ecfc3d2f484a581f11cc5731fbe4dce8a
)
include(${_cpm_download_location})

# Build nvbench from the parent directory of this python/ folder, unless an
# installed nvbench config package is found first (FIND_PACKAGE_ARGS).
CPMAddPackage(
  NAME nvbench
  SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/..
  OPTIONS "NVBench_INSTALL_RULES ON"
  FIND_PACKAGE_ARGS CONFIG REQUIRED
)
CPMAddPackage("gh:pybind/pybind11@3.0.0")

pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp)
target_link_libraries(_nvbench PUBLIC nvbench::nvbench)
target_link_libraries(_nvbench PRIVATE CUDA::cudart_static)
# $ORIGIN rpath lets the extension locate the nvbench shared library that is
# installed next to it (see the IMPORTED_RUNTIME_ARTIFACTS install below).
set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN")
set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON)

install(TARGETS _nvbench DESTINATION cuda/bench)
# Determine target that nvbench::nvbench is an alias of,
# necessary because ALIAS targets cannot be installed
get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET)
install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench)

61
python/README.md Normal file
View File

@@ -0,0 +1,61 @@
# CUDA Kernel Benchmarking Package
This package provides Python API to CUDA Kernel Benchmarking Library `NVBench`.
## Building
### Ensure recent version of CMake
Since `nvbench` requires a rather new version of CMake (>=3.30.4), either build CMake from sources, or create a conda environment with a recent version of CMake, using
```
conda create -n build_env --yes cmake ninja
conda activate build_env
```
### Ensure CUDA compiler
Since building `NVBench` library requires CUDA compiler, ensure that appropriate environment variables
are set. For example, assuming CUDA toolkit is installed system-wide, and assuming Ampere GPU architecture:
```bash
export CUDACXX=/usr/local/cuda/bin/nvcc
export CUDAARCHS=86
```
### Build Python project
Now switch to python folder, configure and install NVBench library, and install the package in editable mode:
```bash
cd nvbench/python
pip install -e .
```
### Verify that package works
```bash
python test/run_1.py
```
### Run examples
```bash
# Example benchmarking numba.cuda kernel
python examples/throughput.py
```
```bash
# Example benchmarking kernels authored using cuda.core
python examples/axes.py
```
```bash
# Example benchmarking algorithms from cuda.cccl.parallel
python examples/cccl_parallel_segmented_reduce.py
```
```bash
# Example benchmarking CuPy function
python examples/cupy_extract.py
```

View File

@@ -0,0 +1,59 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib.metadata
import warnings

from cuda.pathfinder import (  # type: ignore[import-not-found]
    load_nvidia_dynamic_lib,
)

try:
    __version__ = importlib.metadata.version("pynvbench")
except Exception as e:
    __version__ = "0.0.0dev"
    warnings.warn(
        "Could not retrieve version of pynvbench package dynamically from its metadata. "
        f"Exception {e} was raised. "
        f"Version is set to fall-back value '{__version__}' instead.",
        # Point the warning at the importer of cuda.bench, not at this line.
        stacklevel=2,
    )

# Preload NVIDIA profiling libraries so that the compiled extension module
# imported below can resolve them at load time.
for libname in ("cupti", "nvperf_target", "nvperf_host"):
    load_nvidia_dynamic_lib(libname)

# Re-export the public API of the compiled extension.  The "X as X" form marks
# the names as intentional re-exports for type checkers; the import must come
# after the library preloading above, hence the E402 suppression.
from cuda.bench._nvbench import (  # noqa: E402
    Benchmark as Benchmark,
    CudaStream as CudaStream,
    Launch as Launch,
    NVBenchRuntimeError as NVBenchRuntimeError,
    State as State,
    register as register,
    run_all_benchmarks as run_all_benchmarks,
)

del load_nvidia_dynamic_lib

View File

@@ -0,0 +1,312 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================
# PLEASE KEEP IN SYNC WITH py_nvbench.cpp FILE
# ============================================
# Please be sure to keep these type hints and docstring in sync
# with the pybind11 bindings in ``../../src/py_nvbench.cpp``
# Use mypy's stubgen to auto-generate stubs using
# ``stubgen -m cuda.nvbench._nvbench`` and compare
# stubs in generated out/cuda/nvbench/_nvbench.pyi
# with definitions given here.
from collections.abc import Callable, Sequence
from typing import Optional, Self, SupportsFloat, SupportsInt, Union
class CudaStream:
    """Represents CUDA stream

    Note
    ----
    The class is not user-constructible.
    """

    def __cuda_stream__(self) -> tuple[int, int]:
        """
        Special method implementing the CUDA stream protocol
        from `cuda.core`. Returns a pair of integers:
        (protocol_version, integral_value_of_cudaStream_t pointer)

        Example
        -------
        import cuda.core.experimental as core
        import cuda.bench as bench

        def my_bench(state: bench.State):
            dev = core.Device(state.get_device())
            dev.set_current()
            # converts CudaStream to core.Stream
            # using __cuda_stream__ protocol
            dev.create_stream(state.get_stream())
        """
        ...

    def addressof(self) -> int:
        "Integral value of address of driver's CUDA stream struct"
        ...
class Benchmark:
    """Represents NVBench benchmark.

    Note
    ----
    The class is not user-constructible.
    Use `~register` function to create Benchmark and register
    it with NVBench.
    """

    def get_name(self) -> str:
        "Get benchmark name"
        ...

    def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self:
        "Add integral type parameter axis with given name and values to sweep over"
        ...

    def add_int64_power_of_two_axis(
        self, name: str, values: Sequence[SupportsInt]
    ) -> Self:
        # NOTE(review): presumably the given values are exponents and the swept
        # axis values are 2**v, per nvbench's power-of-two axes -- confirm
        # against py_nvbench.cpp.
        "Add integral type parameter axis with given name and values to sweep over"
        ...

    def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self:
        "Add floating-point type parameter axis with given name and values to sweep over"
        ...

    def add_string_axis(self, name: str, values: Sequence[str]) -> Self:
        "Add string type parameter axis with given name and values to sweep over"
        ...

    def set_name(self, name: str) -> Self:
        "Set benchmark name"
        ...

    def set_is_cpu_only(self, is_cpu_only: bool) -> Self:
        "Set whether this benchmark only executes on CPU"
        ...

    def set_run_once(self, v: bool) -> Self:
        "Set whether all benchmark configurations are executed only once"
        ...

    def set_skip_time(self, duration_seconds: SupportsFloat) -> Self:
        "Set run durations, in seconds, that should be skipped"
        ...

    def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self:
        "Set throttle recovery delay, in seconds"
        ...

    def set_throttle_threshold(self, threshold: SupportsFloat) -> Self:
        "Set throttle threshold, as a fraction of maximal GPU frequency"
        ...

    def set_timeout(self, duration_seconds: SupportsFloat) -> Self:
        "Set benchmark run duration timeout value, in seconds"
        ...

    def set_stopping_criterion(self, criterion: str) -> Self:
        "Set stopping criterion to be used"
        ...

    def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self:
        "Set stopping criterion floating point parameter value"
        ...

    def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self:
        "Set stopping criterion integer parameter value"
        ...

    def set_criterion_param_string(self, name: str, value: str) -> Self:
        "Set stopping criterion string parameter value"
        ...

    def set_min_samples(self, count: SupportsInt) -> Self:
        "Set minimal samples count before stopping criterion applies"
        ...
class Launch:
    """Configuration object for function launch.

    Note
    ----
    The class is not user-constructible.
    It is passed to the callable given to `State.exec`.
    """

    def get_stream(self) -> CudaStream:
        "Get CUDA stream of this configuration"
        ...
class State:
    """Represent benchmark configuration state.

    Note
    ----
    The class is not user-constructible.
    """

    def has_device(self) -> bool:
        "True if configuration has a device"
        ...

    def has_printers(self) -> bool:
        "True if configuration has a printer"
        ...

    def get_device(self) -> Union[int, None]:
        "Get device_id of the device from this configuration, or None if it has no device"
        ...

    def get_stream(self) -> CudaStream:
        "CudaStream object from this configuration"
        ...

    def get_int64(self, name: str) -> int:
        "Get value for given Int64 axis from this configuration"
        ...

    def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int:
        "Get value for given Int64 axis from this configuration, or default_value if the axis is not present"
        ...

    def get_float64(self, name: str) -> float:
        "Get value for given Float64 axis from this configuration"
        ...

    def get_float64_or_default(self, name: str, default_value: SupportsFloat) -> float:
        "Get value for given Float64 axis from this configuration, or default_value if the axis is not present"
        ...

    def get_string(self, name: str) -> str:
        "Get value for given String axis from this configuration"
        ...

    def get_string_or_default(self, name: str, default_value: str) -> str:
        "Get value for given String axis from this configuration, or default_value if the axis is not present"
        ...

    def add_element_count(
        self, count: SupportsInt, column_name: Optional[str] = None
    ) -> None:
        "Add element count"
        ...

    def set_element_count(self, count: SupportsInt) -> None:
        "Set element count"
        ...

    def get_element_count(self) -> int:
        "Get element count"
        ...

    def skip(self, reason: str) -> None:
        "Skip this configuration"
        ...

    def is_skipped(self) -> bool:
        "Has this configuration been skipped"
        ...

    def get_skip_reason(self) -> str:
        "Get reason provided for skipping this configuration"
        ...

    def add_global_memory_reads(
        self, nbytes: SupportsInt, /, column_name: str = ""
    ) -> None:
        "Inform NVBench that given amount of bytes is being read by the benchmark from global memory"
        ...

    def add_global_memory_writes(
        self, nbytes: SupportsInt, /, column_name: str = ""
    ) -> None:
        "Inform NVBench that given amount of bytes is being written by the benchmark into global memory"
        ...

    def get_benchmark(self) -> Benchmark:
        "Get Benchmark this configuration is a part of"
        ...

    def get_throttle_threshold(self) -> float:
        "Get throttle threshold value, as fraction of maximal frequency"
        ...

    def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None:
        "Set throttle threshold fraction to specified value, expected to be between 0 and 1"
        ...

    def get_min_samples(self) -> int:
        "Get the number of benchmark timings NVBench performs before stopping criterion begins being used"
        ...

    def set_min_samples(self, min_samples_count: SupportsInt) -> None:
        "Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used"
        ...

    def get_disable_blocking_kernel(self) -> bool:
        "True if use of blocking kernel by NVBench is disabled, False otherwise"
        ...

    def set_disable_blocking_kernel(self, flag: bool) -> None:
        "Use flag = True to disable use of blocking kernel by NVBench"
        ...

    def get_run_once(self) -> bool:
        "Boolean flag whether configuration should only run once"
        ...

    def set_run_once(self, run_once_flag: bool) -> None:
        "Set run-once flag for this configuration"
        ...

    def get_timeout(self) -> float:
        "Get time-out value for benchmark execution of this configuration, in seconds"
        ...

    def set_timeout(self, duration: SupportsFloat) -> None:
        "Set time-out value for benchmark execution of this configuration, in seconds"
        ...

    def get_blocking_kernel_timeout(self) -> float:
        "Get time-out value for execution of blocking kernel, in seconds"
        ...

    def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None:
        "Set time-out value for execution of blocking kernel, in seconds"
        ...

    def collect_cupti_metrics(self) -> None:
        "Request NVBench to record CUPTI metrics while running benchmark for this configuration"
        ...

    def is_cupti_required(self) -> bool:
        "True if (some) CUPTI metrics are being collected"
        ...

    def exec(
        self,
        fn: Callable[[Launch], None],
        /,
        *,
        batched: Optional[bool] = True,
        sync: Optional[bool] = False,
    ) -> None:
        """Execute callable running the benchmark.

        The callable may be executed multiple times.

        Parameters
        ----------
        fn: Callable
            Python callable with signature fn(Launch) -> None that executes the benchmark.
        batched: bool, optional
            If `True`, no cache flushing is performed between callable invocations.
            Default: `True`.
        sync: bool, optional
            True value indicates that callable performs device synchronization.
            NVBench disables use of blocking kernel in this case.
            Default: `False`.
        """
        ...

    def get_short_description(self) -> str:
        "Get short description for this configuration"
        ...

    def add_summary(
        self, column_name: str, value: Union[SupportsInt, SupportsFloat, str]
    ) -> None:
        "Add summary column with a value"
        ...

    def get_axis_values(self) -> dict[str, int | float | str]:
        "Get dictionary with axis values for this configuration"
        ...

    def get_axis_values_as_string(self) -> str:
        "Get string of space-separated name=value pairs for this configuration"
        ...

    def get_stopping_criterion(self) -> str:
        "Get string name of stopping criterion used"
        ...
def register(fn: Callable[[State], None]) -> Benchmark:
    """
    Register given benchmarking function with NVBench.

    Parameters
    ----------
    fn: Callable
        Python callable with signature fn(State) -> None implementing the benchmark.

    Returns
    -------
    Benchmark
        The newly created benchmark; use it to add parameter axes and settings.
    """
    ...
def run_all_benchmarks(argv: Sequence[str]) -> None:
    """
    Run all benchmarks registered with NVBench.

    Parameters
    ----------
    argv: Sequence[str]
        Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`.
    """
    ...
class NVBenchRuntimeError(RuntimeError):
    """An exception raised if running benchmarks encounters an error.

    Subclasses ``RuntimeError``, so existing ``except RuntimeError`` handlers
    also catch it.
    """

    ...

View File

View File

@@ -0,0 +1,75 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import numpy as np
from numba import cuda
def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a numba.cuda
    # external stream object.
    return cuda.external_stream(cs.addressof())
def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
    """JIT a copy kernel specialized for the given items_per_thread.

    items_per_thread is captured as a compile-time constant in the closure so
    numba can unroll/specialize the inner loop.
    """

    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        tid = cuda.grid(1)
        # Total number of threads in the grid; each thread strides by this.
        step = cuda.gridDim.x * cuda.blockDim.x
        for i in range(stride * tid, stride * elements, stride * step):
            for j in range(items_per_thread):
                # Read index wraps modulo the input length.
                read_id = (items_per_thread * i + j) % elements
                # NOTE(review): write_id uses tid rather than i, so for
                # stride > 1 successive outer iterations of the same thread
                # overwrite the same output slots -- confirm this is intended
                # for a pure-throughput measurement.
                write_id = tid + j * elements
                out_arr[write_id] = in_arr[read_id]

    return kernel
def throughput_bench(state: bench.State) -> None:
    """Benchmark a numba.cuda copy kernel, sweeping Stride and ItemsPerThread."""
    stride = state.get_int64("Stride")
    ipt = state.get_int64("ItemsPerThread")
    # 128 MiB of int32 input data.
    nbytes = 128 * 1024 * 1024
    elements = nbytes // np.dtype(np.int32).itemsize
    # Allocate on the stream NVBench associates with this configuration.
    alloc_stream = as_cuda_stream(state.get_stream())
    inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
    # Output holds ItemsPerThread values per input element.
    out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)
    state.add_element_count(elements, column_name="Elements")
    state.collect_cupti_metrics()
    threads_per_block = 256
    # Ceiling division so every element is covered by a thread.
    blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block
    krn = make_throughput_kernel(ipt)

    def launcher(launch: bench.Launch):
        # Launch on the per-measurement stream provided by NVBench.
        exec_stream = as_cuda_stream(launch.get_stream())
        krn[blocks_in_grid, threads_per_block, exec_stream, 0](
            stride, elements, inp_arr, out_arr
        )

    state.exec(launcher)
if __name__ == "__main__":
    # Register the benchmark and define the two-dimensional parameter sweep.
    b = bench.register(throughput_bench)
    b.add_int64_axis("Stride", [1, 2, 4])
    b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
    bench.run_all_benchmarks(sys.argv)

206
python/examples/axes.py Normal file
View File

@@ -0,0 +1,206 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ctypes
import sys
from typing import Dict, Optional, Tuple
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a cuda.core Stream.
    return core.Stream.from_handle(cs.addressof())
def make_sleep_kernel():
    """JITs sleep_kernel(seconds)

    Compiles a CUDA C++ kernel that busy-waits for the requested wall-clock
    duration, using the libcudacxx headers shipped with cuda.cccl.
    """
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/chrono>

    // Each launched thread just sleeps for `seconds`.
    __global__ void sleep_kernel(double seconds) {
      namespace chrono = ::cuda::std::chrono;
      using hr_clock = chrono::high_resolution_clock;

      auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
      const auto ns = chrono::nanoseconds(duration);
      const auto start = hr_clock::now();
      const auto finish = start + ns;

      auto now = hr_clock::now();
      while (now < finish)
      {
        now = hr_clock::now();
      }
    }
    """
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
def simple(state: bench.State):
    """Benchmark a fixed-duration GPU sleep kernel; no parameter axes."""
    state.set_min_samples(1000)
    sleep_dur = 1e-3
    krn = make_sleep_kernel()
    # Single-thread launch: the kernel body only spins on the clock.
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, sleep_dur)

    state.exec(launcher)
def single_float64_axis(state: bench.State):
    """Benchmark a GPU sleep kernel whose duration comes from a Float64 axis,
    falling back to a default when the axis is absent."""
    # get axis value, or default
    default_sleep_dur = 3.14e-4
    sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur)
    krn = make_sleep_kernel()
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, sleep_dur)

    state.exec(launcher)
def default_value(state: bench.State):
    # Registered without any axis, so single_float64_axis's
    # get_float64_or_default falls back to its default duration.
    single_float64_axis(state)
def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None):
    """JIT copy_kernel<T, U> for the given CUDA type names.

    Parameters
    ----------
    in_type, out_type: str, optional
        CUDA C++ type names for the template instantiation; both default to
        ``::cuda::std::int32_t`` when None.
    """
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/cstddef>

    /*!
     * Naive copy of `n` values from `in` -> `out`.
     */
    template <typename T, typename U>
    __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
    {
      const auto init = blockIdx.x * blockDim.x + threadIdx.x;
      const auto step = blockDim.x * gridDim.x;

      for (auto i = init; i < n; i += step)
      {
        out[i] = static_cast<U>(in[i]);
      }
    }
    """
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    if in_type is None:
        in_type = "::cuda::std::int32_t"
    if out_type is None:
        out_type = "::cuda::std::int32_t"
    instance_name = f"copy_kernel<{in_type}, {out_type}>"
    mod = prog.compile("cubin", name_expressions=(instance_name,))
    return mod.get_kernel(instance_name)
def copy_sweep_grid_shape(state: bench.State):
    """Copy 256 MiB of int32 values, sweeping block size and grid size."""
    block_size = state.get_int64("BlockSize")
    num_blocks = state.get_int64("NumBlocks")
    # Number of int32 elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))
    state.add_element_count(num_values)
    # One read and one write of the whole buffer per kernel invocation.
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)
    dev_id = state.get_device()
    alloc_s = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    krn = make_copy_kernel()
    launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)

    def launcher(launch: bench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
def copy_type_sweep(state: bench.State):
    """Copy 256 MiB, sweeping the element type via an integer TypeID axis."""
    type_id = state.get_int64("TypeID")
    # Map axis value -> (ctypes type for sizing, CUDA C++ type name for JIT).
    types_map: Dict[int, Tuple[type, str]] = {
        0: (ctypes.c_uint8, "cuda::std::uint8_t"),
        1: (ctypes.c_uint16, "cuda::std::uint16_t"),
        2: (ctypes.c_uint32, "cuda::std::uint32_t"),
        3: (ctypes.c_uint64, "cuda::std::uint64_t"),
        4: (ctypes.c_float, "float"),
        5: (ctypes.c_double, "double"),
    }
    value_ctype, value_cuda_t = types_map[type_id]
    # Show the human-readable type name in the report.
    state.add_summary("Type", value_cuda_t)
    # Number of elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(value_ctype)
    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)
    dev_id = state.get_device()
    alloc_s = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s)
    krn = make_copy_kernel(value_cuda_t, value_cuda_t)
    launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: bench.Launch):
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
if __name__ == "__main__":
    # Benchmark without axes
    bench.register(simple)
    # benchmark with no axes, that uses default value
    bench.register(default_value)
    # specify axis -- the axis name must match the name looked up by
    # single_float64_axis via get_float64_or_default("Duration", ...);
    # a mismatched name would make every configuration silently fall back
    # to the default duration instead of sweeping these values.
    bench.register(single_float64_axis).add_float64_axis(
        "Duration", [7e-5, 1e-4, 5e-4]
    )
    copy1_bench = bench.register(copy_sweep_grid_shape)
    copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)])
    copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
    copy2_bench = bench.register(copy_type_sweep)
    copy2_bench.add_int64_axis("TypeID", range(0, 6))
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,99 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cuda.cccl.cooperative.experimental as coop
import numba
import numpy as np
from numba import cuda
from pynvjitlink import patch
class BitsetRing:
    """
    Addition operation over ring fixed width unsigned integers
    with ring_plus = bitwise_or and ring_mul = bitwise_and,
    ring_zero = 0, ring_one = -1
    """

    def __init__(self):
        # Ring elements are 64-bit unsigned integers.
        self.dt = np.uint64
        # Additive identity: all bits cleared.
        self.zero = self.dt(0)
        # Multiplicative identity: all bits set (uint64 max == ~0).
        self.one = self.dt(np.iinfo(self.dt).max)

    @staticmethod
    def add(op1, op2):
        # Ring addition is bitwise OR.
        return op1 | op2

    @staticmethod
    def mul(op1, op2):
        # Ring multiplication is bitwise AND.
        return op1 & op2
def as_cuda_Stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a numba.cuda
    # external stream object.
    return cuda.external_stream(cs.addressof())
def multi_block_bench(state: bench.State):
    """Benchmark a block-wide reduction (bitwise OR) built with
    cuda.cccl.cooperative, one result per block."""
    threads_per_block = state.get_int64("ThreadsPerBlock")
    num_blocks = state.get_int64("NumBlocks")
    total_elements = threads_per_block * num_blocks
    # Skip configurations whose input would exceed 2**26 elements.
    if total_elements > 2**26:
        state.skip(reason="Memory footprint over threshold")
        return
    ring = BitsetRing()
    # Block-wide reduce specialized for this block size and the ring's add op.
    block_reduce = coop.block.reduce(numba.uint64, threads_per_block, BitsetRing.add)

    @cuda.jit(link=block_reduce.files)
    def kernel(inp_arr, out_arr):
        # Each thread contributes one element
        block_idx = cuda.blockIdx.x
        thread_idx = cuda.threadIdx.x
        global_idx = block_idx * threads_per_block + thread_idx
        block_output = block_reduce(inp_arr[global_idx])
        # Only thread 0 of each block writes the result
        if thread_idx == 0:
            out_arr[block_idx] = block_output

    h_inp = np.arange(1, total_elements + 1, dtype=ring.dt)
    d_inp = cuda.to_device(h_inp)
    d_out = cuda.device_array(num_blocks, dtype=ring.dt)
    state.add_element_count(total_elements)
    state.add_global_memory_reads(total_elements * h_inp.itemsize)
    state.add_global_memory_writes(num_blocks * h_inp.itemsize)

    def launcher(launch: bench.Launch):
        cuda_s = as_cuda_Stream(launch.get_stream())
        kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out)

    state.exec(launcher)
if __name__ == "__main__":
    # Enable LTO-capable linking in numba via pynvjitlink before JIT-ing,
    # required for linking the cooperative block_reduce files.
    patch.patch_numba_linker(lto=True)
    b = bench.register(multi_block_bench)
    b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256])
    b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,124 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cuda.cccl.parallel.experimental.algorithms as algorithms
import cuda.cccl.parallel.experimental.iterators as iterators
import cuda.core.experimental as core
import cupy as cp
import numpy as np
class CCCLStream:
    """Minimal stream-protocol wrapper around a raw cudaStream_t handle.

    Class to work around https://github.com/NVIDIA/cccl/issues/5144
    """

    def __init__(self, ptr):
        # Integral cudaStream_t handle value.
        self._ptr = ptr

    def __cuda_stream__(self):
        # CUDA stream protocol: (protocol_version, stream_handle).
        return 0, self._ptr
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a cuda.core Stream.
    return core.Stream.from_handle(cs.addressof())
def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream:
    # Wrap the raw cudaStream_t address in the local workaround class.
    return CCCLStream(cs.addressof())
def as_cp_ExternalStream(
    cs: bench.CudaStream, dev_id: int | None = -1
) -> cp.cuda.ExternalStream:
    # Wrap the raw cudaStream_t address as a CuPy external stream bound
    # to the given device id.
    h = cs.addressof()
    return cp.cuda.ExternalStream(h, dev_id)
def segmented_reduce(state: bench.State):
    "Benchmark segmented_reduce example"
    n_elems = state.get_int64("numElems")
    n_cols = state.get_int64("numCols")
    n_rows = n_elems // n_cols
    state.add_summary("numRows", n_rows)
    state.collect_cupti_metrics()
    dev_id = state.get_device()
    cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id)

    # Binary reduction operator applied within each segment.
    def add_op(a, b):
        return a + b

    def make_scaler(step):
        # Builds row_id -> row_id * step, mapping a row index to its
        # starting offset in the flattened row-major matrix.
        def scale(row_id):
            return row_id * step

        return scale

    zero = np.int32(0)
    row_offset = make_scaler(np.int32(n_cols))
    start_offsets = iterators.TransformIterator(
        iterators.CountingIterator(zero), row_offset
    )
    # End offset of segment i is the start offset of segment i + 1.
    end_offsets = start_offsets + 1
    h_init = np.zeros(tuple(), dtype=np.int32)
    with cp_stream:
        rng = cp.random.default_rng()
        mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols))
        d_input = mat
        d_output = cp.empty(n_rows, dtype=d_input.dtype)
    alg = algorithms.segmented_reduce(
        d_input, d_output, start_offsets, end_offsets, add_op, h_init
    )
    cccl_stream = as_cccl_Stream(state.get_stream())
    # query size of temporary storage and allocate
    temp_nbytes = alg(
        None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream
    )
    # NOTE(review): h_init is re-created here with the same value as above;
    # presumably redundant -- confirm and drop one of the two assignments.
    h_init = np.zeros(tuple(), dtype=np.int32)
    with cp_stream:
        temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8)

    def launcher(launch: bench.Launch):
        s = as_cccl_Stream(launch.get_stream())
        alg(
            temp_storage,
            d_input,
            d_output,
            n_rows,
            start_offsets,
            end_offsets,
            h_init,
            s,
        )

    state.exec(launcher)
if __name__ == "__main__":
    # Sweep total element count and segment width (columns per row).
    b = bench.register(segmented_reduce)
    b.add_int64_axis("numElems", [2**20, 2**22, 2**24])
    b.add_int64_axis("numCols", [1024, 2048, 4096, 8192])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,97 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
host_sleep_duration = 0.1
def cpu_only_sleep_bench(state: bench.State) -> None:
    """Benchmark that only sleeps on the host; registered as CPU-only."""

    def launcher(launch: bench.Launch):
        time.sleep(host_sleep_duration)

    state.exec(launcher)
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    # Wrap the raw cudaStream_t address from NVBench as a cuda.core Stream.
    return core.Stream.from_handle(cs.addressof())
def make_sleep_kernel():
    """JITs sleep_kernel(seconds)

    Compiles a CUDA C++ kernel that busy-waits for the requested wall-clock
    duration, using the libcudacxx headers shipped with cuda.cccl.
    """
    src = r"""
    #include <cuda/std/cstdint>
    #include <cuda/std/chrono>

    // Each launched thread just sleeps for `seconds`.
    __global__ void sleep_kernel(double seconds) {
      namespace chrono = ::cuda::std::chrono;
      using hr_clock = chrono::high_resolution_clock;

      auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
      const auto ns = chrono::nanoseconds(duration);
      const auto start = hr_clock::now();
      const auto finish = start + ns;

      auto now = hr_clock::now();
      while (now < finish)
      {
        now = hr_clock::now();
      }
    }
    """
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
def mixed_sleep_bench(state: bench.State) -> None:
    """Benchmark combining host-side sleep and a GPU sleep kernel.

    The "Sync" string axis selects whether exec() is told the callable
    performs device synchronization.
    """
    sync = state.get_string("Sync")
    sync_flag = sync == "Do sync"
    gpu_sleep_dur = 225e-3
    krn = make_sleep_kernel()
    launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        # host overhead
        time.sleep(host_sleep_duration)
        # GPU computation
        s = as_core_Stream(launch.get_stream())
        core.launch(s, launch_config, krn, gpu_sleep_dur)

    state.exec(launcher, sync=sync_flag)
if __name__ == "__main__":
    # time function only doing work (sleeping) on the host
    # using CPU timer only
    b = bench.register(cpu_only_sleep_bench)
    b.set_is_cpu_only(True)
    # time the function that does work on both GPU and CPU,
    # once per value of the "Sync" axis
    b2 = bench.register(mixed_sleep_bench)
    b2.add_string_axis("Sync", ["Do not sync", "Do sync"])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,63 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cupy as cp
def as_cp_ExternalStream(
    cs: "bench.CudaStream", dev_id: "int | None" = -1
) -> "cp.cuda.ExternalStream":
    """Wrap NVBench's native CUDA stream in a non-owning CuPy ExternalStream.

    :param cs: stream object exposed by NVBench (must provide ``addressof()``).
    :param dev_id: CUDA device id for the stream; -1 (CuPy's default) means
        the device is looked up on first use.

    Annotations are quoted deliberately: the package declares
    ``requires-python >= 3.9`` and PEP 604 unions (``int | None``) are only
    evaluable at definition time on Python >= 3.10.
    """
    h = cs.addressof()
    return cp.cuda.ExternalStream(h, dev_id)
def cupy_extract_by_mask(state: bench.State):
    """Benchmark CuPy boolean-mask extraction ``X[mask]`` on int32 data.

    Axes "numCols"/"numRows" select the array shape; note the arrays are
    allocated as ``(numCols, numRows)``.
    """
    n_cols = state.get_int64("numCols")
    n_rows = state.get_int64("numRows")
    dev_id = state.get_device()
    # View NVBench's stream as a CuPy stream for allocation and warm-up.
    cp_s = as_cp_ExternalStream(state.get_stream(), dev_id)
    # Request CUPTI metric collection for this benchmark.
    state.collect_cupti_metrics()
    state.add_element_count(n_rows * n_cols, "# Elements")
    int32_dt = cp.dtype(cp.int32)
    bool_dt = cp.dtype(cp.bool_)
    # Reads: the int32 payload plus the boolean mask.
    state.add_global_memory_reads(
        n_rows * n_cols * (int32_dt.itemsize + bool_dt.itemsize)
    )
    # Writes: every element is extracted, since the mask is all-ones.
    state.add_global_memory_writes(n_rows * n_cols * (int32_dt.itemsize))
    with cp_s:
        X = cp.full((n_cols, n_rows), fill_value=3, dtype=int32_dt)
        mask = cp.ones((n_cols, n_rows), dtype=bool_dt)
        # Warm-up run outside of the timed region.
        _ = X[mask]

    def launcher(launch: bench.Launch):
        # Run the extraction on the per-launch stream NVBench provides.
        with as_cp_ExternalStream(launch.get_stream(), dev_id):
            _ = X[mask]

    # NOTE(review): sync=True requests synchronization around each launch;
    # presumably needed because the CuPy call may synchronize internally —
    # confirm.
    state.exec(launcher, sync=True)
if __name__ == "__main__":
    benchmark = bench.register(cupy_extract_by_mask)
    # Same size sweep for both dimensions.
    benchmark.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096])
    benchmark.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,113 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cuda.bindings.driver as driver
import cuda.core.experimental as core
import cupy as cp
import cutlass
import numpy as np
def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream:
    """View NVBench's native stream as a cuda.bindings CUstream handle."""
    handle = cs.addressof()
    return driver.CUstream(handle)
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    """View NVBench's native stream as a non-owning cuda.core Stream."""
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
def make_cp_array(
    arr_h: "np.ndarray", dev_buf: "core.Buffer", dev_id: "int | None"
) -> "cp.ndarray":
    """Create a CuPy ndarray view over a cuda.core device buffer.

    The returned array mirrors ``arr_h``'s shape and dtype; passing
    ``dev_buf`` as the UnownedMemory owner keeps the buffer alive for the
    lifetime of the view.

    Annotations are quoted deliberately: the package declares
    ``requires-python >= 3.9`` and PEP 604 unions (``int | None``) are only
    evaluable at definition time on Python >= 3.10.
    """
    # CuPy uses -1 to mean "current device" when dev_id is None.
    cp_memview = cp.cuda.UnownedMemory(
        int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id
    )
    zero_offset = 0
    return cp.ndarray(
        arr_h.shape,
        dtype=arr_h.dtype,
        memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset),
    )
def cutlass_gemm(state: bench.State) -> None:
    """Benchmark ``D = alpha * (A @ B) + C`` with a CUTLASS GEMM plan.

    Axes: "N" (output size), "R" (inner/reduction size), "alpha" (scale).
    Operands: A is (N, R) random, B = A^T, C = I, D starts at zero; float64.
    """
    n = state.get_int64("N")
    r = state.get_int64("R")
    alpha = state.get_float64("alpha")
    dt = np.float64
    A_h = np.random.randn(n, r).astype(dt)
    # .mT is the matrix-transpose property (on ndarray requires NumPy >= 2.0).
    B_h = np.copy(A_h.mT)
    C_h = np.eye(n, dtype=dt)
    D_h = np.zeros_like(C_h)
    if n >= 1024:
        # allow more time for large inputs
        state.set_timeout(360)
    dev_id = state.get_device()
    cs = state.get_stream()
    # Same NVBench stream viewed through two APIs: a raw CUstream for the
    # driver memcpys / cutlass, and a cuda.core Stream for allocations.
    s = as_bindings_Stream(cs)
    core_s = as_core_Stream(cs)
    # NOTE(review): a fresh DeviceMemoryResource is constructed per
    # allocation — presumably fine, but confirm a shared one isn't intended.
    A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s)
    B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s)
    C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s)
    D_d = core.DeviceMemoryResource(dev_id).allocate(D_h.nbytes, core_s)
    # Asynchronous host->device copies of all operands on NVBench's stream.
    driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s)
    driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s)
    driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s)
    driver.cuMemcpyAsync(D_d.handle, D_h.ctypes.data, D_h.nbytes, s)
    # CuPy views over the device buffers, as expected by the cutlass API.
    A_cp = make_cp_array(A_h, A_d, dev_id)
    B_cp = make_cp_array(B_h, B_d, dev_id)
    C_cp = make_cp_array(C_h, C_d, dev_id)
    D_cp = make_cp_array(D_h, D_d, dev_id)
    plan = cutlass.op.Gemm(
        A=A_cp,
        B=B_cp,
        C=C_cp,
        D=D_cp,
        element=dt,
        alpha=alpha,
        beta=1,
        layout=cutlass.LayoutType.RowMajor,
    )
    # warm-up to ensure compilation is not timed
    plan.run(stream=s)

    def launcher(launch: bench.Launch) -> None:
        # Run the prepared plan on the per-launch stream, without syncing.
        s = as_bindings_Stream(launch.get_stream())
        plan.run(stream=s, sync=False)

    state.exec(launcher)
if __name__ == "__main__":
    gemm_bench = bench.register(cutlass_gemm)
    # Sweep reduction size, output size, and the scaling factor.
    gemm_bench.add_int64_axis("R", [16, 64, 256])
    gemm_bench.add_int64_axis("N", [256, 512, 1024, 2048])
    gemm_bench.add_float64_axis("alpha", [1e-2])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,85 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ctypes
import sys
from typing import Optional
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    "Create view of native stream used by NVBench"
    native_handle = cs.addressof()
    return core.Stream.from_handle(native_handle)
def make_fill_kernel(data_type: Optional[str] = None):
    """JIT-compile ``fill_kernel<data_type>`` and return the kernel object.

    :param data_type: C++ element type to instantiate the template with;
        defaults to ``::cuda::std::int32_t``.
    """
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/cstddef>
/*!
 * Naive setting of values in buffer
 */
template <typename T>
__global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n)
{
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;
  for (auto i = init; i < n; i += step)
  {
    buf[i] = v;
  }
}
"""
    # libcudacxx include path is needed for the <cuda/std/...> headers.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    if data_type is None:
        data_type = "::cuda::std::int32_t"
    # Request the concrete template instantiation by name.
    instance_name = f"fill_kernel<{data_type}>"
    mod = prog.compile("cubin", name_expressions=(instance_name,))
    return mod.get_kernel(instance_name)
def synchronizing_bench(state: bench.State):
    """Fill a 64M-element int32 device buffer; the launcher syncs its stream."""
    n_values = 64 * 1024 * 1024
    n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0))

    alloc_stream = as_core_Stream(state.get_stream())
    dev_buffer = core.DeviceMemoryResource(state.get_device()).allocate(
        n_bytes, alloc_stream
    )

    state.add_element_count(n_values, "Items")
    state.add_global_memory_writes(n_bytes, "Size")

    kernel = make_fill_kernel()
    cfg = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, kernel, dev_buffer, 0, n_values)
        stream.sync()

    # The launcher contains a synchronization point, so sync=True is
    # required to avoid a deadlock.
    state.exec(launcher, sync=True)
if __name__ == "__main__":
    # No axes: a single configuration of the synchronizing benchmark.
    bench.register(synchronizing_bench)
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,58 @@
import sys
import cuda.bench as bench
import torch
def as_torch_cuda_Stream(
    cs: "bench.CudaStream", dev: "int | None"
) -> torch.cuda.ExternalStream:
    """Wrap NVBench's native stream as a non-owning torch.cuda.ExternalStream.

    :param cs: stream object exposed by NVBench (must provide ``addressof()``).
    :param dev: CUDA device ordinal the stream belongs to, or None.

    Parameter annotations are quoted deliberately: the package declares
    ``requires-python >= 3.9`` and PEP 604 unions (``int | None``) are only
    evaluable at definition time on Python >= 3.10.
    """
    return torch.cuda.ExternalStream(
        stream_ptr=cs.addressof(), device=torch.cuda.device(dev)
    )
def torch_bench(state: bench.State) -> None:
    """One step of degree-3 polynomial fitting (forward pass, manual
    gradients, SGD-style updates) over 2**28 points, timed by NVBench.

    NOTE(review): tensors are created without an explicit ``device=`` and so
    land on PyTorch's current default device (CPU unless configured), even
    though a CUDA stream context is active — confirm this is intended.
    """
    # Lower the GPU throttle threshold to 0.25.
    state.set_throttle_threshold(0.25)
    dev_id = state.get_device()
    tc_s = as_torch_cuda_Stream(state.get_stream(), dev_id)
    dt = torch.float32
    scalar_shape: tuple = tuple()
    n = 2**28
    with torch.cuda.stream(tc_s):
        # Random scalar coefficients of y_pred = a3 + a2*x + a1*x^2 + a0*x^3.
        a3 = torch.randn(scalar_shape, dtype=dt)
        a2 = torch.randn(scalar_shape, dtype=dt)
        a1 = torch.randn(scalar_shape, dtype=dt)
        a0 = torch.randn(scalar_shape, dtype=dt)
        x = torch.linspace(-3, 3, n, dtype=dt)
        y = torch.sin(x)
    learning_rate = 1e-4

    def launcher(launch: bench.Launch) -> None:
        tc_s = as_torch_cuda_Stream(launch.get_stream(), dev_id)
        with torch.cuda.stream(tc_s):
            x2 = torch.square(x)
            # Evaluate the cubic polynomial.
            y_pred = (a3 + x2 * a1) + x * (a2 + a0 * x2)
            # Sum-of-squares loss; value discarded, kept for the workload.
            _ = torch.square(y_pred - y).sum()
            # Manual gradients w.r.t. each coefficient.
            grad_y_pred = 2 * (y_pred - y)
            grad_a3 = grad_y_pred.sum()
            grad_a2 = (grad_y_pred * x).sum()
            grad_a1 = (grad_y_pred * x2).sum()
            grad_a0 = (grad_y_pred * x2 * x).sum()
            # Gradient-descent updates; results discarded each launch.
            _ = a3 - grad_a3 * learning_rate
            _ = a2 - grad_a2 * learning_rate
            _ = a1 - grad_a1 * learning_rate
            _ = a0 - grad_a0 * learning_rate

    state.exec(launcher, sync=True)
if __name__ == "__main__":
    # Single configuration; no axes are attached.
    bench.register(torch_bench)
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,7 @@
numpy
numba
cupy
nvidia-cutlass
cuda-cccl
cuda-core
cuda-bindings

89
python/examples/skip.py Normal file
View File

@@ -0,0 +1,89 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    "Create view into native stream provided by NVBench"
    stream_addr = cs.addressof()
    return core.Stream.from_handle(stream_addr)
def make_sleep_kernel():
    """JIT-compile and return the CUDA kernel ``sleep_kernel(seconds)``.

    Each launched thread busy-waits until ``seconds`` have elapsed, as
    measured with cuda::std::chrono's high-resolution clock.
    """
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/chrono>
// Each launched thread just sleeps for `seconds`.
__global__ void sleep_kernel(double seconds) {
  namespace chrono = ::cuda::std::chrono;
  using hr_clock = chrono::high_resolution_clock;
  auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
  const auto ns = chrono::nanoseconds(duration);
  const auto start = hr_clock::now();
  const auto finish = start + ns;
  auto now = hr_clock::now();
  while (now < finish)
  {
    now = hr_clock::now();
  }
}
"""
    # libcudacxx include path is needed for the <cuda/std/...> headers.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    # Compile to cubin and look the kernel up by name.
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
def runtime_skip(state: bench.State):
    """Demonstrate skipping benchmark configurations at run time."""
    duration = state.get_float64("Duration")
    kramble = state.get_string("Kramble")

    # Decide whether this axis combination should be skipped.
    skip_reason = None
    if kramble == "Baz" and duration < 0.8e-3:
        # Skip Baz benchmarks with 0.8 ms duration
        skip_reason = "Short 'Baz' benchmarks are skipped"
    elif kramble == "Foo" and duration > 0.3e-3:
        # Skip Foo benchmark with > 0.3 ms duration
        skip_reason = "Long 'Foo' benchmarks are skipped"
    if skip_reason is not None:
        state.skip(skip_reason)
        return

    kernel = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, kernel, duration)

    state.exec(launcher)
if __name__ == "__main__":
    skip_bench = bench.register(runtime_skip)
    # Durations: 0.1 ms, 0.35 ms, 0.6 ms, 0.85 ms, 1.1 ms.
    skip_bench.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)])
    skip_bench.add_string_axis("Kramble", ["Foo", "Bar", "Baz"])
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,76 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import cuda.bench as bench
import numpy as np
from numba import cuda
def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream:
    """View NVBench's native stream as a Numba external stream."""
    handle = cs.addressof()
    return cuda.external_stream(handle)
def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
    """Build a Numba CUDA kernel that copies elements with a given stride.

    ``items_per_thread`` is closed over and baked into the JIT-compiled
    kernel, so each distinct value yields a distinct specialization.
    """

    @cuda.jit
    def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
        # Global thread index and total number of launched threads.
        tid = cuda.grid(1)
        step = cuda.gridDim.x * cuda.blockDim.x
        for i in range(stride * tid, stride * elements, stride * step):
            for j in range(items_per_thread):
                # Strided read; wrap the read index into the input array.
                read_id = (items_per_thread * i + j) % elements
                # NOTE(review): write_id does not depend on i, so successive
                # outer-loop iterations overwrite the same slots — confirm
                # this is the intended access pattern for the benchmark.
                write_id = tid + j * elements
                out_arr[write_id] = in_arr[read_id]

    return kernel
def throughput_bench(state: bench.State) -> None:
    """Measure memory throughput of the strided-copy kernel."""
    stride = state.get_int64("Stride")
    ipt = state.get_int64("ItemsPerThread")

    nbytes = 128 * 1024 * 1024
    elements = nbytes // np.dtype(np.int32).itemsize

    alloc_stream = as_cuda_stream(state.get_stream())
    inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream)
    out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream)

    state.add_element_count(elements, column_name="Elements")
    state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize")
    state.add_global_memory_writes(inp_arr.nbytes)

    tpb = 256
    # Ceiling division: enough blocks to cover every element.
    n_blocks = -(-elements // tpb)
    kernel = make_throughput_kernel(ipt)

    def launcher(launch: bench.Launch):
        run_stream = as_cuda_stream(launch.get_stream())
        kernel[n_blocks, tpb, run_stream, 0](stride, elements, inp_arr, out_arr)

    state.exec(launcher)
if __name__ == "__main__":
    tp_bench = bench.register(throughput_bench)
    # Sweep both the access stride and the per-thread workload.
    tp_bench.add_int64_axis("Stride", [1, 2, 4])
    tp_bench.add_int64_axis("ItemsPerThread", [1, 2, 3, 4])
    bench.run_all_benchmarks(sys.argv)

59
python/pyproject.toml Normal file
View File

@@ -0,0 +1,59 @@
[build-system]
requires = ["scikit-build-core>=0.10", "setuptools_scm"]
build-backend = "scikit_build_core.build"
[project]
name = "pynvbench"
description = "CUDA Kernel Benchmarking Package"
authors = [{ name = "NVIDIA Corporation" }]
classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Environment :: GPU :: NVIDIA CUDA",
"License :: OSI Approved :: Apache Software License",
]
requires-python = ">=3.9"
dependencies = [
# pathfinder
"cuda-pathfinder",
# The library expects to find the shared libraries
# libcupti, libnvperf_target, and libnvperf_host;
# pathfinder is used to locate them in the Python layout
"nvidia-cuda-cupti-cu12",
# The shared library
# libnvidia-ml must be installed system-wide
# (Debian package provider: libnvidia-compute)
]
dynamic = ["version"]
readme = { file = "README.md", content-type = "text/markdown" }
[project.optional-dependencies]
test = ["pytest", "cupy-cuda12x", "numba"]
[project.urls]
Homepage = "https://developer.nvidia.com/"
[tool.scikit-build]
minimum-version = "build-system.requires"
build-dir = "build/{wheel_tag}"
[tool.scikit-build.cmake]
version = ">=3.30.4"
args = []
build-type = "Release"
source-dir = "."
[tool.scikit-build.ninja]
version = ">=1.11"
make-fallback = true
[tool.scikit-build.metadata.version]
provider = "scikit_build_core.metadata.setuptools_scm"
[tool.setuptools_scm]
root = ".."
[tool.scikit-build.wheel.packages]
"cuda" = "cuda"
"cuda/bench" = "cuda/bench"

View File

@@ -0,0 +1,24 @@
# Building extension module locally
This may be useful for debugging purposes.
Assuming NVBench is installed into `NVBENCH_PREFIX`
and the pybind11 repo is cloned to `PYBIND11_DIR`,
the Python extension can be built locally, using the host compiler,
as follows:
```bash
g++ py_nvbench.cpp \
-shared -fPIC \
-I ${PYBIND11_DIR}/include \
-I ${NVBENCH_PREFIX}/include \
-I /usr/local/cuda/include \
$(python3-config --includes) \
$(python3-config --libs) \
-L ${NVBENCH_PREFIX}/lib/ \
-lnvbench \
-Wl,-rpath,${NVBENCH_PREFIX}/lib \
-L /usr/local/cuda/lib64/ \
-lcudart \
-Wl,-rpath,/usr/local/cuda/lib64 \
-o _nvbench$(python3-config --extension-suffix)
```

734
python/src/py_nvbench.cpp Normal file
View File

@@ -0,0 +1,734 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// clang-format off
// Include Pybind11 headers first thing
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
// clang-format on
#include <nvbench/nvbench.cuh>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
namespace py = pybind11;
namespace
{
// Deleter for heap-allocated py::object instances that may outlive the
// Python interpreter (e.g. objects held by static/global C++ state).
struct PyObjectDeleter
{
  void operator()(py::object *p)
  {
    const bool initialized = Py_IsInitialized();
#if PY_VERSION_HEX < 0x30d0000
    // Private spelling used prior to CPython 3.13.
    const bool finalizing = _Py_IsFinalizing();
#else
    const bool finalizing = Py_IsFinalizing();
#endif
    const bool guard = initialized && !finalizing;
    // Only run ~object (and hence touch refcounts) while the interpreter is
    // active and not shutting down; otherwise intentionally leak and let the
    // OS clean up resources after interpreter tear-down.
    if (guard)
    {
      delete p;
    }
  }
};
// Adapts a Python callable so that nvbench::benchmark<> can invoke it as a
// kernel generator; the callable receives the nvbench::state by reference.
struct benchmark_wrapper_t
{
  benchmark_wrapper_t() = default;

  // Takes ownership of the callable; throws py::value_error if it is not
  // callable. The py::object is heap-allocated and guarded by
  // PyObjectDeleter so destruction is safe at interpreter shutdown.
  explicit benchmark_wrapper_t(py::object o)
      : m_fn{std::shared_ptr<py::object>(new py::object(std::move(o)), PyObjectDeleter{})}
  {
    if (!PyCallable_Check(m_fn->ptr()))
    {
      throw py::value_error("Argument must be a callable");
    }
  }

  // Only copy constructor is used, delete copy-assign, and moves
  benchmark_wrapper_t(const benchmark_wrapper_t &other) = default;
  benchmark_wrapper_t &operator=(const benchmark_wrapper_t &other) = delete;
  benchmark_wrapper_t(benchmark_wrapper_t &&) noexcept = delete;
  benchmark_wrapper_t &operator=(benchmark_wrapper_t &&) noexcept = delete;

  // Invoked by NVBench for every benchmark configuration.
  void operator()(nvbench::state &state, nvbench::type_list<>)
  {
    if (!m_fn)
    {
      throw std::runtime_error("No function to execute");
    }
    // box as Python object, using reference semantics
    auto arg = py::cast(std::ref(state), py::return_value_policy::reference);
    // Execute Python callable
    (*m_fn)(arg);
  }

private:
  // Important to use shared pointer here rather than py::object directly,
  // since copy constructor must be const (benchmark::do_clone is const member method)
  std::shared_ptr<py::object> m_fn;
};
// Use struct to ensure public inheritance
struct nvbench_run_error : std::runtime_error
{
// ask compiler to generate all constructor signatures
// that are defined for the base class
using std::runtime_error::runtime_error;
};
py::handle benchmark_exc{};
// Registry that mediates between Python and nvbench::benchmark_manager.
// Benchmarks may only be registered before run() executes; run() may only
// execute once.
class GlobalBenchmarkRegistry
{
  // Set to true once run() executes; further registration is rejected.
  bool m_finalized;

public:
  GlobalBenchmarkRegistry()
      : m_finalized(false) {};

  // Non-copyable, non-movable: a single instance backs the module.
  GlobalBenchmarkRegistry(const GlobalBenchmarkRegistry &) = delete;
  GlobalBenchmarkRegistry &operator=(const GlobalBenchmarkRegistry &) = delete;
  GlobalBenchmarkRegistry(GlobalBenchmarkRegistry &&) = delete;
  GlobalBenchmarkRegistry &operator=(GlobalBenchmarkRegistry &&) = delete;

  bool is_finalized() const { return m_finalized; }

  // Register a Python callable as an NVBench benchmark. The benchmark name
  // is taken from fn.__name__ when present, otherwise from repr(fn).
  // Returns a reference to the added benchmark for further configuration.
  // Throws if run() has already executed or fn is not callable.
  nvbench::benchmark_base &add_bench(py::object fn)
  {
    if (m_finalized)
    {
      throw std::runtime_error("Can not register more benchmarks after benchmark was run");
    }
    if (!PyCallable_Check(fn.ptr()))
    {
      throw py::value_error("Benchmark should be a callable object");
    }
    std::string name;
    if (py::hasattr(fn, "__name__"))
    {
      py::str py_name = fn.attr("__name__");
      name = py::cast<std::string>(py_name);
    }
    else
    {
      py::str py_name = py::repr(fn);
      name = py::cast<std::string>(py_name);
    }
    benchmark_wrapper_t executor(fn);
    return nvbench::benchmark_manager::get()
      .add(std::make_unique<nvbench::benchmark<benchmark_wrapper_t>>(executor))
      .set_name(std::move(name));
  }

  // Parse CLI-style arguments and run all registered benchmarks once.
  // C++ exceptions are translated into the module's Python exception type
  // (benchmark_exc); Python errors raised inside benchmark callables are
  // chained onto it via py::raise_from.
  void run(const std::vector<std::string> &argv)
  {
    if (nvbench::benchmark_manager::get().get_benchmarks().empty())
    {
      throw std::runtime_error("No benchmarks had been registered yet");
    }
    if (m_finalized)
    {
      throw std::runtime_error("Benchmarks were already executed");
    }
    m_finalized = true;
    try
    {
      // This line is mandatory for correctness to populate
      // benchmark with devices requested by user via CLI
      nvbench::benchmark_manager::get().initialize();
      {
        nvbench::option_parser parser{};
        parser.parse(argv);
        NVBENCH_MAIN_PRINT_PREAMBLE(parser);
        NVBENCH_MAIN_RUN_BENCHMARKS(parser);
        NVBENCH_MAIN_PRINT_EPILOGUE(parser);
        NVBENCH_MAIN_PRINT_RESULTS(parser);
      } /* Tear down parser before finalization */
    }
    catch (py::error_already_set &e)
    {
      py::raise_from(e, benchmark_exc.ptr(), "Python error raised ");
      throw py::error_already_set();
    }
    catch (const std::exception &e)
    {
      const std::string &exc_message = e.what();
      py::set_error(benchmark_exc, exc_message.c_str());
      throw py::error_already_set();
    }
    catch (...)
    {
      py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main");
      throw py::error_already_set();
    }
  }
};
// Convert the state's axis name/value map into a Python dict.
py::dict py_get_axis_values(const nvbench::state &state)
{
  py::dict result;
  auto axis_values = state.get_axis_values();
  for (const auto &axis_name : axis_values.get_names())
  {
    if (!axis_values.has_value(axis_name))
    {
      continue;
    }
    result[axis_name.c_str()] = py::cast(axis_values.get_value(axis_name));
  }
  return result;
}
// essentially a global variable, but allocated on the heap during module
// initialization; py::nodelete means it is never destroyed, sidestepping
// static-destruction-order issues at interpreter shutdown
std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete> global_registry{};

} // end of anonymous namespace
// ==========================================
// PLEASE KEEP IN SYNC WITH __init__.pyi FILE
// ==========================================
// If you modify these bindings, please be sure to update the
// corresponding type hints in ``../cuda/nvbench/__init__.pyi``
PYBIND11_MODULE(_nvbench, m)
{
// == STEP 1
// Set environment variable CUDA_MODULE_LOADING=EAGER
NVBENCH_DRIVER_API_CALL(cuInit(0));
// This line ensures that benchmark_manager has been created during module init
// It is reinitialized before running all benchmarks to set devices to use
nvbench::benchmark_manager::get().initialize();
// == STEP 2
// Define CudaStream class
// ATTN: nvbench::cuda_stream is move-only class
// Methods:
// Constructors, based on device, or on existing stream
// nvbench::cuda_stream::get_stream
auto py_cuda_stream_cls = py::class_<nvbench::cuda_stream>(m, "CudaStream");
py_cuda_stream_cls.def("__cuda_stream__",
[](const nvbench::cuda_stream &s) -> std::pair<std::size_t, std::size_t> {
return std::make_pair(std::size_t{0},
reinterpret_cast<std::size_t>(s.get_stream()));
});
py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t {
return reinterpret_cast<std::size_t>(s.get_stream());
});
// == STEP 3
// Define Launch class
// ATTN: nvbench::launch is move-only class
// Methods:
// nvbench::launch::get_stream -> nvbench::cuda_stream
auto py_launch_cls = py::class_<nvbench::launch>(m, "Launch");
py_launch_cls.def(
"get_stream",
[](nvbench::launch &launch) { return std::ref(launch.get_stream()); },
py::return_value_policy::reference);
// == STEP 4
// Define Benchmark class
// ATTN: nvbench::benchmark_base is move-only class
// Methods:
// nvbench::benchmark_base::get_name
// nvbench::benchmark_base::add_int64_axis
// nvbench::benchmark_base::add_int64_power_of_two_axis
// nvbench::benchmark_base::add_float64_axis
// nvbench::benchmark_base::add_string_axis
// nvbench::benchmark_base::set_name
// nvbench::benchmark_base::set_is_cpu_only
// nvbench::benchmark_base::set_skip_time
// nvbench::benchmark_base::set_timeout
// nvbench::benchmark_base::set_throttle_threshold
// nvbench::benchmark_base::set_throttle_recovery_delay
// nvbench::benchmark_base::set_stopping_criterion
// nvbench::benchmark_base::set_criterion_param_int64
// nvbench::benchmark_base::set_criterion_param_float64
// nvbench::benchmark_base::set_criterion_param_string
// nvbench::benchmark_base::set_min_samples
auto py_benchmark_cls = py::class_<nvbench::benchmark_base>(m, "Benchmark");
py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name);
py_benchmark_cls.def(
"add_int64_axis",
[](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::int64_t> data) {
self.add_int64_axis(std::move(name), std::move(data));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("values"));
py_benchmark_cls.def(
"add_int64_power_of_two_axis",
[](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::int64_t> data) {
self.add_int64_axis(std::move(name),
std::move(data),
nvbench::int64_axis_flags::power_of_two);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("values"));
py_benchmark_cls.def(
"add_float64_axis",
[](nvbench::benchmark_base &self, std::string name, std::vector<nvbench::float64_t> data) {
self.add_float64_axis(std::move(name), std::move(data));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("values"));
py_benchmark_cls.def(
"add_string_axis",
[](nvbench::benchmark_base &self, std::string name, std::vector<std::string> data) {
self.add_string_axis(std::move(name), std::move(data));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("values"));
py_benchmark_cls.def(
"set_name",
[](nvbench::benchmark_base &self, std::string name) {
self.set_name(std::move(name));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"));
py_benchmark_cls.def(
"set_is_cpu_only",
[](nvbench::benchmark_base &self, bool is_cpu_only) {
self.set_is_cpu_only(is_cpu_only);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("is_cpu_only"));
// TODO: should this be exposed?
py_benchmark_cls.def(
"set_run_once",
[](nvbench::benchmark_base &self, bool run_once) {
self.set_run_once(run_once);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("run_once"));
py_benchmark_cls.def(
"set_skip_time",
[](nvbench::benchmark_base &self, nvbench::float64_t skip_duration_seconds) {
self.set_skip_time(skip_duration_seconds);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("duration_seconds"));
py_benchmark_cls.def(
"set_timeout",
[](nvbench::benchmark_base &self, nvbench::float64_t duration_seconds) {
self.set_timeout(duration_seconds);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("duration_seconds"));
py_benchmark_cls.def(
"set_throttle_threshold",
[](nvbench::benchmark_base &self, nvbench::float32_t threshold) {
self.set_throttle_threshold(threshold);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("threshold"));
py_benchmark_cls.def(
"set_throttle_recovery_delay",
[](nvbench::benchmark_base &self, nvbench::float32_t delay) {
self.set_throttle_recovery_delay(delay);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("delay_seconds"));
py_benchmark_cls.def(
"set_stopping_criterion",
[](nvbench::benchmark_base &self, std::string criterion) {
self.set_stopping_criterion(std::move(criterion));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("criterion"));
py_benchmark_cls.def(
"set_criterion_param_int64",
[](nvbench::benchmark_base &self, std::string name, nvbench::int64_t value) {
self.set_criterion_param_int64(std::move(name), value);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("value"));
py_benchmark_cls.def(
"set_criterion_param_float64",
[](nvbench::benchmark_base &self, std::string name, nvbench::float64_t value) {
self.set_criterion_param_float64(std::move(name), value);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("value"));
py_benchmark_cls.def(
"set_criterion_param_string",
[](nvbench::benchmark_base &self, std::string name, std::string value) {
self.set_criterion_param_string(std::move(name), std::move(value));
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("name"),
py::arg("value"));
py_benchmark_cls.def(
"set_min_samples",
[](nvbench::benchmark_base &self, nvbench::int64_t count) {
self.set_min_samples(count);
return std::ref(self);
},
py::return_value_policy::reference,
py::arg("min_samples_count"));
// == STEP 5
// Define PyState class
// ATTN: nvbench::state is move-only class
// Methods:
// nvbench::state::get_cuda_stream
// nvbench::state::get_cuda_stream_optional
// nvbench::state::set_cuda_stream
// nvbench::state::get_device
// nvbench::state::get_is_cpu_only
// nvbench::state::get_type_config_index
// nvbench::state::get_int64
// nvbench::state::get_int64_or_default
// nvbench::state::get_float64
// nvbench::state::get_float64_or_default
// nvbench::state::get_string
// nvbench::state::get_string_or_default
// nvbench::state::add_element_count
// nvbench::state::set_element_count
// nvbench::state::get_element_count
// nvbench::state::add_global_memory_reads
// nvbench::state::add_global_memory_writes
// nvbench::state::add_buffer_size
// nvbench::state::set_global_memory_rw_bytes
// nvbench::state::get_global_memory_rw_bytes
// nvbench::state::skip
// nvbench::state::is_skipped
// nvbench::state::get_skip_reason
// Inventory of nvbench::state members considered for binding:
// nvbench::state::get_min_samples
// nvbench::state::set_min_samples
// nvbench::state::get_criterion_params
// nvbench::state::get_stopping_criterion
// nvbench::state::get_run_once
// nvbench::state::set_run_once
// nvbench::state::get_disable_blocking_kernel
// nvbench::state::set_disable_blocking_kernel
// nvbench::state::set_skip_time
// nvbench::state::get_skip_time
// nvbench::state::set_timeout
// nvbench::state::get_timeout
// nvbench::state::set_throttle_threshold
// nvbench::state::get_throttle_threshold
// nvbench::state::set_throttle_recovery_delay
// nvbench::state::get_throttle_recovery_delay
// nvbench::state::get_blocking_kernel_timeout
// nvbench::state::set_blocking_kernel_timeout
// nvbench::state::get_axis_values
// nvbench::state::get_axis_values_as_string
// nvbench::state::get_benchmark
// nvbench::state::collect_l1_hit_rates
// nvbench::state::collect_l2_hit_rates
// nvbench::state::collect_stores_efficiency
// nvbench::state::collect_loads_efficiency
// nvbench::state::collect_dram_throughput
// nvbench::state::collect_cupti_metrics
// nvbench::state::is_l1_hit_rate_collected
// nvbench::state::is_l2_hit_rate_collected
// nvbench::state::is_stores_efficiency_collected
// nvbench::state::is_loads_efficiency_collected
// nvbench::state::is_dram_throughput_collected
// nvbench::state::is_cupti_required
// nvbench::state::add_summary
// nvbench::state::get_summary
// nvbench::state::get_summaries
// nvbench::state::get_short_description
// nvbench::state::exec
// NOTE:
// State wraps std::reference_wrapper<nvbench::state>
using state_ref_t = std::reference_wrapper<nvbench::state>;
// Python-facing "State" class. No __init__ is bound, so Python can only
// receive State instances handed out by the C++ side.
auto pystate_cls = py::class_<nvbench::state>(m, "State");
// True when this state is bound to a CUDA device (get_device() holds a value).
pystate_cls.def("has_device", [](const nvbench::state &state) -> bool {
  return static_cast<bool>(state.get_device());
});
// True when the owning benchmark has a printer attached.
pystate_cls.def("has_printers", [](const nvbench::state &state) -> bool {
  return state.get_benchmark().get_printer().has_value();
});
// Returns the CUDA device id as an int, or None when no device is bound.
pystate_cls.def("get_device", [](const nvbench::state &state) {
  auto dev = state.get_device();
  if (dev.has_value())
  {
    return py::cast(dev.value().get_id());
  }
  return py::object(py::none());
});
// Borrowed reference: the stream is owned by the state, so
// return_value_policy::reference avoids any Python-side lifetime management.
pystate_cls.def(
  "get_stream",
  [](nvbench::state &state) { return std::ref(state.get_cuda_stream()); },
  py::return_value_policy::reference);
// Axis-value getters: fetch the current value of a named axis by type.
pystate_cls.def("get_int64", &nvbench::state::get_int64, py::arg("name"));
// *_or_default variants return `default_value` when the axis is absent.
// py::pos_only{} makes both arguments positional-only on the Python side.
pystate_cls.def("get_int64_or_default",
                &nvbench::state::get_int64_or_default,
                py::arg("name"),
                py::pos_only{},
                py::arg("default_value"));
pystate_cls.def("get_float64", &nvbench::state::get_float64, py::arg("name"));
pystate_cls.def("get_float64_or_default",
                &nvbench::state::get_float64_or_default,
                py::arg("name"),
                py::pos_only{},
                py::arg("default_value"));
pystate_cls.def("get_string", &nvbench::state::get_string, py::arg("name"));
pystate_cls.def("get_string_or_default",
                &nvbench::state::get_string_or_default,
                py::arg("name"),
                py::pos_only{},
                py::arg("default_value"));
// Element-count accessors (optional column name selects the report column).
pystate_cls.def("add_element_count",
                &nvbench::state::add_element_count,
                py::arg("count"),
                py::arg("column_name") = py::str(""));
pystate_cls.def("set_element_count", &nvbench::state::set_element_count, py::arg("count"));
pystate_cls.def("get_element_count", &nvbench::state::get_element_count);
// Skipping: mark the current configuration as skipped with a reason string.
pystate_cls.def("skip", &nvbench::state::skip, py::arg("reason"));
pystate_cls.def("is_skipped", &nvbench::state::is_skipped);
pystate_cls.def("get_skip_reason", &nvbench::state::get_skip_reason);
// Record global-memory traffic. The lambdas pin the byte-count overload so the
// Python signature is (nbytes, /, column_name="").
pystate_cls.def(
  "add_global_memory_reads",
  [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
    state.add_global_memory_reads(nbytes, column_name);
  },
  "Add size, in bytes, of global memory reads",
  py::arg("nbytes"),
  py::pos_only{},
  py::arg("column_name") = py::str(""));
pystate_cls.def(
  "add_global_memory_writes",
  [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
    state.add_global_memory_writes(nbytes, column_name);
  },
  "Add size, in bytes, of global memory writes",
  py::arg("nbytes"),
  py::pos_only{},
  py::arg("column_name") = py::str(""));
// Borrowed reference to the owning benchmark (owned by the C++ side).
pystate_cls.def(
  "get_benchmark",
  [](const nvbench::state &state) { return std::ref(state.get_benchmark()); },
  py::return_value_policy::reference);
// Measurement-control knobs, forwarded 1:1 to nvbench::state.
pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold);
pystate_cls.def("set_throttle_threshold",
                &nvbench::state::set_throttle_threshold,
                py::arg("throttle_fraction"));
pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples);
pystate_cls.def("set_min_samples",
                &nvbench::state::set_min_samples,
                py::arg("min_samples_count"));
pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel);
pystate_cls.def("set_disable_blocking_kernel",
                &nvbench::state::set_disable_blocking_kernel,
                py::arg("disable_blocking_kernel"));
pystate_cls.def("get_run_once", &nvbench::state::get_run_once);
pystate_cls.def("set_run_once", &nvbench::state::set_run_once, py::arg("run_once"));
pystate_cls.def("get_timeout", &nvbench::state::get_timeout);
pystate_cls.def("set_timeout", &nvbench::state::set_timeout, py::arg("duration"));
pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout);
pystate_cls.def("set_blocking_kernel_timeout",
                &nvbench::state::set_blocking_kernel_timeout,
                py::arg("duration"));
// CUPTI metric collection toggles.
pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics);
pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required);
// State.exec(launcher_fn, /, batched=True, sync=False)
// Bridges nvbench's compile-time exec tags to runtime (batched, sync) flags:
// the four constexpr branches below select the matching tag combination.
pystate_cls.def(
  "exec",
  [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) {
    // Validate eagerly so the error surfaces before any measurement starts.
    if (!PyCallable_Check(py_launcher_fn.ptr()))
    {
      throw py::type_error("Argument of exec method must be a callable object");
    }
    // wrapper to invoke Python callable
    auto cpp_launcher_fn = [py_launcher_fn](nvbench::launch &launch_descr) -> void {
      // cast C++ object to python object; reference policy means Python only
      // borrows launch_descr for the duration of the callback
      auto launch_pyarg = py::cast(std::ref(launch_descr), py::return_value_policy::reference);
      // call Python callable
      py_launcher_fn(launch_pyarg);
    };
    if (sync)
    {
      if (batched)
      {
        constexpr auto tag = nvbench::exec_tag::sync;
        state.exec(tag, cpp_launcher_fn);
      }
      else
      {
        // sync + non-batched: additionally suppress batch measurements
        constexpr auto tag = nvbench::exec_tag::sync | nvbench::exec_tag::no_batch;
        state.exec(tag, cpp_launcher_fn);
      }
    }
    else
    {
      if (batched)
      {
        constexpr auto tag = nvbench::exec_tag::none;
        state.exec(tag, cpp_launcher_fn);
      }
      else
      {
        constexpr auto tag = nvbench::exec_tag::no_batch;
        state.exec(tag, cpp_launcher_fn);
      }
    }
  },
  "Executor for given launcher callable fn(state : Launch)",
  py::arg("launcher_fn"),
  py::pos_only{},
  py::arg("batched") = true,
  py::arg("sync") = false);
pystate_cls.def("get_short_description",
                [](const nvbench::state &state) { return state.get_short_description(); });
// add_summary overloads: pybind11 tries them in registration order
// (str, then int64, then float64). Each stores the value under the
// "nv/python/<name>" summary tag with a generated description.
pystate_cls.def(
  "add_summary",
  [](nvbench::state &state, std::string column_name, std::string value) {
    auto &summ = state.add_summary("nv/python/" + column_name);
    // "description" is built before column_name is moved below.
    summ.set_string("description", "User tag: " + column_name);
    summ.set_string("name", std::move(column_name));
    summ.set_string("value", std::move(value));
  },
  py::arg("name"),
  py::arg("value"));
pystate_cls.def(
  "add_summary",
  [](nvbench::state &state, std::string column_name, std::int64_t value) {
    auto &summ = state.add_summary("nv/python/" + column_name);
    summ.set_string("description", "User tag: " + column_name);
    summ.set_string("name", std::move(column_name));
    summ.set_int64("value", value);
  },
  py::arg("name"),
  py::arg("value"));
pystate_cls.def(
  "add_summary",
  [](nvbench::state &state, std::string column_name, double value) {
    auto &summ = state.add_summary("nv/python/" + column_name);
    summ.set_string("description", "User tag: " + column_name);
    summ.set_string("name", std::move(column_name));
    summ.set_float64("value", value);
  },
  py::arg("name"),
  py::arg("value"));
pystate_cls.def("get_axis_values_as_string",
                [](const nvbench::state &state) { return state.get_axis_values_as_string(); });
// py_get_axis_values is a helper defined earlier in this file.
pystate_cls.def("get_axis_values", &py_get_axis_values);
pystate_cls.def("get_stopping_criterion", &nvbench::state::get_stopping_criterion);
// Use handle to take a memory leak here, since this object's destructor may be called after
// interpreter has shut down
benchmark_exc =
py::exception<nvbench_run_error>(m, "NVBenchRuntimeError", PyExc_RuntimeError).release();
// == STEP 6
// ATTN: nvbench::benchmark_manager is a singleton
global_registry =
std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete>(new GlobalBenchmarkRegistry(),
py::nodelete{});
m.def(
"register",
[&](py::object fn) { return std::ref(global_registry->add_bench(fn)); },
"Register benchmark function of type Callable[[nvbench.State], None]",
py::return_value_policy::reference,
py::arg("benchmark_fn"));
m.def(
"run_all_benchmarks",
[&](py::object argv) -> void {
if (!py::isinstance<py::list>(argv))
{
throw py::type_error("run_all_benchmarks expects a list of command-line arguments");
}
std::vector<std::string> args = py::cast<std::vector<std::string>>(argv);
global_registry->run(args);
},
"Run all registered benchmarks",
py::arg("argv") = py::list());
m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); });
m.def("test_py_exception", []() {
py::set_error(benchmark_exc, "Test");
throw py::error_already_set();
});
}

119
python/test/run_1.py Normal file
View File

@@ -0,0 +1,119 @@
import sys
import cuda.bench as bench
import numpy as np
from numba import cuda
@cuda.jit()
def kernel(a, b, c):
    """Elementwise device addition: c[i] = a[i] + b[i] for every valid i."""
    idx = cuda.grid(1)
    # Guard against the trailing threads of the last block.
    if idx < len(a):
        c[idx] = a[idx] + b[idx]
def get_numba_stream(launch: bench.Launch):
    """Wrap the nvbench launch's CUDA stream as a Numba external stream."""
    handle = launch.get_stream().addressof()
    return cuda.external_stream(handle)
def skipit(state: bench.State) -> None:
    # Always-skipped benchmark; exercises State.skip / skip-reason reporting.
    state.skip("Skipping this benchmark for no reason")
def add_two(state: bench.State):
    """Benchmark a + a on an `elements`-sized float64 array (one int64 axis)."""
    n = state.get_int64("elements")
    a = cuda.to_device(np.random.random(n))
    c = cuda.device_array_like(a)
    # Sanity-check axis introspection from inside a benchmark body.
    assert "elements" in state.get_axis_values()
    assert "elements=" in state.get_axis_values_as_string()
    state.add_global_memory_reads(a.nbytes)
    state.add_global_memory_writes(c.nbytes)
    threads = 256
    blocks = (len(a) + threads - 1) // threads
    # First call locks, can't use async benchmarks until sync tag is supported
    kernel[blocks, threads](a, a, c)
    cuda.synchronize()

    def kernel_launcher(launch):
        stream = get_numba_stream(launch)
        kernel[blocks, threads, stream](a, a, c)

    state.exec(kernel_launcher, batched=True, sync=True)
def add_float(state: bench.State):
    """Benchmark float32 a + b over a 3-D sweep (elements x v x name)."""
    n = state.get_int64("elements")
    v = state.get_float64("v")
    name = state.get_string("name")
    a = cuda.to_device(np.random.random(n).astype(np.float32))
    b = cuda.to_device(np.random.random(n).astype(np.float32))
    c = cuda.device_array_like(a)
    state.add_global_memory_reads(a.nbytes + b.nbytes)
    state.add_global_memory_writes(c.nbytes)
    threads = 64
    blocks = (len(a) + threads - 1) // threads
    # Axis values reported by the state must agree with the per-axis getters.
    values = state.get_axis_values()
    for key in ("elements", "v", "name"):
        assert key in values
    assert values["elements"] == n
    assert values["v"] == v
    assert values["name"] == name

    def kernel_launcher(launch):
        _ = v
        _ = name
        stream = get_numba_stream(launch)
        kernel[blocks, threads, stream](a, b, c)

    state.exec(kernel_launcher, batched=True, sync=True)
def add_three(state: bench.State):
    """Benchmark float32 a + b over a power-of-two `elements` axis."""
    n = state.get_int64("elements")
    a = cuda.to_device(np.random.random(n).astype(np.float32))
    b = cuda.to_device(np.random.random(n).astype(np.float32))
    c = cuda.device_array_like(a)
    state.add_global_memory_reads(a.nbytes + b.nbytes)
    state.add_global_memory_writes(c.nbytes)
    threads = 256
    blocks = (len(a) + threads - 1) // threads

    def kernel_launcher(launch):
        stream = get_numba_stream(launch)
        kernel[blocks, threads, stream](a, b, c)

    state.exec(kernel_launcher, batched=True, sync=True)
    cuda.synchronize()
def register_benchmarks():
    """Register every benchmark in this module and configure its axes."""
    two = bench.register(add_two)
    two.add_int64_axis("elements", [2**pow2 - 1 for pow2 in range(20, 23)])

    flt = bench.register(add_float)
    flt.add_float64_axis("v", [0.1, 0.3])
    flt.add_string_axis("name", ["Anne", "Lynda"])
    flt.add_int64_power_of_two_axis("elements", range(20, 23))

    three = bench.register(add_three)
    three.add_int64_power_of_two_axis("elements", range(20, 22))

    bench.register(skipit)
if __name__ == "__main__":
    # Register everything, then hand control (and the CLI args) to nvbench.
    register_benchmarks()
    bench.run_all_benchmarks(sys.argv)

236
python/test/stubs.py Normal file
View File

@@ -0,0 +1,236 @@
# Copyright 2025 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 with the LLVM exception
# (the "License"); you may not use this file except in compliance with
# the License.
#
# You may obtain a copy of the License at
#
# http://llvm.org/foundation/relicensing/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ctypes
import sys
from typing import Dict, Optional, Tuple
import cuda.bench as bench
import cuda.cccl.headers as headers
import cuda.core.experimental as core
def as_core_Stream(cs: bench.CudaStream) -> core.Stream:
    """Adapt an nvbench CUDA stream into a cuda.core Stream (same handle)."""
    handle = cs.addressof()
    return core.Stream.from_handle(handle)
def make_sleep_kernel():
    """JITs sleep_kernel(seconds): a kernel whose threads busy-wait for
    `seconds` using libcudacxx's device-side chrono clock. Returns the
    compiled kernel object."""
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/chrono>

// Each launched thread just sleeps for `seconds`.
__global__ void sleep_kernel(double seconds) {
  namespace chrono = ::cuda::std::chrono;
  using hr_clock = chrono::high_resolution_clock;

  auto duration = static_cast<cuda::std::int64_t>(seconds * 1e9);
  const auto ns = chrono::nanoseconds(duration);
  const auto start = hr_clock::now();
  const auto finish = start + ns;

  auto now = hr_clock::now();
  while (now < finish)
  {
    now = hr_clock::now();
  }
}
"""
    # Compile against the bundled libcudacxx headers from cuda-cccl.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    mod = prog.compile("cubin", name_expressions=("sleep_kernel",))
    return mod.get_kernel("sleep_kernel")
def no_axes(state: bench.State):
    """Benchmark with no axes: launch a ~1 ms device-side sleep kernel."""
    state.set_min_samples(1000)
    duration_s = 1e-3
    krn = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)
    print(f"Stopping criterion used: {state.get_stopping_criterion()}")

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, duration_s)

    state.exec(launcher)
def tags(state: bench.State):
    """Exercise exec's sync/batched tags, driven by two int64 axes."""
    state.set_min_samples(1000)
    duration_s = 1e-3
    krn = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)
    do_sync = bool(state.get_int64("Sync"))
    do_batch = bool(state.get_int64("Batched"))

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, duration_s)

    state.exec(launcher, sync=do_sync, batched=do_batch)
def single_float64_axis(state: bench.State):
    """Sleep for `Duration` seconds per launch, with a fallback default."""
    fallback = 3.14e-4
    duration_s = state.get_float64_or_default("Duration", fallback)
    krn = make_sleep_kernel()
    cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, duration_s)

    state.exec(launcher)
def default_value(state: bench.State):
    # Registered without a "Duration" axis, so the default-value path of
    # get_float64_or_default is exercised.
    single_float64_axis(state)
def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None):
    """JIT-compile copy_kernel<T, U>, copying/converting n values in -> out.

    in_type / out_type are CUDA C++ type names; both default to
    ::cuda::std::int32_t when omitted. Returns the compiled kernel for the
    requested instantiation."""
    src = r"""
#include <cuda/std/cstdint>
#include <cuda/std/cstddef>

/*!
 * Naive copy of `n` values from `in` -> `out`.
 */
template <typename T, typename U>
__global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n)
{
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;

  for (auto i = init; i < n; i += step)
  {
    out[i] = static_cast<U>(in[i]);
  }
}
"""
    # Compile against the bundled libcudacxx headers from cuda-cccl.
    incl = headers.get_include_paths()
    opts = core.ProgramOptions(include_path=str(incl.libcudacxx))
    prog = core.Program(src, code_type="c++", options=opts)
    if in_type is None:
        in_type = "::cuda::std::int32_t"
    if out_type is None:
        out_type = "::cuda::std::int32_t"
    # The explicit instantiation must be named so it can be looked up by name.
    instance_name = f"copy_kernel<{in_type}, {out_type}>"
    mod = prog.compile("cubin", name_expressions=(instance_name,))
    return mod.get_kernel(instance_name)
def copy_sweep_grid_shape(state: bench.State):
    """Copy 256 MiB of int32 values, sweeping block size and grid size."""
    block_size = state.get_int64("BlockSize")
    num_blocks = state.get_int64("NumBlocks")
    # Number of int32 elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0))
    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)
    dev_id = state.get_device()
    alloc_stream = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_stream)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_stream)
    krn = make_copy_kernel()
    cfg = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
def copy_type_sweep(state: bench.State):
    """Copy 256 MiB using several element types, selected by the TypeID axis."""
    type_id = state.get_int64("TypeID")
    # TypeID -> (host ctypes type, matching CUDA C++ type name)
    types_map: Dict[int, Tuple[type, str]] = {
        0: (ctypes.c_uint8, "cuda::std::uint8_t"),
        1: (ctypes.c_uint16, "cuda::std::uint16_t"),
        2: (ctypes.c_uint32, "cuda::std::uint32_t"),
        3: (ctypes.c_uint64, "cuda::std::uint64_t"),
        4: (ctypes.c_float, "float"),
        5: (ctypes.c_double, "double"),
    }
    ctype, cuda_type_name = types_map[type_id]
    state.add_summary("Type", cuda_type_name)
    # Number of elements in 256MiB
    nbytes = 256 * 1024 * 1024
    num_values = nbytes // ctypes.sizeof(ctype)
    state.add_element_count(num_values)
    state.add_global_memory_reads(nbytes)
    state.add_global_memory_writes(nbytes)
    dev_id = state.get_device()
    alloc_stream = as_core_Stream(state.get_stream())
    input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_stream)
    output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_stream)
    krn = make_copy_kernel(cuda_type_name, cuda_type_name)
    cfg = core.LaunchConfig(grid=256, block=256, shmem_size=0)

    def launcher(launch: bench.Launch):
        stream = as_core_Stream(launch.get_stream())
        core.launch(stream, cfg, krn, input_buf, output_buf, num_values)

    state.exec(launcher)
if __name__ == "__main__":
    # Benchmark without axes
    simple_b = bench.register(no_axes)
    simple_b.set_stopping_criterion("entropy")
    simple_b.set_criterion_param_int64("unused_int", 100)
    # 2x2 Sync/Batched tag matrix driven by int64 axes.
    tags_b = bench.register(tags)
    tags_b.add_int64_axis("Sync", [0, 1])
    tags_b.add_int64_axis("Batched", [0, 1])
    # benchmark with no axes, that uses default value
    default_b = bench.register(default_value)
    default_b.set_min_samples(7)
    # specify axis
    axes_b = bench.register(single_float64_axis).add_float64_axis(
        "Duration", [7e-5, 1e-4, 5e-4]
    )
    axes_b.set_timeout(20)
    axes_b.set_skip_time(1e-5)
    axes_b.set_throttle_threshold(0.2)
    axes_b.set_throttle_recovery_delay(0.1)
    # Grid-shape and element-type sweeps for the copy kernel.
    copy1_bench = bench.register(copy_sweep_grid_shape)
    copy1_bench.add_int64_power_of_two_axis("BlockSize", range(6, 10, 2))
    copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)])
    copy2_bench = bench.register(copy_type_sweep)
    copy2_bench.add_int64_axis("TypeID", range(0, 6))
    bench.run_all_benchmarks(sys.argv)

View File

@@ -0,0 +1,39 @@
import json
import cuda.bench as bench
import pytest
def test_cpp_exception():
    # A C++ nvbench_run_error must surface in Python as RuntimeError
    # (NVBenchRuntimeError derives from it).
    with pytest.raises(RuntimeError, match="Test"):
        bench._nvbench.test_cpp_exception()
def test_py_exception():
    # The error set via py::set_error must arrive as the bound
    # NVBenchRuntimeError type.
    with pytest.raises(bench.NVBenchRuntimeError, match="Test"):
        bench._nvbench.test_py_exception()
@pytest.mark.parametrize(
    "cls", [bench.CudaStream, bench.State, bench.Launch, bench.Benchmark]
)
def test_api_ctor(cls):
    # None of the bound classes expose a Python constructor; instances are
    # only handed out by the C++ side.
    with pytest.raises(TypeError, match="No constructor defined!"):
        cls()
def t_bench(state: bench.State):
    """CPU-only benchmark body: repeatedly JSON-serialize a small dict."""
    payload = {"a": 1, "b": 0.5, "c": "test", "d": {"a": 1}}

    def launcher(launch: bench.Launch):
        for _ in range(10000):
            _ = json.dumps(payload)

    state.exec(launcher)
def test_cpu_only():
    # Register a CPU-only benchmark and run it quietly with profiling enabled.
    b = bench.register(t_bench)
    b.set_is_cpu_only(True)
    bench.run_all_benchmarks(["-q", "--profile"])