From 6552ef503c63db1b2110fc69ad7bbc6874ea96d4 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 30 Jun 2025 14:30:15 -0500 Subject: [PATCH 01/78] Draft of Python API for NVBench The prototype is based on pybind11 to minimize boiler-plate code needed to deal with move-only semantics of many nvbench classes. --- python/.gitignore | 4 + python/CMakeLists.txt | 29 ++ python/README.md | 21 ++ python/cuda/nvbench/__init__.py | 18 ++ python/pyproject.toml | 58 ++++ python/src/README.md | 17 ++ python/src/py_nvbench.cpp | 451 ++++++++++++++++++++++++++++++++ python/test/run_1.py | 108 ++++++++ 8 files changed, 706 insertions(+) create mode 100644 python/.gitignore create mode 100644 python/CMakeLists.txt create mode 100644 python/README.md create mode 100644 python/cuda/nvbench/__init__.py create mode 100644 python/pyproject.toml create mode 100644 python/src/README.md create mode 100644 python/src/py_nvbench.cpp create mode 100755 python/test/run_1.py diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 0000000..56f16ab --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,4 @@ +build +nvbench_build +nvbench_install +__pycache__ diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt new file mode 100644 index 0000000..b61da52 --- /dev/null +++ b/python/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.30...4.0) + +project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +find_package(Python REQUIRED COMPONENTS Development.Module) +find_package(CUDAToolkit REQUIRED) + +include(FetchContent) + +FetchContent_Declare( + pybind11 + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.tar.gz + URL_HASH SHA256=e08cb87f4773da97fa7b5f035de8763abc656d87d5773e62f6da0587d1f0ec20 + FIND_PACKAGE_ARGS NAMES pybind11 +) +FetchContent_MakeAvailable(pybind11) + +find_package(nvbench CONFIG REQUIRED) + 
+pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp) +target_link_libraries(_nvbench PUBLIC nvbench::nvbench) +target_link_libraries(_nvbench PRIVATE CUDA::cudart_static) +set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") + +install(TARGETS _nvbench DESTINATION cuda/nvbench) +install(IMPORTED_RUNTIME_ARTIFACTS nvbench::nvbench DESTINATION cuda/nvbench) diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..dcbf72b --- /dev/null +++ b/python/README.md @@ -0,0 +1,21 @@ +# CUDA Kernel Benchmarking Package + +This package provides Python API to CUDA Kernel Benchmarking Library `NVBench`. + +## Building + +### Build `NVBench` project + +``` +cd nvbench/python +cmake -B nvbench_build --preset nvbench-ci -S $(pwd)/.. -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DNVBench_ENABLE_EXAMPLES=OFF -DCMAKE_INSTALL_PREFIX=$(pwd)/nvbench_install +cmake --build nvbench_build/ --config Release --target install + +nvbench_DIR=$(pwd)/nvbench_install/lib/cmake CUDACXX=/usr/local/cuda/bin/nvcc pip install -e . 
+``` + +### Verify that package works + +``` +python test/run_1.py +``` diff --git a/python/cuda/nvbench/__init__.py b/python/cuda/nvbench/__init__.py new file mode 100644 index 0000000..47ba48c --- /dev/null +++ b/python/cuda/nvbench/__init__.py @@ -0,0 +1,18 @@ +import importlib.metadata + +from cuda.bindings.path_finder import ( # type: ignore[import-not-found] + _load_nvidia_dynamic_library, +) + +try: + __version__ = importlib.metadata.version("pynvbench") +except Exception: + __version__ = "0.0.0dev" + +for libname in ("cupti", "nvperf_target", "nvperf_host"): + _load_nvidia_dynamic_library(libname) + +from ._nvbench import * # noqa: E402, F403 +from ._nvbench import register, run_all_benchmarks # noqa: E402 + +__all__ = ["register", "run_all_benchmarks"] diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000..b371019 --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,58 @@ +[build-system] +requires = ["scikit-build-core>=0.10", "setuptools_scm"] +build-backend = "scikit_build_core.build" + +[project] +name = "pynvbench" +description = "CUDA Kernel Benchmarking Package" +authors = [{ name = "NVIDIA Corporation" }] +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Environment :: GPU :: NVIDIA CUDA", + "License :: OSI Approved :: Apache Software License", +] +requires-python = ">=3.9" +dependencies = [ + # pathfinder + "cuda-bindings", + + # Library expects to find shared libraries + # libcupti, libnvperf_target, libnvperf_host + # pathfinder is used to find it in the Python layout + "nvidia-cuda-cupti-cu12", + + # The shared library + # libnvidia-ml must be installed system-wide + # (Debian package provider: libnvidia-compute) +] +dynamic = ["version"] +readme = { file = "README.md", content-type = "text/markdown" } + +[project.optional-dependencies] +test = ["pytest", "cupy-cuda12x", "numba"] + +[project.urls] +Homepage = "https://developer.nvidia.com/" + +[tool.scikit-build] +minimum-version = 
"build-system.requires" +build-dir = "build/{wheel_tag}" + +[tool.scikit-build.cmake] +version = ">=3.30.4" +args = [] +build-type = "Release" +source-dir = "." + +[tool.scikit-build.ninja] +version = ">=1.11" +make-fallback = true + +[tool.scikit-build.metadata.version] +provider = "scikit_build_core.metadata.setuptools_scm" + +[tool.setuptools_scm] +root = ".." + +[tool.scikit-build.wheel.packages] +"cuda/nvbench" = "cuda/nvbench" diff --git a/python/src/README.md b/python/src/README.md new file mode 100644 index 0000000..af4f613 --- /dev/null +++ b/python/src/README.md @@ -0,0 +1,17 @@ + +``` +g++ py_nvbench.cpp \ + -shared -fPIC \ + -I ${HOME}/repos/pybind11/include \ + -I ${HOME}/repos/pynvbench/nvbench_dir/include \ + -I /usr/local/cuda/include \ + $(python3-config --includes) \ + $(python3-config --libs) \ + -L ${HOME}/repos/pynvbench/nvbench_dir/lib/ \ + -lnvbench \ + -Wl,-rpath,${HOME}/repos/pynvbench/nvbench_dir/lib \ + -L /usr/local/cuda/lib64/ \ + -lcudart \ + -Wl,-rpath,/usr/local/cuda/lib64 \ + -o _nvbench$(python3-config --extension-suffix) +``` diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp new file mode 100644 index 0000000..9ca7e64 --- /dev/null +++ b/python/src/py_nvbench.cpp @@ -0,0 +1,451 @@ +// TODO: Copyright header + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace py = pybind11; + +namespace +{ + +inline void set_env(const char *name, const char *value) +{ +#ifdef _MSC_VER + _putenv_s(name, value); +#else + setenv(name, value, 1); +#endif +} + +struct PyObjectDeleter +{ + void operator()(py::object *p) + { + const bool initialized = Py_IsInitialized(); + +#if PY_VERSION_HEX < 0x30d0000 + const bool finalizing = _Py_IsFinalizing(); +#else + const bool finalizing = Py_IsFinalizing(); +#endif + const bool guard = initialized && !finalizing; + + // deleter only call ~object if interpreter is active and + // not shutting down, let OS clean up 
resources after + // interpreter tear-down + if (guard) + { + delete p; + } + } +}; + +struct benchmark_wrapper_t +{ + + benchmark_wrapper_t() + : m_fn() {}; + explicit benchmark_wrapper_t(py::object o) + : m_fn{std::shared_ptr(new py::object(o), PyObjectDeleter{})} + {} + + benchmark_wrapper_t(const benchmark_wrapper_t &other) + : m_fn{other.m_fn} + {} + benchmark_wrapper_t &operator=(const benchmark_wrapper_t &other) = delete; + benchmark_wrapper_t(benchmark_wrapper_t &&) noexcept = delete; + benchmark_wrapper_t &operator=(benchmark_wrapper_t &&) noexcept = delete; + + void operator()(nvbench::state &state, nvbench::type_list<>) + { + // box as Python object, using reference semantics + auto arg = py::cast(std::ref(state), py::return_value_policy::reference); + + // Execute Python callable + (*m_fn)(arg); + } + +private: + // Important to use shared pointer here rather than py::object directly, + // since copy constructor must be const (benchmark::do_clone is const member method) + std::shared_ptr m_fn; +}; + +class GlobalBenchmarkRegistry +{ + bool m_finalized; + +public: + GlobalBenchmarkRegistry() + : m_finalized(false) {}; + + GlobalBenchmarkRegistry(const GlobalBenchmarkRegistry &) = delete; + GlobalBenchmarkRegistry &operator=(const GlobalBenchmarkRegistry &) = delete; + + GlobalBenchmarkRegistry(GlobalBenchmarkRegistry &&) = delete; + GlobalBenchmarkRegistry &operator=(GlobalBenchmarkRegistry &&) = delete; + + bool is_finalized() const { return m_finalized; } + + nvbench::benchmark_base &add_bench(py::object fn) + { + if (m_finalized) + { + throw std::runtime_error("Can not register more benchmarks after benchmark was run"); + } + if (!PyCallable_Check(fn.ptr())) + { + throw py::value_error("Benchmark should be a callable object"); + } + std::string name; + if (py::hasattr(fn, "__name__")) + { + py::str py_name = fn.attr("__name__"); + name = py::cast(py_name); + } + else + { + py::str py_name = py::repr(fn); + name = py::cast(py_name); + } + 
benchmark_wrapper_t executor(fn); + + return nvbench::benchmark_manager::get() + .add(std::make_unique>(executor)) + .set_name(std::move(name)); + } + + void run(const std::vector &argv) + { + if (nvbench::benchmark_manager::get().get_benchmarks().empty()) + { + throw std::runtime_error("No benchmarks had been registered yet"); + } + if (m_finalized) + { + throw std::runtime_error("Benchmarks were already executed"); + } + m_finalized = true; + + try + { + // TODO: This line is mandatory for correctness + // Q: Why is initializing at module init not enough? + nvbench::benchmark_manager::get().initialize(); + { + nvbench::option_parser parser{}; + parser.parse(argv); + + NVBENCH_MAIN_PRINT_PREAMBLE(parser); + NVBENCH_MAIN_RUN_BENCHMARKS(parser); + NVBENCH_MAIN_PRINT_EPILOGUE(parser); + + NVBENCH_MAIN_PRINT_RESULTS(parser); + } /* Tear down parser before finalization */ + } + catch (const std::exception &e) + { + std::stringstream ss; + ss << "Caught exception while running benchmakrs: "; + ss << e.what(); + ss << "\n"; + py::print(py::cast(ss.str(), py::return_value_policy::move)); + } + catch (...) 
+ { + py::print("Caught exception in nvbench_main\n"); + } + } +}; + +// essentially a global variable, but allocated on the heap during module initialization +constinit std::unique_ptr global_registry{}; + +} // end of anonymous namespace + +PYBIND11_MODULE(_nvbench, m) +{ + // == STEP 1 + // Set environment variable CUDA_MODULE_LOADING=EAGER + + // See NVIDIA/NVBench#136 for CUDA_MODULE_LOADING + set_env("CUDA_MODULE_LOADING", "EAGER"); + + NVBENCH_DRIVER_API_CALL(cuInit(0)); + + nvbench::benchmark_manager::get().initialize(); + + // TODO: Use cuModuleGetLoadingMode(&mode) to confirm that (mode == CU_MODULE_EAGER_LOADING) + // and issue warning otherwise + + // == STEP 2 + // Define CudaStream class + // ATTN: nvbench::cuda_stream is move-only class + // Methods: + // Constructors, based on device, or on existing stream + // nvbench::cuda_stream::get_stream + + auto py_cuda_stream_cls = py::class_(m, "CudaStream"); + + py_cuda_stream_cls.def("__cuda_stream__", + [](const nvbench::cuda_stream &s) -> std::pair { + return std::make_pair(std::size_t{0}, + reinterpret_cast(s.get_stream())); + }); + py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t { + return reinterpret_cast(s.get_stream()); + }); + + // == STEP 3 + // Define Launch class + // ATTN: nvbench::launch is move-only class + // Methods: + // nvbench::launch::get_stream -> nvbench::cuda_stream + + auto py_launch_cls = py::class_(m, "Launch"); + + py_launch_cls.def( + "getStream", + [](nvbench::launch &launch) { return std::ref(launch.get_stream()); }, + py::return_value_policy::reference); + + // == STEP 4 + // Define Benchmark class + + auto py_benchmark_cls = py::class_(m, "Benchmark"); + py_benchmark_cls.def("getName", &nvbench::benchmark_base::get_name); + py_benchmark_cls.def( + "addInt64Axis", + [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { + self.add_int64_axis(name, data); + return std::ref(self); + }, + 
py::return_value_policy::reference); + py_benchmark_cls.def( + "addFloat64Axis", + [](nvbench::benchmark_base &self, + std::string name, + const std::vector &data) { + self.add_float64_axis(name, data); + return std::ref(self); + }, + py::return_value_policy::reference); + py_benchmark_cls.def( + "addStringAxis", + [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { + self.add_string_axis(name, data); + return std::ref(self); + }, + py::return_value_policy::reference); + + // == STEP 5 + // Define PyState class + // ATTN: nvbench::state is move-only class + // Methods: + // nvbench::state::get_cuda_stream + // nvbench::state::get_cuda_stream_optional + // nvbench::state::set_cuda_stream + // nvbench::state::get_device + // nvbench::state::get_is_cpu_only + // nvbench::state::get_type_config_index + // nvbench::state::get_int64 + // nvbench::state::get_int64_or_default + // nvbench::state::get_float64 + // nvbench::state::get_float64_or_default + // nvbench::state::get_string + // nvbench::state::get_string_or_default + // nvbench::state::add_element_count + // nvbench::state::set_element_count + // nvbench::state::get_element_count + // nvbench::state::add_global_memory_reads + // nvbench::state::add_global_memory_writes + // nvbench::state::add_buffer_size + // nvbench::state::set_global_memory_rw_bytes + // nvbench::state::get_global_memory_rw_bytes + // nvbench::state::skip + // nvbench::state::is_skipped + // nvbench::state::get_skip_reason + // nvbench::state::get_min_samples + // nvbench::state::set_min_samples + // nvbench::state::get_criterion_params + // nvbench::state::get_stopping_criterion + // nvbench::state::get_run_once + // nvbench::state::set_run_once + // nvbench::state::get_disable_blocking_kernel + // nvbench::state::set_disable_blocking_kernel + // nvbench::state::set_skip_time + // nvbench::state::get_skip_time + // nvbench::state::set_timeout + // nvbench::state::get_timeout + // 
nvbench::state::set_throttle_threshold + // nvbench::state::get_throttle_threshold + // nvbench::state::set_throttle_recovery_delay + // nvbench::state::get_throttle_recovery_delay + // nvbench::state::get_blocking_kernel_timeout + // nvbench::state::set_blocking_kernel_timeout + // nvbench::state::get_axis_values + // nvbench::state::get_axis_values_as_string + // nvbench::state::get_benchmark + // nvbench::state::collect_l1_hit_rates + // nvbench::state::collect_l2_hit_rates + // nvbench::state::collect_stores_efficiency + // nvbench::state::collect_loads_efficiency + // nvbench::state::collect_dram_throughput + // nvbench::state::collect_cupti_metrics + // nvbench::state::is_l1_hit_rate_collected + // nvbench::state::is_l2_hit_rate_collected + // nvbench::state::is_stores_efficiency_collected + // nvbench::state::is_loads_efficiency_collected + // nvbench::state::is_dram_throughput_collected + // nvbench::state::is_cupti_required + // nvbench::state::add_summary + // nvbench::state::get_summary + // nvbench::state::get_summaries + // nvbench::state::get_short_description + // nvbench::state::exec + // NOTE: + // PyState wraps std::reference_wrapper + + using state_ref_t = std::reference_wrapper; + auto pystate_cls = py::class_(m, "State"); + + pystate_cls.def("hasDevice", [](nvbench::state &state) -> bool { + return static_cast(state.get_device()); + }); + pystate_cls.def("hasPrinters", [](nvbench::state &state) -> bool { + return state.get_benchmark().get_printer().has_value(); + }); + + pystate_cls.def("getStream", &nvbench::state::get_cuda_stream); + + pystate_cls.def("getInt64", &nvbench::state::get_int64); + pystate_cls.def("getInt64", &nvbench::state::get_int64_or_default); + + pystate_cls.def("getFloat64", &nvbench::state::get_float64); + pystate_cls.def("getFloat64", &nvbench::state::get_float64_or_default); + + pystate_cls.def("getString", &nvbench::state::get_string); + pystate_cls.def("getString", &nvbench::state::get_string_or_default); + + 
pystate_cls.def("addElementCount", &nvbench::state::add_element_count); + pystate_cls.def("setElementCount", &nvbench::state::set_element_count); + pystate_cls.def("getElementCount", &nvbench::state::get_element_count); + + pystate_cls.def("skip", &nvbench::state::skip); + pystate_cls.def("isSkipped", &nvbench::state::is_skipped); + pystate_cls.def("getSkipReason", &nvbench::state::get_skip_reason); + + pystate_cls.def( + "addGlobalMemoryReads", + [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void { + state.add_global_memory_reads(nbytes, column_name); + }, + "Add size, in bytes, of global memory reads", + py::arg("nbytes"), + py::pos_only{}, + py::arg("column_name") = py::str("")); + pystate_cls.def( + "addGlobalMemoryWrites", + [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void { + state.add_global_memory_writes(nbytes, column_name); + }, + "Add size, in bytes, of global memory writes", + py::arg("nbytes"), + py::pos_only{}, + py::arg("column_name") = py::str("")); + pystate_cls.def( + "getBenchmark", + [](nvbench::state &state) { return std::ref(state.get_benchmark()); }, + py::return_value_policy::reference); + pystate_cls.def("getThrottleThreshold", &nvbench::state::get_throttle_threshold); + + pystate_cls.def("getMinSamples", &nvbench::state::get_min_samples); + pystate_cls.def("setMinSamples", &nvbench::state::set_min_samples); + + pystate_cls.def("getDisableBlockingKernel", &nvbench::state::get_disable_blocking_kernel); + pystate_cls.def("setDisableBlockingKernel", &nvbench::state::set_disable_blocking_kernel); + + pystate_cls.def("getRunOnce", &nvbench::state::get_run_once); + pystate_cls.def("setRunOnce", &nvbench::state::set_run_once); + + pystate_cls.def("getTimeout", &nvbench::state::get_timeout); + pystate_cls.def("setTimeout", &nvbench::state::set_timeout); + + pystate_cls.def("getBlockingKernel", &nvbench::state::get_blocking_kernel_timeout); + pystate_cls.def("setBlockingKernel", 
&nvbench::state::set_blocking_kernel_timeout); + + pystate_cls.def("collectCUPTIMetrics", &nvbench::state::collect_cupti_metrics); + pystate_cls.def("isCUPTIRequired", &nvbench::state::is_cupti_required); + + pystate_cls.def( + "exec", + [](nvbench::state &state, py::object fn, bool batched, bool sync) { + auto launcher_fn = [fn](nvbench::launch &launch_descr) -> void { + fn(py::cast(std::ref(launch_descr), py::return_value_policy::reference)); + }; + + if (sync) + { + if (batched) + { + state.exec(nvbench::exec_tag::sync, launcher_fn); + } + else + { + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::no_batch, launcher_fn); + } + } + else + { + if (batched) + { + state.exec(nvbench::exec_tag::none, launcher_fn); + } + else + { + state.exec(nvbench::exec_tag::no_batch, launcher_fn); + } + } + }, + "Executor for given callable fn(state : Launch)", + py::arg("fn"), + py::pos_only{}, + py::arg("batched") = true, + py::arg("sync") = false); + + // == STEP 6 + // ATTN: nvbench::benchmark_manager is a singleton + + global_registry = + std::unique_ptr(new GlobalBenchmarkRegistry(), + py::nodelete{}); + + m.def( + "register", + [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); }, + py::return_value_policy::reference); + + m.def( + "run_all_benchmarks", + [&](py::object argv) -> void { + if (!py::isinstance(argv)) + { + throw py::type_error("run_all_benchmarks expects a list of command-line arguments"); + } + std::vector args = py::cast>(argv); + global_registry->run(args); + }, + "Run all benchmarks", + py::arg("argv") = py::list()); +} diff --git a/python/test/run_1.py b/python/test/run_1.py new file mode 100755 index 0000000..7c8b6b8 --- /dev/null +++ b/python/test/run_1.py @@ -0,0 +1,108 @@ +import sys + +import cuda.nvbench as nvbench +import numpy as np +from numba import cuda + + +@cuda.jit() +def kernel(a, b, c): + tid = cuda.grid(1) + size = len(a) + + if tid < size: + c[tid] = a[tid] + b[tid] + + +def getNumbaStream(launch): + return 
cuda.external_stream(launch.getStream().addressof()) + + +def add_two(state): + # state.skip("Skipping this benchmark for no reason") + N = state.getInt64("elements") + a = cuda.to_device(np.random.random(N)) + c = cuda.device_array_like(a) + + state.addGlobalMemoryReads(a.nbytes) + state.addGlobalMemoryWrites(c.nbytes) + + nthreads = 256 + nblocks = (len(a) + nthreads - 1) // nthreads + + # First call locks, can't use async benchmarks until sync tag is supported + kernel[nblocks, nthreads](a, a, c) + cuda.synchronize() + + def kernel_launcher(launch): + stream = getNumbaStream(launch) + kernel[nblocks, nthreads, stream](a, a, c) + + state.exec(kernel_launcher, batched=True, sync=True) + + +def add_float(state): + N = state.getInt64("elements") + v = state.getFloat64("v") + name = state.getString("name") + a = cuda.to_device(np.random.random(N).astype(np.float32)) + b = cuda.to_device(np.random.random(N).astype(np.float32)) + c = cuda.device_array_like(a) + + state.addGlobalMemoryReads(a.nbytes + b.nbytes) + state.addGlobalMemoryWrites(c.nbytes) + + nthreads = 64 + nblocks = (len(a) + nthreads - 1) // nthreads + + def kernel_launcher(launch): + _ = v + _ = name + stream = getNumbaStream(launch) + kernel[nblocks, nthreads, stream](a, b, c) + + state.exec(kernel_launcher, batched=True, sync=True) + + +def add_three(state): + N = state.getInt64("elements") + a = cuda.to_device(np.random.random(N).astype(np.float32)) + b = cuda.to_device(np.random.random(N).astype(np.float32)) + c = cuda.device_array_like(a) + + state.addGlobalMemoryReads(a.nbytes + b.nbytes) + state.addGlobalMemoryWrites(c.nbytes) + + nthreads = 256 + nblocks = (len(a) + nthreads - 1) // nthreads + + def kernel_launcher(launch): + stream = getNumbaStream(launch) + kernel[nblocks, nthreads, stream](a, b, c) + + state.exec(kernel_launcher, batched=True, sync=True) + cuda.synchronize() + + +def register_benchmarks(): + ( + nvbench.register(add_two).addInt64Axis( + "elements", [2**pow2 for pow2 in 
range(20, 23)] + ) + ) + ( + nvbench.register(add_float) + .addFloat64Axis("v", [0.1, 0.3]) + .addStringAxis("name", ["Anne", "Lynda"]) + .addInt64Axis("elements", [2**pow2 for pow2 in range(20, 23)]) + ) + ( + nvbench.register(add_three).addInt64Axis( + "elements", [2**pow2 for pow2 in range(20, 22)] + ) + ) + + +if __name__ == "__main__": + register_benchmarks() + nvbench.run_all_benchmarks(sys.argv) From b88cc78aebdbd669845224091abca9fbbd4dfc33 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 1 Jul 2025 09:19:18 -0500 Subject: [PATCH 02/78] Add license header to py_nvbench.cpp Also updated comment as to why calling `nvbench::benchmark_manager::get().initialize()` is necessary for running all tests. --- python/src/py_nvbench.cpp | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 9ca7e64..c9f3824 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -1,4 +1,20 @@ -// TODO: Copyright header +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include @@ -140,8 +156,8 @@ public: try { - // TODO: This line is mandatory for correctness - // Q: Why is initializing at module init not enough? 
+ // This line is mandatory for correctness to populate + // benchmark with devices requested by user via CLI nvbench::benchmark_manager::get().initialize(); { nvbench::option_parser parser{}; @@ -184,6 +200,8 @@ PYBIND11_MODULE(_nvbench, m) NVBENCH_DRIVER_API_CALL(cuInit(0)); + // This line ensures that benchmark_manager has been created during module init + // It is reinitialized before running all benchmarks to set devices to use nvbench::benchmark_manager::get().initialize(); // TODO: Use cuModuleGetLoadingMode(&mode) to confirm that (mode == CU_MODULE_EAGER_LOADING) From c184549cda8c3e0be409312bbadf3bbb8134c4ed Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 1 Jul 2025 09:23:45 -0500 Subject: [PATCH 03/78] Import and reexport symbols from _nvbench one-by-one --- python/cuda/nvbench/__init__.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/python/cuda/nvbench/__init__.py b/python/cuda/nvbench/__init__.py index 47ba48c..7f6f6ce 100644 --- a/python/cuda/nvbench/__init__.py +++ b/python/cuda/nvbench/__init__.py @@ -12,7 +12,20 @@ except Exception: for libname in ("cupti", "nvperf_target", "nvperf_host"): _load_nvidia_dynamic_library(libname) -from ._nvbench import * # noqa: E402, F403 -from ._nvbench import register, run_all_benchmarks # noqa: E402 +from ._nvbench import ( # noqa: E402 + Benchmark, + CudaStream, + Launch, + State, + register, + run_all_benchmarks, +) -__all__ = ["register", "run_all_benchmarks"] +__all__ = [ + "register", + "run_all_benchmarks", + "CudaStream", + "Launch", + "State", + "Benchmark", +] From c49d718f65719e75423f6fe322caabefe7e25cca Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 1 Jul 2025 12:00:06 -0500 Subject: [PATCH 04/78] Corrected nvbench.State.getBlockingKernel -> getBlockingKernelTimeout Similar change for setBlockingKernelTimeout. 
Corrected statement in a comment. --- python/src/py_nvbench.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index c9f3824..ab15b5a 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -333,7 +333,7 @@ PYBIND11_MODULE(_nvbench, m) // nvbench::state::get_short_description // nvbench::state::exec // NOTE: - // PyState wraps std::reference_wrapper + // State wraps std::reference_wrapper using state_ref_t = std::reference_wrapper; auto pystate_cls = py::class_(m, "State"); @@ -400,8 +400,8 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("getTimeout", &nvbench::state::get_timeout); pystate_cls.def("setTimeout", &nvbench::state::set_timeout); - pystate_cls.def("getBlockingKernel", &nvbench::state::get_blocking_kernel_timeout); - pystate_cls.def("setBlockingKernel", &nvbench::state::set_blocking_kernel_timeout); + pystate_cls.def("getBlockingKernelTimeout", &nvbench::state::get_blocking_kernel_timeout); + pystate_cls.def("setBlockingKernelTimeout", &nvbench::state::set_blocking_kernel_timeout); pystate_cls.def("collectCUPTIMetrics", &nvbench::state::collect_cupti_metrics); pystate_cls.def("isCUPTIRequired", &nvbench::state::is_cupti_required); From e768ce28b637619ce35fa85a5cf834fefbf3c79c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 1 Jul 2025 12:02:52 -0500 Subject: [PATCH 05/78] Add Python stub file for cuda.nvbench API --- python/cuda/nvbench/__init__.pyi | 183 +++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 python/cuda/nvbench/__init__.pyi diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi new file mode 100644 index 0000000..3bc9f45 --- /dev/null +++ b/python/cuda/nvbench/__init__.pyi @@ -0,0 +1,183 @@ +from typing import Callable, Sequence, Tuple + +class CudaStream: + """Represents CUDA stream + + Note + ---- + The class is not 
directly constructible. + """ + def __cuda_stream__(self) -> Tuple[int]: + """ + Special method implement CUDA stream protocol + from `cuda.core`. Returns a pair of integers: + (protocol_version, integral_value_of_cudaStream_t pointer) + """ + ... + + def addressof(self) -> int: + "Integral value of address of driver's CUDA stream struct" + ... + +class Benchmark: + """Represents NVBench benchmark. + + Note + ---- + The class is not user-constructible. + Use `~register` function to create Benchmark and register + it with NVBench. + """ + def getName(self) -> str: + "Get benchmark name" + ... + def addInt64Axis(self, name: str, values: Sequence[int]) -> Benchmark: + "Add integral type parameter axis with given name and values to sweep over" + ... + def addFloat64Axis(self, name: str, values: Sequence[float]) -> Benchmark: + "Add floating-point type parameter axis with given name and values to sweep over" + ... + def addStringAxis(sef, name: str, values: Sequence[str]) -> Benchmark: + "Add string type parameter axis with given name and values to sweep over" + ... + +class Launch: + """Configuration object for function launch. + + Note + ---- + The class is not user-constructible. + """ + def getStream(self) -> CudaStream: + "Get CUDA stream of this configuration" + ... + +class State: + """Represent benchmark configuration state. + + Note + ---- + The class is not user-constructible. + """ + def hasDevice(self) -> bool: + "True if configuration has a device" + ... + def hasPrinters(self) -> bool: + "True if configuration has a printer" + ... + def getStream(self) -> CudaStream: + "CudaStream object from this configuration" + ... + def getInt64(self, name: str, default_value: int = None) -> int: + "Get value for given Int64 axis from this configuration" + ... + def getFloat64(self, name: str, default_value: float = None) -> float: + "Get value for given Float64 axis from this configuration" + ... 
+ def getString(self, name: str, default_value: str = None) -> str: + "Get value for given String axis from this configuration" + ... + def addElementCount(self, count: int, column_name: str = None) -> None: + "Add element count" + ... + def setElementCount(self, count: int) -> None: + "Set element count" + ... + def getElementCount(self) -> int: + "Get element count" + ... + def skip(self, reason: str) -> None: + "Skip this configuration" + ... + def isSkipped(self) -> bool: + "Has this configuration been skipped" + ... + def getSkipReason(self) -> str: + "Get reason provided for skipping this configuration" + ... + def addGlobalMemoryReads(self, nbytes: int) -> None: + "Inform NVBench that given amount of bytes is being read by the benchmark from global memory" + ... + def addGlobalMemoryWrites(self, nbytes: int) -> None: + "Inform NVBench that given amount of bytes is being written by the benchmark into global memory" + ... + def getBenchmark(self) -> Benchmark: + "Get Benchmark this configuration is a part of" + ... + def getThrottleThreshold(self) -> float: + "Get throttle threshold value" + ... + def getMinSamples(self) -> int: + "Get the number of benchmark timings NVBench performs before stopping criterion begins being used" + ... + def setMinSamples(self, count: int) -> None: + "Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used" + ... + def getDisableBlockingKernel(self) -> bool: + "True if use of blocking kernel by NVBench is disabled, False otherwise" + ... + def setDisableBlockingKernel(self, flag: bool) -> None: + "Use flag = True to disable use of blocking kernel by NVBench" + ... + def getRunOnce(self) -> bool: + "Boolean flag whether configuration should only run once" + ... + + def setRunOnce(self, flag: bool) -> None: + "Set run-once flag for this configuration" + ... + def getTimeout(self) -> float: + "Get time-out value for benchmark execution of this configuration" + ... 
+ def setTimeout(self, duration: float) -> None: + "Set time-out value for benchmark execution of this configuration" + ... + def getBlockingKernelTimeout(self) -> float: + "Get time-out value for execution of blocking kernel" + ... + def setBlockingKernelTimeout(self, duration: float) -> None: + "Set time-out value for execution of blocking kernel" + ... + def collectCUPTIMetrics(self) -> None: + "Request NVBench to record CUPTI metrics while running benchmark for this configuration" + ... + def isCUPTIRequired(self) -> bool: + "True if (some) CUPTI metrics are being collected" + ... + def exec( + self, fn: Callable[[Launch], None], batched: bool = True, sync: bool = False + ): + """Execute callable running the benchmark. + + The callable may be executed multiple times. + + Parameters + ---------- + fn: Callable + Python callable with signature fn(Launch) -> None that executes the benchmark. + batched: bool, optional + If `True`, no cache flushing is performed between callable invocations. + Default: `True`. + sync: bool, optional + True value indicates that callable performs device synchronization. + NVBench disables use of blocking kernel in this case. + Default: `False`. + """ + ... + +def register(fn: Callable[[State], None]) -> Benchmark: + """ + Register bencharking function with NVBench. + """ + ... + +def run_all_benchmarks(argv: Sequence[str]) -> None: + """ + Run all benchmarks registered with NVBench. + + Parameters + ---------- + argv: List[str] + Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`. + """ + ... From 6f8bcdc774e1cb1460bed78752f2b2adf8ff2b50 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:53:54 -0500 Subject: [PATCH 06/78] Fixed correctness of nvbench.State.getStream() method Fix run-time exception: ``` Fail: Unexpected error: RuntimeError: return_value_policy = copy, but type is non-copyable! 
(#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in debug mode for details) ``` caused by attempt to returning move-only `nvbench::cuda_stream` class instance using default `pybind11::return_value_policy::copy`. --- python/src/py_nvbench.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index ab15b5a..c940587 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -345,7 +345,10 @@ PYBIND11_MODULE(_nvbench, m) return state.get_benchmark().get_printer().has_value(); }); - pystate_cls.def("getStream", &nvbench::state::get_cuda_stream); + pystate_cls.def( + "getStream", + [](nvbench::state &state) { return std::ref(state.get_cuda_stream()); }, + py::return_value_policy::reference); pystate_cls.def("getInt64", &nvbench::state::get_int64); pystate_cls.def("getInt64", &nvbench::state::get_int64_or_default); From c9f0785aedd16b41ff32a6d683808cbbd603737c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:57:27 -0500 Subject: [PATCH 07/78] Replace uses of deprecated typing.Tuple, typing.Callable, etc. Also use typing.Self to encode that `Benchmark.addInt64Axis` returns self. --- python/cuda/nvbench/__init__.pyi | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 3bc9f45..c63cb4b 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -1,4 +1,7 @@ -from typing import Callable, Sequence, Tuple +# from __future__ import annotations + +from collections.abc import Callable, Sequence +from typing import Optional, Self class CudaStream: """Represents CUDA stream @@ -7,7 +10,7 @@ class CudaStream: ---- The class is not directly constructible. 
""" - def __cuda_stream__(self) -> Tuple[int]: + def __cuda_stream__(self) -> tuple[int]: """ Special method implement CUDA stream protocol from `cuda.core`. Returns a pair of integers: @@ -31,13 +34,13 @@ class Benchmark: def getName(self) -> str: "Get benchmark name" ... - def addInt64Axis(self, name: str, values: Sequence[int]) -> Benchmark: + def addInt64Axis(self, name: str, values: Sequence[int]) -> Self: "Add integral type parameter axis with given name and values to sweep over" ... - def addFloat64Axis(self, name: str, values: Sequence[float]) -> Benchmark: + def addFloat64Axis(self, name: str, values: Sequence[float]) -> Self: "Add floating-point type parameter axis with given name and values to sweep over" ... - def addStringAxis(sef, name: str, values: Sequence[str]) -> Benchmark: + def addStringAxis(sef, name: str, values: Sequence[str]) -> Self: "Add string type parameter axis with given name and values to sweep over" ... @@ -68,16 +71,16 @@ class State: def getStream(self) -> CudaStream: "CudaStream object from this configuration" ... - def getInt64(self, name: str, default_value: int = None) -> int: + def getInt64(self, name: str, default_value: Optional[int] = None) -> int: "Get value for given Int64 axis from this configuration" ... - def getFloat64(self, name: str, default_value: float = None) -> float: + def getFloat64(self, name: str, default_value: Optional[float] = None) -> float: "Get value for given Float64 axis from this configuration" ... - def getString(self, name: str, default_value: str = None) -> str: + def getString(self, name: str, default_value: Optional[str] = None) -> str: "Get value for given String axis from this configuration" ... - def addElementCount(self, count: int, column_name: str = None) -> None: + def addElementCount(self, count: int, column_name: Optional[str] = None) -> None: "Add element count" ... 
def setElementCount(self, count: int) -> None: @@ -145,7 +148,10 @@ class State: "True if (some) CUPTI metrics are being collected" ... def exec( - self, fn: Callable[[Launch], None], batched: bool = True, sync: bool = False + self, + fn: Callable[[Launch], None], + batched: Optional[bool] = True, + sync: Optional[bool] = False, ): """Execute callable running the benchmark. From 4950a50961e709cdc0a8c0e9ad6afaebc1f84615 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:59:07 -0500 Subject: [PATCH 08/78] Add empty py.typed to signal mypy that package has type annotations --- python/cuda/nvbench/py.typed | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 python/cuda/nvbench/py.typed diff --git a/python/cuda/nvbench/py.typed b/python/cuda/nvbench/py.typed new file mode 100644 index 0000000..e69de29 From 2507bc226352cd2e628ed705eed8f47074fa357b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:59:33 -0500 Subject: [PATCH 09/78] Add Python example based on C++ example/auto_throughput.cpp --- python/examples/auto_throughput.py | 75 ++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 python/examples/auto_throughput.py diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py new file mode 100644 index 0000000..4f79217 --- /dev/null +++ b/python/examples/auto_throughput.py @@ -0,0 +1,75 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. 
+# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import cuda.nvbench as nvbench +import numpy as np +from numba import cuda + + +def make_kernel(items_per_thread: int): + @cuda.jit + def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): + tid = cuda.grid(1) + step = cuda.gridDim.x * cuda.blockDim.x + for i in range(stride * tid, stride * elements, stride * step): + for j in range(items_per_thread): + read_id = (items_per_thread * i + j) % elements + write_id = tid + j * elements + out_arr[write_id] = in_arr[read_id] + + return kernel + + +def throughput_bench(state: nvbench.State): + stride = state.getInt64("Stride") + ipt = state.getInt64("ItemsPerThread") + + nbytes = 128 * 1024 * 1024 + elements = nbytes // np.dtype(np.int32).itemsize + + alloc_stream = cuda.external_stream(state.getStream().addressof()) + inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) + out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) + + state.addElementCount(elements, "Elements") + state.collectCUPTIMetrics() + + threads_per_block = 256 + blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block + + krn = make_kernel(ipt) + + def launcher(launch: nvbench.Launch): + exec_stream = cuda.external_stream(launch.getStream().addressof()) + krn[blocks_in_grid, threads_per_block, exec_stream, 0]( + stride, elements, inp_arr, out_arr + ) + + state.exec(launcher) + + +( + nvbench.register(throughput_bench) + .addInt64Axis("Stride", [1, 4]) + .addInt64Axis("ItemsPerThread", [1, 2, 3, 4]) +) + + +if __name__ == "__main__": + 
print(nvbench.__version__) + nvbench.run_all_benchmarks(sys.argv) From 576c4734816f20cd141af17a2aead089bd80eb57 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 2 Jul 2025 10:32:10 -0500 Subject: [PATCH 10/78] Add implementation of and signature for State.getDevice make batch/sync arguments of State.exec keyword-only Provide default column_name value for State.addElementCount method, so that it can be called state.addElementCount(count), or as state.addElementCount(count, column_name="Descriptive Name") --- python/cuda/nvbench/__init__.pyi | 19 ++++++++++++++++++- python/src/py_nvbench.cpp | 13 ++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index c63cb4b..1e7376d 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -1,7 +1,7 @@ # from __future__ import annotations from collections.abc import Callable, Sequence -from typing import Optional, Self +from typing import Optional, Self, Union class CudaStream: """Represents CUDA stream @@ -15,6 +15,18 @@ class CudaStream: Special method implement CUDA stream protocol from `cuda.core`. Returns a pair of integers: (protocol_version, integral_value_of_cudaStream_t pointer) + + Example + ------- + import cuda.core.experimental as core + import cuda.nvbench as nvbench + + def bench(state: nvbench.State): + dev = core.Device(state.getDevice()) + dev.set_current() + # converts CudaString to core.Stream + # using __cuda_stream__ protocol + dev.create_stream(state.getStream()) """ ... @@ -68,6 +80,9 @@ class State: def hasPrinters(self) -> bool: "True if configuration has a printer" ... + def getDevice(self) -> Union[int, None]: + "Get device_id of the device from this configuration" + ... def getStream(self) -> CudaStream: "CudaStream object from this configuration" ... 
@@ -150,6 +165,8 @@ class State: def exec( self, fn: Callable[[Launch], None], + /, + *, batched: Optional[bool] = True, sync: Optional[bool] = False, ): diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index c940587..55a79f5 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -344,6 +344,14 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("hasPrinters", [](nvbench::state &state) -> bool { return state.get_benchmark().get_printer().has_value(); }); + pystate_cls.def("getDevice", [](nvbench::state &state) { + auto dev = state.get_device(); + if (dev.has_value()) + { + return py::cast(dev.value().get_id()); + } + return py::object(py::none()); + }); pystate_cls.def( "getStream", @@ -359,7 +367,10 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("getString", &nvbench::state::get_string); pystate_cls.def("getString", &nvbench::state::get_string_or_default); - pystate_cls.def("addElementCount", &nvbench::state::add_element_count); + pystate_cls.def("addElementCount", + &nvbench::state::add_element_count, + py::arg("count"), + py::arg("column_name") = py::str("")); pystate_cls.def("setElementCount", &nvbench::state::set_element_count); pystate_cls.def("getElementCount", &nvbench::state::get_element_count); From df426a0bad573cca912a0a9719c2eef14bd1567b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 2 Jul 2025 10:43:30 -0500 Subject: [PATCH 11/78] Add examples/axes.py --- python/examples/axes.py | 198 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 python/examples/axes.py diff --git a/python/examples/axes.py b/python/examples/axes.py new file mode 100644 index 0000000..8e97850 --- /dev/null +++ b/python/examples/axes.py @@ -0,0 +1,198 @@ +import ctypes +import sys +from typing import Optional + +import cuda.cccl.headers as headers +import cuda.core.experimental as core +import cuda.nvbench as nvbench + + +def 
make_sleep_kernel(): + """JITs sleep_kernel(seconds)""" + src = r""" +#include +#include + +// Each launched thread just sleeps for `seconds`. +__global__ void sleep_kernel(double seconds) { + namespace chrono = ::cuda::std::chrono; + using hr_clock = chrono::high_resolution_clock; + + auto duration = static_cast(seconds * 1e9); + const auto ns = chrono::nanoseconds(duration); + + const auto start = hr_clock::now(); + const auto finish = start + ns; + + auto now = hr_clock::now(); + while (now < finish) + { + now = hr_clock::now(); + } +} +""" + incl = headers.get_include_paths() + opts = core.ProgramOptions(include_path=str(incl.libcudacxx)) + prog = core.Program(src, code_type="c++", options=opts) + mod = prog.compile("cubin", name_expressions=("sleep_kernel",)) + return mod.get_kernel("sleep_kernel") + + +def simple(state: nvbench.State): + state.setMinSamples(1000) + sleep_dur = 1e-3 + krn = make_sleep_kernel() + launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) + + def launcher(launch: nvbench.Launch): + dev = core.Device() + dev.set_current() + s = dev.create_stream(launch.getStream()) + + core.launch(s, launch_config, krn, sleep_dur) + + state.exec(launcher) + + +def single_float64_axis(state: nvbench.State): + # get axis value, or default + sleep_dur = state.getFloat64("Duration", 3.14e-4) + krn = make_sleep_kernel() + launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) + + def launcher(launch: nvbench.Launch): + dev = core.Device() + dev.set_current() + s = dev.create_stream(launch.getStream()) + + core.launch(s, launch_config, krn, sleep_dur) + + state.exec(launcher) + + +def default_value(state: nvbench.State): + single_float64_axis(state) + + +def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None): + src = r""" +#include +#include +/*! + * Naive copy of `n` values from `in` -> `out`. 
+ */ +template +__global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n) +{ + const auto init = blockIdx.x * blockDim.x + threadIdx.x; + const auto step = blockDim.x * gridDim.x; + + for (auto i = init; i < n; i += step) + { + out[i] = static_cast(in[i]); + } +} +""" + incl = headers.get_include_paths() + opts = core.ProgramOptions(include_path=str(incl.libcudacxx)) + prog = core.Program(src, code_type="c++", options=opts) + if in_type is None: + in_type = "::cuda::std::int32_t" + if out_type is None: + out_type = "::cuda::std::int32_t" + instance_name = f"copy_kernel<{in_type}, {out_type}>" + mod = prog.compile("cubin", name_expressions=(instance_name,)) + return mod.get_kernel(instance_name) + + +def copy_sweep_grid_shape(state: nvbench.State): + block_size = state.getInt64("BlockSize") + num_blocks = state.getInt64("NumBlocks") + + # Number of int32 elements in 256MiB + nbytes = 256 * 1024 * 1024 + num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0)) + + state.addElementCount(num_values) + state.addGlobalMemoryReads(nbytes) + state.addGlobalMemoryWrites(nbytes) + + dev = core.Device(state.getDevice()) + dev.set_current() + + alloc_stream = dev.create_stream(state.getStream()) + input_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream) + output_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream) + + krn = make_copy_kernel() + launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0) + + def launcher(launch: nvbench.Launch): + dev = core.Device() + dev.set_current() + s = dev.create_stream(launch.getStream()) + + core.launch(s, launch_config, krn, input_buf, output_buf, num_values) + + state.exec(launcher) + + +def copy_type_sweep(state: nvbench.State): + type_id = state.getInt64("TypeID") + + types_map = { + 0: (ctypes.c_uint8, "::cuda::std::uint8_t"), + 1: (ctypes.c_uint16, "::cuda::std::uint16_t"), + 2: (ctypes.c_uint32, "::cuda::std::uint32_t"), + 3: 
(ctypes.c_uint64, "::cuda::std::uint64_t"), + 4: (ctypes.c_float, "float"), + 5: (ctypes.c_double, "double"), + } + + value_ctype, value_cuda_t = types_map[type_id] + + # Number of elements in 256MiB + nbytes = 256 * 1024 * 1024 + num_values = nbytes // ctypes.sizeof(value_ctype(0)) + + state.addElementCount(num_values) + state.addGlobalMemoryReads(nbytes) + state.addGlobalMemoryWrites(nbytes) + + dev = core.Device(state.getDevice()) + dev.set_current() + + alloc_stream = dev.create_stream(state.getStream()) + input_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream) + output_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream) + + krn = make_copy_kernel(value_cuda_t, value_cuda_t) + launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) + + def launcher(launch: nvbench.Launch): + dev = core.Device() + dev.set_current() + s = dev.create_stream(launch.getStream()) + + core.launch(s, launch_config, krn, input_buf, output_buf, num_values) + + state.exec(launcher) + + +if __name__ == "__main__": + # Benchmark without axes + nvbench.register(simple) + + # benchmark with no axes, that uses default value + nvbench.register(default_value) + # specify axis + nvbench.register(single_float64_axis).addFloat64Axis("Duration", [7e-5, 1e-4, 5e-4]) + + copy1_bench = nvbench.register(copy_sweep_grid_shape) + copy1_bench.addInt64Axis("BlockSize", [2**x for x in range(6, 10, 2)]) + copy1_bench.addInt64Axis("NumBlocks", [2**x for x in range(6, 10, 2)]) + + copy2_bench = nvbench.register(copy_type_sweep) + copy2_bench.addInt64Axis("TypeID", range(0, 6)) + + nvbench.run_all_benchmarks(sys.argv) From 9dba8664260aa450a3e62681e7f560a48f117080 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 2 Jul 2025 11:14:41 -0500 Subject: [PATCH 12/78] Add State.add_summary method state.add_summary(column_name: str, value: Union[int, float, str]) This is used in 
examples/axes.py to map integral value from Int64Axis to string description. --- python/cuda/nvbench/__init__.pyi | 3 +++ python/src/py_nvbench.cpp | 32 ++++++++++++++++++++++++++++---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 1e7376d..9c060af 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -187,6 +187,9 @@ class State: Default: `False`. """ ... + def add_summary(self, column_name: str, value: Union[int, float, str]) -> None: + "Add summary column with a value" + ... def register(fn: Callable[[State], None]) -> Benchmark: """ diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 55a79f5..24d5748 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -338,13 +338,13 @@ PYBIND11_MODULE(_nvbench, m) using state_ref_t = std::reference_wrapper; auto pystate_cls = py::class_(m, "State"); - pystate_cls.def("hasDevice", [](nvbench::state &state) -> bool { + pystate_cls.def("hasDevice", [](const nvbench::state &state) -> bool { return static_cast(state.get_device()); }); - pystate_cls.def("hasPrinters", [](nvbench::state &state) -> bool { + pystate_cls.def("hasPrinters", [](const nvbench::state &state) -> bool { return state.get_benchmark().get_printer().has_value(); }); - pystate_cls.def("getDevice", [](nvbench::state &state) { + pystate_cls.def("getDevice", [](const nvbench::state &state) { auto dev = state.get_device(); if (dev.has_value()) { @@ -398,7 +398,7 @@ PYBIND11_MODULE(_nvbench, m) py::arg("column_name") = py::str("")); pystate_cls.def( "getBenchmark", - [](nvbench::state &state) { return std::ref(state.get_benchmark()); }, + [](const nvbench::state &state) { return std::ref(state.get_benchmark()); }, py::return_value_policy::reference); pystate_cls.def("getThrottleThreshold", &nvbench::state::get_throttle_threshold); @@ -456,6 +456,30 @@ PYBIND11_MODULE(_nvbench, m) py::arg("batched") = 
true, py::arg("sync") = false); + pystate_cls.def("get_short_description", + [](const nvbench::state &state) { return state.get_short_description(); }); + + pystate_cls.def("add_summary", + [](nvbench::state &state, std::string column_name, std::string value) { + auto &summ = state.add_summary("nv/python/" + column_name); + summ.set_string("description", "User tag: " + column_name); + summ.set_string("name", std::move(column_name)); + summ.set_string("value", std::move(value)); + }); + pystate_cls.def("add_summary", + [](nvbench::state &state, std::string column_name, std::int64_t value) { + auto &summ = state.add_summary("nv/python/" + column_name); + summ.set_string("description", "User tag: " + column_name); + summ.set_string("name", std::move(column_name)); + summ.set_int64("value", value); + }); + pystate_cls.def("add_summary", [](nvbench::state &state, std::string column_name, double value) { + auto &summ = state.add_summary("nv/python/" + column_name); + summ.set_string("description", "User tag: " + column_name); + summ.set_string("name", std::move(column_name)); + summ.set_float64("value", value); + }); + // == STEP 6 // ATTN: nvbench::benchmark_manager is a singleton From 4f15840832d9b1e65d091450d5e48e465fe4c6b4 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 2 Jul 2025 11:16:11 -0500 Subject: [PATCH 13/78] Use state.add_summary to supplement integral TypeID with meaningful type name --- python/examples/axes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/examples/axes.py b/python/examples/axes.py index 8e97850..c26f4b0 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -150,6 +150,7 @@ def copy_type_sweep(state: nvbench.State): } value_ctype, value_cuda_t = types_map[type_id] + state.add_summary("Type", value_cuda_t) # Number of elements in 256MiB nbytes = 256 * 1024 * 1024 From 964ec2e1bc60ef08feb8ccd840544c165cc61957 Mon Sep 17 00:00:00 2001 From: Oleksandr 
Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 2 Jul 2025 11:49:32 -0500 Subject: [PATCH 14/78] Add examples/exec_tag_sync.py --- python/examples/exec_tag_sync.py | 70 ++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 python/examples/exec_tag_sync.py diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py new file mode 100644 index 0000000..4541017 --- /dev/null +++ b/python/examples/exec_tag_sync.py @@ -0,0 +1,70 @@ +import ctypes +import sys +from typing import Optional + +import cuda.cccl.headers as headers +import cuda.core.experimental as core +import cuda.nvbench as nvbench + + +def make_fill_kernel(data_type: Optional[str] = None): + src = r""" +#include +#include +/*! + * Naive setting of values in buffer + */ +template +__global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n) +{ + const auto init = blockIdx.x * blockDim.x + threadIdx.x; + const auto step = blockDim.x * gridDim.x; + + for (auto i = init; i < n; i += step) + { + buf[i] = v; + } +} +""" + incl = headers.get_include_paths() + opts = core.ProgramOptions(include_path=str(incl.libcudacxx)) + prog = core.Program(src, code_type="c++", options=opts) + if data_type is None: + data_type = "::cuda::std::int32_t" + instance_name = f"fill_kernel<{data_type}>" + mod = prog.compile("cubin", name_expressions=(instance_name,)) + return mod.get_kernel(instance_name) + + +def synchronizing_bench(state: nvbench.State): + n_values = 64 * 1024 * 1024 + n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0)) + + dev = core.Device(state.getDevice()) + dev.set_current() + + alloc_stream = dev.create_stream(state.getStream()) + buffer = core.DeviceMemoryResource(dev).allocate(n_bytes, alloc_stream) + + state.addElementCount(n_values, "Items") + state.addGlobalMemoryWrites(n_bytes, "Size") + + krn = make_fill_kernel() + launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) + + def launcher(launch: nvbench.Launch): + 
dev = core.Device() + dev.set_current() + + s = dev.create_stream(launch.getStream()) + core.launch(s, launch_config, krn, buffer, 0, n_values) + s.sync() + + # since launcher contains synchronization point, + # setting sync=True is required to avoid a deadlock + state.exec(launcher, sync=True) + + +if __name__ == "__main__": + nvbench.register(synchronizing_bench) + nvbench.run_all_benchmarks(sys.argv) From b357af00920942c7284830fd4252093225d35949 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 2 Jul 2025 12:44:22 -0500 Subject: [PATCH 15/78] Add examples/skip.py --- python/examples/skip.py | 71 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 python/examples/skip.py diff --git a/python/examples/skip.py b/python/examples/skip.py new file mode 100644 index 0000000..b02947e --- /dev/null +++ b/python/examples/skip.py @@ -0,0 +1,71 @@ +import sys + +import cuda.cccl.headers as headers +import cuda.core.experimental as core +import cuda.nvbench as nvbench + + +def make_sleep_kernel(): + """JITs sleep_kernel(seconds)""" + src = r""" +#include +#include + +// Each launched thread just sleeps for `seconds`. 
+__global__ void sleep_kernel(double seconds) { + namespace chrono = ::cuda::std::chrono; + using hr_clock = chrono::high_resolution_clock; + + auto duration = static_cast(seconds * 1e9); + const auto ns = chrono::nanoseconds(duration); + + const auto start = hr_clock::now(); + const auto finish = start + ns; + + auto now = hr_clock::now(); + while (now < finish) + { + now = hr_clock::now(); + } +} +""" + incl = headers.get_include_paths() + opts = core.ProgramOptions(include_path=str(incl.libcudacxx)) + prog = core.Program(src, code_type="c++", options=opts) + mod = prog.compile("cubin", name_expressions=("sleep_kernel",)) + return mod.get_kernel("sleep_kernel") + + +def runtime_skip(state: nvbench.State): + duration = state.getFloat64("Duration") + kramble = state.getString("Kramble") + + # Skip Baz benchmarks with 0.8 ms duration + if kramble == "Baz" and duration < 0.8e-3: + state.skip("Short 'Baz' benchmarks are skipped") + return + + # Skip Foo benchmark with > 0.3 ms duration + if kramble == "Foo" and duration > 0.3e-3: + state.skip("Long 'Foo' benchmarks are skipped") + return + + krn = make_sleep_kernel() + launch_cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0) + + def launcher(launch: nvbench.Launch): + dev = core.Device() + dev.set_current() + + s = dev.create_stream(launch.getStream()) + core.launch(s, launch_cfg, krn, duration) + + state.exec(launcher) + + +if __name__ == "__main__": + b = nvbench.register(runtime_skip) + b.addFloat64Axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)]) + b.addStringAxis("Kramble", ["Foo", "Bar", "Baz"]) + + nvbench.run_all_benchmarks(sys.argv) From 883e5819b6f678ee173f728f83b20d821b3eaaba Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:09:12 -0500 Subject: [PATCH 16/78] Use cuda.Stream.from_handle to create core.Stream from nvbench.CudaStream --- python/examples/auto_throughput.py | 12 +++----- python/examples/axes.py | 44 
+++++++++++------------------- python/examples/exec_tag_sync.py | 17 ++++++------ python/examples/skip.py | 10 ++++--- 4 files changed, 34 insertions(+), 49 deletions(-) diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 4f79217..c77d36d 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -63,13 +63,9 @@ def throughput_bench(state: nvbench.State): state.exec(launcher) -( - nvbench.register(throughput_bench) - .addInt64Axis("Stride", [1, 4]) - .addInt64Axis("ItemsPerThread", [1, 2, 3, 4]) -) - - if __name__ == "__main__": - print(nvbench.__version__) + b = nvbench.register(throughput_bench) + b.addInt64Axis("Stride", [1, 2, 4]) + b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4]) + nvbench.run_all_benchmarks(sys.argv) diff --git a/python/examples/axes.py b/python/examples/axes.py index c26f4b0..fc710b7 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -7,6 +7,10 @@ import cuda.core.experimental as core import cuda.nvbench as nvbench +def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: + return core.Stream.from_handle(cs.addressof()) + + def make_sleep_kernel(): """JITs sleep_kernel(seconds)""" src = r""" @@ -45,10 +49,7 @@ def simple(state: nvbench.State): launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) def launcher(launch: nvbench.Launch): - dev = core.Device() - dev.set_current() - s = dev.create_stream(launch.getStream()) - + s = as_core_Stream(launch.getStream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) @@ -61,10 +62,7 @@ def single_float64_axis(state: nvbench.State): launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) def launcher(launch: nvbench.Launch): - dev = core.Device() - dev.set_current() - s = dev.create_stream(launch.getStream()) - + s = as_core_Stream(launch.getStream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) @@ -117,21 +115,16 @@ def copy_sweep_grid_shape(state: 
nvbench.State): state.addGlobalMemoryReads(nbytes) state.addGlobalMemoryWrites(nbytes) - dev = core.Device(state.getDevice()) - dev.set_current() - - alloc_stream = dev.create_stream(state.getStream()) - input_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream) - output_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream) + dev_id = state.getDevice() + alloc_s = as_core_Stream(state.getStream()) + input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) + output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) krn = make_copy_kernel() launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0) def launcher(launch: nvbench.Launch): - dev = core.Device() - dev.set_current() - s = dev.create_stream(launch.getStream()) - + s = as_core_Stream(launch.getStream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) state.exec(launcher) @@ -160,21 +153,16 @@ def copy_type_sweep(state: nvbench.State): state.addGlobalMemoryReads(nbytes) state.addGlobalMemoryWrites(nbytes) - dev = core.Device(state.getDevice()) - dev.set_current() - - alloc_stream = dev.create_stream(state.getStream()) - input_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream) - output_buf = core.DeviceMemoryResource(dev.device_id).allocate(nbytes, alloc_stream) + dev_id = state.getDevice() + alloc_s = as_core_Stream(state.getStream()) + input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) + output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) krn = make_copy_kernel(value_cuda_t, value_cuda_t) launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) def launcher(launch: nvbench.Launch): - dev = core.Device() - dev.set_current() - s = dev.create_stream(launch.getStream()) - + s = as_core_Stream(launch.getStream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) state.exec(launcher) diff 
--git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py index 4541017..8148c30 100644 --- a/python/examples/exec_tag_sync.py +++ b/python/examples/exec_tag_sync.py @@ -7,6 +7,11 @@ import cuda.core.experimental as core import cuda.nvbench as nvbench +def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: + "Create view of native stream used by NVBench" + return core.Stream.from_handle(cs.addressof()) + + def make_fill_kernel(data_type: Optional[str] = None): src = r""" #include @@ -40,11 +45,8 @@ def synchronizing_bench(state: nvbench.State): n_values = 64 * 1024 * 1024 n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0)) - dev = core.Device(state.getDevice()) - dev.set_current() - - alloc_stream = dev.create_stream(state.getStream()) - buffer = core.DeviceMemoryResource(dev).allocate(n_bytes, alloc_stream) + alloc_s = as_core_Stream(state.getStream()) + buffer = core.DeviceMemoryResource(state.getDevice()).allocate(n_bytes, alloc_s) state.addElementCount(n_values, "Items") state.addGlobalMemoryWrites(n_bytes, "Size") @@ -53,10 +55,7 @@ def synchronizing_bench(state: nvbench.State): launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) def launcher(launch: nvbench.Launch): - dev = core.Device() - dev.set_current() - - s = dev.create_stream(launch.getStream()) + s = as_core_Stream(launch.getStream()) core.launch(s, launch_config, krn, buffer, 0, n_values) s.sync() diff --git a/python/examples/skip.py b/python/examples/skip.py index b02947e..fba91f5 100644 --- a/python/examples/skip.py +++ b/python/examples/skip.py @@ -5,6 +5,11 @@ import cuda.core.experimental as core import cuda.nvbench as nvbench +def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: + "Create view into native stream provided by NVBench" + return core.Stream.from_handle(cs.addressof()) + + def make_sleep_kernel(): """JITs sleep_kernel(seconds)""" src = r""" @@ -54,10 +59,7 @@ def runtime_skip(state: nvbench.State): launch_cfg = core.LaunchConfig(grid=1, 
block=1, shmem_size=0) def launcher(launch: nvbench.Launch): - dev = core.Device() - dev.set_current() - - s = dev.create_stream(launch.getStream()) + s = as_core_Stream(launch.getStream()) core.launch(s, launch_cfg, krn, duration) state.exec(launcher) From 707b24ffb5a8e010b53dca6d978faaf9553279cb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 2 Jul 2025 15:34:25 -0500 Subject: [PATCH 17/78] Add examples/cccl_parallel_segmented_reduce.py --- .../cccl_parallel_segmented_reduce.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 python/examples/cccl_parallel_segmented_reduce.py diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py new file mode 100644 index 0000000..b9c1b66 --- /dev/null +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -0,0 +1,79 @@ +import sys + +import cuda.cccl.parallel.experimental.algorithms as algorithms +import cuda.cccl.parallel.experimental.iterators as iterators +import cuda.core.experimental as core +import cuda.nvbench as nvbench +import cupy as cp +import numpy as np + + +def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: + return core.Stream.from_handle(cs.addressof()) + + +def segmented_reduce(state: nvbench.State): + "Benchmark segmented_reduce example" + n_elems = state.getInt64("numElems") + n_cols = state.getInt64("numCols") + n_rows = n_elems // n_cols + + state.add_summary("numRows", n_rows) + state.collectCUPTIMetrics() + + rng = cp.random.default_rng() + mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols)) + + def add_op(a, b): + return a + b + + def make_scaler(step): + def scale(row_id): + return row_id * step + + return scale + + zero = np.int32(0) + row_offset = make_scaler(np.int32(n_cols)) + start_offsets = iterators.TransformIterator( + iterators.CountingIterator(zero), row_offset + ) + + end_offsets = start_offsets + 1 + + 
d_input = mat + h_init = np.zeros(tuple(), dtype=np.int32) + d_output = cp.empty(n_rows, dtype=d_input.dtype) + + alg = algorithms.segmented_reduce( + d_input, d_output, start_offsets, end_offsets, add_op, h_init + ) + + # query size of temporary storage and allocate + temp_nbytes = alg( + None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init + ) + temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8) + + def launcher(launch: nvbench.Launch): + s = as_core_Stream(launch.getStream()) + alg( + temp_storage, + d_input, + d_output, + n_rows, + start_offsets, + end_offsets, + h_init, + s, + ) + + state.exec(launcher) + + +if __name__ == "__main__": + b = nvbench.register(segmented_reduce) + b.addInt64Axis("numElems", [2**20, 2**22, 2**24]) + b.addInt64Axis("numCols", [1024, 2048, 4096, 8192]) + + nvbench.run_all_benchmarks(sys.argv) From 394324023feb4413fa4e5638525d4b9125743baf Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 2 Jul 2025 22:14:10 -0500 Subject: [PATCH 18/78] Add example for benchmarking CuPy function --- python/examples/cupy_extract.py | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 python/examples/cupy_extract.py diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py new file mode 100644 index 0000000..ffdb325 --- /dev/null +++ b/python/examples/cupy_extract.py @@ -0,0 +1,45 @@ +import sys + +import cuda.nvbench as nvbench +import cupy as cp + + +def as_cp_ExternalStream( + cs: nvbench.CudaStream, dev_id: int = -1 +) -> cp.cuda.ExternalStream: + h = cs.addressof() + return cp.cuda.ExternalStream(h, dev_id) + + +def cupy_extract_by_mask(state: nvbench.State): + n_cols = state.getInt64("numCols") + n_rows = state.getInt64("numRows") + + dev_id = state.getDevice() + cp_s = as_cp_ExternalStream(state.getStream(), dev_id) + + state.collectCUPTIMetrics() + state.addElementCount(n_rows * n_cols, "# Elements") + 
state.addGlobalMemoryReads( + n_rows * n_cols * (cp.dtype(cp.int32).itemsize + cp.dtype("?").itemsize) + ) + state.addGlobalMemoryWrites(n_rows * n_cols * (cp.dtype(cp.int32).itemsize)) + + with cp_s: + X = cp.full((n_cols, n_rows), fill_value=3, dtype=cp.int32) + mask = cp.ones((n_cols, n_rows), dtype="?") + _ = X[mask] + + def launcher(launch: nvbench.Launch): + with as_cp_ExternalStream(launch.getStream(), dev_id): + _ = X[mask] + + state.exec(launcher, sync=True) + + +if __name__ == "__main__": + b = nvbench.register(cupy_extract_by_mask) + b.addInt64Axis("numCols", [1024, 2048, 4096, 2 * 4096]) + b.addInt64Axis("numRows", [1024, 2048, 4096, 2 * 4096]) + + nvbench.run_all_benchmarks(sys.argv) From 8589511f6163e01516d451c85947f05dbf3d716f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 3 Jul 2025 11:42:16 -0500 Subject: [PATCH 19/78] Corrected broken cccl_parallel_segmented_reduce.py --- .../cccl_parallel_segmented_reduce.py | 46 ++++++++++++++++--- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index b9c1b66..9a6a26d 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -8,10 +8,31 @@ import cupy as cp import numpy as np +class CCCLStream: + "Class to work around https://github.com/NVIDIA/cccl/issues/5144" + + def __init__(self, ptr): + self._ptr = ptr + + def __cuda_stream__(self): + return (0, self._ptr) + + def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) +def as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream: + return CCCLStream(cs.addressof()) + + +def as_cp_ExternalStream( + cs: nvbench.CudaStream, dev_id: int = -1 +) -> cp.cuda.ExternalStream: + h = cs.addressof() + return cp.cuda.ExternalStream(h, dev_id) + + def segmented_reduce(state: 
nvbench.State): "Benchmark segmented_reduce example" n_elems = state.getInt64("numElems") @@ -21,8 +42,12 @@ def segmented_reduce(state: nvbench.State): state.add_summary("numRows", n_rows) state.collectCUPTIMetrics() - rng = cp.random.default_rng() - mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols)) + dev_id = state.getDevice() + cp_stream = as_cp_ExternalStream(state.getStream(), dev_id) + + with cp_stream: + rng = cp.random.default_rng() + mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols)) def add_op(a, b): return a + b @@ -41,22 +66,29 @@ def segmented_reduce(state: nvbench.State): end_offsets = start_offsets + 1 - d_input = mat h_init = np.zeros(tuple(), dtype=np.int32) - d_output = cp.empty(n_rows, dtype=d_input.dtype) + with cp_stream: + d_input = mat + d_output = cp.empty(n_rows, dtype=d_input.dtype) alg = algorithms.segmented_reduce( d_input, d_output, start_offsets, end_offsets, add_op, h_init ) + # print(1) + cccl_stream = as_cccl_Stream(state.getStream()) + # print(2, core_stream, core_stream.__cuda_stream__()) # query size of temporary storage and allocate temp_nbytes = alg( - None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init + None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream ) - temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8) + h_init = np.zeros(tuple(), dtype=np.int32) + # print(3) + with cp_stream: + temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8) def launcher(launch: nvbench.Launch): - s = as_core_Stream(launch.getStream()) + s = as_cccl_Stream(launch.getStream()) alg( temp_storage, d_input, From 02ad6e5490a2e5f4dc7aa6ae0f25fb84bf27a8cc Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 3 Jul 2025 11:53:17 -0500 Subject: [PATCH 20/78] Implement Benchmark.setName --- python/src/py_nvbench.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git 
a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 24d5748..3551ff3 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -246,7 +246,7 @@ PYBIND11_MODULE(_nvbench, m) py_benchmark_cls.def( "addInt64Axis", [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { - self.add_int64_axis(name, data); + self.add_int64_axis(std::move(name), data); return std::ref(self); }, py::return_value_policy::reference); @@ -255,14 +255,21 @@ PYBIND11_MODULE(_nvbench, m) [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { - self.add_float64_axis(name, data); + self.add_float64_axis(std::move(name), data); return std::ref(self); }, py::return_value_policy::reference); py_benchmark_cls.def( "addStringAxis", [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { - self.add_string_axis(name, data); + self.add_string_axis(std::move(name), data); + return std::ref(self); + }, + py::return_value_policy::reference); + py_benchmark_cls.def( + "setName", + [](nvbench::benchmark_base &self, std::string name) { + self.set_name(std::move(name)); return std::ref(self); }, py::return_value_policy::reference); From 203ef2046e35a5386c3590d91eca7ba78ce8b419 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 3 Jul 2025 13:56:09 -0500 Subject: [PATCH 21/78] Add warm-up call to auto_throughput.py Add throughput.py example, which is based on the same kernel as auto_throughput.py but records global memory reads/writes amounts to output BWUtil metric measuring %SOL in bandwidth utilization. 
--- python/examples/auto_throughput.py | 21 ++++++-- python/examples/throughput.py | 83 ++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 5 deletions(-) create mode 100644 python/examples/throughput.py diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index c77d36d..83c7312 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -15,13 +15,18 @@ # limitations under the License. import sys +from collections.abc import Callable import cuda.nvbench as nvbench import numpy as np from numba import cuda -def make_kernel(items_per_thread: int): +def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: + return cuda.external_stream(cs.addressof()) + + +def make_kernel(items_per_thread: int) -> Callable: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) @@ -35,18 +40,18 @@ def make_kernel(items_per_thread: int): return kernel -def throughput_bench(state: nvbench.State): +def throughput_bench(state: nvbench.State) -> None: stride = state.getInt64("Stride") ipt = state.getInt64("ItemsPerThread") nbytes = 128 * 1024 * 1024 elements = nbytes // np.dtype(np.int32).itemsize - alloc_stream = cuda.external_stream(state.getStream().addressof()) + alloc_stream = as_cuda_Stream(state.getStream()) inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) - state.addElementCount(elements, "Elements") + state.addElementCount(elements, column_name="Elements") state.collectCUPTIMetrics() threads_per_block = 256 @@ -54,8 +59,14 @@ def throughput_bench(state: nvbench.State): krn = make_kernel(ipt) + # warm-up call ensures that kernel is loaded into context + # before blocking kernel is launched + krn[blocks_in_grid, threads_per_block, alloc_stream, 0]( + stride, elements, inp_arr, out_arr + ) + def launcher(launch: nvbench.Launch): - exec_stream = 
cuda.external_stream(launch.getStream().addressof()) + exec_stream = as_cuda_Stream(launch.getStream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr ) diff --git a/python/examples/throughput.py b/python/examples/throughput.py new file mode 100644 index 0000000..eaae2ef --- /dev/null +++ b/python/examples/throughput.py @@ -0,0 +1,83 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +from collections.abc import Callable + +import cuda.nvbench as nvbench +import numpy as np +from numba import cuda + + +def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: + return cuda.external_stream(cs.addressof()) + + +def make_kernel(items_per_thread: int) -> Callable: + @cuda.jit + def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): + tid = cuda.grid(1) + step = cuda.gridDim.x * cuda.blockDim.x + for i in range(stride * tid, stride * elements, stride * step): + for j in range(items_per_thread): + read_id = (items_per_thread * i + j) % elements + write_id = tid + j * elements + out_arr[write_id] = in_arr[read_id] + + return kernel + + +def throughput_bench(state: nvbench.State) -> None: + stride = state.getInt64("Stride") + ipt = state.getInt64("ItemsPerThread") + + nbytes = 128 * 1024 * 1024 + elements = nbytes // np.dtype(np.int32).itemsize + + alloc_stream = as_cuda_Stream(state.getStream()) + inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) + out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) + + state.addElementCount(elements, column_name="Elements") + state.addGlobalMemoryReads(inp_arr.nbytes, column_name="Datasize") + state.addGlobalMemoryWrites(inp_arr.nbytes) + + threads_per_block = 256 + blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block + + krn = make_kernel(ipt) + + # warm-up call ensures that kernel is loaded into context + # before blocking kernel is launched + krn[blocks_in_grid, threads_per_block, alloc_stream, 0]( + stride, elements, inp_arr, out_arr + ) + + def launcher(launch: nvbench.Launch): + exec_stream = as_cuda_Stream(launch.getStream()) + krn[blocks_in_grid, threads_per_block, exec_stream, 0]( + stride, elements, inp_arr, out_arr + ) + + state.exec(launcher) + + +if __name__ == "__main__": + b = nvbench.register(throughput_bench) + b.addInt64Axis("Stride", [1, 2, 4]) + b.addInt64Axis("ItemsPerThread", [1, 
2, 3, 4]) + + nvbench.run_all_benchmarks(sys.argv) From 6b1b2f3c306265fbad49202beabf7bfb9420fff2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 3 Jul 2025 17:01:15 -0500 Subject: [PATCH 22/78] Updated readme --- python/README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/python/README.md b/python/README.md index dcbf72b..0ad2665 100644 --- a/python/README.md +++ b/python/README.md @@ -17,5 +17,28 @@ nvbench_DIR=$(pwd)/nvbench_install/lib/cmake CUDACXX=/usr/local/cuda/bin/nvcc pi ### Verify that package works ``` +export PYTHONPATH=$(pwd):${PYTHONPATH} python test/run_1.py ``` + +### Run examples + +``` +# Example benchmarking numba.cuda kernel +python examples/throughput.py +``` + +``` +# Example benchmarking kernels authored using cuda.core +python examples/axes.py +``` + +``` +# Example benchmarking algorithms from cuda.cccl.parallel +python examples/cccl_parallel_segmented_reduce.py +``` + +``` +# Example benchmarking CuPy function +python examples/cupy_extract.py +``` From 8c112d529f92d5ae1ab8f73150ed216c19025ee9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 09:40:35 -0500 Subject: [PATCH 23/78] Include Pybind11 headers before anything else See https://github.com/NVIDIA/nvbench/pull/237#discussion_r2183703828 for the rationale --- python/src/py_nvbench.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 3551ff3..7fba033 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -16,6 +16,12 @@ * limitations under the License. 
*/ +// clang-format off +// Include Pybind11 headers first thing +#include +#include +// clang-format on + #include #include @@ -27,9 +33,6 @@ #include #include -#include -#include - namespace py = pybind11; namespace From 7f9d672cecb1619a41c1da25e137e626d91bd405 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 09:42:08 -0500 Subject: [PATCH 24/78] Raise Python exception if error is encountered while executing benchmarks Introduce new exception type to raise on errors that occurred while NVBench runs benchmarks. --- python/src/py_nvbench.cpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 7fba033..bb714dd 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -101,6 +101,10 @@ private: std::shared_ptr m_fn; }; +class nvbench_run_error : std::runtime_error +{}; +constinit py::handle benchmark_exc{}; + class GlobalBenchmarkRegistry { bool m_finalized; @@ -173,17 +177,23 @@ public: NVBENCH_MAIN_PRINT_RESULTS(parser); } /* Tear down parser before finalization */ } + catch (py::error_already_set &e) + { + py::raise_from(e, benchmark_exc.ptr(), "Python error raised "); + throw py::error_already_set(); + } catch (const std::exception &e) { std::stringstream ss; - ss << "Caught exception while running benchmakrs: "; + ss << "Caught exception while running benchmarks: "; ss << e.what(); - ss << "\n"; - py::print(py::cast(ss.str(), py::return_value_policy::move)); + + const std::string &exc_message = ss.str(); + py::set_error(benchmark_exc, exc_message.c_str()); } catch (...) 
{ - py::print("Caught exception in nvbench_main\n"); + py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main"); } } }; @@ -490,6 +500,10 @@ PYBIND11_MODULE(_nvbench, m) summ.set_float64("value", value); }); + // Use handle to take a memory leak here, since this object's destructor may be called after + // interpreter has shut down + benchmark_exc = + py::exception(m, "NVBenchRuntimeException", PyExc_RuntimeError).release(); // == STEP 6 // ATTN: nvbench::benchmark_manager is a singleton From aa2b4d99607ec563ff0b81c3cc1b0c2e2e2289de Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:56:31 -0500 Subject: [PATCH 25/78] Add Benchmark.setIsCPUOnly API --- python/src/py_nvbench.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index bb714dd..80e608b 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -286,6 +286,13 @@ PYBIND11_MODULE(_nvbench, m) return std::ref(self); }, py::return_value_policy::reference); + py_benchmark_cls.def( + "setIsCPUOnly", + [](nvbench::benchmark_base &self, bool is_cpu_only) { + self.set_is_cpu_only(is_cpu_only); + return std::ref(self); + }, + py::return_value_policy::reference); // == STEP 5 // Define PyState class From 6b4da8c5cbbfb4962f0a0b79f30079391cff07d0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:57:28 -0500 Subject: [PATCH 26/78] add comments to body of launcher_fn lambda in State.exec method --- python/src/py_nvbench.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 80e608b..962bdbd 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -449,9 +449,13 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def( "exec", - [](nvbench::state &state, py::object fn, bool batched, 
bool sync) { - auto launcher_fn = [fn](nvbench::launch &launch_descr) -> void { - fn(py::cast(std::ref(launch_descr), py::return_value_policy::reference)); + [](nvbench::state &state, py::object callable_fn, bool batched, bool sync) { + // wrapper to invoke Python callable + auto launcher_fn = [callable_fn](nvbench::launch &launch_descr) -> void { + // cast C++ object to python object + auto launch_pyarg = py::cast(std::ref(launch_descr), py::return_value_policy::reference); + // call Python callable + callable_fn(launch_pyarg); }; if (sync) From c960ef75ccf1d4ce957c850989b2ab7b04b72f2f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:59:37 -0500 Subject: [PATCH 27/78] Add examples/cpu_only.py based on code from PR feedback https://github.com/NVIDIA/nvbench/pull/237#issuecomment-3058594793 --- python/examples/cpu_only.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 python/examples/cpu_only.py diff --git a/python/examples/cpu_only.py b/python/examples/cpu_only.py new file mode 100644 index 0000000..4831c6d --- /dev/null +++ b/python/examples/cpu_only.py @@ -0,0 +1,18 @@ +import sys +import time + +import cuda.nvbench as nvbench + + +def throughput_bench(state: nvbench.State) -> None: + def launcher(launch: nvbench.Launch): + time.sleep(1) + + state.exec(launcher) + + +if __name__ == "__main__": + b = nvbench.register(throughput_bench) + b.setIsCPUOnly(True) + + nvbench.run_all_benchmarks(sys.argv) From d3071fb0389f565b50b839f3c5f996b45d6ab6c0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 11:32:48 -0500 Subject: [PATCH 28/78] Addressed PR feedback re: definition of benchmark_wrapper_t See https://github.com/NVIDIA/nvbench/pull/237#discussion_r2183749750 --- python/src/py_nvbench.cpp | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git 
a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 962bdbd..d25e6c7 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -73,21 +73,29 @@ struct PyObjectDeleter struct benchmark_wrapper_t { - benchmark_wrapper_t() - : m_fn() {}; + benchmark_wrapper_t() = default; + explicit benchmark_wrapper_t(py::object o) : m_fn{std::shared_ptr(new py::object(o), PyObjectDeleter{})} - {} + { + if (!PyCallable_Check(m_fn->ptr())) + { + throw py::value_error("Argument must be a callable"); + } + } - benchmark_wrapper_t(const benchmark_wrapper_t &other) - : m_fn{other.m_fn} - {} + // Only copy constructor is used, delete copy-assign, and moves + benchmark_wrapper_t(const benchmark_wrapper_t &other) = default; benchmark_wrapper_t &operator=(const benchmark_wrapper_t &other) = delete; benchmark_wrapper_t(benchmark_wrapper_t &&) noexcept = delete; benchmark_wrapper_t &operator=(benchmark_wrapper_t &&) noexcept = delete; void operator()(nvbench::state &state, nvbench::type_list<>) { + if (!m_fn) + { + throw std::runtime_error("No function to execute"); + } // box as Python object, using reference semantics auto arg = py::cast(std::ref(state), py::return_value_policy::reference); From 11ae98389d763b9b36dd1c5dc7b4cdd335e2ee72 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 11:44:25 -0500 Subject: [PATCH 29/78] Replace use of py::object copy constructor with use of move constructor Change explicit constructor of benchmark_wrapper_t to use move-constructor of py::object instead of copy constructor by replacing `py::object(o)` with `py::object(std::move(o))`. 
--- python/src/py_nvbench.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index d25e6c7..18ca05d 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -76,7 +76,7 @@ struct benchmark_wrapper_t benchmark_wrapper_t() = default; explicit benchmark_wrapper_t(py::object o) - : m_fn{std::shared_ptr(new py::object(o), PyObjectDeleter{})} + : m_fn{std::shared_ptr(new py::object(std::move(o)), PyObjectDeleter{})} { if (!PyCallable_Check(m_fn->ptr())) { From 81fff085b9f60f5e1ef6207b5f7060ea8479ddd4 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:48:24 -0500 Subject: [PATCH 30/78] Change method naming from camelCase to snake_case This ensures names of Python API methods are consistent with those of C++ counterparts. --- python/src/py_nvbench.cpp | 79 ++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 18ca05d..b5531a2 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -255,7 +255,7 @@ PYBIND11_MODULE(_nvbench, m) auto py_launch_cls = py::class_(m, "Launch"); py_launch_cls.def( - "getStream", + "get_stream", [](nvbench::launch &launch) { return std::ref(launch.get_stream()); }, py::return_value_policy::reference); @@ -263,16 +263,16 @@ PYBIND11_MODULE(_nvbench, m) // Define Benchmark class auto py_benchmark_cls = py::class_(m, "Benchmark"); - py_benchmark_cls.def("getName", &nvbench::benchmark_base::get_name); + py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name); py_benchmark_cls.def( - "addInt64Axis", + "add_int64_axis", [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { self.add_int64_axis(std::move(name), data); return std::ref(self); }, py::return_value_policy::reference); py_benchmark_cls.def( - "addFloat64Axis", + 
"add_float64_axis", [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { @@ -281,21 +281,21 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference); py_benchmark_cls.def( - "addStringAxis", + "add_string_axis", [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { self.add_string_axis(std::move(name), data); return std::ref(self); }, py::return_value_policy::reference); py_benchmark_cls.def( - "setName", + "set_name", [](nvbench::benchmark_base &self, std::string name) { self.set_name(std::move(name)); return std::ref(self); }, py::return_value_policy::reference); py_benchmark_cls.def( - "setIsCPUOnly", + "set_is_cpu_only", [](nvbench::benchmark_base &self, bool is_cpu_only) { self.set_is_cpu_only(is_cpu_only); return std::ref(self); @@ -373,13 +373,13 @@ PYBIND11_MODULE(_nvbench, m) using state_ref_t = std::reference_wrapper; auto pystate_cls = py::class_(m, "State"); - pystate_cls.def("hasDevice", [](const nvbench::state &state) -> bool { + pystate_cls.def("has_device", [](const nvbench::state &state) -> bool { return static_cast(state.get_device()); }); - pystate_cls.def("hasPrinters", [](const nvbench::state &state) -> bool { + pystate_cls.def("has_printers", [](const nvbench::state &state) -> bool { return state.get_benchmark().get_printer().has_value(); }); - pystate_cls.def("getDevice", [](const nvbench::state &state) { + pystate_cls.def("get_device", [](const nvbench::state &state) { auto dev = state.get_device(); if (dev.has_value()) { @@ -389,32 +389,32 @@ PYBIND11_MODULE(_nvbench, m) }); pystate_cls.def( - "getStream", + "get_stream", [](nvbench::state &state) { return std::ref(state.get_cuda_stream()); }, py::return_value_policy::reference); - pystate_cls.def("getInt64", &nvbench::state::get_int64); - pystate_cls.def("getInt64", &nvbench::state::get_int64_or_default); + pystate_cls.def("get_int64", &nvbench::state::get_int64); + pystate_cls.def("get_int64", 
&nvbench::state::get_int64_or_default); - pystate_cls.def("getFloat64", &nvbench::state::get_float64); - pystate_cls.def("getFloat64", &nvbench::state::get_float64_or_default); + pystate_cls.def("get_float64", &nvbench::state::get_float64); + pystate_cls.def("get_float64", &nvbench::state::get_float64_or_default); - pystate_cls.def("getString", &nvbench::state::get_string); - pystate_cls.def("getString", &nvbench::state::get_string_or_default); + pystate_cls.def("get_string", &nvbench::state::get_string); + pystate_cls.def("get_string", &nvbench::state::get_string_or_default); - pystate_cls.def("addElementCount", + pystate_cls.def("add_element_count", &nvbench::state::add_element_count, py::arg("count"), py::arg("column_name") = py::str("")); - pystate_cls.def("setElementCount", &nvbench::state::set_element_count); - pystate_cls.def("getElementCount", &nvbench::state::get_element_count); + pystate_cls.def("set_element_count", &nvbench::state::set_element_count); + pystate_cls.def("get_element_count", &nvbench::state::get_element_count); pystate_cls.def("skip", &nvbench::state::skip); - pystate_cls.def("isSkipped", &nvbench::state::is_skipped); - pystate_cls.def("getSkipReason", &nvbench::state::get_skip_reason); + pystate_cls.def("is_skipped", &nvbench::state::is_skipped); + pystate_cls.def("get_skip_reason", &nvbench::state::get_skip_reason); pystate_cls.def( - "addGlobalMemoryReads", + "add_global_memory_reads", [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void { state.add_global_memory_reads(nbytes, column_name); }, @@ -423,7 +423,7 @@ PYBIND11_MODULE(_nvbench, m) py::pos_only{}, py::arg("column_name") = py::str("")); pystate_cls.def( - "addGlobalMemoryWrites", + "add_global_memory_writes", [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void { state.add_global_memory_writes(nbytes, column_name); }, @@ -432,28 +432,28 @@ PYBIND11_MODULE(_nvbench, m) py::pos_only{}, py::arg("column_name") = 
py::str("")); pystate_cls.def( - "getBenchmark", + "get_benchmark", [](const nvbench::state &state) { return std::ref(state.get_benchmark()); }, py::return_value_policy::reference); - pystate_cls.def("getThrottleThreshold", &nvbench::state::get_throttle_threshold); + pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold); - pystate_cls.def("getMinSamples", &nvbench::state::get_min_samples); - pystate_cls.def("setMinSamples", &nvbench::state::set_min_samples); + pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples); + pystate_cls.def("set_min_samples", &nvbench::state::set_min_samples); - pystate_cls.def("getDisableBlockingKernel", &nvbench::state::get_disable_blocking_kernel); - pystate_cls.def("setDisableBlockingKernel", &nvbench::state::set_disable_blocking_kernel); + pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel); + pystate_cls.def("set_disable_blocking_kernel", &nvbench::state::set_disable_blocking_kernel); - pystate_cls.def("getRunOnce", &nvbench::state::get_run_once); - pystate_cls.def("setRunOnce", &nvbench::state::set_run_once); + pystate_cls.def("get_run_once", &nvbench::state::get_run_once); + pystate_cls.def("set_run_once", &nvbench::state::set_run_once); - pystate_cls.def("getTimeout", &nvbench::state::get_timeout); - pystate_cls.def("setTimeout", &nvbench::state::set_timeout); + pystate_cls.def("get_timeout", &nvbench::state::get_timeout); + pystate_cls.def("set_timeout", &nvbench::state::set_timeout); - pystate_cls.def("getBlockingKernelTimeout", &nvbench::state::get_blocking_kernel_timeout); - pystate_cls.def("setBlockingKernelTimeout", &nvbench::state::set_blocking_kernel_timeout); + pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout); + pystate_cls.def("set_blocking_kernel_timeout", &nvbench::state::set_blocking_kernel_timeout); - pystate_cls.def("collectCUPTIMetrics", &nvbench::state::collect_cupti_metrics); - 
pystate_cls.def("isCUPTIRequired", &nvbench::state::is_cupti_required); + pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics); + pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required); pystate_cls.def( "exec", @@ -533,6 +533,7 @@ PYBIND11_MODULE(_nvbench, m) m.def( "register", [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); }, + "Register benchmark function of type Callable[[nvbench.State], None]", py::return_value_policy::reference); m.def( @@ -545,6 +546,6 @@ PYBIND11_MODULE(_nvbench, m) std::vector args = py::cast>(argv); global_registry->run(args); }, - "Run all benchmarks", + "Run all registered benchmarks", py::arg("argv") = py::list()); } From e58951837614b6f2a186609590ba4d57a2cd6c08 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:50:28 -0500 Subject: [PATCH 31/78] Change test and examples from using camelCase to using snake_case as implementation changed --- python/examples/auto_throughput.py | 16 +++--- python/examples/axes.py | 57 ++++++++++--------- .../cccl_parallel_segmented_reduce.py | 23 ++++---- python/examples/cpu_only.py | 2 +- python/examples/cupy_extract.py | 22 +++---- python/examples/exec_tag_sync.py | 10 ++-- python/examples/skip.py | 10 ++-- python/examples/throughput.py | 24 +++----- python/test/run_1.py | 42 +++++++------- 9 files changed, 101 insertions(+), 105 deletions(-) diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 83c7312..0d05aa0 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -41,18 +41,18 @@ def make_kernel(items_per_thread: int) -> Callable: def throughput_bench(state: nvbench.State) -> None: - stride = state.getInt64("Stride") - ipt = state.getInt64("ItemsPerThread") + stride = state.get_int64("Stride") + ipt = state.get_int64("ItemsPerThread") nbytes = 128 * 1024 * 1024 elements = nbytes // 
np.dtype(np.int32).itemsize - alloc_stream = as_cuda_Stream(state.getStream()) + alloc_stream = as_cuda_Stream(state.get_stream()) inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) - state.addElementCount(elements, column_name="Elements") - state.collectCUPTIMetrics() + state.add_element_count(elements, column_name="Elements") + state.collect_cupti_metrics() threads_per_block = 256 blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block @@ -66,7 +66,7 @@ def throughput_bench(state: nvbench.State) -> None: ) def launcher(launch: nvbench.Launch): - exec_stream = as_cuda_Stream(launch.getStream()) + exec_stream = as_cuda_Stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr ) @@ -76,7 +76,7 @@ def throughput_bench(state: nvbench.State) -> None: if __name__ == "__main__": b = nvbench.register(throughput_bench) - b.addInt64Axis("Stride", [1, 2, 4]) - b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4]) + b.add_int64_axis("Stride", [1, 2, 4]) + b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) nvbench.run_all_benchmarks(sys.argv) diff --git a/python/examples/axes.py b/python/examples/axes.py index fc710b7..26fc820 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -43,13 +43,13 @@ __global__ void sleep_kernel(double seconds) { def simple(state: nvbench.State): - state.setMinSamples(1000) + state.set_min_samples(1000) sleep_dur = 1e-3 krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) def launcher(launch: nvbench.Launch): - s = as_core_Stream(launch.getStream()) + s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) @@ -57,12 +57,13 @@ def simple(state: nvbench.State): def single_float64_axis(state: nvbench.State): # get axis value, or default - sleep_dur = 
state.getFloat64("Duration", 3.14e-4) + default_sleep_dur = 3.14e-4 + sleep_dur = state.get_float64("Duration", default_sleep_dur) krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) def launcher(launch: nvbench.Launch): - s = as_core_Stream(launch.getStream()) + s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) @@ -104,19 +105,19 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n) def copy_sweep_grid_shape(state: nvbench.State): - block_size = state.getInt64("BlockSize") - num_blocks = state.getInt64("NumBlocks") + block_size = state.get_int64("BlockSize") + num_blocks = state.get_int64("NumBlocks") # Number of int32 elements in 256MiB nbytes = 256 * 1024 * 1024 num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0)) - state.addElementCount(num_values) - state.addGlobalMemoryReads(nbytes) - state.addGlobalMemoryWrites(nbytes) + state.add_element_count(num_values) + state.add_global_memory_reads(nbytes) + state.add_global_memory_writes(nbytes) - dev_id = state.getDevice() - alloc_s = as_core_Stream(state.getStream()) + dev_id = state.get_device() + alloc_s = as_core_Stream(state.get_stream()) input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) @@ -124,20 +125,20 @@ def copy_sweep_grid_shape(state: nvbench.State): launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0) def launcher(launch: nvbench.Launch): - s = as_core_Stream(launch.getStream()) + s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) state.exec(launcher) def copy_type_sweep(state: nvbench.State): - type_id = state.getInt64("TypeID") + type_id = state.get_int64("TypeID") types_map = { - 0: (ctypes.c_uint8, "::cuda::std::uint8_t"), - 1: (ctypes.c_uint16, "::cuda::std::uint16_t"), - 2: (ctypes.c_uint32, 
"::cuda::std::uint32_t"), - 3: (ctypes.c_uint64, "::cuda::std::uint64_t"), + 0: (ctypes.c_uint8, "cuda::std::uint8_t"), + 1: (ctypes.c_uint16, "cuda::std::uint16_t"), + 2: (ctypes.c_uint32, "cuda::std::uint32_t"), + 3: (ctypes.c_uint64, "cuda::std::uint64_t"), 4: (ctypes.c_float, "float"), 5: (ctypes.c_double, "double"), } @@ -149,12 +150,12 @@ def copy_type_sweep(state: nvbench.State): nbytes = 256 * 1024 * 1024 num_values = nbytes // ctypes.sizeof(value_ctype(0)) - state.addElementCount(num_values) - state.addGlobalMemoryReads(nbytes) - state.addGlobalMemoryWrites(nbytes) + state.add_element_count(num_values) + state.add_global_memory_reads(nbytes) + state.add_global_memory_writes(nbytes) - dev_id = state.getDevice() - alloc_s = as_core_Stream(state.getStream()) + dev_id = state.get_device() + alloc_s = as_core_Stream(state.get_stream()) input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) @@ -162,7 +163,7 @@ def copy_type_sweep(state: nvbench.State): launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) def launcher(launch: nvbench.Launch): - s = as_core_Stream(launch.getStream()) + s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) state.exec(launcher) @@ -175,13 +176,15 @@ if __name__ == "__main__": # benchmark with no axes, that uses default value nvbench.register(default_value) # specify axis - nvbench.register(single_float64_axis).addFloat64Axis("Duration", [7e-5, 1e-4, 5e-4]) + nvbench.register(single_float64_axis).add_float64_axis( + "Duration", [7e-5, 1e-4, 5e-4] + ) copy1_bench = nvbench.register(copy_sweep_grid_shape) - copy1_bench.addInt64Axis("BlockSize", [2**x for x in range(6, 10, 2)]) - copy1_bench.addInt64Axis("NumBlocks", [2**x for x in range(6, 10, 2)]) + copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)]) + copy1_bench.add_int64_axis("NumBlocks", [2**x for x 
in range(6, 10, 2)]) copy2_bench = nvbench.register(copy_type_sweep) - copy2_bench.addInt64Axis("TypeID", range(0, 6)) + copy2_bench.add_int64_axis("TypeID", range(0, 6)) nvbench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index 9a6a26d..bbc2649 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -35,15 +35,15 @@ def as_cp_ExternalStream( def segmented_reduce(state: nvbench.State): "Benchmark segmented_reduce example" - n_elems = state.getInt64("numElems") - n_cols = state.getInt64("numCols") + n_elems = state.get_int64("numElems") + n_cols = state.get_int64("numCols") n_rows = n_elems // n_cols state.add_summary("numRows", n_rows) - state.collectCUPTIMetrics() + state.collect_cupti_metrics() - dev_id = state.getDevice() - cp_stream = as_cp_ExternalStream(state.getStream(), dev_id) + dev_id = state.get_device() + cp_stream = as_cp_ExternalStream(state.get_stream(), dev_id) with cp_stream: rng = cp.random.default_rng() @@ -75,20 +75,19 @@ def segmented_reduce(state: nvbench.State): d_input, d_output, start_offsets, end_offsets, add_op, h_init ) - # print(1) - cccl_stream = as_cccl_Stream(state.getStream()) - # print(2, core_stream, core_stream.__cuda_stream__()) + cccl_stream = as_cccl_Stream(state.get_stream()) + # query size of temporary storage and allocate temp_nbytes = alg( None, d_input, d_output, n_rows, start_offsets, end_offsets, h_init, cccl_stream ) h_init = np.zeros(tuple(), dtype=np.int32) - # print(3) + with cp_stream: temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8) def launcher(launch: nvbench.Launch): - s = as_cccl_Stream(launch.getStream()) + s = as_cccl_Stream(launch.get_stream()) alg( temp_storage, d_input, @@ -105,7 +104,7 @@ def segmented_reduce(state: nvbench.State): if __name__ == "__main__": b = nvbench.register(segmented_reduce) - b.addInt64Axis("numElems", [2**20, 
2**22, 2**24]) - b.addInt64Axis("numCols", [1024, 2048, 4096, 8192]) + b.add_int64_axis("numElems", [2**20, 2**22, 2**24]) + b.add_int64_axis("numCols", [1024, 2048, 4096, 8192]) nvbench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cpu_only.py b/python/examples/cpu_only.py index 4831c6d..16fd306 100644 --- a/python/examples/cpu_only.py +++ b/python/examples/cpu_only.py @@ -13,6 +13,6 @@ def throughput_bench(state: nvbench.State) -> None: if __name__ == "__main__": b = nvbench.register(throughput_bench) - b.setIsCPUOnly(True) + b.set_is_cpu_only(True) nvbench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index ffdb325..97b06e1 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -12,18 +12,18 @@ def as_cp_ExternalStream( def cupy_extract_by_mask(state: nvbench.State): - n_cols = state.getInt64("numCols") - n_rows = state.getInt64("numRows") + n_cols = state.get_int64("numCols") + n_rows = state.get_int64("numRows") - dev_id = state.getDevice() - cp_s = as_cp_ExternalStream(state.getStream(), dev_id) + dev_id = state.get_device() + cp_s = as_cp_ExternalStream(state.get_stream(), dev_id) - state.collectCUPTIMetrics() - state.addElementCount(n_rows * n_cols, "# Elements") - state.addGlobalMemoryReads( + state.collect_cupti_metrics() + state.add_element_count(n_rows * n_cols, "# Elements") + state.add_global_memory_reads( n_rows * n_cols * (cp.dtype(cp.int32).itemsize + cp.dtype("?").itemsize) ) - state.addGlobalMemoryWrites(n_rows * n_cols * (cp.dtype(cp.int32).itemsize)) + state.add_global_memory_writes(n_rows * n_cols * (cp.dtype(cp.int32).itemsize)) with cp_s: X = cp.full((n_cols, n_rows), fill_value=3, dtype=cp.int32) @@ -31,7 +31,7 @@ def cupy_extract_by_mask(state: nvbench.State): _ = X[mask] def launcher(launch: nvbench.Launch): - with as_cp_ExternalStream(launch.getStream(), dev_id): + with as_cp_ExternalStream(launch.get_stream(), dev_id): _ = X[mask] 
state.exec(launcher, sync=True) @@ -39,7 +39,7 @@ def cupy_extract_by_mask(state: nvbench.State): if __name__ == "__main__": b = nvbench.register(cupy_extract_by_mask) - b.addInt64Axis("numCols", [1024, 2048, 4096, 2 * 4096]) - b.addInt64Axis("numRows", [1024, 2048, 4096, 2 * 4096]) + b.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096]) + b.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096]) nvbench.run_all_benchmarks(sys.argv) diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py index 8148c30..9315983 100644 --- a/python/examples/exec_tag_sync.py +++ b/python/examples/exec_tag_sync.py @@ -45,17 +45,17 @@ def synchronizing_bench(state: nvbench.State): n_values = 64 * 1024 * 1024 n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0)) - alloc_s = as_core_Stream(state.getStream()) - buffer = core.DeviceMemoryResource(state.getDevice()).allocate(n_bytes, alloc_s) + alloc_s = as_core_Stream(state.get_stream()) + buffer = core.DeviceMemoryResource(state.get_device()).allocate(n_bytes, alloc_s) - state.addElementCount(n_values, "Items") - state.addGlobalMemoryWrites(n_bytes, "Size") + state.add_element_count(n_values, "Items") + state.add_global_memory_writes(n_bytes, "Size") krn = make_fill_kernel() launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) def launcher(launch: nvbench.Launch): - s = as_core_Stream(launch.getStream()) + s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, buffer, 0, n_values) s.sync() diff --git a/python/examples/skip.py b/python/examples/skip.py index fba91f5..bb75b57 100644 --- a/python/examples/skip.py +++ b/python/examples/skip.py @@ -42,8 +42,8 @@ __global__ void sleep_kernel(double seconds) { def runtime_skip(state: nvbench.State): - duration = state.getFloat64("Duration") - kramble = state.getString("Kramble") + duration = state.get_float64("Duration") + kramble = state.get_string("Kramble") # Skip Baz benchmarks with 0.8 ms duration if kramble == "Baz" and 
duration < 0.8e-3: @@ -59,7 +59,7 @@ def runtime_skip(state: nvbench.State): launch_cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0) def launcher(launch: nvbench.Launch): - s = as_core_Stream(launch.getStream()) + s = as_core_Stream(launch.get_stream()) core.launch(s, launch_cfg, krn, duration) state.exec(launcher) @@ -67,7 +67,7 @@ def runtime_skip(state: nvbench.State): if __name__ == "__main__": b = nvbench.register(runtime_skip) - b.addFloat64Axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)]) - b.addStringAxis("Kramble", ["Foo", "Bar", "Baz"]) + b.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)]) + b.add_string_axis("Kramble", ["Foo", "Bar", "Baz"]) nvbench.run_all_benchmarks(sys.argv) diff --git a/python/examples/throughput.py b/python/examples/throughput.py index eaae2ef..13aba4f 100644 --- a/python/examples/throughput.py +++ b/python/examples/throughput.py @@ -41,33 +41,27 @@ def make_kernel(items_per_thread: int) -> Callable: def throughput_bench(state: nvbench.State) -> None: - stride = state.getInt64("Stride") - ipt = state.getInt64("ItemsPerThread") + stride = state.get_int64("Stride") + ipt = state.get_int64("ItemsPerThread") nbytes = 128 * 1024 * 1024 elements = nbytes // np.dtype(np.int32).itemsize - alloc_stream = as_cuda_Stream(state.getStream()) + alloc_stream = as_cuda_Stream(state.get_stream()) inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) - state.addElementCount(elements, column_name="Elements") - state.addGlobalMemoryReads(inp_arr.nbytes, column_name="Datasize") - state.addGlobalMemoryWrites(inp_arr.nbytes) + state.add_element_count(elements, column_name="Elements") + state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize") + state.add_global_memory_writes(inp_arr.nbytes) threads_per_block = 256 blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block krn = make_kernel(ipt) - # 
warm-up call ensures that kernel is loaded into context - # before blocking kernel is launched - krn[blocks_in_grid, threads_per_block, alloc_stream, 0]( - stride, elements, inp_arr, out_arr - ) - def launcher(launch: nvbench.Launch): - exec_stream = as_cuda_Stream(launch.getStream()) + exec_stream = as_cuda_Stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr ) @@ -77,7 +71,7 @@ def throughput_bench(state: nvbench.State) -> None: if __name__ == "__main__": b = nvbench.register(throughput_bench) - b.addInt64Axis("Stride", [1, 2, 4]) - b.addInt64Axis("ItemsPerThread", [1, 2, 3, 4]) + b.add_int64_axis("Stride", [1, 2, 4]) + b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) nvbench.run_all_benchmarks(sys.argv) diff --git a/python/test/run_1.py b/python/test/run_1.py index 7c8b6b8..38031cf 100755 --- a/python/test/run_1.py +++ b/python/test/run_1.py @@ -14,18 +14,18 @@ def kernel(a, b, c): c[tid] = a[tid] + b[tid] -def getNumbaStream(launch): - return cuda.external_stream(launch.getStream().addressof()) +def get_numba_stream(launch): + return cuda.external_stream(launch.get_stream().addressof()) def add_two(state): # state.skip("Skipping this benchmark for no reason") - N = state.getInt64("elements") + N = state.get_int64("elements") a = cuda.to_device(np.random.random(N)) c = cuda.device_array_like(a) - state.addGlobalMemoryReads(a.nbytes) - state.addGlobalMemoryWrites(c.nbytes) + state.add_global_memory_reads(a.nbytes) + state.add_global_memory_writes(c.nbytes) nthreads = 256 nblocks = (len(a) + nthreads - 1) // nthreads @@ -35,22 +35,22 @@ def add_two(state): cuda.synchronize() def kernel_launcher(launch): - stream = getNumbaStream(launch) + stream = get_numba_stream(launch) kernel[nblocks, nthreads, stream](a, a, c) state.exec(kernel_launcher, batched=True, sync=True) def add_float(state): - N = state.getInt64("elements") - v = state.getFloat64("v") - name = state.getString("name") + N = 
state.get_int64("elements") + v = state.get_float64("v") + name = state.get_string("name") a = cuda.to_device(np.random.random(N).astype(np.float32)) b = cuda.to_device(np.random.random(N).astype(np.float32)) c = cuda.device_array_like(a) - state.addGlobalMemoryReads(a.nbytes + b.nbytes) - state.addGlobalMemoryWrites(c.nbytes) + state.add_global_memory_reads(a.nbytes + b.nbytes) + state.add_global_memory_writes(c.nbytes) nthreads = 64 nblocks = (len(a) + nthreads - 1) // nthreads @@ -58,26 +58,26 @@ def add_float(state): def kernel_launcher(launch): _ = v _ = name - stream = getNumbaStream(launch) + stream = get_numba_stream(launch) kernel[nblocks, nthreads, stream](a, b, c) state.exec(kernel_launcher, batched=True, sync=True) def add_three(state): - N = state.getInt64("elements") + N = state.get_int64("elements") a = cuda.to_device(np.random.random(N).astype(np.float32)) b = cuda.to_device(np.random.random(N).astype(np.float32)) c = cuda.device_array_like(a) - state.addGlobalMemoryReads(a.nbytes + b.nbytes) - state.addGlobalMemoryWrites(c.nbytes) + state.add_global_memory_reads(a.nbytes + b.nbytes) + state.add_global_memory_writes(c.nbytes) nthreads = 256 nblocks = (len(a) + nthreads - 1) // nthreads def kernel_launcher(launch): - stream = getNumbaStream(launch) + stream = get_numba_stream(launch) kernel[nblocks, nthreads, stream](a, b, c) state.exec(kernel_launcher, batched=True, sync=True) @@ -86,18 +86,18 @@ def add_three(state): def register_benchmarks(): ( - nvbench.register(add_two).addInt64Axis( + nvbench.register(add_two).add_int64_axis( "elements", [2**pow2 for pow2 in range(20, 23)] ) ) ( nvbench.register(add_float) - .addFloat64Axis("v", [0.1, 0.3]) - .addStringAxis("name", ["Anne", "Lynda"]) - .addInt64Axis("elements", [2**pow2 for pow2 in range(20, 23)]) + .add_float64_axis("v", [0.1, 0.3]) + .add_string_axis("name", ["Anne", "Lynda"]) + .add_int64_axis("elements", [2**pow2 for pow2 in range(20, 23)]) ) ( - nvbench.register(add_three).addInt64Axis( + 
nvbench.register(add_three).add_int64_axis( "elements", [2**pow2 for pow2 in range(20, 22)] ) ) From 9ab642cf6992ef04db3de4ea0c4c390f3b3b0a1d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:55:58 -0500 Subject: [PATCH 32/78] Add suggestion to create conda environment with recent CMake to build nvbench --- python/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/README.md b/python/README.md index 0ad2665..22150b0 100644 --- a/python/README.md +++ b/python/README.md @@ -6,6 +6,15 @@ This package provides Python API to CUDA Kernel Benchmarking Library `NVBench`. ### Build `NVBench` project +Since `nvbench` requires a rather new version of CMake (>=3.30.4), either build CMake from sources, or create a conda environment with a recent version of CMake, using + +``` +conda create -n build_env --yes cmake ninja +conda activate build_env +``` + +Now switch to python folder, configure and install NVBench library, and install the package in editable mode: + ``` cd nvbench/python cmake -B nvbench_build --preset nvbench-ci -S $(pwd)/.. -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DNVBench_ENABLE_EXAMPLES=OFF -DCMAKE_INSTALL_PREFIX=$(pwd)/nvbench_install From e426368485bdb7cf9b88ed43e1698fc7cbae6fe5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 13:31:34 -0500 Subject: [PATCH 33/78] Correct propagating nvbench_main exceptions to Python python examples/cpu_only.py --run-once -d 0 --output foo.md used to trip SystemError, returned a result with an exception set. It now returns a clean NVBenchmarkError exception. 
--- python/src/py_nvbench.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index b5531a2..822f9e1 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -192,16 +192,14 @@ public: } catch (const std::exception &e) { - std::stringstream ss; - ss << "Caught exception while running benchmarks: "; - ss << e.what(); - - const std::string &exc_message = ss.str(); + const std::string &exc_message = e.what(); py::set_error(benchmark_exc, exc_message.c_str()); + throw py::error_already_set(); } catch (...) { py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main"); + throw py::error_already_set(); } } }; From d09df0f754cf40cdb9f471e80f7c18ae8fa0ea2a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 21 Jul 2025 13:37:23 -0500 Subject: [PATCH 34/78] Expand examples/cpu_only.py Benchmark function that sleeps for 1 second on the host using CPU-only timer, as well as CPU/GPU timer that does/doesn't use blocking kernel. All three methods must report consistent values close to 1 second. 
--- python/examples/cpu_only.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/python/examples/cpu_only.py b/python/examples/cpu_only.py index 16fd306..bfd9b7e 100644 --- a/python/examples/cpu_only.py +++ b/python/examples/cpu_only.py @@ -4,15 +4,31 @@ import time import cuda.nvbench as nvbench -def throughput_bench(state: nvbench.State) -> None: +def sleep_bench(state: nvbench.State) -> None: def launcher(launch: nvbench.Launch): time.sleep(1) state.exec(launcher) +def sleep_bench_sync(state: nvbench.State) -> None: + sync = state.get_string("Sync") + sync_flag = sync == "Do sync" + + def launcher(launch: nvbench.Launch): + time.sleep(1) + + state.exec(launcher, sync=sync_flag) + + if __name__ == "__main__": - b = nvbench.register(throughput_bench) + # time function sleeping on the host + # using CPU timer only + b = nvbench.register(sleep_bench) b.set_is_cpu_only(True) + # time the same function using both CPU/GPU timers + b2 = nvbench.register(sleep_bench_sync) + b2.add_string_axis("Sync", ["Do not sync", "Do sync"]) + nvbench.run_all_benchmarks(sys.argv) From bd2b536ab40823553ab027b68ffa46ed22370eeb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:04:37 -0500 Subject: [PATCH 35/78] cpu_only -> cpu_activity Change example to illustrate timing CPU work. First example does only CPU work (sleeps), using the CPU-only timer. Second example does both CPU and GPU work (sleeps in either case). Use cold-run timer with/without sync tag to measure both CPU and GPU times. 
--- python/examples/cpu_activity.py | 81 +++++++++++++++++++++++++++++++++ python/examples/cpu_only.py | 34 -------------- 2 files changed, 81 insertions(+), 34 deletions(-) create mode 100644 python/examples/cpu_activity.py delete mode 100644 python/examples/cpu_only.py diff --git a/python/examples/cpu_activity.py b/python/examples/cpu_activity.py new file mode 100644 index 0000000..d51f6ad --- /dev/null +++ b/python/examples/cpu_activity.py @@ -0,0 +1,81 @@ +import sys +import time + +import cuda.cccl.headers as headers +import cuda.core.experimental as core +import cuda.nvbench as nvbench + +host_sleep_duration = 0.1 + + +def cpu_only_sleep_bench(state: nvbench.State) -> None: + def launcher(launch: nvbench.Launch): + time.sleep(host_sleep_duration) + + state.exec(launcher) + + +def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: + return core.Stream.from_handle(cs.addressof()) + + +def make_sleep_kernel(): + """JITs sleep_kernel(seconds)""" + src = r""" +#include +#include + +// Each launched thread just sleeps for `seconds`. 
+__global__ void sleep_kernel(double seconds) { + namespace chrono = ::cuda::std::chrono; + using hr_clock = chrono::high_resolution_clock; + + auto duration = static_cast(seconds * 1e9); + const auto ns = chrono::nanoseconds(duration); + + const auto start = hr_clock::now(); + const auto finish = start + ns; + + auto now = hr_clock::now(); + while (now < finish) + { + now = hr_clock::now(); + } +} +""" + incl = headers.get_include_paths() + opts = core.ProgramOptions(include_path=str(incl.libcudacxx)) + prog = core.Program(src, code_type="c++", options=opts) + mod = prog.compile("cubin", name_expressions=("sleep_kernel",)) + return mod.get_kernel("sleep_kernel") + + +def mixed_sleep_bench(state: nvbench.State) -> None: + sync = state.get_string("Sync") + sync_flag = sync == "Do sync" + + gpu_sleep_dur = 225e-3 + krn = make_sleep_kernel() + launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) + + def launcher(launch: nvbench.Launch): + # host overhead + time.sleep(host_sleep_duration) + # GPU computation + s = as_core_Stream(launch.get_stream()) + core.launch(s, launch_config, krn, gpu_sleep_dur) + + state.exec(launcher, sync=sync_flag) + + +if __name__ == "__main__": + # time function only doing work (sleeping) on the host + # using CPU timer only + b = nvbench.register(cpu_only_sleep_bench) + b.set_is_cpu_only(True) + + # time the function that does work on both GPU and CPU + b2 = nvbench.register(mixed_sleep_bench) + b2.add_string_axis("Sync", ["Do not sync", "Do sync"]) + + nvbench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cpu_only.py b/python/examples/cpu_only.py deleted file mode 100644 index bfd9b7e..0000000 --- a/python/examples/cpu_only.py +++ /dev/null @@ -1,34 +0,0 @@ -import sys -import time - -import cuda.nvbench as nvbench - - -def sleep_bench(state: nvbench.State) -> None: - def launcher(launch: nvbench.Launch): - time.sleep(1) - - state.exec(launcher) - - -def sleep_bench_sync(state: nvbench.State) -> None: - sync = 
state.get_string("Sync") - sync_flag = sync == "Do sync" - - def launcher(launch: nvbench.Launch): - time.sleep(1) - - state.exec(launcher, sync=sync_flag) - - -if __name__ == "__main__": - # time function sleeping on the host - # using CPU timer only - b = nvbench.register(sleep_bench) - b.set_is_cpu_only(True) - - # time the same function using both CPU/GPU timers - b2 = nvbench.register(sleep_bench_sync) - b2.add_string_axis("Sync", ["Do not sync", "Do sync"]) - - nvbench.run_all_benchmarks(sys.argv) From 13ad115ca3063f3f890dd56bb046954cc1ad6bba Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:07:07 -0500 Subject: [PATCH 36/78] Add nvbench.Benchmark.set_run_once method --- python/src/py_nvbench.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 822f9e1..7cdb24f 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -299,6 +299,13 @@ PYBIND11_MODULE(_nvbench, m) return std::ref(self); }, py::return_value_policy::reference); + py_benchmark_cls.def( + "set_run_once", + [](nvbench::benchmark_base &self, bool v) { + self.set_run_once(v); + return std::ref(self); + }, + py::return_value_policy::reference); // == STEP 5 // Define PyState class From a535a1d173aafd651f2bb2b7949bc2473185c4bb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 22 Jul 2025 13:02:22 -0500 Subject: [PATCH 37/78] Fix type annotations in cuda.nvbench, and in examples --- python/cuda/nvbench/__init__.pyi | 79 +++++++++++-------- python/examples/auto_throughput.py | 3 +- python/examples/axes.py | 6 +- .../cccl_parallel_segmented_reduce.py | 2 +- python/examples/cupy_extract.py | 2 +- python/examples/throughput.py | 3 +- 6 files changed, 52 insertions(+), 43 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 9c060af..fd7c90a 
100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -43,18 +43,27 @@ class Benchmark: Use `~register` function to create Benchmark and register it with NVBench. """ - def getName(self) -> str: + def get_name(self) -> str: "Get benchmark name" ... - def addInt64Axis(self, name: str, values: Sequence[int]) -> Self: + def add_int64_axis(self, name: str, values: Sequence[int]) -> Self: "Add integral type parameter axis with given name and values to sweep over" ... - def addFloat64Axis(self, name: str, values: Sequence[float]) -> Self: + def add_float64_axis(self, name: str, values: Sequence[float]) -> Self: "Add floating-point type parameter axis with given name and values to sweep over" ... - def addStringAxis(sef, name: str, values: Sequence[str]) -> Self: + def add_string_axis(self, name: str, values: Sequence[str]) -> Self: "Add string type parameter axis with given name and values to sweep over" ... + def set_name(self, name: str) -> Self: + "Set benchmark name" + ... + def set_is_cpu_only(self, is_cpu_only: bool) -> Self: + "Set whether this benchmark only executes on CPU" + ... + def set_run_once(self, v: bool) -> Self: + "Set whether all benchmark configurations are executed only once" + ... class Launch: """Configuration object for function launch. @@ -63,7 +72,7 @@ class Launch: ---- The class is not user-constructible. """ - def getStream(self) -> CudaStream: + def get_stream(self) -> CudaStream: "Get CUDA stream of this configuration" ... @@ -74,92 +83,91 @@ class State: ---- The class is not user-constructible. """ - def hasDevice(self) -> bool: + def has_device(self) -> bool: "True if configuration has a device" ... - def hasPrinters(self) -> bool: + def has_printers(self) -> bool: "True if configuration has a printer" ... - def getDevice(self) -> Union[int, None]: + def get_device(self) -> Union[int, None]: "Get device_id of the device from this configuration" ... 
- def getStream(self) -> CudaStream: + def get_stream(self) -> CudaStream: "CudaStream object from this configuration" ... - def getInt64(self, name: str, default_value: Optional[int] = None) -> int: + def get_int64(self, name: str, default_value: Optional[int] = None) -> int: "Get value for given Int64 axis from this configuration" ... - def getFloat64(self, name: str, default_value: Optional[float] = None) -> float: + def get_float64(self, name: str, default_value: Optional[float] = None) -> float: "Get value for given Float64 axis from this configuration" ... - def getString(self, name: str, default_value: Optional[str] = None) -> str: + def get_string(self, name: str, default_value: Optional[str] = None) -> str: "Get value for given String axis from this configuration" ... - def addElementCount(self, count: int, column_name: Optional[str] = None) -> None: + def add_element_count(self, count: int, column_name: Optional[str] = None) -> None: "Add element count" ... - def setElementCount(self, count: int) -> None: + def set_element_count(self, count: int) -> None: "Set element count" ... - def getElementCount(self) -> int: + def get_element_count(self) -> int: "Get element count" ... def skip(self, reason: str) -> None: "Skip this configuration" ... - def isSkipped(self) -> bool: + def is_skipped(self) -> bool: "Has this configuration been skipped" ... - def getSkipReason(self) -> str: + def get_skip_reason(self) -> str: "Get reason provided for skipping this configuration" ... - def addGlobalMemoryReads(self, nbytes: int) -> None: + def add_global_memory_reads(self, nbytes: int, /, column_name: str = "") -> None: "Inform NVBench that given amount of bytes is being read by the benchmark from global memory" ... - def addGlobalMemoryWrites(self, nbytes: int) -> None: + def add_global_memory_writes(self, nbytes: int, /, column_name: str = "") -> None: "Inform NVBench that given amount of bytes is being written by the benchmark into global memory" ... 
- def getBenchmark(self) -> Benchmark: + def get_benchmark(self) -> Benchmark: "Get Benchmark this configuration is a part of" ... - def getThrottleThreshold(self) -> float: + def get_throttle_threshold(self) -> float: "Get throttle threshold value" ... - def getMinSamples(self) -> int: + def get_min_samples(self) -> int: "Get the number of benchmark timings NVBench performs before stopping criterion begins being used" ... - def setMinSamples(self, count: int) -> None: + def set_min_samples(self, count: int) -> None: "Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used" ... - def getDisableBlockingKernel(self) -> bool: + def get_disable_blocking_kernel(self) -> bool: "True if use of blocking kernel by NVBench is disabled, False otherwise" ... - def setDisableBlockingKernel(self, flag: bool) -> None: + def set_disable_blocking_kernel(self, flag: bool) -> None: "Use flag = True to disable use of blocking kernel by NVBench" ... - def getRunOnce(self) -> bool: + def get_run_once(self) -> bool: "Boolean flag whether configuration should only run once" ... - - def setRunOnce(self, flag: bool) -> None: + def set_run_once(self, flag: bool) -> None: "Set run-once flag for this configuration" ... - def getTimeout(self) -> float: + def get_timeout(self) -> float: "Get time-out value for benchmark execution of this configuration" ... - def setTimeout(self, duration: float) -> None: + def set_timeout(self, duration: float) -> None: "Set time-out value for benchmark execution of this configuration" ... - def getBlockingKernelTimeout(self) -> float: + def get_blocking_kernel_timeout(self) -> float: "Get time-out value for execution of blocking kernel" ... - def setBlockingKernelTimeout(self, duration: float) -> None: + def set_blocking_kernel_timeout(self, duration: float) -> None: "Set time-out value for execution of blocking kernel" ... 
- def collectCUPTIMetrics(self) -> None: + def collect_cupti_metrics(self) -> None: "Request NVBench to record CUPTI metrics while running benchmark for this configuration" ... - def isCUPTIRequired(self) -> bool: + def is_cupti_required(self) -> bool: "True if (some) CUPTI metrics are being collected" ... def exec( @@ -187,6 +195,9 @@ class State: Default: `False`. """ ... + def get_short_description(self) -> str: + "Get short description for this configuration" + ... def add_summary(self, column_name: str, value: Union[int, float, str]) -> None: "Add summary column with a value" ... diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 0d05aa0..70569b7 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -15,7 +15,6 @@ # limitations under the License. import sys -from collections.abc import Callable import cuda.nvbench as nvbench import numpy as np @@ -26,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def make_kernel(items_per_thread: int) -> Callable: +def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) diff --git a/python/examples/axes.py b/python/examples/axes.py index 26fc820..1e79eb6 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -1,6 +1,6 @@ import ctypes import sys -from typing import Optional +from typing import Dict, Optional, Tuple import cuda.cccl.headers as headers import cuda.core.experimental as core @@ -134,7 +134,7 @@ def copy_sweep_grid_shape(state: nvbench.State): def copy_type_sweep(state: nvbench.State): type_id = state.get_int64("TypeID") - types_map = { + types_map: Dict[int, Tuple[type, str]] = { 0: (ctypes.c_uint8, "cuda::std::uint8_t"), 1: (ctypes.c_uint16, "cuda::std::uint16_t"), 2: (ctypes.c_uint32, "cuda::std::uint32_t"), @@ -148,7 +148,7 
@@ def copy_type_sweep(state: nvbench.State): # Number of elements in 256MiB nbytes = 256 * 1024 * 1024 - num_values = nbytes // ctypes.sizeof(value_ctype(0)) + num_values = nbytes // ctypes.sizeof(value_ctype) state.add_element_count(num_values) state.add_global_memory_reads(nbytes) diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index bbc2649..dd77f46 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -27,7 +27,7 @@ def as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream: def as_cp_ExternalStream( - cs: nvbench.CudaStream, dev_id: int = -1 + cs: nvbench.CudaStream, dev_id: int | None = -1 ) -> cp.cuda.ExternalStream: h = cs.addressof() return cp.cuda.ExternalStream(h, dev_id) diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index 97b06e1..d1e86ef 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -5,7 +5,7 @@ import cupy as cp def as_cp_ExternalStream( - cs: nvbench.CudaStream, dev_id: int = -1 + cs: nvbench.CudaStream, dev_id: int | None = -1 ) -> cp.cuda.ExternalStream: h = cs.addressof() return cp.cuda.ExternalStream(h, dev_id) diff --git a/python/examples/throughput.py b/python/examples/throughput.py index 13aba4f..64b557b 100644 --- a/python/examples/throughput.py +++ b/python/examples/throughput.py @@ -15,7 +15,6 @@ # limitations under the License. 
import sys -from collections.abc import Callable import cuda.nvbench as nvbench import numpy as np @@ -26,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def make_kernel(items_per_thread: int) -> Callable: +def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) From 893cefb4003d40d30ba0dab3fccbbe6d82ef1e67 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 22 Jul 2025 14:12:08 -0500 Subject: [PATCH 38/78] Fix the need to set PYTHONPATH, edited README Edit wheel.packages metadata to include namespace package "cuda". Updated README to remove the work-around of setting PYTHONPATH, as it is no longer necessary. --- python/README.md | 1 - python/pyproject.toml | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index 22150b0..d08a3e6 100644 --- a/python/README.md +++ b/python/README.md @@ -26,7 +26,6 @@ nvbench_DIR=$(pwd)/nvbench_install/lib/cmake CUDACXX=/usr/local/cuda/bin/nvcc pi ### Verify that package works ``` -export PYTHONPATH=$(pwd):${PYTHONPATH} python test/run_1.py ``` diff --git a/python/pyproject.toml b/python/pyproject.toml index b371019..eee7a33 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -55,4 +55,5 @@ provider = "scikit_build_core.metadata.setuptools_scm" root = ".." 
[tool.scikit-build.wheel.packages] +"cuda" = "cuda" "cuda/nvbench" = "cuda/nvbench" From 526856db4ebbce769a2e3ddc7715fafb6bc90071 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 22 Jul 2025 14:13:41 -0500 Subject: [PATCH 39/78] Fix typo in the method spelling --- python/test/run_1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/test/run_1.py b/python/test/run_1.py index 38031cf..ac33233 100755 --- a/python/test/run_1.py +++ b/python/test/run_1.py @@ -43,7 +43,7 @@ def add_two(state): def add_float(state): N = state.get_int64("elements") - v = state.get_gloat64("v") + v = state.get_float64("v") name = state.get_string("name") a = cuda.to_device(np.random.random(N).astype(np.float32)) b = cuda.to_device(np.random.random(N).astype(np.float32)) From dc7f9edfd47c9d3bd34169f6873a2e616c52b9cd Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 22 Jul 2025 15:58:36 -0500 Subject: [PATCH 40/78] Support nvbench.Benchmark.add_int64_power_of_two_axis --- python/cuda/nvbench/__init__.pyi | 3 +++ python/src/py_nvbench.cpp | 23 +++++++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index fd7c90a..e370a8f 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -49,6 +49,9 @@ class Benchmark: def add_int64_axis(self, name: str, values: Sequence[int]) -> Self: "Add integral type parameter axis with given name and values to sweep over" ... + def add_int64_power_of_two_axis(self, name: str, values: Sequence[int]) -> Self: + "Add integral type parameter axis with given name and values to sweep over" + ... def add_float64_axis(self, name: str, values: Sequence[float]) -> Self: "Add floating-point type parameter axis with given name and values to sweep over" ... 
diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 7cdb24f..461010a 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -264,24 +264,31 @@ PYBIND11_MODULE(_nvbench, m) py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name); py_benchmark_cls.def( "add_int64_axis", - [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { - self.add_int64_axis(std::move(name), data); + [](nvbench::benchmark_base &self, std::string name, std::vector data) { + self.add_int64_axis(std::move(name), std::move(data)); + return std::ref(self); + }, + py::return_value_policy::reference); + py_benchmark_cls.def( + "add_int64_power_of_two_axis", + [](nvbench::benchmark_base &self, std::string name, std::vector data) { + self.add_int64_axis(std::move(name), + std::move(data), + nvbench::int64_axis_flags::power_of_two); return std::ref(self); }, py::return_value_policy::reference); py_benchmark_cls.def( "add_float64_axis", - [](nvbench::benchmark_base &self, - std::string name, - const std::vector &data) { - self.add_float64_axis(std::move(name), data); + [](nvbench::benchmark_base &self, std::string name, std::vector data) { + self.add_float64_axis(std::move(name), std::move(data)); return std::ref(self); }, py::return_value_policy::reference); py_benchmark_cls.def( "add_string_axis", - [](nvbench::benchmark_base &self, std::string name, const std::vector &data) { - self.add_string_axis(std::move(name), data); + [](nvbench::benchmark_base &self, std::string name, std::vector data) { + self.add_string_axis(std::move(name), std::move(data)); return std::ref(self); }, py::return_value_policy::reference); From 51fa07fab883a2cc7da87f58394fb80aba0b9f6e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 22 Jul 2025 16:13:44 -0500 Subject: [PATCH 41/78] Avoid overloading get_int64_or_default as get_int64 Introduce get_int64_or_default method, and counterparts for 
float64 and string. Provided names for Python arguments. Tried generating Python stubs automatically with ``` stubgen -m cuda.nvbench._nvbench ``` Gave up on this, since it does not include doc-strings. It would be nice to compare auto-generated _nvbench.pyi with __init__.pyi for discrepancies though. --- python/cuda/nvbench/__init__.pyi | 23 +++++--- python/src/py_nvbench.cpp | 95 +++++++++++++++++++++----------- 2 files changed, 78 insertions(+), 40 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index e370a8f..3ea1d89 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -98,13 +98,22 @@ class State: def get_stream(self) -> CudaStream: "CudaStream object from this configuration" ... - def get_int64(self, name: str, default_value: Optional[int] = None) -> int: + def get_int64(self, name: str) -> int: "Get value for given Int64 axis from this configuration" ... - def get_float64(self, name: str, default_value: Optional[float] = None) -> float: + def get_int64_or_default_value(self, name: str, default_value: int) -> int: + "Get value for given Int64 axis from this configuration" + ... + def get_float64(self, name: str) -> float: "Get value for given Float64 axis from this configuration" ... - def get_string(self, name: str, default_value: Optional[str] = None) -> str: + def get_float64_or_default_value(self, name: str, default_value: float) -> float: + "Get value for given Float64 axis from this configuration" + ... + def get_string(self, name: str) -> str: + "Get value for given String axis from this configuration" + ... + def get_string_or_default_value(self, name: str, default_value: str) -> str: "Get value for given String axis from this configuration" ... 
def add_element_count(self, count: int, column_name: Optional[str] = None) -> None: @@ -140,7 +149,7 @@ class State: def get_min_samples(self) -> int: "Get the number of benchmark timings NVBench performs before stopping criterion begins being used" ... - def set_min_samples(self, count: int) -> None: + def set_min_samples(self, min_samples_count: int) -> None: "Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used" ... def get_disable_blocking_kernel(self) -> bool: @@ -152,20 +161,20 @@ class State: def get_run_once(self) -> bool: "Boolean flag whether configuration should only run once" ... - def set_run_once(self, flag: bool) -> None: + def set_run_once(self, run_once_flag: bool) -> None: "Set run-once flag for this configuration" ... def get_timeout(self) -> float: "Get time-out value for benchmark execution of this configuration" ... def set_timeout(self, duration: float) -> None: - "Set time-out value for benchmark execution of this configuration" + "Set time-out value for benchmark execution of this configuration, in seconds" ... def get_blocking_kernel_timeout(self) -> float: "Get time-out value for execution of blocking kernel" ... def set_blocking_kernel_timeout(self, duration: float) -> None: - "Set time-out value for execution of blocking kernel" + "Set time-out value for execution of blocking kernel, in seconds" ... 
def collect_cupti_metrics(self) -> None: "Request NVBench to record CUPTI metrics while running benchmark for this configuration" diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 461010a..6efd0eb 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -405,14 +405,26 @@ PYBIND11_MODULE(_nvbench, m) [](nvbench::state &state) { return std::ref(state.get_cuda_stream()); }, py::return_value_policy::reference); - pystate_cls.def("get_int64", &nvbench::state::get_int64); - pystate_cls.def("get_int64", &nvbench::state::get_int64_or_default); + pystate_cls.def("get_int64", &nvbench::state::get_int64, py::arg("name")); + pystate_cls.def("get_int64_or_default", + &nvbench::state::get_int64_or_default, + py::arg("name"), + py::pos_only{}, + py::arg("default_value")); - pystate_cls.def("get_float64", &nvbench::state::get_float64); - pystate_cls.def("get_float64", &nvbench::state::get_float64_or_default); + pystate_cls.def("get_float64", &nvbench::state::get_float64, py::arg("name")); + pystate_cls.def("get_float64_or_default", + &nvbench::state::get_float64_or_default, + py::arg("name"), + py::pos_only{}, + py::arg("default_value")); - pystate_cls.def("get_string", &nvbench::state::get_string); - pystate_cls.def("get_string", &nvbench::state::get_string_or_default); + pystate_cls.def("get_string", &nvbench::state::get_string, py::arg("name")); + pystate_cls.def("get_string_or_default", + &nvbench::state::get_string_or_default, + py::arg("name"), + py::pos_only{}, + py::arg("default_value")); pystate_cls.def("add_element_count", &nvbench::state::add_element_count, @@ -421,7 +433,7 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("set_element_count", &nvbench::state::set_element_count); pystate_cls.def("get_element_count", &nvbench::state::get_element_count); - pystate_cls.def("skip", &nvbench::state::skip); + pystate_cls.def("skip", &nvbench::state::skip, py::arg("reason")); pystate_cls.def("is_skipped", &nvbench::state::is_skipped); 
pystate_cls.def("get_skip_reason", &nvbench::state::get_skip_reason); @@ -450,19 +462,25 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold); pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples); - pystate_cls.def("set_min_samples", &nvbench::state::set_min_samples); + pystate_cls.def("set_min_samples", + &nvbench::state::set_min_samples, + py::arg("min_samples_count")); pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel); - pystate_cls.def("set_disable_blocking_kernel", &nvbench::state::set_disable_blocking_kernel); + pystate_cls.def("set_disable_blocking_kernel", + &nvbench::state::set_disable_blocking_kernel, + py::arg("disable_blocking_kernel")); pystate_cls.def("get_run_once", &nvbench::state::get_run_once); - pystate_cls.def("set_run_once", &nvbench::state::set_run_once); + pystate_cls.def("set_run_once", &nvbench::state::set_run_once, py::arg("run_once")); pystate_cls.def("get_timeout", &nvbench::state::get_timeout); - pystate_cls.def("set_timeout", &nvbench::state::set_timeout); + pystate_cls.def("set_timeout", &nvbench::state::set_timeout, py::arg("duration")); pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout); - pystate_cls.def("set_blocking_kernel_timeout", &nvbench::state::set_blocking_kernel_timeout); + pystate_cls.def("set_blocking_kernel_timeout", + &nvbench::state::set_blocking_kernel_timeout, + py::arg("duration")); pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics); pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required); @@ -510,26 +528,36 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("get_short_description", [](const nvbench::state &state) { return state.get_short_description(); }); - pystate_cls.def("add_summary", - [](nvbench::state &state, std::string column_name, std::string value) { - auto &summ = state.add_summary("nv/python/" + 
column_name); - summ.set_string("description", "User tag: " + column_name); - summ.set_string("name", std::move(column_name)); - summ.set_string("value", std::move(value)); - }); - pystate_cls.def("add_summary", - [](nvbench::state &state, std::string column_name, std::int64_t value) { - auto &summ = state.add_summary("nv/python/" + column_name); - summ.set_string("description", "User tag: " + column_name); - summ.set_string("name", std::move(column_name)); - summ.set_int64("value", value); - }); - pystate_cls.def("add_summary", [](nvbench::state &state, std::string column_name, double value) { - auto &summ = state.add_summary("nv/python/" + column_name); - summ.set_string("description", "User tag: " + column_name); - summ.set_string("name", std::move(column_name)); - summ.set_float64("value", value); - }); + pystate_cls.def( + "add_summary", + [](nvbench::state &state, std::string column_name, std::string value) { + auto &summ = state.add_summary("nv/python/" + column_name); + summ.set_string("description", "User tag: " + column_name); + summ.set_string("name", std::move(column_name)); + summ.set_string("value", std::move(value)); + }, + py::arg("column_name"), + py::arg("value")); + pystate_cls.def( + "add_summary", + [](nvbench::state &state, std::string column_name, std::int64_t value) { + auto &summ = state.add_summary("nv/python/" + column_name); + summ.set_string("description", "User tag: " + column_name); + summ.set_string("name", std::move(column_name)); + summ.set_int64("value", value); + }, + py::arg("name"), + py::arg("value")); + pystate_cls.def( + "add_summary", + [](nvbench::state &state, std::string column_name, double value) { + auto &summ = state.add_summary("nv/python/" + column_name); + summ.set_string("description", "User tag: " + column_name); + summ.set_string("name", std::move(column_name)); + summ.set_float64("value", value); + }, + py::arg("name"), + py::arg("value")); // Use handle to take a memory leak here, since this object's 
destructor may be called after
  // interpreter has shut down
@@ -546,7 +574,8 @@ PYBIND11_MODULE(_nvbench, m)
     "register",
     [&](py::object fn) { return std::ref(global_registry->add_bench(fn)); },
     "Register benchmark function of type Callable[[nvbench.State], None]",
-    py::return_value_policy::reference);
+    py::return_value_policy::reference,
+    py::arg("benchmark_fn"));

   m.def(
     "run_all_benchmarks",

From 361c0337bee6b238ff55ed01cbc9df9bc5b73cc0 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Tue, 22 Jul 2025 16:33:48 -0500
Subject: [PATCH 42/78] Use cuda-pathfinder instead of cuda-bindings for
 Pathfinder

Removed use of __all__ per PR feedback.

Emit warnings.warn if version information could not be retrieved
from the package metadata, e.g., package has been renamed but
source code was not updated.
---
 python/cuda/nvbench/__init__.py | 45 ++++++++++++++++++++-------------
 python/pyproject.toml           |  2 +-
 2 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/python/cuda/nvbench/__init__.py b/python/cuda/nvbench/__init__.py
index 7f6f6ce..3c320af 100644
--- a/python/cuda/nvbench/__init__.py
+++ b/python/cuda/nvbench/__init__.py
@@ -1,31 +1,40 @@
 import importlib.metadata
+import warnings

-from cuda.bindings.path_finder import (  # type: ignore[import-not-found]
-    _load_nvidia_dynamic_library,
+from cuda.pathfinder import (  # type: ignore[import-not-found]
+    load_nvidia_dynamic_lib,
 )

 try:
     __version__ = importlib.metadata.version("pynvbench")
-except Exception:
+except Exception as e:
     __version__ = "0.0.0dev"
+    warnings.warn(
+        "Could not retrieve version of pynvbench package dynamically from its metadata. "
+        f"Exception {e} was raised. "
+        f"Version is set to fall-back value '{__version__}' instead."
+ ) for libname in ("cupti", "nvperf_target", "nvperf_host"): - _load_nvidia_dynamic_library(libname) + load_nvidia_dynamic_lib(libname) from ._nvbench import ( # noqa: E402 - Benchmark, - CudaStream, - Launch, - State, - register, - run_all_benchmarks, + Benchmark as Benchmark, +) +from ._nvbench import ( # noqa: E402 + CudaStream as CudaStream, +) +from ._nvbench import ( # noqa: E402 + Launch as Launch, +) +from ._nvbench import ( # noqa: E402 + State as State, +) +from ._nvbench import ( # noqa: E402 + register as register, +) +from ._nvbench import ( # noqa: E402 + run_all_benchmarks as run_all_benchmarks, ) -__all__ = [ - "register", - "run_all_benchmarks", - "CudaStream", - "Launch", - "State", - "Benchmark", -] +del load_nvidia_dynamic_lib diff --git a/python/pyproject.toml b/python/pyproject.toml index eee7a33..22adc77 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -14,7 +14,7 @@ classifiers = [ requires-python = ">=3.9" dependencies = [ # pathfinder - "cuda-bindings", + "cuda-pathfinder", # Library expects to find shared libraries # libcupti, libnvperf_target, libnvperf_host From fc0249d188e43e6f44c2e05e9a9f8cb69656a9bd Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 22 Jul 2025 16:47:54 -0500 Subject: [PATCH 43/78] Updated examples/axes.py to use get_float64_or_default --- python/examples/axes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/examples/axes.py b/python/examples/axes.py index 1e79eb6..5adc03a 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -58,7 +58,7 @@ def simple(state: nvbench.State): def single_float64_axis(state: nvbench.State): # get axis value, or default default_sleep_dur = 3.14e-4 - sleep_dur = state.get_float64("Duration", default_sleep_dur) + sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur) krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, 
shmem_size=0) From c136efab658ea3ed7dfb7404d1a62c21cd72ec0a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 23 Jul 2025 15:32:27 -0500 Subject: [PATCH 44/78] Use absolute imports in cuda/nvbench/__init__.py --- python/cuda/nvbench/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cuda/nvbench/__init__.py b/python/cuda/nvbench/__init__.py index 3c320af..8091be2 100644 --- a/python/cuda/nvbench/__init__.py +++ b/python/cuda/nvbench/__init__.py @@ -18,22 +18,22 @@ except Exception as e: for libname in ("cupti", "nvperf_target", "nvperf_host"): load_nvidia_dynamic_lib(libname) -from ._nvbench import ( # noqa: E402 +from cuda.nvbench._nvbench import ( # noqa: E402 Benchmark as Benchmark, ) -from ._nvbench import ( # noqa: E402 +from cuda.nvbench._nvbench import ( # noqa: E402 CudaStream as CudaStream, ) -from ._nvbench import ( # noqa: E402 +from cuda.nvbench._nvbench import ( # noqa: E402 Launch as Launch, ) -from ._nvbench import ( # noqa: E402 +from cuda.nvbench._nvbench import ( # noqa: E402 State as State, ) -from ._nvbench import ( # noqa: E402 +from cuda.nvbench._nvbench import ( # noqa: E402 register as register, ) -from ._nvbench import ( # noqa: E402 +from cuda.nvbench._nvbench import ( # noqa: E402 run_all_benchmarks as run_all_benchmarks, ) From a69a3647b2ce4ccd78894879d1e1c6fa1f50f8fe Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 24 Jul 2025 09:33:13 -0500 Subject: [PATCH 45/78] CUTLASS example added, license headers added, fixes - Add license header to each example file. - Fixed broken runs caused by type declarations. 
- Fixed hang in throughput.py when --run-once by doing a manual warm-up step, like in auto_throughput.py --- python/examples/auto_throughput.py | 5 +- python/examples/axes.py | 16 +++ .../cccl_parallel_segmented_reduce.py | 16 +++ python/examples/cpu_activity.py | 16 +++ python/examples/cupy_extract.py | 16 +++ python/examples/cutlass_gemm.py | 112 ++++++++++++++++++ python/examples/exec_tag_sync.py | 16 +++ python/examples/requirements.txt | 7 ++ python/examples/skip.py | 16 +++ python/examples/throughput.py | 9 +- 10 files changed, 226 insertions(+), 3 deletions(-) create mode 100644 python/examples/cutlass_gemm.py create mode 100644 python/examples/requirements.txt diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 70569b7..80a94e9 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -25,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel: +def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) @@ -59,7 +59,8 @@ def throughput_bench(state: nvbench.State) -> None: krn = make_kernel(ipt) # warm-up call ensures that kernel is loaded into context - # before blocking kernel is launched + # before blocking kernel is launched. Kernel loading may cause + # a synchronization to occur. 
krn[blocks_in_grid, threads_per_block, alloc_stream, 0]( stride, elements, inp_arr, out_arr ) diff --git a/python/examples/axes.py b/python/examples/axes.py index 5adc03a..f01607b 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import ctypes import sys from typing import Dict, Optional, Tuple diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index dd77f46..58586be 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import cuda.cccl.parallel.experimental.algorithms as algorithms diff --git a/python/examples/cpu_activity.py b/python/examples/cpu_activity.py index d51f6ad..16f70cc 100644 --- a/python/examples/cpu_activity.py +++ b/python/examples/cpu_activity.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys import time diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index d1e86ef..59177bc 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import sys import cuda.nvbench as nvbench diff --git a/python/examples/cutlass_gemm.py b/python/examples/cutlass_gemm.py new file mode 100644 index 0000000..bba8633 --- /dev/null +++ b/python/examples/cutlass_gemm.py @@ -0,0 +1,112 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import sys + +import cuda.bindings.driver as driver +import cuda.core.experimental as core +import cupy as cp +import cutlass +import numpy as np + +import nvbench + + +def as_bindings_Stream(cs: nvbench.CudaStream) -> driver.CUstream: + return driver.CUstream(cs.addressof()) + + +def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: + return core.Stream.from_handle(cs.addressof()) + + +def make_cp_array(arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int) -> cp.ndarray: + cp_memview = cp.cuda.UnownedMemory( + int(dev_buf.handle), dev_buf.size, dev_buf, dev_id + ) + zero_offset = 0 + return cp.ndarray( + arr_h.shape, + dtype=arr_h.dtype, + memptr=cp.cuda.MemoryPointer(cp_memview, zero_offset), + ) + + +def cutlass_gemm(state: nvbench.State) -> None: + n = state.get_int64("N") + r = state.get_int64("R") + + alpha = state.get_float64("alpha") + + dt = np.float64 + A_h = np.random.randn(n, r).astype(dt) + B_h = np.copy(A_h.mT) + C_h = np.eye(n, dtype=dt) + D_h = np.zeros_like(C_h) + + if n >= 1024: + # allow more time for large inputs + state.set_timeout(360) + + dev_id = state.get_device() + cs = 
state.get_stream() + s = as_bindings_Stream(cs) + core_s = as_core_Stream(cs) + + A_d = core.DeviceMemoryResource(dev_id).allocate(A_h.nbytes, core_s) + B_d = core.DeviceMemoryResource(dev_id).allocate(B_h.nbytes, core_s) + C_d = core.DeviceMemoryResource(dev_id).allocate(C_h.nbytes, core_s) + D_d = core.DeviceMemoryResource(dev_id).allocate(D_h.nbytes, core_s) + + driver.cuMemcpyAsync(A_d.handle, A_h.ctypes.data, A_h.nbytes, s) + driver.cuMemcpyAsync(B_d.handle, B_h.ctypes.data, B_h.nbytes, s) + driver.cuMemcpyAsync(C_d.handle, C_h.ctypes.data, C_h.nbytes, s) + driver.cuMemcpyAsync(D_d.handle, D_h.ctypes.data, D_h.nbytes, s) + + A_cp = make_cp_array(A_h, A_d, dev_id) + B_cp = make_cp_array(B_h, B_d, dev_id) + C_cp = make_cp_array(C_h, C_d, dev_id) + D_cp = make_cp_array(D_h, D_d, dev_id) + + plan = cutlass.op.Gemm( + A=A_cp, + B=B_cp, + C=C_cp, + D=D_cp, + element=dt, + alpha=alpha, + beta=1, + layout=cutlass.LayoutType.RowMajor, + ) + # warm-up to ensure compilation is not timed + plan.run(stream=s) + + def launcher(launch: nvbench.Launch) -> None: + s = as_bindings_Stream(launch.get_stream()) + plan.run(stream=s, sync=False) + + state.exec(launcher) + + +if __name__ == "__main__": + gemm_b = nvbench.register(cutlass_gemm) + gemm_b.add_int64_axis("R", [16, 64, 256]) + gemm_b.add_int64_axis("N", [256, 512, 1024, 2048]) + + gemm_b.add_float64_axis("alpha", [1e-2]) + + nvbench.run_all_benchmarks(sys.argv) diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py index 9315983..8d0789a 100644 --- a/python/examples/exec_tag_sync.py +++ b/python/examples/exec_tag_sync.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. 
+# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import ctypes import sys from typing import Optional diff --git a/python/examples/requirements.txt b/python/examples/requirements.txt new file mode 100644 index 0000000..35a9c48 --- /dev/null +++ b/python/examples/requirements.txt @@ -0,0 +1,7 @@ +numpy +numba +cupy +nvidia-cutlass +cuda-cccl +cuda-core +cuda-bindings diff --git a/python/examples/skip.py b/python/examples/skip.py index bb75b57..a5555d0 100644 --- a/python/examples/skip.py +++ b/python/examples/skip.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
 import sys
 
 import cuda.cccl.headers as headers
diff --git a/python/examples/throughput.py b/python/examples/throughput.py
index 64b557b..3ae5c1d 100644
--- a/python/examples/throughput.py
+++ b/python/examples/throughput.py
@@ -25,7 +25,7 @@ def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream:
     return cuda.external_stream(cs.addressof())
 
 
-def make_kernel(items_per_thread: int) -> cuda.compiler.AutoJitCUDAKernel:
+def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher:
     @cuda.jit
     def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr):
         tid = cuda.grid(1)
@@ -59,6 +59,13 @@ def throughput_bench(state: nvbench.State) -> None:
 
     krn = make_kernel(ipt)
 
+    # warm-up call ensures that kernel is loaded into context
+    # before blocking kernel is launched. Kernel loading may
+    # cause synchronization to occur.
+    krn[blocks_in_grid, threads_per_block, alloc_stream, 0](
+        stride, elements, inp_arr, out_arr
+    )
+
     def launcher(launch: nvbench.Launch):
         exec_stream = as_cuda_Stream(launch.get_stream())
         krn[blocks_in_grid, threads_per_block, exec_stream, 0](

From 480614e847e99cdd30495e3425a9b02d8cee1785 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Thu, 24 Jul 2025 09:52:53 -0500
Subject: [PATCH 46/78] Add license to stub file, add comment about syncing
 impl and stubs

Add comments to both files stating the need to keep the implementation
and the Python stub file in sync. In the stub file the comment documents
the use of mypy's stubgen to generate stubs and calls for comparing the
generated stubs against the current ones. It also calls out the need to
keep docstrings and docstring examples in sync with the implementation.
--- python/cuda/nvbench/__init__.pyi | 31 ++++++++++++++++++++++++++++--- python/src/py_nvbench.cpp | 9 ++++++--- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 3ea1d89..5540d50 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -1,4 +1,29 @@ -# from __future__ import annotations +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ============================================ +# PLEASE KEEP IN SYNC WITH py_nvbench.cpp FILE +# ============================================ +# Please be sure to keep these type hints and docstring in sync +# with the pybind11 bindings in ``../../src/py_nvbench.cpp`` + +# Use mypy's stubgen to auto-generate stubs using +# ``stubgen -m cuda.nvbench._nvbench`` and compare +# stubs in generated out/cuda/nvbench/_nvbench.pyi +# with definitions given here. from collections.abc import Callable, Sequence from typing import Optional, Self, Union @@ -22,11 +47,11 @@ class CudaStream: import cuda.nvbench as nvbench def bench(state: nvbench.State): - dev = core.Device(state.getDevice()) + dev = core.Device(state.get_device()) dev.set_current() # converts CudaString to core.Stream # using __cuda_stream__ protocol - dev.create_stream(state.getStream()) + dev.create_stream(state.get_stream()) """ ... 
diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 6efd0eb..c93581f 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -209,6 +209,12 @@ constinit std::unique_ptr global_registry } // end of anonymous namespace +// ========================================== +// PLEASE KEEP IN SYNC WITH __init__.pyi FILE +// ========================================== +// If you modify these bindings, please be sure to update the +// corresponding type hints in ``../cuda/nvbench/__init__.pyi`` + PYBIND11_MODULE(_nvbench, m) { // == STEP 1 @@ -223,9 +229,6 @@ PYBIND11_MODULE(_nvbench, m) // It is reinitialized before running all benchmarks to set devices to use nvbench::benchmark_manager::get().initialize(); - // TODO: Use cuModuleGetLoadingMode(&mode) to confirm that (mode == CU_MODULE_EAGER_LOADING) - // and issue warning otherwise - // == STEP 2 // Define CudaStream class // ATTN: nvbench::cuda_stream is move-only class From 5428534124a408a67de47ba7baefcb7fa3994410 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 24 Jul 2025 09:56:58 -0500 Subject: [PATCH 47/78] Add license header to __init__.py --- python/cuda/nvbench/__init__.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/python/cuda/nvbench/__init__.py b/python/cuda/nvbench/__init__.py index 8091be2..993ff05 100644 --- a/python/cuda/nvbench/__init__.py +++ b/python/cuda/nvbench/__init__.py @@ -1,3 +1,19 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. 
+# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import importlib.metadata import warnings From 5c01c34793265b820f929b42cd20dd9e4ed8bc94 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 24 Jul 2025 10:30:31 -0500 Subject: [PATCH 48/78] Fix mypy error in cutlass_gemm example --- python/examples/cutlass_gemm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/examples/cutlass_gemm.py b/python/examples/cutlass_gemm.py index bba8633..1675d0c 100644 --- a/python/examples/cutlass_gemm.py +++ b/python/examples/cutlass_gemm.py @@ -34,9 +34,11 @@ def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) -def make_cp_array(arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int) -> cp.ndarray: +def make_cp_array( + arr_h: np.ndarray, dev_buf: core.Buffer, dev_id: int | None +) -> cp.ndarray: cp_memview = cp.cuda.UnownedMemory( - int(dev_buf.handle), dev_buf.size, dev_buf, dev_id + int(dev_buf.handle), dev_buf.size, dev_buf, -1 if dev_id is None else dev_id ) zero_offset = 0 return cp.ndarray( From b9554d7980138d677f9e88e33ce345747278b1f1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 24 Jul 2025 10:31:17 -0500 Subject: [PATCH 49/78] Fix method name typo in stub file discovered by mypy --- python/cuda/nvbench/__init__.pyi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 5540d50..5935698 100644 --- 
a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -126,19 +126,19 @@ class State: def get_int64(self, name: str) -> int: "Get value for given Int64 axis from this configuration" ... - def get_int64_or_default_value(self, name: str, default_value: int) -> int: + def get_int64_or_default(self, name: str, default_value: int) -> int: "Get value for given Int64 axis from this configuration" ... def get_float64(self, name: str) -> float: "Get value for given Float64 axis from this configuration" ... - def get_float64_or_default_value(self, name: str, default_value: float) -> float: + def get_float64_or_default(self, name: str, default_value: float) -> float: "Get value for given Float64 axis from this configuration" ... def get_string(self, name: str) -> str: "Get value for given String axis from this configuration" ... - def get_string_or_default_value(self, name: str, default_value: str) -> str: + def get_string_or_default(self, name: str, default_value: str) -> str: "Get value for given String axis from this configuration" ... 
def add_element_count(self, count: int, column_name: Optional[str] = None) -> None: From 5e8c17c740d4ec205dfb5d23a9299c8f58f8d9e5 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 24 Jul 2025 10:33:08 -0500 Subject: [PATCH 50/78] Fix mypy error in import statement used in cutlass_gemm example --- python/examples/cutlass_gemm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/examples/cutlass_gemm.py b/python/examples/cutlass_gemm.py index 1675d0c..154bc16 100644 --- a/python/examples/cutlass_gemm.py +++ b/python/examples/cutlass_gemm.py @@ -19,12 +19,11 @@ import sys import cuda.bindings.driver as driver import cuda.core.experimental as core +import cuda.nvbench as nvbench import cupy as cp import cutlass import numpy as np -import nvbench - def as_bindings_Stream(cs: nvbench.CudaStream) -> driver.CUstream: return driver.CUstream(cs.addressof()) From 445d881eda847994ca54112802026b2b75d5aeb7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 24 Jul 2025 10:39:15 -0500 Subject: [PATCH 51/78] Expand README Make it explicit in README that we build and locally install NVBench first, and then build Python package use the library as a dependency. The nvbench library is installed into Python layout alongside the native extension. --- python/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/README.md b/python/README.md index d08a3e6..dffd70e 100644 --- a/python/README.md +++ b/python/README.md @@ -19,10 +19,18 @@ Now switch to python folder, configure and install NVBench library, and install cd nvbench/python cmake -B nvbench_build --preset nvbench-ci -S $(pwd)/.. 
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DNVBench_ENABLE_EXAMPLES=OFF -DCMAKE_INSTALL_PREFIX=$(pwd)/nvbench_install cmake --build nvbench_build/ --config Release --target install +``` +### Build Python extension + +Specify location local installation of `NVBench` library and perform editable `pip install`: + +``` nvbench_DIR=$(pwd)/nvbench_install/lib/cmake CUDACXX=/usr/local/cuda/bin/nvcc pip install -e . ``` +Note that `CUDACXX` must be set for NVBench cmake script to work, but Python extension itself only uses host compiler. + ### Verify that package works ``` From 985db4f1440ed557b40896c174f3acda8747a783 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 24 Jul 2025 12:01:59 -0500 Subject: [PATCH 52/78] Add examples/cccl_cooperative_block_reduce.py --- .../examples/cccl_cooperative_block_reduce.py | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 python/examples/cccl_cooperative_block_reduce.py diff --git a/python/examples/cccl_cooperative_block_reduce.py b/python/examples/cccl_cooperative_block_reduce.py new file mode 100644 index 0000000..dc9a6eb --- /dev/null +++ b/python/examples/cccl_cooperative_block_reduce.py @@ -0,0 +1,103 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys + +import cuda.cccl.cooperative.experimental as coop +import cuda.nvbench as nvbench +import numba +import numpy as np +from numba import cuda +from pynvjitlink import patch + + +class BitsetRing: + """ + Addition operation over ring fixed width unsigned integers + with ring_plus = bitwise_or and ring_mul = bitwise_and, + ring_zero = 0, ring_one = -1 + """ + + def __init__(self): + self.dt = np.uint64 + self.zero = self.dt(0) + self.one = np.bitwise_invert(self.zero) + + @staticmethod + def add(op1, op2): + return op1 | op2 + + @staticmethod + def mul(op1, op2): + return op1 & op2 + + +def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: + return cuda.external_stream(cs.addressof()) + + +def multi_block_bench(state: nvbench.State): + threads_per_block = state.get_int64("ThreadsPerBlock") + num_blocks = state.get_int64("NumBlocks") + total_elements = threads_per_block * num_blocks + + if total_elements > 2**26: + state.skip(reason="Memory footprint over threshold") + return + + ring = BitsetRing() + block_reduce = coop.block.reduce(numba.uint64, threads_per_block, BitsetRing.add) + + @cuda.jit(link=block_reduce.files) + def kernel(inp_arr, out_arr): + # Each thread contributes one element + block_idx = cuda.blockIdx.x + thread_idx = cuda.threadIdx.x + global_idx = block_idx * threads_per_block + thread_idx + + block_output = block_reduce(inp_arr[global_idx]) + + # Only thread 0 of each block writes the result + if thread_idx == 0: + out_arr[block_idx] = block_output + + h_inp = np.arange(1, total_elements + 1, dtype=ring.dt) + d_inp = cuda.to_device(h_inp) + d_out = cuda.device_array(num_blocks, dtype=ring.dt) + + cuda_s = as_cuda_Stream(state.get_stream()) + # warmup + kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out) + + state.add_element_count(total_elements) + state.add_global_memory_reads(total_elements * h_inp.itemsize) + state.add_global_memory_writes(num_blocks * h_inp.itemsize) + + def launcher(launch: 
nvbench.Launch): + cuda_s = as_cuda_Stream(launch.get_stream()) + kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out) + + state.exec(launcher) + + +if __name__ == "__main__": + patch.patch_numba_linker(lto=True) + + b = nvbench.register(multi_block_bench) + b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256]) + b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16]) + + nvbench.run_all_benchmarks(sys.argv) From c747a19b98a3d24382b213177f07ddcc23bffe2c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 25 Jul 2025 12:42:58 -0500 Subject: [PATCH 53/78] Remove code setting up CUDA_MODULE_LOADING=EAGER in Python extension --- python/src/py_nvbench.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index c93581f..34c812d 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -38,15 +38,6 @@ namespace py = pybind11; namespace { -inline void set_env(const char *name, const char *value) -{ -#ifdef _MSC_VER - _putenv_s(name, value); -#else - setenv(name, value, 1); -#endif -} - struct PyObjectDeleter { void operator()(py::object *p) @@ -220,9 +211,6 @@ PYBIND11_MODULE(_nvbench, m) // == STEP 1 // Set environment variable CUDA_MODULE_LOADING=EAGER - // See NVIDIA/NVBench#136 for CUDA_MODULE_LOADING - set_env("CUDA_MODULE_LOADING", "EAGER"); - NVBENCH_DRIVER_API_CALL(cuInit(0)); // This line ensures that benchmark_manager has been created during module init From 5613281c2e75b496058e38a2dc651fc7e97ea989 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 25 Jul 2025 13:28:25 -0500 Subject: [PATCH 54/78] nvbench.State.exec validates arg to be a callable Add names to method arguments to make it more self-descriptive. 
--- python/src/py_nvbench.cpp | 60 ++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 34c812d..c8fdc30 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -259,7 +259,9 @@ PYBIND11_MODULE(_nvbench, m) self.add_int64_axis(std::move(name), std::move(data)); return std::ref(self); }, - py::return_value_policy::reference); + py::return_value_policy::reference, + py::arg("name"), + py::arg("values")); py_benchmark_cls.def( "add_int64_power_of_two_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -268,42 +270,51 @@ PYBIND11_MODULE(_nvbench, m) nvbench::int64_axis_flags::power_of_two); return std::ref(self); }, - py::return_value_policy::reference); + py::return_value_policy::reference, + py::arg("name"), + py::arg("values")); py_benchmark_cls.def( "add_float64_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { self.add_float64_axis(std::move(name), std::move(data)); return std::ref(self); }, - py::return_value_policy::reference); + py::return_value_policy::reference, + py::arg("name"), + py::arg("values")); py_benchmark_cls.def( "add_string_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { self.add_string_axis(std::move(name), std::move(data)); return std::ref(self); }, - py::return_value_policy::reference); + py::return_value_policy::reference, + py::arg("name"), + py::arg("values")); py_benchmark_cls.def( "set_name", [](nvbench::benchmark_base &self, std::string name) { self.set_name(std::move(name)); return std::ref(self); }, - py::return_value_policy::reference); + py::return_value_policy::reference, + py::arg("name")); py_benchmark_cls.def( "set_is_cpu_only", [](nvbench::benchmark_base &self, bool is_cpu_only) { self.set_is_cpu_only(is_cpu_only); return std::ref(self); }, - py::return_value_policy::reference); + py::return_value_policy::reference, 
+ py::arg("is_cpu_only")); py_benchmark_cls.def( "set_run_once", - [](nvbench::benchmark_base &self, bool v) { - self.set_run_once(v); + [](nvbench::benchmark_base &self, bool run_once) { + self.set_run_once(run_once); return std::ref(self); }, - py::return_value_policy::reference); + py::return_value_policy::reference, + py::arg("run_once")); // == STEP 5 // Define PyState class @@ -421,7 +432,7 @@ PYBIND11_MODULE(_nvbench, m) &nvbench::state::add_element_count, py::arg("count"), py::arg("column_name") = py::str("")); - pystate_cls.def("set_element_count", &nvbench::state::set_element_count); + pystate_cls.def("set_element_count", &nvbench::state::set_element_count, py::arg("count")); pystate_cls.def("get_element_count", &nvbench::state::get_element_count); pystate_cls.def("skip", &nvbench::state::skip, py::arg("reason")); @@ -478,40 +489,49 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def( "exec", - [](nvbench::state &state, py::object callable_fn, bool batched, bool sync) { + [](nvbench::state &state, py::object py_launcher_fn, bool batched, bool sync) { + if (!PyCallable_Check(py_launcher_fn.ptr())) + { + throw py::type_error("Argument of exec method must be a callable object"); + } + // wrapper to invoke Python callable - auto launcher_fn = [callable_fn](nvbench::launch &launch_descr) -> void { + auto cpp_launcher_fn = [py_launcher_fn](nvbench::launch &launch_descr) -> void { // cast C++ object to python object auto launch_pyarg = py::cast(std::ref(launch_descr), py::return_value_policy::reference); // call Python callable - callable_fn(launch_pyarg); + py_launcher_fn(launch_pyarg); }; if (sync) { if (batched) { - state.exec(nvbench::exec_tag::sync, launcher_fn); + constexpr auto tag = nvbench::exec_tag::sync; + state.exec(tag, cpp_launcher_fn); } else { - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::no_batch, launcher_fn); + constexpr auto tag = nvbench::exec_tag::sync | nvbench::exec_tag::no_batch; + state.exec(tag, cpp_launcher_fn); } } else { if 
(batched) { - state.exec(nvbench::exec_tag::none, launcher_fn); + constexpr auto tag = nvbench::exec_tag::none; + state.exec(tag, cpp_launcher_fn); } else { - state.exec(nvbench::exec_tag::no_batch, launcher_fn); + constexpr auto tag = nvbench::exec_tag::no_batch; + state.exec(tag, cpp_launcher_fn); } } }, - "Executor for given callable fn(state : Launch)", - py::arg("fn"), + "Executor for given launcher callable fn(state : Launch)", + py::arg("launcher_fn"), py::pos_only{}, py::arg("batched") = true, py::arg("sync") = false); @@ -527,7 +547,7 @@ PYBIND11_MODULE(_nvbench, m) summ.set_string("name", std::move(column_name)); summ.set_string("value", std::move(value)); }, - py::arg("column_name"), + py::arg("name"), py::arg("value")); pystate_cls.def( "add_summary", From eb614ac52fc1c76e7951be05c0064c92872c7643 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 25 Jul 2025 16:39:01 -0500 Subject: [PATCH 55/78] Add State.get_axis_values and State.get_axis_values_as_string Add nvbench.State methods to get Python dictionary representing axis values of benchmark configuration state represents. get_axis_values_as_string gives a string of space-separated name=values pairs. --- python/cuda/nvbench/__init__.pyi | 6 ++++++ python/src/py_nvbench.cpp | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 5935698..607918f 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -238,6 +238,12 @@ class State: def add_summary(self, column_name: str, value: Union[int, float, str]) -> None: "Add summary column with a value" ... + def get_axis_values(self) -> dict[str, int | float | str]: + "Get dictionary with axis values for this configuration" + ... + def get_axis_values_as_string(self) -> str: + "Get string of space-separated name=value pairs for this configuration" + ... 
def register(fn: Callable[[State], None]) -> Benchmark: """ diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index c8fdc30..8577fe9 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -195,6 +195,25 @@ public: } }; +py::dict py_get_axis_values(const nvbench::state &state) +{ + auto named_values = state.get_axis_values(); + + auto names = named_values.get_names(); + py::dict res; + + for (const auto &name : names) + { + if (named_values.has_value(name)) + { + auto v = named_values.get_value(name); + res[name.c_str()] = py::cast(v); + } + } + + return res; +} + // essentially a global variable, but allocated on the heap during module initialization constinit std::unique_ptr global_registry{}; @@ -569,6 +588,9 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def("get_axis_values_as_string", + [](const nvbench::state &state) { return state.get_axis_values_as_string(); }); + pystate_cls.def("get_axis_values", &py_get_axis_values); // Use handle to take a memory leak here, since this object's destructor may be called after // interpreter has shut down From b97e27cbf2d176eae242402a98ac263509086ac6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 25 Jul 2025 16:45:52 -0500 Subject: [PATCH 56/78] Add use of add_axis_values and add_axis_values_as_string to test/run_1.py --- python/test/run_1.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/python/test/run_1.py b/python/test/run_1.py index ac33233..0099dc9 100755 --- a/python/test/run_1.py +++ b/python/test/run_1.py @@ -24,6 +24,9 @@ def add_two(state): a = cuda.to_device(np.random.random(N)) c = cuda.device_array_like(a) + assert "elements" in state.get_axis_values() + assert "elements=" in state.get_axis_values_as_string() + state.add_global_memory_reads(a.nbytes) state.add_global_memory_writes(c.nbytes) @@ -55,6 +58,14 @@ def add_float(state): nthreads = 64 nblocks = 
(len(a) + nthreads - 1) // nthreads + axis_values = state.get_axis_values() + assert "elements" in axis_values + assert "v" in axis_values + assert "name" in axis_values + assert axis_values["elements"] == N + assert axis_values["v"] == v + assert axis_values["name"] == name + def kernel_launcher(launch): _ = v _ = name From b6821b7624dde5bf04097d0334c9560ee73e3337 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 25 Jul 2025 16:58:02 -0500 Subject: [PATCH 57/78] Rename NVBenchRuntimeException to NVBenchRuntimeError Added exception to __init__.pyi --- python/cuda/nvbench/__init__.pyi | 5 +++++ python/src/py_nvbench.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 607918f..5e7b390 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -261,3 +261,8 @@ def run_all_benchmarks(argv: Sequence[str]) -> None: Sequence of CLI arguments controlling NVBench. Usually, it is `sys.argv`. """ ... + +class NVBenchRuntimeError(RuntimeError): + """An exception raised if running benchmarks encounters an error""" + + ... 
diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 8577fe9..a3cba65 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -595,7 +595,7 @@ PYBIND11_MODULE(_nvbench, m) // Use handle to take a memory leak here, since this object's destructor may be called after // interpreter has shut down benchmark_exc = - py::exception(m, "NVBenchRuntimeException", PyExc_RuntimeError).release(); + py::exception(m, "NVBenchRuntimeError", PyExc_RuntimeError).release(); // == STEP 6 // ATTN: nvbench::benchmark_manager is a singleton From afb9951ed8e2550667bd2dfd0ebacb94e62857be Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 28 Jul 2025 11:41:55 -0500 Subject: [PATCH 58/78] Enable building of NVBench as part of buildign extension 1. Download and include CPM.cmake, version 0.42.0 2. Use CPM.make to get Pybind11 3. Update to use pybind11=3.0.0 4. Also use CPM to configure/build nvbench --- python/CMakeLists.txt | 41 +++++++++++++++++++++++++++-------------- python/README.md | 39 ++++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 33 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b61da52..abfc59a 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,29 +1,42 @@ cmake_minimum_required(VERSION 3.30...4.0) -project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX) - -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_POSITION_INDEPENDENT_CODE ON) +# CUDA is transitive dependency of nvbench +project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX CUDA) find_package(Python REQUIRED COMPONENTS Development.Module) find_package(CUDAToolkit REQUIRED) -include(FetchContent) - -FetchContent_Declare( - pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.tar.gz - URL_HASH SHA256=e08cb87f4773da97fa7b5f035de8763abc656d87d5773e62f6da0587d1f0ec20 - FIND_PACKAGE_ARGS NAMES pybind11 +# Get CMake package manager +set(_cpm_download_location 
${CMAKE_CURRENT_BINARY_DIR}/cmake/CPM.cmake) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/v0.42.0/CPM.cmake + ${_cpm_download_location} + EXPECTED_HASH SHA256=2020b4fc42dba44817983e06342e682ecfc3d2f484a581f11cc5731fbe4dce8a ) -FetchContent_MakeAvailable(pybind11) +include(${_cpm_download_location}) -find_package(nvbench CONFIG REQUIRED) +CPMAddPackage( + NAME nvbench + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/.. + OPTIONS "NVBench_INSTALL_RULES ON" + FIND_PACKAGE_ARGS CONFIG REQUIRED +) + +CPMAddPackage("gh:pybind/pybind11@3.0.0") pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp) target_link_libraries(_nvbench PUBLIC nvbench::nvbench) target_link_libraries(_nvbench PRIVATE CUDA::cudart_static) + set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") +set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) +set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(_nvbench PROPERTIES CXX_STANDARD 20) install(TARGETS _nvbench DESTINATION cuda/nvbench) -install(IMPORTED_RUNTIME_ARTIFACTS nvbench::nvbench DESTINATION cuda/nvbench) + +# Determine target that nvbench::nvbench is an alias of, +# necessary because ALIAS targets cannot be installed +get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET) +install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/nvbench) diff --git a/python/README.md b/python/README.md index dffd70e..bd812f5 100644 --- a/python/README.md +++ b/python/README.md @@ -4,7 +4,7 @@ This package provides Python API to CUDA Kernel Benchmarking Library `NVBench`. 
 ## Building
 
-### Build `NVBench` project
+### Ensure recent version of CMake
 
 Since `nvbench` requires a rather new version of CMake (>=3.30.4), either build CMake from sources,
 or create a conda environment with a recent version of CMake, using
@@ -13,48 +13,49 @@ conda create -n build_env --yes cmake ninja
 conda activate build_env
 ```
 
+### Ensure CUDA compiler
+
+Since building `NVBench` library requires CUDA compiler, ensure that appropriate environment variables
+are set. For example, assuming CUDA toolkit is installed system-wide, and assuming Ampere GPU architecture:
+
+```bash
+export CUDACXX=/usr/local/cuda/bin/nvcc
+export CUDAARCHS=86
+```
+
+### Build Python project
+
 Now switch to python folder, configure and install NVBench library, and install the package in editable mode:
 
-```
+```bash
 cd nvbench/python
-cmake -B nvbench_build --preset nvbench-ci -S $(pwd)/.. -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DNVBench_ENABLE_EXAMPLES=OFF -DCMAKE_INSTALL_PREFIX=$(pwd)/nvbench_install
-cmake --build nvbench_build/ --config Release --target install
+pip install -e .
 ```
 
-### Build Python extension
-
-Specify location local installation of `NVBench` library and perform editable `pip install`:
-
-```
-nvbench_DIR=$(pwd)/nvbench_install/lib/cmake CUDACXX=/usr/local/cuda/bin/nvcc pip install -e .
-```
-
-Note that `CUDACXX` must be set for NVBench cmake script to work, but Python extension itself only uses host compiler.
- ### Verify that package works -``` +```bash python test/run_1.py ``` ### Run examples -``` +```bash # Example benchmarking numba.cuda kernel python examples/throughput.py ``` -``` +```bash # Example benchmarking kernels authored using cuda.core python examples/axes.py ``` -``` +```bash # Example benchmarking algorithms from cuda.cccl.parallel python examples/cccl_parallel_segmented_reduce.py ``` -``` +```bash # Example benchmarking CuPy function python examples/cupy_extract.py ``` From 413c4a114be0b0d70867e64e14f165b5bdbd0b39 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 28 Jul 2025 14:27:50 -0500 Subject: [PATCH 59/78] Support nvbench.State.set_throttle_threshold --- python/cuda/nvbench/__init__.pyi | 5 ++++- python/src/py_nvbench.cpp | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 5e7b390..0e2690e 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -169,7 +169,10 @@ class State: "Get Benchmark this configuration is a part of" ... def get_throttle_threshold(self) -> float: - "Get throttle threshold value" + "Get throttle threshold value, as fraction of maximal frequency" + ... + def set_throttle_threshold(self, threshold_fraction: float) -> None: + "Set throttle threshold fraction to specified value, expected to be between 0 and 1" ... 
def get_min_samples(self) -> int: "Get the number of benchmark timings NVBench performs before stopping criterion begins being used" diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index a3cba65..604746e 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -481,6 +481,9 @@ PYBIND11_MODULE(_nvbench, m) [](const nvbench::state &state) { return std::ref(state.get_benchmark()); }, py::return_value_policy::reference); pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold); + pystate_cls.def("set_throttle_threshold", + &nvbench::state::set_throttle_threshold, + py::arg("threshold_fraction")); pystate_cls.def("get_min_samples", &nvbench::state::get_min_samples); pystate_cls.def("set_min_samples", From 6b9050e4044087df1acb5e15bfad0b2f680a31ae Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 28 Jul 2025 15:57:11 -0500 Subject: [PATCH 60/78] Add example of benchmarking pytorch code --- python/examples/pytorch_bench.py | 58 ++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 python/examples/pytorch_bench.py diff --git a/python/examples/pytorch_bench.py b/python/examples/pytorch_bench.py new file mode 100644 index 0000000..f62a7a5 --- /dev/null +++ b/python/examples/pytorch_bench.py @@ -0,0 +1,58 @@ +import sys + +import cuda.nvbench as nvbench +import torch + + +def as_torch_cuda_Stream( + cs: nvbench.CudaStream, dev: int | None +) -> torch.cuda.ExternalStream: + return torch.cuda.ExternalStream( + stream_ptr=cs.addressof(), device=torch.cuda.device(dev) + ) + + +def torch_bench(state: nvbench.State) -> None: + state.set_throttle_threshold(0.25) + + dev_id = state.get_device() + tc_s = as_torch_cuda_Stream(state.get_stream(), dev_id) + + dt = torch.float32 + scalar_shape: tuple = tuple() + n = 2**28 + with torch.cuda.stream(tc_s): + a3 = torch.randn(scalar_shape, dtype=dt) + a2 = torch.randn(scalar_shape, dtype=dt)
+ a1 = torch.randn(scalar_shape, dtype=dt) + a0 = torch.randn(scalar_shape, dtype=dt) + x = torch.linspace(-3, 3, n, dtype=dt) + y = torch.sin(x) + + learning_rate = 1e-4 + + def launcher(launch: nvbench.Launch) -> None: + tc_s = as_torch_cuda_Stream(launch.get_stream(), dev_id) + with torch.cuda.stream(tc_s): + x2 = torch.square(x) + y_pred = (a3 + x2 * a1) + x * (a2 + a0 * x2) + + _ = torch.square(y_pred - y).sum() + grad_y_pred = 2 * (y_pred - y) + grad_a3 = grad_y_pred.sum() + grad_a2 = (grad_y_pred * x).sum() + grad_a1 = (grad_y_pred * x2).sum() + grad_a0 = (grad_y_pred * x2 * x).sum() + + _ = a3 - grad_a3 * learning_rate + _ = a2 - grad_a2 * learning_rate + _ = a1 - grad_a1 * learning_rate + _ = a0 - grad_a0 * learning_rate + + state.exec(launcher, sync=True) + + +if __name__ == "__main__": + nvbench.register(torch_bench) + + nvbench.run_all_benchmarks(sys.argv) From 9c01f229a6b18c490e0f9d9f835e90f11520be05 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 28 Jul 2025 15:35:01 -0500 Subject: [PATCH 61/78] Add Benchmark set methods, such as set_stopping_criterion, set_timeout, etc Add - State.get_stopping_criterion() -> str - Benchmark.set_stopping_criterion(criterion: str) -> Self - Benchmark.set_criterion_param_int64(name: str, value: int) -> Self - Benchmark.set_criterion_param_float64(name: str, value: float) -> Self - Benchmark.set_criterion_param_string(name: str, value: str) -> Self - Benchmark.set_timeout(duration: float) -> Self - Benchmark.set_skip_time(skip_time: float) -> Self - Benchmark.set_throttle_threshold(frac: float) -> Self - Benchmark.set_throttle_recovery_delay(duration: float) -> Self - Benchmark.set_min_samples(count: int) -> Self --- python/cuda/nvbench/__init__.pyi | 30 ++++++++++ python/src/py_nvbench.cpp | 95 ++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 
0e2690e..00b786d 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -92,6 +92,33 @@ class Benchmark: def set_run_once(self, v: bool) -> Self: "Set whether all benchmark configurations are executed only once" ... + def set_skip_time(self, duration_seconds: float) -> Self: + "Set run durations, in seconds, that should be skipped" + ... + def set_throttle_recovery_delay(self, delay_seconds: float) -> Self: + "Set throttle recovery delay, in seconds" + ... + def set_throttle_threshold(self, threshold: float) -> Self: + "Set throttle threshold, as a fraction of maximal GPU frequency" + ... + def set_timeout(self, duration_seconds: float) -> Self: + "Set benchmark run duration timeout value, in seconds" + ... + def set_stopping_criterion(self, criterion: str) -> Self: + "Set stopping criterion to be used" + ... + def set_criterion_param_float64(self, name: str, value: float) -> Self: + "Set stopping criterion floating point parameter value" + ... + def set_criterion_param_int64(self, name: str, value: int) -> Self: + "Set stopping criterion integer parameter value" + ... + def set_criterion_param_string(self, name: str, value: str) -> Self: + "Set stopping criterion string parameter value" + ... + def set_min_samples(self, count: int) -> Self: + "Set minimal samples count before stopping criterion applies" + ... class Launch: """Configuration object for function launch. @@ -247,6 +274,9 @@ class State: def get_axis_values_as_string(self) -> str: "Get string of space-separated name=value pairs for this configuration" ... + def get_stopping_criterion(self) -> str: + "Get string name of stopping criterion used" + ... 
def register(fn: Callable[[State], None]) -> Benchmark: """ diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 604746e..be2a384 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -269,6 +269,24 @@ PYBIND11_MODULE(_nvbench, m) // == STEP 4 // Define Benchmark class + // ATTN: nvbench::benchmark_base is move-only class + // Methods: + // nvbench::benchmark_base::get_name + // nvbench::benchmark_base::add_int64_axis + // nvbench::benchmark_base::add_int64_power_of_two_axis + // nvbench::benchmark_base::add_float64_axis + // nvbench::benchmark_base::add_string_axis + // nvbench::benchmark_base::set_name + // nvbench::benchmark_base::set_is_cpu_only + // nvbench::benchmark_base::set_skip_time + // nvbench::benchmark_base::set_timeout + // nvbench::benchmark_base::set_throttle_threshold + // nvbench::benchmark_base::set_throttle_recovery_delay + // nvbench::benchmark_base::set_stopping_criterion + // nvbench::benchmark_base::set_criterion_param_int64 + // nvbench::benchmark_base::set_criterion_param_float64 + // nvbench::benchmark_base::set_criterion_param_string + // nvbench::benchmark_base::set_min_samples auto py_benchmark_cls = py::class_<nvbench::benchmark_base>(m, "Benchmark"); py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name); @@ -326,6 +344,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("is_cpu_only")); + // TODO: should this be exposed?
py_benchmark_cls.def( "set_run_once", [](nvbench::benchmark_base &self, bool run_once) { @@ -334,6 +353,81 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("run_once")); + py_benchmark_cls.def( + "set_skip_time", + [](nvbench::benchmark_base &self, nvbench::float64_t skip_duration_seconds) { + self.set_skip_time(skip_duration_seconds); + return std::ref(self); + }, + py::return_value_policy::reference, + py::arg("duration_seconds")); + py_benchmark_cls.def( + "set_timeout", + [](nvbench::benchmark_base &self, nvbench::float64_t duration_seconds) { + self.set_timeout(duration_seconds); + return std::ref(self); + }, + py::return_value_policy::reference, + py::arg("duration_seconds")); + py_benchmark_cls.def( + "set_throttle_threshold", + [](nvbench::benchmark_base &self, nvbench::float64_t threshold) { + self.set_throttle_threshold(threshold); + return std::ref(self); + }, + py::return_value_policy::reference, + py::arg("threshold")); + py_benchmark_cls.def( + "set_throttle_recovery_delay", + [](nvbench::benchmark_base &self, nvbench::float64_t delay) { + self.set_throttle_recovery_delay(delay); + return std::ref(self); + }, + py::return_value_policy::reference, + py::arg("delay_seconds")); + py_benchmark_cls.def( + "set_stopping_criterion", + [](nvbench::benchmark_base &self, std::string criterion) { + self.set_stopping_criterion(std::move(criterion)); + return std::ref(self); + }, + py::return_value_policy::reference, + py::arg("criterion")); + py_benchmark_cls.def( + "set_criterion_param_int64", + [](nvbench::benchmark_base &self, std::string name, nvbench::int64_t value) { + self.set_criterion_param_int64(std::move(name), value); + return std::ref(self); + }, + py::return_value_policy::reference, + py::arg("name"), + py::arg("value")); + py_benchmark_cls.def( + "set_criterion_param_float64", + [](nvbench::benchmark_base &self, std::string name, nvbench::float64_t value) { + self.set_criterion_param_float64(std::move(name), value); + 
return std::ref(self); + }, + py::return_value_policy::reference, + py::arg("name"), + py::arg("value")); + py_benchmark_cls.def( + "set_criterion_param_string", + [](nvbench::benchmark_base &self, std::string name, std::string value) { + self.set_criterion_param_string(std::move(name), std::move(value)); + return std::ref(self); + }, + py::return_value_policy::reference, + py::arg("name"), + py::arg("value")); + py_benchmark_cls.def( + "set_min_samples", + [](nvbench::benchmark_base &self, nvbench::int64_t count) { + self.set_min_samples(count); + return std::ref(self); + }, + py::return_value_policy::reference, + py::arg("min_samples_count")); // == STEP 5 // Define PyState class @@ -594,6 +688,7 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("get_axis_values_as_string", [](const nvbench::state &state) { return state.get_axis_values_as_string(); }); pystate_cls.def("get_axis_values", &py_get_axis_values); + pystate_cls.def("get_stopping_criterion", &nvbench::state::get_stopping_criterion); // Use handle to take a memory leak here, since this object's destructor may be called after // interpreter has shut down From 88a3ad0138940158c2442f8cd9bd186bcb582257 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 30 Jul 2025 13:54:37 -0500 Subject: [PATCH 62/78] Add test/stub.py The following static analysis run should run green ``` mypy --ignore-missing-imports test/stub.py ``` --- python/test/stubs.py | 236 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 python/test/stubs.py diff --git a/python/test/stubs.py b/python/test/stubs.py new file mode 100644 index 0000000..f3f4ee2 --- /dev/null +++ b/python/test/stubs.py @@ -0,0 +1,236 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. 
+# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes +import sys +from typing import Dict, Optional, Tuple + +import cuda.cccl.headers as headers +import cuda.core.experimental as core +import cuda.nvbench as nvbench + + +def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: + return core.Stream.from_handle(cs.addressof()) + + +def make_sleep_kernel(): + """JITs sleep_kernel(seconds)""" + src = r""" +#include <cuda/std/chrono> +#include <cuda/std/cstdint> + +// Each launched thread just sleeps for `seconds`. +__global__ void sleep_kernel(double seconds) { + namespace chrono = ::cuda::std::chrono; + using hr_clock = chrono::high_resolution_clock; + + auto duration = static_cast<::cuda::std::int64_t>(seconds * 1e9); + const auto ns = chrono::nanoseconds(duration); + + const auto start = hr_clock::now(); + const auto finish = start + ns; + + auto now = hr_clock::now(); + while (now < finish) + { + now = hr_clock::now(); + } +} +""" + incl = headers.get_include_paths() + opts = core.ProgramOptions(include_path=str(incl.libcudacxx)) + prog = core.Program(src, code_type="c++", options=opts) + mod = prog.compile("cubin", name_expressions=("sleep_kernel",)) + return mod.get_kernel("sleep_kernel") + + +def no_axes(state: nvbench.State): + state.set_min_samples(1000) + sleep_dur = 1e-3 + krn = make_sleep_kernel() + launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) + + print(f"Stopping criterion used: {state.get_stopping_criterion()}") + + def launcher(launch: nvbench.Launch): + s = as_core_Stream(launch.get_stream()) + core.launch(s, launch_config, krn, sleep_dur) + + state.exec(launcher) + + +def tags(state: nvbench.State): +
state.set_min_samples(1000) + sleep_dur = 1e-3 + krn = make_sleep_kernel() + launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) + + sync_flag = bool(state.get_int64("Sync")) + batched_flag = bool(state.get_int64("Batched")) + + def launcher(launch: nvbench.Launch): + s = as_core_Stream(launch.get_stream()) + core.launch(s, launch_config, krn, sleep_dur) + + state.exec(launcher, sync=sync_flag, batched=batched_flag) + + +def single_float64_axis(state: nvbench.State): + # get axis value, or default + default_sleep_dur = 3.14e-4 + sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur) + krn = make_sleep_kernel() + launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) + + def launcher(launch: nvbench.Launch): + s = as_core_Stream(launch.get_stream()) + core.launch(s, launch_config, krn, sleep_dur) + + state.exec(launcher) + + +def default_value(state: nvbench.State): + single_float64_axis(state) + + +def make_copy_kernel(in_type: Optional[str] = None, out_type: Optional[str] = None): + src = r""" +#include <cuda/std/cstddef> +#include <cuda/std/cstdint> +/*! + * Naive copy of `n` values from `in` -> `out`.
+ */ +template <typename T, typename U> +__global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n) +{ + const auto init = blockIdx.x * blockDim.x + threadIdx.x; + const auto step = blockDim.x * gridDim.x; + + for (auto i = init; i < n; i += step) + { + out[i] = static_cast<U>(in[i]); + } +} +""" + incl = headers.get_include_paths() + opts = core.ProgramOptions(include_path=str(incl.libcudacxx)) + prog = core.Program(src, code_type="c++", options=opts) + if in_type is None: + in_type = "::cuda::std::int32_t" + if out_type is None: + out_type = "::cuda::std::int32_t" + instance_name = f"copy_kernel<{in_type}, {out_type}>" + mod = prog.compile("cubin", name_expressions=(instance_name,)) + return mod.get_kernel(instance_name) + + +def copy_sweep_grid_shape(state: nvbench.State): + block_size = state.get_int64("BlockSize") + num_blocks = state.get_int64("NumBlocks") + + # Number of int32 elements in 256MiB + nbytes = 256 * 1024 * 1024 + num_values = nbytes // ctypes.sizeof(ctypes.c_int32(0)) + + state.add_element_count(num_values) + state.add_global_memory_reads(nbytes) + state.add_global_memory_writes(nbytes) + + dev_id = state.get_device() + alloc_s = as_core_Stream(state.get_stream()) + input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) + output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) + + krn = make_copy_kernel() + launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0) + + def launcher(launch: nvbench.Launch): + s = as_core_Stream(launch.get_stream()) + core.launch(s, launch_config, krn, input_buf, output_buf, num_values) + + state.exec(launcher) + + +def copy_type_sweep(state: nvbench.State): + type_id = state.get_int64("TypeID") + + types_map: Dict[int, Tuple[type, str]] = { + 0: (ctypes.c_uint8, "cuda::std::uint8_t"), + 1: (ctypes.c_uint16, "cuda::std::uint16_t"), + 2: (ctypes.c_uint32, "cuda::std::uint32_t"), + 3: (ctypes.c_uint64, "cuda::std::uint64_t"), + 4: (ctypes.c_float, "float"), + 5:
(ctypes.c_double, "double"), + } + + value_ctype, value_cuda_t = types_map[type_id] + state.add_summary("Type", value_cuda_t) + + # Number of elements in 256MiB + nbytes = 256 * 1024 * 1024 + num_values = nbytes // ctypes.sizeof(value_ctype) + + state.add_element_count(num_values) + state.add_global_memory_reads(nbytes) + state.add_global_memory_writes(nbytes) + + dev_id = state.get_device() + alloc_s = as_core_Stream(state.get_stream()) + input_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) + output_buf = core.DeviceMemoryResource(dev_id).allocate(nbytes, alloc_s) + + krn = make_copy_kernel(value_cuda_t, value_cuda_t) + launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) + + def launcher(launch: nvbench.Launch): + s = as_core_Stream(launch.get_stream()) + core.launch(s, launch_config, krn, input_buf, output_buf, num_values) + + state.exec(launcher) + + +if __name__ == "__main__": + # Benchmark without axes + simple_b = nvbench.register(no_axes) + simple_b.set_stopping_criterion("entropy") + simple_b.set_criterion_param_int64("unused_int", 100) + + tags_b = nvbench.register(tags) + tags_b.add_int64_axis("Sync", [0, 1]) + tags_b.add_int64_axis("Batched", [0, 1]) + + # benchmark with no axes, that uses default value + default_b = nvbench.register(default_value) + default_b.set_min_samples(7) + + # specify axis + axes_b = nvbench.register(single_float64_axis).add_float64_axis( + "Duration", [7e-5, 1e-4, 5e-4] + ) + axes_b.set_timeout(20) + axes_b.set_skip_time(1e-5) + axes_b.set_throttle_threshold(0.2) + axes_b.set_throttle_recovery_delay(0.1) + + copy1_bench = nvbench.register(copy_sweep_grid_shape) + copy1_bench.add_int64_power_of_two_axis("BlockSize", range(6, 10, 2)) + copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)]) + + copy2_bench = nvbench.register(copy_type_sweep) + copy2_bench.add_int64_axis("TypeID", range(0, 6)) + + nvbench.run_all_benchmarks(sys.argv) From add539a0c1f99dc2f0f79d1bdad1d0d179542090 
Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 30 Jul 2025 16:54:56 -0500 Subject: [PATCH 63/78] Replaced argument type annotation: int -> typing.SupportsInt Same for float->typing.SupportsFloat. Result types remain int/float --- python/cuda/nvbench/__init__.pyi | 54 +++++++++++++++++++------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 00b786d..bff9f83 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -26,7 +26,7 @@ # with definitions given here. from collections.abc import Callable, Sequence -from typing import Optional, Self, Union +from typing import Optional, Self, SupportsFloat, SupportsInt, Union class CudaStream: """Represents CUDA stream @@ -71,13 +71,15 @@ class Benchmark: def get_name(self) -> str: "Get benchmark name" ... - def add_int64_axis(self, name: str, values: Sequence[int]) -> Self: + def add_int64_axis(self, name: str, values: Sequence[SupportsInt]) -> Self: "Add integral type parameter axis with given name and values to sweep over" ... - def add_int64_power_of_two_axis(self, name: str, values: Sequence[int]) -> Self: + def add_int64_power_of_two_axis( + self, name: str, values: Sequence[SupportsInt] + ) -> Self: "Add integral type parameter axis with given name and values to sweep over" ... - def add_float64_axis(self, name: str, values: Sequence[float]) -> Self: + def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self: "Add floating-point type parameter axis with given name and values to sweep over" ... def add_string_axis(sef, name: str, values: Sequence[str]) -> Self: @@ -92,31 +94,31 @@ class Benchmark: def set_run_once(self, v: bool) -> Self: "Set whether all benchmark configurations are executed only once" ... 
- def set_skip_time(self, duration_seconds: float) -> Self: + def set_skip_time(self, duration_seconds: SupportsFloat) -> Self: "Set run durations, in seconds, that should be skipped" ... - def set_throttle_recovery_delay(self, delay_seconds: float) -> Self: + def set_throttle_recovery_delay(self, delay_seconds: SupportsFloat) -> Self: "Set throttle recovery delay, in seconds" ... - def set_throttle_threshold(self, threshold: float) -> Self: + def set_throttle_threshold(self, threshold: SupportsFloat) -> Self: "Set throttle threshold, as a fraction of maximal GPU frequency" ... - def set_timeout(self, duration_seconds: float) -> Self: + def set_timeout(self, duration_seconds: SupportsFloat) -> Self: "Set benchmark run duration timeout value, in seconds" ... def set_stopping_criterion(self, criterion: str) -> Self: "Set stopping criterion to be used" ... - def set_criterion_param_float64(self, name: str, value: float) -> Self: + def set_criterion_param_float64(self, name: str, value: SupportsFloat) -> Self: "Set stopping criterion floating point parameter value" ... - def set_criterion_param_int64(self, name: str, value: int) -> Self: + def set_criterion_param_int64(self, name: str, value: SupportsInt) -> Self: "Set stopping criterion integer parameter value" ... def set_criterion_param_string(self, name: str, value: str) -> Self: "Set stopping criterion string parameter value" ... - def set_min_samples(self, count: int) -> Self: + def set_min_samples(self, count: SupportsInt) -> Self: "Set minimal samples count before stopping criterion applies" ... @@ -153,13 +155,13 @@ class State: def get_int64(self, name: str) -> int: "Get value for given Int64 axis from this configuration" ... - def get_int64_or_default(self, name: str, default_value: int) -> int: + def get_int64_or_default(self, name: str, default_value: SupportsInt) -> int: "Get value for given Int64 axis from this configuration" ... 
def get_float64(self, name: str) -> float: "Get value for given Float64 axis from this configuration" ... - def get_float64_or_default(self, name: str, default_value: float) -> float: + def get_float64_or_default(self, name: str, default_value: SupportsFloat) -> float: "Get value for given Float64 axis from this configuration" ... def get_string(self, name: str) -> str: @@ -168,10 +170,12 @@ class State: def get_string_or_default(self, name: str, default_value: str) -> str: "Get value for given String axis from this configuration" ... - def add_element_count(self, count: int, column_name: Optional[str] = None) -> None: + def add_element_count( + self, count: SupportsInt, column_name: Optional[str] = None + ) -> None: "Add element count" ... - def set_element_count(self, count: int) -> None: + def set_element_count(self, count: SupportsInt) -> None: "Set element count" ... def get_element_count(self) -> int: @@ -186,10 +190,14 @@ class State: def get_skip_reason(self) -> str: "Get reason provided for skipping this configuration" ... - def add_global_memory_reads(self, nbytes: int, /, column_name: str = "") -> None: + def add_global_memory_reads( + self, nbytes: SupportsInt, /, column_name: str = "" + ) -> None: "Inform NVBench that given amount of bytes is being read by the benchmark from global memory" ... - def add_global_memory_writes(self, nbytes: int, /, column_name: str = "") -> None: + def add_global_memory_writes( + self, nbytes: SupportsInt, /, column_name: str = "" + ) -> None: "Inform NVBench that given amount of bytes is being written by the benchmark into global memory" ... def get_benchmark(self) -> Benchmark: @@ -198,13 +206,13 @@ class State: def get_throttle_threshold(self) -> float: "Get throttle threshold value, as fraction of maximal frequency" ... 
- def set_throttle_threshold(self, threshold_fraction: float) -> None: + def set_throttle_threshold(self, threshold_fraction: SupportsFloat) -> None: "Set throttle threshold fraction to specified value, expected to be between 0 and 1" ... def get_min_samples(self) -> int: "Get the number of benchmark timings NVBench performs before stopping criterion begins being used" ... - def set_min_samples(self, min_samples_count: int) -> None: + def set_min_samples(self, min_samples_count: SupportsInt) -> None: "Set the number of benchmark timings for NVBench to perform before stopping criterion begins being used" ... def get_disable_blocking_kernel(self) -> bool: @@ -222,13 +230,13 @@ class State: def get_timeout(self) -> float: "Get time-out value for benchmark execution of this configuration" ... - def set_timeout(self, duration: float) -> None: + def set_timeout(self, duration: SupportsFloat) -> None: "Set time-out value for benchmark execution of this configuration, in seconds" ... def get_blocking_kernel_timeout(self) -> float: "Get time-out value for execution of blocking kernel" ... - def set_blocking_kernel_timeout(self, duration: float) -> None: + def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None: "Set time-out value for execution of blocking kernel, in seconds" ... def collect_cupti_metrics(self) -> None: @@ -265,7 +273,9 @@ class State: def get_short_description(self) -> str: "Get short description for this configuration" ... - def add_summary(self, column_name: str, value: Union[int, float, str]) -> None: + def add_summary( + self, column_name: str, value: Union[SupportsInt, SupportsFloat, str] + ) -> None: "Add summary column with a value" ... 
def get_axis_values(self) -> dict[str, int | float | str]: From fb23591aeff1fe308ce35bfce7754605c0859802 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 31 Jul 2025 15:42:30 -0500 Subject: [PATCH 64/78] Fixed missing space in README --- python/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index bd812f5..4bcdf92 100644 --- a/python/README.md +++ b/python/README.md @@ -16,7 +16,7 @@ conda activate build_env ### Ensure CUDA compiler Since building `NVBench` library requires CUDA compiler, ensure that appropriate environment variables -are set. For example, assuming CUDA toolkit is installedsystem-wide, and assuming Ampere GPU architecture: +are set. For example, assuming CUDA toolkit is installed system-wide, and assuming Ampere GPU architecture: ```bash export CUDACXX=/usr/local/cuda/bin/nvcc From c91204f2593f9e30c4757751194f61b4e5472736 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 31 Jul 2025 15:48:49 -0500 Subject: [PATCH 65/78] Improved docstrings per PR review suggestions --- python/cuda/nvbench/__init__.pyi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index bff9f83..42169ec 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -86,7 +86,7 @@ class Benchmark: "Add string type parameter axis with given name and values to sweep over" ... def set_name(self, name: str) -> Self: - "" + "Set benchmark name" ... def set_is_cpu_only(self, is_cpu_only: bool) -> Self: "Set whether this benchmark only executes on CPU" @@ -228,13 +228,13 @@ class State: "Set run-once flag for this configuration" ... 
def get_timeout(self) -> float: - "Get time-out value for benchmark execution of this configuration" + "Get time-out value for benchmark execution of this configuration, in seconds" ... def set_timeout(self, duration: SupportsFloat) -> None: "Set time-out value for benchmark execution of this configuration, in seconds" ... def get_blocking_kernel_timeout(self) -> float: - "Get time-out value for execution of blocking kernel" + "Get time-out value for execution of blocking kernel, in seconds" ... def set_blocking_kernel_timeout(self, duration: SupportsFloat) -> None: "Set time-out value for execution of blocking kernel, in seconds" @@ -290,7 +290,7 @@ class State: def register(fn: Callable[[State], None]) -> Benchmark: """ - Register bencharking function with NVBench. + Register given benchmarking function with NVBench. """ ... From 453a1648aa768137b9026183419cdcfcbb2acef2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 31 Jul 2025 16:20:52 -0500 Subject: [PATCH 66/78] Improvements to readability of examples per PR review --- python/examples/auto_throughput.py | 10 ++++----- python/examples/axes.py | 2 +- .../cccl_parallel_segmented_reduce.py | 6 ++--- python/examples/cupy_extract.py | 10 +++++---- python/examples/throughput.py | 10 ++++----- python/test/run_1.py | 22 +++++++++---------- 6 files changed, 30 insertions(+), 30 deletions(-) diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 80a94e9..1b6e663 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -21,11 +21,11 @@ import numpy as np from numba import cuda -def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def make_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: +def make_throughput_kernel(items_per_thread: 
int) -> cuda.dispatcher.CUDADispatcher: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) @@ -46,7 +46,7 @@ def throughput_bench(state: nvbench.State) -> None: nbytes = 128 * 1024 * 1024 elements = nbytes // np.dtype(np.int32).itemsize - alloc_stream = as_cuda_Stream(state.get_stream()) + alloc_stream = as_cuda_stream(state.get_stream()) inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) @@ -56,7 +56,7 @@ def throughput_bench(state: nvbench.State) -> None: threads_per_block = 256 blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block - krn = make_kernel(ipt) + krn = make_throughput_kernel(ipt) # warm-up call ensures that kernel is loaded into context # before blocking kernel is launched. Kernel loading may cause @@ -66,7 +66,7 @@ def throughput_bench(state: nvbench.State) -> None: ) def launcher(launch: nvbench.Launch): - exec_stream = as_cuda_Stream(launch.get_stream()) + exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr ) diff --git a/python/examples/axes.py b/python/examples/axes.py index f01607b..e07606f 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -193,7 +193,7 @@ if __name__ == "__main__": nvbench.register(default_value) # specify axis nvbench.register(single_float64_axis).add_float64_axis( - "Duration", [7e-5, 1e-4, 5e-4] + "Duration (s)", [7e-5, 1e-4, 5e-4] ) copy1_bench = nvbench.register(copy_sweep_grid_shape) diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index 58586be..0f440e3 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -61,10 +61,6 @@ def segmented_reduce(state: nvbench.State): dev_id = state.get_device() cp_stream = 
as_cp_ExternalStream(state.get_stream(), dev_id) - with cp_stream: - rng = cp.random.default_rng() - mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols)) - def add_op(a, b): return a + b @@ -84,6 +80,8 @@ def segmented_reduce(state: nvbench.State): h_init = np.zeros(tuple(), dtype=np.int32) with cp_stream: + rng = cp.random.default_rng() + mat = rng.integers(low=-31, high=32, dtype=np.int32, size=(n_rows, n_cols)) d_input = mat d_output = cp.empty(n_rows, dtype=d_input.dtype) diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index 59177bc..16e5d9f 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -36,14 +36,16 @@ def cupy_extract_by_mask(state: nvbench.State): state.collect_cupti_metrics() state.add_element_count(n_rows * n_cols, "# Elements") + int32_dt = cp.dtype(cp.int32) + bool_dt = cp.dtype(cp.bool_) state.add_global_memory_reads( - n_rows * n_cols * (cp.dtype(cp.int32).itemsize + cp.dtype("?").itemsize) + n_rows * n_cols * (int32_dt.itemsize + bool_dt.itemsize) ) - state.add_global_memory_writes(n_rows * n_cols * (cp.dtype(cp.int32).itemsize)) + state.add_global_memory_writes(n_rows * n_cols * (int32_dt.itemsize)) with cp_s: - X = cp.full((n_cols, n_rows), fill_value=3, dtype=cp.int32) - mask = cp.ones((n_cols, n_rows), dtype="?") + X = cp.full((n_cols, n_rows), fill_value=3, dtype=int32_dt) + mask = cp.ones((n_cols, n_rows), dtype=bool_dt) _ = X[mask] def launcher(launch: nvbench.Launch): diff --git a/python/examples/throughput.py b/python/examples/throughput.py index 3ae5c1d..5984126 100644 --- a/python/examples/throughput.py +++ b/python/examples/throughput.py @@ -21,11 +21,11 @@ import numpy as np from numba import cuda -def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def make_kernel(items_per_thread: int) -> 
cuda.dispatcher.CUDADispatcher: +def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: @cuda.jit def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): tid = cuda.grid(1) @@ -46,7 +46,7 @@ def throughput_bench(state: nvbench.State) -> None: nbytes = 128 * 1024 * 1024 elements = nbytes // np.dtype(np.int32).itemsize - alloc_stream = as_cuda_Stream(state.get_stream()) + alloc_stream = as_cuda_stream(state.get_stream()) inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) @@ -57,7 +57,7 @@ def throughput_bench(state: nvbench.State) -> None: threads_per_block = 256 blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block - krn = make_kernel(ipt) + krn = make_throughput_kernel(ipt) # warm-up call ensures that kernel is loaded into context # before blocking kernel is launched. Kernel loading may @@ -67,7 +67,7 @@ def throughput_bench(state: nvbench.State) -> None: ) def launcher(launch: nvbench.Launch): - exec_stream = as_cuda_Stream(launch.get_stream()) + exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr ) diff --git a/python/test/run_1.py b/python/test/run_1.py index 0099dc9..dfa38f4 100755 --- a/python/test/run_1.py +++ b/python/test/run_1.py @@ -14,12 +14,15 @@ def kernel(a, b, c): c[tid] = a[tid] + b[tid] -def get_numba_stream(launch): +def get_numba_stream(launch: nvbench.Launch): return cuda.external_stream(launch.get_stream().addressof()) -def add_two(state): - # state.skip("Skipping this benchmark for no reason") +def skipit(state: nvbench.State) -> None: + state.skip("Skipping this benchmark for no reason") + + +def add_two(state: nvbench.State): N = state.get_int64("elements") a = cuda.to_device(np.random.random(N)) c = cuda.device_array_like(a) @@ -44,7 +47,7 @@ def add_two(state): state.exec(kernel_launcher, 
batched=True, sync=True) -def add_float(state): +def add_float(state: nvbench.State): N = state.get_int64("elements") v = state.get_float64("v") name = state.get_string("name") @@ -75,7 +78,7 @@ def add_float(state): state.exec(kernel_launcher, batched=True, sync=True) -def add_three(state): +def add_three(state: nvbench.State): N = state.get_int64("elements") a = cuda.to_device(np.random.random(N).astype(np.float32)) b = cuda.to_device(np.random.random(N).astype(np.float32)) @@ -105,13 +108,10 @@ def register_benchmarks(): nvbench.register(add_float) .add_float64_axis("v", [0.1, 0.3]) .add_string_axis("name", ["Anne", "Lynda"]) - .add_int64_axis("elements", [2**pow2 for pow2 in range(20, 23)]) - ) - ( - nvbench.register(add_three).add_int64_axis( - "elements", [2**pow2 for pow2 in range(20, 22)] - ) + .add_int64_power_of_two_axis("elements", range(20, 23)) ) + (nvbench.register(add_three).add_int64_power_of_two_axis("elements", range(20, 22))) + nvbench.register(skipit) if __name__ == "__main__": From f1fbfd85b41c4b252627ae316f5b7d0e432f3b70 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 31 Jul 2025 16:27:54 -0500 Subject: [PATCH 67/78] Renamed src/README.md to src/.BUILD_LOCALLY.md Provided more context to the command stated in the readme, and changed so as to not hard-code installation paths of NVBench, and checkout path of pybind11. --- python/src/.BUILD_LOCALLY.md | 24 ++++++++++++++++++++++++ python/src/README.md | 17 ----------------- 2 files changed, 24 insertions(+), 17 deletions(-) create mode 100644 python/src/.BUILD_LOCALLY.md delete mode 100644 python/src/README.md diff --git a/python/src/.BUILD_LOCALLY.md b/python/src/.BUILD_LOCALLY.md new file mode 100644 index 0000000..70ad23e --- /dev/null +++ b/python/src/.BUILD_LOCALLY.md @@ -0,0 +1,24 @@ +# Building extension module locally + +This may be useful for debugging purposes. 
+Assuming NVBench is installed into `NVBENCH_PREFIX`, +and pybind11 repo is cloned to `PYBIND11_DIR`, +Python extension can be built locally, using host compiler, +as follows: + +```bash +g++ py_nvbench.cpp \ + -shared -fPIC \ + -I ${PYBIND11_DIR}/include \ + -I ${NVBENCH_PREFIX}/include \ + -I /usr/local/cuda/include \ + $(python3-config --includes) \ + $(python3-config --libs) \ + -L ${NVBENCH_PREFIX}/lib/ \ + -lnvbench \ + -Wl,-rpath,${NVBENCH_PREFIX}/lib \ + -L /usr/local/cuda/lib64/ \ + -lcudart \ + -Wl,-rpath,/usr/local/cuda/lib64 \ + -o _nvbench$(python3-config --extension-suffix) +``` diff --git a/python/src/README.md b/python/src/README.md deleted file mode 100644 index af4f613..0000000 --- a/python/src/README.md +++ /dev/null @@ -1,17 +0,0 @@ - -``` -g++ py_nvbench.cpp \ - -shared -fPIC \ - -I ${HOME}/repos/pybind11/include \ - -I ${HOME}/repos/pynvbench/nvbench_dir/include \ - -I /usr/local/cuda/include \ - $(python3-config --includes) \ - $(python3-config --libs) \ - -L ${HOME}/repos/pynvbench/nvbench_dir/lib/ \ - -lnvbench \ - -Wl,-rpath,${HOME}/repos/pynvbench/nvbench_dir/lib \ - -L /usr/local/cuda/lib64/ \ - -lcudart \ - -Wl,-rpath,/usr/local/cuda/lib64 \ - -o _nvbench$(python3-config --extension-suffix) -``` From 3fea652d16020de148c7537c6c75b50f86030210 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 1 Aug 2025 15:03:06 -0500 Subject: [PATCH 68/78] Fix type in stub declaration for Benchmark.add_string_axis --- python/cuda/nvbench/__init__.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 42169ec..80a6dee 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -82,7 +82,7 @@ class Benchmark: def add_float64_axis(self, name: str, values: Sequence[SupportsFloat]) -> Self: "Add floating-point type parameter axis with given name and values to sweep over" ... 
- def add_string_axis(sef, name: str, values: Sequence[str]) -> Self: + def add_string_axis(self, name: str, values: Sequence[str]) -> Self: "Add string type parameter axis with given name and values to sweep over" ... def set_name(self, name: str) -> Self: From 4fc628c4d7b6148ebd5cb9c800ea3fb337d602c0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 1 Aug 2025 15:33:39 -0500 Subject: [PATCH 69/78] Python native extension to use CXX/CUDA standard of NVBench library This fixes cryptic build failure with GNU compiler 14 --- python/CMakeLists.txt | 1 - python/src/py_nvbench.cpp | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index abfc59a..139b80b 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -32,7 +32,6 @@ target_link_libraries(_nvbench PRIVATE CUDA::cudart_static) set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) -set_target_properties(_nvbench PROPERTIES CXX_STANDARD 20) install(TARGETS _nvbench DESTINATION cuda/nvbench) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index be2a384..89fb718 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -102,7 +102,7 @@ private: class nvbench_run_error : std::runtime_error {}; -constinit py::handle benchmark_exc{}; +py::handle benchmark_exc{}; class GlobalBenchmarkRegistry { @@ -215,7 +215,7 @@ py::dict py_get_axis_values(const nvbench::state &state) } // essentially a global variable, but allocated on the heap during module initialization -constinit std::unique_ptr global_registry{}; +std::unique_ptr global_registry{}; } // end of anonymous namespace From 40a2337a6ba6ff2c5fd46386ec84432d74347948 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk 
<21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 Aug 2025 09:40:17 -0500 Subject: [PATCH 70/78] Review fix: make nvbench_run_error constructible Allow `throw nvbench_run_error("Msg");` to compile. Add comment around definition of nvbench_run_error --- python/src/py_nvbench.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 89fb718..30b5dd0 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -100,8 +100,13 @@ private: std::shared_ptr m_fn; }; -class nvbench_run_error : std::runtime_error -{}; +// Use struct to ensure public inheritance +struct nvbench_run_error : std::runtime_error +{ + // ask compiler to generate all constructor signatures + // that are defined for the base class + using std::runtime_error::runtime_error; +}; py::handle benchmark_exc{}; class GlobalBenchmarkRegistry From a5e0a48f80a3019314bd3edf9e0d5a5057ad9686 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 Aug 2025 10:04:20 -0500 Subject: [PATCH 71/78] Add test functions for cpp/python exceptions --- python/src/py_nvbench.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 30b5dd0..22cfbc6 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -725,4 +725,10 @@ PYBIND11_MODULE(_nvbench, m) }, "Run all registered benchmarks", py::arg("argv") = py::list()); + + m.def("test_cpp_exception", []() { throw nvbench_run_error("Test"); }); + m.def("test_py_exception", []() { + py::set_error(benchmark_exc, "Test"); + throw py::error_already_set(); + }); } From 73e18419b2f6f458c2867ee3607b4931288f08e6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 Aug 2025 10:11:33 -0500 Subject: [PATCH 72/78] Stub of __cuda_stream__ special method declare tuple[int, int] as
return type This is to indicate that special method always returns a pair of integers --- python/cuda/nvbench/__init__.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 80a6dee..6fea984 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -35,7 +35,7 @@ class CudaStream: ---- The class is not directly constructible. """ - def __cuda_stream__(self) -> tuple[int]: + def __cuda_stream__(self) -> tuple[int, int]: """ Special method implement CUDA stream protocol from `cuda.core`. Returns a pair of integers: From 6aff4712f8f62ccf8925e9feef3bb1a0277e8c29 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 Aug 2025 10:13:08 -0500 Subject: [PATCH 73/78] Change permissions of test/run_1.py --- python/test/run_1.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 python/test/run_1.py diff --git a/python/test/run_1.py b/python/test/run_1.py old mode 100755 new mode 100644 From 9dfdd8af8968945946dbff4bc2cffcbda6b3ef07 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 Aug 2025 11:59:17 -0500 Subject: [PATCH 74/78] Minimal test file --- python/test/test_nvbench.py | 39 +++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 python/test/test_nvbench.py diff --git a/python/test/test_nvbench.py b/python/test/test_nvbench.py new file mode 100644 index 0000000..d03e263 --- /dev/null +++ b/python/test/test_nvbench.py @@ -0,0 +1,39 @@ +import json + +import cuda.nvbench as nvbench +import pytest + + +def test_cpp_exception(): + with pytest.raises(RuntimeError, match="Test"): + nvbench._nvbench.test_cpp_exception() + + +def test_py_exception(): + with pytest.raises(nvbench.NVBenchRuntimeError, match="Test"): + nvbench._nvbench.test_py_exception() + + 
+@pytest.mark.parametrize( + "cls", [nvbench.CudaStream, nvbench.State, nvbench.Launch, nvbench.Benchmark] +) +def test_api_ctor(cls): + with pytest.raises(TypeError, match="No constructor defined!"): + cls() + + +def t_bench(state: nvbench.State): + s = {"a": 1, "b": 0.5, "c": "test", "d": {"a": 1}} + + def launcher(launch: nvbench.Launch): + for _ in range(10000): + _ = json.dumps(s) + + state.exec(launcher) + + +def test_cpu_only(): + b = nvbench.register(t_bench) + b.set_is_cpu_only(True) + + nvbench.run_all_benchmarks(["-q", "--profile"]) From d8b0acc8d4fd3f893fee4b3800b5a4e561f97867 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 Aug 2025 12:00:42 -0500 Subject: [PATCH 75/78] Export exception to nvbench namespace --- python/cuda/nvbench/__init__.py | 3 +++ python/cuda/nvbench/__init__.pyi | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cuda/nvbench/__init__.py b/python/cuda/nvbench/__init__.py index 993ff05..4617e52 100644 --- a/python/cuda/nvbench/__init__.py +++ b/python/cuda/nvbench/__init__.py @@ -43,6 +43,9 @@ from cuda.nvbench._nvbench import ( # noqa: E402 from cuda.nvbench._nvbench import ( # noqa: E402 Launch as Launch, ) +from cuda.nvbench._nvbench import ( # noqa: E402 + NVBenchRuntimeError as NVBenchRuntimeError, +) from cuda.nvbench._nvbench import ( # noqa: E402 State as State, ) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/nvbench/__init__.pyi index 6fea984..a0bca3d 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/nvbench/__init__.pyi @@ -33,7 +33,7 @@ class CudaStream: Note ---- - The class is not directly constructible. + The class is not user-constructible. """ def __cuda_stream__(self) -> tuple[int, int]: """ @@ -65,6 +65,7 @@ class Benchmark: Note ---- The class is not user-constructible. + Use `~register` function to create Benchmark and register it with NVBench. 
""" From 584f48ac977475a66a2b0d285a4b58f4038c355e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 Aug 2025 12:14:44 -0500 Subject: [PATCH 76/78] Remove warm-up invocations outside of launcher in examples/throughout and auto_throughput --- python/examples/auto_throughput.py | 7 ------- python/examples/throughput.py | 7 ------- 2 files changed, 14 deletions(-) diff --git a/python/examples/auto_throughput.py b/python/examples/auto_throughput.py index 1b6e663..88691ec 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -58,13 +58,6 @@ def throughput_bench(state: nvbench.State) -> None: krn = make_throughput_kernel(ipt) - # warm-up call ensures that kernel is loaded into context - # before blocking kernel is launched. Kernel loading may cause - # a synchronization to occur. - krn[blocks_in_grid, threads_per_block, alloc_stream, 0]( - stride, elements, inp_arr, out_arr - ) - def launcher(launch: nvbench.Launch): exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( diff --git a/python/examples/throughput.py b/python/examples/throughput.py index 5984126..890c372 100644 --- a/python/examples/throughput.py +++ b/python/examples/throughput.py @@ -59,13 +59,6 @@ def throughput_bench(state: nvbench.State) -> None: krn = make_throughput_kernel(ipt) - # warm-up call ensures that kernel is loaded into context - # before blocking kernel is launched. Kernel loading may - # cause synchronization to occur. 
- krn[blocks_in_grid, threads_per_block, alloc_stream, 0]( - stride, elements, inp_arr, out_arr - ) - def launcher(launch: nvbench.Launch): exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( From c2a2acc9b666a6cf69678461090574fb160a82fd Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 Aug 2025 12:14:52 -0500 Subject: [PATCH 77/78] Change float64_t arg-type for set_throttle_threshold to float32_t The C++ method signature of set_throttle_threshold/set_trottle_recovery_delay, which uses nvbench::float32_t --- python/src/py_nvbench.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 22cfbc6..8856e8e 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -376,7 +376,7 @@ PYBIND11_MODULE(_nvbench, m) py::arg("duration_seconds")); py_benchmark_cls.def( "set_throttle_threshold", - [](nvbench::benchmark_base &self, nvbench::float64_t threshold) { + [](nvbench::benchmark_base &self, nvbench::float32_t threshold) { self.set_throttle_threshold(threshold); return std::ref(self); }, @@ -384,7 +384,7 @@ PYBIND11_MODULE(_nvbench, m) py::arg("threshold")); py_benchmark_cls.def( "set_throttle_recovery_delay", - [](nvbench::benchmark_base &self, nvbench::float64_t delay) { + [](nvbench::benchmark_base &self, nvbench::float32_t delay) { self.set_throttle_recovery_delay(delay); return std::ref(self); }, From b5e4b4ba31bae000e0bdaa4fc09013992ab8078f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 4 Aug 2025 13:42:43 -0500 Subject: [PATCH 78/78] cuda.nvbench -> cuda.bench Per PR review suggestion: - `cuda.parallel` - device-wide algorithms/Thrust - `cuda.cooperative` - Cooperative algorithsm/CUB - `cuda.bench` - Benchmarking/NVBench --- python/CMakeLists.txt | 4 +- python/cuda/{nvbench => bench}/__init__.py 
| 14 +++---- python/cuda/{nvbench => bench}/__init__.pyi | 4 +- python/cuda/{nvbench => bench}/py.typed | 0 python/examples/auto_throughput.py | 12 +++--- python/examples/axes.py | 34 ++++++++-------- .../examples/cccl_cooperative_block_reduce.py | 16 +++----- .../cccl_parallel_segmented_reduce.py | 16 ++++---- python/examples/cpu_activity.py | 18 ++++----- python/examples/cupy_extract.py | 12 +++--- python/examples/cutlass_gemm.py | 14 +++---- python/examples/exec_tag_sync.py | 12 +++--- python/examples/pytorch_bench.py | 12 +++--- python/examples/skip.py | 12 +++--- python/examples/throughput.py | 12 +++--- python/pyproject.toml | 2 +- python/test/run_1.py | 24 +++++------ python/test/stubs.py | 40 +++++++++---------- python/test/test_nvbench.py | 18 ++++----- 19 files changed, 136 insertions(+), 140 deletions(-) rename python/cuda/{nvbench => bench}/__init__.py (80%) rename python/cuda/{nvbench => bench}/__init__.pyi (99%) rename python/cuda/{nvbench => bench}/py.typed (100%) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 139b80b..b18f7ef 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -33,9 +33,9 @@ set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) -install(TARGETS _nvbench DESTINATION cuda/nvbench) +install(TARGETS _nvbench DESTINATION cuda/bench) # Determine target that nvbench::nvbench is an alias of, # necessary because ALIAS targets cannot be installed get_target_property(_aliased_target_name nvbench::nvbench ALIASED_TARGET) -install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/nvbench) +install(IMPORTED_RUNTIME_ARTIFACTS ${_aliased_target_name} DESTINATION cuda/bench) diff --git a/python/cuda/nvbench/__init__.py b/python/cuda/bench/__init__.py similarity index 80% rename from python/cuda/nvbench/__init__.py rename to 
python/cuda/bench/__init__.py index 4617e52..e1d2282 100644 --- a/python/cuda/nvbench/__init__.py +++ b/python/cuda/bench/__init__.py @@ -34,25 +34,25 @@ except Exception as e: for libname in ("cupti", "nvperf_target", "nvperf_host"): load_nvidia_dynamic_lib(libname) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 Benchmark as Benchmark, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 CudaStream as CudaStream, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 Launch as Launch, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 NVBenchRuntimeError as NVBenchRuntimeError, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 State as State, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 register as register, ) -from cuda.nvbench._nvbench import ( # noqa: E402 +from cuda.bench._nvbench import ( # noqa: E402 run_all_benchmarks as run_all_benchmarks, ) diff --git a/python/cuda/nvbench/__init__.pyi b/python/cuda/bench/__init__.pyi similarity index 99% rename from python/cuda/nvbench/__init__.pyi rename to python/cuda/bench/__init__.pyi index a0bca3d..86681fc 100644 --- a/python/cuda/nvbench/__init__.pyi +++ b/python/cuda/bench/__init__.pyi @@ -44,9 +44,9 @@ class CudaStream: Example ------- import cuda.core.experimental as core - import cuda.nvbench as nvbench + import cuda.bench as bench - def bench(state: nvbench.State): + def bench(state: bench.State): dev = core.Device(state.get_device()) dev.set_current() # converts CudaString to core.Stream diff --git a/python/cuda/nvbench/py.typed b/python/cuda/bench/py.typed similarity index 100% rename from python/cuda/nvbench/py.typed rename to python/cuda/bench/py.typed diff --git a/python/examples/auto_throughput.py 
b/python/examples/auto_throughput.py index 88691ec..db4fa19 100644 --- a/python/examples/auto_throughput.py +++ b/python/examples/auto_throughput.py @@ -16,12 +16,12 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import numpy as np from numba import cuda -def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) @@ -39,7 +39,7 @@ def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatc return kernel -def throughput_bench(state: nvbench.State) -> None: +def throughput_bench(state: bench.State) -> None: stride = state.get_int64("Stride") ipt = state.get_int64("ItemsPerThread") @@ -58,7 +58,7 @@ def throughput_bench(state: nvbench.State) -> None: krn = make_throughput_kernel(ipt) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr @@ -68,8 +68,8 @@ def throughput_bench(state: nvbench.State) -> None: if __name__ == "__main__": - b = nvbench.register(throughput_bench) + b = bench.register(throughput_bench) b.add_int64_axis("Stride", [1, 2, 4]) b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/axes.py b/python/examples/axes.py index e07606f..ce67238 100644 --- a/python/examples/axes.py +++ b/python/examples/axes.py @@ -18,12 +18,12 @@ import ctypes import sys from typing import Dict, Optional, Tuple +import cuda.bench as bench import cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) @@ -58,34 +58,34 @@ __global__ 
void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") -def simple(state: nvbench.State): +def simple(state: bench.State): state.set_min_samples(1000) sleep_dur = 1e-3 krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) -def single_float64_axis(state: nvbench.State): +def single_float64_axis(state: bench.State): # get axis value, or default default_sleep_dur = 3.14e-4 sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur) krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) -def default_value(state: nvbench.State): +def default_value(state: bench.State): single_float64_axis(state) @@ -120,7 +120,7 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n) return mod.get_kernel(instance_name) -def copy_sweep_grid_shape(state: nvbench.State): +def copy_sweep_grid_shape(state: bench.State): block_size = state.get_int64("BlockSize") num_blocks = state.get_int64("NumBlocks") @@ -140,14 +140,14 @@ def copy_sweep_grid_shape(state: nvbench.State): krn = make_copy_kernel() launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) state.exec(launcher) -def copy_type_sweep(state: nvbench.State): +def copy_type_sweep(state: bench.State): type_id = state.get_int64("TypeID") types_map: Dict[int, Tuple[type, str]] = { @@ -178,7 +178,7 @@ def copy_type_sweep(state: nvbench.State): 
krn = make_copy_kernel(value_cuda_t, value_cuda_t) launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) @@ -187,20 +187,20 @@ def copy_type_sweep(state: nvbench.State): if __name__ == "__main__": # Benchmark without axes - nvbench.register(simple) + bench.register(simple) # benchmark with no axes, that uses default value - nvbench.register(default_value) + bench.register(default_value) # specify axis - nvbench.register(single_float64_axis).add_float64_axis( + bench.register(single_float64_axis).add_float64_axis( "Duration (s)", [7e-5, 1e-4, 5e-4] ) - copy1_bench = nvbench.register(copy_sweep_grid_shape) + copy1_bench = bench.register(copy_sweep_grid_shape) copy1_bench.add_int64_axis("BlockSize", [2**x for x in range(6, 10, 2)]) copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)]) - copy2_bench = nvbench.register(copy_type_sweep) + copy2_bench = bench.register(copy_type_sweep) copy2_bench.add_int64_axis("TypeID", range(0, 6)) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cccl_cooperative_block_reduce.py b/python/examples/cccl_cooperative_block_reduce.py index dc9a6eb..ea5bcf0 100644 --- a/python/examples/cccl_cooperative_block_reduce.py +++ b/python/examples/cccl_cooperative_block_reduce.py @@ -16,8 +16,8 @@ import sys +import cuda.bench as bench import cuda.cccl.cooperative.experimental as coop -import cuda.nvbench as nvbench import numba import numpy as np from numba import cuda @@ -45,11 +45,11 @@ class BitsetRing: return op1 & op2 -def as_cuda_Stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_Stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) -def multi_block_bench(state: nvbench.State): +def 
multi_block_bench(state: bench.State): threads_per_block = state.get_int64("ThreadsPerBlock") num_blocks = state.get_int64("NumBlocks") total_elements = threads_per_block * num_blocks @@ -78,15 +78,11 @@ def multi_block_bench(state: nvbench.State): d_inp = cuda.to_device(h_inp) d_out = cuda.device_array(num_blocks, dtype=ring.dt) - cuda_s = as_cuda_Stream(state.get_stream()) - # warmup - kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out) - state.add_element_count(total_elements) state.add_global_memory_reads(total_elements * h_inp.itemsize) state.add_global_memory_writes(num_blocks * h_inp.itemsize) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): cuda_s = as_cuda_Stream(launch.get_stream()) kernel[num_blocks, threads_per_block, cuda_s, 0](d_inp, d_out) @@ -96,8 +92,8 @@ def multi_block_bench(state: nvbench.State): if __name__ == "__main__": patch.patch_numba_linker(lto=True) - b = nvbench.register(multi_block_bench) + b = bench.register(multi_block_bench) b.add_int64_axis("ThreadsPerBlock", [64, 128, 192, 256]) b.add_int64_power_of_two_axis("NumBlocks", [10, 11, 12, 14, 16]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cccl_parallel_segmented_reduce.py b/python/examples/cccl_parallel_segmented_reduce.py index 0f440e3..e54a77b 100644 --- a/python/examples/cccl_parallel_segmented_reduce.py +++ b/python/examples/cccl_parallel_segmented_reduce.py @@ -16,10 +16,10 @@ import sys +import cuda.bench as bench import cuda.cccl.parallel.experimental.algorithms as algorithms import cuda.cccl.parallel.experimental.iterators as iterators import cuda.core.experimental as core -import cuda.nvbench as nvbench import cupy as cp import numpy as np @@ -34,22 +34,22 @@ class CCCLStream: return (0, self._ptr) -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) -def 
as_cccl_Stream(cs: nvbench.CudaStream) -> CCCLStream: +def as_cccl_Stream(cs: bench.CudaStream) -> CCCLStream: return CCCLStream(cs.addressof()) def as_cp_ExternalStream( - cs: nvbench.CudaStream, dev_id: int | None = -1 + cs: bench.CudaStream, dev_id: int | None = -1 ) -> cp.cuda.ExternalStream: h = cs.addressof() return cp.cuda.ExternalStream(h, dev_id) -def segmented_reduce(state: nvbench.State): +def segmented_reduce(state: bench.State): "Benchmark segmented_reduce example" n_elems = state.get_int64("numElems") n_cols = state.get_int64("numCols") @@ -100,7 +100,7 @@ def segmented_reduce(state: nvbench.State): with cp_stream: temp_storage = cp.empty(temp_nbytes, dtype=cp.uint8) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_cccl_Stream(launch.get_stream()) alg( temp_storage, @@ -117,8 +117,8 @@ def segmented_reduce(state: nvbench.State): if __name__ == "__main__": - b = nvbench.register(segmented_reduce) + b = bench.register(segmented_reduce) b.add_int64_axis("numElems", [2**20, 2**22, 2**24]) b.add_int64_axis("numCols", [1024, 2048, 4096, 8192]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cpu_activity.py b/python/examples/cpu_activity.py index 16f70cc..a492ff7 100644 --- a/python/examples/cpu_activity.py +++ b/python/examples/cpu_activity.py @@ -17,21 +17,21 @@ import sys import time +import cuda.bench as bench import cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench host_sleep_duration = 0.1 -def cpu_only_sleep_bench(state: nvbench.State) -> None: - def launcher(launch: nvbench.Launch): +def cpu_only_sleep_bench(state: bench.State) -> None: + def launcher(launch: bench.Launch): time.sleep(host_sleep_duration) state.exec(launcher) -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) @@ -66,7 +66,7 @@ 
__global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") -def mixed_sleep_bench(state: nvbench.State) -> None: +def mixed_sleep_bench(state: bench.State) -> None: sync = state.get_string("Sync") sync_flag = sync == "Do sync" @@ -74,7 +74,7 @@ def mixed_sleep_bench(state: nvbench.State) -> None: krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): # host overhead time.sleep(host_sleep_duration) # GPU computation @@ -87,11 +87,11 @@ def mixed_sleep_bench(state: nvbench.State) -> None: if __name__ == "__main__": # time function only doing work (sleeping) on the host # using CPU timer only - b = nvbench.register(cpu_only_sleep_bench) + b = bench.register(cpu_only_sleep_bench) b.set_is_cpu_only(True) # time the function that does work on both GPU and CPU - b2 = nvbench.register(mixed_sleep_bench) + b2 = bench.register(mixed_sleep_bench) b2.add_string_axis("Sync", ["Do not sync", "Do sync"]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cupy_extract.py b/python/examples/cupy_extract.py index 16e5d9f..091141c 100644 --- a/python/examples/cupy_extract.py +++ b/python/examples/cupy_extract.py @@ -16,18 +16,18 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import cupy as cp def as_cp_ExternalStream( - cs: nvbench.CudaStream, dev_id: int | None = -1 + cs: bench.CudaStream, dev_id: int | None = -1 ) -> cp.cuda.ExternalStream: h = cs.addressof() return cp.cuda.ExternalStream(h, dev_id) -def cupy_extract_by_mask(state: nvbench.State): +def cupy_extract_by_mask(state: bench.State): n_cols = state.get_int64("numCols") n_rows = state.get_int64("numRows") @@ -48,7 +48,7 @@ def cupy_extract_by_mask(state: nvbench.State): mask = cp.ones((n_cols, n_rows), dtype=bool_dt) _ = X[mask] - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): 
with as_cp_ExternalStream(launch.get_stream(), dev_id): _ = X[mask] @@ -56,8 +56,8 @@ def cupy_extract_by_mask(state: nvbench.State): if __name__ == "__main__": - b = nvbench.register(cupy_extract_by_mask) + b = bench.register(cupy_extract_by_mask) b.add_int64_axis("numCols", [1024, 2048, 4096, 2 * 4096]) b.add_int64_axis("numRows", [1024, 2048, 4096, 2 * 4096]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/cutlass_gemm.py b/python/examples/cutlass_gemm.py index 154bc16..cd62f39 100644 --- a/python/examples/cutlass_gemm.py +++ b/python/examples/cutlass_gemm.py @@ -17,19 +17,19 @@ import sys +import cuda.bench as bench import cuda.bindings.driver as driver import cuda.core.experimental as core -import cuda.nvbench as nvbench import cupy as cp import cutlass import numpy as np -def as_bindings_Stream(cs: nvbench.CudaStream) -> driver.CUstream: +def as_bindings_Stream(cs: bench.CudaStream) -> driver.CUstream: return driver.CUstream(cs.addressof()) -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) @@ -47,7 +47,7 @@ def make_cp_array( ) -def cutlass_gemm(state: nvbench.State) -> None: +def cutlass_gemm(state: bench.State) -> None: n = state.get_int64("N") r = state.get_int64("R") @@ -96,7 +96,7 @@ def cutlass_gemm(state: nvbench.State) -> None: # warm-up to ensure compilation is not timed plan.run(stream=s) - def launcher(launch: nvbench.Launch) -> None: + def launcher(launch: bench.Launch) -> None: s = as_bindings_Stream(launch.get_stream()) plan.run(stream=s, sync=False) @@ -104,10 +104,10 @@ def cutlass_gemm(state: nvbench.State) -> None: if __name__ == "__main__": - gemm_b = nvbench.register(cutlass_gemm) + gemm_b = bench.register(cutlass_gemm) gemm_b.add_int64_axis("R", [16, 64, 256]) gemm_b.add_int64_axis("N", [256, 512, 1024, 2048]) gemm_b.add_float64_axis("alpha", [1e-2]) - 
nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/exec_tag_sync.py b/python/examples/exec_tag_sync.py index 8d0789a..b9ab5ef 100644 --- a/python/examples/exec_tag_sync.py +++ b/python/examples/exec_tag_sync.py @@ -18,12 +18,12 @@ import ctypes import sys from typing import Optional +import cuda.bench as bench import cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: "Create view of native stream used by NVBench" return core.Stream.from_handle(cs.addressof()) @@ -57,7 +57,7 @@ __global__ void fill_kernel(T *buf, T v, ::cuda::std::size_t n) return mod.get_kernel(instance_name) -def synchronizing_bench(state: nvbench.State): +def synchronizing_bench(state: bench.State): n_values = 64 * 1024 * 1024 n_bytes = n_values * ctypes.sizeof(ctypes.c_int32(0)) @@ -70,7 +70,7 @@ def synchronizing_bench(state: nvbench.State): krn = make_fill_kernel() launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, buffer, 0, n_values) s.sync() @@ -81,5 +81,5 @@ def synchronizing_bench(state: nvbench.State): if __name__ == "__main__": - nvbench.register(synchronizing_bench) - nvbench.run_all_benchmarks(sys.argv) + bench.register(synchronizing_bench) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/pytorch_bench.py b/python/examples/pytorch_bench.py index f62a7a5..f49a543 100644 --- a/python/examples/pytorch_bench.py +++ b/python/examples/pytorch_bench.py @@ -1,18 +1,18 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import torch def as_torch_cuda_Stream( - cs: nvbench.CudaStream, dev: int | None + cs: bench.CudaStream, dev: int | None ) -> torch.cuda.ExternalStream: 
return torch.cuda.ExternalStream( stream_ptr=cs.addressof(), device=torch.cuda.device(dev) ) -def torch_bench(state: nvbench.State) -> None: +def torch_bench(state: bench.State) -> None: state.set_throttle_threshold(0.25) dev_id = state.get_device() @@ -31,7 +31,7 @@ def torch_bench(state: nvbench.State) -> None: learning_rate = 1e-4 - def launcher(launch: nvbench.Launch) -> None: + def launcher(launch: bench.Launch) -> None: tc_s = as_torch_cuda_Stream(launch.get_stream(), dev_id) with torch.cuda.stream(tc_s): x2 = torch.square(x) @@ -53,6 +53,6 @@ def torch_bench(state: nvbench.State) -> None: if __name__ == "__main__": - nvbench.register(torch_bench) + bench.register(torch_bench) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/skip.py b/python/examples/skip.py index a5555d0..cf7ec90 100644 --- a/python/examples/skip.py +++ b/python/examples/skip.py @@ -16,12 +16,12 @@ import sys +import cuda.bench as bench import cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: "Create view into native stream provided by NVBench" return core.Stream.from_handle(cs.addressof()) @@ -57,7 +57,7 @@ __global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") -def runtime_skip(state: nvbench.State): +def runtime_skip(state: bench.State): duration = state.get_float64("Duration") kramble = state.get_string("Kramble") @@ -74,7 +74,7 @@ def runtime_skip(state: nvbench.State): krn = make_sleep_kernel() launch_cfg = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_cfg, krn, duration) @@ -82,8 +82,8 @@ def runtime_skip(state: nvbench.State): if __name__ == "__main__": - b = nvbench.register(runtime_skip) + b = 
bench.register(runtime_skip) b.add_float64_axis("Duration", [1e-4 + k * 0.25e-3 for k in range(5)]) b.add_string_axis("Kramble", ["Foo", "Bar", "Baz"]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/throughput.py b/python/examples/throughput.py index 890c372..ff02bd3 100644 --- a/python/examples/throughput.py +++ b/python/examples/throughput.py @@ -16,12 +16,12 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import numpy as np from numba import cuda -def as_cuda_stream(cs: nvbench.CudaStream) -> cuda.cudadrv.driver.Stream: +def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: return cuda.external_stream(cs.addressof()) @@ -39,7 +39,7 @@ def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatc return kernel -def throughput_bench(state: nvbench.State) -> None: +def throughput_bench(state: bench.State) -> None: stride = state.get_int64("Stride") ipt = state.get_int64("ItemsPerThread") @@ -59,7 +59,7 @@ def throughput_bench(state: nvbench.State) -> None: krn = make_throughput_kernel(ipt) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): exec_stream = as_cuda_stream(launch.get_stream()) krn[blocks_in_grid, threads_per_block, exec_stream, 0]( stride, elements, inp_arr, out_arr @@ -69,8 +69,8 @@ def throughput_bench(state: nvbench.State) -> None: if __name__ == "__main__": - b = nvbench.register(throughput_bench) + b = bench.register(throughput_bench) b.add_int64_axis("Stride", [1, 2, 4]) b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/pyproject.toml b/python/pyproject.toml index 22adc77..8466f64 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -56,4 +56,4 @@ root = ".." 
[tool.scikit-build.wheel.packages] "cuda" = "cuda" -"cuda/nvbench" = "cuda/nvbench" +"cuda/bench" = "cuda/bench" diff --git a/python/test/run_1.py b/python/test/run_1.py index dfa38f4..fbc6de0 100644 --- a/python/test/run_1.py +++ b/python/test/run_1.py @@ -1,6 +1,6 @@ import sys -import cuda.nvbench as nvbench +import cuda.bench as bench import numpy as np from numba import cuda @@ -14,15 +14,15 @@ def kernel(a, b, c): c[tid] = a[tid] + b[tid] -def get_numba_stream(launch: nvbench.Launch): +def get_numba_stream(launch: bench.Launch): return cuda.external_stream(launch.get_stream().addressof()) -def skipit(state: nvbench.State) -> None: +def skipit(state: bench.State) -> None: state.skip("Skipping this benchmark for no reason") -def add_two(state: nvbench.State): +def add_two(state: bench.State): N = state.get_int64("elements") a = cuda.to_device(np.random.random(N)) c = cuda.device_array_like(a) @@ -47,7 +47,7 @@ def add_two(state: nvbench.State): state.exec(kernel_launcher, batched=True, sync=True) -def add_float(state: nvbench.State): +def add_float(state: bench.State): N = state.get_int64("elements") v = state.get_float64("v") name = state.get_string("name") @@ -78,7 +78,7 @@ def add_float(state: nvbench.State): state.exec(kernel_launcher, batched=True, sync=True) -def add_three(state: nvbench.State): +def add_three(state: bench.State): N = state.get_int64("elements") a = cuda.to_device(np.random.random(N).astype(np.float32)) b = cuda.to_device(np.random.random(N).astype(np.float32)) @@ -100,20 +100,20 @@ def add_three(state: nvbench.State): def register_benchmarks(): ( - nvbench.register(add_two).add_int64_axis( - "elements", [2**pow2 for pow2 in range(20, 23)] + bench.register(add_two).add_int64_axis( + "elements", [2**pow2 - 1 for pow2 in range(20, 23)] ) ) ( - nvbench.register(add_float) + bench.register(add_float) .add_float64_axis("v", [0.1, 0.3]) .add_string_axis("name", ["Anne", "Lynda"]) .add_int64_power_of_two_axis("elements", range(20, 23)) ) - 
(nvbench.register(add_three).add_int64_power_of_two_axis("elements", range(20, 22))) - nvbench.register(skipit) + bench.register(add_three).add_int64_power_of_two_axis("elements", range(20, 22)) + bench.register(skipit) if __name__ == "__main__": register_benchmarks() - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/test/stubs.py b/python/test/stubs.py index f3f4ee2..0d09a58 100644 --- a/python/test/stubs.py +++ b/python/test/stubs.py @@ -18,12 +18,12 @@ import ctypes import sys from typing import Dict, Optional, Tuple +import cuda.bench as bench import cuda.cccl.headers as headers import cuda.core.experimental as core -import cuda.nvbench as nvbench -def as_core_Stream(cs: nvbench.CudaStream) -> core.Stream: +def as_core_Stream(cs: bench.CudaStream) -> core.Stream: return core.Stream.from_handle(cs.addressof()) @@ -58,7 +58,7 @@ __global__ void sleep_kernel(double seconds) { return mod.get_kernel("sleep_kernel") -def no_axes(state: nvbench.State): +def no_axes(state: bench.State): state.set_min_samples(1000) sleep_dur = 1e-3 krn = make_sleep_kernel() @@ -66,14 +66,14 @@ def no_axes(state: nvbench.State): print(f"Stopping criterion used: {state.get_stopping_criterion()}") - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) -def tags(state: nvbench.State): +def tags(state: bench.State): state.set_min_samples(1000) sleep_dur = 1e-3 krn = make_sleep_kernel() @@ -82,28 +82,28 @@ def tags(state: nvbench.State): sync_flag = bool(state.get_int64("Sync")) batched_flag = bool(state.get_int64("Batched")) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher, sync=sync_flag, batched=batched_flag) -def single_float64_axis(state: nvbench.State): +def 
single_float64_axis(state: bench.State): # get axis value, or default default_sleep_dur = 3.14e-4 sleep_dur = state.get_float64_or_default("Duration", default_sleep_dur) krn = make_sleep_kernel() launch_config = core.LaunchConfig(grid=1, block=1, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, sleep_dur) state.exec(launcher) -def default_value(state: nvbench.State): +def default_value(state: bench.State): single_float64_axis(state) @@ -138,7 +138,7 @@ __global__ void copy_kernel(const T *in, U *out, ::cuda::std::size_t n) return mod.get_kernel(instance_name) -def copy_sweep_grid_shape(state: nvbench.State): +def copy_sweep_grid_shape(state: bench.State): block_size = state.get_int64("BlockSize") num_blocks = state.get_int64("NumBlocks") @@ -158,14 +158,14 @@ def copy_sweep_grid_shape(state: nvbench.State): krn = make_copy_kernel() launch_config = core.LaunchConfig(grid=num_blocks, block=block_size, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) state.exec(launcher) -def copy_type_sweep(state: nvbench.State): +def copy_type_sweep(state: bench.State): type_id = state.get_int64("TypeID") types_map: Dict[int, Tuple[type, str]] = { @@ -196,7 +196,7 @@ def copy_type_sweep(state: nvbench.State): krn = make_copy_kernel(value_cuda_t, value_cuda_t) launch_config = core.LaunchConfig(grid=256, block=256, shmem_size=0) - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): s = as_core_Stream(launch.get_stream()) core.launch(s, launch_config, krn, input_buf, output_buf, num_values) @@ -205,20 +205,20 @@ def copy_type_sweep(state: nvbench.State): if __name__ == "__main__": # Benchmark without axes - simple_b = nvbench.register(no_axes) + simple_b = bench.register(no_axes) 
simple_b.set_stopping_criterion("entropy") simple_b.set_criterion_param_int64("unused_int", 100) - tags_b = nvbench.register(tags) + tags_b = bench.register(tags) tags_b.add_int64_axis("Sync", [0, 1]) tags_b.add_int64_axis("Batched", [0, 1]) # benchmark with no axes, that uses default value - default_b = nvbench.register(default_value) + default_b = bench.register(default_value) default_b.set_min_samples(7) # specify axis - axes_b = nvbench.register(single_float64_axis).add_float64_axis( + axes_b = bench.register(single_float64_axis).add_float64_axis( "Duration", [7e-5, 1e-4, 5e-4] ) axes_b.set_timeout(20) @@ -226,11 +226,11 @@ if __name__ == "__main__": axes_b.set_throttle_threshold(0.2) axes_b.set_throttle_recovery_delay(0.1) - copy1_bench = nvbench.register(copy_sweep_grid_shape) + copy1_bench = bench.register(copy_sweep_grid_shape) copy1_bench.add_int64_power_of_two_axis("BlockSize", range(6, 10, 2)) copy1_bench.add_int64_axis("NumBlocks", [2**x for x in range(6, 10, 2)]) - copy2_bench = nvbench.register(copy_type_sweep) + copy2_bench = bench.register(copy_type_sweep) copy2_bench.add_int64_axis("TypeID", range(0, 6)) - nvbench.run_all_benchmarks(sys.argv) + bench.run_all_benchmarks(sys.argv) diff --git a/python/test/test_nvbench.py b/python/test/test_nvbench.py index d03e263..5604a3f 100644 --- a/python/test/test_nvbench.py +++ b/python/test/test_nvbench.py @@ -1,31 +1,31 @@ import json -import cuda.nvbench as nvbench +import cuda.bench as bench import pytest def test_cpp_exception(): with pytest.raises(RuntimeError, match="Test"): - nvbench._nvbench.test_cpp_exception() + bench._nvbench.test_cpp_exception() def test_py_exception(): - with pytest.raises(nvbench.NVBenchRuntimeError, match="Test"): - nvbench._nvbench.test_py_exception() + with pytest.raises(bench.NVBenchRuntimeError, match="Test"): + bench._nvbench.test_py_exception() @pytest.mark.parametrize( - "cls", [nvbench.CudaStream, nvbench.State, nvbench.Launch, nvbench.Benchmark] + "cls", 
[bench.CudaStream, bench.State, bench.Launch, bench.Benchmark] ) def test_api_ctor(cls): with pytest.raises(TypeError, match="No constructor defined!"): cls() -def t_bench(state: nvbench.State): +def t_bench(state: bench.State): s = {"a": 1, "b": 0.5, "c": "test", "d": {"a": 1}} - def launcher(launch: nvbench.Launch): + def launcher(launch: bench.Launch): for _ in range(10000): _ = json.dumps(s) @@ -33,7 +33,7 @@ def t_bench(state: nvbench.State): def test_cpu_only(): - b = nvbench.register(t_bench) + b = bench.register(t_bench) b.set_is_cpu_only(True) - nvbench.run_all_benchmarks(["-q", "--profile"]) + bench.run_all_benchmarks(["-q", "--profile"])