Draft of Python API for NVBench

The prototype is based on pybind11 to minimize the boilerplate
code needed to deal with the move-only semantics of many nvbench
classes.
This commit is contained in:
Oleksandr Pavlyk
2025-06-30 14:30:15 -05:00
parent a9fb32e25d
commit 6552ef503c
8 changed files with 706 additions and 0 deletions

4
python/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
build
nvbench_build
nvbench_install
__pycache__

29
python/CMakeLists.txt Normal file
View File

@@ -0,0 +1,29 @@
# Build script for the pynvbench extension module (_nvbench).
cmake_minimum_required(VERSION 3.30...4.0)
# Project name is injected by scikit-build-core via SKBUILD_PROJECT_NAME.
project(${SKBUILD_PROJECT_NAME} LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
find_package(Python REQUIRED COMPONENTS Development.Module)
find_package(CUDAToolkit REQUIRED)
include(FetchContent)
# Fetch pybind11 unless an installed copy is found first (FIND_PACKAGE_ARGS).
FetchContent_Declare(
pybind11
URL https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.tar.gz
URL_HASH SHA256=e08cb87f4773da97fa7b5f035de8763abc656d87d5773e62f6da0587d1f0ec20
FIND_PACKAGE_ARGS NAMES pybind11
)
FetchContent_MakeAvailable(pybind11)
# A pre-built nvbench install is located via nvbench_DIR (see python/README.md).
find_package(nvbench CONFIG REQUIRED)
pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp)
target_link_libraries(_nvbench PUBLIC nvbench::nvbench)
target_link_libraries(_nvbench PRIVATE CUDA::cudart_static)
# $ORIGIN rpath: the nvbench shared library is installed next to the module below.
set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN")
install(TARGETS _nvbench DESTINATION cuda/nvbench)
install(IMPORTED_RUNTIME_ARTIFACTS nvbench::nvbench DESTINATION cuda/nvbench)

21
python/README.md Normal file
View File

@@ -0,0 +1,21 @@
# CUDA Kernel Benchmarking Package
This package provides a Python API for the CUDA kernel benchmarking library `NVBench`.
## Building
### Build `NVBench` project
```
cd nvbench/python
cmake -B nvbench_build --preset nvbench-ci -S $(pwd)/.. -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DNVBench_ENABLE_EXAMPLES=OFF -DCMAKE_INSTALL_PREFIX=$(pwd)/nvbench_install
cmake --build nvbench_build/ --config Release --target install
nvbench_DIR=$(pwd)/nvbench_install/lib/cmake CUDACXX=/usr/local/cuda/bin/nvcc pip install -e .
```
### Verify that package works
```
python test/run_1.py
```

View File

@@ -0,0 +1,18 @@
"""cuda.nvbench: Python bindings for the NVBench CUDA benchmarking library."""
import importlib.metadata

from cuda.bindings.path_finder import (  # type: ignore[import-not-found]
    _load_nvidia_dynamic_library,
)

try:
    __version__ = importlib.metadata.version("pynvbench")
except importlib.metadata.PackageNotFoundError:
    # Package metadata is unavailable (e.g. running from an uninstalled
    # source tree).  Only this specific failure should fall back; a bare
    # `except Exception` would mask unrelated bugs.
    __version__ = "0.0.0dev"

# The _nvbench extension links against CUPTI and the nvperf libraries;
# load them from the Python wheel layout BEFORE importing the extension,
# hence the deliberately late imports below (noqa: E402).
for libname in ("cupti", "nvperf_target", "nvperf_host"):
    _load_nvidia_dynamic_library(libname)

from ._nvbench import *  # noqa: E402, F403
from ._nvbench import register, run_all_benchmarks  # noqa: E402

__all__ = ["register", "run_all_benchmarks"]

58
python/pyproject.toml Normal file
View File

@@ -0,0 +1,58 @@
# Packaging metadata for pynvbench.
# scikit-build-core drives the CMake build (python/CMakeLists.txt);
# setuptools_scm derives the version from the git repository (root = "..").
[build-system]
requires = ["scikit-build-core>=0.10", "setuptools_scm"]
build-backend = "scikit_build_core.build"

[project]
name = "pynvbench"
description = "CUDA Kernel Benchmarking Package"
authors = [{ name = "NVIDIA Corporation" }]
classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Environment :: GPU :: NVIDIA CUDA",
"License :: OSI Approved :: Apache Software License",
]
requires-python = ">=3.9"
dependencies = [
# pathfinder
"cuda-bindings",
# Library expects to find shared libraries
# libcupti, libnvperf_target, libnvperf_host
# pathfinder is used to find it in the Python layout
"nvidia-cuda-cupti-cu12",
# The shared library
# libnvidia-ml must be installed system-wide
# (Debian package provider: libnvidia-compute)
]
dynamic = ["version"]
readme = { file = "README.md", content-type = "text/markdown" }

[project.optional-dependencies]
test = ["pytest", "cupy-cuda12x", "numba"]

[project.urls]
Homepage = "https://developer.nvidia.com/"

[tool.scikit-build]
minimum-version = "build-system.requires"
build-dir = "build/{wheel_tag}"

[tool.scikit-build.cmake]
version = ">=3.30.4"
args = []
build-type = "Release"
source-dir = "."

[tool.scikit-build.ninja]
version = ">=1.11"
make-fallback = true

# Version is computed dynamically by setuptools_scm via scikit-build-core.
[tool.scikit-build.metadata.version]
provider = "scikit_build_core.metadata.setuptools_scm"

[tool.setuptools_scm]
root = ".."

# Pure-Python package directory shipped in the wheel next to the extension.
[tool.scikit-build.wheel.packages]
"cuda/nvbench" = "cuda/nvbench"

17
python/src/README.md Normal file
View File

@@ -0,0 +1,17 @@
```
g++ py_nvbench.cpp \
-shared -fPIC \
-I ${HOME}/repos/pybind11/include \
-I ${HOME}/repos/pynvbench/nvbench_dir/include \
-I /usr/local/cuda/include \
$(python3-config --includes) \
$(python3-config --libs) \
-L ${HOME}/repos/pynvbench/nvbench_dir/lib/ \
-lnvbench \
-Wl,-rpath,${HOME}/repos/pynvbench/nvbench_dir/lib \
-L /usr/local/cuda/lib64/ \
-lcudart \
-Wl,-rpath,/usr/local/cuda/lib64 \
-o _nvbench$(python3-config --extension-suffix)
```

451
python/src/py_nvbench.cpp Normal file
View File

@@ -0,0 +1,451 @@
// TODO: Copyright header
#include <nvbench/nvbench.cuh>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
namespace py = pybind11;
namespace
{
// Portable environment-variable setter: assigns `value` to `name`,
// overwriting any existing value.
inline void set_env(const char *name, const char *value)
{
#ifdef _MSC_VER
  // The Windows CRT has no setenv(); _putenv_s always overwrites.
  _putenv_s(name, value);
#else
  constexpr int overwrite = 1;
  setenv(name, value, overwrite);
#endif
}
// Custom deleter for heap-allocated py::object handles.
// Destroying a py::object decrements a Python refcount, which requires a
// live interpreter; if the interpreter is gone or finalizing, running
// ~object would be unsafe, so the deleter intentionally leaks and lets the
// OS reclaim the memory at process exit.
struct PyObjectDeleter
{
void operator()(py::object *p)
{
const bool initialized = Py_IsInitialized();
// Py_IsFinalizing() became public API in Python 3.13; older versions
// expose the underscore-prefixed variant.
#if PY_VERSION_HEX < 0x30d0000
const bool finalizing = _Py_IsFinalizing();
#else
const bool finalizing = Py_IsFinalizing();
#endif
const bool guard = initialized && !finalizing;
// deleter only calls ~object if interpreter is active and
// not shutting down, let OS clean up resources after
// interpreter tear-down
if (guard)
{
delete p;
}
}
};
// Adapter that lets a Python callable act as an nvbench kernel generator.
// nvbench clones benchmarks via a const member (benchmark::do_clone), so the
// Python callable is held through a shared_ptr (copyable from a const
// source) with PyObjectDeleter guarding destruction at interpreter teardown.
struct benchmark_wrapper_t
{
benchmark_wrapper_t()
: m_fn() {};
explicit benchmark_wrapper_t(py::object o)
: m_fn{std::shared_ptr<py::object>(new py::object(o), PyObjectDeleter{})}
{}
benchmark_wrapper_t(const benchmark_wrapper_t &other)
: m_fn{other.m_fn}
{}
// Copy-construction only; assignment and moves are disabled.
benchmark_wrapper_t &operator=(const benchmark_wrapper_t &other) = delete;
benchmark_wrapper_t(benchmark_wrapper_t &&) noexcept = delete;
benchmark_wrapper_t &operator=(benchmark_wrapper_t &&) noexcept = delete;
// Invoked by nvbench for each benchmark configuration; forwards the state
// to the Python callable by reference (Python must not outlive `state`).
void operator()(nvbench::state &state, nvbench::type_list<>)
{
// box as Python object, using reference semantics
auto arg = py::cast(std::ref(state), py::return_value_policy::reference);
// Execute Python callable
(*m_fn)(arg);
}
private:
// Important to use shared pointer here rather than py::object directly,
// since copy constructor must be const (benchmark::do_clone is const member method)
std::shared_ptr<py::object> m_fn;
};
// Tracks registration and one-shot execution of Python benchmarks.
//
// Benchmarks themselves live in the nvbench::benchmark_manager singleton;
// this class adds the Python-facing invariants: the registered object must
// be callable, registration is forbidden after a run, and run() may be
// entered at most once.
class GlobalBenchmarkRegistry
{
  bool m_finalized; // set to true once run() has been entered

public:
  GlobalBenchmarkRegistry()
    : m_finalized(false) {};

  // Non-copyable, non-movable: one instance is shared module-wide.
  GlobalBenchmarkRegistry(const GlobalBenchmarkRegistry &) = delete;
  GlobalBenchmarkRegistry &operator=(const GlobalBenchmarkRegistry &) = delete;
  GlobalBenchmarkRegistry(GlobalBenchmarkRegistry &&) = delete;
  GlobalBenchmarkRegistry &operator=(GlobalBenchmarkRegistry &&) = delete;

  bool is_finalized() const { return m_finalized; }

  // Register the Python callable `fn` as a benchmark and return the
  // underlying nvbench benchmark so callers can attach axes.
  // Throws std::runtime_error after benchmarks were run, and
  // py::value_error if `fn` is not callable.
  nvbench::benchmark_base &add_bench(py::object fn)
  {
    if (m_finalized)
    {
      throw std::runtime_error("Can not register more benchmarks after benchmark was run");
    }
    if (!PyCallable_Check(fn.ptr()))
    {
      throw py::value_error("Benchmark should be a callable object");
    }
    // Benchmark name: prefer fn.__name__, fall back to repr(fn).
    std::string name;
    if (py::hasattr(fn, "__name__"))
    {
      py::str py_name = fn.attr("__name__");
      name = py::cast<std::string>(py_name);
    }
    else
    {
      py::str py_name = py::repr(fn);
      name = py::cast<std::string>(py_name);
    }
    benchmark_wrapper_t executor(fn);
    return nvbench::benchmark_manager::get()
      .add(std::make_unique<nvbench::benchmark<benchmark_wrapper_t>>(executor))
      .set_name(std::move(name));
  }

  // Parse `argv` (nvbench CLI options) and execute all registered
  // benchmarks.  Callable at most once.  Exceptions from the run are
  // reported via py::print rather than propagated, so a failing run does
  // not unwind through pybind11.
  void run(const std::vector<std::string> &argv)
  {
    if (nvbench::benchmark_manager::get().get_benchmarks().empty())
    {
      throw std::runtime_error("No benchmarks had been registered yet");
    }
    if (m_finalized)
    {
      throw std::runtime_error("Benchmarks were already executed");
    }
    m_finalized = true;
    try
    {
      // TODO: This line is mandatory for correctness
      // Q: Why is initializing at module init not enough?
      nvbench::benchmark_manager::get().initialize();
      {
        nvbench::option_parser parser{};
        parser.parse(argv);
        NVBENCH_MAIN_PRINT_PREAMBLE(parser);
        NVBENCH_MAIN_RUN_BENCHMARKS(parser);
        NVBENCH_MAIN_PRINT_EPILOGUE(parser);
        NVBENCH_MAIN_PRINT_RESULTS(parser);
      } /* Tear down parser before finalization */
    }
    catch (const std::exception &e)
    {
      std::stringstream ss;
      // Fixed typo in user-visible message: "benchmakrs" -> "benchmarks".
      ss << "Caught exception while running benchmarks: ";
      ss << e.what();
      ss << "\n";
      py::print(py::cast(ss.str(), py::return_value_policy::move));
    }
    catch (...)
    {
      py::print("Caught exception in nvbench_main\n");
    }
  }
};
// essentially a global variable, but allocated on the heap during module initialization
// NOTE(review): py::nodelete means the registry is never destroyed at
// interpreter shutdown — presumably to avoid static-teardown ordering
// issues; confirm that this leak is intentional.
constinit std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete> global_registry{};
} // end of anonymous namespace
// Module entry point: configures the CUDA driver, then defines the Python
// binding classes (CudaStream, Launch, Benchmark, State) and the module
// functions register()/run_all_benchmarks().
PYBIND11_MODULE(_nvbench, m)
{
// == STEP 1
// Set environment variable CUDA_MODULE_LOADING=EAGER
// See NVIDIA/NVBench#136 for CUDA_MODULE_LOADING
set_env("CUDA_MODULE_LOADING", "EAGER");
NVBENCH_DRIVER_API_CALL(cuInit(0));
nvbench::benchmark_manager::get().initialize();
// TODO: Use cuModuleGetLoadingMode(&mode) to confirm that (mode == CU_MODULE_EAGER_LOADING)
// and issue warning otherwise
// == STEP 2
// Define CudaStream class
// ATTN: nvbench::cuda_stream is move-only class
// Methods:
// Constructors, based on device, or on existing stream
// nvbench::cuda_stream::get_stream
auto py_cuda_stream_cls = py::class_<nvbench::cuda_stream>(m, "CudaStream");
// __cuda_stream__ returns (version, stream address) — NOTE(review): this
// looks like the CUDA Python stream-interchange protocol with version 0;
// confirm against the protocol specification.
py_cuda_stream_cls.def("__cuda_stream__",
[](const nvbench::cuda_stream &s) -> std::pair<std::size_t, std::size_t> {
return std::make_pair(std::size_t{0},
reinterpret_cast<std::size_t>(s.get_stream()));
});
// Raw cudaStream_t handle as an integer address (used by the test scripts
// to construct external streams, e.g. numba.cuda.external_stream).
py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t {
return reinterpret_cast<std::size_t>(s.get_stream());
});
// == STEP 3
// Define Launch class
// ATTN: nvbench::launch is move-only class
// Methods:
// nvbench::launch::get_stream -> nvbench::cuda_stream
auto py_launch_cls = py::class_<nvbench::launch>(m, "Launch");
// Returned CudaStream is a non-owning reference into the launch object.
py_launch_cls.def(
"getStream",
[](nvbench::launch &launch) { return std::ref(launch.get_stream()); },
py::return_value_policy::reference);
// == STEP 4
// Define Benchmark class
// The addXxxAxis methods return the benchmark itself to allow fluent
// chaining from Python (reference policy: Python does not own it).
auto py_benchmark_cls = py::class_<nvbench::benchmark_base>(m, "Benchmark");
py_benchmark_cls.def("getName", &nvbench::benchmark_base::get_name);
py_benchmark_cls.def(
"addInt64Axis",
[](nvbench::benchmark_base &self, std::string name, const std::vector<nvbench::int64_t> &data) {
self.add_int64_axis(name, data);
return std::ref(self);
},
py::return_value_policy::reference);
py_benchmark_cls.def(
"addFloat64Axis",
[](nvbench::benchmark_base &self,
std::string name,
const std::vector<nvbench::float64_t> &data) {
self.add_float64_axis(name, data);
return std::ref(self);
},
py::return_value_policy::reference);
py_benchmark_cls.def(
"addStringAxis",
[](nvbench::benchmark_base &self, std::string name, const std::vector<std::string> &data) {
self.add_string_axis(name, data);
return std::ref(self);
},
py::return_value_policy::reference);
// == STEP 5
// Define PyState class
// ATTN: nvbench::state is move-only class
// Methods:
// nvbench::state::get_cuda_stream
// nvbench::state::get_cuda_stream_optional
// nvbench::state::set_cuda_stream
// nvbench::state::get_device
// nvbench::state::get_is_cpu_only
// nvbench::state::get_type_config_index
// nvbench::state::get_int64
// nvbench::state::get_int64_or_default
// nvbench::state::get_float64
// nvbench::state::get_float64_or_default
// nvbench::state::get_string
// nvbench::state::get_string_or_default
// nvbench::state::add_element_count
// nvbench::state::set_element_count
// nvbench::state::get_element_count
// nvbench::state::add_global_memory_reads
// nvbench::state::add_global_memory_writes
// nvbench::state::add_buffer_size
// nvbench::state::set_global_memory_rw_bytes
// nvbench::state::get_global_memory_rw_bytes
// nvbench::state::skip
// nvbench::state::is_skipped
// nvbench::state::get_skip_reason
// nvbench::state::get_min_samples
// nvbench::state::set_min_samples
// nvbench::state::get_criterion_params
// nvbench::state::get_stopping_criterion
// nvbench::state::get_run_once
// nvbench::state::set_run_once
// nvbench::state::get_disable_blocking_kernel
// nvbench::state::set_disable_blocking_kernel
// nvbench::state::set_skip_time
// nvbench::state::get_skip_time
// nvbench::state::set_timeout
// nvbench::state::get_timeout
// nvbench::state::set_throttle_threshold
// nvbench::state::get_throttle_threshold
// nvbench::state::set_throttle_recovery_delay
// nvbench::state::get_throttle_recovery_delay
// nvbench::state::get_blocking_kernel_timeout
// nvbench::state::set_blocking_kernel_timeout
// nvbench::state::get_axis_values
// nvbench::state::get_axis_values_as_string
// nvbench::state::get_benchmark
// nvbench::state::collect_l1_hit_rates
// nvbench::state::collect_l2_hit_rates
// nvbench::state::collect_stores_efficiency
// nvbench::state::collect_loads_efficiency
// nvbench::state::collect_dram_throughput
// nvbench::state::collect_cupti_metrics
// nvbench::state::is_l1_hit_rate_collected
// nvbench::state::is_l2_hit_rate_collected
// nvbench::state::is_stores_efficiency_collected
// nvbench::state::is_loads_efficiency_collected
// nvbench::state::is_dram_throughput_collected
// nvbench::state::is_cupti_required
// nvbench::state::add_summary
// nvbench::state::get_summary
// nvbench::state::get_summaries
// nvbench::state::get_short_description
// nvbench::state::exec
// NOTE:
// PyState wraps std::reference_wrapper<nvbench::state>
// NOTE(review): state_ref_t is currently unused in the bindings below.
using state_ref_t = std::reference_wrapper<nvbench::state>;
auto pystate_cls = py::class_<nvbench::state>(m, "State");
pystate_cls.def("hasDevice", [](nvbench::state &state) -> bool {
return static_cast<bool>(state.get_device());
});
pystate_cls.def("hasPrinters", [](nvbench::state &state) -> bool {
return state.get_benchmark().get_printer().has_value();
});
pystate_cls.def("getStream", &nvbench::state::get_cuda_stream);
// Same Python name bound twice -> pybind11 overloads: with and without
// the default-value argument (get_int64 vs get_int64_or_default, etc.).
pystate_cls.def("getInt64", &nvbench::state::get_int64);
pystate_cls.def("getInt64", &nvbench::state::get_int64_or_default);
pystate_cls.def("getFloat64", &nvbench::state::get_float64);
pystate_cls.def("getFloat64", &nvbench::state::get_float64_or_default);
pystate_cls.def("getString", &nvbench::state::get_string);
pystate_cls.def("getString", &nvbench::state::get_string_or_default);
pystate_cls.def("addElementCount", &nvbench::state::add_element_count);
pystate_cls.def("setElementCount", &nvbench::state::set_element_count);
pystate_cls.def("getElementCount", &nvbench::state::get_element_count);
pystate_cls.def("skip", &nvbench::state::skip);
pystate_cls.def("isSkipped", &nvbench::state::is_skipped);
pystate_cls.def("getSkipReason", &nvbench::state::get_skip_reason);
pystate_cls.def(
"addGlobalMemoryReads",
[](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
state.add_global_memory_reads(nbytes, column_name);
},
"Add size, in bytes, of global memory reads",
py::arg("nbytes"),
py::pos_only{},
py::arg("column_name") = py::str(""));
pystate_cls.def(
"addGlobalMemoryWrites",
[](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void {
state.add_global_memory_writes(nbytes, column_name);
},
"Add size, in bytes, of global memory writes",
py::arg("nbytes"),
py::pos_only{},
py::arg("column_name") = py::str(""));
pystate_cls.def(
"getBenchmark",
[](nvbench::state &state) { return std::ref(state.get_benchmark()); },
py::return_value_policy::reference);
pystate_cls.def("getThrottleThreshold", &nvbench::state::get_throttle_threshold);
pystate_cls.def("getMinSamples", &nvbench::state::get_min_samples);
pystate_cls.def("setMinSamples", &nvbench::state::set_min_samples);
pystate_cls.def("getDisableBlockingKernel", &nvbench::state::get_disable_blocking_kernel);
pystate_cls.def("setDisableBlockingKernel", &nvbench::state::set_disable_blocking_kernel);
pystate_cls.def("getRunOnce", &nvbench::state::get_run_once);
pystate_cls.def("setRunOnce", &nvbench::state::set_run_once);
pystate_cls.def("getTimeout", &nvbench::state::get_timeout);
pystate_cls.def("setTimeout", &nvbench::state::set_timeout);
pystate_cls.def("getBlockingKernel", &nvbench::state::get_blocking_kernel_timeout);
pystate_cls.def("setBlockingKernel", &nvbench::state::set_blocking_kernel_timeout);
pystate_cls.def("collectCUPTIMetrics", &nvbench::state::collect_cupti_metrics);
pystate_cls.def("isCUPTIRequired", &nvbench::state::is_cupti_required);
// exec: run the user's launcher under nvbench's measurement machinery.
// The (batched, sync) flags map onto nvbench exec tags:
//   sync & batched      -> exec_tag::sync
//   sync & !batched     -> exec_tag::sync | exec_tag::no_batch
//   !sync & batched     -> exec_tag::none
//   !sync & !batched    -> exec_tag::no_batch
pystate_cls.def(
"exec",
[](nvbench::state &state, py::object fn, bool batched, bool sync) {
auto launcher_fn = [fn](nvbench::launch &launch_descr) -> void {
fn(py::cast(std::ref(launch_descr), py::return_value_policy::reference));
};
if (sync)
{
if (batched)
{
state.exec(nvbench::exec_tag::sync, launcher_fn);
}
else
{
state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::no_batch, launcher_fn);
}
}
else
{
if (batched)
{
state.exec(nvbench::exec_tag::none, launcher_fn);
}
else
{
state.exec(nvbench::exec_tag::no_batch, launcher_fn);
}
}
},
"Executor for given callable fn(state : Launch)",
py::arg("fn"),
py::pos_only{},
py::arg("batched") = true,
py::arg("sync") = false);
// == STEP 6
// ATTN: nvbench::benchmark_manager is a singleton
global_registry =
std::unique_ptr<GlobalBenchmarkRegistry, py::nodelete>(new GlobalBenchmarkRegistry(),
py::nodelete{});
// register(fn) -> Benchmark: wraps a Python callable as an nvbench
// benchmark; the returned Benchmark reference allows attaching axes.
m.def(
"register",
[&](py::object fn) { return std::ref(global_registry->add_bench(fn)); },
py::return_value_policy::reference);
// run_all_benchmarks(argv): argv is a list of CLI-style strings passed to
// nvbench's option parser (validated to be a Python list first).
m.def(
"run_all_benchmarks",
[&](py::object argv) -> void {
if (!py::isinstance<py::list>(argv))
{
throw py::type_error("run_all_benchmarks expects a list of command-line arguments");
}
std::vector<std::string> args = py::cast<std::vector<std::string>>(argv);
global_registry->run(args);
},
"Run all benchmarks",
py::arg("argv") = py::list());
}

108
python/test/run_1.py Executable file
View File

@@ -0,0 +1,108 @@
import sys
import cuda.nvbench as nvbench
import numpy as np
from numba import cuda
@cuda.jit()
def kernel(a, b, c):
    # Elementwise add on the device: c[i] = a[i] + b[i], one thread per element.
    tid = cuda.grid(1)
    size = len(a)
    # Guard: grids are rounded up to whole blocks, so trailing threads may
    # fall past the end of the arrays.
    if tid < size:
        c[tid] = a[tid] + b[tid]
def getNumbaStream(launch):
    """Wrap the nvbench launch's CUDA stream as a Numba external stream."""
    stream_address = launch.getStream().addressof()
    return cuda.external_stream(stream_address)
def add_two(state):
    """Benchmark: c = a + a over N random doubles on the nvbench stream."""
    # state.skip("Skipping this benchmark for no reason")
    N = state.getInt64("elements")
    a = cuda.to_device(np.random.random(N))
    c = cuda.device_array_like(a)
    # Report global-memory traffic so nvbench can derive throughput columns.
    state.addGlobalMemoryReads(a.nbytes)
    state.addGlobalMemoryWrites(c.nbytes)
    nthreads = 256
    nblocks = (len(a) + nthreads - 1) // nthreads
    # First call locks, can't use async benchmarks until sync tag is supported
    kernel[nblocks, nthreads](a, a, c)
    cuda.synchronize()
    def kernel_launcher(launch):
        # Launch on the stream nvbench provides so timing is attributed correctly.
        stream = getNumbaStream(launch)
        kernel[nblocks, nthreads, stream](a, a, c)
    state.exec(kernel_launcher, batched=True, sync=True)
def add_float(state):
    """Benchmark: c = a + b over N random float32 values.

    Also reads the "v" (float64) and "name" (string) axes to exercise those
    axis types; their values do not influence the kernel.
    """
    N = state.getInt64("elements")
    v = state.getFloat64("v")
    name = state.getString("name")
    a = cuda.to_device(np.random.random(N).astype(np.float32))
    b = cuda.to_device(np.random.random(N).astype(np.float32))
    c = cuda.device_array_like(a)
    # Report global-memory traffic so nvbench can derive throughput columns.
    state.addGlobalMemoryReads(a.nbytes + b.nbytes)
    state.addGlobalMemoryWrites(c.nbytes)
    nthreads = 64
    nblocks = (len(a) + nthreads - 1) // nthreads
    def kernel_launcher(launch):
        # Reference the axis values in the closure (intentionally unused).
        _ = v
        _ = name
        stream = getNumbaStream(launch)
        kernel[nblocks, nthreads, stream](a, b, c)
    state.exec(kernel_launcher, batched=True, sync=True)
def add_three(state):
    """Benchmark: c = a + b over N random float32 values (no warm-up launch)."""
    N = state.getInt64("elements")
    a = cuda.to_device(np.random.random(N).astype(np.float32))
    b = cuda.to_device(np.random.random(N).astype(np.float32))
    c = cuda.device_array_like(a)
    # Report global-memory traffic so nvbench can derive throughput columns.
    state.addGlobalMemoryReads(a.nbytes + b.nbytes)
    state.addGlobalMemoryWrites(c.nbytes)
    nthreads = 256
    nblocks = (len(a) + nthreads - 1) // nthreads
    def kernel_launcher(launch):
        stream = getNumbaStream(launch)
        kernel[nblocks, nthreads, stream](a, b, c)
    state.exec(kernel_launcher, batched=True, sync=True)
    # Drain all outstanding device work before returning to the harness.
    cuda.synchronize()
def register_benchmarks():
    """Register the three example benchmarks and attach their parameter axes."""
    elements_large = [2**p for p in range(20, 23)]
    elements_small = [2**p for p in range(20, 22)]

    nvbench.register(add_two).addInt64Axis("elements", elements_large)

    bench = nvbench.register(add_float)
    bench.addFloat64Axis("v", [0.1, 0.3])
    bench.addStringAxis("name", ["Anne", "Lynda"])
    bench.addInt64Axis("elements", elements_large)

    nvbench.register(add_three).addInt64Axis("elements", elements_small)
if __name__ == "__main__":
    register_benchmarks()
    # Forward the full CLI (including argv[0]) to nvbench's option parser —
    # NOTE(review): confirm the parser expects the program name in argv.
    nvbench.run_all_benchmarks(sys.argv)