From de471e1d42c4ac45b0f33546a594007bd3527fb1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 5 Dec 2025 14:39:11 -0600 Subject: [PATCH 1/9] Use pybind11==3.0.1, do not use pybind11_add_module --- python/CMakeLists.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b18f7ef..c89c085 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -23,15 +23,17 @@ CPMAddPackage( FIND_PACKAGE_ARGS CONFIG REQUIRED ) -CPMAddPackage("gh:pybind/pybind11@3.0.0") +CPMAddPackage("gh:pybind/pybind11@3.0.1") -pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp) +add_library(_nvbench MODULE src/py_nvbench.cpp) +target_include_directories(_nvbench PRIVATE ${Python_INCLUDE_DIRS}) target_link_libraries(_nvbench PUBLIC nvbench::nvbench) -target_link_libraries(_nvbench PRIVATE CUDA::cudart_static) +target_link_libraries(_nvbench PRIVATE CUDA::cudart_static pybind11::headers) set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(_nvbench PROPERTIES PREFIX "" SUFFIX "${PYTHON_MODULE_EXTENSION}") install(TARGETS _nvbench DESTINATION cuda/bench) From c286199adc77fb3b7665aa8c5d0776d29e94d634 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 5 Dec 2025 14:40:16 -0600 Subject: [PATCH 2/9] Scripts to triage 284 --- python/examples/study/with_blocking_krn.py | 76 +++++++++++++++++++ python/examples/study/without_blocking_krn.py | 76 +++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 python/examples/study/with_blocking_krn.py create mode 100644 python/examples/study/without_blocking_krn.py diff --git a/python/examples/study/with_blocking_krn.py 
b/python/examples/study/with_blocking_krn.py new file mode 100644 index 0000000..ff02bd3 --- /dev/null +++ b/python/examples/study/with_blocking_krn.py @@ -0,0 +1,76 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import cuda.bench as bench +import numpy as np +from numba import cuda + + +def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: + return cuda.external_stream(cs.addressof()) + + +def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: + @cuda.jit + def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): + tid = cuda.grid(1) + step = cuda.gridDim.x * cuda.blockDim.x + for i in range(stride * tid, stride * elements, stride * step): + for j in range(items_per_thread): + read_id = (items_per_thread * i + j) % elements + write_id = tid + j * elements + out_arr[write_id] = in_arr[read_id] + + return kernel + + +def throughput_bench(state: bench.State) -> None: + stride = state.get_int64("Stride") + ipt = state.get_int64("ItemsPerThread") + + nbytes = 128 * 1024 * 1024 + elements = nbytes // np.dtype(np.int32).itemsize + + alloc_stream = as_cuda_stream(state.get_stream()) + inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) + out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) + + state.add_element_count(elements, column_name="Elements") + 
state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize") + state.add_global_memory_writes(inp_arr.nbytes) + + threads_per_block = 256 + blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block + + krn = make_throughput_kernel(ipt) + + def launcher(launch: bench.Launch): + exec_stream = as_cuda_stream(launch.get_stream()) + krn[blocks_in_grid, threads_per_block, exec_stream, 0]( + stride, elements, inp_arr, out_arr + ) + + state.exec(launcher) + + +if __name__ == "__main__": + b = bench.register(throughput_bench) + b.add_int64_axis("Stride", [1, 2, 4]) + b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) + + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/study/without_blocking_krn.py b/python/examples/study/without_blocking_krn.py new file mode 100644 index 0000000..1a22256 --- /dev/null +++ b/python/examples/study/without_blocking_krn.py @@ -0,0 +1,76 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys + +import cuda.bench as bench +import numpy as np +from numba import cuda + + +def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: + return cuda.external_stream(cs.addressof()) + + +def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: + @cuda.jit + def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): + tid = cuda.grid(1) + step = cuda.gridDim.x * cuda.blockDim.x + for i in range(stride * tid, stride * elements, stride * step): + for j in range(items_per_thread): + read_id = (items_per_thread * i + j) % elements + write_id = tid + j * elements + out_arr[write_id] = in_arr[read_id] + + return kernel + + +def throughput_bench(state: bench.State) -> None: + stride = state.get_int64("Stride") + ipt = state.get_int64("ItemsPerThread") + + nbytes = 128 * 1024 * 1024 + elements = nbytes // np.dtype(np.int32).itemsize + + alloc_stream = as_cuda_stream(state.get_stream()) + inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) + out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) + + state.add_element_count(elements, column_name="Elements") + state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize") + state.add_global_memory_writes(inp_arr.nbytes) + + threads_per_block = 256 + blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block + + krn = make_throughput_kernel(ipt) + + def launcher(launch: bench.Launch): + exec_stream = as_cuda_stream(launch.get_stream()) + krn[blocks_in_grid, threads_per_block, exec_stream, 0]( + stride, elements, inp_arr, out_arr + ) + + state.exec(launcher, sync=True) + + +if __name__ == "__main__": + b = bench.register(throughput_bench) + b.add_int64_axis("Stride", [1, 2, 4]) + b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) + + bench.run_all_benchmarks(sys.argv) From e57f1ecf4cceb3ee893a820615fb10996d2f0f5f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk 
<21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 5 Dec 2025 19:32:55 -0600 Subject: [PATCH 3/9] Introduce nvbench::stop_runner_loop exception. If application throws it, runner loop is stopped and other pending benchmark instances are skipped --- nvbench/runner.cuh | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh index 2c4176f..c3cc283 100644 --- a/nvbench/runner.cuh +++ b/nvbench/runner.cuh @@ -27,6 +27,13 @@ namespace nvbench { +struct stop_runner_loop : std::runtime_error +{ + // ask compiler to generate all constructor signatures + // that are defined for the base class + using std::runtime_error::runtime_error; +}; + // Non-templated code goes here to reduce instantiation costs: struct runner_base { @@ -88,7 +95,8 @@ private: [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device]( auto type_config_wrapper) { // Get current type_config: - using type_config = typename decltype(type_config_wrapper)::type; + using type_config = typename decltype(type_config_wrapper)::type; + bool skip_remaining = false; // Find states with the current device / type_config for (nvbench::state &cur_state : states) @@ -99,13 +107,21 @@ private: self.run_state_prologue(cur_state); try { - auto kernel_generator_copy = self.m_kernel_generator; - kernel_generator_copy(cur_state, type_config{}); - if (cur_state.is_skipped()) + if (!skip_remaining) + { + auto kernel_generator_copy = self.m_kernel_generator; + kernel_generator_copy(cur_state, type_config{}); + } + if (skip_remaining || cur_state.is_skipped()) { self.print_skip_notification(cur_state); } } + catch (nvbench::stop_runner_loop &e) + { + skip_remaining = true; + self.handle_sampling_exception(e, cur_state); + } catch (std::exception &e) { self.handle_sampling_exception(e, cur_state); From ce9a76167f18df30153b401c47dfc53e3fdb1f14 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk 
<21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 5 Dec 2025 14:38:48 -0600 Subject: [PATCH 4/9] Use nvbench::stop_runner_loop to signal stop of runner loop Add try/catch around Python calls to improve keyboard interrupt response. --- python/src/py_nvbench.cpp | 52 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 8856e8e..766f983 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -35,8 +35,8 @@ namespace py = pybind11; -namespace -{ +// namespace +//{ struct PyObjectDeleter { @@ -61,6 +61,8 @@ struct PyObjectDeleter } }; +namespace +{ struct benchmark_wrapper_t { @@ -91,7 +93,14 @@ struct benchmark_wrapper_t auto arg = py::cast(std::ref(state), py::return_value_policy::reference); // Execute Python callable - (*m_fn)(arg); + try + { + (*m_fn)(arg); + } + catch (const py::error_already_set &e) + { + throw nvbench::stop_runner_loop(e.what()); + } } private: @@ -99,6 +108,7 @@ private: // since copy constructor must be const (benchmark::do_clone is const member method) std::shared_ptr m_fn; }; +} // namespace // Use struct to ensure public inheritance struct nvbench_run_error : std::runtime_error @@ -183,17 +193,20 @@ public: } catch (py::error_already_set &e) { + std::cout << "Caught error_already_set\n"; py::raise_from(e, benchmark_exc.ptr(), "Python error raised "); throw py::error_already_set(); } catch (const std::exception &e) { const std::string &exc_message = e.what(); + std::cout << "Caught std::exception " << exc_message << std::endl; py::set_error(benchmark_exc, exc_message.c_str()); throw py::error_already_set(); } catch (...)
{ + std::cout << "Got fall-through exception\n"; py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main"); throw py::error_already_set(); } @@ -222,7 +235,7 @@ py::dict py_get_axis_values(const nvbench::state &state) // essentially a global variable, but allocated on the heap during module initialization std::unique_ptr global_registry{}; -} // end of anonymous namespace +//} // end of anonymous namespace // ========================================== // PLEASE KEEP IN SYNC WITH __init__.pyi FILE @@ -255,6 +268,7 @@ PYBIND11_MODULE(_nvbench, m) return std::make_pair(std::size_t{0}, reinterpret_cast(s.get_stream())); }); + py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t { return reinterpret_cast(s.get_stream()); }); @@ -295,6 +309,7 @@ PYBIND11_MODULE(_nvbench, m) auto py_benchmark_cls = py::class_(m, "Benchmark"); py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name); + py_benchmark_cls.def( "add_int64_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -304,6 +319,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "add_int64_power_of_two_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -315,6 +331,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "add_float64_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -324,6 +341,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "add_string_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -333,6 +351,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "set_name", [](nvbench::benchmark_base &self, std::string name) { 
@@ -341,6 +360,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("name")); + py_benchmark_cls.def( "set_is_cpu_only", [](nvbench::benchmark_base &self, bool is_cpu_only) { @@ -349,6 +369,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("is_cpu_only")); + // TODO: should this be exposed? py_benchmark_cls.def( "set_run_once", @@ -358,6 +379,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("run_once")); + py_benchmark_cls.def( "set_skip_time", [](nvbench::benchmark_base &self, nvbench::float64_t skip_duration_seconds) { @@ -366,6 +388,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("duration_seconds")); + py_benchmark_cls.def( "set_timeout", [](nvbench::benchmark_base &self, nvbench::float64_t duration_seconds) { @@ -374,6 +397,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("duration_seconds")); + py_benchmark_cls.def( "set_throttle_threshold", [](nvbench::benchmark_base &self, nvbench::float32_t threshold) { @@ -382,6 +406,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("threshold")); + py_benchmark_cls.def( "set_throttle_recovery_delay", [](nvbench::benchmark_base &self, nvbench::float32_t delay) { @@ -390,6 +415,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("delay_seconds")); + py_benchmark_cls.def( "set_stopping_criterion", [](nvbench::benchmark_base &self, std::string criterion) { @@ -398,6 +424,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("criterion")); + py_benchmark_cls.def( "set_criterion_param_int64", [](nvbench::benchmark_base &self, std::string name, nvbench::int64_t value) { @@ -407,6 +434,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("value")); + py_benchmark_cls.def( "set_criterion_param_float64", [](nvbench::benchmark_base &self, std::string 
name, nvbench::float64_t value) { @@ -416,6 +444,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("value")); + py_benchmark_cls.def( "set_criterion_param_string", [](nvbench::benchmark_base &self, std::string name, std::string value) { @@ -425,6 +454,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("value")); + py_benchmark_cls.def( "set_min_samples", [](nvbench::benchmark_base &self, nvbench::int64_t count) { @@ -508,9 +538,11 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("has_device", [](const nvbench::state &state) -> bool { return static_cast(state.get_device()); }); + pystate_cls.def("has_printers", [](const nvbench::state &state) -> bool { return state.get_benchmark().get_printer().has_value(); }); + pystate_cls.def("get_device", [](const nvbench::state &state) { auto dev = state.get_device(); if (dev.has_value()) @@ -550,6 +582,7 @@ PYBIND11_MODULE(_nvbench, m) &nvbench::state::add_element_count, py::arg("count"), py::arg("column_name") = py::str("")); + pystate_cls.def("set_element_count", &nvbench::state::set_element_count, py::arg("count")); pystate_cls.def("get_element_count", &nvbench::state::get_element_count); @@ -566,6 +599,7 @@ PYBIND11_MODULE(_nvbench, m) py::arg("nbytes"), py::pos_only{}, py::arg("column_name") = py::str("")); + pystate_cls.def( "add_global_memory_writes", [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void { @@ -575,10 +609,12 @@ PYBIND11_MODULE(_nvbench, m) py::arg("nbytes"), py::pos_only{}, py::arg("column_name") = py::str("")); + pystate_cls.def( "get_benchmark", [](const nvbench::state &state) { return std::ref(state.get_benchmark()); }, py::return_value_policy::reference); + pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold); pystate_cls.def("set_throttle_threshold", &nvbench::state::set_throttle_threshold, @@ -590,22 +626,27 @@ PYBIND11_MODULE(_nvbench, m) 
py::arg("min_samples_count")); pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel); + pystate_cls.def("set_disable_blocking_kernel", &nvbench::state::set_disable_blocking_kernel, py::arg("disable_blocking_kernel")); pystate_cls.def("get_run_once", &nvbench::state::get_run_once); + pystate_cls.def("set_run_once", &nvbench::state::set_run_once, py::arg("run_once")); pystate_cls.def("get_timeout", &nvbench::state::get_timeout); + pystate_cls.def("set_timeout", &nvbench::state::set_timeout, py::arg("duration")); pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout); + pystate_cls.def("set_blocking_kernel_timeout", &nvbench::state::set_blocking_kernel_timeout, py::arg("duration")); pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics); + pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required); pystate_cls.def( @@ -670,6 +711,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def( "add_summary", [](nvbench::state &state, std::string column_name, std::int64_t value) { @@ -680,6 +722,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def( "add_summary", [](nvbench::state &state, std::string column_name, double value) { @@ -690,6 +733,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def("get_axis_values_as_string", [](const nvbench::state &state) { return state.get_axis_values_as_string(); }); pystate_cls.def("get_axis_values", &py_get_axis_values); From b2a80c92b805c7d2c2a7ca1e4d7a3c0f1d028077 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 11:53:08 -0600 Subject: [PATCH 5/9] Revert "Scripts to triage 284" This reverts commit c286199adc77fb3b7665aa8c5d0776d29e94d634. 
--- python/examples/study/with_blocking_krn.py | 76 ------------------- python/examples/study/without_blocking_krn.py | 76 ------------------- 2 files changed, 152 deletions(-) delete mode 100644 python/examples/study/with_blocking_krn.py delete mode 100644 python/examples/study/without_blocking_krn.py diff --git a/python/examples/study/with_blocking_krn.py b/python/examples/study/with_blocking_krn.py deleted file mode 100644 index ff02bd3..0000000 --- a/python/examples/study/with_blocking_krn.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2025 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 with the LLVM exception -# (the "License"); you may not use this file except in compliance with -# the License. -# -# You may obtain a copy of the License at -# -# http://llvm.org/foundation/relicensing/LICENSE.txt -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys - -import cuda.bench as bench -import numpy as np -from numba import cuda - - -def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: - return cuda.external_stream(cs.addressof()) - - -def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: - @cuda.jit - def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): - tid = cuda.grid(1) - step = cuda.gridDim.x * cuda.blockDim.x - for i in range(stride * tid, stride * elements, stride * step): - for j in range(items_per_thread): - read_id = (items_per_thread * i + j) % elements - write_id = tid + j * elements - out_arr[write_id] = in_arr[read_id] - - return kernel - - -def throughput_bench(state: bench.State) -> None: - stride = state.get_int64("Stride") - ipt = state.get_int64("ItemsPerThread") - - nbytes = 128 * 1024 * 1024 - elements = nbytes // np.dtype(np.int32).itemsize - - alloc_stream = as_cuda_stream(state.get_stream()) - inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) - out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) - - state.add_element_count(elements, column_name="Elements") - state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize") - state.add_global_memory_writes(inp_arr.nbytes) - - threads_per_block = 256 - blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block - - krn = make_throughput_kernel(ipt) - - def launcher(launch: bench.Launch): - exec_stream = as_cuda_stream(launch.get_stream()) - krn[blocks_in_grid, threads_per_block, exec_stream, 0]( - stride, elements, inp_arr, out_arr - ) - - state.exec(launcher) - - -if __name__ == "__main__": - b = bench.register(throughput_bench) - b.add_int64_axis("Stride", [1, 2, 4]) - b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/study/without_blocking_krn.py b/python/examples/study/without_blocking_krn.py deleted file mode 100644 index 
1a22256..0000000 --- a/python/examples/study/without_blocking_krn.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2025 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 with the LLVM exception -# (the "License"); you may not use this file except in compliance with -# the License. -# -# You may obtain a copy of the License at -# -# http://llvm.org/foundation/relicensing/LICENSE.txt -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys - -import cuda.bench as bench -import numpy as np -from numba import cuda - - -def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: - return cuda.external_stream(cs.addressof()) - - -def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: - @cuda.jit - def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): - tid = cuda.grid(1) - step = cuda.gridDim.x * cuda.blockDim.x - for i in range(stride * tid, stride * elements, stride * step): - for j in range(items_per_thread): - read_id = (items_per_thread * i + j) % elements - write_id = tid + j * elements - out_arr[write_id] = in_arr[read_id] - - return kernel - - -def throughput_bench(state: bench.State) -> None: - stride = state.get_int64("Stride") - ipt = state.get_int64("ItemsPerThread") - - nbytes = 128 * 1024 * 1024 - elements = nbytes // np.dtype(np.int32).itemsize - - alloc_stream = as_cuda_stream(state.get_stream()) - inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) - out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) - - state.add_element_count(elements, column_name="Elements") - state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize") - 
state.add_global_memory_writes(inp_arr.nbytes) - - threads_per_block = 256 - blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block - - krn = make_throughput_kernel(ipt) - - def launcher(launch: bench.Launch): - exec_stream = as_cuda_stream(launch.get_stream()) - krn[blocks_in_grid, threads_per_block, exec_stream, 0]( - stride, elements, inp_arr, out_arr - ) - - state.exec(launcher, sync=True) - - -if __name__ == "__main__": - b = bench.register(throughput_bench) - b.add_int64_axis("Stride", [1, 2, 4]) - b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - - bench.run_all_benchmarks(sys.argv) From a7763bdd7a04c9bad9172c28de34cc246ed5a146 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 12:25:31 -0600 Subject: [PATCH 6/9] Remove debug outputs --- python/src/py_nvbench.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 766f983..e1b86ea 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -193,20 +193,17 @@ public: } catch (py::error_already_set &e) { - std::cout << "Caught error_already_set\n"; py::raise_from(e, benchmark_exc.ptr(), "Python error raised "); throw py::error_already_set(); } catch (const std::exception &e) { const std::string &exc_message = e.what(); - std::cout << "Caught std::exception " << exc_message << std::endl; py::set_error(benchmark_exc, exc_message.c_str()); throw py::error_already_set(); } catch (...) 
{ - std::cout << "Got fall-through exception\n"; py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main"); throw py::error_already_set(); } From 8e6154511e89031bf0c4440e57a94096e26076a3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:24:32 -0600 Subject: [PATCH 7/9] Introduce runner->run_or_skip(bool &) and benchmark->run_or_skip(bool &) These methods take reference to a boolean whose value signals whether benchmark instances pending for execution are to be skipped. void benchmark->run_or_skip(bool &) is called by Python to ensure that KeyboardInterrupt is properly handled in scripts that contain multiple benchmarks, or in case when single benchmark script is executed on a machine with more than one device. --- nvbench/benchmark.cuh | 7 +++++++ nvbench/benchmark_base.cuh | 2 ++ nvbench/runner.cuh | 19 +++++++++++-------- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/nvbench/benchmark.cuh b/nvbench/benchmark.cuh index 4456a94..963f7a3 100644 --- a/nvbench/benchmark.cuh +++ b/nvbench/benchmark.cuh @@ -81,6 +81,13 @@ private: runner.run(); } + void do_run_or_skip(bool &skip_remaining) final + { + nvbench::runner runner{*this, this->m_kernel_generator}; + runner.generate_states(); + runner.run_or_skip(skip_remaining); + } + kernel_generator m_kernel_generator; }; diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index dce0afc..3eddf2b 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -145,6 +145,7 @@ struct benchmark_base [[nodiscard]] std::vector &get_states() { return m_states; } void run() { this->do_run(); } + void run_or_skip(bool &skip_remaining) { this->do_run_or_skip(skip_remaining); } void set_printer(nvbench::printer_base &printer) { m_printer = std::ref(printer); } @@ -320,6 +321,7 @@ private: virtual std::unique_ptr do_clone() const = 0; virtual void do_set_type_axes_names(std::vector names) = 0; virtual 
void do_run() = 0; + virtual void do_run_or_skip(bool &skip_remaining) = 0; }; } // namespace nvbench diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh index c3cc283..8a78f33 100644 --- a/nvbench/runner.cuh +++ b/nvbench/runner.cuh @@ -29,8 +29,6 @@ namespace nvbench struct stop_runner_loop : std::runtime_error { - // ask compiler to generate all constructor signatures - // that are defined for the base class using std::runtime_error::runtime_error; }; @@ -67,22 +65,28 @@ struct runner : public runner_base {} void run() + { + [[maybe_unused]] bool skip_remaining = false; + run_or_skip(skip_remaining); + } + + void run_or_skip(bool &skip_remaining) { if (m_benchmark.m_devices.empty()) { - this->run_device(std::nullopt); + this->run_device(std::nullopt, skip_remaining); } else { for (const auto &device : m_benchmark.m_devices) { - this->run_device(device); + this->run_device(device, skip_remaining); } } } private: - void run_device(const std::optional &device) + void run_device(const std::optional &device, bool &skip_remaining) { if (device) { @@ -92,11 +96,10 @@ private: // Iterate through type_configs: std::size_t type_config_index = 0; nvbench::tl::foreach( - [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device]( + [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device, &skip_remaining]( auto type_config_wrapper) { // Get current type_config: - using type_config = typename decltype(type_config_wrapper)::type; + using type_config = typename decltype(type_config_wrapper)::type; // Find states with the current device / type_config for (nvbench::state &cur_state : states) From 7e9a9a8983e018890f5a16a5629eaee5b9ea261a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:28:26 -0600 Subject: [PATCH 8/9] Replace main_arg_run_benchmarks with run_interruptible This loop uses benchmark.run_or_skip to resolve #284 even for
scripts that contain more than one benchmark, or when a script with a single benchmark is executed when more than one device is available. --- python/src/py_nvbench.cpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index e1b86ea..159936f 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -119,6 +119,29 @@ struct nvbench_run_error : std::runtime_error }; py::handle benchmark_exc{}; +void run_interruptible(nvbench::option_parser &parser) +{ + auto &printer = parser.get_printer(); + auto &benchmarks = parser.get_benchmarks(); + + std::size_t total_states = 0; + for (auto &bench_ptr : benchmarks) + { + total_states += bench_ptr->get_config_count(); + } + + printer.set_completed_state_count(0); + printer.set_total_state_count(total_states); + + bool skip_remaining_flag = false; + for (auto &bench_ptr : benchmarks) + { + bench_ptr->set_printer(printer); + bench_ptr->run_or_skip(skip_remaining_flag); + bench_ptr->clear_printer(); + } +} + class GlobalBenchmarkRegistry { bool m_finalized; @@ -185,7 +208,7 @@ public: parser.parse(argv); NVBENCH_MAIN_PRINT_PREAMBLE(parser); - NVBENCH_MAIN_RUN_BENCHMARKS(parser); + run_interruptible(parser); NVBENCH_MAIN_PRINT_EPILOGUE(parser); NVBENCH_MAIN_PRINT_RESULTS(parser); From f6a9b245d340e96dd4657f17d99cdf15502b0ad7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:46:59 -0600 Subject: [PATCH 9/9] Only trigger skipping of outstanding benchmarks on KeyboardInterrupt exception, on others benchmark is to continue execution --- python/src/py_nvbench.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 159936f..2b09574 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -99,7 +99,16 @@ struct benchmark_wrapper_t } catch (const
py::error_already_set &e) { - throw nvbench::stop_runner_loop(e.what()); + if (e.matches(PyExc_KeyboardInterrupt)) + { + // interrupt execution of outstanding instances + throw nvbench::stop_runner_loop(e.what()); + } + else + { + // re-raise + throw; + } } }