From de471e1d42c4ac45b0f33546a594007bd3527fb1 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 5 Dec 2025 14:39:11 -0600 Subject: [PATCH 1/9] Use pybind11==3.0.1, do not use pybind11_add_module --- python/CMakeLists.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b18f7ef..c89c085 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -23,15 +23,17 @@ CPMAddPackage( FIND_PACKAGE_ARGS CONFIG REQUIRED ) -CPMAddPackage("gh:pybind/pybind11@3.0.0") +CPMAddPackage("gh:pybind/pybind11@3.0.1") -pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp) +add_library(_nvbench MODULE src/py_nvbench.cpp) +target_include_directories(_nvbench PRIVATE ${Python_INCLUDE_DIRS}) target_link_libraries(_nvbench PUBLIC nvbench::nvbench) -target_link_libraries(_nvbench PRIVATE CUDA::cudart_static) +target_link_libraries(_nvbench PRIVATE CUDA::cudart_static pybind11::headers) set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(_nvbench PROPERTIES PREFIX "" SUFFIX "${PYTHON_MODULE_EXTENSION}") install(TARGETS _nvbench DESTINATION cuda/bench) From c286199adc77fb3b7665aa8c5d0776d29e94d634 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 5 Dec 2025 14:40:16 -0600 Subject: [PATCH 2/9] Scripts to triage 284 --- python/examples/study/with_blocking_krn.py | 76 +++++++++++++++++++ python/examples/study/without_blocking_krn.py | 76 +++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 python/examples/study/with_blocking_krn.py create mode 100644 python/examples/study/without_blocking_krn.py diff --git a/python/examples/study/with_blocking_krn.py 
b/python/examples/study/with_blocking_krn.py new file mode 100644 index 0000000..ff02bd3 --- /dev/null +++ b/python/examples/study/with_blocking_krn.py @@ -0,0 +1,76 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import cuda.bench as bench +import numpy as np +from numba import cuda + + +def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: + return cuda.external_stream(cs.addressof()) + + +def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: + @cuda.jit + def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): + tid = cuda.grid(1) + step = cuda.gridDim.x * cuda.blockDim.x + for i in range(stride * tid, stride * elements, stride * step): + for j in range(items_per_thread): + read_id = (items_per_thread * i + j) % elements + write_id = tid + j * elements + out_arr[write_id] = in_arr[read_id] + + return kernel + + +def throughput_bench(state: bench.State) -> None: + stride = state.get_int64("Stride") + ipt = state.get_int64("ItemsPerThread") + + nbytes = 128 * 1024 * 1024 + elements = nbytes // np.dtype(np.int32).itemsize + + alloc_stream = as_cuda_stream(state.get_stream()) + inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) + out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) + + state.add_element_count(elements, column_name="Elements") + 
state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize") + state.add_global_memory_writes(inp_arr.nbytes) + + threads_per_block = 256 + blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block + + krn = make_throughput_kernel(ipt) + + def launcher(launch: bench.Launch): + exec_stream = as_cuda_stream(launch.get_stream()) + krn[blocks_in_grid, threads_per_block, exec_stream, 0]( + stride, elements, inp_arr, out_arr + ) + + state.exec(launcher) + + +if __name__ == "__main__": + b = bench.register(throughput_bench) + b.add_int64_axis("Stride", [1, 2, 4]) + b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) + + bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/study/without_blocking_krn.py b/python/examples/study/without_blocking_krn.py new file mode 100644 index 0000000..1a22256 --- /dev/null +++ b/python/examples/study/without_blocking_krn.py @@ -0,0 +1,76 @@ +# Copyright 2025 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 with the LLVM exception +# (the "License"); you may not use this file except in compliance with +# the License. +# +# You may obtain a copy of the License at +# +# http://llvm.org/foundation/relicensing/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys + +import cuda.bench as bench +import numpy as np +from numba import cuda + + +def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: + return cuda.external_stream(cs.addressof()) + + +def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: + @cuda.jit + def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): + tid = cuda.grid(1) + step = cuda.gridDim.x * cuda.blockDim.x + for i in range(stride * tid, stride * elements, stride * step): + for j in range(items_per_thread): + read_id = (items_per_thread * i + j) % elements + write_id = tid + j * elements + out_arr[write_id] = in_arr[read_id] + + return kernel + + +def throughput_bench(state: bench.State) -> None: + stride = state.get_int64("Stride") + ipt = state.get_int64("ItemsPerThread") + + nbytes = 128 * 1024 * 1024 + elements = nbytes // np.dtype(np.int32).itemsize + + alloc_stream = as_cuda_stream(state.get_stream()) + inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) + out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) + + state.add_element_count(elements, column_name="Elements") + state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize") + state.add_global_memory_writes(inp_arr.nbytes) + + threads_per_block = 256 + blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block + + krn = make_throughput_kernel(ipt) + + def launcher(launch: bench.Launch): + exec_stream = as_cuda_stream(launch.get_stream()) + krn[blocks_in_grid, threads_per_block, exec_stream, 0]( + stride, elements, inp_arr, out_arr + ) + + state.exec(launcher, sync=True) + + +if __name__ == "__main__": + b = bench.register(throughput_bench) + b.add_int64_axis("Stride", [1, 2, 4]) + b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) + + bench.run_all_benchmarks(sys.argv) From e57f1ecf4cceb3ee893a820615fb10996d2f0f5f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk 
<21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 5 Dec 2025 19:32:55 -0600 Subject: [PATCH 3/9] Introduce nvbench::stop_runner_loop exception. If application throws it, runner loop is stopped and other pending benchmark instances are skipped --- nvbench/runner.cuh | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh index 2c4176f..c3cc283 100644 --- a/nvbench/runner.cuh +++ b/nvbench/runner.cuh @@ -27,6 +27,13 @@ namespace nvbench { +struct stop_runner_loop : std::runtime_error +{ + // ask compiler to generate all constructor signatures + // that are defined for the base class + using std::runtime_error::runtime_error; +}; + // Non-templated code goes here to reduce instantiation costs: struct runner_base { @@ -88,7 +95,8 @@ private: [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device]( auto type_config_wrapper) { // Get current type_config: - using type_config = typename decltype(type_config_wrapper)::type; + using type_config = typename decltype(type_config_wrapper)::type; + bool skip_remaining = false; // Find states with the current device / type_config for (nvbench::state &cur_state : states) @@ -99,13 +107,21 @@ private: self.run_state_prologue(cur_state); try { - auto kernel_generator_copy = self.m_kernel_generator; - kernel_generator_copy(cur_state, type_config{}); - if (cur_state.is_skipped()) + if (!skip_remaining) + { + auto kernel_generator_copy = self.m_kernel_generator; + kernel_generator_copy(cur_state, type_config{}); + } + if (skip_remaining || cur_state.is_skipped()) { self.print_skip_notification(cur_state); } } + catch (nvbench::stop_runner_loop &e) + { + skip_remaining = true; + self.handle_sampling_exception(e, cur_state); + } catch (std::exception &e) { self.handle_sampling_exception(e, cur_state); From ce9a76167f18df30153b401c47dfc53e3fdb1f14 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk 
<21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 5 Dec 2025 14:38:48 -0600 Subject: [PATCH 4/9] Use nvbench::stop_runner_loop to signal stop of runner loop Add try/catch around Python calls to improve keyboard interrupt response. --- python/src/py_nvbench.cpp | 52 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 8856e8e..766f983 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -35,8 +35,8 @@ namespace py = pybind11; -namespace -{ +// namespace +//{ struct PyObjectDeleter { @@ -61,6 +61,8 @@ struct PyObjectDeleter } }; +namespace +{ struct benchmark_wrapper_t { @@ -91,7 +93,14 @@ struct benchmark_wrapper_t auto arg = py::cast(std::ref(state), py::return_value_policy::reference); // Execute Python callable - (*m_fn)(arg); + try + { + (*m_fn)(arg); + } + catch (const py::error_already_set &e) + { + throw nvbench::stop_runner_loop(e.what()); + } } private: @@ -99,6 +108,7 @@ private: // since copy constructor must be const (benchmark::do_clone is const member method) std::shared_ptr m_fn; }; +} // namespace // Use struct to ensure public inheritance struct nvbench_run_error : std::runtime_error @@ -183,17 +193,20 @@ public: } catch (py::error_already_set &e) { + std::cout << "Caught error_already_set\n"; py::raise_from(e, benchmark_exc.ptr(), "Python error raised "); throw py::error_already_set(); } catch (const std::exception &e) { const std::string &exc_message = e.what(); + std::cout << "Caught std::exception " << exc_message << std::endl; py::set_error(benchmark_exc, exc_message.c_str()); throw py::error_already_set(); } catch (...)
{ + std::cout << "Got fall-through exception\n"; py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main"); throw py::error_already_set(); } @@ -222,7 +235,7 @@ py::dict py_get_axis_values(const nvbench::state &state) // essentially a global variable, but allocated on the heap during module initialization std::unique_ptr global_registry{}; -} // end of anonymous namespace +//} // end of anonymous namespace // ========================================== // PLEASE KEEP IN SYNC WITH __init__.pyi FILE @@ -255,6 +268,7 @@ PYBIND11_MODULE(_nvbench, m) return std::make_pair(std::size_t{0}, reinterpret_cast(s.get_stream())); }); + py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t { return reinterpret_cast(s.get_stream()); }); @@ -295,6 +309,7 @@ PYBIND11_MODULE(_nvbench, m) auto py_benchmark_cls = py::class_(m, "Benchmark"); py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name); + py_benchmark_cls.def( "add_int64_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -304,6 +319,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "add_int64_power_of_two_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -315,6 +331,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "add_float64_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -324,6 +341,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "add_string_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -333,6 +351,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "set_name", [](nvbench::benchmark_base &self, std::string name) { 
@@ -341,6 +360,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("name")); + py_benchmark_cls.def( "set_is_cpu_only", [](nvbench::benchmark_base &self, bool is_cpu_only) { @@ -349,6 +369,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("is_cpu_only")); + // TODO: should this be exposed? py_benchmark_cls.def( "set_run_once", @@ -358,6 +379,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("run_once")); + py_benchmark_cls.def( "set_skip_time", [](nvbench::benchmark_base &self, nvbench::float64_t skip_duration_seconds) { @@ -366,6 +388,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("duration_seconds")); + py_benchmark_cls.def( "set_timeout", [](nvbench::benchmark_base &self, nvbench::float64_t duration_seconds) { @@ -374,6 +397,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("duration_seconds")); + py_benchmark_cls.def( "set_throttle_threshold", [](nvbench::benchmark_base &self, nvbench::float32_t threshold) { @@ -382,6 +406,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("threshold")); + py_benchmark_cls.def( "set_throttle_recovery_delay", [](nvbench::benchmark_base &self, nvbench::float32_t delay) { @@ -390,6 +415,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("delay_seconds")); + py_benchmark_cls.def( "set_stopping_criterion", [](nvbench::benchmark_base &self, std::string criterion) { @@ -398,6 +424,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("criterion")); + py_benchmark_cls.def( "set_criterion_param_int64", [](nvbench::benchmark_base &self, std::string name, nvbench::int64_t value) { @@ -407,6 +434,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("value")); + py_benchmark_cls.def( "set_criterion_param_float64", [](nvbench::benchmark_base &self, std::string 
name, nvbench::float64_t value) { @@ -416,6 +444,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("value")); + py_benchmark_cls.def( "set_criterion_param_string", [](nvbench::benchmark_base &self, std::string name, std::string value) { @@ -425,6 +454,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("value")); + py_benchmark_cls.def( "set_min_samples", [](nvbench::benchmark_base &self, nvbench::int64_t count) { @@ -508,9 +538,11 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("has_device", [](const nvbench::state &state) -> bool { return static_cast(state.get_device()); }); + pystate_cls.def("has_printers", [](const nvbench::state &state) -> bool { return state.get_benchmark().get_printer().has_value(); }); + pystate_cls.def("get_device", [](const nvbench::state &state) { auto dev = state.get_device(); if (dev.has_value()) @@ -550,6 +582,7 @@ PYBIND11_MODULE(_nvbench, m) &nvbench::state::add_element_count, py::arg("count"), py::arg("column_name") = py::str("")); + pystate_cls.def("set_element_count", &nvbench::state::set_element_count, py::arg("count")); pystate_cls.def("get_element_count", &nvbench::state::get_element_count); @@ -566,6 +599,7 @@ PYBIND11_MODULE(_nvbench, m) py::arg("nbytes"), py::pos_only{}, py::arg("column_name") = py::str("")); + pystate_cls.def( "add_global_memory_writes", [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void { @@ -575,10 +609,12 @@ PYBIND11_MODULE(_nvbench, m) py::arg("nbytes"), py::pos_only{}, py::arg("column_name") = py::str("")); + pystate_cls.def( "get_benchmark", [](const nvbench::state &state) { return std::ref(state.get_benchmark()); }, py::return_value_policy::reference); + pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold); pystate_cls.def("set_throttle_threshold", &nvbench::state::set_throttle_threshold, @@ -590,22 +626,27 @@ PYBIND11_MODULE(_nvbench, m) 
py::arg("min_samples_count")); pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel); + pystate_cls.def("set_disable_blocking_kernel", &nvbench::state::set_disable_blocking_kernel, py::arg("disable_blocking_kernel")); pystate_cls.def("get_run_once", &nvbench::state::get_run_once); + pystate_cls.def("set_run_once", &nvbench::state::set_run_once, py::arg("run_once")); pystate_cls.def("get_timeout", &nvbench::state::get_timeout); + pystate_cls.def("set_timeout", &nvbench::state::set_timeout, py::arg("duration")); pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout); + pystate_cls.def("set_blocking_kernel_timeout", &nvbench::state::set_blocking_kernel_timeout, py::arg("duration")); pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics); + pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required); pystate_cls.def( @@ -670,6 +711,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def( "add_summary", [](nvbench::state &state, std::string column_name, std::int64_t value) { @@ -680,6 +722,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def( "add_summary", [](nvbench::state &state, std::string column_name, double value) { @@ -690,6 +733,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def("get_axis_values_as_string", [](const nvbench::state &state) { return state.get_axis_values_as_string(); }); pystate_cls.def("get_axis_values", &py_get_axis_values); From b2a80c92b805c7d2c2a7ca1e4d7a3c0f1d028077 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 11:53:08 -0600 Subject: [PATCH 5/9] Revert "Scripts to triage 284" This reverts commit c286199adc77fb3b7665aa8c5d0776d29e94d634. 
--- python/examples/study/with_blocking_krn.py | 76 ------------------- python/examples/study/without_blocking_krn.py | 76 ------------------- 2 files changed, 152 deletions(-) delete mode 100644 python/examples/study/with_blocking_krn.py delete mode 100644 python/examples/study/without_blocking_krn.py diff --git a/python/examples/study/with_blocking_krn.py b/python/examples/study/with_blocking_krn.py deleted file mode 100644 index ff02bd3..0000000 --- a/python/examples/study/with_blocking_krn.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2025 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 with the LLVM exception -# (the "License"); you may not use this file except in compliance with -# the License. -# -# You may obtain a copy of the License at -# -# http://llvm.org/foundation/relicensing/LICENSE.txt -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys - -import cuda.bench as bench -import numpy as np -from numba import cuda - - -def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: - return cuda.external_stream(cs.addressof()) - - -def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: - @cuda.jit - def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): - tid = cuda.grid(1) - step = cuda.gridDim.x * cuda.blockDim.x - for i in range(stride * tid, stride * elements, stride * step): - for j in range(items_per_thread): - read_id = (items_per_thread * i + j) % elements - write_id = tid + j * elements - out_arr[write_id] = in_arr[read_id] - - return kernel - - -def throughput_bench(state: bench.State) -> None: - stride = state.get_int64("Stride") - ipt = state.get_int64("ItemsPerThread") - - nbytes = 128 * 1024 * 1024 - elements = nbytes // np.dtype(np.int32).itemsize - - alloc_stream = as_cuda_stream(state.get_stream()) - inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) - out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) - - state.add_element_count(elements, column_name="Elements") - state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize") - state.add_global_memory_writes(inp_arr.nbytes) - - threads_per_block = 256 - blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block - - krn = make_throughput_kernel(ipt) - - def launcher(launch: bench.Launch): - exec_stream = as_cuda_stream(launch.get_stream()) - krn[blocks_in_grid, threads_per_block, exec_stream, 0]( - stride, elements, inp_arr, out_arr - ) - - state.exec(launcher) - - -if __name__ == "__main__": - b = bench.register(throughput_bench) - b.add_int64_axis("Stride", [1, 2, 4]) - b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - - bench.run_all_benchmarks(sys.argv) diff --git a/python/examples/study/without_blocking_krn.py b/python/examples/study/without_blocking_krn.py deleted file mode 100644 index 
1a22256..0000000 --- a/python/examples/study/without_blocking_krn.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright 2025 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 with the LLVM exception -# (the "License"); you may not use this file except in compliance with -# the License. -# -# You may obtain a copy of the License at -# -# http://llvm.org/foundation/relicensing/LICENSE.txt -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys - -import cuda.bench as bench -import numpy as np -from numba import cuda - - -def as_cuda_stream(cs: bench.CudaStream) -> cuda.cudadrv.driver.Stream: - return cuda.external_stream(cs.addressof()) - - -def make_throughput_kernel(items_per_thread: int) -> cuda.dispatcher.CUDADispatcher: - @cuda.jit - def kernel(stride: np.uintp, elements: np.uintp, in_arr, out_arr): - tid = cuda.grid(1) - step = cuda.gridDim.x * cuda.blockDim.x - for i in range(stride * tid, stride * elements, stride * step): - for j in range(items_per_thread): - read_id = (items_per_thread * i + j) % elements - write_id = tid + j * elements - out_arr[write_id] = in_arr[read_id] - - return kernel - - -def throughput_bench(state: bench.State) -> None: - stride = state.get_int64("Stride") - ipt = state.get_int64("ItemsPerThread") - - nbytes = 128 * 1024 * 1024 - elements = nbytes // np.dtype(np.int32).itemsize - - alloc_stream = as_cuda_stream(state.get_stream()) - inp_arr = cuda.device_array(elements, dtype=np.int32, stream=alloc_stream) - out_arr = cuda.device_array(elements * ipt, dtype=np.int32, stream=alloc_stream) - - state.add_element_count(elements, column_name="Elements") - state.add_global_memory_reads(inp_arr.nbytes, column_name="Datasize") - 
state.add_global_memory_writes(inp_arr.nbytes) - - threads_per_block = 256 - blocks_in_grid = (elements + threads_per_block - 1) // threads_per_block - - krn = make_throughput_kernel(ipt) - - def launcher(launch: bench.Launch): - exec_stream = as_cuda_stream(launch.get_stream()) - krn[blocks_in_grid, threads_per_block, exec_stream, 0]( - stride, elements, inp_arr, out_arr - ) - - state.exec(launcher, sync=True) - - -if __name__ == "__main__": - b = bench.register(throughput_bench) - b.add_int64_axis("Stride", [1, 2, 4]) - b.add_int64_axis("ItemsPerThread", [1, 2, 3, 4]) - - bench.run_all_benchmarks(sys.argv) From a7763bdd7a04c9bad9172c28de34cc246ed5a146 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 12:25:31 -0600 Subject: [PATCH 6/9] Remove debug outputs --- python/src/py_nvbench.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 766f983..e1b86ea 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -193,20 +193,17 @@ public: } catch (py::error_already_set &e) { - std::cout << "Caught error_already_set\n"; py::raise_from(e, benchmark_exc.ptr(), "Python error raised "); throw py::error_already_set(); } catch (const std::exception &e) { const std::string &exc_message = e.what(); - std::cout << "Caught std::exception " << exc_message << std::endl; py::set_error(benchmark_exc, exc_message.c_str()); throw py::error_already_set(); } catch (...) 
{ - std::cout << "Got fall-through exception\n"; py::set_error(benchmark_exc, "Caught unknown exception in nvbench_main"); throw py::error_already_set(); } From 8e6154511e89031bf0c4440e57a94096e26076a3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:24:32 -0600 Subject: [PATCH 7/9] Introduce runner->run_or_skip(bool &) and benchmark->run_or_skip(bool &) These methods take reference to a boolean whose value signals whether benchmark instances pending for execution are to be skipped. void benchmark->run_or_skip(bool &) is called by Python to ensure that KeyboardInterrupt is properly handled in scripts that contain multiple benchmarks, or in case when single benchmark script is executed on a machine with more than one device. --- nvbench/benchmark.cuh | 7 +++++++ nvbench/benchmark_base.cuh | 2 ++ nvbench/runner.cuh | 19 +++++++++++-------- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/nvbench/benchmark.cuh b/nvbench/benchmark.cuh index 4456a94..963f7a3 100644 --- a/nvbench/benchmark.cuh +++ b/nvbench/benchmark.cuh @@ -81,6 +81,13 @@ private: runner.run(); } + void do_run_or_skip(bool &skip_remaining) final + { + nvbench::runner runner{*this, this->m_kernel_generator}; + runner.generate_states(); + runner.run_or_skip(skip_remaining); + } + kernel_generator m_kernel_generator; }; diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index dce0afc..3eddf2b 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -145,6 +145,7 @@ struct benchmark_base [[nodiscard]] std::vector &get_states() { return m_states; } void run() { this->do_run(); } + void run_or_skip(bool &skip_remaining) { this->do_run_or_skip(skip_remaining); } void set_printer(nvbench::printer_base &printer) { m_printer = std::ref(printer); } @@ -320,6 +321,7 @@ private: virtual std::unique_ptr do_clone() const = 0; virtual void do_set_type_axes_names(std::vector names) = 0; virtual 
void do_run() = 0; + virtual void do_run_or_skip(bool &skip_remaining) = 0; }; } // namespace nvbench diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh index c3cc283..8a78f33 100644 --- a/nvbench/runner.cuh +++ b/nvbench/runner.cuh @@ -29,8 +29,6 @@ namespace nvbench struct stop_runner_loop : std::runtime_error { - // ask compiler to generate all constructor signatures - // that are defined for the base class using std::runtime_error::runtime_error; }; @@ -67,22 +65,28 @@ struct runner : public runner_base {} void run() + { + [[maybe_unused]] bool skip_remaining = false; + run_or_skip(skip_remaining); + } + + void run_or_skip(bool &skip_remaining) { if (m_benchmark.m_devices.empty()) { - this->run_device(std::nullopt); + this->run_device(std::nullopt, skip_remaining); } else { for (const auto &device : m_benchmark.m_devices) { - this->run_device(device); + this->run_device(device, skip_remaining); } } } private: - void run_device(const std::optional &device) + void run_device(const std::optional &device, bool &skip_remaining) { if (device) { @@ -92,11 +96,10 @@ private: // Iterate through type_configs: std::size_t type_config_index = 0; nvbench::tl::foreach( - [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device]( + [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device, &skip_remaining]( auto type_config_wrapper) { // Get current type_config: - using type_config = typename decltype(type_config_wrapper)::type; + using type_config = typename decltype(type_config_wrapper)::type; // Find states with the current device / type_config for (nvbench::state &cur_state : states) From 7e9a9a8983e018890f5a16a5629eaee5b9ea261a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:28:26 -0600 Subject: [PATCH 8/9] Replace main_arg_run_benchmarks with run_interruptible This loop uses benchmark.run_or_skip to resolve #284 even for
scripts that contain more than one benchmark, or when a script with a single benchmark is executed when more than one device is available. --- python/src/py_nvbench.cpp | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index e1b86ea..159936f 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -119,6 +119,29 @@ struct nvbench_run_error : std::runtime_error }; py::handle benchmark_exc{}; +void run_interruptible(nvbench::option_parser &parser) +{ + auto &printer = parser.get_printer(); + auto &benchmarks = parser.get_benchmarks(); + + std::size_t total_states = 0; + for (auto &bench_ptr : benchmarks) + { + total_states += bench_ptr->get_config_count(); + } + + printer.set_completed_state_count(0); + printer.set_total_state_count(total_states); + + bool skip_remaining_flag = false; + for (auto &bench_ptr : benchmarks) + { + bench_ptr->set_printer(printer); + bench_ptr->run_or_skip(skip_remaining_flag); + bench_ptr->clear_printer(); + } +} + class GlobalBenchmarkRegistry { bool m_finalized; @@ -185,7 +208,7 @@ public: parser.parse(argv); NVBENCH_MAIN_PRINT_PREAMBLE(parser); - NVBENCH_MAIN_RUN_BENCHMARKS(parser); + run_interruptible(parser); NVBENCH_MAIN_PRINT_EPILOGUE(parser); NVBENCH_MAIN_PRINT_RESULTS(parser); From f6a9b245d340e96dd4657f17d99cdf15502b0ad7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Mon, 8 Dec 2025 14:46:59 -0600 Subject: [PATCH 9/9] Only trigger skipping of outstanding benchmarks on KeyboardInterrupt exception, on others benchmark is to continue execution --- python/src/py_nvbench.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 159936f..2b09574 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -99,7 +99,16 @@ struct benchmark_wrapper_t } catch (const
py::error_already_set &e) { - throw nvbench::stop_runner_loop(e.what()); + if (e.matches(PyExc_KeyboardInterrupt)) + { + // interrupt execution of outstanding instances + throw nvbench::stop_runner_loop(e.what()); + } + else + { + // re-raise + throw; + } } }