diff --git a/nvbench/benchmark.cuh b/nvbench/benchmark.cuh index 4456a94..963f7a3 100644 --- a/nvbench/benchmark.cuh +++ b/nvbench/benchmark.cuh @@ -81,6 +81,13 @@ private: runner.run(); } + void do_run_or_skip(bool &skip_remaining) final + { + nvbench::runner runner{*this, this->m_kernel_generator}; + runner.generate_states(); + runner.run_or_skip(skip_remaining); + } + kernel_generator m_kernel_generator; }; diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index dce0afc..3eddf2b 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -145,6 +145,7 @@ struct benchmark_base [[nodiscard]] std::vector &get_states() { return m_states; } void run() { this->do_run(); } + void run_or_skip(bool &skip_remaining) { this->do_run_or_skip(skip_remaining); } void set_printer(nvbench::printer_base &printer) { m_printer = std::ref(printer); } @@ -320,6 +321,7 @@ private: virtual std::unique_ptr do_clone() const = 0; virtual void do_set_type_axes_names(std::vector names) = 0; virtual void do_run() = 0; + virtual void do_run_or_skip(bool &skip_remaining) = 0; }; } // namespace nvbench diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh index 2c4176f..8a78f33 100644 --- a/nvbench/runner.cuh +++ b/nvbench/runner.cuh @@ -27,6 +27,11 @@ namespace nvbench { +struct stop_runner_loop : std::runtime_error +{ + using std::runtime_error::runtime_error; +}; + // Non-templated code goes here to reduce instantiation costs: struct runner_base { @@ -60,22 +65,28 @@ struct runner : public runner_base {} void run() + { + [[maybe_unused]] bool skip_remaining = false; + run_or_skip(skip_remaining); + } + + void run_or_skip(bool &skip_remaining) { if (m_benchmark.m_devices.empty()) { - this->run_device(std::nullopt); + this->run_device(std::nullopt, skip_remaining); } else { for (const auto &device : m_benchmark.m_devices) { - this->run_device(device); + this->run_device(device, skip_remaining); } } } private: - void run_device(const std::optional &device) + void run_device(const std::optional &device, bool &skip_remaining) { if (device) { @@ -85,7 +96,7 @@ private: // Iterate through type_configs: std::size_t type_config_index = 0; nvbench::tl::foreach( - [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device]( + [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device, &skip_remaining]( auto type_config_wrapper) { // Get current type_config: using type_config = typename decltype(type_config_wrapper)::type; @@ -99,13 +110,21 @@ private: self.run_state_prologue(cur_state); try { - auto kernel_generator_copy = self.m_kernel_generator; - kernel_generator_copy(cur_state, type_config{}); - if (cur_state.is_skipped()) + if (!skip_remaining) + { + auto kernel_generator_copy = self.m_kernel_generator; + kernel_generator_copy(cur_state, type_config{}); + } + if (skip_remaining || cur_state.is_skipped()) { self.print_skip_notification(cur_state); } } + catch (nvbench::stop_runner_loop &e) + { + skip_remaining = true; + self.handle_sampling_exception(e, cur_state); + } catch (std::exception &e) { self.handle_sampling_exception(e, cur_state); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b18f7ef..c89c085 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -23,15 +23,17 @@ CPMAddPackage( FIND_PACKAGE_ARGS CONFIG REQUIRED ) -CPMAddPackage("gh:pybind/pybind11@3.0.0") +CPMAddPackage("gh:pybind/pybind11@3.0.1") -pybind11_add_module(_nvbench MODULE src/py_nvbench.cpp) +add_library(_nvbench MODULE src/py_nvbench.cpp) +target_include_directories(_nvbench PRIVATE ${Python_INCLUDE_DIRS}) target_link_libraries(_nvbench PUBLIC nvbench::nvbench) -target_link_libraries(_nvbench PRIVATE CUDA::cudart_static) +target_link_libraries(_nvbench PRIVATE CUDA::cudart_static pybind11::headers) set_target_properties(_nvbench PROPERTIES INSTALL_RPATH "$ORIGIN") set_target_properties(_nvbench PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON) set_target_properties(_nvbench PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(_nvbench PROPERTIES PREFIX "" SUFFIX "${PYTHON_MODULE_EXTENSION}") install(TARGETS _nvbench DESTINATION cuda/bench) diff --git a/python/src/py_nvbench.cpp b/python/src/py_nvbench.cpp index 8856e8e..2b09574 100644 --- a/python/src/py_nvbench.cpp +++ b/python/src/py_nvbench.cpp @@ -35,8 +35,8 @@ namespace py = pybind11; -namespace -{ +// namespace +//{ struct PyObjectDeleter { @@ -61,6 +61,8 @@ struct PyObjectDeleter } }; +namespace +{ struct benchmark_wrapper_t { @@ -91,7 +93,23 @@ struct benchmark_wrapper_t auto arg = py::cast(std::ref(state), py::return_value_policy::reference); // Execute Python callable - (*m_fn)(arg); + try + { + (*m_fn)(arg); + } + catch (const py::error_already_set &e) + { + if (e.matches(PyExc_KeyboardInterrupt)) + { + // interrupt execution of outstanding instances + throw nvbench::stop_runner_loop(e.what()); + } + else + { + // re-raise + throw; + } + } } private: @@ -99,6 +117,7 @@ private: // since copy constructor must be const (benchmark::do_clone is const member method) std::shared_ptr m_fn; }; +} // namespace // Use struct to ensure public inheritance struct nvbench_run_error : std::runtime_error @@ -109,6 +128,29 @@ struct nvbench_run_error : std::runtime_error }; py::handle benchmark_exc{}; +void run_interruptible(nvbench::option_parser &parser) +{ + auto &printer = parser.get_printer(); + auto &benchmarks = parser.get_benchmarks(); + + std::size_t total_states = 0; + for (auto &bench_ptr : benchmarks) + { + total_states += bench_ptr->get_config_count(); + } + + printer.set_completed_state_count(0); + printer.set_total_state_count(total_states); + + bool skip_remaining_flag = false; + for (auto &bench_ptr : benchmarks) + { + bench_ptr->set_printer(printer); + bench_ptr->run_or_skip(skip_remaining_flag); + bench_ptr->clear_printer(); + } +} + class GlobalBenchmarkRegistry { bool m_finalized; @@ -175,7 +217,7 @@ public: parser.parse(argv); NVBENCH_MAIN_PRINT_PREAMBLE(parser); - NVBENCH_MAIN_RUN_BENCHMARKS(parser); + run_interruptible(parser); NVBENCH_MAIN_PRINT_EPILOGUE(parser); NVBENCH_MAIN_PRINT_RESULTS(parser); @@ -222,7 +264,7 @@ py::dict py_get_axis_values(const nvbench::state &state) // essentially a global variable, but allocated on the heap during module initialization std::unique_ptr global_registry{}; -} // end of anonymous namespace +//} // end of anonymous namespace // ========================================== // PLEASE KEEP IN SYNC WITH __init__.pyi FILE @@ -255,6 +297,7 @@ PYBIND11_MODULE(_nvbench, m) return std::make_pair(std::size_t{0}, reinterpret_cast(s.get_stream())); }); + py_cuda_stream_cls.def("addressof", [](const nvbench::cuda_stream &s) -> std::size_t { return reinterpret_cast(s.get_stream()); }); @@ -295,6 +338,7 @@ PYBIND11_MODULE(_nvbench, m) auto py_benchmark_cls = py::class_(m, "Benchmark"); py_benchmark_cls.def("get_name", &nvbench::benchmark_base::get_name); + py_benchmark_cls.def( "add_int64_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -304,6 +348,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "add_int64_power_of_two_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -315,6 +360,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "add_float64_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -324,6 +370,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "add_string_axis", [](nvbench::benchmark_base &self, std::string name, std::vector data) { @@ -333,6 +380,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("values")); + py_benchmark_cls.def( "set_name", [](nvbench::benchmark_base &self, std::string name) { @@ -341,6 +389,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("name")); + py_benchmark_cls.def( "set_is_cpu_only", [](nvbench::benchmark_base &self, bool is_cpu_only) { @@ -349,6 +398,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("is_cpu_only")); + // TODO: should this be exposed? py_benchmark_cls.def( "set_run_once", @@ -358,6 +408,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("run_once")); + py_benchmark_cls.def( "set_skip_time", [](nvbench::benchmark_base &self, nvbench::float64_t skip_duration_seconds) { @@ -366,6 +417,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("duration_seconds")); + py_benchmark_cls.def( "set_timeout", [](nvbench::benchmark_base &self, nvbench::float64_t duration_seconds) { @@ -374,6 +426,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("duration_seconds")); + py_benchmark_cls.def( "set_throttle_threshold", [](nvbench::benchmark_base &self, nvbench::float32_t threshold) { @@ -382,6 +435,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("threshold")); + py_benchmark_cls.def( "set_throttle_recovery_delay", [](nvbench::benchmark_base &self, nvbench::float32_t delay) { @@ -390,6 +444,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("delay_seconds")); + py_benchmark_cls.def( "set_stopping_criterion", [](nvbench::benchmark_base &self, std::string criterion) { @@ -398,6 +453,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::return_value_policy::reference, py::arg("criterion")); + py_benchmark_cls.def( "set_criterion_param_int64", [](nvbench::benchmark_base &self, std::string name, nvbench::int64_t value) { @@ -407,6 +463,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("value")); + py_benchmark_cls.def( "set_criterion_param_float64", [](nvbench::benchmark_base &self, std::string name, nvbench::float64_t value) { @@ -416,6 +473,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("value")); + py_benchmark_cls.def( "set_criterion_param_string", [](nvbench::benchmark_base &self, std::string name, std::string value) { @@ -425,6 +483,7 @@ PYBIND11_MODULE(_nvbench, m) py::return_value_policy::reference, py::arg("name"), py::arg("value")); + py_benchmark_cls.def( "set_min_samples", [](nvbench::benchmark_base &self, nvbench::int64_t count) { @@ -508,9 +567,11 @@ PYBIND11_MODULE(_nvbench, m) pystate_cls.def("has_device", [](const nvbench::state &state) -> bool { return static_cast(state.get_device()); }); + pystate_cls.def("has_printers", [](const nvbench::state &state) -> bool { return state.get_benchmark().get_printer().has_value(); }); + pystate_cls.def("get_device", [](const nvbench::state &state) { auto dev = state.get_device(); if (dev.has_value()) @@ -550,6 +611,7 @@ PYBIND11_MODULE(_nvbench, m) &nvbench::state::add_element_count, py::arg("count"), py::arg("column_name") = py::str("")); + pystate_cls.def("set_element_count", &nvbench::state::set_element_count, py::arg("count")); pystate_cls.def("get_element_count", &nvbench::state::get_element_count); @@ -566,6 +628,7 @@ PYBIND11_MODULE(_nvbench, m) py::arg("nbytes"), py::pos_only{}, py::arg("column_name") = py::str("")); + pystate_cls.def( "add_global_memory_writes", [](nvbench::state &state, std::size_t nbytes, const std::string &column_name) -> void { @@ -575,10 +638,12 @@ PYBIND11_MODULE(_nvbench, m) py::arg("nbytes"), py::pos_only{}, py::arg("column_name") = py::str("")); + pystate_cls.def( "get_benchmark", [](const nvbench::state &state) { return std::ref(state.get_benchmark()); }, py::return_value_policy::reference); + pystate_cls.def("get_throttle_threshold", &nvbench::state::get_throttle_threshold); pystate_cls.def("set_throttle_threshold", &nvbench::state::set_throttle_threshold, @@ -590,22 +655,27 @@ PYBIND11_MODULE(_nvbench, m) py::arg("min_samples_count")); pystate_cls.def("get_disable_blocking_kernel", &nvbench::state::get_disable_blocking_kernel); + pystate_cls.def("set_disable_blocking_kernel", &nvbench::state::set_disable_blocking_kernel, py::arg("disable_blocking_kernel")); pystate_cls.def("get_run_once", &nvbench::state::get_run_once); + pystate_cls.def("set_run_once", &nvbench::state::set_run_once, py::arg("run_once")); pystate_cls.def("get_timeout", &nvbench::state::get_timeout); + pystate_cls.def("set_timeout", &nvbench::state::set_timeout, py::arg("duration")); pystate_cls.def("get_blocking_kernel_timeout", &nvbench::state::get_blocking_kernel_timeout); + pystate_cls.def("set_blocking_kernel_timeout", &nvbench::state::set_blocking_kernel_timeout, py::arg("duration")); pystate_cls.def("collect_cupti_metrics", &nvbench::state::collect_cupti_metrics); + pystate_cls.def("is_cupti_required", &nvbench::state::is_cupti_required); pystate_cls.def( @@ -670,6 +740,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def( "add_summary", [](nvbench::state &state, std::string column_name, std::int64_t value) { @@ -680,6 +751,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def( "add_summary", [](nvbench::state &state, std::string column_name, double value) { @@ -690,6 +762,7 @@ PYBIND11_MODULE(_nvbench, m) }, py::arg("name"), py::arg("value")); + pystate_cls.def("get_axis_values_as_string", [](const nvbench::state &state) { return state.get_axis_values_as_string(); }); pystate_cls.def("get_axis_values", &py_get_axis_values);