diff --git a/docs/cli_help.md b/docs/cli_help.md index 0336c5e..5b70096 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -133,6 +133,23 @@ * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. +* `--discard-on-throttle` + * Discard measurements if the GPU is throttled. + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + +* `--throttle-threshold ` + * Set the GPU throttle threshold as percentage of the peak clock rate. + * Default is 0.75 (75%). + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + +* `--throttle-recovery-delay ` + * Set the GPU throttle recovery delay in seconds. + * Default is 0.5 seconds. + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + * `--run-once` * Only run the benchmark once, skipping any warmup runs and batched measurements. diff --git a/nvbench/CMakeLists.txt b/nvbench/CMakeLists.txt index 33569b9..89ac7ed 100644 --- a/nvbench/CMakeLists.txt +++ b/nvbench/CMakeLists.txt @@ -29,6 +29,8 @@ set(srcs detail/measure_hot.cu detail/state_generator.cxx detail/stdrel_criterion.cxx + detail/gpu_frequency.cxx + detail/timestamps_kernel.cu internal/nvml.cxx ) diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index a363a3b..3ed0ebf 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -247,12 +247,39 @@ struct benchmark_base } /// @} - [[nodiscard]] nvbench::criterion_params& get_criterion_params() { return m_criterion_params; } - [[nodiscard]] const nvbench::criterion_params& get_criterion_params() const { return m_criterion_params; } + [[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; } + + void set_throttle_threshold(nvbench::float32_t throttle_threshold) + { + m_throttle_threshold = throttle_threshold; + } + + [[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const + { + return m_throttle_recovery_delay; + } + + void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay) + { + m_throttle_recovery_delay = throttle_recovery_delay; + } + + [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; } + + void set_discard_on_throttle(bool discard_on_throttle) + { + m_discard_on_throttle = discard_on_throttle; + } + + [[nodiscard]] nvbench::criterion_params &get_criterion_params() { return m_criterion_params; } + [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const + { + return m_criterion_params; + } /// Control the stopping criterion for the measurement loop. /// @{ - [[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; } + [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; } benchmark_base &set_stopping_criterion(std::string criterion) { m_stopping_criterion = std::move(criterion); @@ -282,6 +309,10 @@ protected: nvbench::float64_t m_skip_time{-1.}; nvbench::float64_t m_timeout{15.}; + nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds] + bool m_discard_on_throttle{false}; + nvbench::criterion_params m_criterion_params; std::string m_stopping_criterion{"stdrel"}; diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx index 3cdea6b..692019b 100644 --- a/nvbench/benchmark_base.cxx +++ b/nvbench/benchmark_base.cxx @@ -17,7 +17,6 @@ */ #include - #include namespace nvbench @@ -45,7 +44,11 @@ std::unique_ptr benchmark_base::clone() const result->m_skip_time = m_skip_time; result->m_timeout = m_timeout; - result->m_criterion_params = m_criterion_params; + result->m_criterion_params = m_criterion_params; + result->m_throttle_threshold = m_throttle_threshold; + result->m_throttle_recovery_delay = m_throttle_recovery_delay; + result->m_discard_on_throttle = m_discard_on_throttle; + result->m_stopping_criterion = m_stopping_criterion; return result; diff --git a/nvbench/detail/gpu_frequency.cuh b/nvbench/detail/gpu_frequency.cuh new file mode 100644 index 0000000..320d3d2 --- /dev/null +++ b/nvbench/detail/gpu_frequency.cuh @@ -0,0 +1,53 @@ +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace nvbench::detail +{ + +struct cuda_stream; + +struct gpu_frequency +{ + gpu_frequency() = default; + + // move-only + gpu_frequency(const gpu_frequency &) = delete; + gpu_frequency(gpu_frequency &&) = default; + gpu_frequency &operator=(const gpu_frequency &) = delete; + gpu_frequency &operator=(gpu_frequency &&) = default; + + void start(const nvbench::cuda_stream &stream) { m_start.record(stream); } + + void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); } + + [[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz, + nvbench::float32_t throttle_threshold); + + [[nodiscard]] nvbench::float32_t get_clock_frequency(); + +private: + nvbench::detail::timestamps_kernel m_start; + nvbench::detail::timestamps_kernel m_stop; +}; + +} // namespace nvbench::detail diff --git a/nvbench/detail/gpu_frequency.cxx b/nvbench/detail/gpu_frequency.cxx new file mode 100644 index 0000000..8f2d19b --- /dev/null +++ b/nvbench/detail/gpu_frequency.cxx @@ -0,0 +1,47 @@ +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace nvbench::detail +{ + +nvbench::float32_t gpu_frequency::get_clock_frequency() +{ + nvbench::uint64_t elapsed_ns = m_stop.m_host_timestamps[0] - m_start.m_host_timestamps[0]; + nvbench::uint64_t elapsed_clocks = m_stop.m_host_timestamps[1] - m_start.m_host_timestamps[1]; + nvbench::float32_t clock_rate = float(elapsed_clocks) / float(elapsed_ns) * 1000000000.f; + return clock_rate; +} + +bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz, + nvbench::float32_t throttle_threshold) +{ + float threshold = peak_sm_clock_rate_hz * throttle_threshold; + + if (this->get_clock_frequency() < threshold) + { + return true; + } + + return false; +} + +} // namespace nvbench::detail diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index a137929..95c292b 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -27,6 +27,8 @@ #include #include +#include +#include #include @@ -44,6 +46,9 @@ measure_cold_base::measure_cold_base(state &exec_state) , m_min_samples{exec_state.get_min_samples()} , m_skip_time{exec_state.get_skip_time()} , m_timeout{exec_state.get_timeout()} + , m_throttle_threshold(exec_state.get_throttle_threshold()) + , m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay()) + , m_discard_on_throttle(exec_state.get_discard_on_throttle()) { if (m_min_samples > 0) { @@ -86,6 +91,41 @@ void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); } void measure_cold_base::record_measurements() { + if (!m_run_once) + { + auto peak_clock_rate = static_cast(m_state.get_device()->get_sm_default_clock_rate()); + + if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold)) + { + if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value()) + { + auto current_clock_rate = m_gpu_frequency.get_clock_frequency(); + auto &printer = printer_opt_ref.value().get(); + printer.log(nvbench::log_level::warn, + fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) " + "({:0.0f}% < {:0.0f}%) on sample {}. {} previous sample and " + "pausing for {}s.", + current_clock_rate / 1000000.0f, + peak_clock_rate / 1000000.0f, + 100.0f * (current_clock_rate / peak_clock_rate), + 100.0f * m_throttle_threshold, + m_total_samples, + m_discard_on_throttle ? "Discarding" : "Keeping", + m_throttle_recovery_delay)); + } + + if (m_throttle_recovery_delay > 0.0f) + { // let the GPU cool down + std::this_thread::sleep_for(std::chrono::duration(m_throttle_recovery_delay)); + } + + if (m_discard_on_throttle) + { // ignore this measurement + return; + } + } + } + // Update and record timers and counters: const auto cur_cuda_time = m_cuda_timer.get_duration(); const auto cur_cpu_time = m_cpu_timer.get_duration(); diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 97ce869..a27e1e3 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -64,6 +65,8 @@ protected: bool is_finished(); void run_trials_epilogue(); void generate_summaries(); + void gpu_frequency_start() { m_gpu_frequency.start(m_launch.get_stream()); } + void gpu_frequency_stop() { m_gpu_frequency.stop(m_launch.get_stream()); } void check_skip_time(nvbench::float64_t warmup_time); @@ -88,6 +91,7 @@ protected: nvbench::criterion_params m_criterion_params; nvbench::stopping_criterion_base& m_stopping_criterion; + nvbench::detail::gpu_frequency m_gpu_frequency; bool m_disable_blocking_kernel{false}; bool m_run_once{false}; @@ -97,6 +101,10 @@ protected: nvbench::float64_t m_skip_time{}; nvbench::float64_t m_timeout{}; + nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds] + bool m_discard_on_throttle{false}; + nvbench::int64_t m_total_samples{}; nvbench::float64_t m_min_cuda_time{}; @@ -128,6 +136,10 @@ struct measure_cold_base::kernel_launch_timer { m_measure.block_stream(); } + if (!m_measure.m_run_once) + { + m_measure.gpu_frequency_start(); + } m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream()); if (m_disable_blocking_kernel) { @@ -143,6 +155,10 @@ struct measure_cold_base::kernel_launch_timer m_measure.m_cpu_timer.start(); m_measure.unblock_stream(); } + if (!m_measure.m_run_once) + { + m_measure.gpu_frequency_stop(); + } m_measure.sync_stream(); m_measure.m_cpu_timer.stop(); } diff --git a/nvbench/detail/timestamps_kernel.cu b/nvbench/detail/timestamps_kernel.cu new file mode 100644 index 0000000..8fa7a64 --- /dev/null +++ b/nvbench/detail/timestamps_kernel.cu @@ -0,0 +1,82 @@ +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +namespace +{ + +__global__ void get_timestamps_kernel(nvbench::uint64_t *global_timestamp, + nvbench::uint64_t *sm0_timestamp) +{ + nvbench::uint32_t smid; + asm volatile("mov.u32 %0, %%smid;" : "=r"(smid)); + if (smid == 0) + { + nvbench::uint64_t gts, lts; + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(gts)); + lts = clock64(); + + *global_timestamp = gts; + *sm0_timestamp = lts; + } +} + +} // namespace + +namespace nvbench::detail +{ + +timestamps_kernel::timestamps_kernel() +{ + NVBENCH_CUDA_CALL( + cudaHostRegister(&m_host_timestamps, sizeof(nvbench::uint64_t) * 2, cudaHostRegisterMapped)); + NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timestamps, &m_host_timestamps, 0)); +} + +timestamps_kernel::~timestamps_kernel() +{ + NVBENCH_CUDA_CALL_NOEXCEPT(cudaHostUnregister(&m_host_timestamps)); +} + +void timestamps_kernel::record(const nvbench::cuda_stream &stream) +{ + m_host_timestamps[0] = 0; + m_host_timestamps[1] = 0; + + int device_id = 0; + int num_sms = 0; + + NVBENCH_CUDA_CALL(cudaGetDevice(&device_id)); + NVBENCH_CUDA_CALL( + cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id)); + + get_timestamps_kernel<<(num_sms), 1, 0, stream.get_stream()>>>( + m_device_timestamps, + m_device_timestamps + 1); +} + +} // namespace nvbench diff --git a/nvbench/detail/timestamps_kernel.cuh b/nvbench/detail/timestamps_kernel.cuh new file mode 100644 index 0000000..46831ca --- /dev/null +++ b/nvbench/detail/timestamps_kernel.cuh @@ -0,0 +1,50 @@ +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace nvbench +{ + +struct cuda_stream; + +namespace detail +{ + +struct timestamps_kernel +{ + timestamps_kernel(); + ~timestamps_kernel(); + + void record(const nvbench::cuda_stream &stream); + + // move-only + timestamps_kernel(const timestamps_kernel &) = delete; + timestamps_kernel(timestamps_kernel &&) = default; + timestamps_kernel &operator=(const timestamps_kernel &) = delete; + timestamps_kernel &operator=(timestamps_kernel &&) = default; + + nvbench::uint64_t m_host_timestamps[2]; + nvbench::uint64_t *m_device_timestamps{}; +}; + +} // namespace detail + +} // namespace nvbench diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh index 98184cf..4124d96 100644 --- a/nvbench/device_info.cuh +++ b/nvbench/device_info.cuh @@ -18,17 +18,18 @@ #pragma once +#include + #include #include #include -#include - #include // CHAR_BIT #include -#include #include +#include + // forward declare this for internal storage struct nvmlDevice_st; diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu index 525f275..04b50fb 100644 --- a/nvbench/option_parser.cu +++ b/nvbench/option_parser.cu @@ -16,28 +16,24 @@ * limitations under the License. */ -#include - #include #include -#include #include +#include +#include #include #include #include #include +#include #include #include #include -#include - // These are generated from the markdown docs by CMake in the build directory: #include #include -#include - #include #include #include @@ -48,10 +44,12 @@ #include #include #include -#include #include #include +#include +#include + namespace { @@ -433,6 +431,11 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->lock_gpu_clocks(first[1]); first += 2; } + else if (arg == "--discard-on-throttle") + { + this->enable_discard_on_throttle(); + first += 1; + } else if (arg == "--run-once") { this->enable_run_once(); @@ -515,7 +518,8 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->update_int64_prop(first[0], first[1]); first += 2; } - else if (arg == "--skip-time" || arg == "--timeout") + else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" || + arg == "--throttle-recovery-delay") { check_params(1); this->update_float64_prop(first[0], first[1]); @@ -623,7 +627,7 @@ void option_parser::print_version() const NVBENCH_GIT_VERSION); } -void option_parser::print_list(printer_base& printer) const +void option_parser::print_list(printer_base &printer) const { const auto &bench_mgr = nvbench::benchmark_manager::get(); printer.print_device_info(); @@ -727,6 +731,18 @@ void option_parser::enable_run_once() bench.set_run_once(true); } +void option_parser::enable_discard_on_throttle() +{ + if (m_benchmarks.empty()) + { + m_global_benchmark_args.push_back("--discard-on-throttle"); + return; + } + + benchmark_base &bench = *m_benchmarks.back(); + bench.set_discard_on_throttle(true); +} + void option_parser::set_stopping_criterion(const std::string &criterion) { // If no active benchmark, save args as global. @@ -979,10 +995,9 @@ catch (std::exception &e) e.what()); } -void option_parser::update_criterion_prop( - const std::string &prop_arg, - const std::string &prop_val, - const nvbench::named_values::type type) +void option_parser::update_criterion_prop(const std::string &prop_arg, + const std::string &prop_val, + const nvbench::named_values::type type) try { // If no active benchmark, save args as global. @@ -993,8 +1008,8 @@ try return; } - benchmark_base &bench = *m_benchmarks.back(); - nvbench::criterion_params& criterion_params = bench.get_criterion_params(); + benchmark_base &bench = *m_benchmarks.back(); + nvbench::criterion_params &criterion_params = bench.get_criterion_params(); std::string name(prop_arg.begin() + 2, prop_arg.end()); if (type == nvbench::named_values::type::float64) { @@ -1022,7 +1037,7 @@ try NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg); } } -catch (std::exception& e) +catch (std::exception &e) { NVBENCH_THROW(std::runtime_error, "Error handling option `{} {}`:\n{}", @@ -1054,6 +1069,14 @@ try { bench.set_timeout(value); } + else if (prop_arg == "--throttle-threshold") + { + bench.set_throttle_threshold(static_cast(value)); + } + else if (prop_arg == "--throttle-recovery-delay") + { + bench.set_throttle_recovery_delay(static_cast(value)); + } else { NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg); diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh index 5bd834c..c58bfc5 100644 --- a/nvbench/option_parser.cuh +++ b/nvbench/option_parser.cuh @@ -80,7 +80,7 @@ private: std::ostream &printer_spec_to_ostream(const std::string &spec); void print_version() const; - void print_list(printer_base& printer) const; + void print_list(printer_base &printer) const; void print_help() const; void print_help_axis() const; @@ -89,6 +89,7 @@ private: void set_stopping_criterion(const std::string &criterion); void enable_run_once(); + void enable_discard_on_throttle(); void disable_blocking_kernel(); void add_benchmark(const std::string &name); diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 86e6d16..8d56c0f 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -22,9 +22,9 @@ #include #include #include +#include #include #include -#include #include #include @@ -136,8 +136,11 @@ struct state /// Control the stopping criterion for the measurement loop. /// @{ - [[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; } - void set_stopping_criterion(std::string criterion) { m_stopping_criterion = std::move(criterion); } + [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; } + void set_stopping_criterion(std::string criterion) + { + m_stopping_criterion = std::move(criterion); + } /// @} /// If true, the benchmark is only run once, skipping all warmup runs and only @@ -200,6 +203,30 @@ struct state void set_timeout(nvbench::float64_t timeout) { m_timeout = timeout; } /// @} + [[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; } + + void set_throttle_threshold(nvbench::float32_t throttle_threshold) + { + m_throttle_threshold = throttle_threshold; + } + + [[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const + { + return m_throttle_recovery_delay; + } + + void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay) + { + m_throttle_recovery_delay = throttle_recovery_delay; + } + + [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; } + + void set_discard_on_throttle(bool discard_on_throttle) + { + m_discard_on_throttle = discard_on_throttle; + } + /// If a `KernelLauncher` syncs and `nvbench::exec_tag::sync` is not passed /// to `state.exec(...)`, a deadlock may occur. If a `blocking_kernel` blocks /// for more than `blocking_kernel_timeout` seconds, an error will be printed @@ -311,6 +338,10 @@ private: nvbench::float64_t m_skip_time; nvbench::float64_t m_timeout; + nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds] + bool m_discard_on_throttle{false}; + // Deadlock protection. See blocking_kernel's class doc for details. nvbench::float64_t m_blocking_kernel_timeout{30.0}; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index 8779e9f..bfdac1b 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -15,20 +15,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include #include #include #include -#include -#include - #include #include #include +#include +#include + namespace nvbench { @@ -42,6 +41,9 @@ state::state(const benchmark_base &bench) , m_min_samples{bench.get_min_samples()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} + , m_throttle_threshold{bench.get_throttle_threshold()} + , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()} + , m_discard_on_throttle{bench.get_discard_on_throttle()} {} state::state(const benchmark_base &bench, @@ -60,6 +62,9 @@ state::state(const benchmark_base &bench, , m_min_samples{bench.get_min_samples()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} + , m_throttle_threshold{bench.get_throttle_threshold()} + , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()} + , m_discard_on_throttle{bench.get_discard_on_throttle()} {} nvbench::int64_t state::get_int64(const std::string &axis_name) const