From f29f7ac2fb062f5fcfc47a7caaf84ec59a8de203 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 7 Apr 2025 22:13:09 -0700 Subject: [PATCH 1/5] Detect throttle Signed-off-by: Georgy Evtushenko --- docs/cli_help.md | 17 ++++++ nvbench/CMakeLists.txt | 2 + nvbench/benchmark_base.cuh | 37 ++++++++++++- nvbench/benchmark_base.cxx | 7 ++- nvbench/detail/gpu_frequency.cuh | 53 ++++++++++++++++++ nvbench/detail/gpu_frequency.cxx | 47 ++++++++++++++++ nvbench/detail/measure_cold.cu | 40 ++++++++++++++ nvbench/detail/measure_cold.cuh | 16 ++++++ nvbench/detail/timestamps_kernel.cu | 82 ++++++++++++++++++++++++++++ nvbench/detail/timestamps_kernel.cuh | 50 +++++++++++++++++ nvbench/device_info.cuh | 7 ++- nvbench/option_parser.cu | 57 +++++++++++++------ nvbench/option_parser.cuh | 3 +- nvbench/state.cuh | 37 ++++++++++++- nvbench/state.cxx | 13 +++-- 15 files changed, 435 insertions(+), 33 deletions(-) create mode 100644 nvbench/detail/gpu_frequency.cuh create mode 100644 nvbench/detail/gpu_frequency.cxx create mode 100644 nvbench/detail/timestamps_kernel.cu create mode 100644 nvbench/detail/timestamps_kernel.cuh diff --git a/docs/cli_help.md b/docs/cli_help.md index 0336c5e..5b70096 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -133,6 +133,23 @@ * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. +* `--discard-on-throttle` + * Discard measurements if the GPU is throttled. + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + +* `--throttle-threshold ` + * Set the GPU throttle threshold as percentage of the peak clock rate. + * Default is 0.75 (75%). + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + +* `--throttle-recovery-delay ` + * Set the GPU throttle recovery delay in seconds. + * Default is 0.5 seconds. + * Applies to the most recent `--benchmark`, or all benchmarks if specified + before any `--benchmark` arguments. + * `--run-once` * Only run the benchmark once, skipping any warmup runs and batched measurements. diff --git a/nvbench/CMakeLists.txt b/nvbench/CMakeLists.txt index 33569b9..89ac7ed 100644 --- a/nvbench/CMakeLists.txt +++ b/nvbench/CMakeLists.txt @@ -29,6 +29,8 @@ set(srcs detail/measure_hot.cu detail/state_generator.cxx detail/stdrel_criterion.cxx + detail/gpu_frequency.cxx + detail/timestamps_kernel.cu internal/nvml.cxx ) diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index a363a3b..3ed0ebf 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -247,12 +247,39 @@ struct benchmark_base } /// @} - [[nodiscard]] nvbench::criterion_params& get_criterion_params() { return m_criterion_params; } - [[nodiscard]] const nvbench::criterion_params& get_criterion_params() const { return m_criterion_params; } + [[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; } + + void set_throttle_threshold(nvbench::float32_t throttle_threshold) + { + m_throttle_threshold = throttle_threshold; + } + + [[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const + { + return m_throttle_recovery_delay; + } + + void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay) + { + m_throttle_recovery_delay = throttle_recovery_delay; + } + + [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; } + + void set_discard_on_throttle(bool discard_on_throttle) + { + m_discard_on_throttle = discard_on_throttle; + } + + [[nodiscard]] nvbench::criterion_params &get_criterion_params() { return m_criterion_params; } + [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const + { + return m_criterion_params; + } /// Control the stopping criterion for the measurement loop. /// @{ - [[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; } + [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; } benchmark_base &set_stopping_criterion(std::string criterion) { m_stopping_criterion = std::move(criterion); @@ -282,6 +309,10 @@ protected: nvbench::float64_t m_skip_time{-1.}; nvbench::float64_t m_timeout{15.}; + nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds] + bool m_discard_on_throttle{false}; + nvbench::criterion_params m_criterion_params; std::string m_stopping_criterion{"stdrel"}; diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx index 3cdea6b..692019b 100644 --- a/nvbench/benchmark_base.cxx +++ b/nvbench/benchmark_base.cxx @@ -17,7 +17,6 @@ */ #include - #include namespace nvbench @@ -45,7 +44,11 @@ std::unique_ptr benchmark_base::clone() const result->m_skip_time = m_skip_time; result->m_timeout = m_timeout; - result->m_criterion_params = m_criterion_params; + result->m_criterion_params = m_criterion_params; + result->m_throttle_threshold = m_throttle_threshold; + result->m_throttle_recovery_delay = m_throttle_recovery_delay; + result->m_discard_on_throttle = m_discard_on_throttle; + result->m_stopping_criterion = m_stopping_criterion; return result; diff --git a/nvbench/detail/gpu_frequency.cuh b/nvbench/detail/gpu_frequency.cuh new file mode 100644 index 0000000..320d3d2 --- /dev/null +++ b/nvbench/detail/gpu_frequency.cuh @@ -0,0 +1,53 @@ +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace nvbench::detail +{ + +struct cuda_stream; + +struct gpu_frequency +{ + gpu_frequency() = default; + + // move-only + gpu_frequency(const gpu_frequency &) = delete; + gpu_frequency(gpu_frequency &&) = default; + gpu_frequency &operator=(const gpu_frequency &) = delete; + gpu_frequency &operator=(gpu_frequency &&) = default; + + void start(const nvbench::cuda_stream &stream) { m_start.record(stream); } + + void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); } + + [[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz, + nvbench::float32_t throttle_threshold); + + [[nodiscard]] nvbench::float32_t get_clock_frequency(); + +private: + nvbench::detail::timestamps_kernel m_start; + nvbench::detail::timestamps_kernel m_stop; +}; + +} // namespace nvbench::detail diff --git a/nvbench/detail/gpu_frequency.cxx b/nvbench/detail/gpu_frequency.cxx new file mode 100644 index 0000000..8f2d19b --- /dev/null +++ b/nvbench/detail/gpu_frequency.cxx @@ -0,0 +1,47 @@ +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace nvbench::detail +{ + +nvbench::float32_t gpu_frequency::get_clock_frequency() +{ + nvbench::uint64_t elapsed_ns = m_stop.m_host_timestamps[0] - m_start.m_host_timestamps[0]; + nvbench::uint64_t elapsed_clocks = m_stop.m_host_timestamps[1] - m_start.m_host_timestamps[1]; + nvbench::float32_t clock_rate = float(elapsed_clocks) / float(elapsed_ns) * 1000000000.f; + return clock_rate; +} + +bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz, + nvbench::float32_t throttle_threshold) +{ + float threshold = peak_sm_clock_rate_hz * throttle_threshold; + + if (this->get_clock_frequency() < threshold) + { + return true; + } + + return false; +} + +} // namespace nvbench::detail diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index a137929..95c292b 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -27,6 +27,8 @@ #include #include +#include +#include #include @@ -44,6 +46,9 @@ measure_cold_base::measure_cold_base(state &exec_state) , m_min_samples{exec_state.get_min_samples()} , m_skip_time{exec_state.get_skip_time()} , m_timeout{exec_state.get_timeout()} + , m_throttle_threshold(exec_state.get_throttle_threshold()) + , m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay()) + , m_discard_on_throttle(exec_state.get_discard_on_throttle()) { if (m_min_samples > 0) { @@ -86,6 +91,41 @@ void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); } void measure_cold_base::record_measurements() { + if (!m_run_once) + { + auto peak_clock_rate = static_cast(m_state.get_device()->get_sm_default_clock_rate()); + + if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold)) + { + if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value()) + { + auto current_clock_rate = m_gpu_frequency.get_clock_frequency(); + auto &printer = printer_opt_ref.value().get(); + printer.log(nvbench::log_level::warn, + fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) " + "({:0.0f}% < {:0.0f}%) on sample {}. {} previous sample and " + "pausing for {}s.", + current_clock_rate / 1000000.0f, + peak_clock_rate / 1000000.0f, + 100.0f * (current_clock_rate / peak_clock_rate), + 100.0f * m_throttle_threshold, + m_total_samples, + m_discard_on_throttle ? "Discarding" : "Keeping", + m_throttle_recovery_delay)); + } + + if (m_throttle_recovery_delay > 0.0f) + { // let the GPU cool down + std::this_thread::sleep_for(std::chrono::duration(m_throttle_recovery_delay)); + } + + if (m_discard_on_throttle) + { // ignore this measurement + return; + } + } + } + // Update and record timers and counters: const auto cur_cuda_time = m_cuda_timer.get_duration(); const auto cur_cpu_time = m_cpu_timer.get_duration(); diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 97ce869..a27e1e3 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -64,6 +65,8 @@ protected: bool is_finished(); void run_trials_epilogue(); void generate_summaries(); + void gpu_frequency_start() { m_gpu_frequency.start(m_launch.get_stream()); } + void gpu_frequency_stop() { m_gpu_frequency.stop(m_launch.get_stream()); } void check_skip_time(nvbench::float64_t warmup_time); @@ -88,6 +91,7 @@ protected: nvbench::criterion_params m_criterion_params; nvbench::stopping_criterion_base& m_stopping_criterion; + nvbench::detail::gpu_frequency m_gpu_frequency; bool m_disable_blocking_kernel{false}; bool m_run_once{false}; @@ -97,6 +101,10 @@ protected: nvbench::float64_t m_skip_time{}; nvbench::float64_t m_timeout{}; + nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds] + bool m_discard_on_throttle{false}; + nvbench::int64_t m_total_samples{}; nvbench::float64_t m_min_cuda_time{}; @@ -128,6 +136,10 @@ struct measure_cold_base::kernel_launch_timer { m_measure.block_stream(); } + if (!m_measure.m_run_once) + { + m_measure.gpu_frequency_start(); + } m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream()); if (m_disable_blocking_kernel) { @@ -143,6 +155,10 @@ struct measure_cold_base::kernel_launch_timer m_measure.m_cpu_timer.start(); m_measure.unblock_stream(); } + if (!m_measure.m_run_once) + { + m_measure.gpu_frequency_stop(); + } m_measure.sync_stream(); m_measure.m_cpu_timer.stop(); } diff --git a/nvbench/detail/timestamps_kernel.cu b/nvbench/detail/timestamps_kernel.cu new file mode 100644 index 0000000..8fa7a64 --- /dev/null +++ b/nvbench/detail/timestamps_kernel.cu @@ -0,0 +1,82 @@ +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +namespace +{ + +__global__ void get_timestamps_kernel(nvbench::uint64_t *global_timestamp, + nvbench::uint64_t *sm0_timestamp) +{ + nvbench::uint32_t smid; + asm volatile("mov.u32 %0, %%smid;" : "=r"(smid)); + if (smid == 0) + { + nvbench::uint64_t gts, lts; + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(gts)); + lts = clock64(); + + *global_timestamp = gts; + *sm0_timestamp = lts; + } +} + +} // namespace + +namespace nvbench::detail +{ + +timestamps_kernel::timestamps_kernel() +{ + NVBENCH_CUDA_CALL( + cudaHostRegister(&m_host_timestamps, sizeof(nvbench::uint64_t) * 2, cudaHostRegisterMapped)); + NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timestamps, &m_host_timestamps, 0)); +} + +timestamps_kernel::~timestamps_kernel() +{ + NVBENCH_CUDA_CALL_NOEXCEPT(cudaHostUnregister(&m_host_timestamps)); +} + +void timestamps_kernel::record(const nvbench::cuda_stream &stream) +{ + m_host_timestamps[0] = 0; + m_host_timestamps[1] = 0; + + int device_id = 0; + int num_sms = 0; + + NVBENCH_CUDA_CALL(cudaGetDevice(&device_id)); + NVBENCH_CUDA_CALL( + cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id)); + + get_timestamps_kernel<<(num_sms), 1, 0, stream.get_stream()>>>( + m_device_timestamps, + m_device_timestamps + 1); +} + +} // namespace nvbench diff --git a/nvbench/detail/timestamps_kernel.cuh b/nvbench/detail/timestamps_kernel.cuh new file mode 100644 index 0000000..46831ca --- /dev/null +++ b/nvbench/detail/timestamps_kernel.cuh @@ -0,0 +1,50 @@ +/* + * Copyright 2025 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 with the LLVM exception + * (the "License"); you may not use this file except in compliance with + * the License. + * + * You may obtain a copy of the License at + * + * http://llvm.org/foundation/relicensing/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace nvbench +{ + +struct cuda_stream; + +namespace detail +{ + +struct timestamps_kernel +{ + timestamps_kernel(); + ~timestamps_kernel(); + + void record(const nvbench::cuda_stream &stream); + + // move-only + timestamps_kernel(const timestamps_kernel &) = delete; + timestamps_kernel(timestamps_kernel &&) = default; + timestamps_kernel &operator=(const timestamps_kernel &) = delete; + timestamps_kernel &operator=(timestamps_kernel &&) = default; + + nvbench::uint64_t m_host_timestamps[2]; + nvbench::uint64_t *m_device_timestamps{}; +}; + +} // namespace detail + +} // namespace nvbench diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh index 98184cf..4124d96 100644 --- a/nvbench/device_info.cuh +++ b/nvbench/device_info.cuh @@ -18,17 +18,18 @@ #pragma once +#include + #include #include #include -#include - #include // CHAR_BIT #include -#include #include +#include + // forward declare this for internal storage struct nvmlDevice_st; diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu index 525f275..04b50fb 100644 --- a/nvbench/option_parser.cu +++ b/nvbench/option_parser.cu @@ -16,28 +16,24 @@ * limitations under the License. */ -#include - #include #include -#include #include +#include +#include #include #include #include #include +#include #include #include #include -#include - // These are generated from the markdown docs by CMake in the build directory: #include #include -#include - #include #include #include @@ -48,10 +44,12 @@ #include #include #include -#include #include #include +#include +#include + namespace { @@ -433,6 +431,11 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->lock_gpu_clocks(first[1]); first += 2; } + else if (arg == "--discard-on-throttle") + { + this->enable_discard_on_throttle(); + first += 1; + } else if (arg == "--run-once") { this->enable_run_once(); @@ -515,7 +518,8 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->update_int64_prop(first[0], first[1]); first += 2; } - else if (arg == "--skip-time" || arg == "--timeout") + else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" || + arg == "--throttle-recovery-delay") { check_params(1); this->update_float64_prop(first[0], first[1]); @@ -623,7 +627,7 @@ void option_parser::print_version() const NVBENCH_GIT_VERSION); } -void option_parser::print_list(printer_base& printer) const +void option_parser::print_list(printer_base &printer) const { const auto &bench_mgr = nvbench::benchmark_manager::get(); printer.print_device_info(); @@ -727,6 +731,18 @@ void option_parser::enable_run_once() bench.set_run_once(true); } +void option_parser::enable_discard_on_throttle() +{ + if (m_benchmarks.empty()) + { + m_global_benchmark_args.push_back("--discard-on-throttle"); + return; + } + + benchmark_base &bench = *m_benchmarks.back(); + bench.set_discard_on_throttle(true); +} + void option_parser::set_stopping_criterion(const std::string &criterion) { // If no active benchmark, save args as global. @@ -979,10 +995,9 @@ catch (std::exception &e) e.what()); } -void option_parser::update_criterion_prop( - const std::string &prop_arg, - const std::string &prop_val, - const nvbench::named_values::type type) +void option_parser::update_criterion_prop(const std::string &prop_arg, + const std::string &prop_val, + const nvbench::named_values::type type) try { // If no active benchmark, save args as global. @@ -993,8 +1008,8 @@ try return; } - benchmark_base &bench = *m_benchmarks.back(); - nvbench::criterion_params& criterion_params = bench.get_criterion_params(); + benchmark_base &bench = *m_benchmarks.back(); + nvbench::criterion_params &criterion_params = bench.get_criterion_params(); std::string name(prop_arg.begin() + 2, prop_arg.end()); if (type == nvbench::named_values::type::float64) { @@ -1022,7 +1037,7 @@ try NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg); } } -catch (std::exception& e) +catch (std::exception &e) { NVBENCH_THROW(std::runtime_error, "Error handling option `{} {}`:\n{}", @@ -1054,6 +1069,14 @@ try { bench.set_timeout(value); } + else if (prop_arg == "--throttle-threshold") + { + bench.set_throttle_threshold(static_cast(value)); + } + else if (prop_arg == "--throttle-recovery-delay") + { + bench.set_throttle_recovery_delay(static_cast(value)); + } else { NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg); diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh index 5bd834c..c58bfc5 100644 --- a/nvbench/option_parser.cuh +++ b/nvbench/option_parser.cuh @@ -80,7 +80,7 @@ private: std::ostream &printer_spec_to_ostream(const std::string &spec); void print_version() const; - void print_list(printer_base& printer) const; + void print_list(printer_base &printer) const; void print_help() const; void print_help_axis() const; @@ -89,6 +89,7 @@ private: void set_stopping_criterion(const std::string &criterion); void enable_run_once(); + void enable_discard_on_throttle(); void disable_blocking_kernel(); void add_benchmark(const std::string &name); diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 86e6d16..8d56c0f 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -22,9 +22,9 @@ #include #include #include +#include #include #include -#include #include #include @@ -136,8 +136,11 @@ struct state /// Control the stopping criterion for the measurement loop. /// @{ - [[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; } - void set_stopping_criterion(std::string criterion) { m_stopping_criterion = std::move(criterion); } + [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; } + void set_stopping_criterion(std::string criterion) + { + m_stopping_criterion = std::move(criterion); + } /// @} /// If true, the benchmark is only run once, skipping all warmup runs and only @@ -200,6 +203,30 @@ struct state void set_timeout(nvbench::float64_t timeout) { m_timeout = timeout; } /// @} + [[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; } + + void set_throttle_threshold(nvbench::float32_t throttle_threshold) + { + m_throttle_threshold = throttle_threshold; + } + + [[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const + { + return m_throttle_recovery_delay; + } + + void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay) + { + m_throttle_recovery_delay = throttle_recovery_delay; + } + + [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; } + + void set_discard_on_throttle(bool discard_on_throttle) + { + m_discard_on_throttle = discard_on_throttle; + } + /// If a `KernelLauncher` syncs and `nvbench::exec_tag::sync` is not passed /// to `state.exec(...)`, a deadlock may occur. If a `blocking_kernel` blocks /// for more than `blocking_kernel_timeout` seconds, an error will be printed @@ -311,6 +338,10 @@ private: nvbench::float64_t m_skip_time; nvbench::float64_t m_timeout; + nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds] + bool m_discard_on_throttle{false}; + // Deadlock protection. See blocking_kernel's class doc for details. nvbench::float64_t m_blocking_kernel_timeout{30.0}; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index 8779e9f..bfdac1b 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -15,20 +15,19 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include #include #include #include -#include -#include - #include #include #include +#include +#include + namespace nvbench { @@ -42,6 +41,9 @@ state::state(const benchmark_base &bench) , m_min_samples{bench.get_min_samples()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} + , m_throttle_threshold{bench.get_throttle_threshold()} + , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()} + , m_discard_on_throttle{bench.get_discard_on_throttle()} {} state::state(const benchmark_base &bench, @@ -60,6 +62,9 @@ state::state(const benchmark_base &bench, , m_min_samples{bench.get_min_samples()} , m_skip_time{bench.get_skip_time()} , m_timeout{bench.get_timeout()} + , m_throttle_threshold{bench.get_throttle_threshold()} + , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()} + , m_discard_on_throttle{bench.get_discard_on_throttle()} {} nvbench::int64_t state::get_int64(const std::string &axis_name) const From 2ba2d1131deb73cc6ffba1515fcc17243baf5b58 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Fri, 11 Apr 2025 22:33:40 +0000 Subject: [PATCH 2/5] Report mean SM clock rate --- nvbench/detail/measure_cold.cu | 14 ++++++++++++++ nvbench/detail/measure_cold.cuh | 2 ++ nvbench/markdown_printer.cu | 25 +++++++++++++++++++++++++ nvbench/markdown_printer.cuh | 1 + 4 files changed, 42 insertions(+) diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index 95c292b..d911343 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -54,6 +54,7 @@ measure_cold_base::measure_cold_base(state &exec_state) { m_cuda_times.reserve(static_cast(m_min_samples)); m_cpu_times.reserve(static_cast(m_min_samples)); + m_sm_clock_rates.reserve(static_cast(m_min_samples)); } } @@ -83,6 +84,7 @@ void measure_cold_base::initialize() m_cuda_times.clear(); m_cpu_times.clear(); + m_sm_clock_rates.clear(); m_stopping_criterion.initialize(m_criterion_params); } @@ -94,6 +96,7 @@ void measure_cold_base::record_measurements() if (!m_run_once) { auto peak_clock_rate = static_cast(m_state.get_device()->get_sm_default_clock_rate()); + m_sm_clock_rates.push_back(peak_clock_rate); if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold)) { @@ -338,6 +341,17 @@ void measure_cold_base::generate_summaries() summ.set_string("hide", "Hidden by default."); } + if (!m_sm_clock_rates.empty()) + { + auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean"); + summ.set_string("name", "Clock Rate"); + summ.set_string("hint", "frequency"); + summ.set_string("description", "Mean SM clock rate"); + summ.set_string("hide", "Hidden by default."); + summ.set_float64("value", nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(), + m_sm_clock_rates.cend())); + } + // Log if a printer exists: if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value()) { diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index a27e1e3..1c6d1e7 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -36,6 +36,7 @@ #include #include +#include "nvbench/types.cuh" namespace nvbench { @@ -117,6 +118,7 @@ protected: std::vector m_cuda_times; std::vector m_cpu_times; + std::vector m_sm_clock_rates; bool m_max_time_exceeded{}; }; diff --git a/nvbench/markdown_printer.cu b/nvbench/markdown_printer.cu index 263d230..db3d871 100644 --- a/nvbench/markdown_printer.cu +++ b/nvbench/markdown_printer.cu @@ -294,6 +294,10 @@ void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_ { table.add_cell(row, tag, header, this->do_format_item_rate(summ)); } + else if (hint == "frequency") + { + table.add_cell(row, tag, header, this->do_format_frequency(summ)); + } else if (hint == "bytes") { table.add_cell(row, tag, header, this->do_format_bytes(summ)); @@ -399,6 +403,27 @@ std::string markdown_printer::do_format_item_rate(const summary &data) } } +std::string markdown_printer::do_format_frequency(const nvbench::summary &data) +{ + const auto frequency_hz = data.get_float64("value"); + if (frequency_hz >= 1e9) + { + return fmt::format("{:0.3f} GHz", frequency_hz * 1e-9); + } + else if (frequency_hz >= 1e6) + { + return fmt::format("{:0.3f} MHz", frequency_hz * 1e-6); + } + else if (frequency_hz >= 1e3) + { + return fmt::format("{:0.3f} KHz", frequency_hz * 1e-3); + } + else + { + return fmt::format("{:0.3f} Hz", frequency_hz); + } +} + std::string markdown_printer::do_format_bytes(const summary &data) { const auto bytes = static_cast(data.get_int64("value")); diff --git a/nvbench/markdown_printer.cuh b/nvbench/markdown_printer.cuh index fa4fedc..970be6a 100644 --- a/nvbench/markdown_printer.cuh +++ b/nvbench/markdown_printer.cuh @@ -64,6 +64,7 @@ protected: virtual std::string do_format_default(const nvbench::summary &data); virtual std::string do_format_duration(const nvbench::summary &seconds); virtual std::string do_format_item_rate(const nvbench::summary &items_per_sec); + virtual std::string do_format_frequency(const nvbench::summary &frequency_hz); virtual std::string do_format_bytes(const nvbench::summary &bytes); virtual std::string do_format_byte_rate(const nvbench::summary &bytes_per_sec); virtual std::string do_format_sample_size(const nvbench::summary &count); From 5c0d67475735e8017ec7b243f2ba371285a13b18 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Fri, 11 Apr 2025 15:44:11 -0700 Subject: [PATCH 3/5] Fix overflow in default clock rate --- nvbench/device_info.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh index 4124d96..1bb5262 100644 --- a/nvbench/device_info.cuh +++ b/nvbench/device_info.cuh @@ -109,7 +109,7 @@ struct device_info /// @return The default clock rate of the SM in Hz. [[nodiscard]] std::size_t get_sm_default_clock_rate() const { // kHz -> Hz - return static_cast(m_prop.clockRate * 1000); + return static_cast(m_prop.clockRate) * 1000; } /// @return The number of physical streaming multiprocessors on this device. From b926daf09faa02cbe0ba8e77a2d001b82a439ced Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sun, 13 Apr 2025 03:53:59 +0000 Subject: [PATCH 4/5] Better throttle recovery delay --- docs/cli_help.md | 4 ++-- nvbench/benchmark_base.cuh | 4 ++-- nvbench/detail/measure_cold.cuh | 20 ++++++++++---------- nvbench/option_parser.cu | 2 +- nvbench/state.cuh | 4 ++-- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/cli_help.md b/docs/cli_help.md index 5b70096..e6d96d3 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -140,13 +140,13 @@ * `--throttle-threshold ` * Set the GPU throttle threshold as percentage of the peak clock rate. - * Default is 0.75 (75%). + * Default is 75%. * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. * `--throttle-recovery-delay ` * Set the GPU throttle recovery delay in seconds. - * Default is 0.5 seconds. + * Default is 0.05 seconds. * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index 3ed0ebf..4e658b5 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -309,8 +309,8 @@ protected: nvbench::float64_t m_skip_time{-1.}; nvbench::float64_t m_timeout{15.}; - nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] - nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds] + nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds] bool m_discard_on_throttle{false}; nvbench::criterion_params m_criterion_params; diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 1c6d1e7..7e409fd 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -18,24 +18,24 @@ #pragma once +#include + #include #include #include #include +#include +#include +#include +#include #include #include #include #include -#include -#include -#include -#include - -#include - #include #include + #include "nvbench/types.cuh" namespace nvbench @@ -91,7 +91,7 @@ protected: nvbench::blocking_kernel m_blocker; nvbench::criterion_params m_criterion_params; - nvbench::stopping_criterion_base& m_stopping_criterion; + nvbench::stopping_criterion_base &m_stopping_criterion; nvbench::detail::gpu_frequency m_gpu_frequency; bool m_disable_blocking_kernel{false}; @@ -102,8 +102,8 @@ protected: nvbench::float64_t m_skip_time{}; nvbench::float64_t m_timeout{}; - nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] - nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds] + nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_recovery_delay; // [seconds] bool m_discard_on_throttle{false}; nvbench::int64_t m_total_samples{}; diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu index 04b50fb..98a1f84 100644 --- a/nvbench/option_parser.cu +++ b/nvbench/option_parser.cu @@ -1071,7 +1071,7 @@ try } else if (prop_arg == "--throttle-threshold") { - bench.set_throttle_threshold(static_cast(value)); + bench.set_throttle_threshold(static_cast(value) / 100.0f); } else if (prop_arg == "--throttle-recovery-delay") { diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 8d56c0f..321e068 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -338,8 +338,8 @@ private: nvbench::float64_t m_skip_time; nvbench::float64_t m_timeout; - nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] - nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds] + nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate] + nvbench::float32_t m_throttle_recovery_delay; // [seconds] bool m_discard_on_throttle{false}; // Deadlock protection. See blocking_kernel's class doc for details. From 254ac2517fdce00c91918417df36ec3c9a1a4b36 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sat, 12 Apr 2025 21:13:13 -0700 Subject: [PATCH 5/5] Remove discard on throttle option --- docs/cli_help.md | 5 ----- nvbench/benchmark_base.cuh | 8 -------- nvbench/benchmark_base.cxx | 1 - nvbench/detail/measure_cold.cu | 22 ++++++++++------------ nvbench/detail/measure_cold.cuh | 1 - nvbench/option_parser.cu | 17 ----------------- nvbench/option_parser.cuh | 1 - nvbench/state.cuh | 8 -------- nvbench/state.cxx | 5 +---- 9 files changed, 11 insertions(+), 57 deletions(-) diff --git a/docs/cli_help.md b/docs/cli_help.md index e6d96d3..424c1be 100644 --- a/docs/cli_help.md +++ b/docs/cli_help.md @@ -133,11 +133,6 @@ * Applies to the most recent `--benchmark`, or all benchmarks if specified before any `--benchmark` arguments. -* `--discard-on-throttle` - * Discard measurements if the GPU is throttled. - * Applies to the most recent `--benchmark`, or all benchmarks if specified - before any `--benchmark` arguments. - * `--throttle-threshold ` * Set the GPU throttle threshold as percentage of the peak clock rate. * Default is 75%. diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh index 4e658b5..d35779c 100644 --- a/nvbench/benchmark_base.cuh +++ b/nvbench/benchmark_base.cuh @@ -264,13 +264,6 @@ struct benchmark_base m_throttle_recovery_delay = throttle_recovery_delay; } - [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; } - - void set_discard_on_throttle(bool discard_on_throttle) - { - m_discard_on_throttle = discard_on_throttle; - } - [[nodiscard]] nvbench::criterion_params &get_criterion_params() { return m_criterion_params; } [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const { @@ -311,7 +304,6 @@ protected: nvbench::float32_t m_throttle_threshold{0.75f}; // [% of peak SM clock rate] nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds] - bool m_discard_on_throttle{false}; nvbench::criterion_params m_criterion_params; std::string m_stopping_criterion{"stdrel"}; diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx index 692019b..592796d 100644 --- a/nvbench/benchmark_base.cxx +++ b/nvbench/benchmark_base.cxx @@ -47,7 +47,6 @@ std::unique_ptr benchmark_base::clone() const result->m_criterion_params = m_criterion_params; result->m_throttle_threshold = m_throttle_threshold; result->m_throttle_recovery_delay = m_throttle_recovery_delay; - result->m_discard_on_throttle = m_discard_on_throttle; result->m_stopping_criterion = m_stopping_criterion; diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu index d911343..1421241 100644 --- a/nvbench/detail/measure_cold.cu +++ b/nvbench/detail/measure_cold.cu @@ -26,8 +26,8 @@ #include #include -#include #include +#include #include #include @@ -48,7 +48,6 @@ measure_cold_base::measure_cold_base(state &exec_state) , m_timeout{exec_state.get_timeout()} , m_throttle_threshold(exec_state.get_throttle_threshold()) , m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay()) - , m_discard_on_throttle(exec_state.get_discard_on_throttle()) { if (m_min_samples > 0) { @@ -96,7 +95,6 @@ void measure_cold_base::record_measurements() if (!m_run_once) { auto peak_clock_rate = static_cast(m_state.get_device()->get_sm_default_clock_rate()); - m_sm_clock_rates.push_back(peak_clock_rate); if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold)) { @@ -106,14 +104,13 @@ void measure_cold_base::record_measurements() auto &printer = printer_opt_ref.value().get(); printer.log(nvbench::log_level::warn, fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) " - "({:0.0f}% < {:0.0f}%) on sample {}. {} previous sample and " - "pausing for {}s.", + "({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample " + "and pausing for {}s.", current_clock_rate / 1000000.0f, peak_clock_rate / 1000000.0f, 100.0f * (current_clock_rate / peak_clock_rate), 100.0f * m_throttle_threshold, m_total_samples, - m_discard_on_throttle ? "Discarding" : "Keeping", m_throttle_recovery_delay)); } @@ -122,11 +119,11 @@ void measure_cold_base::record_measurements() std::this_thread::sleep_for(std::chrono::duration(m_throttle_recovery_delay)); } - if (m_discard_on_throttle) - { // ignore this measurement - return; - } + // ignore this measurement + return; } + + m_sm_clock_rates.push_back(peak_clock_rate); } // Update and record timers and counters: @@ -348,8 +345,9 @@ void measure_cold_base::generate_summaries() summ.set_string("hint", "frequency"); summ.set_string("description", "Mean SM clock rate"); summ.set_string("hide", "Hidden by default."); - summ.set_float64("value", nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(), - m_sm_clock_rates.cend())); + summ.set_float64("value", + nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(), + m_sm_clock_rates.cend())); } // Log if a printer exists: diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index 7e409fd..80f8e5a 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -104,7 +104,6 @@ protected: nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate] nvbench::float32_t m_throttle_recovery_delay; // [seconds] - bool m_discard_on_throttle{false}; nvbench::int64_t m_total_samples{}; diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu index 98a1f84..eef46f8 100644 --- a/nvbench/option_parser.cu +++ b/nvbench/option_parser.cu @@ -431,11 +431,6 @@ void option_parser::parse_range(option_parser::arg_iterator_t first, this->lock_gpu_clocks(first[1]); first += 2; } - else if (arg == "--discard-on-throttle") - { - this->enable_discard_on_throttle(); - first += 1; - } else if (arg == "--run-once") { this->enable_run_once(); @@ -731,18 +726,6 @@ void option_parser::enable_run_once() bench.set_run_once(true); } -void option_parser::enable_discard_on_throttle() -{ - if (m_benchmarks.empty()) - { - m_global_benchmark_args.push_back("--discard-on-throttle"); - return; - } - - benchmark_base &bench = *m_benchmarks.back(); - bench.set_discard_on_throttle(true); -} - void option_parser::set_stopping_criterion(const std::string &criterion) { // If no active benchmark, save args as global. diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh index c58bfc5..bb12513 100644 --- a/nvbench/option_parser.cuh +++ b/nvbench/option_parser.cuh @@ -89,7 +89,6 @@ private: void set_stopping_criterion(const std::string &criterion); void enable_run_once(); - void enable_discard_on_throttle(); void disable_blocking_kernel(); void add_benchmark(const std::string &name); diff --git a/nvbench/state.cuh b/nvbench/state.cuh index 321e068..9b0a5c1 100644 --- a/nvbench/state.cuh +++ b/nvbench/state.cuh @@ -220,13 +220,6 @@ struct state m_throttle_recovery_delay = throttle_recovery_delay; } - [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; } - - void set_discard_on_throttle(bool discard_on_throttle) - { - m_discard_on_throttle = discard_on_throttle; - } - /// If a `KernelLauncher` syncs and `nvbench::exec_tag::sync` is not passed /// to `state.exec(...)`, a deadlock may occur. If a `blocking_kernel` blocks /// for more than `blocking_kernel_timeout` seconds, an error will be printed @@ -340,7 +333,6 @@ private: nvbench::float32_t m_throttle_threshold; // [% of peak SM clock rate] nvbench::float32_t m_throttle_recovery_delay; // [seconds] - bool m_discard_on_throttle{false}; // Deadlock protection. See blocking_kernel's class doc for details. nvbench::float64_t m_blocking_kernel_timeout{30.0}; diff --git a/nvbench/state.cxx b/nvbench/state.cxx index bfdac1b..49af11b 100644 --- a/nvbench/state.cxx +++ b/nvbench/state.cxx @@ -15,10 +15,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include - #include #include +#include #include #include @@ -43,7 +42,6 @@ state::state(const benchmark_base &bench) , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()} - , m_discard_on_throttle{bench.get_discard_on_throttle()} {} state::state(const benchmark_base &bench, @@ -64,7 +62,6 @@ state::state(const benchmark_base &bench, , m_timeout{bench.get_timeout()} , m_throttle_threshold{bench.get_throttle_threshold()} , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()} - , m_discard_on_throttle{bench.get_discard_on_throttle()} {} nvbench::int64_t state::get_int64(const std::string &axis_name) const