From f29f7ac2fb062f5fcfc47a7caaf84ec59a8de203 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Mon, 7 Apr 2025 22:13:09 -0700
Subject: [PATCH 1/5] Detect throttle

Signed-off-by: Georgy Evtushenko <evtushenko.georgy@gmail.com>
---
 docs/cli_help.md                     | 17 ++++++
 nvbench/CMakeLists.txt               |  2 +
 nvbench/benchmark_base.cuh           | 37 ++++++++++++-
 nvbench/benchmark_base.cxx           |  7 ++-
 nvbench/detail/gpu_frequency.cuh     | 53 ++++++++++++++++++
 nvbench/detail/gpu_frequency.cxx     | 47 ++++++++++++++++
 nvbench/detail/measure_cold.cu       | 40 ++++++++++++++
 nvbench/detail/measure_cold.cuh      | 16 ++++++
 nvbench/detail/timestamps_kernel.cu  | 82 ++++++++++++++++++++++++++++
 nvbench/detail/timestamps_kernel.cuh | 50 +++++++++++++++++
 nvbench/device_info.cuh              |  7 ++-
 nvbench/option_parser.cu             | 57 +++++++++++++------
 nvbench/option_parser.cuh            |  3 +-
 nvbench/state.cuh                    | 37 ++++++++++++-
 nvbench/state.cxx                    | 13 +++--
 15 files changed, 435 insertions(+), 33 deletions(-)
 create mode 100644 nvbench/detail/gpu_frequency.cuh
 create mode 100644 nvbench/detail/gpu_frequency.cxx
 create mode 100644 nvbench/detail/timestamps_kernel.cu
 create mode 100644 nvbench/detail/timestamps_kernel.cuh
diff --git a/docs/cli_help.md b/docs/cli_help.md
index 0336c5e..5b70096 100644
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -133,6 +133,23 @@
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
 
+* `--discard-on-throttle`
+  * Discard measurements if the GPU is throttled.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--throttle-threshold <value>`
+  * Set the GPU throttle threshold as percentage of the peak clock rate.
+  * Default is 0.75 (75%).
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--throttle-recovery-delay <value>`
+  * Set the GPU throttle recovery delay in seconds.
+  * Default is 0.5 seconds.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
 * `--run-once`
   * Only run the benchmark once, skipping any warmup runs and batched
     measurements.
diff --git a/nvbench/CMakeLists.txt b/nvbench/CMakeLists.txt
index 33569b9..89ac7ed 100644
--- a/nvbench/CMakeLists.txt
+++ b/nvbench/CMakeLists.txt
@@ -29,6 +29,8 @@ set(srcs
   detail/measure_hot.cu
   detail/state_generator.cxx
   detail/stdrel_criterion.cxx
+  detail/gpu_frequency.cxx
+  detail/timestamps_kernel.cu
 
   internal/nvml.cxx
 )
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index a363a3b..3ed0ebf 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -247,12 +247,39 @@ struct benchmark_base
   }
   /// @}
 
-  [[nodiscard]] nvbench::criterion_params& get_criterion_params() { return m_criterion_params; }
-  [[nodiscard]] const nvbench::criterion_params& get_criterion_params() const { return m_criterion_params; }
+  [[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; }
+
+  void set_throttle_threshold(nvbench::float32_t throttle_threshold)
+  {
+    m_throttle_threshold = throttle_threshold;
+  }
+
+  [[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const
+  {
+    return m_throttle_recovery_delay;
+  }
+
+  void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay)
+  {
+    m_throttle_recovery_delay = throttle_recovery_delay;
+  }
+
+  [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; }
+
+  void set_discard_on_throttle(bool discard_on_throttle)
+  {
+    m_discard_on_throttle = discard_on_throttle;
+  }
+
+  [[nodiscard]] nvbench::criterion_params &get_criterion_params() { return m_criterion_params; }
+  [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
+  {
+    return m_criterion_params;
+  }
 
   /// Control the stopping criterion for the measurement loop.
   /// @{
-  [[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; }
+  [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
   benchmark_base &set_stopping_criterion(std::string criterion)
   {
     m_stopping_criterion = std::move(criterion);
@@ -282,6 +309,10 @@ protected:
   nvbench::float64_t m_skip_time{-1.};
   nvbench::float64_t m_timeout{15.};
 
+  nvbench::float32_t m_throttle_threshold{0.75f};     // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
+  bool m_discard_on_throttle{false};
+
   nvbench::criterion_params m_criterion_params;
   std::string m_stopping_criterion{"stdrel"};
 
diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx
index 3cdea6b..692019b 100644
--- a/nvbench/benchmark_base.cxx
+++ b/nvbench/benchmark_base.cxx
@@ -17,7 +17,6 @@
  */
 
 #include <nvbench/benchmark_base.cuh>
-
 #include <nvbench/detail/transform_reduce.cuh>
 
 namespace nvbench
@@ -45,7 +44,11 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
   result->m_skip_time = m_skip_time;
   result->m_timeout   = m_timeout;
 
-  result->m_criterion_params   = m_criterion_params;
+  result->m_criterion_params        = m_criterion_params;
+  result->m_throttle_threshold      = m_throttle_threshold;
+  result->m_throttle_recovery_delay = m_throttle_recovery_delay;
+  result->m_discard_on_throttle     = m_discard_on_throttle;
+
   result->m_stopping_criterion = m_stopping_criterion;
 
   return result;
diff --git a/nvbench/detail/gpu_frequency.cuh b/nvbench/detail/gpu_frequency.cuh
new file mode 100644
index 0000000..320d3d2
--- /dev/null
+++ b/nvbench/detail/gpu_frequency.cuh
@@ -0,0 +1,53 @@
+/*
+ *  Copyright 2025 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <nvbench/detail/timestamps_kernel.cuh>
+#include <nvbench/types.cuh>
+
+namespace nvbench::detail
+{
+
+struct cuda_stream;
+
+struct gpu_frequency
+{
+  gpu_frequency() = default;
+
+  // move-only
+  gpu_frequency(const gpu_frequency &)            = delete;
+  gpu_frequency(gpu_frequency &&)                 = default;
+  gpu_frequency &operator=(const gpu_frequency &) = delete;
+  gpu_frequency &operator=(gpu_frequency &&)      = default;
+
+  void start(const nvbench::cuda_stream &stream) { m_start.record(stream); }
+
+  void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); }
+
+  [[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
+                                   nvbench::float32_t throttle_threshold);
+
+  [[nodiscard]] nvbench::float32_t get_clock_frequency();
+
+private:
+  nvbench::detail::timestamps_kernel m_start;
+  nvbench::detail::timestamps_kernel m_stop;
+};
+
+} // namespace nvbench::detail
diff --git a/nvbench/detail/gpu_frequency.cxx b/nvbench/detail/gpu_frequency.cxx
new file mode 100644
index 0000000..8f2d19b
--- /dev/null
+++ b/nvbench/detail/gpu_frequency.cxx
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2025 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/gpu_frequency.cuh>
+
+#include <iostream>
+
+namespace nvbench::detail
+{
+
+nvbench::float32_t gpu_frequency::get_clock_frequency()
+{
+  nvbench::uint64_t elapsed_ns     = m_stop.m_host_timestamps[0] - m_start.m_host_timestamps[0];
+  nvbench::uint64_t elapsed_clocks = m_stop.m_host_timestamps[1] - m_start.m_host_timestamps[1];
+  nvbench::float32_t clock_rate    = float(elapsed_clocks) / float(elapsed_ns) * 1000000000.f;
+  return clock_rate;
+}
+
+bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
+                                  nvbench::float32_t throttle_threshold)
+{
+  float threshold = peak_sm_clock_rate_hz * throttle_threshold;
+
+  if (this->get_clock_frequency() < threshold)
+  {
+    return true;
+  }
+
+  return false;
+}
+
+} // namespace nvbench::detail
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index a137929..95c292b 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -27,6 +27,8 @@
 
 #include <algorithm>
 #include <limits>
+#include <chrono>
+#include <thread>
 
 #include <fmt/format.h>
 
@@ -44,6 +46,9 @@ measure_cold_base::measure_cold_base(state &exec_state)
     , m_min_samples{exec_state.get_min_samples()}
     , m_skip_time{exec_state.get_skip_time()}
     , m_timeout{exec_state.get_timeout()}
+    , m_throttle_threshold(exec_state.get_throttle_threshold())
+    , m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay())
+    , m_discard_on_throttle(exec_state.get_discard_on_throttle())
 {
   if (m_min_samples > 0)
   {
@@ -86,6 +91,41 @@ void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
 
 void measure_cold_base::record_measurements()
 {
+  if (!m_run_once)
+  {
+    auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
+
+    if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
+    {
+      if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
+      {
+        auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
+        auto &printer           = printer_opt_ref.value().get();
+        printer.log(nvbench::log_level::warn,
+                    fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
+                                "({:0.0f}% < {:0.0f}%) on sample {}. {} previous sample and "
+                                "pausing for {}s.",
+                                current_clock_rate / 1000000.0f,
+                                peak_clock_rate / 1000000.0f,
+                                100.0f * (current_clock_rate / peak_clock_rate),
+                                100.0f * m_throttle_threshold,
+                                m_total_samples,
+                                m_discard_on_throttle ? "Discarding" : "Keeping",
+                                m_throttle_recovery_delay));
+      }
+
+      if (m_throttle_recovery_delay > 0.0f)
+      { // let the GPU cool down
+        std::this_thread::sleep_for(std::chrono::duration<float>(m_throttle_recovery_delay));
+      }
+
+      if (m_discard_on_throttle)
+      { // ignore this measurement
+        return;
+      }
+    }
+  }
+
   // Update and record timers and counters:
   const auto cur_cuda_time = m_cuda_timer.get_duration();
   const auto cur_cpu_time  = m_cpu_timer.get_duration();
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 97ce869..a27e1e3 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -30,6 +30,7 @@
 #include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
 #include <nvbench/detail/l2flush.cuh>
 #include <nvbench/detail/statistics.cuh>
+#include <nvbench/detail/gpu_frequency.cuh>
 
 #include <cuda_runtime.h>
 
@@ -64,6 +65,8 @@ protected:
   bool is_finished();
   void run_trials_epilogue();
   void generate_summaries();
+  void gpu_frequency_start() { m_gpu_frequency.start(m_launch.get_stream()); }
+  void gpu_frequency_stop() { m_gpu_frequency.stop(m_launch.get_stream()); }
 
   void check_skip_time(nvbench::float64_t warmup_time);
 
@@ -88,6 +91,7 @@ protected:
 
   nvbench::criterion_params m_criterion_params;
   nvbench::stopping_criterion_base& m_stopping_criterion;
+  nvbench::detail::gpu_frequency m_gpu_frequency;
 
   bool m_disable_blocking_kernel{false};
   bool m_run_once{false};
@@ -97,6 +101,10 @@ protected:
   nvbench::float64_t m_skip_time{};
   nvbench::float64_t m_timeout{};
 
+  nvbench::float32_t m_throttle_threshold{0.75f};     // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
+  bool m_discard_on_throttle{false};
+
   nvbench::int64_t m_total_samples{};
 
   nvbench::float64_t m_min_cuda_time{};
@@ -128,6 +136,10 @@ struct measure_cold_base::kernel_launch_timer
     {
       m_measure.block_stream();
     }
+    if (!m_measure.m_run_once)
+    {
+      m_measure.gpu_frequency_start();
+    }
     m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
     if (m_disable_blocking_kernel)
     {
@@ -143,6 +155,10 @@ struct measure_cold_base::kernel_launch_timer
       m_measure.m_cpu_timer.start();
       m_measure.unblock_stream();
     }
+    if (!m_measure.m_run_once)
+    {
+      m_measure.gpu_frequency_stop();
+    }
     m_measure.sync_stream();
     m_measure.m_cpu_timer.stop();
   }
diff --git a/nvbench/detail/timestamps_kernel.cu b/nvbench/detail/timestamps_kernel.cu
new file mode 100644
index 0000000..8fa7a64
--- /dev/null
+++ b/nvbench/detail/timestamps_kernel.cu
@@ -0,0 +1,82 @@
+/*
+ *  Copyright 2025 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <cuda_runtime.h>
+
+#include <nvbench/cuda_call.cuh>
+#include <nvbench/cuda_stream.cuh>
+#include <nvbench/detail/timestamps_kernel.cuh>
+#include <nvbench/types.cuh>
+
+#include <cstdio>
+#include <cstdlib>
+
+namespace
+{
+
+__global__ void get_timestamps_kernel(nvbench::uint64_t *global_timestamp,
+                                      nvbench::uint64_t *sm0_timestamp)
+{
+  nvbench::uint32_t smid;
+  asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));
+  if (smid == 0)
+  {
+    nvbench::uint64_t gts, lts;
+    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(gts));
+    lts = clock64();
+
+    *global_timestamp = gts;
+    *sm0_timestamp    = lts;
+  }
+}
+
+} // namespace
+
+namespace nvbench::detail
+{
+
+timestamps_kernel::timestamps_kernel()
+{
+  NVBENCH_CUDA_CALL(
+    cudaHostRegister(&m_host_timestamps, sizeof(nvbench::uint64_t) * 2, cudaHostRegisterMapped));
+  NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timestamps, &m_host_timestamps, 0));
+}
+
+timestamps_kernel::~timestamps_kernel()
+{
+  NVBENCH_CUDA_CALL_NOEXCEPT(cudaHostUnregister(&m_host_timestamps));
+}
+
+void timestamps_kernel::record(const nvbench::cuda_stream &stream)
+{
+  m_host_timestamps[0] = 0;
+  m_host_timestamps[1] = 0;
+
+  int device_id = 0;
+  int num_sms   = 0;
+
+  NVBENCH_CUDA_CALL(cudaGetDevice(&device_id));
+  NVBENCH_CUDA_CALL(
+    cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id));
+
+  get_timestamps_kernel<<<static_cast<unsigned int>(num_sms), 1, 0, stream.get_stream()>>>(
+    m_device_timestamps,
+    m_device_timestamps + 1);
+}
+
+} // namespace nvbench
diff --git a/nvbench/detail/timestamps_kernel.cuh b/nvbench/detail/timestamps_kernel.cuh
new file mode 100644
index 0000000..46831ca
--- /dev/null
+++ b/nvbench/detail/timestamps_kernel.cuh
@@ -0,0 +1,50 @@
+/*
+ *  Copyright 2025 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <nvbench/types.cuh>
+
+namespace nvbench
+{
+
+struct cuda_stream;
+
+namespace detail
+{
+
+struct timestamps_kernel
+{
+  timestamps_kernel();
+  ~timestamps_kernel();
+
+  void record(const nvbench::cuda_stream &stream);
+
+  // move-only
+  timestamps_kernel(const timestamps_kernel &)            = delete;
+  timestamps_kernel(timestamps_kernel &&)                 = default;
+  timestamps_kernel &operator=(const timestamps_kernel &) = delete;
+  timestamps_kernel &operator=(timestamps_kernel &&)      = default;
+
+  nvbench::uint64_t m_host_timestamps[2];
+  nvbench::uint64_t *m_device_timestamps{};
+};
+
+} // namespace detail
+
+} // namespace nvbench
diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh
index 98184cf..4124d96 100644
--- a/nvbench/device_info.cuh
+++ b/nvbench/device_info.cuh
@@ -18,17 +18,18 @@
 
 #pragma once
 
+#include <cuda_runtime_api.h>
+
 #include <nvbench/config.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/detail/device_scope.cuh>
 
-#include <cuda_runtime_api.h>
-
 #include <cstdint> // CHAR_BIT
 #include <stdexcept>
-#include <string_view>
 #include <utility>
 
+#include <string_view>
+
 // forward declare this for internal storage
 struct nvmlDevice_st;
 
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 525f275..04b50fb 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -16,28 +16,24 @@
  *  limitations under the License.
  */
 
-#include <nvbench/option_parser.cuh>
-
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/benchmark_manager.cuh>
-#include <nvbench/csv_printer.cuh>
 #include <nvbench/criterion_manager.cuh>
+#include <nvbench/csv_printer.cuh>
+#include <nvbench/detail/throw.cuh>
 #include <nvbench/device_manager.cuh>
 #include <nvbench/git_revision.cuh>
 #include <nvbench/json_printer.cuh>
 #include <nvbench/markdown_printer.cuh>
+#include <nvbench/option_parser.cuh>
 #include <nvbench/printer_base.cuh>
 #include <nvbench/range.cuh>
 #include <nvbench/version.cuh>
 
-#include <nvbench/detail/throw.cuh>
-
 // These are generated from the markdown docs by CMake in the build directory:
 #include <nvbench/internal/cli_help.cuh>
 #include <nvbench/internal/cli_help_axis.cuh>
 
-#include <fmt/format.h>
-
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
@@ -48,10 +44,12 @@
 #include <regex>
 #include <stdexcept>
 #include <string>
-#include <string_view>
 #include <tuple>
 #include <vector>
 
+#include <fmt/format.h>
+#include <string_view>
+
 namespace
 {
 
@@ -433,6 +431,11 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->lock_gpu_clocks(first[1]);
       first += 2;
     }
+    else if (arg == "--discard-on-throttle")
+    {
+      this->enable_discard_on_throttle();
+      first += 1;
+    }
     else if (arg == "--run-once")
     {
       this->enable_run_once();
@@ -515,7 +518,8 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->update_int64_prop(first[0], first[1]);
       first += 2;
     }
-    else if (arg == "--skip-time" || arg == "--timeout")
+    else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" ||
+             arg == "--throttle-recovery-delay")
     {
       check_params(1);
       this->update_float64_prop(first[0], first[1]);
@@ -623,7 +627,7 @@ void option_parser::print_version() const
              NVBENCH_GIT_VERSION);
 }
 
-void option_parser::print_list(printer_base& printer) const
+void option_parser::print_list(printer_base &printer) const
 {
   const auto &bench_mgr = nvbench::benchmark_manager::get();
   printer.print_device_info();
@@ -727,6 +731,18 @@ void option_parser::enable_run_once()
   bench.set_run_once(true);
 }
 
+void option_parser::enable_discard_on_throttle()
+{
+  if (m_benchmarks.empty())
+  {
+    m_global_benchmark_args.push_back("--discard-on-throttle");
+    return;
+  }
+
+  benchmark_base &bench = *m_benchmarks.back();
+  bench.set_discard_on_throttle(true);
+}
+
 void option_parser::set_stopping_criterion(const std::string &criterion)
 {
   // If no active benchmark, save args as global.
@@ -979,10 +995,9 @@ catch (std::exception &e)
                 e.what());
 }
 
-void option_parser::update_criterion_prop(
-  const std::string &prop_arg,
-  const std::string &prop_val,
-  const nvbench::named_values::type type)
+void option_parser::update_criterion_prop(const std::string &prop_arg,
+                                          const std::string &prop_val,
+                                          const nvbench::named_values::type type)
 try
 {
   // If no active benchmark, save args as global.
@@ -993,8 +1008,8 @@ try
     return;
   }
 
-  benchmark_base &bench = *m_benchmarks.back();
-  nvbench::criterion_params& criterion_params = bench.get_criterion_params();
+  benchmark_base &bench                       = *m_benchmarks.back();
+  nvbench::criterion_params &criterion_params = bench.get_criterion_params();
   std::string name(prop_arg.begin() + 2, prop_arg.end());
   if (type == nvbench::named_values::type::float64)
   {
@@ -1022,7 +1037,7 @@ try
     NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
   }
 }
-catch (std::exception& e)
+catch (std::exception &e)
 {
   NVBENCH_THROW(std::runtime_error,
                 "Error handling option `{} {}`:\n{}",
@@ -1054,6 +1069,14 @@ try
   {
     bench.set_timeout(value);
   }
+  else if (prop_arg == "--throttle-threshold")
+  {
+    bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value));
+  }
+  else if (prop_arg == "--throttle-recovery-delay")
+  {
+    bench.set_throttle_recovery_delay(static_cast<nvbench::float32_t>(value));
+  }
   else
   {
     NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh
index 5bd834c..c58bfc5 100644
--- a/nvbench/option_parser.cuh
+++ b/nvbench/option_parser.cuh
@@ -80,7 +80,7 @@ private:
   std::ostream &printer_spec_to_ostream(const std::string &spec);
 
   void print_version() const;
-  void print_list(printer_base& printer) const;
+  void print_list(printer_base &printer) const;
   void print_help() const;
   void print_help_axis() const;
 
@@ -89,6 +89,7 @@ private:
 
   void set_stopping_criterion(const std::string &criterion);
   void enable_run_once();
+  void enable_discard_on_throttle();
   void disable_blocking_kernel();
 
   void add_benchmark(const std::string &name);
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 86e6d16..8d56c0f 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -22,9 +22,9 @@
 #include <nvbench/device_info.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/named_values.cuh>
+#include <nvbench/stopping_criterion.cuh>
 #include <nvbench/summary.cuh>
 #include <nvbench/types.cuh>
-#include <nvbench/stopping_criterion.cuh>
 
 #include <functional>
 #include <optional>
@@ -136,8 +136,11 @@ struct state
 
   /// Control the stopping criterion for the measurement loop.
   /// @{
-  [[nodiscard]] const std::string& get_stopping_criterion() const { return m_stopping_criterion; }
-  void set_stopping_criterion(std::string criterion) { m_stopping_criterion = std::move(criterion); }
+  [[nodiscard]] const std::string &get_stopping_criterion() const { return m_stopping_criterion; }
+  void set_stopping_criterion(std::string criterion)
+  {
+    m_stopping_criterion = std::move(criterion);
+  }
   /// @}
 
   /// If true, the benchmark is only run once, skipping all warmup runs and only
@@ -200,6 +203,30 @@ struct state
   void set_timeout(nvbench::float64_t timeout) { m_timeout = timeout; }
   /// @}
 
+  [[nodiscard]] nvbench::float32_t get_throttle_threshold() const { return m_throttle_threshold; }
+
+  void set_throttle_threshold(nvbench::float32_t throttle_threshold)
+  {
+    m_throttle_threshold = throttle_threshold;
+  }
+
+  [[nodiscard]] nvbench::float32_t get_throttle_recovery_delay() const
+  {
+    return m_throttle_recovery_delay;
+  }
+
+  void set_throttle_recovery_delay(nvbench::float32_t throttle_recovery_delay)
+  {
+    m_throttle_recovery_delay = throttle_recovery_delay;
+  }
+
+  [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; }
+
+  void set_discard_on_throttle(bool discard_on_throttle)
+  {
+    m_discard_on_throttle = discard_on_throttle;
+  }
+
   /// If a `KernelLauncher` syncs and `nvbench::exec_tag::sync` is not passed
   /// to `state.exec(...)`, a deadlock may occur. If a `blocking_kernel` blocks
   /// for more than `blocking_kernel_timeout` seconds, an error will be printed
@@ -311,6 +338,10 @@ private:
   nvbench::float64_t m_skip_time;
   nvbench::float64_t m_timeout;
 
+  nvbench::float32_t m_throttle_threshold{0.75f};     // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
+  bool m_discard_on_throttle{false};
+
   // Deadlock protection. See blocking_kernel's class doc for details.
   nvbench::float64_t m_blocking_kernel_timeout{30.0};
 
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index 8779e9f..bfdac1b 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -15,20 +15,19 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-
 #include <nvbench/state.cuh>
 
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/types.cuh>
 
-#include <fmt/color.h>
-#include <fmt/format.h>
-
 #include <algorithm>
 #include <stdexcept>
 #include <string>
 
+#include <fmt/color.h>
+#include <fmt/format.h>
+
 namespace nvbench
 {
 
@@ -42,6 +41,9 @@ state::state(const benchmark_base &bench)
     , m_min_samples{bench.get_min_samples()}
     , m_skip_time{bench.get_skip_time()}
     , m_timeout{bench.get_timeout()}
+    , m_throttle_threshold{bench.get_throttle_threshold()}
+    , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
+    , m_discard_on_throttle{bench.get_discard_on_throttle()}
 {}
 
 state::state(const benchmark_base &bench,
@@ -60,6 +62,9 @@ state::state(const benchmark_base &bench,
     , m_min_samples{bench.get_min_samples()}
     , m_skip_time{bench.get_skip_time()}
     , m_timeout{bench.get_timeout()}
+    , m_throttle_threshold{bench.get_throttle_threshold()}
+    , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
+    , m_discard_on_throttle{bench.get_discard_on_throttle()}
 {}
 
 nvbench::int64_t state::get_int64(const std::string &axis_name) const

From 2ba2d1131deb73cc6ffba1515fcc17243baf5b58 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Fri, 11 Apr 2025 22:33:40 +0000
Subject: [PATCH 2/5] Report mean SM clock rate

---
 nvbench/detail/measure_cold.cu  | 14 ++++++++++++++
 nvbench/detail/measure_cold.cuh |  2 ++
 nvbench/markdown_printer.cu     | 25 +++++++++++++++++++++++++
 nvbench/markdown_printer.cuh    |  1 +
 4 files changed, 42 insertions(+)

diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 95c292b..d911343 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -54,6 +54,7 @@ measure_cold_base::measure_cold_base(state &exec_state)
   {
     m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
     m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
+    m_sm_clock_rates.reserve(static_cast<std::size_t>(m_min_samples));
   }
 }
 
@@ -83,6 +84,7 @@ void measure_cold_base::initialize()
 
   m_cuda_times.clear();
   m_cpu_times.clear();
+  m_sm_clock_rates.clear();
 
   m_stopping_criterion.initialize(m_criterion_params);
 }
@@ -94,6 +96,7 @@ void measure_cold_base::record_measurements()
   if (!m_run_once)
   {
     auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
+    m_sm_clock_rates.push_back(peak_clock_rate);
 
     if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
     {
@@ -338,6 +341,17 @@ void measure_cold_base::generate_summaries()
     summ.set_string("hide", "Hidden by default.");
   }
 
+  if (!m_sm_clock_rates.empty())
+  {
+    auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
+    summ.set_string("name", "Clock Rate");
+    summ.set_string("hint", "frequency");
+    summ.set_string("description", "Mean SM clock rate");
+    summ.set_string("hide", "Hidden by default.");
+    summ.set_float64("value", nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(), 
+                                                                        m_sm_clock_rates.cend()));
+  }
+
   // Log if a printer exists:
   if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index a27e1e3..1c6d1e7 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -36,6 +36,7 @@
 
 #include <utility>
 #include <vector>
+#include "nvbench/types.cuh"
 
 namespace nvbench
 {
@@ -117,6 +118,7 @@ protected:
 
   std::vector<nvbench::float64_t> m_cuda_times;
   std::vector<nvbench::float64_t> m_cpu_times;
+  std::vector<nvbench::float32_t> m_sm_clock_rates;
 
   bool m_max_time_exceeded{};
 };
diff --git a/nvbench/markdown_printer.cu b/nvbench/markdown_printer.cu
index 263d230..db3d871 100644
--- a/nvbench/markdown_printer.cu
+++ b/nvbench/markdown_printer.cu
@@ -294,6 +294,10 @@ void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_
             {
               table.add_cell(row, tag, header, this->do_format_item_rate(summ));
             }
+            else if (hint == "frequency")
+            {
+              table.add_cell(row, tag, header, this->do_format_frequency(summ));
+            }
             else if (hint == "bytes")
             {
               table.add_cell(row, tag, header, this->do_format_bytes(summ));
@@ -399,6 +403,27 @@ std::string markdown_printer::do_format_item_rate(const summary &data)
   }
 }
 
+std::string markdown_printer::do_format_frequency(const nvbench::summary &data)
+{
+  const auto frequency_hz = data.get_float64("value");
+  if (frequency_hz >= 1e9)
+  {
+    return fmt::format("{:0.3f} GHz", frequency_hz * 1e-9);
+  }
+  else if (frequency_hz >= 1e6)
+  {
+    return fmt::format("{:0.3f} MHz", frequency_hz * 1e-6);
+  }
+  else if (frequency_hz >= 1e3)
+  {
+    return fmt::format("{:0.3f} KHz", frequency_hz * 1e-3);
+  }
+  else
+  {
+    return fmt::format("{:0.3f} Hz", frequency_hz);
+  }
+}
+
 std::string markdown_printer::do_format_bytes(const summary &data)
 {
   const auto bytes = static_cast<nvbench::float64_t>(data.get_int64("value"));
diff --git a/nvbench/markdown_printer.cuh b/nvbench/markdown_printer.cuh
index fa4fedc..970be6a 100644
--- a/nvbench/markdown_printer.cuh
+++ b/nvbench/markdown_printer.cuh
@@ -64,6 +64,7 @@ protected:
   virtual std::string do_format_default(const nvbench::summary &data);
   virtual std::string do_format_duration(const nvbench::summary &seconds);
   virtual std::string do_format_item_rate(const nvbench::summary &items_per_sec);
+  virtual std::string do_format_frequency(const nvbench::summary &frequency_hz);
   virtual std::string do_format_bytes(const nvbench::summary &bytes);
   virtual std::string do_format_byte_rate(const nvbench::summary &bytes_per_sec);
   virtual std::string do_format_sample_size(const nvbench::summary &count);

From 5c0d67475735e8017ec7b243f2ba371285a13b18 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Fri, 11 Apr 2025 15:44:11 -0700
Subject: [PATCH 3/5] Fix overflow in default clock rate

---
 nvbench/device_info.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh
index 4124d96..1bb5262 100644
--- a/nvbench/device_info.cuh
+++ b/nvbench/device_info.cuh
@@ -109,7 +109,7 @@ struct device_info
   /// @return The default clock rate of the SM in Hz.
   [[nodiscard]] std::size_t get_sm_default_clock_rate() const
   { // kHz -> Hz
-    return static_cast<std::size_t>(m_prop.clockRate * 1000);
+    return static_cast<std::size_t>(m_prop.clockRate) * 1000;
   }
 
   /// @return The number of physical streaming multiprocessors on this device.

From b926daf09faa02cbe0ba8e77a2d001b82a439ced Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Sun, 13 Apr 2025 03:53:59 +0000
Subject: [PATCH 4/5] Better throttle recovery delay

---
 docs/cli_help.md                |  4 ++--
 nvbench/benchmark_base.cuh      |  4 ++--
 nvbench/detail/measure_cold.cuh | 20 ++++++++++----------
 nvbench/option_parser.cu        |  2 +-
 nvbench/state.cuh               |  4 ++--
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/docs/cli_help.md b/docs/cli_help.md
index 5b70096..e6d96d3 100644
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -140,13 +140,13 @@
 
 * `--throttle-threshold <value>`
   * Set the GPU throttle threshold as percentage of the peak clock rate.
-  * Default is 0.75 (75%).
+  * Default is 75%.
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
 
 * `--throttle-recovery-delay <value>`
   * Set the GPU throttle recovery delay in seconds.
-  * Default is 0.5 seconds.
+  * Default is 0.05 seconds.
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
 
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index 3ed0ebf..4e658b5 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -309,8 +309,8 @@ protected:
   nvbench::float64_t m_skip_time{-1.};
   nvbench::float64_t m_timeout{15.};
 
-  nvbench::float32_t m_throttle_threshold{0.75f};     // [% of peak SM clock rate]
-  nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
+  nvbench::float32_t m_throttle_threshold{0.75f};      // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]
   bool m_discard_on_throttle{false};
 
   nvbench::criterion_params m_criterion_params;
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 1c6d1e7..7e409fd 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -18,24 +18,24 @@
 
 #pragma once
 
+#include <cuda_runtime.h>
+
 #include <nvbench/blocking_kernel.cuh>
 #include <nvbench/cpu_timer.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_timer.cuh>
+#include <nvbench/detail/gpu_frequency.cuh>
+#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
+#include <nvbench/detail/l2flush.cuh>
+#include <nvbench/detail/statistics.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
 #include <nvbench/stopping_criterion.cuh>
 
-#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
-#include <nvbench/detail/l2flush.cuh>
-#include <nvbench/detail/statistics.cuh>
-#include <nvbench/detail/gpu_frequency.cuh>
-
-#include <cuda_runtime.h>
-
 #include <utility>
 #include <vector>
+
 #include "nvbench/types.cuh"
 
 namespace nvbench
@@ -91,7 +91,7 @@ protected:
   nvbench::blocking_kernel m_blocker;
 
   nvbench::criterion_params m_criterion_params;
-  nvbench::stopping_criterion_base& m_stopping_criterion;
+  nvbench::stopping_criterion_base &m_stopping_criterion;
   nvbench::detail::gpu_frequency m_gpu_frequency;
 
   bool m_disable_blocking_kernel{false};
@@ -102,8 +102,8 @@ protected:
   nvbench::float64_t m_skip_time{};
   nvbench::float64_t m_timeout{};
 
-  nvbench::float32_t m_throttle_threshold{0.75f};     // [% of peak SM clock rate]
-  nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
+  nvbench::float32_t m_throttle_threshold;      // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_recovery_delay; // [seconds]
   bool m_discard_on_throttle{false};
 
   nvbench::int64_t m_total_samples{};
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 04b50fb..98a1f84 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -1071,7 +1071,7 @@ try
   }
   else if (prop_arg == "--throttle-threshold")
   {
-    bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value));
+    bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value) / 100.0f);
   }
   else if (prop_arg == "--throttle-recovery-delay")
   {
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 8d56c0f..321e068 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -338,8 +338,8 @@ private:
   nvbench::float64_t m_skip_time;
   nvbench::float64_t m_timeout;
 
-  nvbench::float32_t m_throttle_threshold{0.75f};     // [% of peak SM clock rate]
-  nvbench::float32_t m_throttle_recovery_delay{0.0f}; // [seconds]
+  nvbench::float32_t m_throttle_threshold;      // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_recovery_delay; // [seconds]
   bool m_discard_on_throttle{false};
 
   // Deadlock protection. See blocking_kernel's class doc for details.

From 254ac2517fdce00c91918417df36ec3c9a1a4b36 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Sat, 12 Apr 2025 21:13:13 -0700
Subject: [PATCH 5/5] Remove discard on throttle option

---
 docs/cli_help.md                |  5 -----
 nvbench/benchmark_base.cuh      |  8 --------
 nvbench/benchmark_base.cxx      |  1 -
 nvbench/detail/measure_cold.cu  | 22 ++++++++++------------
 nvbench/detail/measure_cold.cuh |  1 -
 nvbench/option_parser.cu        | 17 -----------------
 nvbench/option_parser.cuh       |  1 -
 nvbench/state.cuh               |  8 --------
 nvbench/state.cxx               |  5 +----
 9 files changed, 11 insertions(+), 57 deletions(-)

diff --git a/docs/cli_help.md b/docs/cli_help.md
index e6d96d3..424c1be 100644
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -133,11 +133,6 @@
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
 
-* `--discard-on-throttle`
-  * Discard measurements if the GPU is throttled.
-  * Applies to the most recent `--benchmark`, or all benchmarks if specified
-    before any `--benchmark` arguments.
-
 * `--throttle-threshold <value>`
   * Set the GPU throttle threshold as percentage of the peak clock rate.
   * Default is 75%.
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index 4e658b5..d35779c 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -264,13 +264,6 @@ struct benchmark_base
     m_throttle_recovery_delay = throttle_recovery_delay;
   }
 
-  [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; }
-
-  void set_discard_on_throttle(bool discard_on_throttle)
-  {
-    m_discard_on_throttle = discard_on_throttle;
-  }
-
   [[nodiscard]] nvbench::criterion_params &get_criterion_params() { return m_criterion_params; }
   [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
   {
@@ -311,7 +304,6 @@ protected:
 
   nvbench::float32_t m_throttle_threshold{0.75f};      // [% of peak SM clock rate]
   nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]
-  bool m_discard_on_throttle{false};
 
   nvbench::criterion_params m_criterion_params;
   std::string m_stopping_criterion{"stdrel"};
diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx
index 692019b..592796d 100644
--- a/nvbench/benchmark_base.cxx
+++ b/nvbench/benchmark_base.cxx
@@ -47,7 +47,6 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
   result->m_criterion_params        = m_criterion_params;
   result->m_throttle_threshold      = m_throttle_threshold;
   result->m_throttle_recovery_delay = m_throttle_recovery_delay;
-  result->m_discard_on_throttle     = m_discard_on_throttle;
 
   result->m_stopping_criterion = m_stopping_criterion;
 
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index d911343..1421241 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -26,8 +26,8 @@
 #include <nvbench/summary.cuh>
 
 #include <algorithm>
-#include <limits>
 #include <chrono>
+#include <limits>
 #include <thread>
 
 #include <fmt/format.h>
@@ -48,7 +48,6 @@ measure_cold_base::measure_cold_base(state &exec_state)
     , m_timeout{exec_state.get_timeout()}
     , m_throttle_threshold(exec_state.get_throttle_threshold())
     , m_throttle_recovery_delay(exec_state.get_throttle_recovery_delay())
-    , m_discard_on_throttle(exec_state.get_discard_on_throttle())
 {
   if (m_min_samples > 0)
   {
@@ -96,7 +95,6 @@ void measure_cold_base::record_measurements()
   if (!m_run_once)
   {
     auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
-    m_sm_clock_rates.push_back(peak_clock_rate);
 
     if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
     {
@@ -106,14 +104,13 @@ void measure_cold_base::record_measurements()
         auto &printer           = printer_opt_ref.value().get();
         printer.log(nvbench::log_level::warn,
                     fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
-                                "({:0.0f}% < {:0.0f}%) on sample {}. {} previous sample and "
-                                "pausing for {}s.",
+                                "({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
+                                "and pausing for {}s.",
                                 current_clock_rate / 1000000.0f,
                                 peak_clock_rate / 1000000.0f,
                                 100.0f * (current_clock_rate / peak_clock_rate),
                                 100.0f * m_throttle_threshold,
                                 m_total_samples,
-                                m_discard_on_throttle ? "Discarding" : "Keeping",
                                 m_throttle_recovery_delay));
       }
 
@@ -122,11 +119,11 @@ void measure_cold_base::record_measurements()
         std::this_thread::sleep_for(std::chrono::duration<float>(m_throttle_recovery_delay));
       }
 
-      if (m_discard_on_throttle)
-      { // ignore this measurement
-        return;
-      }
+      // ignore this measurement
+      return;
     }
+
+    m_sm_clock_rates.push_back(peak_clock_rate);
   }
 
   // Update and record timers and counters:
@@ -348,8 +345,9 @@ void measure_cold_base::generate_summaries()
     summ.set_string("hint", "frequency");
     summ.set_string("description", "Mean SM clock rate");
     summ.set_string("hide", "Hidden by default.");
-    summ.set_float64("value", nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(), 
-                                                                        m_sm_clock_rates.cend()));
+    summ.set_float64("value",
+                     nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(),
+                                                               m_sm_clock_rates.cend()));
   }
 
   // Log if a printer exists:
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 7e409fd..80f8e5a 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -104,7 +104,6 @@ protected:
 
   nvbench::float32_t m_throttle_threshold;      // [% of peak SM clock rate]
   nvbench::float32_t m_throttle_recovery_delay; // [seconds]
-  bool m_discard_on_throttle{false};
 
   nvbench::int64_t m_total_samples{};
 
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 98a1f84..eef46f8 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -431,11 +431,6 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->lock_gpu_clocks(first[1]);
       first += 2;
     }
-    else if (arg == "--discard-on-throttle")
-    {
-      this->enable_discard_on_throttle();
-      first += 1;
-    }
     else if (arg == "--run-once")
     {
       this->enable_run_once();
@@ -731,18 +726,6 @@ void option_parser::enable_run_once()
   bench.set_run_once(true);
 }
 
-void option_parser::enable_discard_on_throttle()
-{
-  if (m_benchmarks.empty())
-  {
-    m_global_benchmark_args.push_back("--discard-on-throttle");
-    return;
-  }
-
-  benchmark_base &bench = *m_benchmarks.back();
-  bench.set_discard_on_throttle(true);
-}
-
 void option_parser::set_stopping_criterion(const std::string &criterion)
 {
   // If no active benchmark, save args as global.
diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh
index c58bfc5..bb12513 100644
--- a/nvbench/option_parser.cuh
+++ b/nvbench/option_parser.cuh
@@ -89,7 +89,6 @@ private:
 
   void set_stopping_criterion(const std::string &criterion);
   void enable_run_once();
-  void enable_discard_on_throttle();
   void disable_blocking_kernel();
 
   void add_benchmark(const std::string &name);
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 321e068..9b0a5c1 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -220,13 +220,6 @@ struct state
     m_throttle_recovery_delay = throttle_recovery_delay;
   }
 
-  [[nodiscard]] bool get_discard_on_throttle() const { return m_discard_on_throttle; }
-
-  void set_discard_on_throttle(bool discard_on_throttle)
-  {
-    m_discard_on_throttle = discard_on_throttle;
-  }
-
   /// If a `KernelLauncher` syncs and `nvbench::exec_tag::sync` is not passed
   /// to `state.exec(...)`, a deadlock may occur. If a `blocking_kernel` blocks
   /// for more than `blocking_kernel_timeout` seconds, an error will be printed
@@ -340,7 +333,6 @@ private:
 
   nvbench::float32_t m_throttle_threshold;      // [% of peak SM clock rate]
   nvbench::float32_t m_throttle_recovery_delay; // [seconds]
-  bool m_discard_on_throttle{false};
 
   // Deadlock protection. See blocking_kernel's class doc for details.
   nvbench::float64_t m_blocking_kernel_timeout{30.0};
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index bfdac1b..49af11b 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -15,10 +15,9 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-#include <nvbench/state.cuh>
-
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/detail/throw.cuh>
+#include <nvbench/state.cuh>
 #include <nvbench/types.cuh>
 
 #include <algorithm>
@@ -43,7 +42,6 @@ state::state(const benchmark_base &bench)
     , m_timeout{bench.get_timeout()}
     , m_throttle_threshold{bench.get_throttle_threshold()}
     , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
-    , m_discard_on_throttle{bench.get_discard_on_throttle()}
 {}
 
 state::state(const benchmark_base &bench,
@@ -64,7 +62,6 @@ state::state(const benchmark_base &bench,
     , m_timeout{bench.get_timeout()}
     , m_throttle_threshold{bench.get_throttle_threshold()}
     , m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
-    , m_discard_on_throttle{bench.get_discard_on_throttle()}
 {}
 
 nvbench::int64_t state::get_int64(const std::string &axis_name) const