Merge pull request #207 from alliepiper/throttle_followup

Throttling followup
2026-03-14 20:27:24 +00:00 · 2025-04-18 08:48:41 -04:00
parent 46ab283d02 9bf5e987cf
commit 89bec09b82
8 changed files with 57 additions and 39 deletions
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -134,7 +134,7 @@
    before any `--benchmark` arguments.

 * `--throttle-threshold <value>`
-  * Set the GPU throttle threshold as percentage of the peak clock rate.
+  * Set the GPU throttle threshold as percentage of the device's default clock rate.
  * Default is 75%.
  * Applies to the most recent `--benchmark`, or all benchmarks if specified
    before any `--benchmark` arguments.
--- a/examples/summaries.cu
+++ b/examples/summaries.cu
@@ -41,7 +41,7 @@ void summary_example(nvbench::state &state)
  }

  // Run the measurements:
-  state.exec([duration](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [duration](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration);
  });

@@ -56,14 +56,17 @@ void summary_example(nvbench::state &state)
 #endif

  // Default summary columns can be shown/hidden in the markdown output tables by adding/removing
-  // the "hide" key. Modify this benchmark to show the minimum and maximum times, but hide the
-  // means.
+  // the "hide" key. Modify this benchmark to show the minimum and maximum GPUs times, but hide the
+  // mean GPU time and all CPU times. SM Clock frequency and throttling info are also shown.
  state.get_summary("nv/cold/time/gpu/min").remove_value("hide");
  state.get_summary("nv/cold/time/gpu/max").remove_value("hide");
  state.get_summary("nv/cold/time/gpu/mean").set_string("hide", "");
-  state.get_summary("nv/cold/time/cpu/min").remove_value("hide");
-  state.get_summary("nv/cold/time/cpu/max").remove_value("hide");
  state.get_summary("nv/cold/time/cpu/mean").set_string("hide", "");
+  state.get_summary("nv/cold/time/cpu/min").set_string("hide", "");
+  state.get_summary("nv/cold/time/cpu/max").set_string("hide", "");
+  state.get_summary("nv/cold/time/cpu/stdev/relative").set_string("hide", "");
+  state.get_summary("nv/cold/sm_clock_rate/mean").remove_value("hide");
+  state.get_summary("nv/cold/sm_clock_rate/scaling/percent").remove_value("hide");
 }
 NVBENCH_BENCH(summary_example)
  .add_int64_axis("ms", nvbench::range(10, 50, 20))
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -302,7 +302,7 @@ protected:
  nvbench::float64_t m_skip_time{-1.};
  nvbench::float64_t m_timeout{15.};

-  nvbench::float32_t m_throttle_threshold{0.75f};      // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_threshold{0.75f};      // [% of default SM clock rate]
  nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]

  nvbench::criterion_params m_criterion_params;
--- a/nvbench/detail/gpu_frequency.cuh
+++ b/nvbench/detail/gpu_frequency.cuh
@@ -40,7 +40,7 @@ struct gpu_frequency

  void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); }

-  [[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
+  [[nodiscard]] bool has_throttled(nvbench::float32_t default_sm_clock_rate_hz,
                                   nvbench::float32_t throttle_threshold);

  [[nodiscard]] nvbench::float32_t get_clock_frequency();
--- a/nvbench/detail/gpu_frequency.cxx
+++ b/nvbench/detail/gpu_frequency.cxx
@@ -31,10 +31,10 @@ nvbench::float32_t gpu_frequency::get_clock_frequency()
  return clock_rate;
 }

-bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
+bool gpu_frequency::has_throttled(nvbench::float32_t default_sm_clock_rate_hz,
                                  nvbench::float32_t throttle_threshold)
 {
-  float threshold = peak_sm_clock_rate_hz * throttle_threshold;
+  float threshold = default_sm_clock_rate_hz * throttle_threshold;

  if (this->get_clock_frequency() < threshold)
  {
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -53,7 +53,6 @@ measure_cold_base::measure_cold_base(state &exec_state)
  {
    m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
    m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
-    m_sm_clock_rates.reserve(static_cast<std::size_t>(m_min_samples));
  }
 }

@@ -72,18 +71,18 @@ void measure_cold_base::check()

 void measure_cold_base::initialize()
 {
-  m_min_cuda_time     = std::numeric_limits<nvbench::float64_t>::max();
-  m_max_cuda_time     = std::numeric_limits<nvbench::float64_t>::lowest();
-  m_total_cuda_time   = 0.;
-  m_min_cpu_time      = std::numeric_limits<nvbench::float64_t>::max();
-  m_max_cpu_time      = std::numeric_limits<nvbench::float64_t>::lowest();
-  m_total_cpu_time    = 0.;
-  m_total_samples     = 0;
-  m_max_time_exceeded = false;
+  m_min_cuda_time             = std::numeric_limits<nvbench::float64_t>::max();
+  m_max_cuda_time             = std::numeric_limits<nvbench::float64_t>::lowest();
+  m_total_cuda_time           = 0.;
+  m_min_cpu_time              = std::numeric_limits<nvbench::float64_t>::max();
+  m_max_cpu_time              = std::numeric_limits<nvbench::float64_t>::lowest();
+  m_total_cpu_time            = 0.;
+  m_sm_clock_rate_accumulator = 0.;
+  m_total_samples             = 0;
+  m_max_time_exceeded         = false;

  m_cuda_times.clear();
  m_cpu_times.clear();
-  m_sm_clock_rates.clear();

  m_stopping_criterion.initialize(m_criterion_params);
 }
@@ -94,21 +93,22 @@ void measure_cold_base::record_measurements()
 {
  if (!m_run_once)
  {
-    auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
+    const auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
+    const auto default_clock_rate =
+      static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());

-    if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
+    if (m_gpu_frequency.has_throttled(default_clock_rate, m_throttle_threshold))
    {
      if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
      {
-        auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
-        auto &printer           = printer_opt_ref.value().get();
+        auto &printer = printer_opt_ref.value().get();
        printer.log(nvbench::log_level::warn,
                    fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
                                "({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
                                "and pausing for {}s.",
                                current_clock_rate / 1000000.0f,
-                                peak_clock_rate / 1000000.0f,
-                                100.0f * (current_clock_rate / peak_clock_rate),
+                                default_clock_rate / 1000000.0f,
+                                100.0f * (current_clock_rate / default_clock_rate),
                                100.0f * m_throttle_threshold,
                                m_total_samples,
                                m_throttle_recovery_delay));
@@ -123,7 +123,7 @@ void measure_cold_base::record_measurements()
      return;
    }

-    m_sm_clock_rates.push_back(peak_clock_rate);
+    m_sm_clock_rate_accumulator += current_clock_rate;
  }

  // Update and record timers and counters:
@@ -338,16 +338,30 @@ void measure_cold_base::generate_summaries()
    summ.set_string("hide", "Hidden by default.");
  }

-  if (!m_sm_clock_rates.empty())
+  if (m_sm_clock_rate_accumulator != 0.)
  {
-    auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
-    summ.set_string("name", "Clock Rate");
-    summ.set_string("hint", "frequency");
-    summ.set_string("description", "Mean SM clock rate");
-    summ.set_string("hide", "Hidden by default.");
-    summ.set_float64("value",
-                     nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(),
-                                                               m_sm_clock_rates.cend()));
+    const auto clock_mean = m_sm_clock_rate_accumulator / d_samples;
+
+    {
+      auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
+      summ.set_string("name", "Clock Rate");
+      summ.set_string("hint", "frequency");
+      summ.set_string("description", "Mean SM clock rate");
+      summ.set_string("hide", "Hidden by default.");
+      summ.set_float64("value", clock_mean);
+    }
+
+    {
+      const auto default_clock_rate =
+        static_cast<nvbench::float64_t>(m_state.get_device()->get_sm_default_clock_rate());
+
+      auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/scaling/percent");
+      summ.set_string("name", "Clock Scaling");
+      summ.set_string("hint", "percentage");
+      summ.set_string("description", "Mean SM clock rate as a percentage of default clock rate.");
+      summ.set_string("hide", "Hidden by default.");
+      summ.set_float64("value", clock_mean / default_clock_rate);
+    }
  }

  // Log if a printer exists:
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -101,7 +101,7 @@ protected:
  nvbench::float64_t m_skip_time{};
  nvbench::float64_t m_timeout{};

-  nvbench::float32_t m_throttle_threshold;      // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_threshold;      // [% of default SM clock rate]
  nvbench::float32_t m_throttle_recovery_delay; // [seconds]

  nvbench::int64_t m_total_samples{};
@@ -114,9 +114,10 @@ protected:
  nvbench::float64_t m_max_cpu_time{};
  nvbench::float64_t m_total_cpu_time{};

+  nvbench::float64_t m_sm_clock_rate_accumulator{};
+
  std::vector<nvbench::float64_t> m_cuda_times;
  std::vector<nvbench::float64_t> m_cpu_times;
-  std::vector<nvbench::float32_t> m_sm_clock_rates;

  bool m_max_time_exceeded{};
 };
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -331,7 +331,7 @@ private:
  nvbench::float64_t m_skip_time;
  nvbench::float64_t m_timeout;

-  nvbench::float32_t m_throttle_threshold;      // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_threshold;      // [% of default SM clock rate]
  nvbench::float32_t m_throttle_recovery_delay; // [seconds]

  // Deadlock protection. See blocking_kernel's class doc for details.