Merge pull request #215 from alliepiper/dynamic_throttle_delay

Dynamically increase recovery delay for consecutive discards.
2026-03-14 20:27:24 +00:00 · 2025-04-24 10:32:45 -04:00
parent 89bec09b82 d12614b5cb
commit 2938a94d49
2 changed files with 25 additions and 5 deletions
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -81,6 +81,9 @@ void measure_cold_base::initialize()
  m_total_samples             = 0;
  m_max_time_exceeded         = false;

+  m_dynamic_throttle_recovery_delay = m_throttle_recovery_delay;
+  m_throttle_discard_count          = 0;
+
  m_cuda_times.clear();
  m_cpu_times.clear();

@@ -99,29 +102,41 @@ void measure_cold_base::record_measurements()

    if (m_gpu_frequency.has_throttled(default_clock_rate, m_throttle_threshold))
    {
+      if (m_throttle_discard_count > 2)
+      {
+        // Throttling detected in multiple consecutive trials. The delay is not sufficient to
+        // recover. Increase the delay by 1.5x its current value, but by no more than 0.5 seconds:
+        m_dynamic_throttle_recovery_delay += std::min(m_dynamic_throttle_recovery_delay * 1.5f,
+                                                      0.5f);
+      }
+
      if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
      {
        auto &printer = printer_opt_ref.value().get();
        printer.log(nvbench::log_level::warn,
                    fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
-                                "({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
-                                "and pausing for {}s.",
+                                "({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous trial "
+                                "and pausing for {:0.3f}s.",
                                current_clock_rate / 1000000.0f,
                                default_clock_rate / 1000000.0f,
                                100.0f * (current_clock_rate / default_clock_rate),
                                100.0f * m_throttle_threshold,
                                m_total_samples,
-                                m_throttle_recovery_delay));
+                                m_dynamic_throttle_recovery_delay));
      }

-      if (m_throttle_recovery_delay > 0.0f)
+      if (m_dynamic_throttle_recovery_delay > 0.0f)
      { // let the GPU cool down
-        std::this_thread::sleep_for(std::chrono::duration<float>(m_throttle_recovery_delay));
+        std::this_thread::sleep_for(
+          std::chrono::duration<float>(m_dynamic_throttle_recovery_delay));
      }

+      m_throttle_discard_count += 1;
+
      // ignore this measurement
      return;
    }
+    m_throttle_discard_count = 0;

    m_sm_clock_rate_accumulator += current_clock_rate;
  }
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -104,6 +104,11 @@ protected:
  nvbench::float32_t m_throttle_threshold;      // [% of default SM clock rate]
  nvbench::float32_t m_throttle_recovery_delay; // [seconds]

+  // Dynamically increased when repeated throttling occurs
+  // without successfully recording a sample.
+  nvbench::float32_t m_dynamic_throttle_recovery_delay{}; // [seconds]
+  nvbench::int64_t m_throttle_discard_count{};
+
  nvbench::int64_t m_total_samples{};

  nvbench::float64_t m_min_cuda_time{};