Implement new convergence check for noisy kernels.

Previously, convergence was tested by waiting for the relative stdev of cuda timings ("noise") to drop below a certain percentage (`max_noise`). This assumed that all benchmarks would eventually see their noise drop to some threshold, but this is not the case. In practice, many benchmarks never converge to the default 0.5% relative stdev and instead will always run to the 15s timeout -- even if the means have converged in a second or two. Added a new check that tests when the noise itself stabilizes and ends the benchmark, even if noise > max_noise. After testing, this patch alone significantly reduces the runtime of the Thrust+CUB benchmark suite (from 30 hours to 5 hours) and produces similar timing results. The parameters used to tune this feature are not exposed -- if this approach works long-term and there's a strong motivation to let users tweak them, then we can worry about names/APIs/CLI/docs later.
2026-05-11 08:50:03 +00:00 · 2021-12-21 21:16:17 -05:00
parent 8e56a7bd94
commit 178dd0eb68
6 changed files with 382 additions and 87 deletions
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -19,12 +19,14 @@
 #include <nvbench/detail/measure_cold.cuh>

 #include <nvbench/benchmark_base.cuh>
-#include <nvbench/detail/throw.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/printer_base.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>

+#include <nvbench/detail/ring_buffer.cuh>
+#include <nvbench/detail/throw.cuh>
+
 #include <fmt/format.h>

 #include <algorithm>
@@ -43,7 +45,7 @@ measure_cold_base::measure_cold_base(state &exec_state)
    , m_min_time{exec_state.get_min_time()}
    , m_skip_time{exec_state.get_skip_time()}
    , m_timeout{exec_state.get_timeout()}
-{ }
+{}

 void measure_cold_base::check()
 {
@@ -62,6 +64,113 @@ void measure_cold_base::check()
  }
 }

+void measure_cold_base::initialize()
+{
+  m_total_cuda_time = 0.;
+  m_total_cpu_time  = 0.;
+  m_cpu_noise       = 0.;
+  m_total_samples   = 0;
+  m_noise_tracker.clear();
+  m_cuda_times.clear();
+  m_cpu_times.clear();
+  m_max_time_exceeded = false;
+}
+
+void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
+
+void measure_cold_base::record_measurements()
+{
+  // Update and record timers and counters:
+  const auto cur_cuda_time = m_cuda_timer.get_duration();
+  const auto cur_cpu_time  = m_cpu_timer.get_duration();
+  m_cuda_times.push_back(cur_cuda_time);
+  m_cpu_times.push_back(cur_cpu_time);
+  m_total_cuda_time += cur_cuda_time;
+  m_total_cpu_time += cur_cpu_time;
+  ++m_total_samples;
+
+  // Compute convergence statistics using CUDA timings:
+  const auto mean_cuda_time = m_total_cuda_time /
+                              static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cuda_stdev =
+    nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
+                                                    m_cuda_times.cend(),
+                                                    mean_cuda_time);
+  auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
+  if (std::isfinite(cuda_rel_stdev))
+  {
+    m_noise_tracker.push_back(cuda_rel_stdev);
+  }
+}
+
+bool measure_cold_base::is_finished()
+{
+  if (m_run_once)
+  {
+    return true;
+  }
+
+  // Check that we've gathered enough samples:
+  if (m_total_cuda_time > m_min_time && m_total_samples > m_min_samples)
+  {
+    // A finite noise value has been recorded and it is below the threshold.
+    if (!m_noise_tracker.empty() && m_noise_tracker.back() < m_max_noise)
+    {
+      return true;
+    }
+
+    // Check if the noise (cuda rel stdev) has converged by inspecting a
+    // trailing window of recorded noise measurements.
+    // This helps identify benchmarks that are inherently noisy and would
+    // never converge to the target stdev threshold. This check ensures that the
+    // benchmark will end if the stdev stabilizes above the target threshold.
+    // Gather some iterations before checking noise, and limit how often we
+    // check this.
+    if (m_noise_tracker.size() > 64 && (m_total_samples % 16 == 0))
+    {
+      // Use the current noise as the stdev reference.
+      const auto current_noise = m_noise_tracker.back();
+      const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
+        m_noise_tracker.cbegin(),
+        m_noise_tracker.cend(),
+        current_noise);
+      const auto noise_rel_stdev = noise_stdev / current_noise;
+
+      // If the rel stdev of the last N cuda noise measurements is less than
+      // 5%, consider the result stable.
+      const auto noise_threshold = 0.05;
+      if (noise_rel_stdev < noise_threshold)
+      {
+        return true;
+      }
+    }
+  }
+
+  // Check for timeouts:
+  m_walltime_timer.stop();
+  if (m_walltime_timer.get_duration() > m_timeout)
+  {
+    m_max_time_exceeded = true;
+    return true;
+  }
+
+  return false;
+}
+
+void measure_cold_base::run_trials_epilogue()
+{
+  // CPU noise is not a convergence criterion; compute it once at the end.
+  const auto cpu_mean = m_total_cpu_time /
+                        static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cpu_stdev =
+    nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
+                                                    m_cpu_times.cend(),
+                                                    cpu_mean);
+  m_cpu_noise = cpu_stdev / cpu_mean;
+
+  m_walltime_timer.stop();
+}
+
 void measure_cold_base::generate_summaries()
 {
  const auto d_samples = static_cast<double>(m_total_samples);
@@ -113,7 +222,10 @@ void measure_cold_base::generate_summaries()
    summ.set_string("description",
                    "Relative standard deviation of the cold GPU execution "
                    "time measurements.");
-    summ.set_float64("value", m_cuda_noise);
+    summ.set_float64("value",
+                     m_noise_tracker.empty()
+                       ? std::numeric_limits<nvbench::float64_t>::infinity()
+                       : m_noise_tracker.back());
  }

  if (const auto items = m_state.get_element_count(); items != 0)
@@ -161,16 +273,16 @@ void measure_cold_base::generate_summaries()

    if (m_max_time_exceeded)
    {
-      const auto timeout = m_timeout_timer.get_duration();
+      const auto timeout = m_walltime_timer.get_duration();

-      if (m_cuda_noise > m_max_noise)
+      if (!m_noise_tracker.empty() && m_noise_tracker.back() > m_max_noise)
      {
        printer.log(nvbench::log_level::warn,
                    fmt::format("Current measurement timed out ({:0.2f}s) "
                                "while over noise threshold ({:0.2f}% > "
                                "{:0.2f}%)",
                                timeout,
-                                m_cuda_noise * 100,
+                                m_noise_tracker.back() * 100,
                                m_max_noise * 100));
      }
      if (m_total_samples < m_min_samples)
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -28,6 +28,7 @@

 #include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
 #include <nvbench/detail/l2flush.cuh>
+#include <nvbench/detail/ring_buffer.cuh>
 #include <nvbench/detail/statistics.cuh>

 #include <cuda_runtime.h>
@@ -58,19 +59,11 @@ protected:
  struct kernel_launch_timer;

  void check();
-
-  void initialize()
-  {
-    m_total_cuda_time = 0.;
-    m_total_cpu_time  = 0.;
-    m_cuda_noise      = 0.;
-    m_cpu_noise       = 0.;
-    m_total_samples   = 0;
-    m_cuda_times.clear();
-    m_cpu_times.clear();
-    m_max_time_exceeded = false;
-  }
-
+  void initialize();
+  void run_trials_prologue();
+  void record_measurements();
+  bool is_finished();
+  void run_trials_epilogue();
  void generate_summaries();

  void check_skip_time(nvbench::float64_t warmup_time);
@@ -86,7 +79,6 @@ protected:
  }

  void block_stream();
-
  __forceinline__ void unblock_stream() { m_blocker.unblock(); }

  nvbench::state &m_state;
@@ -94,7 +86,7 @@ protected:
  nvbench::launch m_launch;
  nvbench::cuda_timer m_cuda_timer;
  nvbench::cpu_timer m_cpu_timer;
-  nvbench::cpu_timer m_timeout_timer;
+  nvbench::cpu_timer m_walltime_timer;
  nvbench::detail::l2flush m_l2flush;
  nvbench::blocking_kernel m_blocker;

@@ -110,8 +102,10 @@ protected:
  nvbench::int64_t m_total_samples{};
  nvbench::float64_t m_total_cuda_time{};
  nvbench::float64_t m_total_cpu_time{};
-  nvbench::float64_t m_cuda_noise{}; // rel stdev
-  nvbench::float64_t m_cpu_noise{};  // rel stdev
+  nvbench::float64_t m_cpu_noise{}; // rel stdev
+
+  // Trailing history of noise measurements for convergence tests
+  nvbench::detail::ring_buffer<nvbench::float64_t> m_noise_tracker{512};

  std::vector<nvbench::float64_t> m_cuda_times;
  std::vector<nvbench::float64_t> m_cpu_times;
@@ -170,7 +164,11 @@ struct measure_cold : public measure_cold_base
    this->check();
    this->initialize();
    this->run_warmup();
+
+    this->run_trials_prologue();
    this->run_trials();
+    this->run_trials_epilogue();
+
    this->generate_summaries();
  }

@@ -192,47 +190,12 @@ private:

  void run_trials()
  {
-    m_timeout_timer.start();
    kernel_launch_timer<use_blocking_kernel> timer(*this);
-
    do
    {
      this->launch_kernel(timer);
-
-      const auto cur_cuda_time = m_cuda_timer.get_duration();
-      const auto cur_cpu_time  = m_cpu_timer.get_duration();
-      m_cuda_times.push_back(cur_cuda_time);
-      m_cpu_times.push_back(cur_cpu_time);
-      m_total_cuda_time += cur_cuda_time;
-      m_total_cpu_time += cur_cpu_time;
-      ++m_total_samples;
-
-      // Only consider the cuda noise in the convergence criteria.
-      m_cuda_noise = nvbench::detail::compute_noise(m_cuda_times,
-                                                    m_total_cuda_time);
-
-      m_timeout_timer.stop();
-      const auto total_time = m_timeout_timer.get_duration();
-
-      if (m_run_once)
-      {
-        break;
-      }
-
-      if (m_total_cuda_time > m_min_time &&  // Min time okay
-          m_total_samples > m_min_samples && // Min samples okay
-          m_cuda_noise < m_max_noise)        // Noise okay
-      {
-        break;
-      }
-
-      if (total_time > m_timeout) // Max time exceeded, stop iterating.
-      {
-        m_max_time_exceeded = true;
-        break;
-      }
-    } while (true);
-    m_cpu_noise = nvbench::detail::compute_noise(m_cpu_times, m_total_cpu_time);
+      this->record_measurements();
+    } while (!this->is_finished());
  }

  template <typename TimerT>
--- a/nvbench/detail/ring_buffer.cuh
+++ b/nvbench/detail/ring_buffer.cuh
@@ -0,0 +1,129 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <nvbench/config.cuh>
+
+#include <nvbench/detail/statistics.cuh>
+
+#include <cassert>
+#include <vector>
+
+namespace nvbench::detail
+{
+
+/**
+ * @brief A simple, dynamically sized ring buffer.
+ */
+template <typename T>
+struct ring_buffer
+{
+  /**
+   * Create a new ring buffer with the requested capacity.
+   */
+  explicit ring_buffer(std::size_t capacity)
+      : m_buffer(capacity)
+  {}
+
+  /**
+   * Iterators provide all values in the ring buffer in unspecified order.
+   * @{
+   */
+  // clang-format off
+  [[nodiscard]] auto begin()        { return m_buffer.begin(); }
+  [[nodiscard]] auto begin() const  { return m_buffer.begin(); }
+  [[nodiscard]] auto cbegin() const { return m_buffer.cbegin(); }
+  [[nodiscard]] auto end()        { return m_buffer.begin()  + this->size(); }
+  [[nodiscard]] auto end() const  { return m_buffer.begin()  + this->size(); }
+  [[nodiscard]] auto cend() const { return m_buffer.cbegin() + this->size(); }
+  // clang-format on
+  /** @} */
+
+  /**
+   * The number of valid values in the ring buffer. Always <= capacity().
+   */
+  [[nodiscard]] std::size_t size() const
+  {
+    return m_full ? m_buffer.size() : m_index;
+  }
+
+  /**
+   * The maximum size of the ring buffer.
+   */
+  [[nodiscard]] std::size_t capacity() const
+  {
+    return m_buffer.size();
+  }
+
+  /**
+   * @return True if the ring buffer is empty.
+   */
+  [[nodiscard]] bool empty() const { return m_index == 0 && !m_full; }
+
+  /**
+   * Remove all values from the buffer without modifying capacity.
+   */
+  void clear()
+  {
+    m_index = 0;
+    m_full  = false;
+  }
+
+  /**
+   * Add a new value to the ring buffer. If size() == capacity(), the oldest
+   * element in the buffer is overwritten.
+   */
+  void push_back(T val)
+  {
+    assert(m_index < m_buffer.size());
+
+    m_buffer[m_index] = val;
+
+    m_index = (m_index + 1) % m_buffer.size();
+    if (m_index == 0)
+    { // buffer wrapped
+      m_full = true;
+    }
+  }
+
+  /**
+   * Get the most recently added value.
+   * @{
+   */
+  [[nodiscard]] auto back() const
+  {
+    assert(!this->empty());
+    const auto back_index = m_index == 0 ? m_buffer.size() - 1 : m_index - 1;
+    return m_buffer[back_index];
+  }
+  [[nodiscard]] auto back()
+  {
+    assert(!this->empty());
+    const auto back_index = m_index == 0 ? m_buffer.size() - 1 : m_index - 1;
+    return m_buffer[back_index];
+  }
+  /**@}*/
+
+private:
+  std::vector<T> m_buffer;
+  std::size_t m_index{0};
+  bool m_full{false};
+};
+
+} // namespace nvbench::detail
--- a/nvbench/detail/statistics.cuh
+++ b/nvbench/detail/statistics.cuh
@@ -18,48 +18,47 @@

 #pragma once

+#include <nvbench/types.cuh>
+
 #include <nvbench/detail/transform_reduce.cuh>

 #include <cmath>
 #include <functional>
+#include <iterator>
 #include <limits>
-#include <numeric>
-#include <vector>
+#include <type_traits>

-namespace nvbench::detail
+namespace nvbench::detail::statistics
 {

 /**
- * Given a vector of samples and the precomputed sum of all samples in the
- * vector, return a measure of the noise in the samples.
+ * Computes and returns the unbiased sample standard deviation.
 *
- * The noise metric is the relative unbiased sample standard deviation
- * (std_dev / mean).
+ * If the input has fewer than 5 samples, infinity is returned.
 */
-inline nvbench::float64_t
-compute_noise(const std::vector<nvbench::float64_t> &data,
-              nvbench::float64_t sum)
+template <typename Iter,
+          typename ValueType = typename std::iterator_traits<Iter>::value_type>
+ValueType standard_deviation(Iter first, Iter last, ValueType mean)
 {
-  const auto num = static_cast<nvbench::float64_t>(data.size());
+  static_assert(std::is_floating_point_v<ValueType>);
+
+  const auto num = last - first;
  if (num < 5) // don't bother with low sample sizes.
  {
-    return std::numeric_limits<nvbench::float64_t>::infinity();
+    return std::numeric_limits<ValueType>::infinity();
  }

-  const auto mean = sum / num;
-  const auto variance =
-    nvbench::detail::transform_reduce(data.cbegin(),
-                                      data.cend(),
-                                      0.,
-                                      std::plus<>{},
-                                      [mean](nvbench::float64_t val) {
-                                        val -= mean;
-                                        val *= val;
-                                        return val;
-                                      }) /
-    (num - 1);
-  const auto abs_stdev = std::sqrt(variance);
-  return abs_stdev / mean;
+  const auto variance = nvbench::detail::transform_reduce(first,
+                                                          last,
+                                                          ValueType{},
+                                                          std::plus<>{},
+                                                          [mean](auto val) {
+                                                            val -= mean;
+                                                            val *= val;
+                                                            return val;
+                                                          }) /
+                        static_cast<ValueType>((num - 1));
+  return std::sqrt(variance);
 }

-} // namespace nvbench::detail
+} // namespace nvbench::detail::statistics
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -10,6 +10,7 @@ set(test_srcs
  named_values.cu
  option_parser.cu
  range.cu
+  ring_buffer.cu
  runner.cu
  state.cu
  state_generator.cu
@@ -36,3 +37,4 @@ foreach(test_src IN LISTS test_srcs)
 endforeach()

 add_subdirectory(cmake)
+add_subdirectory(device)
--- a/testing/ring_buffer.cu
+++ b/testing/ring_buffer.cu
@@ -0,0 +1,90 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/ring_buffer.cuh>
+
+#include "test_asserts.cuh"
+
+#include <algorithm>
+#include <vector>
+
+template <typename T>
+bool equal(const nvbench::detail::ring_buffer<T> &buffer,
+           const std::vector<T> &reference)
+{
+  return std::equal(buffer.cbegin(), buffer.cend(), reference.cbegin());
+}
+
+int main()
+try
+{
+  nvbench::detail::ring_buffer<int> avg(3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT(avg.size() == 0);
+  ASSERT(avg.empty());
+  ASSERT(equal(avg, {0, 0, 0}));
+
+  avg.push_back(32);
+  ASSERT(!avg.empty());
+  ASSERT(avg.size() == 1);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == 32, " (got {})", avg.back());
+  ASSERT(equal(avg, {32, 0, 0}));
+
+  avg.push_back(2);
+  ASSERT(avg.size() == 2);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == 2, " (got {})", avg.back());
+  ASSERT(equal(avg, {32, 2, 0}));
+
+  avg.push_back(-15);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == -15, " (got {})", avg.back());
+  ASSERT(equal(avg, {32, 2, -15}));
+
+  avg.push_back(5);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == 5, " (got {})", avg.back());
+  ASSERT(equal(avg, {5, 2, -15}));
+
+  avg.push_back(0);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT(equal(avg, {5, 0, -15}));
+  ASSERT_MSG(avg.back() == 0, " (got {})", avg.back());
+
+  avg.push_back(128);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT(equal(avg, {5, 0, 128}));
+  ASSERT_MSG(avg.back() == 128, " (got {})", avg.back());
+
+  avg.clear();
+  ASSERT(avg.empty());
+  ASSERT(avg.size() == 0);
+  ASSERT(avg.capacity() == 3);
+
+  return 0;
+}
+catch (std::exception &err)
+{
+  fmt::print(stderr, "{}", err.what());
+  return 1;
+}