Merge pull request #64 from allisonvacanti/noise_convergence

New convergence check
2026-05-26 07:59:55 +00:00 · 2021-12-21 21:30:39 -05:00
parent e70c31d7e1 178dd0eb68
commit 2f8bb28c52
12 changed files with 555 additions and 91 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,10 @@ option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON)
 option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ${cupti_default})

 option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
+option(NVBench_ENABLE_DEVICE_TESTING
+  "Include tests that require a GPU (with locked clocks)."
+  OFF
+)
 option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)

 include(cmake/NVBenchConfigTarget.cmake)
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -19,12 +19,14 @@
 #include <nvbench/detail/measure_cold.cuh>

 #include <nvbench/benchmark_base.cuh>
-#include <nvbench/detail/throw.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/printer_base.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>

+#include <nvbench/detail/ring_buffer.cuh>
+#include <nvbench/detail/throw.cuh>
+
 #include <fmt/format.h>

 #include <algorithm>
@@ -43,7 +45,7 @@ measure_cold_base::measure_cold_base(state &exec_state)
    , m_min_time{exec_state.get_min_time()}
    , m_skip_time{exec_state.get_skip_time()}
    , m_timeout{exec_state.get_timeout()}
-{ }
+{}

 void measure_cold_base::check()
 {
@@ -62,6 +64,113 @@ void measure_cold_base::check()
  }
 }

+void measure_cold_base::initialize()
+{
+  m_total_cuda_time = 0.;
+  m_total_cpu_time  = 0.;
+  m_cpu_noise       = 0.;
+  m_total_samples   = 0;
+  m_noise_tracker.clear();
+  m_cuda_times.clear();
+  m_cpu_times.clear();
+  m_max_time_exceeded = false;
+}
+
+void measure_cold_base::run_trials_prologue() { m_walltime_timer.start(); }
+
+void measure_cold_base::record_measurements()
+{
+  // Update and record timers and counters:
+  const auto cur_cuda_time = m_cuda_timer.get_duration();
+  const auto cur_cpu_time  = m_cpu_timer.get_duration();
+  m_cuda_times.push_back(cur_cuda_time);
+  m_cpu_times.push_back(cur_cpu_time);
+  m_total_cuda_time += cur_cuda_time;
+  m_total_cpu_time += cur_cpu_time;
+  ++m_total_samples;
+
+  // Compute convergence statistics using CUDA timings:
+  const auto mean_cuda_time = m_total_cuda_time /
+                              static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cuda_stdev =
+    nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
+                                                    m_cuda_times.cend(),
+                                                    mean_cuda_time);
+  auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
+  if (std::isfinite(cuda_rel_stdev))
+  {
+    m_noise_tracker.push_back(cuda_rel_stdev);
+  }
+}
+
+bool measure_cold_base::is_finished()
+{
+  if (m_run_once)
+  {
+    return true;
+  }
+
+  // Check that we've gathered enough samples. The noise tracker stays empty
+  // until a finite noise value is recorded, and back() requires non-empty.
+  if (m_total_cuda_time > m_min_time && m_total_samples > m_min_samples &&
+      !m_noise_tracker.empty())
+  {
+    // Noise has dropped below threshold
+    if (m_noise_tracker.back() < m_max_noise)
+    {
+      return true;
+    }
+
+    // Check if the noise (cuda rel stdev) has converged by inspecting a
+    // trailing window of recorded noise measurements. This ends benchmarks
+    // that are inherently noisy and whose stdev stabilizes above the target
+    // threshold. Gather some iterations before checking noise, and limit how
+    // often we check this.
+    if (m_noise_tracker.size() > 64 && (m_total_samples % 16 == 0))
+    {
+      // Use the current noise as the stdev reference.
+      const auto current_noise = m_noise_tracker.back();
+      const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
+        m_noise_tracker.cbegin(),
+        m_noise_tracker.cend(),
+        current_noise);
+      const auto noise_rel_stdev = noise_stdev / current_noise;
+
+      // If the rel stdev of the last N cuda noise measurements is less than
+      // 5%, consider the result stable.
+      const auto noise_threshold = 0.05;
+      if (noise_rel_stdev < noise_threshold)
+      {
+        return true;
+      }
+    }
+  }
+
+  // Check for timeouts:
+  m_walltime_timer.stop();
+  if (m_walltime_timer.get_duration() > m_timeout)
+  {
+    m_max_time_exceeded = true;
+    return true;
+  }
+
+  return false;
+}
+
+void measure_cold_base::run_trials_epilogue()
+{
+  // CPU noise (relative stdev) is only reported, never used for convergence,
+  // so compute it once here from the CPU samples instead of per iteration.
+  const auto cpu_mean = m_total_cpu_time /
+                        static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cpu_stdev =
+    nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
+                                                    m_cpu_times.cend(),
+                                                    cpu_mean);
+  m_cpu_noise = cpu_stdev / cpu_mean;
+  m_walltime_timer.stop();
+}
+
 void measure_cold_base::generate_summaries()
 {
  const auto d_samples = static_cast<double>(m_total_samples);
@@ -113,7 +222,10 @@ void measure_cold_base::generate_summaries()
    summ.set_string("description",
                    "Relative standard deviation of the cold GPU execution "
                    "time measurements.");
-    summ.set_float64("value", m_cuda_noise);
+    summ.set_float64("value",
+                     m_noise_tracker.empty()
+                       ? std::numeric_limits<nvbench::float64_t>::infinity()
+                       : m_noise_tracker.back());
  }

  if (const auto items = m_state.get_element_count(); items != 0)
@@ -161,16 +273,16 @@ void measure_cold_base::generate_summaries()

    if (m_max_time_exceeded)
    {
-      const auto timeout = m_timeout_timer.get_duration();
+      const auto timeout = m_walltime_timer.get_duration();

-      if (m_cuda_noise > m_max_noise)
+      if (!m_noise_tracker.empty() && m_noise_tracker.back() > m_max_noise)
      {
        printer.log(nvbench::log_level::warn,
                    fmt::format("Current measurement timed out ({:0.2f}s) "
                                "while over noise threshold ({:0.2f}% > "
                                "{:0.2f}%)",
                                timeout,
-                                m_cuda_noise * 100,
+                                m_noise_tracker.back() * 100,
                                m_max_noise * 100));
      }
      if (m_total_samples < m_min_samples)
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -28,6 +28,7 @@

 #include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
 #include <nvbench/detail/l2flush.cuh>
+#include <nvbench/detail/ring_buffer.cuh>
 #include <nvbench/detail/statistics.cuh>

 #include <cuda_runtime.h>
@@ -58,19 +59,11 @@ protected:
  struct kernel_launch_timer;

  void check();
-
-  void initialize()
-  {
-    m_total_cuda_time = 0.;
-    m_total_cpu_time  = 0.;
-    m_cuda_noise      = 0.;
-    m_cpu_noise       = 0.;
-    m_total_samples   = 0;
-    m_cuda_times.clear();
-    m_cpu_times.clear();
-    m_max_time_exceeded = false;
-  }
-
+  void initialize();
+  void run_trials_prologue();
+  void record_measurements();
+  bool is_finished();
+  void run_trials_epilogue();
  void generate_summaries();

  void check_skip_time(nvbench::float64_t warmup_time);
@@ -86,7 +79,6 @@ protected:
  }

  void block_stream();
-
  __forceinline__ void unblock_stream() { m_blocker.unblock(); }

  nvbench::state &m_state;
@@ -94,7 +86,7 @@ protected:
  nvbench::launch m_launch;
  nvbench::cuda_timer m_cuda_timer;
  nvbench::cpu_timer m_cpu_timer;
-  nvbench::cpu_timer m_timeout_timer;
+  nvbench::cpu_timer m_walltime_timer;
  nvbench::detail::l2flush m_l2flush;
  nvbench::blocking_kernel m_blocker;

@@ -110,8 +102,10 @@ protected:
  nvbench::int64_t m_total_samples{};
  nvbench::float64_t m_total_cuda_time{};
  nvbench::float64_t m_total_cpu_time{};
-  nvbench::float64_t m_cuda_noise{}; // rel stdev
-  nvbench::float64_t m_cpu_noise{};  // rel stdev
+  nvbench::float64_t m_cpu_noise{}; // rel stdev
+
+  // Trailing history of noise measurements for convergence tests
+  nvbench::detail::ring_buffer<nvbench::float64_t> m_noise_tracker{512};

  std::vector<nvbench::float64_t> m_cuda_times;
  std::vector<nvbench::float64_t> m_cpu_times;
@@ -170,7 +164,11 @@ struct measure_cold : public measure_cold_base
    this->check();
    this->initialize();
    this->run_warmup();
+
+    this->run_trials_prologue();
    this->run_trials();
+    this->run_trials_epilogue();
+
    this->generate_summaries();
  }

@@ -192,47 +190,12 @@ private:

  void run_trials()
  {
-    m_timeout_timer.start();
    kernel_launch_timer<use_blocking_kernel> timer(*this);
-
    do
    {
      this->launch_kernel(timer);
-
-      const auto cur_cuda_time = m_cuda_timer.get_duration();
-      const auto cur_cpu_time  = m_cpu_timer.get_duration();
-      m_cuda_times.push_back(cur_cuda_time);
-      m_cpu_times.push_back(cur_cpu_time);
-      m_total_cuda_time += cur_cuda_time;
-      m_total_cpu_time += cur_cpu_time;
-      ++m_total_samples;
-
-      // Only consider the cuda noise in the convergence criteria.
-      m_cuda_noise = nvbench::detail::compute_noise(m_cuda_times,
-                                                    m_total_cuda_time);
-
-      m_timeout_timer.stop();
-      const auto total_time = m_timeout_timer.get_duration();
-
-      if (m_run_once)
-      {
-        break;
-      }
-
-      if (m_total_cuda_time > m_min_time &&  // Min time okay
-          m_total_samples > m_min_samples && // Min samples okay
-          m_cuda_noise < m_max_noise)        // Noise okay
-      {
-        break;
-      }
-
-      if (total_time > m_timeout) // Max time exceeded, stop iterating.
-      {
-        m_max_time_exceeded = true;
-        break;
-      }
-    } while (true);
-    m_cpu_noise = nvbench::detail::compute_noise(m_cpu_times, m_total_cpu_time);
+      this->record_measurements();
+    } while (!this->is_finished());
  }

  template <typename TimerT>
--- a/nvbench/detail/ring_buffer.cuh
+++ b/nvbench/detail/ring_buffer.cuh
@@ -0,0 +1,129 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <nvbench/config.cuh>
+
+#include <nvbench/detail/statistics.cuh>
+
+#include <cassert>
+#include <vector>
+
+namespace nvbench::detail
+{
+
+/**
+ * @brief A simple, dynamically sized ring buffer.
+ */
+template <typename T>
+struct ring_buffer
+{
+  /**
+   * Create a new ring buffer with the requested capacity.
+   */
+  explicit ring_buffer(std::size_t capacity)
+      : m_buffer(capacity)
+  {}
+
+  /**
+   * Iterators provide all values in the ring buffer in unspecified order.
+   * @{
+   */
+  // clang-format off
+  [[nodiscard]] auto begin()        { return m_buffer.begin(); }
+  [[nodiscard]] auto begin() const  { return m_buffer.begin(); }
+  [[nodiscard]] auto cbegin() const { return m_buffer.cbegin(); }
+  [[nodiscard]] auto end()        { return m_buffer.begin()  + this->size(); }
+  [[nodiscard]] auto end() const  { return m_buffer.begin()  + this->size(); }
+  [[nodiscard]] auto cend() const { return m_buffer.cbegin() + this->size(); }
+  // clang-format on
+  /** @} */
+
+  /**
+   * The number of valid values in the ring buffer. Always <= capacity().
+   */
+  [[nodiscard]] std::size_t size() const
+  {
+    return m_full ? m_buffer.size() : m_index;
+  }
+
+  /**
+   * The maximum size of the ring buffer.
+   */
+  [[nodiscard]] std::size_t capacity() const
+  {
+    return m_buffer.size();
+  }
+
+  /**
+   * @return True if the ring buffer is empty.
+   */
+  [[nodiscard]] bool empty() const { return m_index == 0 && !m_full; }
+
+  /**
+   * Remove all values from the buffer without modifying capacity.
+   */
+  void clear()
+  {
+    m_index = 0;
+    m_full  = false;
+  }
+
+  /**
+   * Add a new value to the ring buffer. If size() == capacity(), the oldest
+   * element in the buffer is overwritten.
+   */
+  void push_back(T val)
+  {
+    assert(m_index < m_buffer.size());
+
+    m_buffer[m_index] = val;
+
+    m_index = (m_index + 1) % m_buffer.size();
+    if (m_index == 0)
+    { // buffer wrapped
+      m_full = true;
+    }
+  }
+
+  /**
+   * Get the most recently added value.
+   * @{
+   */
+  [[nodiscard]] auto back() const
+  {
+    assert(!this->empty());
+    const auto back_index = m_index == 0 ? m_buffer.size() - 1 : m_index - 1;
+    return m_buffer[back_index];
+  }
+  [[nodiscard]] auto back()
+  {
+    assert(!this->empty());
+    const auto back_index = m_index == 0 ? m_buffer.size() - 1 : m_index - 1;
+    return m_buffer[back_index];
+  }
+  /**@}*/
+
+private:
+  std::vector<T> m_buffer;
+  std::size_t m_index{0};
+  bool m_full{false};
+};
+
+} // namespace nvbench::detail
--- a/nvbench/detail/statistics.cuh
+++ b/nvbench/detail/statistics.cuh
@@ -18,48 +18,47 @@

 #pragma once

+#include <nvbench/types.cuh>
+
 #include <nvbench/detail/transform_reduce.cuh>

 #include <cmath>
 #include <functional>
+#include <iterator>
 #include <limits>
-#include <numeric>
-#include <vector>
+#include <type_traits>

-namespace nvbench::detail
+namespace nvbench::detail::statistics
 {

 /**
- * Given a vector of samples and the precomputed sum of all samples in the
- * vector, return a measure of the noise in the samples.
+ * Computes and returns the unbiased sample standard deviation.
 *
- * The noise metric is the relative unbiased sample standard deviation
- * (std_dev / mean).
+ * If the input has fewer than 5 samples, infinity is returned.
 */
-inline nvbench::float64_t
-compute_noise(const std::vector<nvbench::float64_t> &data,
-              nvbench::float64_t sum)
+template <typename Iter,
+          typename ValueType = typename std::iterator_traits<Iter>::value_type>
+ValueType standard_deviation(Iter first, Iter last, ValueType mean)
 {
-  const auto num = static_cast<nvbench::float64_t>(data.size());
+  static_assert(std::is_floating_point_v<ValueType>);
+
+  const auto num = last - first;
  if (num < 5) // don't bother with low sample sizes.
  {
-    return std::numeric_limits<nvbench::float64_t>::infinity();
+    return std::numeric_limits<ValueType>::infinity();
  }

-  const auto mean = sum / num;
-  const auto variance =
-    nvbench::detail::transform_reduce(data.cbegin(),
-                                      data.cend(),
-                                      0.,
-                                      std::plus<>{},
-                                      [mean](nvbench::float64_t val) {
-                                        val -= mean;
-                                        val *= val;
-                                        return val;
-                                      }) /
-    (num - 1);
-  const auto abs_stdev = std::sqrt(variance);
-  return abs_stdev / mean;
+  const auto variance = nvbench::detail::transform_reduce(first,
+                                                          last,
+                                                          ValueType{},
+                                                          std::plus<>{},
+                                                          [mean](auto val) {
+                                                            val -= mean;
+                                                            val *= val;
+                                                            return val;
+                                                          }) /
+                        static_cast<ValueType>((num - 1));
+  return std::sqrt(variance);
 }

-} // namespace nvbench::detail
+} // namespace nvbench::detail::statistics
--- a/nvbench/detail/transform_reduce.cuh
+++ b/nvbench/detail/transform_reduce.cuh
@@ -18,6 +18,8 @@

 #pragma once

+#include <utility>
+
 // Many compilers still don't ship transform_reduce with their STLs, so here's
 // a naive implementation that will work everywhere. This is never used in a
 // critical section, so perf isn't a concern.
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -128,7 +128,7 @@ const summary &state::get_summary(std::string_view name) const
                 [&name](const auto &s) { return s.get_name() == name; });
  if (iter == m_summaries.cend())
  {
-    NVBENCH_THROW(std::runtime_error, "No summary named '{}'.", name);
+    NVBENCH_THROW(std::invalid_argument, "No summary named '{}'.", name);
  }
  return *iter;
 }
@@ -140,7 +140,7 @@ summary &state::get_summary(std::string_view name)
                           [&name](auto &s) { return s.get_name() == name; });
  if (iter == m_summaries.end())
  {
-    NVBENCH_THROW(std::runtime_error, "No summary named '{}'.", name);
+    NVBENCH_THROW(std::invalid_argument, "No summary named '{}'.", name);
  }
  return *iter;
 }
--- a/scripts/nvbench_compare.py
+++ b/scripts/nvbench_compare.py
@@ -262,10 +262,13 @@ def main():
    to_compare = []
    if os.path.isdir(files_or_dirs[0]) and os.path.isdir(files_or_dirs[1]):
        for f in os.listdir(files_or_dirs[1]):
+            if os.path.splitext(f)[1] != ".json":
+                continue
            r = os.path.join(files_or_dirs[0], f)
            c = os.path.join(files_or_dirs[1], f)
-            if os.path.isfile(r) and os.path.isfile(c):
-                to_compare.append( (r,c) )
+            if os.path.isfile(r) and os.path.isfile(c) and \
+               os.path.getsize(r) > 0 and os.path.getsize(c) > 0:
+                to_compare.append((r, c))
    else:
        to_compare = [(files_or_dirs[0],files_or_dirs[1])]

--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -10,6 +10,7 @@ set(test_srcs
  named_values.cu
  option_parser.cu
  range.cu
+  ring_buffer.cu
  runner.cu
  state.cu
  state_generator.cu
@@ -36,3 +37,4 @@ foreach(test_src IN LISTS test_srcs)
 endforeach()

 add_subdirectory(cmake)
+add_subdirectory(device)
--- a/testing/device/CMakeLists.txt
+++ b/testing/device/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Test that we're converging to an accurate mean + stdev without timing out:
+set(test_name nvbench.test.device.noisy_bench)
+add_executable(${test_name} noisy_bench.cu)
+target_link_libraries(${test_name} PRIVATE nvbench::main fmt)
+nvbench_config_target(${test_name})
+add_dependencies(nvbench.test.all ${test_name})
+
+if (NVBench_ENABLE_DEVICE_TESTING)
+  add_test(NAME ${test_name} COMMAND "$<TARGET_FILE:${test_name}>")
+  set_tests_properties(${test_name} PROPERTIES
+    # Any timeouts/warnings are hard failures for this test.
+    FAIL_REGULAR_EXPRESSION "Warn;timed out"
+  )
+endif()
--- a/testing/device/noisy_bench.cu
+++ b/testing/device/noisy_bench.cu
@@ -0,0 +1,146 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+#include <nvbench/test_kernels.cuh>
+
+#include <fmt/format.h>
+
+#include <algorithm>
+#include <limits>
+#include <random>
+#include <stdexcept>
+
+void noisy_bench(nvbench::state &state)
+{
+  // time, convert ms -> s
+  const auto mean = static_cast<nvbench::float32_t>(state.get_float64("Mean")) /
+                    1000.f;
+  // rel stdev
+  const auto noise_pct =
+    static_cast<nvbench::float32_t>(state.get_float64("Noise"));
+  const auto noise = noise_pct / 100.f;
+  // abs stdev
+  const auto stdev = noise * mean;
+
+  std::minstd_rand rng{};
+  std::normal_distribution<nvbench::float32_t> dist(mean, stdev);
+
+  // cold tag will save time by disabling batch measurements
+  state.exec(nvbench::exec_tag::impl::cold, [&](nvbench::launch &launch) {
+    const auto seconds = dist(rng);
+    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(seconds);
+  });
+
+  const auto measured_mean = static_cast<nvbench::float32_t>(
+    state.get_summary("Average GPU Time (Cold)").get_float64("value"));
+  const auto measured_noise = [&]() {
+    try
+    {
+      return static_cast<nvbench::float32_t>(
+        state.get_summary("GPU Relative Standard Deviation (Cold)")
+          .get_float64("value"));
+    }
+    catch (std::invalid_argument &)
+    {
+      return std::numeric_limits<nvbench::float32_t>::infinity();
+    }
+  }();
+  const auto measured_stdev = measured_noise * measured_mean;
+
+  const auto mean_error  = std::fabs(measured_mean - mean);
+  const auto stdev_error = std::fabs(measured_stdev - stdev);
+  const auto noise_error = std::fabs(measured_noise - noise);
+
+  const auto mean_threshold  = std::max(0.025f * mean, 8e-6f); // 2.5% or 8us
+  const auto stdev_threshold = std::max(0.05f * stdev, 5e-6f); // 5% or 5us
+
+  const auto mean_pass  = mean_error < mean_threshold;
+  const auto stdev_pass = stdev_error < stdev_threshold;
+
+  fmt::print("| {:^5} "
+             "| {:^12} | {:^12} "
+             "| {:^12} | {:^12} | {:^4} |\n",
+             "",
+             "Expected",
+             "Measured",
+             "Error",
+             "Threshold",
+             "Flag");
+  fmt::print("|{:-^7}"
+             "|{:-^14}|{:-^14}"
+             "|{:-^14}|{:-^14}|{:-^6}|\n",
+             "",
+             "",
+             "",
+             "",
+             "",
+             "");
+  fmt::print("| Mean  "
+             "| {:>9.6f} ms | {:>9.6f} ms "
+             "| {:>9.6f} ms | {:>9.6f} ms | {:4} |\n"
+             "| Stdev "
+             "| {:>9.6f} ms | {:>9.6f} ms "
+             "| {:>9.6f} ms | {:>9.6f} ms | {:4} |\n"
+             "| Noise "
+             "| {:>9.6f}%   | {:>9.6f}%   "
+             "| {:>9.6f}%   | {:5}        | {:4} |\n",
+             mean * 1000,
+             measured_mean * 1000,
+             mean_error * 1000,
+             mean_threshold * 1000,
+             mean_pass ? "" : "!!!!",
+
+             stdev * 1000,
+             measured_stdev * 1000,
+             stdev_error * 1000,
+             stdev_threshold * 1000,
+             stdev_pass ? "" : "!!!!",
+
+             noise * 100,
+             measured_noise * 100,
+             noise_error * 100,
+             "",
+             "");
+
+  if (!mean_pass)
+  {
+    // This isn't an nvbench log message; the "Warn" prefix matches the ctest
+    // FAIL_REGULAR_EXPRESSION set for this test, so printing it fails the test.
+    fmt::print("Warn: Mean error exceeds threshold: ({:.3} ms > {:.3} ms)\n",
+               mean_error * 1000,
+               mean_threshold * 1000);
+  }
+
+  if (!stdev_pass)
+  {
+    // This isn't an nvbench log message; the "Warn" prefix matches the ctest
+    // FAIL_REGULAR_EXPRESSION set for this test, so printing it fails the test.
+    fmt::print("Warn: Stdev error exceeds threshold: "
+               "({:.6} ms > {:.6} ms, noise: {:.3}%)\n",
+               stdev_error * 1000,
+               stdev_threshold * 1000,
+               measured_noise * 100);
+  }
+}
+NVBENCH_BENCH(noisy_bench)
+  .add_float64_axis("Mean", {0.05, 0.1, 0.5, 1.0, 10.0}) // ms
+  .add_float64_axis("Noise", {0.1, 5., 25.})             // %
+  // disable this; we want to test that the benchmarking loop will still exit
+  // when max_noise is never reached:
+  .set_max_noise(0.0000001);
--- a/testing/ring_buffer.cu
+++ b/testing/ring_buffer.cu
@@ -0,0 +1,90 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/detail/ring_buffer.cuh>
+
+#include "test_asserts.cuh"
+
+#include <algorithm>
+#include <vector>
+
+template <typename T>
+bool equal(const nvbench::detail::ring_buffer<T> &buffer,
+           const std::vector<T> &reference)
+{
+  return std::equal(buffer.cbegin(), buffer.cend(), reference.cbegin());
+}
+
+int main()
+try
+{
+  nvbench::detail::ring_buffer<int> avg(3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT(avg.size() == 0);
+  ASSERT(avg.empty());
+  ASSERT(equal(avg, {0, 0, 0}));
+
+  avg.push_back(32);
+  ASSERT(!avg.empty());
+  ASSERT(avg.size() == 1);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == 32, " (got {})", avg.back());
+  ASSERT(equal(avg, {32, 0, 0}));
+
+  avg.push_back(2);
+  ASSERT(avg.size() == 2);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == 2, " (got {})", avg.back());
+  ASSERT(equal(avg, {32, 2, 0}));
+
+  avg.push_back(-15);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == -15, " (got {})", avg.back());
+  ASSERT(equal(avg, {32, 2, -15}));
+
+  avg.push_back(5);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT_MSG(avg.back() == 5, " (got {})", avg.back());
+  ASSERT(equal(avg, {5, 2, -15}));
+
+  avg.push_back(0);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT(equal(avg, {5, 0, -15}));
+  ASSERT_MSG(avg.back() == 0, " (got {})", avg.back());
+
+  avg.push_back(128);
+  ASSERT(avg.size() == 3);
+  ASSERT(avg.capacity() == 3);
+  ASSERT(equal(avg, {5, 0, 128}));
+  ASSERT_MSG(avg.back() == 128, " (got {})", avg.back());
+
+  avg.clear();
+  ASSERT(avg.empty());
+  ASSERT(avg.size() == 0);
+  ASSERT(avg.capacity() == 3);
+
+  return 0;
+}
+catch (std::exception &err)
+{
+  fmt::print(stderr, "{}", err.what());
+  return 1;
+}