Add termination criteria API.

- min_samples
- min_time
- max_noise
- skip_time (not yet implemented)
- timeout

Refactored s/(trials)|(iters)/samples/s.
This commit is contained in:
Allison Vacanti
2021-02-15 11:56:10 -05:00
parent e5914ff620
commit d323f569b8
9 changed files with 258 additions and 84 deletions

View File

@@ -16,8 +16,16 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
auto result = this->do_clone();
// Do not copy states.
result->m_name = m_name;
result->m_axes = m_axes;
result->m_name = m_name;
result->m_axes = m_axes;
result->m_devices = m_devices;
result->m_min_samples = m_min_samples;
result->m_min_time = m_min_time;
result->m_max_noise = m_max_noise;
result->m_skip_time = m_skip_time;
result->m_timeout = m_timeout;
return std::move(result);
}

View File

@@ -113,6 +113,67 @@ struct benchmark_base
void run() { this->do_run(); }
/// Execute at least this many samples per measurement. @{
[[nodiscard]] nvbench::int64_t get_min_samples() const
{
return m_min_samples;
}
benchmark_base &set_min_samples(nvbench::int64_t min_samples)
{
m_min_samples = min_samples;
return *this;
}
/// @}
/// Accumulate at least this many seconds of timing data per measurement. @{
[[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; }
benchmark_base &set_min_time(nvbench::float64_t min_time)
{
m_min_time = min_time;
return *this;
}
/// @}
/// Specify the maximum amount of noise if a measurement supports noise.
/// Noise is the relative standard deviation expressed as a percentage:
/// `noise = 100 * (stdev / mean_time)`. @{
[[nodiscard]] nvbench::float64_t get_max_noise() const { return m_max_noise; }
benchmark_base &set_max_noise(nvbench::float64_t max_noise)
{
m_max_noise = max_noise;
return *this;
}
/// @}
/// If a warmup run finishes in less than `skip_time`, the measurement will
/// be skipped.
/// Extremely fast kernels (< 5000 ns) often time out before they can
/// accumulate `min_time` worth of measurements, and are often uninteresting.
/// Setting this value can help improve performance by skipping time-consuming
/// measurements that don't provide much information.
/// The default value is 0, which disables the feature.
/// @{
[[nodiscard]] nvbench::float64_t get_skip_time() const { return m_skip_time; }
benchmark_base &set_skip_time(nvbench::float64_t skip_time)
{
m_skip_time = skip_time;
return *this;
}
/// @}
/// If a measurement takes more than `timeout` seconds to complete, stop the
/// measurement early. A warning should be printed if this happens.
/// This setting overrides all other termination criteria.
/// Note that this is measured in CPU walltime, not sample time.
/// @{
[[nodiscard]] nvbench::float64_t get_timeout() const { return m_timeout; }
benchmark_base &set_timeout(nvbench::float64_t timeout)
{
m_timeout = timeout;
return *this;
}
/// @}
protected:
template <typename BenchmarkType>
friend struct runner;
@@ -122,6 +183,13 @@ protected:
std::vector<nvbench::device_info> m_devices;
std::vector<nvbench::state> m_states;
nvbench::int64_t m_min_samples{10};
nvbench::float64_t m_min_time{0.5};
nvbench::float64_t m_max_noise{0.5};
nvbench::float64_t m_skip_time{0.};
nvbench::float64_t m_timeout{15.};
private:
// route these through virtuals so the templated subclass can inject type info
virtual std::unique_ptr<benchmark_base> do_clone() const = 0;

View File

@@ -17,6 +17,14 @@ namespace nvbench
namespace detail
{
measure_cold_base::measure_cold_base(state &exec_state)
: m_state{exec_state}
, m_min_samples{exec_state.get_min_samples()}
, m_max_noise{exec_state.get_max_noise()}
, m_min_time{exec_state.get_min_time()}
, m_timeout{exec_state.get_timeout()}
{}
void measure_cold_base::check()
{
const auto device = m_state.get_device();
@@ -38,8 +46,8 @@ void measure_cold_base::check()
void measure_cold_base::generate_summaries()
{
const auto d_iters = static_cast<double>(m_total_iters);
const auto avg_cuda_time = m_total_cuda_time / d_iters;
const auto d_samples = static_cast<double>(m_total_samples);
const auto avg_cuda_time = m_total_cuda_time / d_samples;
{
auto &summ = m_state.add_summary("Average GPU Time (Cold)");
summ.set_string("hint", "duration");
@@ -60,7 +68,7 @@ void measure_cold_base::generate_summaries()
summ.set_float64("value", m_cuda_noise);
}
const auto avg_cpu_time = m_total_cpu_time / d_iters;
const auto avg_cpu_time = m_total_cpu_time / d_samples;
{
auto &summ = m_state.add_summary("Average CPU Time (Cold)");
summ.set_string("hint", "duration");
@@ -82,11 +90,11 @@ void measure_cold_base::generate_summaries()
}
{
auto &summ = m_state.add_summary("Number of Trials (Cold)");
summ.set_string("short_name", "Trials");
auto &summ = m_state.add_summary("Number of Samples (Cold)");
summ.set_string("short_name", "Samples");
summ.set_string("description",
"Number of kernel executions in cold time measurements.");
summ.set_int64("value", m_total_iters);
summ.set_int64("value", m_total_samples);
}
// Log to stdout:
@@ -127,7 +135,7 @@ void measure_cold_base::generate_summaries()
avg_cuda_time * 1e3,
avg_cpu_time * 1e3,
m_total_cuda_time,
m_total_iters);
m_total_samples);
if (m_max_time_exceeded)
{
if (m_cuda_noise > m_max_noise)
@@ -137,12 +145,12 @@ void measure_cold_base::generate_summaries()
m_cuda_noise,
m_max_noise);
}
if (m_total_iters < m_min_iters)
if (m_total_samples < m_min_samples)
{
fmt::print("!!!! Previous benchmark exceeded max time before "
"accumulating min samples ({} < {})\n",
m_total_iters,
m_min_iters);
m_total_samples,
m_min_samples);
}
if (m_total_cuda_time < m_min_time)
{

View File

@@ -26,9 +26,7 @@ namespace detail
// non-templated code goes here:
struct measure_cold_base
{
explicit measure_cold_base(nvbench::state &exec_state)
: m_state(exec_state)
{}
explicit measure_cold_base(nvbench::state &exec_state);
measure_cold_base(const measure_cold_base &) = delete;
measure_cold_base(measure_cold_base &&) = delete;
measure_cold_base &operator=(const measure_cold_base &) = delete;
@@ -43,7 +41,7 @@ protected:
m_total_cpu_time = 0.;
m_cuda_noise = 0.;
m_cpu_noise = 0.;
m_total_iters = 0;
m_total_samples = 0;
m_cuda_times.clear();
m_cpu_times.clear();
m_max_time_exceeded = false;
@@ -59,18 +57,16 @@ protected:
nvbench::cpu_timer m_timeout_timer;
nvbench::detail::l2flush m_l2flush;
nvbench::int64_t m_min_iters{10};
nvbench::int64_t m_total_iters{};
nvbench::float64_t m_max_noise{0.5}; // % rel stdev
nvbench::float64_t m_cuda_noise{}; // % rel stdev
nvbench::float64_t m_cpu_noise{}; // % rel stdev
nvbench::float64_t m_min_time{0.5};
nvbench::float64_t m_max_time{5.0};
nvbench::int64_t m_min_samples{};
nvbench::float64_t m_max_noise{}; // % rel stdev
nvbench::float64_t m_min_time{};
nvbench::float64_t m_timeout{};
nvbench::int64_t m_total_samples{};
nvbench::float64_t m_total_cuda_time{};
nvbench::float64_t m_total_cpu_time{};
nvbench::float64_t m_cuda_noise{}; // % rel stdev
nvbench::float64_t m_cpu_noise{}; // % rel stdev
std::vector<nvbench::float64_t> m_cuda_times;
std::vector<nvbench::float64_t> m_cpu_times;
@@ -128,7 +124,7 @@ private:
m_cpu_times.push_back(cur_cpu_time);
m_total_cuda_time += cur_cuda_time;
m_total_cpu_time += cur_cpu_time;
++m_total_iters;
++m_total_samples;
// Only consider the cuda noise in the convergence criteria.
m_cuda_noise = nvbench::detail::compute_noise(m_cuda_times,
@@ -137,14 +133,14 @@ private:
m_timeout_timer.stop();
const auto total_time = m_timeout_timer.get_duration();
if (m_total_cuda_time > m_min_time && // Min time okay
m_total_iters > m_min_iters && // Min iters okay
m_cuda_noise < m_max_noise) // Noise okay
if (m_total_cuda_time > m_min_time && // Min time okay
m_total_samples > m_min_samples && // Min samples okay
m_cuda_noise < m_max_noise) // Noise okay
{
break;
}
if (total_time > m_max_time) // Max time exceeded, stop iterating.
if (total_time > m_timeout) // Max time exceeded, stop iterating.
{
m_max_time_exceeded = true;
break;

View File

@@ -36,35 +36,35 @@ void measure_hot_base::check()
}
measure_hot_base::measure_hot_base(state &exec_state)
: m_state(exec_state)
: m_state{exec_state}
, m_min_samples{exec_state.get_min_samples()}
, m_min_time{exec_state.get_min_time()}
, m_timeout{exec_state.get_timeout()}
{
// Since cold measures converge to a stable result, increase the min_iters
// Since cold measures converge to a stable result, increase the min_samples
// to match the cold result if available.
try
{
nvbench::int64_t cold_iters =
m_state.get_summary("Number of Trials (Cold)").get_int64("value");
m_min_iters = std::max(m_min_iters, cold_iters);
nvbench::int64_t cold_samples =
m_state.get_summary("Number of Samples (Cold)").get_int64("value");
m_min_samples = std::max(m_min_samples, cold_samples);
}
catch (...)
{
// TODO Need state API
// m_min_iters = state.get_min_trials();
//
// Apply the target_time since we don't have noise convergence estimates
// from the cold executions:
// TODO Need state API. Replace the following line with the commented one
const auto target_time = (m_min_time + m_max_time) / 2.;
// const auto target_time = state.get_target_time();
m_min_time = std::max(m_min_time, target_time);
// If the above threw an exception, we don't have a cold measurement to use.
// Estimate a target_time between m_min_time and m_timeout.
// Use the average of the min_time and timeout, but don't go over 5x
// min_time in case timeout is huge.
// We could expose a `target_time` property on benchmark_base/state if
// needed.
m_min_time = std::min((m_min_time + m_timeout) / 2., m_min_time * 5);
}
}
void measure_hot_base::generate_summaries()
{
const auto d_iters = static_cast<double>(m_total_iters);
const auto avg_cuda_time = m_total_cuda_time / d_iters;
const auto d_samples = static_cast<double>(m_total_samples);
const auto avg_cuda_time = m_total_cuda_time / d_samples;
{
auto &summ = m_state.add_summary("Average GPU Time (Hot)");
summ.set_string("hint", "duration");
@@ -75,7 +75,7 @@ void measure_hot_base::generate_summaries()
summ.set_float64("value", avg_cuda_time);
}
const auto avg_cpu_time = m_total_cpu_time / d_iters;
const auto avg_cpu_time = m_total_cpu_time / d_samples;
{
auto &summ = m_state.add_summary("Average CPU Time (Hot)");
summ.set_string("hide",
@@ -89,11 +89,11 @@ void measure_hot_base::generate_summaries()
}
{
auto &summ = m_state.add_summary("Number of Trials (Hot)");
summ.set_string("short_name", "Trials");
auto &summ = m_state.add_summary("Number of Samples (Hot)");
summ.set_string("short_name", "Samples");
summ.set_string("description",
"Number of kernel executions in hot time measurements.");
summ.set_int64("value", m_total_iters);
summ.set_int64("value", m_total_samples);
}
if (const auto items = m_state.get_items_processed_per_launch(); items != 0)
@@ -171,15 +171,15 @@ void measure_hot_base::generate_summaries()
avg_cuda_time * 1e3,
avg_cpu_time * 1e3,
m_total_cuda_time,
m_total_iters);
m_total_samples);
if (m_max_time_exceeded)
{
if (m_total_iters < m_min_iters)
if (m_total_samples < m_min_samples)
{
fmt::print("!!!! Previous benchmark exceeded max time before "
"accumulating min samples ({} < {})\n",
m_total_iters,
m_min_iters);
m_total_samples,
m_min_samples);
}
if (m_total_cuda_time < m_min_time)
{

View File

@@ -33,7 +33,7 @@ protected:
{
m_total_cpu_time = 0.;
m_total_cuda_time = 0.;
m_total_iters = 0;
m_total_samples = 0;
m_max_time_exceeded = false;
}
@@ -46,12 +46,11 @@ protected:
nvbench::cpu_timer m_cpu_timer;
nvbench::cpu_timer m_timeout_timer;
nvbench::int64_t m_total_iters{};
nvbench::int64_t m_min_iters{10};
nvbench::float64_t m_min_time{0.5};
nvbench::float64_t m_max_time{5.0};
nvbench::int64_t m_min_samples{};
nvbench::float64_t m_min_time{};
nvbench::float64_t m_timeout{};
nvbench::int64_t m_total_samples{};
nvbench::float64_t m_total_cuda_time{};
nvbench::float64_t m_total_cpu_time{};
@@ -102,7 +101,7 @@ private:
// Block stream until some work is queued.
// Limit the number of kernel executions while blocked to prevent
// deadlocks. See warnings on blocking_kernel.
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
const auto unblocked_launches = batch_size - blocked_launches;
blocker.block(m_launch.get_stream());
@@ -129,22 +128,22 @@ private:
m_total_cpu_time += m_cpu_timer.get_duration();
m_total_cuda_time += m_cuda_timer.get_duration();
m_total_iters += batch_size;
m_total_samples += batch_size;
// Predict number of remaining iterations:
batch_size = (m_min_time - m_total_cuda_time) /
(m_total_cuda_time / m_total_iters);
(m_total_cuda_time / m_total_samples);
m_timeout_timer.stop();
const auto total_time = m_timeout_timer.get_duration();
if (m_total_cuda_time > m_min_time && // min time okay
m_total_iters > m_min_iters) // min iters okay
m_total_samples > m_min_samples) // min samples okay
{
break; // Stop iterating
}
if (m_total_cuda_time > m_max_time)
if (m_total_cuda_time > m_timeout)
{
m_max_time_exceeded = true;
break;

View File

@@ -1,5 +1,6 @@
#include <nvbench/state.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/types.cuh>
#include <fmt/format.h>
@@ -11,6 +12,30 @@
namespace nvbench
{
state::state(const benchmark_base &bench)
: m_benchmark{bench}
, m_min_samples{bench.get_min_samples()}
, m_min_time{bench.get_min_time()}
, m_max_noise{bench.get_max_noise()}
, m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()}
{}
state::state(const benchmark_base &bench,
nvbench::named_values values,
std::optional<nvbench::device_info> device,
std::size_t type_config_index)
: m_benchmark{bench}
, m_axis_values{std::move(values)}
, m_device{std::move(device)}
, m_type_config_index{type_config_index}
, m_min_samples{bench.get_min_samples()}
, m_min_time{bench.get_min_time()}
, m_max_noise{bench.get_max_noise()}
, m_skip_time{bench.get_skip_time()}
, m_timeout{bench.get_timeout()}
{}
nvbench::int64_t state::get_int64(const std::string &axis_name) const
{
return m_axis_values.get_int64(axis_name);

View File

@@ -90,6 +90,50 @@ struct state
return m_skip_reason;
}
/// Execute at least this many samples per measurement. @{
[[nodiscard]] nvbench::int64_t get_min_samples() const
{
return m_min_samples;
}
void set_min_samples(nvbench::int64_t min_samples)
{
m_min_samples = min_samples;
}
/// @}
/// Accumulate at least this many seconds of timing data per measurement. @{
[[nodiscard]] nvbench::float64_t get_min_time() const { return m_min_time; }
void set_min_time(nvbench::float64_t min_time) { m_min_time = min_time; }
/// @}
/// Specify the maximum amount of noise if a measurement supports noise.
/// Noise is the relative standard deviation expressed as a percentage:
/// `noise = 100 * (stdev / mean_time)`. @{
[[nodiscard]] nvbench::float64_t get_max_noise() const { return m_max_noise; }
void set_max_noise(nvbench::float64_t max_noise) { m_max_noise = max_noise; }
/// @}
/// If a warmup run finishes in less than `skip_time`, the measurement will
/// be skipped.
/// Extremely fast kernels (< 5000 ns) often time out before they can
/// accumulate `min_time` worth of measurements, and are often uninteresting.
/// Setting this value can help improve performance by skipping time-consuming
/// measurements that don't provide much information.
/// The default value is 0, which disables the feature.
/// @{
[[nodiscard]] nvbench::float64_t get_skip_time() const { return m_skip_time; }
void set_skip_time(nvbench::float64_t skip_time) { m_skip_time = skip_time; }
/// @}
/// If a measurement takes more than `timeout` seconds to complete, stop the
/// measurement early. A warning should be printed if this happens.
/// This setting overrides all other termination criteria.
/// Note that this is measured in CPU walltime, not sample time.
/// @{
[[nodiscard]] nvbench::float64_t get_timeout() const { return m_timeout; }
void set_timeout(nvbench::float64_t timeout) { m_timeout = timeout; }
/// @}
[[nodiscard]] const named_values &get_axis_values() const
{
return m_axis_values;
@@ -111,25 +155,25 @@ private:
friend struct nvbench::detail::state_generator;
friend struct nvbench::detail::state_tester;
explicit state(const benchmark_base &bench)
: m_benchmark{bench}
{}
explicit state(const benchmark_base &bench);
state(const benchmark_base &bench,
nvbench::named_values values,
std::optional<nvbench::device_info> device,
std::size_t type_config_index)
: m_benchmark{bench}
, m_axis_values{std::move(values)}
, m_device{std::move(device)}
, m_type_config_index{type_config_index}
{}
std::size_t type_config_index);
std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
nvbench::named_values m_axis_values;
std::optional<nvbench::device_info> m_device;
std::size_t m_type_config_index{};
nvbench::int64_t m_min_samples;
nvbench::float64_t m_min_time;
nvbench::float64_t m_max_noise;
nvbench::float64_t m_skip_time;
nvbench::float64_t m_timeout;
std::vector<nvbench::summary> m_summaries;
std::string m_skip_reason;
nvbench::int64_t m_items_processed_per_launch{};

View File

@@ -717,16 +717,10 @@ void test_devices()
ASSERT(states.size() == 12);
fmt::memory_buffer buffer;
const std::string table_format =
"| {:^5} | {:^6} | {:^5} | {:^3} |\n";
const std::string table_format = "| {:^5} | {:^6} | {:^5} | {:^3} |\n";
fmt::format_to(buffer, "\n");
fmt::format_to(buffer,
table_format,
"State",
"Device",
"S",
"I");
fmt::format_to(buffer, table_format, "State", "Device", "S", "I");
std::size_t config = 0;
for (const auto &state : states)
@@ -760,6 +754,36 @@ void test_devices()
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
void test_termination_criteria()
{
const nvbench::int64_t min_samples = 1000;
const nvbench::float64_t min_time = 2000;
const nvbench::float64_t max_noise = 3000;
const nvbench::float64_t skip_time = 4000;
const nvbench::float64_t timeout = 5000;
// for comparing floats
auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; };
dummy_bench bench;
bench.set_devices(std::vector<int>{});
bench.set_min_samples(min_samples);
bench.set_min_time(min_time);
bench.set_max_noise(max_noise);
bench.set_skip_time(skip_time);
bench.set_timeout(timeout);
const std::vector<nvbench::state> states =
nvbench::detail::state_generator::create(bench);
ASSERT(states.size() == 1);
ASSERT(min_samples == states[0].get_min_samples());
ASSERT(within_one(min_time, states[0].get_min_time()));
ASSERT(within_one(max_noise, states[0].get_max_noise()));
ASSERT(within_one(skip_time, states[0].get_skip_time()));
ASSERT(within_one(timeout, states[0].get_timeout()));
}
int main()
try
{
@@ -770,9 +794,11 @@ try
test_create_with_types();
test_create_with_masked_types();
test_devices();
test_termination_criteria();
return 0;
}
catch (std::exception& e)
catch (std::exception &e)
{
fmt::print("{}\n", e.what());
return 1;