Implement CLI option to control warmups for cold measurements (#339)

* Implement warmup-runs count, supported as CLI option. The --warmup-runs CLI option is implemented and documented. The warm-up count is enforced to always be positive. This is necessary to ensure that JIT-ting has occurred, and that use of the blocking kernel would not result in time-outs. A test for the option parser is added. * Ensure that measure_cold::run_warmup instantiates the blocking kernel. Because warm-up runs are executed without use of the blocking kernel, the blocking kernel was not jitted until actual measurements were collected. The module loading cost incurred during the first run shows up as an elevated CPU time noise value for the first measurement, as noted in https://github.com/NVIDIA/nvbench/pull/339. This PR adds `this->block_stream(); this->unblock_stream();` prior to executing the warm-up loop with use of the blocking kernel disabled. This ensures that the blocking kernel is instantiated during the warm-up, but no other kernel is launched between its launch and the stream sync, thus avoiding deadlock. * Rename --warmup-runs to --cold-warmup-runs, add --cold-max-warmup-walltime. Since the configurable number of warmups only applies to measure_cold.cuh, rename the CLI option to reflect that. Also add --cold-max-warmup-walltime (defaults to -1, i.e. disabled). If enabled, the warmup loop exits before the requested count is reached if the wall-time expended executing warmups exceeds this max-warmup-walltime value.
2026-05-23 06:14:49 +00:00 · 2026-05-12 14:30:08 -05:00
parent ebf9f9a087
commit 9ea77bccaa
11 changed files with 164 additions and 16 deletions
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -93,6 +93,21 @@
  * Applies to the most recent `--benchmark`, or all benchmarks if specified
    before any `--benchmark` arguments.

+* `--cold-warmup-runs <count>`
+  * Execute up to `<count>` warmup runs before collecting cold measurement samples.
+  * The minimum is 1 warmup run.
+  * Default is 1 warmup run.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--cold-max-warmup-walltime <seconds>`
+  * Stop cold warmup after the total warmup walltime exceeds `<seconds>`.
+  * The limit is checked after each warmup run, so actual warmup time may exceed
+    this value by one warmup run.
+  * Default is -1 seconds (disabled). Any value less than or equal to 0 disables the limit.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
 * `--throttle-threshold <value>`
  * Set the GPU throttle threshold as percentage of the device's default clock rate.
  * Default is 75.
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -166,6 +166,28 @@ struct benchmark_base
  }
  /// @}

+  /// Execute this many warmup runs before collecting cold measurement samples. @{
+  [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
+  benchmark_base &set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
+  {
+    m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
+                                                                : nvbench::int64_t{1};
+    return *this;
+  }
+  /// @}
+
+  /// Stop cold warmups after this many seconds of walltime. Values <= 0 disable the limit. @{
+  [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
+  {
+    return m_cold_max_warmup_walltime;
+  }
+  benchmark_base &set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
+  {
+    m_cold_max_warmup_walltime = cold_max_warmup_walltime;
+    return *this;
+  }
+  /// @}
+
  /// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
  /// @{
  [[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
@@ -321,7 +343,9 @@ protected:
  bool m_skip_batched{false};

  nvbench::int64_t m_min_samples{10};
+  nvbench::int64_t m_cold_warmup_runs{1};

+  nvbench::float64_t m_cold_max_warmup_walltime{-1.};
  nvbench::float64_t m_skip_time{-1.};
  nvbench::float64_t m_timeout{15.};

--- a/nvbench/benchmark_base.cxx
+++ b/nvbench/benchmark_base.cxx
@@ -43,7 +43,9 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
  result->m_run_once                = m_run_once;
  result->m_disable_blocking_kernel = m_disable_blocking_kernel;

-  result->m_min_samples = m_min_samples;
+  result->m_min_samples              = m_min_samples;
+  result->m_cold_warmup_runs         = m_cold_warmup_runs;
+  result->m_cold_max_warmup_walltime = m_cold_max_warmup_walltime;

  result->m_skip_time = m_skip_time;
  result->m_timeout   = m_timeout;
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -46,6 +46,8 @@ measure_cold_base::measure_cold_base(state &exec_state)
    , m_run_once{exec_state.get_run_once()}
    , m_check_throttling(!exec_state.get_run_once())
    , m_min_samples{exec_state.get_min_samples()}
+    , m_cold_warmup_runs{exec_state.get_cold_warmup_runs()}
+    , m_cold_max_warmup_walltime{exec_state.get_cold_max_warmup_walltime()}
    , m_skip_time{exec_state.get_skip_time()}
    , m_timeout{exec_state.get_timeout()}
    , m_throttle_threshold(exec_state.get_throttle_threshold())
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -110,7 +110,9 @@ protected:
  bool m_check_throttling{true};

  nvbench::int64_t m_min_samples{};
+  nvbench::int64_t m_cold_warmup_runs{1};

+  nvbench::float64_t m_cold_max_warmup_walltime{};
  nvbench::float64_t m_skip_time{};
  nvbench::float64_t m_timeout{};

@@ -239,8 +241,8 @@ struct measure_cold : public measure_cold_base
  }

 private:
-  // Run the kernel once, measuring the GPU time. If under skip_time, skip the
-  // measurement.
+  // Run the kernel up to m_cold_warmup_runs times (fewer if the walltime limit is hit),
+  // measuring the GPU time of the last run. If under skip_time, skip the measurement.
  void run_warmup()
  {
    if (m_run_once)
@@ -248,12 +250,29 @@ private:
      return;
    }

+    // Ensure blocking kernel is loaded during the warmup
+    // Ref: https://github.com/NVIDIA/nvbench/issues/339
+    this->block_stream();
+    this->unblock_stream();
+
    // disable use of blocking kernel for warm-up run
    // see https://github.com/NVIDIA/nvbench/issues/240
    constexpr bool disable_blocking_kernel = true;
    kernel_launch_timer timer(*this, disable_blocking_kernel);
+    nvbench::cpu_timer warmup_walltime_timer;

-    this->launch_kernel(timer);
+    warmup_walltime_timer.start();
+    for (nvbench::int64_t warmup_run = 0; warmup_run < m_cold_warmup_runs; ++warmup_run)
+    {
+      this->launch_kernel(timer);
+      warmup_walltime_timer.stop();
+
+      if (m_cold_max_warmup_walltime > 0. &&
+          warmup_walltime_timer.get_duration() > m_cold_max_warmup_walltime)
+      {
+        break;
+      }
+    }
    this->check_skip_time(m_cuda_timer.get_duration());
  }

--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -429,9 +429,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
      bench["name"]  = bench_ptr->get_name();
      bench["index"] = bench_index;

-      bench["min_samples"] = bench_ptr->get_min_samples();
-      bench["skip_time"]   = bench_ptr->get_skip_time();
-      bench["timeout"]     = bench_ptr->get_timeout();
+      bench["min_samples"]              = bench_ptr->get_min_samples();
+      bench["cold_warmup_runs"]         = bench_ptr->get_cold_warmup_runs();
+      bench["cold_max_warmup_walltime"] = bench_ptr->get_cold_max_warmup_walltime();
+      bench["skip_time"]                = bench_ptr->get_skip_time();
+      bench["timeout"]                  = bench_ptr->get_timeout();

      auto &devices = bench["devices"];
      for (const auto &dev_info : bench_ptr->get_devices())
@@ -486,9 +488,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)

        st["name"] = exec_state.get_axis_values_as_string();

-        st["min_samples"] = exec_state.get_min_samples();
-        st["skip_time"]   = exec_state.get_skip_time();
-        st["timeout"]     = exec_state.get_timeout();
+        st["min_samples"]              = exec_state.get_min_samples();
+        st["cold_warmup_runs"]         = exec_state.get_cold_warmup_runs();
+        st["cold_max_warmup_walltime"] = exec_state.get_cold_max_warmup_walltime();
+        st["skip_time"]                = exec_state.get_skip_time();
+        st["timeout"]                  = exec_state.get_timeout();

        st["device"]            = exec_state.get_device()->get_id();
        st["type_config_index"] = exec_state.get_type_config_index();
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -549,14 +549,14 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
      this->update_axis(first[1]);
      first += 2;
    }
-    else if (arg == "--min-samples")
+    else if (arg == "--min-samples" || arg == "--cold-warmup-runs")
    {
      check_params(1);
      this->update_int64_prop(first[0], first[1]);
      first += 2;
    }
-    else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" ||
-             arg == "--throttle-recovery-delay")
+    else if (arg == "--skip-time" || arg == "--timeout" || arg == "--cold-max-warmup-walltime" ||
+             arg == "--throttle-threshold" || arg == "--throttle-recovery-delay")
    {
      check_params(1);
      this->update_float64_prop(first[0], first[1]);
@@ -1015,6 +1015,10 @@ try
  {
    bench.set_min_samples(value);
  }
+  else if (prop_arg == "--cold-warmup-runs")
+  {
+    bench.set_cold_warmup_runs(value);
+  }
  else
  {
    NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
@@ -1128,6 +1132,10 @@ try
  {
    bench.set_timeout(value);
  }
+  else if (prop_arg == "--cold-max-warmup-walltime")
+  {
+    bench.set_cold_max_warmup_walltime(value);
+  }
  else if (prop_arg == "--throttle-threshold")
  {
    bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value) / 100.0f);
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -152,6 +152,26 @@ struct state
  void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
  /// @}

+  /// Execute this many warmup runs before collecting cold measurement samples. @{
+  [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
+  void set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
+  {
+    m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
+                                                                : nvbench::int64_t{1};
+  }
+  /// @}
+
+  /// Stop cold warmups after this many seconds of walltime. Values <= 0 disable the limit. @{
+  [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
+  {
+    return m_cold_max_warmup_walltime;
+  }
+  void set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
+  {
+    m_cold_max_warmup_walltime = cold_max_warmup_walltime;
+  }
+  /// @}
+
  [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
  {
    return m_criterion_params;
@@ -332,7 +352,9 @@ private:
  std::string m_stopping_criterion;

  nvbench::int64_t m_min_samples;
+  nvbench::int64_t m_cold_warmup_runs;

+  nvbench::float64_t m_cold_max_warmup_walltime;
  nvbench::float64_t m_skip_time;
  nvbench::float64_t m_timeout;

--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -39,6 +39,8 @@ state::state(const benchmark_base &bench)
    , m_criterion_params{bench.get_criterion_params()}
    , m_stopping_criterion(bench.get_stopping_criterion())
    , m_min_samples{bench.get_min_samples()}
+    , m_cold_warmup_runs{bench.get_cold_warmup_runs()}
+    , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
    , m_skip_time{bench.get_skip_time()}
    , m_timeout{bench.get_timeout()}
    , m_throttle_threshold{bench.get_throttle_threshold()}
@@ -61,6 +63,8 @@ state::state(const benchmark_base &bench,
    , m_criterion_params{bench.get_criterion_params()}
    , m_stopping_criterion(bench.get_stopping_criterion())
    , m_min_samples{bench.get_min_samples()}
+    , m_cold_warmup_runs{bench.get_cold_warmup_runs()}
+    , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
    , m_skip_time{bench.get_skip_time()}
    , m_timeout{bench.get_timeout()}
    , m_throttle_threshold{bench.get_throttle_threshold()}
--- a/testing/option_parser.cu
+++ b/testing/option_parser.cu
@@ -1204,6 +1204,36 @@ void test_min_samples()
  ASSERT(states[0].get_min_samples() == 12345);
 }

+void test_cold_warmup_runs()
+{
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "12345"});
+    const auto &states = parser_to_states(parser);
+
+    ASSERT(states.size() == 1);
+    ASSERT(states[0].get_cold_warmup_runs() == 12345);
+  }
+
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "0"});
+    const auto &states = parser_to_states(parser);
+
+    ASSERT(states.size() == 1);
+    ASSERT(states[0].get_cold_warmup_runs() == 1);
+  }
+
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "-12345"});
+    const auto &states = parser_to_states(parser);
+
+    ASSERT(states.size() == 1);
+    ASSERT(states[0].get_cold_warmup_runs() == 1);
+  }
+}
+
 void test_skip_time()
 {
  nvbench::option_parser parser;
@@ -1214,6 +1244,16 @@ void test_skip_time()
  ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.);
 }

+void test_cold_max_warmup_walltime()
+{
+  nvbench::option_parser parser;
+  parser.parse({"--benchmark", "DummyBench", "--cold-max-warmup-walltime", "12345e2"});
+  const auto &states = parser_to_states(parser);
+
+  ASSERT(states.size() == 1);
+  ASSERT(std::abs(states[0].get_cold_max_warmup_walltime() - 12345e2) < 1.);
+}
+
 void test_timeout()
 {
  nvbench::option_parser parser;
@@ -1531,7 +1571,9 @@ try
  test_axis_before_benchmark();

  test_min_samples();
+  test_cold_warmup_runs();
  test_skip_time();
+  test_cold_max_warmup_walltime();
  test_timeout();
  test_output_parent_directories_created();

--- a/testing/state_generator.cu
+++ b/testing/state_generator.cu
@@ -762,9 +762,11 @@ void test_devices()

 void test_termination_criteria()
 {
-  const nvbench::int64_t min_samples = 1000;
-  const nvbench::float64_t skip_time = 4000;
-  const nvbench::float64_t timeout   = 5000;
+  const nvbench::int64_t min_samples                = 1000;
+  const nvbench::int64_t cold_warmup_runs           = 7;
+  const nvbench::float64_t cold_max_warmup_walltime = 3000;
+  const nvbench::float64_t skip_time                = 4000;
+  const nvbench::float64_t timeout                  = 5000;

  // for comparing floats
  auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; };
@@ -772,6 +774,8 @@ void test_termination_criteria()
  dummy_bench bench;
  bench.set_devices(std::vector<int>{});
  bench.set_min_samples(min_samples);
+  bench.set_cold_warmup_runs(cold_warmup_runs);
+  bench.set_cold_max_warmup_walltime(cold_max_warmup_walltime);
  bench.set_skip_time(skip_time);
  bench.set_timeout(timeout);

@@ -779,6 +783,8 @@ void test_termination_criteria()

  ASSERT(states.size() == 1);
  ASSERT(min_samples == states[0].get_min_samples());
+  ASSERT(cold_warmup_runs == states[0].get_cold_warmup_runs());
+  ASSERT(within_one(cold_max_warmup_walltime, states[0].get_cold_max_warmup_walltime()));
  ASSERT(within_one(skip_time, states[0].get_skip_time()));
  ASSERT(within_one(timeout, states[0].get_timeout()));
 }