From 7dfbcad27c5583a37c00e1299b38f0caf96d1228 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Tue, 12 May 2026 10:26:28 -0500
Subject: [PATCH 1/3] Create directories for output files (#360)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* QOL UX, NVBench creates directories for output JSON, MD, CSV files

This closes #185 and supports specifying
`--json path/to/nonexistent/folder/result.json`

This creates the chain of missing folders in which result.json is placed.

```
(py313) :~/repos/nvbench$ rm -rf /tmp/nested/
(py313) :~/repos/nvbench$ ./build2/bin/nvbench.example.cpp20.axes -b copy_type_and_block_size_sweep -a Type=I32 -a BlockSize=64 --jsonbin /tmp/nested/json/axes.json --md /tmp/nested/md/res.md --csv /tmp/nested/csv/res.csv > /dev/null 2>&1
(py313) :~/repos/nvbench$ tree /tmp/nested/
/tmp/nested/
├── csv
│   └── res.csv
├── json
│   ├── axes.json
│   ├── axes.json-bin
│   │   └── 0.bin
│   └── axes.json-freqs-bin
│       └── 0.bin
└── md
    └── res.md

6 directories, 5 files
```

* Add a test that non-existent output folder is created

* Remove throwing custom error message. Use default

* Replace static_assert(false, ...) with #error
---
 nvbench/json_printer.cu  |  2 +-
 nvbench/option_parser.cu | 25 +++++++++++++++
 testing/option_parser.cu | 66 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 1 deletion(-)
diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index e363a45..b5f41c3 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -49,7 +49,7 @@ namespace fs = std::filesystem;
 #include <experimental/filesystem>
 namespace fs = std::experimental::filesystem;
 #else
-static_assert(false, "No <filesystem> or <experimental/filesystem> found.");
+#error "No <filesystem> or <experimental/filesystem> found."
 #endif
 
 #if NVBENCH_CPP_DIALECT >= 2020
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 530ae94..f0570bb 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -48,9 +48,20 @@
 #include <stdexcept>
 #include <string>
 #include <string_view>
+#include <system_error>
 #include <tuple>
 #include <vector>
 
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#else
+#error "No <filesystem> or <experimental/filesystem> found."
+#endif
+
 namespace
 {
 
@@ -115,6 +126,18 @@ catch (const std::exception &)
 
 void parse(std::string_view input, std::string &val) { val = input; }
 
+void create_output_parent_directories(const std::string &spec)
+{ // Ensures the directory that is to contain the output file named by `spec` exists.
+  const fs::path output_path{spec};
+  const fs::path parent_path = output_path.parent_path();
+  if (parent_path.empty())
+  {
+    return; // `spec` has no directory component, so there is nothing to create.
+  }
+  // Creates every missing directory along parent_path; failures throw fs::filesystem_error.
+  fs::create_directories(parent_path);
+}
+
 // Parses a list of values "<val1>, <val2>, <val3>, ..." into a vector:
 template <typename T>
 std::vector<T> parse_list_values(std::string_view list_spec)
@@ -622,6 +645,8 @@ std::ostream &option_parser::printer_spec_to_ostream(const std::string &spec)
   }
   else // spec is a filename:
   {
+    ::create_output_parent_directories(spec);
+
     auto file_stream = std::make_unique<std::ofstream>();
     // Throw if file can't open
     file_stream->exceptions(file_stream->exceptions() | std::ios::failbit);
diff --git a/testing/option_parser.cu b/testing/option_parser.cu
index 6104e5e..3cd98a3 100644
--- a/testing/option_parser.cu
+++ b/testing/option_parser.cu
@@ -22,6 +22,21 @@
 
 #include <fmt/format.h>
 
+#include <chrono>
+#include <iostream>
+#include <stdexcept>
+#include <system_error>
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#else
+#error "No <filesystem> or <experimental/filesystem> found."
+#endif
+
 #include "test_asserts.cuh"
 
 //==============================================================================
@@ -49,6 +64,40 @@ NVBENCH_BENCH_TYPES(TestBench, NVBENCH_TYPE_AXES(Ts, Us))
 namespace
 {
 
+struct temp_tree
+{ // Removes the tree at `root_path` both on construction and on destruction.
+  explicit temp_tree(fs::path root)
+      : root_path{std::move(root)}
+  {
+    std::error_code ec;
+    fs::remove_all(root_path, ec); // Start clean: delete anything already at root_path.
+    if (ec)
+    {
+      throw std::runtime_error{fmt::format("Failed to remove temporary directory `{}`: {}",
+                                           root_path.string(),
+                                           ec.message())};
+    }
+  }
+  // Cleanup failures are only reported on std::cerr here; the destructor does not throw them.
+  ~temp_tree()
+  {
+    std::error_code ec;
+    fs::remove_all(root_path, ec);
+    if (ec)
+    {
+      std::cerr << "Failed to remove temporary directory `" << root_path.string()
+                << "`: " << ec.message() << "\n";
+    }
+  }
+  // Neither copyable nor movable.
+  temp_tree(const temp_tree &)            = delete;
+  temp_tree(temp_tree &&)                 = delete;
+  temp_tree &operator=(const temp_tree &) = delete;
+  temp_tree &operator=(temp_tree &&)      = delete;
+  // Root of the managed tree; this struct only removes it and never creates it.
+  fs::path root_path;
+};
+
 [[nodiscard]] std::string states_to_string(const std::vector<nvbench::state> &states)
 {
   fmt::memory_buffer buffer;
@@ -1175,6 +1224,22 @@ void test_timeout()
   ASSERT(std::abs(states[0].get_timeout() - 12345e2) < 1.);
 }
 
+void test_output_parent_directories_created()
+{ // Checks that `--json <path>` creates the missing parent directory and the output file.
+  const auto unique_suffix = std::chrono::steady_clock::now().time_since_epoch().count();
+  const temp_tree temp{fs::temp_directory_path() /
+                       fmt::format("nvbench_option_parser_test_{}", unique_suffix)};
+  const auto output_path = temp.root_path / "nested" / "results.json";
+  // Parse in an inner scope so the parser is destroyed before the filesystem is inspected.
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--json", output_path.string()});
+  }
+  // temp_tree removed the root on construction, so `nested` must have been created by parse().
+  ASSERT(fs::is_directory(output_path.parent_path()));
+  ASSERT(fs::exists(output_path));
+}
+
 void test_stopping_criterion()
 {
   { // Per benchmark criterion
@@ -1468,6 +1533,7 @@ try
   test_min_samples();
   test_skip_time();
   test_timeout();
+  test_output_parent_directories_created();
 
   test_stopping_criterion();
 

From ebf9f9a087deef078422cc4ae87ac4f455b6c509 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Tue, 12 May 2026 13:55:46 -0500
Subject: [PATCH 2/3] Add .coderabbit.yaml following in footsteps of CCCL
 (#359)

---
 .coderabbit.yaml | 137 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 .coderabbit.yaml

diff --git a/.coderabbit.yaml b/.coderabbit.yaml
new file mode 100644
index 0000000..7336b51
--- /dev/null
+++ b/.coderabbit.yaml
@@ -0,0 +1,137 @@
+# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+language: en-US
+tone_instructions: |
+  Be direct, technical, brief. No praise, emojis, headings, or collapsible menus.
+  Start each comment with one prefix:
+  - suggestion: optional improvement;
+  - important: must-fix/high-impact risk;
+  - critical: blocking correctness/security/data-loss.
+
+reviews:
+  profile: chill
+
+  high_level_summary: true
+  high_level_summary_in_walkthrough: true
+  poem: false
+  in_progress_fortune: false
+  sequence_diagrams: false
+  estimate_code_review_effort: false
+  collapse_walkthrough: true
+
+  # Reduce noisy status/details sections.
+  request_changes_workflow: false
+  review_status: false
+  review_details: false
+  enable_prompt_for_ai_agents: false
+
+  auto_review:
+    enabled: false
+    drafts: false
+    base_branches:
+      - "^main$"
+      - "^branch/[0-9]+\\.[0-9]+\\.x$"
+    ignore_usernames: ["copy-pr-bot", "dependabot[bot]", "github-actions[bot]", "nv-automation-bot"]
+
+  tools:
+    gitleaks:
+      enabled: true
+    markdownlint:
+      enabled: true
+    shellcheck:
+      enabled: true
+
+  # Keep Autofix available, but disable the other finishing touch actions.
+  finishing_touches:
+    docstrings:
+      enabled: false
+    unit_tests:
+      enabled: false
+    simplify:
+      enabled: false
+
+  pre_merge_checks:
+    docstrings:
+      mode: "off"
+    title:
+      mode: "off"
+    description:
+      mode: "off"
+    issue_assessment:
+      mode: "off"
+    custom_checks: []
+
+  path_instructions:
+    - path: "nvbench/**/*"
+      instructions: |
+        Focus on benchmark correctness, CUDA stream/event ordering, synchronization behavior, error handling,
+        resource ownership, exception safety, public API compatibility, measurement semantics, statistical
+        summaries, and test coverage. Prefer comments that catch correctness, API, compile-time, runtime, or
+        measurement-regression risks.
+
+    - path: "python/**/*"
+      instructions: |
+        Focus on Python API stability, pybind11/C++ exception boundaries, GIL behavior, CUDA interoperability,
+        object lifetime, package metadata, type stubs, JSON/result parsing compatibility, and tests. Avoid
+        style-only comments already covered by Ruff, clang-format, or pre-commit.
+
+    - path: "testing/**/*"
+      instructions: |
+        Focus on whether tests cover observable behavior, remain deterministic, handle GPU availability and CUDA
+        version differences correctly, avoid excessive runtime, and exercise install/export/package boundaries
+        where relevant.
+
+    - path: "examples/**/*"
+      instructions: |
+        Check that examples are minimal, buildable, technically correct, use NVBench APIs idiomatically, avoid
+        excessive benchmark runtime, and demonstrate behavior that is useful to users.
+
+    - path: "docs/**/*"
+      instructions: |
+        For documentation changes, focus on technical accuracy, buildable examples, CLI/API consistency,
+        version compatibility, and whether behavior changes have matching documentation updates.
+
+    - path: "ci/**/*"
+      instructions: |
+        For CI and build scripts, focus on matrix correctness, targeted build/test behavior, cache/artifact
+        handling, environment setup, GPU availability assumptions, clear failures, and avoiding unnecessary
+        expensive jobs.
+
+    - path: ".github/**/*"
+      instructions: |
+        For GitHub workflows and repository automation, focus on permissions, event triggers, matrix generation,
+        status/check behavior, security boundaries, and avoiding unnecessary CI fanout.
+
+    - path: "cmake/**/*"
+      instructions: |
+        Focus on package exports, install-tree and build-tree compatibility, target usage requirements, static
+        and shared library behavior, CUDA architecture handling, and compatibility across supported CMake/CUDA
+        versions.
+
+    - path: "**/CMakeLists.txt"
+      instructions: |
+        Focus on target dependencies, exported usage requirements, option behavior, install rules, tests,
+        examples, Python package integration, and compatibility across supported CMake/CUDA versions.
+
+    - path: "CMakePresets.json"
+      instructions: |
+        Focus on preset inheritance, CI parity, cache variable correctness, CUDA compiler/toolchain assumptions,
+        and whether presets remain useful for local and automated builds.
+
+knowledge_base:
+  opt_out: false
+  code_guidelines:
+    filePatterns:
+      - ".clang-format"
+      - ".pre-commit-config.yaml"
+      - "README.md"
+      - "CMakeLists.txt"
+      - "CMakePresets.json"
+      - "pyproject.toml"
+      - "python/README.md"
+      - "python/pyproject.toml"
+      - "docs/benchmarks.md"
+      - "docs/cli_help.md"
+      - "docs/cli_help_axis.md"

From 9ea77bccaabf6b2e7fbe468d8c852f6258e8bc02 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Tue, 12 May 2026 14:30:08 -0500
Subject: [PATCH 3/3] Implement CLI option to control warmups for cold
 measurements (#339)

* Implement warmup-runs count, supported as CLI

CLI option --warmup-runs implemented and documented.

The warm-up count is enforced to always be positive.
This is necessary to ensure that JIT-ting has occurred,
and that use of the blocking kernel does not result in time-outs.

A test for the option parser is added.

* Ensure that measure_cold::run_warmup instantiates blocking kernel

Because warm-up runs are executed without use of blocking kernel,
the blocking kernel was not jitted until actual measurements were
collected. The module loading cost incurred during the first run
shows as elevated CPU time noise value for the first measurement
as noted in https://github.com/NVIDIA/nvbench/pull/339

This PR adds `this->block_stream(); this->unblock_stream();` prior
to executing warm-up loop with use of blocking kernel disabled.

This ensures that the blocking kernel is instantiated during the warm-up,
but no other kernel is launched between its launch and the stream sync,
thus avoiding a deadlock.

* Rename --warmup-runs to --cold-warmup-runs, add --cold-max-warmup-walltime

Since the configurable number of warmups only applies to measure_cold.cuh,
rename the CLI option to reflect that.

Also add --cold-max-warmup-walltime (defaults to -1, i.e. disabled).
If enabled, the warmup loop exits before the requested count is reached
once the wall-time expended executing warmups exceeds this
max-warmup-walltime value.
---
 docs/cli_help.md                | 15 ++++++++++++
 nvbench/benchmark_base.cuh      | 24 +++++++++++++++++++
 nvbench/benchmark_base.cxx      |  4 +++-
 nvbench/detail/measure_cold.cu  |  2 ++
 nvbench/detail/measure_cold.cuh | 25 +++++++++++++++++---
 nvbench/json_printer.cu         | 16 ++++++++-----
 nvbench/option_parser.cu        | 14 ++++++++---
 nvbench/state.cuh               | 22 +++++++++++++++++
 nvbench/state.cxx               |  4 ++++
 testing/option_parser.cu        | 42 +++++++++++++++++++++++++++++++++
 testing/state_generator.cu      | 12 +++++++---
 11 files changed, 164 insertions(+), 16 deletions(-)

diff --git a/docs/cli_help.md b/docs/cli_help.md
index 15a89a7..8585409 100644
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -93,6 +93,21 @@
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
 
+* `--cold-warmup-runs <count>`
+  * Execute up to `<count>` warmup runs before collecting cold measurement samples.
+  * The minimum is 1 warmup run.
+  * Default is 1 warmup run.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--cold-max-warmup-walltime <seconds>`
+  * Stop cold warmup after the total warmup walltime exceeds `<seconds>`.
+  * The limit is checked after each warmup run, so actual warmup time may exceed
+    this value by one warmup run.
+  * Default is -1 seconds (disabled); zero or any negative value disables the limit.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
 * `--throttle-threshold <value>`
   * Set the GPU throttle threshold as percentage of the device's default clock rate.
   * Default is 75.
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index 939bbeb..6926f80 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -166,6 +166,28 @@ struct benchmark_base
   }
   /// @}
 
+  /// Execute this many warmup runs before collecting cold measurement samples. @{
+  [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
+  benchmark_base &set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
+  {
+    m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
+                                                                : nvbench::int64_t{1};
+    return *this;
+  }
+  /// @}
+
+  /// Stop cold warmups after this many seconds of walltime. Values <= 0 disable the limit. @{
+  [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
+  {
+    return m_cold_max_warmup_walltime;
+  }
+  benchmark_base &set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
+  {
+    m_cold_max_warmup_walltime = cold_max_warmup_walltime;
+    return *this;
+  }
+  /// @}
+
   /// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
   /// @{
   [[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
@@ -321,7 +343,9 @@ protected:
   bool m_skip_batched{false};
 
   nvbench::int64_t m_min_samples{10};
+  nvbench::int64_t m_cold_warmup_runs{1};
 
+  nvbench::float64_t m_cold_max_warmup_walltime{-1.};
   nvbench::float64_t m_skip_time{-1.};
   nvbench::float64_t m_timeout{15.};
 
diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx
index a237e5c..6f5d331 100644
--- a/nvbench/benchmark_base.cxx
+++ b/nvbench/benchmark_base.cxx
@@ -43,7 +43,9 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
   result->m_run_once                = m_run_once;
   result->m_disable_blocking_kernel = m_disable_blocking_kernel;
 
-  result->m_min_samples = m_min_samples;
+  result->m_min_samples              = m_min_samples;
+  result->m_cold_warmup_runs         = m_cold_warmup_runs;
+  result->m_cold_max_warmup_walltime = m_cold_max_warmup_walltime;
 
   result->m_skip_time = m_skip_time;
   result->m_timeout   = m_timeout;
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 38fb6f7..76eb941 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -46,6 +46,8 @@ measure_cold_base::measure_cold_base(state &exec_state)
     , m_run_once{exec_state.get_run_once()}
     , m_check_throttling(!exec_state.get_run_once())
     , m_min_samples{exec_state.get_min_samples()}
+    , m_cold_warmup_runs{exec_state.get_cold_warmup_runs()}
+    , m_cold_max_warmup_walltime{exec_state.get_cold_max_warmup_walltime()}
     , m_skip_time{exec_state.get_skip_time()}
     , m_timeout{exec_state.get_timeout()}
     , m_throttle_threshold(exec_state.get_throttle_threshold())
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 89b4201..3f228e8 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -110,7 +110,9 @@ protected:
   bool m_check_throttling{true};
 
   nvbench::int64_t m_min_samples{};
+  nvbench::int64_t m_cold_warmup_runs{1};
 
+  nvbench::float64_t m_cold_max_warmup_walltime{};
   nvbench::float64_t m_skip_time{};
   nvbench::float64_t m_timeout{};
 
@@ -239,8 +241,8 @@ struct measure_cold : public measure_cold_base
   }
 
 private:
-  // Run the kernel once, measuring the GPU time. If under skip_time, skip the
-  // measurement.
+  // Run the kernel up to m_cold_warmup_runs times, measuring the GPU time of the last run.
+  // If under skip_time, skip the measurement.
   void run_warmup()
   {
     if (m_run_once)
@@ -248,12 +250,29 @@ private:
       return;
     }
 
+    // Ensure blocking kernel is loaded during the warmup
+    // Ref: https://github.com/NVIDIA/nvbench/issues/339
+    this->block_stream();
+    this->unblock_stream();
+
     // disable use of blocking kernel for warm-up run
     // see https://github.com/NVIDIA/nvbench/issues/240
     constexpr bool disable_blocking_kernel = true;
     kernel_launch_timer timer(*this, disable_blocking_kernel);
+    nvbench::cpu_timer warmup_walltime_timer;
 
-    this->launch_kernel(timer);
+    warmup_walltime_timer.start();
+    for (nvbench::int64_t warmup_run = 0; warmup_run < m_cold_warmup_runs; ++warmup_run)
+    {
+      this->launch_kernel(timer);
+      warmup_walltime_timer.stop();
+
+      if (m_cold_max_warmup_walltime > 0. &&
+          warmup_walltime_timer.get_duration() > m_cold_max_warmup_walltime)
+      {
+        break;
+      }
+    }
     this->check_skip_time(m_cuda_timer.get_duration());
   }
 
diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index b5f41c3..9ae1719 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -429,9 +429,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
       bench["name"]  = bench_ptr->get_name();
       bench["index"] = bench_index;
 
-      bench["min_samples"] = bench_ptr->get_min_samples();
-      bench["skip_time"]   = bench_ptr->get_skip_time();
-      bench["timeout"]     = bench_ptr->get_timeout();
+      bench["min_samples"]              = bench_ptr->get_min_samples();
+      bench["cold_warmup_runs"]         = bench_ptr->get_cold_warmup_runs();
+      bench["cold_max_warmup_walltime"] = bench_ptr->get_cold_max_warmup_walltime();
+      bench["skip_time"]                = bench_ptr->get_skip_time();
+      bench["timeout"]                  = bench_ptr->get_timeout();
 
       auto &devices = bench["devices"];
       for (const auto &dev_info : bench_ptr->get_devices())
@@ -486,9 +488,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
 
         st["name"] = exec_state.get_axis_values_as_string();
 
-        st["min_samples"] = exec_state.get_min_samples();
-        st["skip_time"]   = exec_state.get_skip_time();
-        st["timeout"]     = exec_state.get_timeout();
+        st["min_samples"]              = exec_state.get_min_samples();
+        st["cold_warmup_runs"]         = exec_state.get_cold_warmup_runs();
+        st["cold_max_warmup_walltime"] = exec_state.get_cold_max_warmup_walltime();
+        st["skip_time"]                = exec_state.get_skip_time();
+        st["timeout"]                  = exec_state.get_timeout();
 
         st["device"]            = exec_state.get_device()->get_id();
         st["type_config_index"] = exec_state.get_type_config_index();
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index f0570bb..8d3e1b3 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -549,14 +549,14 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->update_axis(first[1]);
       first += 2;
     }
-    else if (arg == "--min-samples")
+    else if (arg == "--min-samples" || arg == "--cold-warmup-runs")
     {
       check_params(1);
       this->update_int64_prop(first[0], first[1]);
       first += 2;
     }
-    else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" ||
-             arg == "--throttle-recovery-delay")
+    else if (arg == "--skip-time" || arg == "--timeout" || arg == "--cold-max-warmup-walltime" ||
+             arg == "--throttle-threshold" || arg == "--throttle-recovery-delay")
     {
       check_params(1);
       this->update_float64_prop(first[0], first[1]);
@@ -1015,6 +1015,10 @@ try
   {
     bench.set_min_samples(value);
   }
+  else if (prop_arg == "--cold-warmup-runs")
+  {
+    bench.set_cold_warmup_runs(value);
+  }
   else
   {
     NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
@@ -1128,6 +1132,10 @@ try
   {
     bench.set_timeout(value);
   }
+  else if (prop_arg == "--cold-max-warmup-walltime")
+  {
+    bench.set_cold_max_warmup_walltime(value);
+  }
   else if (prop_arg == "--throttle-threshold")
   {
     bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value) / 100.0f);
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index aad5a2c..506d1df 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -152,6 +152,26 @@ struct state
   void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
   /// @}
 
+  /// Execute this many warmup runs before collecting cold measurement samples. @{
+  [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
+  void set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
+  {
+    m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
+                                                                : nvbench::int64_t{1};
+  }
+  /// @}
+
+  /// Stop cold warmups after this many seconds of walltime. Values <= 0 disable the limit. @{
+  [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
+  {
+    return m_cold_max_warmup_walltime;
+  }
+  void set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
+  {
+    m_cold_max_warmup_walltime = cold_max_warmup_walltime;
+  }
+  /// @}
+
   [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
   {
     return m_criterion_params;
@@ -332,7 +352,9 @@ private:
   std::string m_stopping_criterion;
 
   nvbench::int64_t m_min_samples;
+  nvbench::int64_t m_cold_warmup_runs;
 
+  nvbench::float64_t m_cold_max_warmup_walltime;
   nvbench::float64_t m_skip_time;
   nvbench::float64_t m_timeout;
 
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index af53502..6aaa496 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -39,6 +39,8 @@ state::state(const benchmark_base &bench)
     , m_criterion_params{bench.get_criterion_params()}
     , m_stopping_criterion(bench.get_stopping_criterion())
     , m_min_samples{bench.get_min_samples()}
+    , m_cold_warmup_runs{bench.get_cold_warmup_runs()}
+    , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
     , m_skip_time{bench.get_skip_time()}
     , m_timeout{bench.get_timeout()}
     , m_throttle_threshold{bench.get_throttle_threshold()}
@@ -61,6 +63,8 @@ state::state(const benchmark_base &bench,
     , m_criterion_params{bench.get_criterion_params()}
     , m_stopping_criterion(bench.get_stopping_criterion())
     , m_min_samples{bench.get_min_samples()}
+    , m_cold_warmup_runs{bench.get_cold_warmup_runs()}
+    , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
     , m_skip_time{bench.get_skip_time()}
     , m_timeout{bench.get_timeout()}
     , m_throttle_threshold{bench.get_throttle_threshold()}
diff --git a/testing/option_parser.cu b/testing/option_parser.cu
index 3cd98a3..e665673 100644
--- a/testing/option_parser.cu
+++ b/testing/option_parser.cu
@@ -1204,6 +1204,36 @@ void test_min_samples()
   ASSERT(states[0].get_min_samples() == 12345);
 }
 
+void test_cold_warmup_runs()
+{ // Checks parsing of `--cold-warmup-runs`, including the expected fallback to 1.
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "12345"});
+    const auto &states = parser_to_states(parser);
+    // A positive count is expected to be stored unchanged.
+    ASSERT(states.size() == 1);
+    ASSERT(states[0].get_cold_warmup_runs() == 12345);
+  }
+
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "0"});
+    const auto &states = parser_to_states(parser);
+    // Zero is expected to be replaced by 1.
+    ASSERT(states.size() == 1);
+    ASSERT(states[0].get_cold_warmup_runs() == 1);
+  }
+
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "-12345"});
+    const auto &states = parser_to_states(parser);
+    // A negative count is expected to be replaced by 1.
+    ASSERT(states.size() == 1);
+    ASSERT(states[0].get_cold_warmup_runs() == 1);
+  }
+}
+
 void test_skip_time()
 {
   nvbench::option_parser parser;
@@ -1214,6 +1244,16 @@ void test_skip_time()
   ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.);
 }
 
+void test_cold_max_warmup_walltime()
+{ // Checks that `--cold-max-warmup-walltime` is parsed as a floating-point value.
+  nvbench::option_parser parser;
+  parser.parse({"--benchmark", "DummyBench", "--cold-max-warmup-walltime", "12345e2"});
+  const auto &states = parser_to_states(parser);
+  // One state is expected; its walltime limit is compared within an absolute tolerance of 1.
+  ASSERT(states.size() == 1);
+  ASSERT(std::abs(states[0].get_cold_max_warmup_walltime() - 12345e2) < 1.);
+}
+
 void test_timeout()
 {
   nvbench::option_parser parser;
@@ -1531,7 +1571,9 @@ try
   test_axis_before_benchmark();
 
   test_min_samples();
+  test_cold_warmup_runs();
   test_skip_time();
+  test_cold_max_warmup_walltime();
   test_timeout();
   test_output_parent_directories_created();
 
diff --git a/testing/state_generator.cu b/testing/state_generator.cu
index 14eceb5..383c8db 100644
--- a/testing/state_generator.cu
+++ b/testing/state_generator.cu
@@ -762,9 +762,11 @@ void test_devices()
 
 void test_termination_criteria()
 {
-  const nvbench::int64_t min_samples = 1000;
-  const nvbench::float64_t skip_time = 4000;
-  const nvbench::float64_t timeout   = 5000;
+  const nvbench::int64_t min_samples                = 1000;
+  const nvbench::int64_t cold_warmup_runs           = 7;
+  const nvbench::float64_t cold_max_warmup_walltime = 3000;
+  const nvbench::float64_t skip_time                = 4000;
+  const nvbench::float64_t timeout                  = 5000;
 
   // for comparing floats
   auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; };
@@ -772,6 +774,8 @@ void test_termination_criteria()
   dummy_bench bench;
   bench.set_devices(std::vector<int>{});
   bench.set_min_samples(min_samples);
+  bench.set_cold_warmup_runs(cold_warmup_runs);
+  bench.set_cold_max_warmup_walltime(cold_max_warmup_walltime);
   bench.set_skip_time(skip_time);
   bench.set_timeout(timeout);
 
@@ -779,6 +783,8 @@ void test_termination_criteria()
 
   ASSERT(states.size() == 1);
   ASSERT(min_samples == states[0].get_min_samples());
+  ASSERT(cold_warmup_runs == states[0].get_cold_warmup_runs());
+  ASSERT(within_one(cold_max_warmup_walltime, states[0].get_cold_max_warmup_walltime()));
   ASSERT(within_one(skip_time, states[0].get_skip_time()));
   ASSERT(within_one(timeout, states[0].get_timeout()));
 }