diff --git a/.coderabbit.yaml b/.coderabbit.yaml
new file mode 100644
index 0000000..7336b51
--- /dev/null
+++ b/.coderabbit.yaml
@@ -0,0 +1,137 @@
+# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+language: en-US
+tone_instructions: |
+  Be direct, technical, brief. No praise, emojis, headings, or collapsible menus.
+  Start each comment with one prefix:
+  - suggestion: optional improvement;
+  - important: must-fix/high-impact risk;
+  - critical: blocking correctness/security/data-loss.
+
+reviews:
+  profile: chill
+
+  high_level_summary: true
+  high_level_summary_in_walkthrough: true
+  poem: false
+  in_progress_fortune: false
+  sequence_diagrams: false
+  estimate_code_review_effort: false
+  collapse_walkthrough: true
+
+  # Reduce noisy status/details sections.
+  request_changes_workflow: false
+  review_status: false
+  review_details: false
+  enable_prompt_for_ai_agents: false
+
+  auto_review:
+    enabled: false
+    drafts: false
+    base_branches:
+      - "^main$"
+      - "^branch/[0-9]+\\.[0-9]+\\.x$"
+    ignore_usernames: ["copy-pr-bot", "dependabot[bot]", "github-actions[bot]", "nv-automation-bot"]
+
+  tools:
+    gitleaks:
+      enabled: true
+    markdownlint:
+      enabled: true
+    shellcheck:
+      enabled: true
+
+  # Keep Autofix available, but disable the other finishing touch actions.
+  finishing_touches:
+    docstrings:
+      enabled: false
+    unit_tests:
+      enabled: false
+    simplify:
+      enabled: false
+
+  pre_merge_checks:
+    docstrings:
+      mode: "off"
+    title:
+      mode: "off"
+    description:
+      mode: "off"
+    issue_assessment:
+      mode: "off"
+    custom_checks: []
+
+  path_instructions:
+    - path: "nvbench/**/*"
+      instructions: |
+        Focus on benchmark correctness, CUDA stream/event ordering, synchronization behavior, error handling,
+        resource ownership, exception safety, public API compatibility, measurement semantics, statistical
+        summaries, and test coverage. Prefer comments that catch correctness, API, compile-time, runtime, or
+        measurement-regression risks.
+
+    - path: "python/**/*"
+      instructions: |
+        Focus on Python API stability, pybind11/C++ exception boundaries, GIL behavior, CUDA interoperability,
+        object lifetime, package metadata, type stubs, JSON/result parsing compatibility, and tests. Avoid
+        style-only comments already covered by Ruff, clang-format, or pre-commit.
+
+    - path: "testing/**/*"
+      instructions: |
+        Focus on whether tests cover observable behavior, remain deterministic, handle GPU availability and CUDA
+        version differences correctly, avoid excessive runtime, and exercise install/export/package boundaries
+        where relevant.
+
+    - path: "examples/**/*"
+      instructions: |
+        Check that examples are minimal, buildable, technically correct, use NVBench APIs idiomatically, avoid
+        excessive benchmark runtime, and demonstrate behavior that is useful to users.
+
+    - path: "docs/**/*"
+      instructions: |
+        For documentation changes, focus on technical accuracy, buildable examples, CLI/API consistency,
+        version compatibility, and whether behavior changes have matching documentation updates.
+
+    - path: "ci/**/*"
+      instructions: |
+        For CI and build scripts, focus on matrix correctness, targeted build/test behavior, cache/artifact
+        handling, environment setup, GPU availability assumptions, clear failures, and avoiding unnecessary
+        expensive jobs.
+
+    - path: ".github/**/*"
+      instructions: |
+        For GitHub workflows and repository automation, focus on permissions, event triggers, matrix generation,
+        status/check behavior, security boundaries, and avoiding unnecessary CI fanout.
+
+    - path: "cmake/**/*"
+      instructions: |
+        Focus on package exports, install-tree and build-tree compatibility, target usage requirements, static
+        and shared library behavior, CUDA architecture handling, and compatibility across supported CMake/CUDA
+        versions.
+
+    - path: "**/CMakeLists.txt"
+      instructions: |
+        Focus on target dependencies, exported usage requirements, option behavior, install rules, tests,
+        examples, Python package integration, and compatibility across supported CMake/CUDA versions.
+
+    - path: "CMakePresets.json"
+      instructions: |
+        Focus on preset inheritance, CI parity, cache variable correctness, CUDA compiler/toolchain assumptions,
+        and whether presets remain useful for local and automated builds.
+
+knowledge_base:
+  opt_out: false
+  code_guidelines:
+    filePatterns:
+      - ".clang-format"
+      - ".pre-commit-config.yaml"
+      - "README.md"
+      - "CMakeLists.txt"
+      - "CMakePresets.json"
+      - "pyproject.toml"
+      - "python/README.md"
+      - "python/pyproject.toml"
+      - "docs/benchmarks.md"
+      - "docs/cli_help.md"
+      - "docs/cli_help_axis.md"
diff --git a/docs/cli_help.md b/docs/cli_help.md
index 15a89a7..8585409 100644
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -93,6 +93,21 @@
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
 
+* `--cold-warmup-runs <count>`
+  * Execute up to `<count>` warmup runs before collecting cold measurement samples.
+  * The minimum is 1 warmup run.
+  * Default is 1 warmup run.
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
+* `--cold-max-warmup-walltime <seconds>`
+  * Stop cold warmup after the total warmup walltime exceeds `<seconds>`.
+  * The limit is checked after each warmup run, so actual warmup time may exceed
+    this value by one warmup run.
+  * Default is -1 seconds (disabled).
+  * Applies to the most recent `--benchmark`, or all benchmarks if specified
+    before any `--benchmark` arguments.
+
 * `--throttle-threshold <value>`
   * Set the GPU throttle threshold as percentage of the device's default clock rate.
   * Default is 75.
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index 939bbeb..6926f80 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -166,6 +166,28 @@ struct benchmark_base
   }
   /// @}
 
+  /// Execute this many warmup runs before collecting cold measurement samples. @{
+  [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
+  benchmark_base &set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
+  {
+    m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
+                                                                : nvbench::int64_t{1};
+    return *this;
+  }
+  /// @}
+
+  /// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. @{
+  [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
+  {
+    return m_cold_max_warmup_walltime;
+  }
+  benchmark_base &set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
+  {
+    m_cold_max_warmup_walltime = cold_max_warmup_walltime;
+    return *this;
+  }
+  /// @}
+
   /// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
   /// @{
   [[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
@@ -321,7 +343,9 @@ protected:
   bool m_skip_batched{false};
 
   nvbench::int64_t m_min_samples{10};
+  nvbench::int64_t m_cold_warmup_runs{1};
 
+  nvbench::float64_t m_cold_max_warmup_walltime{-1.};
   nvbench::float64_t m_skip_time{-1.};
   nvbench::float64_t m_timeout{15.};
 
diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx
index a237e5c..6f5d331 100644
--- a/nvbench/benchmark_base.cxx
+++ b/nvbench/benchmark_base.cxx
@@ -43,7 +43,9 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
   result->m_run_once                = m_run_once;
   result->m_disable_blocking_kernel = m_disable_blocking_kernel;
 
-  result->m_min_samples = m_min_samples;
+  result->m_min_samples              = m_min_samples;
+  result->m_cold_warmup_runs         = m_cold_warmup_runs;
+  result->m_cold_max_warmup_walltime = m_cold_max_warmup_walltime;
 
   result->m_skip_time = m_skip_time;
   result->m_timeout   = m_timeout;
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 38fb6f7..76eb941 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -46,6 +46,8 @@ measure_cold_base::measure_cold_base(state &exec_state)
     , m_run_once{exec_state.get_run_once()}
     , m_check_throttling(!exec_state.get_run_once())
     , m_min_samples{exec_state.get_min_samples()}
+    , m_cold_warmup_runs{exec_state.get_cold_warmup_runs()}
+    , m_cold_max_warmup_walltime{exec_state.get_cold_max_warmup_walltime()}
     , m_skip_time{exec_state.get_skip_time()}
     , m_timeout{exec_state.get_timeout()}
     , m_throttle_threshold(exec_state.get_throttle_threshold())
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 89b4201..3f228e8 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -110,7 +110,9 @@ protected:
   bool m_check_throttling{true};
 
   nvbench::int64_t m_min_samples{};
+  nvbench::int64_t m_cold_warmup_runs{1};
 
+  nvbench::float64_t m_cold_max_warmup_walltime{};
   nvbench::float64_t m_skip_time{};
   nvbench::float64_t m_timeout{};
 
@@ -239,8 +241,8 @@ struct measure_cold : public measure_cold_base
   }
 
 private:
-  // Run the kernel once, measuring the GPU time. If under skip_time, skip the
-  // measurement.
+  // Run the kernel m_cold_warmup_runs times, measuring the GPU time of the last run.
+  // If under skip_time, skip the measurement.
   void run_warmup()
   {
     if (m_run_once)
@@ -248,12 +250,29 @@ private:
       return;
     }
 
+    // Ensure blocking kernel is loaded during the warmup
+    // Ref: https://github.com/NVIDIA/nvbench/issues/339
+    this->block_stream();
+    this->unblock_stream();
+
     // disable use of blocking kernel for warm-up run
     // see https://github.com/NVIDIA/nvbench/issues/240
     constexpr bool disable_blocking_kernel = true;
     kernel_launch_timer timer(*this, disable_blocking_kernel);
+    nvbench::cpu_timer warmup_walltime_timer;
 
-    this->launch_kernel(timer);
+    warmup_walltime_timer.start();
+    for (nvbench::int64_t warmup_run = 0; warmup_run < m_cold_warmup_runs; ++warmup_run)
+    {
+      this->launch_kernel(timer);
+      warmup_walltime_timer.stop();
+
+      if (m_cold_max_warmup_walltime > 0. &&
+          warmup_walltime_timer.get_duration() > m_cold_max_warmup_walltime)
+      {
+        break;
+      }
+    }
     this->check_skip_time(m_cuda_timer.get_duration());
   }
 
diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index e363a45..9ae1719 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -49,7 +49,7 @@ namespace fs = std::filesystem;
 #include <experimental/filesystem>
 namespace fs = std::experimental::filesystem;
 #else
-static_assert(false, "No <filesystem> or <experimental/filesystem> found.");
+#error "No <filesystem> or <experimental/filesystem> found."
 #endif
 
 #if NVBENCH_CPP_DIALECT >= 2020
@@ -429,9 +429,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
       bench["name"]  = bench_ptr->get_name();
       bench["index"] = bench_index;
 
-      bench["min_samples"] = bench_ptr->get_min_samples();
-      bench["skip_time"]   = bench_ptr->get_skip_time();
-      bench["timeout"]     = bench_ptr->get_timeout();
+      bench["min_samples"]              = bench_ptr->get_min_samples();
+      bench["cold_warmup_runs"]         = bench_ptr->get_cold_warmup_runs();
+      bench["cold_max_warmup_walltime"] = bench_ptr->get_cold_max_warmup_walltime();
+      bench["skip_time"]                = bench_ptr->get_skip_time();
+      bench["timeout"]                  = bench_ptr->get_timeout();
 
       auto &devices = bench["devices"];
       for (const auto &dev_info : bench_ptr->get_devices())
@@ -486,9 +488,11 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
 
         st["name"] = exec_state.get_axis_values_as_string();
 
-        st["min_samples"] = exec_state.get_min_samples();
-        st["skip_time"]   = exec_state.get_skip_time();
-        st["timeout"]     = exec_state.get_timeout();
+        st["min_samples"]              = exec_state.get_min_samples();
+        st["cold_warmup_runs"]         = exec_state.get_cold_warmup_runs();
+        st["cold_max_warmup_walltime"] = exec_state.get_cold_max_warmup_walltime();
+        st["skip_time"]                = exec_state.get_skip_time();
+        st["timeout"]                  = exec_state.get_timeout();
 
         st["device"]            = exec_state.get_device()->get_id();
         st["type_config_index"] = exec_state.get_type_config_index();
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 530ae94..8d3e1b3 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -48,9 +48,20 @@
 #include <stdexcept>
 #include <string>
 #include <string_view>
+#include <system_error>
 #include <tuple>
 #include <vector>
 
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#else
+#error "No <filesystem> or <experimental/filesystem> found."
+#endif
+
 namespace
 {
 
@@ -115,6 +126,18 @@ catch (const std::exception &)
 
 void parse(std::string_view input, std::string &val) { val = input; }
 
+void create_output_parent_directories(const std::string &spec)
+{
+  const fs::path output_path{spec};
+  const fs::path parent_path = output_path.parent_path();
+  if (parent_path.empty())
+  {
+    return;
+  }
+
+  fs::create_directories(parent_path);
+}
+
 // Parses a list of values "<val1>, <val2>, <val3>, ..." into a vector:
 template <typename T>
 std::vector<T> parse_list_values(std::string_view list_spec)
@@ -526,14 +549,14 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->update_axis(first[1]);
       first += 2;
     }
-    else if (arg == "--min-samples")
+    else if (arg == "--min-samples" || arg == "--cold-warmup-runs")
     {
       check_params(1);
       this->update_int64_prop(first[0], first[1]);
       first += 2;
     }
-    else if (arg == "--skip-time" || arg == "--timeout" || arg == "--throttle-threshold" ||
-             arg == "--throttle-recovery-delay")
+    else if (arg == "--skip-time" || arg == "--timeout" || arg == "--cold-max-warmup-walltime" ||
+             arg == "--throttle-threshold" || arg == "--throttle-recovery-delay")
     {
       check_params(1);
       this->update_float64_prop(first[0], first[1]);
@@ -622,6 +645,8 @@ std::ostream &option_parser::printer_spec_to_ostream(const std::string &spec)
   }
   else // spec is a filename:
   {
+    ::create_output_parent_directories(spec);
+
     auto file_stream = std::make_unique<std::ofstream>();
     // Throw if file can't open
     file_stream->exceptions(file_stream->exceptions() | std::ios::failbit);
@@ -990,6 +1015,10 @@ try
   {
     bench.set_min_samples(value);
   }
+  else if (prop_arg == "--cold-warmup-runs")
+  {
+    bench.set_cold_warmup_runs(value);
+  }
   else
   {
     NVBENCH_THROW(std::runtime_error, "Unrecognized property: `{}`", prop_arg);
@@ -1103,6 +1132,10 @@ try
   {
     bench.set_timeout(value);
   }
+  else if (prop_arg == "--cold-max-warmup-walltime")
+  {
+    bench.set_cold_max_warmup_walltime(value);
+  }
   else if (prop_arg == "--throttle-threshold")
   {
     bench.set_throttle_threshold(static_cast<nvbench::float32_t>(value) / 100.0f);
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index aad5a2c..506d1df 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -152,6 +152,26 @@ struct state
   void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
   /// @}
 
+  /// Execute this many warmup runs before collecting cold measurement samples. @{
+  [[nodiscard]] nvbench::int64_t get_cold_warmup_runs() const { return m_cold_warmup_runs; }
+  void set_cold_warmup_runs(nvbench::int64_t cold_warmup_runs)
+  {
+    m_cold_warmup_runs = cold_warmup_runs > nvbench::int64_t{0} ? cold_warmup_runs
+                                                                : nvbench::int64_t{1};
+  }
+  /// @}
+
+  /// Stop cold warmups after this many seconds of walltime. Negative values disable the limit. @{
+  [[nodiscard]] nvbench::float64_t get_cold_max_warmup_walltime() const
+  {
+    return m_cold_max_warmup_walltime;
+  }
+  void set_cold_max_warmup_walltime(nvbench::float64_t cold_max_warmup_walltime)
+  {
+    m_cold_max_warmup_walltime = cold_max_warmup_walltime;
+  }
+  /// @}
+
   [[nodiscard]] const nvbench::criterion_params &get_criterion_params() const
   {
     return m_criterion_params;
@@ -332,7 +352,9 @@ private:
   std::string m_stopping_criterion;
 
   nvbench::int64_t m_min_samples;
+  nvbench::int64_t m_cold_warmup_runs;
 
+  nvbench::float64_t m_cold_max_warmup_walltime;
   nvbench::float64_t m_skip_time;
   nvbench::float64_t m_timeout;
 
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index af53502..6aaa496 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -39,6 +39,8 @@ state::state(const benchmark_base &bench)
     , m_criterion_params{bench.get_criterion_params()}
     , m_stopping_criterion(bench.get_stopping_criterion())
     , m_min_samples{bench.get_min_samples()}
+    , m_cold_warmup_runs{bench.get_cold_warmup_runs()}
+    , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
     , m_skip_time{bench.get_skip_time()}
     , m_timeout{bench.get_timeout()}
     , m_throttle_threshold{bench.get_throttle_threshold()}
@@ -61,6 +63,8 @@ state::state(const benchmark_base &bench,
     , m_criterion_params{bench.get_criterion_params()}
     , m_stopping_criterion(bench.get_stopping_criterion())
     , m_min_samples{bench.get_min_samples()}
+    , m_cold_warmup_runs{bench.get_cold_warmup_runs()}
+    , m_cold_max_warmup_walltime{bench.get_cold_max_warmup_walltime()}
     , m_skip_time{bench.get_skip_time()}
     , m_timeout{bench.get_timeout()}
     , m_throttle_threshold{bench.get_throttle_threshold()}
diff --git a/testing/option_parser.cu b/testing/option_parser.cu
index 6104e5e..e665673 100644
--- a/testing/option_parser.cu
+++ b/testing/option_parser.cu
@@ -22,6 +22,21 @@
 
 #include <fmt/format.h>
 
+#include <chrono>
+#include <iostream>
+#include <stdexcept>
+#include <system_error>
+
+#if __has_include(<filesystem>)
+#include <filesystem>
+namespace fs = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#else
+#error "No <filesystem> or <experimental/filesystem> found."
+#endif
+
 #include "test_asserts.cuh"
 
 //==============================================================================
@@ -49,6 +64,40 @@ NVBENCH_BENCH_TYPES(TestBench, NVBENCH_TYPE_AXES(Ts, Us))
 namespace
 {
 
+struct temp_tree
+{
+  explicit temp_tree(fs::path root)
+      : root_path{std::move(root)}
+  {
+    std::error_code ec;
+    fs::remove_all(root_path, ec);
+    if (ec)
+    {
+      throw std::runtime_error{fmt::format("Failed to remove temporary directory `{}`: {}",
+                                           root_path.string(),
+                                           ec.message())};
+    }
+  }
+
+  ~temp_tree()
+  {
+    std::error_code ec;
+    fs::remove_all(root_path, ec);
+    if (ec)
+    {
+      std::cerr << "Failed to remove temporary directory `" << root_path.string()
+                << "`: " << ec.message() << "\n";
+    }
+  }
+
+  temp_tree(const temp_tree &)            = delete;
+  temp_tree(temp_tree &&)                 = delete;
+  temp_tree &operator=(const temp_tree &) = delete;
+  temp_tree &operator=(temp_tree &&)      = delete;
+
+  fs::path root_path;
+};
+
 [[nodiscard]] std::string states_to_string(const std::vector<nvbench::state> &states)
 {
   fmt::memory_buffer buffer;
@@ -1155,6 +1204,36 @@ void test_min_samples()
   ASSERT(states[0].get_min_samples() == 12345);
 }
 
+void test_cold_warmup_runs()
+{
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "12345"});
+    const auto &states = parser_to_states(parser);
+
+    ASSERT(states.size() == 1);
+    ASSERT(states[0].get_cold_warmup_runs() == 12345);
+  }
+
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "0"});
+    const auto &states = parser_to_states(parser);
+
+    ASSERT(states.size() == 1);
+    ASSERT(states[0].get_cold_warmup_runs() == 1);
+  }
+
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--benchmark", "DummyBench", "--cold-warmup-runs", "-12345"});
+    const auto &states = parser_to_states(parser);
+
+    ASSERT(states.size() == 1);
+    ASSERT(states[0].get_cold_warmup_runs() == 1);
+  }
+}
+
 void test_skip_time()
 {
   nvbench::option_parser parser;
@@ -1165,6 +1244,16 @@ void test_skip_time()
   ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.);
 }
 
+void test_cold_max_warmup_walltime()
+{
+  nvbench::option_parser parser;
+  parser.parse({"--benchmark", "DummyBench", "--cold-max-warmup-walltime", "12345e2"});
+  const auto &states = parser_to_states(parser);
+
+  ASSERT(states.size() == 1);
+  ASSERT(std::abs(states[0].get_cold_max_warmup_walltime() - 12345e2) < 1.);
+}
+
 void test_timeout()
 {
   nvbench::option_parser parser;
@@ -1175,6 +1264,22 @@ void test_timeout()
   ASSERT(std::abs(states[0].get_timeout() - 12345e2) < 1.);
 }
 
+void test_output_parent_directories_created()
+{
+  const auto unique_suffix = std::chrono::steady_clock::now().time_since_epoch().count();
+  const temp_tree temp{fs::temp_directory_path() /
+                       fmt::format("nvbench_option_parser_test_{}", unique_suffix)};
+  const auto output_path = temp.root_path / "nested" / "results.json";
+
+  {
+    nvbench::option_parser parser;
+    parser.parse({"--json", output_path.string()});
+  }
+
+  ASSERT(fs::is_directory(output_path.parent_path()));
+  ASSERT(fs::exists(output_path));
+}
+
 void test_stopping_criterion()
 {
   { // Per benchmark criterion
@@ -1466,8 +1571,11 @@ try
   test_axis_before_benchmark();
 
   test_min_samples();
+  test_cold_warmup_runs();
   test_skip_time();
+  test_cold_max_warmup_walltime();
   test_timeout();
+  test_output_parent_directories_created();
 
   test_stopping_criterion();
 
diff --git a/testing/state_generator.cu b/testing/state_generator.cu
index 14eceb5..383c8db 100644
--- a/testing/state_generator.cu
+++ b/testing/state_generator.cu
@@ -762,9 +762,11 @@ void test_devices()
 
 void test_termination_criteria()
 {
-  const nvbench::int64_t min_samples = 1000;
-  const nvbench::float64_t skip_time = 4000;
-  const nvbench::float64_t timeout   = 5000;
+  const nvbench::int64_t min_samples                = 1000;
+  const nvbench::int64_t cold_warmup_runs           = 7;
+  const nvbench::float64_t cold_max_warmup_walltime = 3000;
+  const nvbench::float64_t skip_time                = 4000;
+  const nvbench::float64_t timeout                  = 5000;
 
   // for comparing floats
   auto within_one = [](auto a, auto b) { return std::abs(a - b) < 1.; };
@@ -772,6 +774,8 @@ void test_termination_criteria()
   dummy_bench bench;
   bench.set_devices(std::vector<int>{});
   bench.set_min_samples(min_samples);
+  bench.set_cold_warmup_runs(cold_warmup_runs);
+  bench.set_cold_max_warmup_walltime(cold_max_warmup_walltime);
   bench.set_skip_time(skip_time);
   bench.set_timeout(timeout);
 
@@ -779,6 +783,8 @@ void test_termination_criteria()
 
   ASSERT(states.size() == 1);
   ASSERT(min_samples == states[0].get_min_samples());
+  ASSERT(cold_warmup_runs == states[0].get_cold_warmup_runs());
+  ASSERT(within_one(cold_max_warmup_walltime, states[0].get_cold_max_warmup_walltime()));
   ASSERT(within_one(skip_time, states[0].get_skip_time()));
   ASSERT(within_one(timeout, states[0].get_timeout()));
 }