From a487a38895550b24c988f6255bda1828cb6d493e Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Tue, 22 Apr 2025 14:37:16 -0700
Subject: [PATCH 1/7] Dump frequencies

---
 nvbench/detail/measure_cold.cu  |  3 ++
 nvbench/detail/measure_cold.cuh |  1 +
 nvbench/json_printer.cu         | 77 +++++++++++++++++++++++++++++++++
 nvbench/json_printer.cuh        |  1 +
 4 files changed, 82 insertions(+)

diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index d6f1dd4..e820110 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -86,6 +86,7 @@ void measure_cold_base::initialize()
   m_dynamic_throttle_recovery_delay = m_throttle_recovery_delay;
   m_throttle_discard_count          = 0;
 
+  m_sm_clock_rates.clear();
   m_cuda_times.clear();
   m_cpu_times.clear();
 
@@ -140,6 +141,7 @@ void measure_cold_base::record_measurements()
     }
     m_throttle_discard_count = 0;
 
+    m_sm_clock_rates.push_back(current_clock_rate);
     m_sm_clock_rate_accumulator += current_clock_rate;
   }
 
@@ -445,6 +447,7 @@ void measure_cold_base::generate_summaries()
                             m_total_samples));
 
     printer.process_bulk_data(m_state, "nv/cold/sample_times", "sample_times", m_cuda_times);
+    printer.process_bulk_data(m_state, "nv/cold/sample_freqs", "sample_freqs", m_sm_clock_rates);
   }
 }
 
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index f3fd80a..b93aae2 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -122,6 +122,7 @@ protected:
   nvbench::float64_t m_max_cpu_time{};
   nvbench::float64_t m_total_cpu_time{};
 
+  std::vector<nvbench::float64_t> m_sm_clock_rates{};
   nvbench::float64_t m_sm_clock_rate_accumulator{};
 
   std::vector<nvbench::float64_t> m_cuda_times;
diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index 085faae..b8e2a82 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -241,6 +241,91 @@ void json_printer::do_process_bulk_data_float64(state &state,
         fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000));
     }
   } // end hint == sample_times
+
+  // Sample SM clock rates: stored as little-endian float32 values in
+  // `<stream name>-freqs-bin/<N>.bin`, where N is a per-printer file counter.
+  if (hint == "sample_freqs")
+  {
+    nvbench::cpu_timer timer;
+    timer.start();
+
+    fs::path result_path{m_stream_name + "-freqs-bin/"};
+    try
+    {
+      if (!fs::exists(result_path))
+      {
+        if (!fs::create_directory(result_path))
+        {
+          NVBENCH_THROW(std::runtime_error,
+                        "Failed to create result directory '{}'.",
+                        result_path.string());
+        }
+      }
+      else if (!fs::is_directory(result_path))
+      {
+        NVBENCH_THROW(std::runtime_error,
+                      "'{}' exists and is not a directory.",
+                      result_path.string());
+      }
+
+      const auto file_id = m_num_jsonbin_freq_files++;
+      result_path /= fmt::format("{:d}.bin", file_id);
+
+      std::ofstream out;
+      out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
+      out.open(result_path, std::ios::binary | std::ios::out);
+
+      // FIXME: SLOW -- Writing the binary file, 4 bytes at a time...
+      // There are a lot of optimizations that could be done here if this ends
+      // up being a noticeable bottleneck.
+      for (auto value64 : data)
+      {
+        const auto value32 = static_cast<nvbench::float32_t>(value64);
+        char buffer[4];
+        std::memcpy(buffer, &value32, 4);
+        // the c++17 implementation of is_little_endian isn't constexpr, but
+        // all supported compilers optimize this branch as if it were.
+        if (!is_little_endian())
+        {
+          using std::swap;
+          swap(buffer[0], buffer[3]);
+          swap(buffer[1], buffer[2]);
+        }
+        out.write(buffer, 4);
+      }
+    }
+    catch (std::exception &e)
+    {
+      if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
+      {
+        auto &printer = printer_opt_ref.value().get();
+        printer.log(
+          nvbench::log_level::warn,
+          fmt::format("Error writing {} ({}) to {}: {}", tag, hint, result_path.string(), e.what()));
+      }
+    } // end catch
+
+    // NOTE: a failed write is only logged above and not rethrown, so the
+    // summary and the "Wrote ..." message below are emitted in that case too.
+    auto &summ = state.add_summary(fmt::format("nv/json/freqs-bin:{}", tag));
+    summ.set_string("name", "Samples Frequencies File");
+    summ.set_string("hint", "file/sample_freqs");
+    summ.set_string("description",
+                    "Binary file containing sample frequencies as little-endian "
+                    "float32.");
+    summ.set_string("filename", result_path.string());
+    summ.set_int64("size", static_cast<nvbench::int64_t>(data.size()));
+    summ.set_string("hide", "Not needed in table.");
+
+    timer.stop();
+    if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
+    {
+      auto &printer = printer_opt_ref.value().get();
+      printer.log(
+        nvbench::log_level::info,
+        fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000));
+    }
+  } // end hint == sample_freqs
 }
 
 static void add_devices_section(nlohmann::ordered_json &root)
diff --git a/nvbench/json_printer.cuh b/nvbench/json_printer.cuh
index ae361ff..eb062b9 100644
--- a/nvbench/json_printer.cuh
+++ b/nvbench/json_printer.cuh
@@ -73,6 +73,7 @@ protected:
 
   bool m_enable_binary_output{false};
   std::size_t m_num_jsonbin_files{};
+  std::size_t m_num_jsonbin_freq_files{};
 
   std::vector<std::string> m_argv;
 };

From 40b2f4ece262cc2fd6322bf92fd0b55b092f8d75 Mon Sep 17 00:00:00 2001
From: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Date: Sat, 3 May 2025 19:54:39 -0700
Subject: [PATCH 2/7] Better place to stop freq timer?

---
 nvbench/detail/measure_cold.cuh | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index b93aae2..786e9b7 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -137,25 +137,33 @@ struct measure_cold_base::kernel_launch_timer
       : m_measure{measure}
       , m_disable_blocking_kernel{measure.m_disable_blocking_kernel}
       , m_run_once{measure.m_run_once}
+      , m_check_throttling{measure.m_check_throttling}
   {}
 
   explicit kernel_launch_timer(measure_cold_base &measure,
                                bool disable_blocking_kernel,
-                               bool run_once)
+                               bool run_once,
+                               bool check_throttling)
       : m_measure{measure}
       , m_disable_blocking_kernel{disable_blocking_kernel}
       , m_run_once{run_once}
+      , m_check_throttling{check_throttling}
   {}
 
   __forceinline__ void start()
   {
     m_measure.flush_device_l2();
     m_measure.sync_stream();
+
+    // start CPU timer irrespective of use of blocking kernel
+    // Ref: https://github.com/NVIDIA/nvbench/issues/249
+    m_measure.m_cpu_timer.start();
+
     if (!m_disable_blocking_kernel)
     {
       m_measure.block_stream();
     }
-    if (m_measure.m_check_throttling)
+    if (m_check_throttling)
     {
       m_measure.gpu_frequency_start();
     }
@@ -164,22 +172,19 @@ struct measure_cold_base::kernel_launch_timer
       m_measure.profiler_start();
     }
     m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
-    // start CPU timer irrespective of use of blocking kernel
-    // Ref: https://github.com/NVIDIA/nvbench/issues/249
-    m_measure.m_cpu_timer.start();
   }
 
   __forceinline__ void stop()
   {
     m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
+    if (m_check_throttling)
+    {
+      m_measure.gpu_frequency_stop();
+    }
     if (!m_disable_blocking_kernel)
     {
       m_measure.unblock_stream();
     }
-    if (m_measure.m_check_throttling)
-    {
-      m_measure.gpu_frequency_stop();
-    }
     m_measure.sync_stream();
     if (m_run_once)
     {
@@ -192,6 +197,7 @@ private:
   measure_cold_base &m_measure;
   bool m_disable_blocking_kernel;
   bool m_run_once;
+  bool m_check_throttling;
 };
 
 template <typename KernelLauncher>
@@ -228,7 +234,7 @@ private:
     // disable use of blocking kernel for warm-up run
     // see https://github.com/NVIDIA/nvbench/issues/240
     constexpr bool disable_blocking_kernel = true;
-    kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once);
+    kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once, m_check_throttling);
 
     this->launch_kernel(timer);
     this->check_skip_time(m_cuda_timer.get_duration());
@@ -239,7 +245,7 @@ private:
     // do not use blocking kernel if benchmark is only run once, e.g., when profiling
     // ref: https://github.com/NVIDIA/nvbench/issue/242
     const bool disable_blocking_kernel = m_run_once || m_disable_blocking_kernel;
-    kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once);
+    kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once, m_check_throttling);
     do
     {
       this->launch_kernel(timer);

From 988420b5b166ec7cc2088f778d0623186546ba97 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Fri, 13 Feb 2026 10:19:06 -0600
Subject: [PATCH 3/7] Use write_out_values utility to save frequencies

The utility was already used to save times
---
 nvbench/json_printer.cu | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index b8e2a82..5cfc11a 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -106,7 +106,11 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values)
   } // end foreach value name
 }
 
-template <std::size_t buffer_nbytes>
+// choose buffer to be block size of modern SSD
+// see: https://github.com/NVIDIA/nvbench/issues/255
+static constexpr std::size_t preferred_buffer_nbytes = 4096;
+
+template <std::size_t buffer_nbytes = preferred_buffer_nbytes>
 void write_out_values(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
 {
   static constexpr std::size_t value_nbytes = sizeof(nvbench::float32_t);
@@ -206,10 +210,7 @@ void json_printer::do_process_bulk_data_float64(state &state,
       out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
       out.open(result_path, std::ios::binary | std::ios::out);
 
-      // choose buffer to be block size of modern SSD
-      // see: https://github.com/NVIDIA/nvbench/issues/255
-      constexpr std::size_t buffer_nbytes = 4096;
-      write_out_values<buffer_nbytes>(out, data);
+      write_out_values(out, data);
     }
     catch (std::exception &e)
     {
@@ -269,24 +270,7 @@ void json_printer::do_process_bulk_data_float64(state &state,
       out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
       out.open(result_path, std::ios::binary | std::ios::out);
 
-      // FIXME: SLOW -- Writing the binary file, 4 bytes at a time...
-      // There are a lot of optimizations that could be done here if this ends
-      // up being a noticeable bottleneck.
-      for (auto value64 : data)
-      {
-        const auto value32 = static_cast<nvbench::float32_t>(value64);
-        char buffer[4];
-        std::memcpy(buffer, &value32, 4);
-        // the c++17 implementation of is_little_endian isn't constexpr, but
-        // all supported compilers optimize this branch as if it were.
-        if (!is_little_endian())
-        {
-          using std::swap;
-          swap(buffer[0], buffer[3]);
-          swap(buffer[1], buffer[2]);
-        }
-        out.write(buffer, 4);
-      }
+      write_out_values(out, data);
     }
     catch (std::exception &e)
     {

From 4da9f431c0caf6e476d908df2ea9efbcd2869c32 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Thu, 19 Feb 2026 15:32:00 -0600
Subject: [PATCH 4/7] Templatize write_out_values for different storage formats

This could be used to save data as float32_t, or float64_t.
This flexibility is useful for experimentation.
---
 nvbench/json_printer.cu | 53 +++++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 10 deletions(-)

diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index 5cfc11a..4e17359 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -38,6 +38,7 @@
 #include <ostream>
 #include <stdexcept>
 #include <string>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -110,27 +111,45 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values)
 // see: https://github.com/NVIDIA/nvbench/issues/255
 static constexpr std::size_t preferred_buffer_nbytes = 4096;
 
-template <std::size_t buffer_nbytes = preferred_buffer_nbytes>
-void write_out_values(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
+template <std::size_t N, std::size_t... Is>
+void swap_bytes_impl(char *p, std::index_sequence<Is...>)
 {
-  static constexpr std::size_t value_nbytes = sizeof(nvbench::float32_t);
+  ((std::swap(p[Is], p[N - 1 - Is])), ...);
+}
+
+template <std::size_t WordSize>
+void big_endian_to_little_endian(char *word)
+{
+  if constexpr (WordSize > 1)
+  {
+    static_assert((WordSize & (WordSize - 1)) == 0, "WordSize must be a power of two");
+    swap_bytes_impl<WordSize>(word, std::make_index_sequence<WordSize / 2>{});
+  }
+}
+
+template <typename StorageT, std::size_t buffer_nbytes = preferred_buffer_nbytes>
+void write_out_values_as(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
+{
+  static_assert(std::is_floating_point_v<StorageT>);
+  static_assert(std::is_convertible_v<nvbench::float64_t, StorageT>);
+
+  static constexpr std::size_t value_nbytes = sizeof(StorageT);
   static_assert(buffer_nbytes % value_nbytes == 0);
 
-  alignas(alignof(nvbench::float32_t)) char buffer[buffer_nbytes];
+  alignas(alignof(StorageT)) char buffer[buffer_nbytes];
   std::size_t bytes_in_buffer = 0;
 
   for (auto value64 : data)
   {
-    const auto value32   = static_cast<nvbench::float32_t>(value64);
+    const auto value     = static_cast<StorageT>(value64);
     auto value_subbuffer = &buffer[bytes_in_buffer];
-    std::memcpy(value_subbuffer, &value32, value_nbytes);
+    std::memcpy(value_subbuffer, &value, value_nbytes);
 
     // the c++17 implementation of is_little_endian isn't constexpr, but
     // all supported compilers optimize this branch as if it were.
     if (!is_little_endian())
     {
-      std::swap(value_subbuffer[0], value_subbuffer[3]);
-      std::swap(value_subbuffer[1], value_subbuffer[2]);
+      big_endian_to_little_endian<value_nbytes>(value_subbuffer);
     }
     bytes_in_buffer += value_nbytes;
 
@@ -149,6 +168,20 @@ void write_out_values(std::ofstream &out, const std::vector<nvbench::float64_t>
   }
 }
 
+// save data using statically downcasting to float32 format
+template <std::size_t buffer_nbytes = preferred_buffer_nbytes>
+void write_out_values_as_float32(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
+{
+  write_out_values_as<nvbench::float32_t, buffer_nbytes>(out, data);
+}
+
+// save data using float64 format
+template <std::size_t buffer_nbytes = preferred_buffer_nbytes>
+void write_out_values_as_float64(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
+{
+  write_out_values_as<nvbench::float64_t, buffer_nbytes>(out, data);
+}
+
 } // end namespace
 
 namespace nvbench
@@ -210,7 +243,7 @@ void json_printer::do_process_bulk_data_float64(state &state,
       out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
       out.open(result_path, std::ios::binary | std::ios::out);
 
-      write_out_values(out, data);
+      write_out_values_as_float32(out, data);
     }
     catch (std::exception &e)
     {
@@ -270,7 +303,7 @@ void json_printer::do_process_bulk_data_float64(state &state,
       out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
       out.open(result_path, std::ios::binary | std::ios::out);
 
-      write_out_values(out, data);
+      write_out_values_as_float32(out, data);
     }
     catch (std::exception &e)
     {

From 731e0c2c3088e8a4370a2cf4d38a08e787a7bf99 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Thu, 19 Feb 2026 15:33:57 -0600
Subject: [PATCH 5/7] Swapped data members m_sm_clock_rates and
 m_sm_clock_rate_accumulator

This places all std::vector members together. Added default initialization
to all std::vector members and to all other members with default constructors.

The exceptions are the reference members and the nvbench::launch m_launch member.
---
 nvbench/detail/measure_cold.cuh | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 786e9b7..3a372d1 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -85,27 +85,27 @@ protected:
   nvbench::state &m_state;
 
   nvbench::launch m_launch;
-  nvbench::cuda_timer m_cuda_timer;
-  nvbench::cpu_timer m_cpu_timer;
-  nvbench::cpu_timer m_walltime_timer;
-  nvbench::detail::l2flush m_l2flush;
-  nvbench::blocking_kernel m_blocker;
+  nvbench::cuda_timer m_cuda_timer{};
+  nvbench::cpu_timer m_cpu_timer{};
+  nvbench::cpu_timer m_walltime_timer{};
+  nvbench::detail::l2flush m_l2flush{};
+  nvbench::blocking_kernel m_blocker{};
 
-  nvbench::criterion_params m_criterion_params;
+  nvbench::criterion_params m_criterion_params{};
   nvbench::stopping_criterion_base &m_stopping_criterion;
-  nvbench::detail::gpu_frequency m_gpu_frequency;
+  nvbench::detail::gpu_frequency m_gpu_frequency{};
 
   bool m_disable_blocking_kernel{false};
   bool m_run_once{false};
-  bool m_check_throttling;
+  bool m_check_throttling{true};
 
   nvbench::int64_t m_min_samples{};
 
   nvbench::float64_t m_skip_time{};
   nvbench::float64_t m_timeout{};
 
-  nvbench::float32_t m_throttle_threshold;      // [% of default SM clock rate]
-  nvbench::float32_t m_throttle_recovery_delay; // [seconds]
+  nvbench::float32_t m_throttle_threshold{};      // [% of default SM clock rate]
+  nvbench::float32_t m_throttle_recovery_delay{}; // [seconds]
 
   // Dynamically increased when repeated throttling occurs
   // without successfully recording a sample.
@@ -122,13 +122,13 @@ protected:
   nvbench::float64_t m_max_cpu_time{};
   nvbench::float64_t m_total_cpu_time{};
 
-  std::vector<nvbench::float64_t> m_sm_clock_rates{};
   nvbench::float64_t m_sm_clock_rate_accumulator{};
+  std::vector<nvbench::float64_t> m_sm_clock_rates{};
 
-  std::vector<nvbench::float64_t> m_cuda_times;
-  std::vector<nvbench::float64_t> m_cpu_times;
+  std::vector<nvbench::float64_t> m_cuda_times{};
+  std::vector<nvbench::float64_t> m_cpu_times{};
 
-  bool m_max_time_exceeded{};
+  bool m_max_time_exceeded{false};
 };
 
 struct measure_cold_base::kernel_launch_timer

From 998ab125ce8dcc6333eebe30d1c812ebdc6b9507 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Fri, 20 Feb 2026 16:34:53 -0600
Subject: [PATCH 6/7] Don't override m_check_throttling if throttling threshold
 is non-positive

m_check_throttling in measure_cold_base now depends only on the run-once
setting of the state, not on the throttle threshold. This ensures that when
`--jsonbin` is specified, frequency data corresponding to the timing data are
available to write out.
---
 nvbench/detail/measure_cold.cu  |  2 +-
 nvbench/detail/measure_cold.cuh | 11 +++++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index e820110..fecff8b 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -44,7 +44,7 @@ measure_cold_base::measure_cold_base(state &exec_state)
         exec_state.get_stopping_criterion())}
     , m_disable_blocking_kernel{exec_state.get_disable_blocking_kernel()}
     , m_run_once{exec_state.get_run_once()}
-    , m_check_throttling(!exec_state.get_run_once() && exec_state.get_throttle_threshold() > 0.f)
+    , m_check_throttling(!exec_state.get_run_once())
     , m_min_samples{exec_state.get_min_samples()}
     , m_skip_time{exec_state.get_skip_time()}
     , m_timeout{exec_state.get_timeout()}
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 3a372d1..edd24fe 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -140,6 +140,13 @@ struct measure_cold_base::kernel_launch_timer
       , m_check_throttling{measure.m_check_throttling}
   {}
 
+  explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel)
+      : m_measure{measure}
+      , m_disable_blocking_kernel{disable_blocking_kernel}
+      , m_run_once{measure.m_run_once}
+      , m_check_throttling{measure.m_check_throttling}
+  {}
+
   explicit kernel_launch_timer(measure_cold_base &measure,
                                bool disable_blocking_kernel,
                                bool run_once,
@@ -234,7 +241,7 @@ private:
     // disable use of blocking kernel for warm-up run
     // see https://github.com/NVIDIA/nvbench/issues/240
     constexpr bool disable_blocking_kernel = true;
-    kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once, m_check_throttling);
+    kernel_launch_timer timer(*this, disable_blocking_kernel);
 
     this->launch_kernel(timer);
     this->check_skip_time(m_cuda_timer.get_duration());
@@ -245,7 +252,7 @@ private:
     // do not use blocking kernel if benchmark is only run once, e.g., when profiling
     // ref: https://github.com/NVIDIA/nvbench/issue/242
     const bool disable_blocking_kernel = m_run_once || m_disable_blocking_kernel;
-    kernel_launch_timer timer(*this, disable_blocking_kernel, m_run_once, m_check_throttling);
+    kernel_launch_timer timer(*this, disable_blocking_kernel);
     do
     {
       this->launch_kernel(timer);

From c9705de4a4b4f4f50aef59b3e1a466da6579a0f6 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Fri, 27 Feb 2026 12:49:35 -0600
Subject: [PATCH 7/7] Reserve enough space for clock rates for min samples, if
 specified

---
 nvbench/detail/measure_cold.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index fecff8b..b6a4c34 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -53,8 +53,10 @@ measure_cold_base::measure_cold_base(state &exec_state)
 {
   if (m_min_samples > 0)
   {
-    m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
-    m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
+    const auto reserve_size = static_cast<std::size_t>(m_min_samples);
+    m_sm_clock_rates.reserve(reserve_size);
+    m_cuda_times.reserve(reserve_size);
+    m_cpu_times.reserve(reserve_size);
   }
 }