diff --git a/docs/cli_help.md b/docs/cli_help.md
index d6af964..336a826 100644
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -134,7 +134,7 @@
     before any `--benchmark` arguments.
 
 * `--throttle-threshold <value>`
-  * Set the GPU throttle threshold as percentage of the peak clock rate.
+  * Set the GPU throttle threshold as percentage of the device's default clock rate.
   * Default is 75%.
   * Applies to the most recent `--benchmark`, or all benchmarks if specified
     before any `--benchmark` arguments.
diff --git a/examples/summaries.cu b/examples/summaries.cu
index ad4fc06..0bb91ae 100644
--- a/examples/summaries.cu
+++ b/examples/summaries.cu
@@ -41,7 +41,7 @@ void summary_example(nvbench::state &state)
   }
 
   // Run the measurements:
-  state.exec([duration](nvbench::launch &launch) {
+  state.exec(nvbench::exec_tag::no_batch, [duration](nvbench::launch &launch) {
     nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration);
   });
 
@@ -56,14 +56,17 @@ void summary_example(nvbench::state &state)
 #endif
 
   // Default summary columns can be shown/hidden in the markdown output tables by adding/removing
-  // the "hide" key. Modify this benchmark to show the minimum and maximum times, but hide the
-  // means.
+  // the "hide" key. Modify this benchmark to show the minimum and maximum GPUs times, but hide the
+  // mean GPU time and all CPU times. SM Clock frequency and throttling info are also shown.
   state.get_summary("nv/cold/time/gpu/min").remove_value("hide");
   state.get_summary("nv/cold/time/gpu/max").remove_value("hide");
   state.get_summary("nv/cold/time/gpu/mean").set_string("hide", "");
-  state.get_summary("nv/cold/time/cpu/min").remove_value("hide");
-  state.get_summary("nv/cold/time/cpu/max").remove_value("hide");
   state.get_summary("nv/cold/time/cpu/mean").set_string("hide", "");
+  state.get_summary("nv/cold/time/cpu/min").set_string("hide", "");
+  state.get_summary("nv/cold/time/cpu/max").set_string("hide", "");
+  state.get_summary("nv/cold/time/cpu/stdev/relative").set_string("hide", "");
+  state.get_summary("nv/cold/sm_clock_rate/mean").remove_value("hide");
+  state.get_summary("nv/cold/sm_clock_rate/scaling/percent").remove_value("hide");
 }
 NVBENCH_BENCH(summary_example)
   .add_int64_axis("ms", nvbench::range(10, 50, 20))
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index d35779c..58c55b6 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -302,7 +302,7 @@ protected:
   nvbench::float64_t m_skip_time{-1.};
   nvbench::float64_t m_timeout{15.};
 
-  nvbench::float32_t m_throttle_threshold{0.75f};      // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_threshold{0.75f};      // [% of default SM clock rate]
   nvbench::float32_t m_throttle_recovery_delay{0.05f}; // [seconds]
 
   nvbench::criterion_params m_criterion_params;
diff --git a/nvbench/detail/gpu_frequency.cuh b/nvbench/detail/gpu_frequency.cuh
index 320d3d2..dc0dcd7 100644
--- a/nvbench/detail/gpu_frequency.cuh
+++ b/nvbench/detail/gpu_frequency.cuh
@@ -40,7 +40,7 @@ struct gpu_frequency
 
   void stop(const nvbench::cuda_stream &stream) { m_stop.record(stream); }
 
-  [[nodiscard]] bool has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
+  [[nodiscard]] bool has_throttled(nvbench::float32_t default_sm_clock_rate_hz,
                                    nvbench::float32_t throttle_threshold);
 
   [[nodiscard]] nvbench::float32_t get_clock_frequency();
diff --git a/nvbench/detail/gpu_frequency.cxx b/nvbench/detail/gpu_frequency.cxx
index 8f2d19b..adc1011 100644
--- a/nvbench/detail/gpu_frequency.cxx
+++ b/nvbench/detail/gpu_frequency.cxx
@@ -31,10 +31,10 @@ nvbench::float32_t gpu_frequency::get_clock_frequency()
   return clock_rate;
 }
 
-bool gpu_frequency::has_throttled(nvbench::float32_t peak_sm_clock_rate_hz,
+bool gpu_frequency::has_throttled(nvbench::float32_t default_sm_clock_rate_hz,
                                   nvbench::float32_t throttle_threshold)
 {
-  float threshold = peak_sm_clock_rate_hz * throttle_threshold;
+  float threshold = default_sm_clock_rate_hz * throttle_threshold;
 
   if (this->get_clock_frequency() < threshold)
   {
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 2bbf0e0..a602062 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -53,7 +53,6 @@ measure_cold_base::measure_cold_base(state &exec_state)
   {
     m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
     m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
-    m_sm_clock_rates.reserve(static_cast<std::size_t>(m_min_samples));
   }
 }
 
@@ -72,18 +71,18 @@ void measure_cold_base::check()
 
 void measure_cold_base::initialize()
 {
-  m_min_cuda_time     = std::numeric_limits<nvbench::float64_t>::max();
-  m_max_cuda_time     = std::numeric_limits<nvbench::float64_t>::lowest();
-  m_total_cuda_time   = 0.;
-  m_min_cpu_time      = std::numeric_limits<nvbench::float64_t>::max();
-  m_max_cpu_time      = std::numeric_limits<nvbench::float64_t>::lowest();
-  m_total_cpu_time    = 0.;
-  m_total_samples     = 0;
-  m_max_time_exceeded = false;
+  m_min_cuda_time             = std::numeric_limits<nvbench::float64_t>::max();
+  m_max_cuda_time             = std::numeric_limits<nvbench::float64_t>::lowest();
+  m_total_cuda_time           = 0.;
+  m_min_cpu_time              = std::numeric_limits<nvbench::float64_t>::max();
+  m_max_cpu_time              = std::numeric_limits<nvbench::float64_t>::lowest();
+  m_total_cpu_time            = 0.;
+  m_sm_clock_rate_accumulator = 0.;
+  m_total_samples             = 0;
+  m_max_time_exceeded         = false;
 
   m_cuda_times.clear();
   m_cpu_times.clear();
-  m_sm_clock_rates.clear();
 
   m_stopping_criterion.initialize(m_criterion_params);
 }
@@ -94,21 +93,22 @@ void measure_cold_base::record_measurements()
 {
   if (!m_run_once)
   {
-    auto peak_clock_rate = static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
+    const auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
+    const auto default_clock_rate =
+      static_cast<float>(m_state.get_device()->get_sm_default_clock_rate());
 
-    if (m_gpu_frequency.has_throttled(peak_clock_rate, m_throttle_threshold))
+    if (m_gpu_frequency.has_throttled(default_clock_rate, m_throttle_threshold))
     {
       if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
       {
-        auto current_clock_rate = m_gpu_frequency.get_clock_frequency();
-        auto &printer           = printer_opt_ref.value().get();
+        auto &printer = printer_opt_ref.value().get();
         printer.log(nvbench::log_level::warn,
                     fmt::format("GPU throttled below threshold ({:0.2f} MHz / {:0.2f} MHz) "
                                 "({:0.0f}% < {:0.0f}%) on sample {}. Discarding previous sample "
                                 "and pausing for {}s.",
                                 current_clock_rate / 1000000.0f,
-                                peak_clock_rate / 1000000.0f,
-                                100.0f * (current_clock_rate / peak_clock_rate),
+                                default_clock_rate / 1000000.0f,
+                                100.0f * (current_clock_rate / default_clock_rate),
                                 100.0f * m_throttle_threshold,
                                 m_total_samples,
                                 m_throttle_recovery_delay));
@@ -123,7 +123,7 @@ void measure_cold_base::record_measurements()
       return;
     }
 
-    m_sm_clock_rates.push_back(peak_clock_rate);
+    m_sm_clock_rate_accumulator += current_clock_rate;
   }
 
   // Update and record timers and counters:
@@ -338,16 +338,30 @@ void measure_cold_base::generate_summaries()
     summ.set_string("hide", "Hidden by default.");
   }
 
-  if (!m_sm_clock_rates.empty())
+  if (m_sm_clock_rate_accumulator != 0.)
   {
-    auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
-    summ.set_string("name", "Clock Rate");
-    summ.set_string("hint", "frequency");
-    summ.set_string("description", "Mean SM clock rate");
-    summ.set_string("hide", "Hidden by default.");
-    summ.set_float64("value",
-                     nvbench::detail::statistics::compute_mean(m_sm_clock_rates.cbegin(),
-                                                               m_sm_clock_rates.cend()));
+    const auto clock_mean = m_sm_clock_rate_accumulator / d_samples;
+
+    {
+      auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/mean");
+      summ.set_string("name", "Clock Rate");
+      summ.set_string("hint", "frequency");
+      summ.set_string("description", "Mean SM clock rate");
+      summ.set_string("hide", "Hidden by default.");
+      summ.set_float64("value", clock_mean);
+    }
+
+    {
+      const auto default_clock_rate =
+        static_cast<nvbench::float64_t>(m_state.get_device()->get_sm_default_clock_rate());
+
+      auto &summ = m_state.add_summary("nv/cold/sm_clock_rate/scaling/percent");
+      summ.set_string("name", "Clock Scaling");
+      summ.set_string("hint", "percentage");
+      summ.set_string("description", "Mean SM clock rate as a percentage of default clock rate.");
+      summ.set_string("hide", "Hidden by default.");
+      summ.set_float64("value", clock_mean / default_clock_rate);
+    }
   }
 
   // Log if a printer exists:
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 2138b3d..26eee62 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -101,7 +101,7 @@ protected:
   nvbench::float64_t m_skip_time{};
   nvbench::float64_t m_timeout{};
 
-  nvbench::float32_t m_throttle_threshold;      // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_threshold;      // [% of default SM clock rate]
   nvbench::float32_t m_throttle_recovery_delay; // [seconds]
 
   nvbench::int64_t m_total_samples{};
@@ -114,9 +114,10 @@ protected:
   nvbench::float64_t m_max_cpu_time{};
   nvbench::float64_t m_total_cpu_time{};
 
+  nvbench::float64_t m_sm_clock_rate_accumulator{};
+
   std::vector<nvbench::float64_t> m_cuda_times;
   std::vector<nvbench::float64_t> m_cpu_times;
-  std::vector<nvbench::float32_t> m_sm_clock_rates;
 
   bool m_max_time_exceeded{};
 };
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 9b0a5c1..0691dc6 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -331,7 +331,7 @@ private:
   nvbench::float64_t m_skip_time;
   nvbench::float64_t m_timeout;
 
-  nvbench::float32_t m_throttle_threshold;      // [% of peak SM clock rate]
+  nvbench::float32_t m_throttle_threshold;      // [% of default SM clock rate]
   nvbench::float32_t m_throttle_recovery_delay; // [seconds]
 
   // Deadlock protection. See blocking_kernel's class doc for details.