Refactor measurement cleanup guards for testability

Extract hot stream cleanup and cold launch timer cleanup into reusable
detail helpers. Keep measure_hot and measure_cold using those helpers through
thin adapters so the tested cleanup logic matches the production path.

Add driver-free cleanup guard tests using a fake measure object to verify
cleanup ordering when exceptions occur after blocking stream setup, after hot
unblock, and around cold GPU frequency start/stop paths.
This commit is contained in:
Oleksandr Pavlyk
2026-05-13 11:09:12 -05:00
parent 0d64864ac3
commit e2763ca136
6 changed files with 573 additions and 198 deletions

View File

@@ -35,6 +35,7 @@
#include <nvbench/detail/gpu_frequency.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/l2flush.cuh>
#include <nvbench/detail/measure_cold_launch_timer_core.cuh>
#include <nvbench/detail/statistics.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh>
@@ -146,48 +147,18 @@ protected:
struct measure_cold_base::kernel_launch_timer
{
private:
__forceinline__ void cleanup_noexcept() noexcept;
struct cleanup_guard
{
explicit cleanup_guard(kernel_launch_timer &timer)
: m_timer{timer}
{}
cleanup_guard(const cleanup_guard &) = delete;
cleanup_guard(cleanup_guard &&) = delete;
cleanup_guard &operator=(const cleanup_guard &) = delete;
cleanup_guard &operator=(cleanup_guard &&) = delete;
~cleanup_guard() noexcept
{
if (m_active)
{
m_timer.cleanup_noexcept();
}
}
void release() noexcept { m_active = false; }
private:
kernel_launch_timer &m_timer;
bool m_active{true};
};
public:
kernel_launch_timer(measure_cold_base &measure)
: m_measure{measure}
, m_disable_blocking_kernel{measure.m_disable_blocking_kernel}
, m_run_once{measure.m_run_once}
, m_check_throttling{measure.m_check_throttling}
: kernel_launch_timer{measure,
measure.m_disable_blocking_kernel,
measure.m_run_once,
measure.m_check_throttling}
{}
explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel)
: m_measure{measure}
, m_disable_blocking_kernel{disable_blocking_kernel}
, m_run_once{measure.m_run_once}
, m_check_throttling{measure.m_check_throttling}
: kernel_launch_timer{measure,
disable_blocking_kernel,
measure.m_run_once,
measure.m_check_throttling}
{}
explicit kernel_launch_timer(measure_cold_base &measure,
@@ -195,125 +166,69 @@ public:
bool run_once,
bool check_throttling)
: m_measure{measure}
, m_disable_blocking_kernel{disable_blocking_kernel}
, m_run_once{run_once}
, m_check_throttling{check_throttling}
, m_core{*this, {disable_blocking_kernel, run_once, check_throttling}}
{}
~kernel_launch_timer() noexcept { this->cleanup_noexcept(); }
kernel_launch_timer(const kernel_launch_timer &) = delete;
kernel_launch_timer(kernel_launch_timer &&) = delete;
kernel_launch_timer &operator=(const kernel_launch_timer &) = delete;
kernel_launch_timer &operator=(kernel_launch_timer &&) = delete;
__forceinline__ void start()
~kernel_launch_timer() noexcept = default;
__forceinline__ void start() { m_core.start(); }
__forceinline__ void stop() { m_core.stop(); }
__forceinline__ void flush_device_l2() { m_measure.flush_device_l2(); }
__forceinline__ void sync_stream() { m_measure.sync_stream(); }
__forceinline__ cudaError_t sync_stream_noexcept() const noexcept
{
cleanup_guard cleanup{*this};
m_measure.flush_device_l2();
m_measure.sync_stream();
// start CPU timer irrespective of use of blocking kernel
// Ref: https://github.com/NVIDIA/nvbench/issues/249
m_measure.m_cpu_timer.start();
m_cpu_timer_started = true;
if (!m_disable_blocking_kernel)
{
// Arm cleanup before queueing the blocking kernel. If block_stream throws
// after queueing work, cleanup_noexcept must still unblock the stream.
m_stream_unblock_armed = true;
m_measure.block_stream();
}
if (m_check_throttling)
{
// Arm cleanup before queueing timestamp work. If gpu_frequency_start
// throws after queueing work, cleanup_noexcept must still sync the stream.
m_gpu_frequency_cleanup_armed = true;
m_measure.gpu_frequency_start();
}
if (m_run_once)
{
m_measure.profiler_start();
m_profiler_started = true;
}
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
m_cuda_timer_started = true;
cleanup.release();
return m_measure.sync_stream_noexcept();
}
__forceinline__ void stop()
__forceinline__ void cpu_timer_start() noexcept { m_measure.m_cpu_timer.start(); }
__forceinline__ void cpu_timer_stop() noexcept { m_measure.m_cpu_timer.stop(); }
__forceinline__ void cpu_timer_stop_noexcept() noexcept { m_measure.m_cpu_timer.stop(); }
__forceinline__ void block_stream() { m_measure.block_stream(); }
__forceinline__ void unblock_stream() { m_measure.unblock_stream(); }
__forceinline__ void unblock_stream_noexcept() noexcept { m_measure.unblock_stream_noexcept(); }
__forceinline__ void gpu_frequency_start() { m_measure.gpu_frequency_start(); }
__forceinline__ void gpu_frequency_stop() { m_measure.gpu_frequency_stop(); }
__forceinline__ void profiler_start() { m_measure.profiler_start(); }
__forceinline__ void profiler_stop() { m_measure.profiler_stop(); }
__forceinline__ cudaError_t profiler_stop_noexcept() const noexcept
{
cleanup_guard cleanup{*this};
return m_measure.profiler_stop_noexcept();
}
if (m_cuda_timer_started)
{
m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
m_cuda_timer_started = false;
}
if (m_gpu_frequency_cleanup_armed)
{
m_measure.gpu_frequency_stop();
m_gpu_frequency_cleanup_armed = false;
}
if (m_stream_unblock_armed)
{
m_measure.unblock_stream();
m_stream_unblock_armed = false;
}
m_measure.sync_stream();
if (m_profiler_started)
{
m_measure.profiler_stop();
m_profiler_started = false;
}
if (m_cpu_timer_started)
{
m_measure.m_cpu_timer.stop();
m_cpu_timer_started = false;
}
__forceinline__ void cuda_timer_start()
{
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
}
cleanup.release();
__forceinline__ void cuda_timer_stop()
{
m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
}
private:
measure_cold_base &m_measure;
bool m_disable_blocking_kernel;
bool m_run_once;
bool m_check_throttling;
bool m_cpu_timer_started{false};
bool m_stream_unblock_armed{false};
bool m_gpu_frequency_cleanup_armed{false};
bool m_profiler_started{false};
bool m_cuda_timer_started{false};
nvbench::detail::measure_cold_launch_timer_core<kernel_launch_timer> m_core;
};
__forceinline__ void measure_cold_base::kernel_launch_timer::cleanup_noexcept() noexcept
{
const bool sync_armed = m_stream_unblock_armed || m_cuda_timer_started ||
m_gpu_frequency_cleanup_armed;
if (m_stream_unblock_armed)
{
m_measure.unblock_stream_noexcept();
m_stream_unblock_armed = false;
}
if (sync_armed)
{
(void)m_measure.sync_stream_noexcept();
}
if (m_profiler_started)
{
(void)m_measure.profiler_stop_noexcept();
m_profiler_started = false;
}
if (m_cpu_timer_started)
{
m_measure.m_cpu_timer.stop();
m_cpu_timer_started = false;
}
m_cuda_timer_started = false;
m_gpu_frequency_cleanup_armed = false;
}
template <typename KernelLauncher>
struct measure_cold : public measure_cold_base
{

View File

@@ -0,0 +1,183 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#pragma once
#include <nvbench/config.cuh>
#if defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_GCC)
#pragma GCC system_header
#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_CLANG)
#pragma clang system_header
#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_MSVC)
#pragma system_header
#endif
namespace nvbench::detail
{

// Flags controlling which setup/teardown paths measure_cold_launch_timer_core
// arms around the timed region. Member order is part of the interface:
// callers aggregate-initialize positionally
// ({disable_blocking_kernel, run_once, check_throttling}).
struct measure_cold_launch_timer_config
{
  bool disable_blocking_kernel{false}; // skip queueing the blocking kernel in start()
  bool run_once{false};                // bracket the region with profiler_start/profiler_stop
  bool check_throttling{true};         // bracket the region with gpu_frequency_start/stop
};

// Core of measure_cold's kernel launch timer, with every device interaction
// delegated to the Measure object (flush_device_l2, sync_stream, block_stream,
// cuda_timer_start, ...). That indirection lets tests drive this logic with a
// fake measure and no CUDA driver. The m_*_started / m_*_armed flags record
// exactly which queued work must be undone if an exception unwinds
// mid-sequence; cleanup_noexcept consumes those flags via the Measure's
// noexcept entry points.
template <typename Measure>
struct measure_cold_launch_timer_core
{
private:
  // Best-effort teardown used on exceptions (via cleanup_guard) and from the
  // destructor. Order matters: the stream is unblocked before any sync,
  // otherwise the sync would wait forever on the blocking kernel. Only
  // noexcept Measure entry points are used here.
  void cleanup_noexcept() noexcept
  {
    // A sync is needed whenever any asynchronous work was queued on the
    // stream (blocking kernel, CUDA timer start, or frequency timestamps).
    const bool sync_armed = m_stream_unblock_armed || m_cuda_timer_started ||
                            m_gpu_frequency_cleanup_armed;
    if (m_stream_unblock_armed)
    {
      m_measure.unblock_stream_noexcept();
      m_stream_unblock_armed = false;
    }
    if (sync_armed)
    {
      (void)m_measure.sync_stream_noexcept();
    }
    if (m_profiler_started)
    {
      (void)m_measure.profiler_stop_noexcept();
      m_profiler_started = false;
    }
    if (m_cpu_timer_started)
    {
      m_measure.cpu_timer_stop_noexcept();
      m_cpu_timer_started = false;
    }
    m_cuda_timer_started          = false;
    m_gpu_frequency_cleanup_armed = false;
  }

  // Scope guard that runs cleanup_noexcept on destruction unless release()
  // was called, so an exception anywhere inside start()/stop() still tears
  // the timer state down.
  struct cleanup_guard
  {
    explicit cleanup_guard(measure_cold_launch_timer_core &timer)
      : m_timer{timer}
    {}

    cleanup_guard(const cleanup_guard &)            = delete;
    cleanup_guard(cleanup_guard &&)                 = delete;
    cleanup_guard &operator=(const cleanup_guard &) = delete;
    cleanup_guard &operator=(cleanup_guard &&)      = delete;

    ~cleanup_guard() noexcept
    {
      if (m_active)
      {
        m_timer.cleanup_noexcept();
      }
    }

    // Disarm the guard on the success path.
    void release() noexcept { m_active = false; }

  private:
    measure_cold_launch_timer_core &m_timer;
    bool m_active{true};
  };

public:
  explicit measure_cold_launch_timer_core(Measure &measure, measure_cold_launch_timer_config config)
    : m_measure{measure}
    , m_disable_blocking_kernel{config.disable_blocking_kernel}
    , m_run_once{config.run_once}
    , m_check_throttling{config.check_throttling}
  {}

  measure_cold_launch_timer_core(const measure_cold_launch_timer_core &)            = delete;
  measure_cold_launch_timer_core(measure_cold_launch_timer_core &&)                 = delete;
  measure_cold_launch_timer_core &operator=(const measure_cold_launch_timer_core &) = delete;
  measure_cold_launch_timer_core &operator=(measure_cold_launch_timer_core &&)      = delete;

  // If start() succeeded but stop() never completed, the destructor performs
  // the remaining teardown; it is a no-op when every state flag is clear.
  ~measure_cold_launch_timer_core() noexcept { this->cleanup_noexcept(); }

  // Arm and start the measurement machinery in order: L2 flush, stream sync,
  // CPU timer, optional blocking kernel, optional frequency timestamps,
  // optional profiler, CUDA timer. Each armed/started flag is set before the
  // corresponding Measure call so cleanup still runs if that call throws
  // after queueing work.
  void start()
  {
    cleanup_guard cleanup{*this};
    m_measure.flush_device_l2();
    m_measure.sync_stream();
    // Start CPU timer irrespective of use of blocking kernel.
    // Ref: https://github.com/NVIDIA/nvbench/issues/249
    m_measure.cpu_timer_start();
    m_cpu_timer_started = true;
    if (!m_disable_blocking_kernel)
    {
      // Arm cleanup before queueing the blocking kernel. If block_stream throws
      // after queueing work, cleanup_noexcept must still unblock the stream.
      m_stream_unblock_armed = true;
      m_measure.block_stream();
    }
    if (m_check_throttling)
    {
      // Arm cleanup before queueing timestamp work. If gpu_frequency_start
      // throws after queueing work, cleanup_noexcept must still sync the stream.
      m_gpu_frequency_cleanup_armed = true;
      m_measure.gpu_frequency_start();
    }
    if (m_run_once)
    {
      m_measure.profiler_start();
      m_profiler_started = true;
    }
    m_measure.cuda_timer_start();
    m_cuda_timer_started = true;
    cleanup.release();
  }

  // Success-path teardown: CUDA timer, frequency timestamps, stream unblock,
  // stream sync, profiler, CPU timer. Each flag is cleared only after its
  // Measure call returns, so a throw mid-sequence leaves the remaining flags
  // set for cleanup_noexcept.
  void stop()
  {
    cleanup_guard cleanup{*this};
    if (m_cuda_timer_started)
    {
      m_measure.cuda_timer_stop();
      m_cuda_timer_started = false;
    }
    if (m_gpu_frequency_cleanup_armed)
    {
      m_measure.gpu_frequency_stop();
      m_gpu_frequency_cleanup_armed = false;
    }
    if (m_stream_unblock_armed)
    {
      m_measure.unblock_stream();
      m_stream_unblock_armed = false;
    }
    m_measure.sync_stream();
    if (m_profiler_started)
    {
      m_measure.profiler_stop();
      m_profiler_started = false;
    }
    if (m_cpu_timer_started)
    {
      m_measure.cpu_timer_stop();
      m_cpu_timer_started = false;
    }
    cleanup.release();
  }

private:
  Measure &m_measure;

  // Configuration, fixed at construction.
  bool m_disable_blocking_kernel;
  bool m_run_once;
  bool m_check_throttling;

  // Cleanup state: which queued/started work must be undone on failure.
  bool m_cpu_timer_started{false};
  bool m_stream_unblock_armed{false};
  bool m_gpu_frequency_cleanup_armed{false};
  bool m_profiler_started{false};
  bool m_cuda_timer_started{false};
};

} // namespace nvbench::detail

View File

@@ -32,6 +32,7 @@
#include <nvbench/cpu_timer.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_timer.cuh>
#include <nvbench/detail/stream_cleanup_guard.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
@@ -57,6 +58,8 @@ struct measure_hot_base
measure_hot_base &operator=(measure_hot_base &&) = delete;
protected:
friend struct nvbench::detail::stream_cleanup_guard<measure_hot_base>;
void check();
void initialize()
@@ -82,60 +85,6 @@ protected:
__forceinline__ void sync_stream() const { NVBENCH_CUDA_CALL(this->sync_stream_noexcept()); }
struct stream_cleanup_guard
{
explicit stream_cleanup_guard(measure_hot_base &measure)
: m_measure{measure}
{
m_sync_armed = true;
}
stream_cleanup_guard(const stream_cleanup_guard &) = delete;
stream_cleanup_guard(stream_cleanup_guard &&) = delete;
stream_cleanup_guard &operator=(const stream_cleanup_guard &) = delete;
stream_cleanup_guard &operator=(stream_cleanup_guard &&) = delete;
~stream_cleanup_guard() noexcept
{
if (m_unblock_armed)
{
m_measure.unblock_stream_noexcept();
}
if (m_sync_armed)
{
(void)m_measure.sync_stream_noexcept();
}
}
void block_stream()
{
// Arm cleanup before queueing the blocking kernel. If block_stream throws
// after queueing work, the destructor must still unblock the stream.
m_unblock_armed = true;
m_measure.block_stream();
}
void unblock()
{
if (m_unblock_armed)
{
m_measure.unblock_stream();
m_unblock_armed = false;
}
}
void release() noexcept
{
m_unblock_armed = false;
m_sync_armed = false;
}
private:
measure_hot_base &m_measure;
bool m_unblock_armed{false};
bool m_sync_armed{false};
};
nvbench::state &m_state;
nvbench::launch m_launch;
@@ -178,7 +127,7 @@ private:
// measurement.
void run_warmup()
{
stream_cleanup_guard cleanup{*this};
nvbench::detail::stream_cleanup_guard<measure_hot_base> cleanup{*this};
m_cuda_timer.start(m_launch.get_stream());
this->launch_kernel();
@@ -204,7 +153,7 @@ private:
{
batch_size = std::max(batch_size, nvbench::int64_t{1});
stream_cleanup_guard cleanup{*this};
nvbench::detail::stream_cleanup_guard<measure_hot_base> cleanup{*this};
if (!m_disable_blocking_kernel)
{

View File

@@ -0,0 +1,74 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#pragma once
#include <nvbench/config.cuh>
#if defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_GCC)
#pragma GCC system_header
#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_CLANG)
#pragma clang system_header
#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_MSVC)
#pragma system_header
#endif
namespace nvbench::detail
{

// RAII guard that restores a measurement's stream to a usable state when an
// exception unwinds past it: the destructor first unblocks the stream (if a
// blocking kernel was queued and not yet unblocked) and then synchronizes it,
// both through the Measure's noexcept entry points. Call release() on the
// success path to disarm all cleanup.
template <typename Measure>
struct stream_cleanup_guard
{
  explicit stream_cleanup_guard(Measure &measure)
    : m_measure{measure}
  {}

  stream_cleanup_guard(const stream_cleanup_guard &)            = delete;
  stream_cleanup_guard(stream_cleanup_guard &&)                 = delete;
  stream_cleanup_guard &operator=(const stream_cleanup_guard &) = delete;
  stream_cleanup_guard &operator=(stream_cleanup_guard &&)      = delete;

  ~stream_cleanup_guard() noexcept
  {
    // Unblock before syncing: a still-blocked stream could never complete
    // the synchronization.
    if (m_unblock_pending)
    {
      m_measure.unblock_stream_noexcept();
    }
    if (m_sync_pending)
    {
      (void)m_measure.sync_stream_noexcept();
    }
  }

  // Queue the blocking kernel through the measure object.
  void block_stream()
  {
    // Arm cleanup before queueing the blocking kernel. If block_stream throws
    // after queueing work, the destructor must still unblock the stream.
    m_unblock_pending = true;
    m_measure.block_stream();
  }

  // Unblock the stream on the normal (throwing) path; once it succeeds the
  // destructor's unblock is disarmed.
  void unblock()
  {
    if (!m_unblock_pending)
    {
      return;
    }
    m_measure.unblock_stream();
    m_unblock_pending = false;
  }

  // Disarm every cleanup action; call when the guarded region succeeded.
  void release() noexcept
  {
    m_unblock_pending = false;
    m_sync_pending    = false;
  }

private:
  Measure &m_measure;
  bool m_unblock_pending{false};
  bool m_sync_pending{true};
};

} // namespace nvbench::detail

View File

@@ -1,6 +1,7 @@
set(test_srcs
axes_metadata.cu
benchmark.cu
cleanup_guards.cu
create.cu
cuda_timer.cu
cuda_stream.cu

253
testing/cleanup_guards.cu Normal file
View File

@@ -0,0 +1,253 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include <nvbench/detail/measure_cold_launch_timer_core.cuh>
#include <nvbench/detail/stream_cleanup_guard.cuh>
#include <fmt/format.h>
#include <array>
#include <cstddef>
#include <initializer_list>
#include <stdexcept>
#include "test_asserts.cuh"
namespace
{
// Every observable call a cleanup guard or launch timer can make on the fake
// measure. Tests record these values to assert exact cleanup ordering, so the
// enumerator set mirrors the Measure entry points one-to-one.
enum class action
{
  flush_device_l2,
  sync_stream,
  sync_stream_noexcept,
  cpu_timer_start,
  cpu_timer_stop,
  cpu_timer_stop_noexcept,
  block_stream,
  unblock_stream,
  unblock_stream_noexcept,
  gpu_frequency_start,
  gpu_frequency_stop,
  profiler_start,
  profiler_stop,
  profiler_stop_noexcept,
  cuda_timer_start,
  cuda_timer_stop,
};
// Test double standing in for the measure objects: records every call into
// `actions` and can be armed (via throw_on) to throw exactly once from a
// chosen throwing entry point.
struct fake_measure
{
  // Reset the recorded call log. Does not disarm a pending throw_on().
  void clear_actions() noexcept
  {
    action_count = 0;
    overflow     = false;
  }

  // Arm a single injected failure: the next record_or_throw(a) throws.
  void throw_on(action a) noexcept
  {
    should_throw = true;
    throw_action = a;
  }

  // Append to the call log; sets `overflow` instead of writing past the end.
  void record(action a) noexcept
  {
    if (action_count >= actions.size())
    {
      overflow = true;
      return;
    }
    actions[action_count++] = a;
  }

  // Record the call, then raise the injected failure if it matches.
  void record_or_throw(action a)
  {
    this->record(a);
    const bool inject = should_throw && throw_action == a;
    if (inject)
    {
      should_throw = false;
      throw std::runtime_error{"Injected fake_measure failure."};
    }
  }

  // Throwing entry points (candidates for throw_on).
  void flush_device_l2() { this->record_or_throw(action::flush_device_l2); }
  void sync_stream() { this->record_or_throw(action::sync_stream); }
  void block_stream() { this->record_or_throw(action::block_stream); }
  void unblock_stream() { this->record_or_throw(action::unblock_stream); }
  void gpu_frequency_start() { this->record_or_throw(action::gpu_frequency_start); }
  void gpu_frequency_stop() { this->record_or_throw(action::gpu_frequency_stop); }
  void profiler_start() { this->record_or_throw(action::profiler_start); }
  void profiler_stop() { this->record_or_throw(action::profiler_stop); }
  void cuda_timer_start() { this->record_or_throw(action::cuda_timer_start); }
  void cuda_timer_stop() { this->record_or_throw(action::cuda_timer_stop); }

  // Noexcept entry points used by the cleanup paths; these only record.
  void cpu_timer_start() noexcept { this->record(action::cpu_timer_start); }
  void cpu_timer_stop() noexcept { this->record(action::cpu_timer_stop); }
  void cpu_timer_stop_noexcept() noexcept { this->record(action::cpu_timer_stop_noexcept); }
  void unblock_stream_noexcept() noexcept { this->record(action::unblock_stream_noexcept); }
  int sync_stream_noexcept() noexcept
  {
    this->record(action::sync_stream_noexcept);
    return 0;
  }
  int profiler_stop_noexcept() noexcept
  {
    this->record(action::profiler_stop_noexcept);
    return 0;
  }

  std::array<action, 32> actions{};
  std::size_t action_count{};
  action throw_action{};
  bool should_throw{false};
  bool overflow{false};
};
// Run `callable` and assert that it throws std::runtime_error (the type used
// by all injected fake_measure failures). Other exception types propagate.
template <typename Callable>
void assert_throws(Callable &&callable)
{
  bool observed_runtime_error = false;
  try
  {
    callable();
  }
  catch (const std::runtime_error &)
  {
    observed_runtime_error = true;
  }
  ASSERT(observed_runtime_error);
}
// Assert that the fake's recorded call log matches `expected` exactly: never
// overflowed, same length, same values in the same order.
void assert_actions(const fake_measure &measure, std::initializer_list<action> expected)
{
  ASSERT(!measure.overflow);
  ASSERT(measure.action_count == expected.size());
  const action *expected_begin = expected.begin();
  for (std::size_t index = 0; index < expected.size(); ++index)
  {
    ASSERT(measure.actions[index] == expected_begin[index]);
  }
}
// If block_stream() throws after queueing work, the guard destructor must
// still unblock the stream (noexcept path) and then sync it.
void test_stream_cleanup_guard_block_stream_throw()
{
  fake_measure measure;
  measure.throw_on(action::block_stream);
  assert_throws([&] {
    nvbench::detail::stream_cleanup_guard<fake_measure> cleanup{measure};
    cleanup.block_stream();
  });
  const auto expected = {action::block_stream,
                         action::unblock_stream_noexcept,
                         action::sync_stream_noexcept};
  assert_actions(measure, expected);
}
// After a successful explicit unblock(), the destructor must not unblock
// again — only the armed sync remains when a later exception unwinds.
void test_stream_cleanup_guard_unblock_then_throw()
{
  fake_measure measure;
  assert_throws([&] {
    nvbench::detail::stream_cleanup_guard<fake_measure> cleanup{measure};
    cleanup.block_stream();
    cleanup.unblock();
    throw std::runtime_error{"Injected post-unblock failure."};
  });
  const auto expected = {action::block_stream,
                         action::unblock_stream,
                         action::sync_stream_noexcept};
  assert_actions(measure, expected);
}
// A throw from block_stream() during start() must unblock the stream, sync
// it, and stop the CPU timer — all through the noexcept cleanup path.
void test_kernel_launch_timer_block_stream_throw()
{
  fake_measure measure;
  measure.throw_on(action::block_stream);
  assert_throws([&] {
    const nvbench::detail::measure_cold_launch_timer_config config{false, false, true};
    nvbench::detail::measure_cold_launch_timer_core<fake_measure> timer{measure, config};
    timer.start();
  });
  assert_actions(measure,
                 {action::flush_device_l2,
                  action::sync_stream,
                  action::cpu_timer_start,
                  action::block_stream,
                  action::unblock_stream_noexcept,
                  action::sync_stream_noexcept,
                  action::cpu_timer_stop_noexcept});
}
// A throw from gpu_frequency_start() during start() happens after the
// blocking kernel was queued, so cleanup must unblock, sync, and stop the
// CPU timer via the noexcept paths.
void test_kernel_launch_timer_gpu_frequency_start_throw()
{
  fake_measure measure;
  measure.throw_on(action::gpu_frequency_start);
  assert_throws([&] {
    const nvbench::detail::measure_cold_launch_timer_config config{false, false, true};
    nvbench::detail::measure_cold_launch_timer_core<fake_measure> timer{measure, config};
    timer.start();
  });
  assert_actions(measure,
                 {action::flush_device_l2,
                  action::sync_stream,
                  action::cpu_timer_start,
                  action::block_stream,
                  action::gpu_frequency_start,
                  action::unblock_stream_noexcept,
                  action::sync_stream_noexcept,
                  action::cpu_timer_stop_noexcept});
}
// A throw from gpu_frequency_stop() during stop() happens while the stream
// is still blocked, so the guard must still unblock, sync, and stop the CPU
// timer through the noexcept cleanup path.
void test_kernel_launch_timer_gpu_frequency_stop_throw()
{
  fake_measure measure;
  const nvbench::detail::measure_cold_launch_timer_config config{false, false, true};
  nvbench::detail::measure_cold_launch_timer_core<fake_measure> timer{measure, config};
  timer.start();
  measure.clear_actions();
  measure.throw_on(action::gpu_frequency_stop);
  assert_throws([&] { timer.stop(); });
  assert_actions(measure,
                 {action::cuda_timer_stop,
                  action::gpu_frequency_stop,
                  action::unblock_stream_noexcept,
                  action::sync_stream_noexcept,
                  action::cpu_timer_stop_noexcept});
}
} // namespace
// Entry point: run every driver-free cleanup-guard scenario; print the first
// escaping exception and report failure.
int main()
{
  try
  {
    test_stream_cleanup_guard_block_stream_throw();
    test_stream_cleanup_guard_unblock_then_throw();
    test_kernel_launch_timer_block_stream_throw();
    test_kernel_launch_timer_gpu_frequency_start_throw();
    test_kernel_launch_timer_gpu_frequency_stop_throw();
    return 0;
  }
  catch (const std::exception &e)
  {
    fmt::print("{}\n", e.what());
    return 1;
  }
}