Refactor measurement cleanup guards for testability

Extract hot stream cleanup and cold launch timer cleanup into reusable
detail helpers. Keep measure_hot and measure_cold using those helpers through
thin adapters so the tested cleanup logic matches the production path.

Add driver-free cleanup guard tests using a fake measure object to verify
cleanup ordering when exceptions occur after blocking stream setup, after hot
unblock, and around cold GPU frequency start/stop paths.
This commit is contained in:
Oleksandr Pavlyk
2026-05-13 11:09:12 -05:00
parent 0d64864ac3
commit e2763ca136
6 changed files with 573 additions and 198 deletions

View File

@@ -35,6 +35,7 @@
#include <nvbench/detail/gpu_frequency.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/l2flush.cuh>
#include <nvbench/detail/measure_cold_launch_timer_core.cuh>
#include <nvbench/detail/statistics.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh>
@@ -146,48 +147,18 @@ protected:
struct measure_cold_base::kernel_launch_timer
{
private:
__forceinline__ void cleanup_noexcept() noexcept;
struct cleanup_guard
{
explicit cleanup_guard(kernel_launch_timer &timer)
: m_timer{timer}
{}
cleanup_guard(const cleanup_guard &) = delete;
cleanup_guard(cleanup_guard &&) = delete;
cleanup_guard &operator=(const cleanup_guard &) = delete;
cleanup_guard &operator=(cleanup_guard &&) = delete;
~cleanup_guard() noexcept
{
if (m_active)
{
m_timer.cleanup_noexcept();
}
}
void release() noexcept { m_active = false; }
private:
kernel_launch_timer &m_timer;
bool m_active{true};
};
public:
kernel_launch_timer(measure_cold_base &measure)
: m_measure{measure}
, m_disable_blocking_kernel{measure.m_disable_blocking_kernel}
, m_run_once{measure.m_run_once}
, m_check_throttling{measure.m_check_throttling}
: kernel_launch_timer{measure,
measure.m_disable_blocking_kernel,
measure.m_run_once,
measure.m_check_throttling}
{}
explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel)
: m_measure{measure}
, m_disable_blocking_kernel{disable_blocking_kernel}
, m_run_once{measure.m_run_once}
, m_check_throttling{measure.m_check_throttling}
: kernel_launch_timer{measure,
disable_blocking_kernel,
measure.m_run_once,
measure.m_check_throttling}
{}
explicit kernel_launch_timer(measure_cold_base &measure,
@@ -195,125 +166,69 @@ public:
bool run_once,
bool check_throttling)
: m_measure{measure}
, m_disable_blocking_kernel{disable_blocking_kernel}
, m_run_once{run_once}
, m_check_throttling{check_throttling}
, m_core{*this, {disable_blocking_kernel, run_once, check_throttling}}
{}
~kernel_launch_timer() noexcept { this->cleanup_noexcept(); }
kernel_launch_timer(const kernel_launch_timer &) = delete;
kernel_launch_timer(kernel_launch_timer &&) = delete;
kernel_launch_timer &operator=(const kernel_launch_timer &) = delete;
kernel_launch_timer &operator=(kernel_launch_timer &&) = delete;
__forceinline__ void start()
~kernel_launch_timer() noexcept = default;
__forceinline__ void start() { m_core.start(); }
__forceinline__ void stop() { m_core.stop(); }
__forceinline__ void flush_device_l2() { m_measure.flush_device_l2(); }
__forceinline__ void sync_stream() { m_measure.sync_stream(); }
__forceinline__ cudaError_t sync_stream_noexcept() const noexcept
{
cleanup_guard cleanup{*this};
m_measure.flush_device_l2();
m_measure.sync_stream();
// start CPU timer irrespective of use of blocking kernel
// Ref: https://github.com/NVIDIA/nvbench/issues/249
m_measure.m_cpu_timer.start();
m_cpu_timer_started = true;
if (!m_disable_blocking_kernel)
{
// Arm cleanup before queueing the blocking kernel. If block_stream throws
// after queueing work, cleanup_noexcept must still unblock the stream.
m_stream_unblock_armed = true;
m_measure.block_stream();
}
if (m_check_throttling)
{
// Arm cleanup before queueing timestamp work. If gpu_frequency_start
// throws after queueing work, cleanup_noexcept must still sync the stream.
m_gpu_frequency_cleanup_armed = true;
m_measure.gpu_frequency_start();
}
if (m_run_once)
{
m_measure.profiler_start();
m_profiler_started = true;
}
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
m_cuda_timer_started = true;
cleanup.release();
return m_measure.sync_stream_noexcept();
}
__forceinline__ void stop()
__forceinline__ void cpu_timer_start() noexcept { m_measure.m_cpu_timer.start(); }
__forceinline__ void cpu_timer_stop() noexcept { m_measure.m_cpu_timer.stop(); }
__forceinline__ void cpu_timer_stop_noexcept() noexcept { m_measure.m_cpu_timer.stop(); }
__forceinline__ void block_stream() { m_measure.block_stream(); }
__forceinline__ void unblock_stream() { m_measure.unblock_stream(); }
__forceinline__ void unblock_stream_noexcept() noexcept { m_measure.unblock_stream_noexcept(); }
__forceinline__ void gpu_frequency_start() { m_measure.gpu_frequency_start(); }
__forceinline__ void gpu_frequency_stop() { m_measure.gpu_frequency_stop(); }
__forceinline__ void profiler_start() { m_measure.profiler_start(); }
__forceinline__ void profiler_stop() { m_measure.profiler_stop(); }
__forceinline__ cudaError_t profiler_stop_noexcept() const noexcept
{
cleanup_guard cleanup{*this};
return m_measure.profiler_stop_noexcept();
}
if (m_cuda_timer_started)
{
m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
m_cuda_timer_started = false;
}
if (m_gpu_frequency_cleanup_armed)
{
m_measure.gpu_frequency_stop();
m_gpu_frequency_cleanup_armed = false;
}
if (m_stream_unblock_armed)
{
m_measure.unblock_stream();
m_stream_unblock_armed = false;
}
m_measure.sync_stream();
if (m_profiler_started)
{
m_measure.profiler_stop();
m_profiler_started = false;
}
if (m_cpu_timer_started)
{
m_measure.m_cpu_timer.stop();
m_cpu_timer_started = false;
}
__forceinline__ void cuda_timer_start()
{
m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
}
cleanup.release();
__forceinline__ void cuda_timer_stop()
{
m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
}
private:
measure_cold_base &m_measure;
bool m_disable_blocking_kernel;
bool m_run_once;
bool m_check_throttling;
bool m_cpu_timer_started{false};
bool m_stream_unblock_armed{false};
bool m_gpu_frequency_cleanup_armed{false};
bool m_profiler_started{false};
bool m_cuda_timer_started{false};
nvbench::detail::measure_cold_launch_timer_core<kernel_launch_timer> m_core;
};
__forceinline__ void measure_cold_base::kernel_launch_timer::cleanup_noexcept() noexcept
{
const bool sync_armed = m_stream_unblock_armed || m_cuda_timer_started ||
m_gpu_frequency_cleanup_armed;
if (m_stream_unblock_armed)
{
m_measure.unblock_stream_noexcept();
m_stream_unblock_armed = false;
}
if (sync_armed)
{
(void)m_measure.sync_stream_noexcept();
}
if (m_profiler_started)
{
(void)m_measure.profiler_stop_noexcept();
m_profiler_started = false;
}
if (m_cpu_timer_started)
{
m_measure.m_cpu_timer.stop();
m_cpu_timer_started = false;
}
m_cuda_timer_started = false;
m_gpu_frequency_cleanup_armed = false;
}
template <typename KernelLauncher>
struct measure_cold : public measure_cold_base
{

View File

@@ -0,0 +1,183 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#pragma once
#include <nvbench/config.cuh>
#if defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_GCC)
#pragma GCC system_header
#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_CLANG)
#pragma clang system_header
#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_MSVC)
#pragma system_header
#endif
namespace nvbench::detail
{

// Flags controlling which setup/teardown paths measure_cold_launch_timer_core
// arms around the timed region. Member order is part of the interface:
// callers aggregate-initialize positionally
// ({disable_blocking_kernel, run_once, check_throttling}).
struct measure_cold_launch_timer_config
{
  bool disable_blocking_kernel{false}; // skip queueing the blocking kernel in start()
  bool run_once{false};                // bracket the region with profiler_start/profiler_stop
  bool check_throttling{true};         // bracket the region with gpu_frequency_start/stop
};

// Core of measure_cold's kernel launch timer, with every device interaction
// delegated to the Measure object (flush_device_l2, sync_stream, block_stream,
// cuda_timer_start, ...). That indirection lets tests drive this logic with a
// fake measure and no CUDA driver. The m_*_started / m_*_armed flags record
// exactly which queued work must be undone if an exception unwinds
// mid-sequence; cleanup_noexcept consumes those flags via the Measure's
// noexcept entry points.
template <typename Measure>
struct measure_cold_launch_timer_core
{
private:
  // Best-effort teardown used on exceptions (via cleanup_guard) and from the
  // destructor. Order matters: the stream is unblocked before any sync,
  // otherwise the sync would wait forever on the blocking kernel. Only
  // noexcept Measure entry points are used here.
  void cleanup_noexcept() noexcept
  {
    // A sync is needed whenever any asynchronous work was queued on the
    // stream (blocking kernel, CUDA timer start, or frequency timestamps).
    const bool sync_armed = m_stream_unblock_armed || m_cuda_timer_started ||
                            m_gpu_frequency_cleanup_armed;
    if (m_stream_unblock_armed)
    {
      m_measure.unblock_stream_noexcept();
      m_stream_unblock_armed = false;
    }
    if (sync_armed)
    {
      (void)m_measure.sync_stream_noexcept();
    }
    if (m_profiler_started)
    {
      (void)m_measure.profiler_stop_noexcept();
      m_profiler_started = false;
    }
    if (m_cpu_timer_started)
    {
      m_measure.cpu_timer_stop_noexcept();
      m_cpu_timer_started = false;
    }
    m_cuda_timer_started          = false;
    m_gpu_frequency_cleanup_armed = false;
  }

  // Scope guard that runs cleanup_noexcept on destruction unless release()
  // was called, so an exception anywhere inside start()/stop() still tears
  // the timer state down.
  struct cleanup_guard
  {
    explicit cleanup_guard(measure_cold_launch_timer_core &timer)
      : m_timer{timer}
    {}

    cleanup_guard(const cleanup_guard &)            = delete;
    cleanup_guard(cleanup_guard &&)                 = delete;
    cleanup_guard &operator=(const cleanup_guard &) = delete;
    cleanup_guard &operator=(cleanup_guard &&)      = delete;

    ~cleanup_guard() noexcept
    {
      if (m_active)
      {
        m_timer.cleanup_noexcept();
      }
    }

    // Disarm the guard on the success path.
    void release() noexcept { m_active = false; }

  private:
    measure_cold_launch_timer_core &m_timer;
    bool m_active{true};
  };

public:
  explicit measure_cold_launch_timer_core(Measure &measure, measure_cold_launch_timer_config config)
    : m_measure{measure}
    , m_disable_blocking_kernel{config.disable_blocking_kernel}
    , m_run_once{config.run_once}
    , m_check_throttling{config.check_throttling}
  {}

  measure_cold_launch_timer_core(const measure_cold_launch_timer_core &)            = delete;
  measure_cold_launch_timer_core(measure_cold_launch_timer_core &&)                 = delete;
  measure_cold_launch_timer_core &operator=(const measure_cold_launch_timer_core &) = delete;
  measure_cold_launch_timer_core &operator=(measure_cold_launch_timer_core &&)      = delete;

  // If start() succeeded but stop() never completed, the destructor performs
  // the remaining teardown; it is a no-op when every state flag is clear.
  ~measure_cold_launch_timer_core() noexcept { this->cleanup_noexcept(); }

  // Arm and start the measurement machinery in order: L2 flush, stream sync,
  // CPU timer, optional blocking kernel, optional frequency timestamps,
  // optional profiler, CUDA timer. Each armed/started flag is set before the
  // corresponding Measure call so cleanup still runs if that call throws
  // after queueing work.
  void start()
  {
    cleanup_guard cleanup{*this};
    m_measure.flush_device_l2();
    m_measure.sync_stream();
    // Start CPU timer irrespective of use of blocking kernel.
    // Ref: https://github.com/NVIDIA/nvbench/issues/249
    m_measure.cpu_timer_start();
    m_cpu_timer_started = true;
    if (!m_disable_blocking_kernel)
    {
      // Arm cleanup before queueing the blocking kernel. If block_stream throws
      // after queueing work, cleanup_noexcept must still unblock the stream.
      m_stream_unblock_armed = true;
      m_measure.block_stream();
    }
    if (m_check_throttling)
    {
      // Arm cleanup before queueing timestamp work. If gpu_frequency_start
      // throws after queueing work, cleanup_noexcept must still sync the stream.
      m_gpu_frequency_cleanup_armed = true;
      m_measure.gpu_frequency_start();
    }
    if (m_run_once)
    {
      m_measure.profiler_start();
      m_profiler_started = true;
    }
    m_measure.cuda_timer_start();
    m_cuda_timer_started = true;
    cleanup.release();
  }

  // Success-path teardown: CUDA timer, frequency timestamps, stream unblock,
  // stream sync, profiler, CPU timer. Each flag is cleared only after its
  // Measure call returns, so a throw mid-sequence leaves the remaining flags
  // set for cleanup_noexcept.
  void stop()
  {
    cleanup_guard cleanup{*this};
    if (m_cuda_timer_started)
    {
      m_measure.cuda_timer_stop();
      m_cuda_timer_started = false;
    }
    if (m_gpu_frequency_cleanup_armed)
    {
      m_measure.gpu_frequency_stop();
      m_gpu_frequency_cleanup_armed = false;
    }
    if (m_stream_unblock_armed)
    {
      m_measure.unblock_stream();
      m_stream_unblock_armed = false;
    }
    m_measure.sync_stream();
    if (m_profiler_started)
    {
      m_measure.profiler_stop();
      m_profiler_started = false;
    }
    if (m_cpu_timer_started)
    {
      m_measure.cpu_timer_stop();
      m_cpu_timer_started = false;
    }
    cleanup.release();
  }

private:
  Measure &m_measure;

  // Configuration, fixed at construction.
  bool m_disable_blocking_kernel;
  bool m_run_once;
  bool m_check_throttling;

  // Cleanup state: which queued/started work must be undone on failure.
  bool m_cpu_timer_started{false};
  bool m_stream_unblock_armed{false};
  bool m_gpu_frequency_cleanup_armed{false};
  bool m_profiler_started{false};
  bool m_cuda_timer_started{false};
};

} // namespace nvbench::detail

View File

@@ -32,6 +32,7 @@
#include <nvbench/cpu_timer.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_timer.cuh>
#include <nvbench/detail/stream_cleanup_guard.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
@@ -57,6 +58,8 @@ struct measure_hot_base
measure_hot_base &operator=(measure_hot_base &&) = delete;
protected:
friend struct nvbench::detail::stream_cleanup_guard<measure_hot_base>;
void check();
void initialize()
@@ -82,60 +85,6 @@ protected:
__forceinline__ void sync_stream() const { NVBENCH_CUDA_CALL(this->sync_stream_noexcept()); }
struct stream_cleanup_guard
{
explicit stream_cleanup_guard(measure_hot_base &measure)
: m_measure{measure}
{
m_sync_armed = true;
}
stream_cleanup_guard(const stream_cleanup_guard &) = delete;
stream_cleanup_guard(stream_cleanup_guard &&) = delete;
stream_cleanup_guard &operator=(const stream_cleanup_guard &) = delete;
stream_cleanup_guard &operator=(stream_cleanup_guard &&) = delete;
~stream_cleanup_guard() noexcept
{
if (m_unblock_armed)
{
m_measure.unblock_stream_noexcept();
}
if (m_sync_armed)
{
(void)m_measure.sync_stream_noexcept();
}
}
void block_stream()
{
// Arm cleanup before queueing the blocking kernel. If block_stream throws
// after queueing work, the destructor must still unblock the stream.
m_unblock_armed = true;
m_measure.block_stream();
}
void unblock()
{
if (m_unblock_armed)
{
m_measure.unblock_stream();
m_unblock_armed = false;
}
}
void release() noexcept
{
m_unblock_armed = false;
m_sync_armed = false;
}
private:
measure_hot_base &m_measure;
bool m_unblock_armed{false};
bool m_sync_armed{false};
};
nvbench::state &m_state;
nvbench::launch m_launch;
@@ -178,7 +127,7 @@ private:
// measurement.
void run_warmup()
{
stream_cleanup_guard cleanup{*this};
nvbench::detail::stream_cleanup_guard<measure_hot_base> cleanup{*this};
m_cuda_timer.start(m_launch.get_stream());
this->launch_kernel();
@@ -204,7 +153,7 @@ private:
{
batch_size = std::max(batch_size, nvbench::int64_t{1});
stream_cleanup_guard cleanup{*this};
nvbench::detail::stream_cleanup_guard<measure_hot_base> cleanup{*this};
if (!m_disable_blocking_kernel)
{

View File

@@ -0,0 +1,74 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#pragma once
#include <nvbench/config.cuh>
#if defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_GCC)
#pragma GCC system_header
#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_CLANG)
#pragma clang system_header
#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_MSVC)
#pragma system_header
#endif
namespace nvbench::detail
{

// RAII guard that restores a measurement's stream to a usable state when an
// exception unwinds past it: the destructor first unblocks the stream (if a
// blocking kernel was queued and not yet unblocked) and then synchronizes it,
// both through the Measure's noexcept entry points. Call release() on the
// success path to disarm all cleanup.
template <typename Measure>
struct stream_cleanup_guard
{
  explicit stream_cleanup_guard(Measure &measure)
    : m_measure{measure}
  {}

  stream_cleanup_guard(const stream_cleanup_guard &)            = delete;
  stream_cleanup_guard(stream_cleanup_guard &&)                 = delete;
  stream_cleanup_guard &operator=(const stream_cleanup_guard &) = delete;
  stream_cleanup_guard &operator=(stream_cleanup_guard &&)      = delete;

  ~stream_cleanup_guard() noexcept
  {
    // Unblock before syncing: a still-blocked stream could never complete
    // the synchronization.
    if (m_unblock_pending)
    {
      m_measure.unblock_stream_noexcept();
    }
    if (m_sync_pending)
    {
      (void)m_measure.sync_stream_noexcept();
    }
  }

  // Queue the blocking kernel through the measure object.
  void block_stream()
  {
    // Arm cleanup before queueing the blocking kernel. If block_stream throws
    // after queueing work, the destructor must still unblock the stream.
    m_unblock_pending = true;
    m_measure.block_stream();
  }

  // Unblock the stream on the normal (throwing) path; once it succeeds the
  // destructor's unblock is disarmed.
  void unblock()
  {
    if (!m_unblock_pending)
    {
      return;
    }
    m_measure.unblock_stream();
    m_unblock_pending = false;
  }

  // Disarm every cleanup action; call when the guarded region succeeded.
  void release() noexcept
  {
    m_unblock_pending = false;
    m_sync_pending    = false;
  }

private:
  Measure &m_measure;
  bool m_unblock_pending{false};
  bool m_sync_pending{true};
};

} // namespace nvbench::detail

View File

@@ -1,6 +1,7 @@
set(test_srcs
axes_metadata.cu
benchmark.cu
cleanup_guards.cu
create.cu
cuda_timer.cu
cuda_stream.cu

253
testing/cleanup_guards.cu Normal file
View File

@@ -0,0 +1,253 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include <nvbench/detail/measure_cold_launch_timer_core.cuh>
#include <nvbench/detail/stream_cleanup_guard.cuh>
#include <fmt/format.h>
#include <array>
#include <cstddef>
#include <initializer_list>
#include <stdexcept>
#include "test_asserts.cuh"
namespace
{
// Every observable call a cleanup guard or launch timer can make on the fake
// measure. Tests record these values to assert exact cleanup ordering, so the
// enumerator set mirrors the Measure entry points one-to-one.
enum class action
{
  flush_device_l2,
  sync_stream,
  sync_stream_noexcept,
  cpu_timer_start,
  cpu_timer_stop,
  cpu_timer_stop_noexcept,
  block_stream,
  unblock_stream,
  unblock_stream_noexcept,
  gpu_frequency_start,
  gpu_frequency_stop,
  profiler_start,
  profiler_stop,
  profiler_stop_noexcept,
  cuda_timer_start,
  cuda_timer_stop,
};
// Test double standing in for the measure objects: records every call into
// `actions` and can be armed (via throw_on) to throw exactly once from a
// chosen throwing entry point.
struct fake_measure
{
  // Reset the recorded call log. Does not disarm a pending throw_on().
  void clear_actions() noexcept
  {
    action_count = 0;
    overflow     = false;
  }

  // Arm a single injected failure: the next record_or_throw(a) throws.
  void throw_on(action a) noexcept
  {
    should_throw = true;
    throw_action = a;
  }

  // Append to the call log; sets `overflow` instead of writing past the end.
  void record(action a) noexcept
  {
    if (action_count >= actions.size())
    {
      overflow = true;
      return;
    }
    actions[action_count++] = a;
  }

  // Record the call, then raise the injected failure if it matches.
  void record_or_throw(action a)
  {
    this->record(a);
    const bool inject = should_throw && throw_action == a;
    if (inject)
    {
      should_throw = false;
      throw std::runtime_error{"Injected fake_measure failure."};
    }
  }

  // Throwing entry points (candidates for throw_on).
  void flush_device_l2() { this->record_or_throw(action::flush_device_l2); }
  void sync_stream() { this->record_or_throw(action::sync_stream); }
  void block_stream() { this->record_or_throw(action::block_stream); }
  void unblock_stream() { this->record_or_throw(action::unblock_stream); }
  void gpu_frequency_start() { this->record_or_throw(action::gpu_frequency_start); }
  void gpu_frequency_stop() { this->record_or_throw(action::gpu_frequency_stop); }
  void profiler_start() { this->record_or_throw(action::profiler_start); }
  void profiler_stop() { this->record_or_throw(action::profiler_stop); }
  void cuda_timer_start() { this->record_or_throw(action::cuda_timer_start); }
  void cuda_timer_stop() { this->record_or_throw(action::cuda_timer_stop); }

  // Noexcept entry points used by the cleanup paths; these only record.
  void cpu_timer_start() noexcept { this->record(action::cpu_timer_start); }
  void cpu_timer_stop() noexcept { this->record(action::cpu_timer_stop); }
  void cpu_timer_stop_noexcept() noexcept { this->record(action::cpu_timer_stop_noexcept); }
  void unblock_stream_noexcept() noexcept { this->record(action::unblock_stream_noexcept); }
  int sync_stream_noexcept() noexcept
  {
    this->record(action::sync_stream_noexcept);
    return 0;
  }
  int profiler_stop_noexcept() noexcept
  {
    this->record(action::profiler_stop_noexcept);
    return 0;
  }

  std::array<action, 32> actions{};
  std::size_t action_count{};
  action throw_action{};
  bool should_throw{false};
  bool overflow{false};
};
// Run `callable` and assert that it throws std::runtime_error (the type used
// by all injected fake_measure failures). Other exception types propagate.
template <typename Callable>
void assert_throws(Callable &&callable)
{
  bool observed_runtime_error = false;
  try
  {
    callable();
  }
  catch (const std::runtime_error &)
  {
    observed_runtime_error = true;
  }
  ASSERT(observed_runtime_error);
}
// Assert that the fake's recorded call log matches `expected` exactly: never
// overflowed, same length, same values in the same order.
void assert_actions(const fake_measure &measure, std::initializer_list<action> expected)
{
  ASSERT(!measure.overflow);
  ASSERT(measure.action_count == expected.size());
  const action *expected_begin = expected.begin();
  for (std::size_t index = 0; index < expected.size(); ++index)
  {
    ASSERT(measure.actions[index] == expected_begin[index]);
  }
}
// If block_stream() throws after queueing work, the guard destructor must
// still unblock the stream (noexcept path) and then sync it.
void test_stream_cleanup_guard_block_stream_throw()
{
  fake_measure measure;
  measure.throw_on(action::block_stream);
  assert_throws([&] {
    nvbench::detail::stream_cleanup_guard<fake_measure> cleanup{measure};
    cleanup.block_stream();
  });
  const auto expected = {action::block_stream,
                         action::unblock_stream_noexcept,
                         action::sync_stream_noexcept};
  assert_actions(measure, expected);
}
// After a successful explicit unblock(), the destructor must not unblock
// again — only the armed sync remains when a later exception unwinds.
void test_stream_cleanup_guard_unblock_then_throw()
{
  fake_measure measure;
  assert_throws([&] {
    nvbench::detail::stream_cleanup_guard<fake_measure> cleanup{measure};
    cleanup.block_stream();
    cleanup.unblock();
    throw std::runtime_error{"Injected post-unblock failure."};
  });
  const auto expected = {action::block_stream,
                         action::unblock_stream,
                         action::sync_stream_noexcept};
  assert_actions(measure, expected);
}
// A throw from block_stream() during start() must unblock the stream, sync
// it, and stop the CPU timer — all through the noexcept cleanup path.
void test_kernel_launch_timer_block_stream_throw()
{
  fake_measure measure;
  measure.throw_on(action::block_stream);
  assert_throws([&] {
    const nvbench::detail::measure_cold_launch_timer_config config{false, false, true};
    nvbench::detail::measure_cold_launch_timer_core<fake_measure> timer{measure, config};
    timer.start();
  });
  assert_actions(measure,
                 {action::flush_device_l2,
                  action::sync_stream,
                  action::cpu_timer_start,
                  action::block_stream,
                  action::unblock_stream_noexcept,
                  action::sync_stream_noexcept,
                  action::cpu_timer_stop_noexcept});
}
// A throw from gpu_frequency_start() during start() happens after the
// blocking kernel was queued, so cleanup must unblock, sync, and stop the
// CPU timer via the noexcept paths.
void test_kernel_launch_timer_gpu_frequency_start_throw()
{
  fake_measure measure;
  measure.throw_on(action::gpu_frequency_start);
  assert_throws([&] {
    const nvbench::detail::measure_cold_launch_timer_config config{false, false, true};
    nvbench::detail::measure_cold_launch_timer_core<fake_measure> timer{measure, config};
    timer.start();
  });
  assert_actions(measure,
                 {action::flush_device_l2,
                  action::sync_stream,
                  action::cpu_timer_start,
                  action::block_stream,
                  action::gpu_frequency_start,
                  action::unblock_stream_noexcept,
                  action::sync_stream_noexcept,
                  action::cpu_timer_stop_noexcept});
}
// A throw from gpu_frequency_stop() during stop() happens while the stream
// is still blocked, so the guard must still unblock, sync, and stop the CPU
// timer through the noexcept cleanup path.
void test_kernel_launch_timer_gpu_frequency_stop_throw()
{
  fake_measure measure;
  const nvbench::detail::measure_cold_launch_timer_config config{false, false, true};
  nvbench::detail::measure_cold_launch_timer_core<fake_measure> timer{measure, config};
  timer.start();
  measure.clear_actions();
  measure.throw_on(action::gpu_frequency_stop);
  assert_throws([&] { timer.stop(); });
  assert_actions(measure,
                 {action::cuda_timer_stop,
                  action::gpu_frequency_stop,
                  action::unblock_stream_noexcept,
                  action::sync_stream_noexcept,
                  action::cpu_timer_stop_noexcept});
}
} // namespace
// Entry point: run every driver-free cleanup-guard scenario; print the first
// escaping exception and report failure.
int main()
{
  try
  {
    test_stream_cleanup_guard_block_stream_throw();
    test_stream_cleanup_guard_unblock_then_throw();
    test_kernel_launch_timer_block_stream_throw();
    test_kernel_launch_timer_gpu_frequency_start_throw();
    test_kernel_launch_timer_gpu_frequency_stop_throw();
    return 0;
  }
  catch (const std::exception &e)
  {
    fmt::print("{}\n", e.what());
    return 1;
  }
}