diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index e11b13c..18c9164 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -146,48 +147,18 @@ protected: struct measure_cold_base::kernel_launch_timer { -private: - __forceinline__ void cleanup_noexcept() noexcept; - - struct cleanup_guard - { - explicit cleanup_guard(kernel_launch_timer &timer) - : m_timer{timer} - {} - - cleanup_guard(const cleanup_guard &) = delete; - cleanup_guard(cleanup_guard &&) = delete; - cleanup_guard &operator=(const cleanup_guard &) = delete; - cleanup_guard &operator=(cleanup_guard &&) = delete; - - ~cleanup_guard() noexcept - { - if (m_active) - { - m_timer.cleanup_noexcept(); - } - } - - void release() noexcept { m_active = false; } - - private: - kernel_launch_timer &m_timer; - bool m_active{true}; - }; - -public: kernel_launch_timer(measure_cold_base &measure) - : m_measure{measure} - , m_disable_blocking_kernel{measure.m_disable_blocking_kernel} - , m_run_once{measure.m_run_once} - , m_check_throttling{measure.m_check_throttling} + : kernel_launch_timer{measure, + measure.m_disable_blocking_kernel, + measure.m_run_once, + measure.m_check_throttling} {} explicit kernel_launch_timer(measure_cold_base &measure, bool disable_blocking_kernel) - : m_measure{measure} - , m_disable_blocking_kernel{disable_blocking_kernel} - , m_run_once{measure.m_run_once} - , m_check_throttling{measure.m_check_throttling} + : kernel_launch_timer{measure, + disable_blocking_kernel, + measure.m_run_once, + measure.m_check_throttling} {} explicit kernel_launch_timer(measure_cold_base &measure, @@ -195,125 +166,69 @@ public: bool run_once, bool check_throttling) : m_measure{measure} - , m_disable_blocking_kernel{disable_blocking_kernel} - , m_run_once{run_once} - , m_check_throttling{check_throttling} + , m_core{*this, {disable_blocking_kernel, run_once, check_throttling}} {} - ~kernel_launch_timer() noexcept { this->cleanup_noexcept(); } + kernel_launch_timer(const kernel_launch_timer &) = delete; + kernel_launch_timer(kernel_launch_timer &&) = delete; + kernel_launch_timer &operator=(const kernel_launch_timer &) = delete; + kernel_launch_timer &operator=(kernel_launch_timer &&) = delete; - __forceinline__ void start() + ~kernel_launch_timer() noexcept = default; + + __forceinline__ void start() { m_core.start(); } + + __forceinline__ void stop() { m_core.stop(); } + + __forceinline__ void flush_device_l2() { m_measure.flush_device_l2(); } + + __forceinline__ void sync_stream() { m_measure.sync_stream(); } + + __forceinline__ cudaError_t sync_stream_noexcept() const noexcept { - cleanup_guard cleanup{*this}; - - m_measure.flush_device_l2(); - m_measure.sync_stream(); - - // start CPU timer irrespective of use of blocking kernel - // Ref: https://github.com/NVIDIA/nvbench/issues/249 - m_measure.m_cpu_timer.start(); - m_cpu_timer_started = true; - - if (!m_disable_blocking_kernel) - { - // Arm cleanup before queueing the blocking kernel. If block_stream throws - // after queueing work, cleanup_noexcept must still unblock the stream. - m_stream_unblock_armed = true; - m_measure.block_stream(); - } - if (m_check_throttling) - { - // Arm cleanup before queueing timestamp work. If gpu_frequency_start - // throws after queueing work, cleanup_noexcept must still sync the stream. - m_gpu_frequency_cleanup_armed = true; - m_measure.gpu_frequency_start(); - } - if (m_run_once) - { - m_measure.profiler_start(); - m_profiler_started = true; - } - m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream()); - m_cuda_timer_started = true; - - cleanup.release(); + return m_measure.sync_stream_noexcept(); } - __forceinline__ void stop() + __forceinline__ void cpu_timer_start() noexcept { m_measure.m_cpu_timer.start(); } + + __forceinline__ void cpu_timer_stop() noexcept { m_measure.m_cpu_timer.stop(); } + + __forceinline__ void cpu_timer_stop_noexcept() noexcept { m_measure.m_cpu_timer.stop(); } + + __forceinline__ void block_stream() { m_measure.block_stream(); } + + __forceinline__ void unblock_stream() { m_measure.unblock_stream(); } + + __forceinline__ void unblock_stream_noexcept() noexcept { m_measure.unblock_stream_noexcept(); } + + __forceinline__ void gpu_frequency_start() { m_measure.gpu_frequency_start(); } + + __forceinline__ void gpu_frequency_stop() { m_measure.gpu_frequency_stop(); } + + __forceinline__ void profiler_start() { m_measure.profiler_start(); } + + __forceinline__ void profiler_stop() { m_measure.profiler_stop(); } + + __forceinline__ cudaError_t profiler_stop_noexcept() const noexcept { - cleanup_guard cleanup{*this}; + return m_measure.profiler_stop_noexcept(); + } - if (m_cuda_timer_started) - { - m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream()); - m_cuda_timer_started = false; - } - if (m_gpu_frequency_cleanup_armed) - { - m_measure.gpu_frequency_stop(); - m_gpu_frequency_cleanup_armed = false; - } - if (m_stream_unblock_armed) - { - m_measure.unblock_stream(); - m_stream_unblock_armed = false; - } - m_measure.sync_stream(); - if (m_profiler_started) - { - m_measure.profiler_stop(); - m_profiler_started = false; - } - if (m_cpu_timer_started) - { - m_measure.m_cpu_timer.stop(); - m_cpu_timer_started = false; - } + __forceinline__ void cuda_timer_start() + { + m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream()); + } - cleanup.release(); + __forceinline__ void cuda_timer_stop() + { + m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream()); } private: measure_cold_base &m_measure; - bool m_disable_blocking_kernel; - bool m_run_once; - bool m_check_throttling; - bool m_cpu_timer_started{false}; - bool m_stream_unblock_armed{false}; - bool m_gpu_frequency_cleanup_armed{false}; - bool m_profiler_started{false}; - bool m_cuda_timer_started{false}; + nvbench::detail::measure_cold_launch_timer_core m_core; }; -__forceinline__ void measure_cold_base::kernel_launch_timer::cleanup_noexcept() noexcept -{ - const bool sync_armed = m_stream_unblock_armed || m_cuda_timer_started || - m_gpu_frequency_cleanup_armed; - - if (m_stream_unblock_armed) - { - m_measure.unblock_stream_noexcept(); - m_stream_unblock_armed = false; - } - if (sync_armed) - { - (void)m_measure.sync_stream_noexcept(); - } - if (m_profiler_started) - { - (void)m_measure.profiler_stop_noexcept(); - m_profiler_started = false; - } - if (m_cpu_timer_started) - { - m_measure.m_cpu_timer.stop(); - m_cpu_timer_started = false; - } - - m_cuda_timer_started = false; - m_gpu_frequency_cleanup_armed = false; -} - template struct measure_cold : public measure_cold_base { diff --git a/nvbench/detail/measure_cold_launch_timer_core.cuh b/nvbench/detail/measure_cold_launch_timer_core.cuh new file mode 100644 index 0000000..8af2b8b --- /dev/null +++ b/nvbench/detail/measure_cold_launch_timer_core.cuh @@ -0,0 +1,183 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_GCC) +#pragma GCC system_header +#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_CLANG) +#pragma clang system_header +#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_MSVC) +#pragma system_header +#endif + +namespace nvbench::detail +{ + +struct measure_cold_launch_timer_config +{ + bool disable_blocking_kernel{false}; + bool run_once{false}; + bool check_throttling{true}; +}; + +template +struct measure_cold_launch_timer_core +{ +private: + void cleanup_noexcept() noexcept + { + const bool sync_armed = m_stream_unblock_armed || m_cuda_timer_started || + m_gpu_frequency_cleanup_armed; + + if (m_stream_unblock_armed) + { + m_measure.unblock_stream_noexcept(); + m_stream_unblock_armed = false; + } + if (sync_armed) + { + (void)m_measure.sync_stream_noexcept(); + } + if (m_profiler_started) + { + (void)m_measure.profiler_stop_noexcept(); + m_profiler_started = false; + } + if (m_cpu_timer_started) + { + m_measure.cpu_timer_stop_noexcept(); + m_cpu_timer_started = false; + } + + m_cuda_timer_started = false; + m_gpu_frequency_cleanup_armed = false; + } + + struct cleanup_guard + { + explicit cleanup_guard(measure_cold_launch_timer_core &timer) + : m_timer{timer} + {} + + cleanup_guard(const cleanup_guard &) = delete; + cleanup_guard(cleanup_guard &&) = delete; + cleanup_guard &operator=(const cleanup_guard &) = delete; + cleanup_guard &operator=(cleanup_guard &&) = delete; + + ~cleanup_guard() noexcept + { + if (m_active) + { + m_timer.cleanup_noexcept(); + } + } + + void release() noexcept { m_active = false; } + + private: + measure_cold_launch_timer_core &m_timer; + bool m_active{true}; + }; + +public: + explicit measure_cold_launch_timer_core(Measure &measure, measure_cold_launch_timer_config config) + : m_measure{measure} + , m_disable_blocking_kernel{config.disable_blocking_kernel} + , m_run_once{config.run_once} + , m_check_throttling{config.check_throttling} + {} + + measure_cold_launch_timer_core(const measure_cold_launch_timer_core &) = delete; + measure_cold_launch_timer_core(measure_cold_launch_timer_core &&) = delete; + measure_cold_launch_timer_core &operator=(const measure_cold_launch_timer_core &) = delete; + measure_cold_launch_timer_core &operator=(measure_cold_launch_timer_core &&) = delete; + + ~measure_cold_launch_timer_core() noexcept { this->cleanup_noexcept(); } + + void start() + { + cleanup_guard cleanup{*this}; + + m_measure.flush_device_l2(); + m_measure.sync_stream(); + + // Start CPU timer irrespective of use of blocking kernel. + // Ref: https://github.com/NVIDIA/nvbench/issues/249 + m_measure.cpu_timer_start(); + m_cpu_timer_started = true; + + if (!m_disable_blocking_kernel) + { + // Arm cleanup before queueing the blocking kernel. If block_stream throws + // after queueing work, cleanup_noexcept must still unblock the stream. + m_stream_unblock_armed = true; + m_measure.block_stream(); + } + if (m_check_throttling) + { + // Arm cleanup before queueing timestamp work. If gpu_frequency_start + // throws after queueing work, cleanup_noexcept must still sync the stream. + m_gpu_frequency_cleanup_armed = true; + m_measure.gpu_frequency_start(); + } + if (m_run_once) + { + m_measure.profiler_start(); + m_profiler_started = true; + } + m_measure.cuda_timer_start(); + m_cuda_timer_started = true; + + cleanup.release(); + } + + void stop() + { + cleanup_guard cleanup{*this}; + + if (m_cuda_timer_started) + { + m_measure.cuda_timer_stop(); + m_cuda_timer_started = false; + } + if (m_gpu_frequency_cleanup_armed) + { + m_measure.gpu_frequency_stop(); + m_gpu_frequency_cleanup_armed = false; + } + if (m_stream_unblock_armed) + { + m_measure.unblock_stream(); + m_stream_unblock_armed = false; + } + m_measure.sync_stream(); + if (m_profiler_started) + { + m_measure.profiler_stop(); + m_profiler_started = false; + } + if (m_cpu_timer_started) + { + m_measure.cpu_timer_stop(); + m_cpu_timer_started = false; + } + + cleanup.release(); + } + +private: + Measure &m_measure; + bool m_disable_blocking_kernel; + bool m_run_once; + bool m_check_throttling; + bool m_cpu_timer_started{false}; + bool m_stream_unblock_armed{false}; + bool m_gpu_frequency_cleanup_armed{false}; + bool m_profiler_started{false}; + bool m_cuda_timer_started{false}; +}; + +} // namespace nvbench::detail diff --git a/nvbench/detail/measure_hot.cuh b/nvbench/detail/measure_hot.cuh index 1f6ac18..f9faca3 100644 --- a/nvbench/detail/measure_hot.cuh +++ b/nvbench/detail/measure_hot.cuh @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,8 @@ struct measure_hot_base measure_hot_base &operator=(measure_hot_base &&) = delete; protected: + friend struct nvbench::detail::stream_cleanup_guard; + void check(); void initialize() @@ -82,60 +85,6 @@ protected: __forceinline__ void sync_stream() const { NVBENCH_CUDA_CALL(this->sync_stream_noexcept()); } - struct stream_cleanup_guard - { - explicit stream_cleanup_guard(measure_hot_base &measure) - : m_measure{measure} - { - m_sync_armed = true; - } - - stream_cleanup_guard(const stream_cleanup_guard &) = delete; - stream_cleanup_guard(stream_cleanup_guard &&) = delete; - stream_cleanup_guard &operator=(const stream_cleanup_guard &) = delete; - stream_cleanup_guard &operator=(stream_cleanup_guard &&) = delete; - - ~stream_cleanup_guard() noexcept - { - if (m_unblock_armed) - { - m_measure.unblock_stream_noexcept(); - } - if (m_sync_armed) - { - (void)m_measure.sync_stream_noexcept(); - } - } - - void block_stream() - { - // Arm cleanup before queueing the blocking kernel. If block_stream throws - // after queueing work, the destructor must still unblock the stream. - m_unblock_armed = true; - m_measure.block_stream(); - } - - void unblock() - { - if (m_unblock_armed) - { - m_measure.unblock_stream(); - m_unblock_armed = false; - } - } - - void release() noexcept - { - m_unblock_armed = false; - m_sync_armed = false; - } - - private: - measure_hot_base &m_measure; - bool m_unblock_armed{false}; - bool m_sync_armed{false}; - }; - nvbench::state &m_state; nvbench::launch m_launch; @@ -178,7 +127,7 @@ private: // measurement. void run_warmup() { - stream_cleanup_guard cleanup{*this}; + nvbench::detail::stream_cleanup_guard cleanup{*this}; m_cuda_timer.start(m_launch.get_stream()); this->launch_kernel(); @@ -204,7 +153,7 @@ private: { batch_size = std::max(batch_size, nvbench::int64_t{1}); - stream_cleanup_guard cleanup{*this}; + nvbench::detail::stream_cleanup_guard cleanup{*this}; if (!m_disable_blocking_kernel) { diff --git a/nvbench/detail/stream_cleanup_guard.cuh b/nvbench/detail/stream_cleanup_guard.cuh new file mode 100644 index 0000000..3d46997 --- /dev/null +++ b/nvbench/detail/stream_cleanup_guard.cuh @@ -0,0 +1,74 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include + +#if defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_GCC) +#pragma GCC system_header +#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_CLANG) +#pragma clang system_header +#elif defined(NVBENCH_IMPLICIT_SYSTEM_HEADER_MSVC) +#pragma system_header +#endif + +namespace nvbench::detail +{ + +template +struct stream_cleanup_guard +{ + explicit stream_cleanup_guard(Measure &measure) + : m_measure{measure} + { + m_sync_armed = true; + } + + stream_cleanup_guard(const stream_cleanup_guard &) = delete; + stream_cleanup_guard(stream_cleanup_guard &&) = delete; + stream_cleanup_guard &operator=(const stream_cleanup_guard &) = delete; + stream_cleanup_guard &operator=(stream_cleanup_guard &&) = delete; + + ~stream_cleanup_guard() noexcept + { + if (m_unblock_armed) + { + m_measure.unblock_stream_noexcept(); + } + if (m_sync_armed) + { + (void)m_measure.sync_stream_noexcept(); + } + } + + void block_stream() + { + // Arm cleanup before queueing the blocking kernel. If block_stream throws + // after queueing work, the destructor must still unblock the stream. + m_unblock_armed = true; + m_measure.block_stream(); + } + + void unblock() + { + if (m_unblock_armed) + { + m_measure.unblock_stream(); + m_unblock_armed = false; + } + } + + void release() noexcept + { + m_unblock_armed = false; + m_sync_armed = false; + } + +private: + Measure &m_measure; + bool m_unblock_armed{false}; + bool m_sync_armed{false}; +}; + +} // namespace nvbench::detail diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt index bbf3e19..5a10941 100644 --- a/testing/CMakeLists.txt +++ b/testing/CMakeLists.txt @@ -1,6 +1,7 @@ set(test_srcs axes_metadata.cu benchmark.cu + cleanup_guards.cu create.cu cuda_timer.cu cuda_stream.cu diff --git a/testing/cleanup_guards.cu b/testing/cleanup_guards.cu new file mode 100644 index 0000000..c806fc5 --- /dev/null +++ b/testing/cleanup_guards.cu @@ -0,0 +1,253 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +#include + +#include +#include +#include +#include + +#include "test_asserts.cuh" + +namespace +{ + +enum class action +{ + flush_device_l2, + sync_stream, + sync_stream_noexcept, + cpu_timer_start, + cpu_timer_stop, + cpu_timer_stop_noexcept, + block_stream, + unblock_stream, + unblock_stream_noexcept, + gpu_frequency_start, + gpu_frequency_stop, + profiler_start, + profiler_stop, + profiler_stop_noexcept, + cuda_timer_start, + cuda_timer_stop, +}; + +struct fake_measure +{ + void clear_actions() noexcept + { + action_count = 0; + overflow = false; + } + + void throw_on(action a) noexcept + { + should_throw = true; + throw_action = a; + } + + void record(action a) noexcept + { + if (action_count < actions.size()) + { + actions[action_count++] = a; + } + else + { + overflow = true; + } + } + + void record_or_throw(action a) + { + this->record(a); + if (should_throw && throw_action == a) + { + should_throw = false; + throw std::runtime_error{"Injected fake_measure failure."}; + } + } + + void flush_device_l2() { this->record_or_throw(action::flush_device_l2); } + void sync_stream() { this->record_or_throw(action::sync_stream); } + int sync_stream_noexcept() noexcept + { + this->record(action::sync_stream_noexcept); + return 0; + } + + void cpu_timer_start() noexcept { this->record(action::cpu_timer_start); } + void cpu_timer_stop() noexcept { this->record(action::cpu_timer_stop); } + void cpu_timer_stop_noexcept() noexcept { this->record(action::cpu_timer_stop_noexcept); } + + void block_stream() { this->record_or_throw(action::block_stream); } + void unblock_stream() { this->record_or_throw(action::unblock_stream); } + void unblock_stream_noexcept() noexcept { this->record(action::unblock_stream_noexcept); } + + void gpu_frequency_start() { this->record_or_throw(action::gpu_frequency_start); } + void gpu_frequency_stop() { this->record_or_throw(action::gpu_frequency_stop); } + + void profiler_start() { this->record_or_throw(action::profiler_start); } + void profiler_stop() { this->record_or_throw(action::profiler_stop); } + int profiler_stop_noexcept() noexcept + { + this->record(action::profiler_stop_noexcept); + return 0; + } + + void cuda_timer_start() { this->record_or_throw(action::cuda_timer_start); } + void cuda_timer_stop() { this->record_or_throw(action::cuda_timer_stop); } + + std::array actions{}; + std::size_t action_count{}; + action throw_action{}; + bool should_throw{false}; + bool overflow{false}; +}; + +template +void assert_throws(Callable &&callable) +{ + bool threw = false; + try + { + callable(); + } + catch (const std::runtime_error &) + { + threw = true; + } + ASSERT(threw); +} + +void assert_actions(const fake_measure &measure, std::initializer_list expected) +{ + ASSERT(!measure.overflow); + ASSERT(measure.action_count == expected.size()); + + std::size_t index = 0; + for (const action expected_action : expected) + { + ASSERT(measure.actions[index] == expected_action); + ++index; + } +} + +void test_stream_cleanup_guard_block_stream_throw() +{ + fake_measure measure; + measure.throw_on(action::block_stream); + + assert_throws([&measure] { + nvbench::detail::stream_cleanup_guard cleanup{measure}; + cleanup.block_stream(); + }); + + assert_actions( + measure, + {action::block_stream, action::unblock_stream_noexcept, action::sync_stream_noexcept}); +} + +void test_stream_cleanup_guard_unblock_then_throw() +{ + fake_measure measure; + + assert_throws([&measure] { + nvbench::detail::stream_cleanup_guard cleanup{measure}; + cleanup.block_stream(); + cleanup.unblock(); + throw std::runtime_error{"Injected post-unblock failure."}; + }); + + assert_actions(measure, + {action::block_stream, action::unblock_stream, action::sync_stream_noexcept}); +} + +void test_kernel_launch_timer_block_stream_throw() +{ + fake_measure measure; + measure.throw_on(action::block_stream); + + assert_throws([&measure] { + nvbench::detail::measure_cold_launch_timer_core timer{ + measure, + nvbench::detail::measure_cold_launch_timer_config{false, false, true}}; + timer.start(); + }); + + assert_actions(measure, + {action::flush_device_l2, + action::sync_stream, + action::cpu_timer_start, + action::block_stream, + action::unblock_stream_noexcept, + action::sync_stream_noexcept, + action::cpu_timer_stop_noexcept}); +} + +void test_kernel_launch_timer_gpu_frequency_start_throw() +{ + fake_measure measure; + measure.throw_on(action::gpu_frequency_start); + + assert_throws([&measure] { + nvbench::detail::measure_cold_launch_timer_core timer{ + measure, + nvbench::detail::measure_cold_launch_timer_config{false, false, true}}; + timer.start(); + }); + + assert_actions(measure, + {action::flush_device_l2, + action::sync_stream, + action::cpu_timer_start, + action::block_stream, + action::gpu_frequency_start, + action::unblock_stream_noexcept, + action::sync_stream_noexcept, + action::cpu_timer_stop_noexcept}); +} + +void test_kernel_launch_timer_gpu_frequency_stop_throw() +{ + fake_measure measure; + nvbench::detail::measure_cold_launch_timer_core timer{ + measure, + nvbench::detail::measure_cold_launch_timer_config{false, false, true}}; + + timer.start(); + measure.clear_actions(); + measure.throw_on(action::gpu_frequency_stop); + + assert_throws([&timer] { timer.stop(); }); + + assert_actions(measure, + {action::cuda_timer_stop, + action::gpu_frequency_stop, + action::unblock_stream_noexcept, + action::sync_stream_noexcept, + action::cpu_timer_stop_noexcept}); +} + +} // namespace + +int main() +try +{ + test_stream_cleanup_guard_block_stream_throw(); + test_stream_cleanup_guard_unblock_then_throw(); + test_kernel_launch_timer_block_stream_throw(); + test_kernel_launch_timer_gpu_frequency_start_throw(); + test_kernel_launch_timer_gpu_frequency_stop_throw(); + + return 0; +} +catch (std::exception &e) +{ + fmt::print("{}\n", e.what()); + return 1; +}