Files
nvbench/testing/exception_safety.cu
Oleksandr Pavlyk 6dd27aedfd Fix exception safety (#358)
Improve exception safety of timer structs by using local scope guards to ensure that cleanup steps, such as signaling blocking kernel to unblock and making sure that the stream is synchronized are performed even launch object throws an exception.

Tests of exception safety were added.

--

* blocking_kernel.unblock_noexcept() noexcept method added

This decouples the logic of signaling to unblock from checking
of the timeout.

* Improve exception safely in kernel_launch_timer

Introduce noexcept cleanup methods. Place body of start()
and stop() methods in the try/catch block and execute
noexcept clean-up on exception before rethrowing.

* Improve exception safety of measure_hot

* Make sure that throwing methods call noexcept ones instead of duplicating functionality

* Use cleanup_guard in measure_cold_base::kernel_launch_timer

Replace try/catch pattern with cleaner use of cleanup_guard
class.

* cpu_timer::start, cpu_timer::stop methods marked noexcept

These methods do not throw, and marking them noexcept explicitly
makes it fine to call them from other noexcept methods, as such
cleanup_noexcept in measure_cold.

* Address remaining exception safety issue in measure_hot

* Renamed guard variables to reflect their purpose, apply arm-then-do to ops queueing kernels

Set m_block_stream_armed = true; before launching the kernel. Doing so signals
cleanup guard that stream must be unblocked, even if launching of the kernel failed.

Same for operation launching time-stamps kernel.

* Add testing/device/exception_safety.cu

This test add benchmark that throws. It verifies that it did not
time-out and control counters the benchmark maintains are at
the expected values.

* Refactor measurement cleanup guards for testability

Extract hot stream cleanup and cold launch timer cleanup into reusable
detail helpers. Keep measure_hot and measure_cold using those helpers through
thin adapters so the tested cleanup logic matches the production path.

Add driver-free cleanup guard tests using a fake measure object to verify
cleanup ordering when exceptions occur after blocking stream setup, after hot
unblock, and around cold GPU frequency start/stop paths.

* Implement cpu_timer_stop_noexcept in terms of cpu_timer_stop

The cpu_timer_stop is already noexcept by nature of implementation,
but we maintain cpu_timer_stop_noexcept method for symmetry with
other pairs sync_stream()/sync_stream_noexcept().

The cpu_timer_stop_noexcept() is implemented via cpu_timer_stop().
These methods are annotated __forceinline__, so the same code should be
generated.

* More readable initialization of bool members

* Moved exception_safety.cu back to testing/ folder

testing/device is reserved for tests that require locking
of GPU frequency per CMake option description.

* Fixed nitpick and bug it discovered

Changed testing/exception_safety.cu:237 so run_benchmark now iterates over every state
from bench.get_states() and checks each one is skipped with a reason
containing "requested".

That exposed a real runner behavior gap, so I also made a minimal fix in
nvbench/runner.cuh:120: after stop_runner_loop, remaining states are now explicitly
marked skipped with a reason instead of only printing a skip notification.

* Move static assertions (pertaining to cleanup guards) to
testing/cleanup_guards.cu

The CI failure with CTK 12.0 and certain version of GCC is caused
by OOM in cudafe++ process tripped by compiling instantiation
of contract verification on cold_launch_timer_probe struct.

As a work-around, this instantiation is excluded for CTK 12.0-12.6
2026-05-15 15:14:30 -05:00

204 lines
4.8 KiB
Plaintext

// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#include <nvbench/benchmark.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <nvbench/runner.cuh>
#include <nvbench/state.cuh>
#include <nvbench/type_list.cuh>
#include <nvbench/types.cuh>
#include <cuda_runtime.h>
#include <fmt/format.h>
#include <chrono>
#include <stdexcept>
#include <string>
#include "test_asserts.cuh"
namespace
{
__global__ void spin_kernel(nvbench::uint64_t target_cycles)
{
const auto start = static_cast<nvbench::uint64_t>(clock64());
while (static_cast<nvbench::uint64_t>(clock64()) - start < target_cycles)
{
}
}
constexpr nvbench::uint64_t spin_cycles = 100000;
enum class measurement_kind
{
cold,
hot,
};
enum class exception_kind
{
runtime_error,
stop_runner_loop,
};
struct test_control
{
measurement_kind measurement{measurement_kind::cold};
exception_kind exception{exception_kind::runtime_error};
int generator_calls{0};
int launcher_calls{0};
};
void synchronize_with_timeout_guard()
{
const auto start = std::chrono::steady_clock::now();
NVBENCH_CUDA_CALL(cudaDeviceSynchronize());
const auto elapsed = std::chrono::steady_clock::now() - start;
ASSERT_MSG(elapsed < std::chrono::seconds{5},
"cudaDeviceSynchronize took {} ms; stream cleanup may have leaked blocked work",
std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count());
}
void throw_requested_exception(exception_kind exception)
{
if (exception == exception_kind::stop_runner_loop)
{
throw nvbench::stop_runner_loop{"requested stop from exception-safety test"};
}
throw std::runtime_error{"requested throw from exception-safety test"};
}
void configure_state(nvbench::state &state)
{
state.set_min_samples(1);
state.set_timeout(0.01);
// Keep this below the CTest timeout. If cleanup fails to unblock the
// blocking kernel, the device-side timeout lets the elapsed-time assertion
// report the leak before CTest has to kill the process.
state.set_blocking_kernel_timeout(10.0);
}
void run_throwing_measurement(nvbench::state &state, test_control &control)
{
configure_state(state);
auto launcher = [&control](nvbench::launch &launch) {
++control.launcher_calls;
spin_kernel<<<1, 1, 0, launch.get_stream()>>>(spin_cycles);
// Let the warmup complete. The next launcher call happens under the cold
// or hot measurement cleanup path that this test is exercising.
if (control.launcher_calls > 1)
{
throw_requested_exception(control.exception);
}
};
if (control.measurement == measurement_kind::hot)
{
state.exec(nvbench::exec_tag::impl::hot, launcher);
}
else
{
state.exec(nvbench::exec_tag::impl::cold, launcher);
}
}
struct throwing_generator
{
test_control *control{};
void operator()(nvbench::state &state, nvbench::type_list<>) const
{
++control->generator_calls;
run_throwing_measurement(state, *control);
}
};
using benchmark_type = nvbench::benchmark<throwing_generator>;
void run_benchmark(test_control &control, bool add_axis = false)
{
benchmark_type bench{throwing_generator{&control}};
bench.add_device(0);
bench.set_min_samples(1);
bench.set_timeout(0.01);
bench.set_criterion_param_float64("min-time", 1e-6);
if (add_axis)
{
bench.add_int64_axis("Case", {0, 1, 2});
}
bench.run();
synchronize_with_timeout_guard();
const auto &states = bench.get_states();
ASSERT(!states.empty());
for (const auto &state : states)
{
ASSERT(state.is_skipped());
ASSERT(state.get_skip_reason().find("requested") != std::string::npos);
}
}
void test_cold_runtime_error_cleanup()
{
test_control control;
control.measurement = measurement_kind::cold;
control.exception = exception_kind::runtime_error;
run_benchmark(control);
ASSERT(control.generator_calls == 1);
ASSERT(control.launcher_calls == 2);
}
void test_hot_runtime_error_cleanup()
{
test_control control;
control.measurement = measurement_kind::hot;
control.exception = exception_kind::runtime_error;
run_benchmark(control);
ASSERT(control.generator_calls == 1);
ASSERT(control.launcher_calls == 2);
}
void test_stop_runner_loop_cleanup_and_skip_remaining()
{
test_control control;
control.measurement = measurement_kind::cold;
control.exception = exception_kind::stop_runner_loop;
run_benchmark(control, true);
ASSERT(control.generator_calls == 1);
ASSERT(control.launcher_calls == 2);
}
} // namespace
int main()
try
{
test_cold_runtime_error_cleanup();
test_hot_runtime_error_cleanup();
test_stop_runner_loop_cleanup_and_skip_remaining();
return 0;
}
catch (std::exception &e)
{
fmt::print("{}\n", e.what());
return 1;
}