mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-06-29 18:57:44 +00:00
Improve exception safety of timer structs by using local scope guards to ensure that cleanup steps, such as signaling blocking kernel to unblock and making sure that the stream is synchronized are performed even launch object throws an exception. Tests of exception safety were added. -- * blocking_kernel.unblock_noexcept() noexcept method added This decouples the logic of signaling to unblock from checking of the timeout. * Improve exception safely in kernel_launch_timer Introduce noexcept cleanup methods. Place body of start() and stop() methods in the try/catch block and execute noexcept clean-up on exception before rethrowing. * Improve exception safety of measure_hot * Make sure that throwing methods call noexcept ones instead of duplicating functionality * Use cleanup_guard in measure_cold_base::kernel_launch_timer Replace try/catch pattern with cleaner use of cleanup_guard class. * cpu_timer::start, cpu_timer::stop methods marked noexcept These methods do not throw, and marking them noexcept explicitly makes it fine to call them from other noexcept methods, as such cleanup_noexcept in measure_cold. * Address remaining exception safety issue in measure_hot * Renamed guard variables to reflect their purpose, apply arm-then-do to ops queueing kernels Set m_block_stream_armed = true; before launching the kernel. Doing so signals cleanup guard that stream must be unblocked, even if launching of the kernel failed. Same for operation launching time-stamps kernel. * Add testing/device/exception_safety.cu This test add benchmark that throws. It verifies that it did not time-out and control counters the benchmark maintains are at the expected values. * Refactor measurement cleanup guards for testability Extract hot stream cleanup and cold launch timer cleanup into reusable detail helpers. Keep measure_hot and measure_cold using those helpers through thin adapters so the tested cleanup logic matches the production path. Add driver-free cleanup guard tests using a fake measure object to verify cleanup ordering when exceptions occur after blocking stream setup, after hot unblock, and around cold GPU frequency start/stop paths. * Implement cpu_timer_stop_noexcept in terms of cpu_timer_stop The cpu_timer_stop is already noexcept by nature of implementation, but we maintain cpu_timer_stop_noexcept method for symmetry with other pairs sync_stream()/sync_stream_noexcept(). The cpu_timer_stop_noexcept() is implemented via cpu_timer_stop(). These methods are annotated __forceinline__, so the same code should be generated. * More readable initialization of bool members * Moved exception_safety.cu back to testing/ folder testing/device is reserved for tests that require locking of GPU frequency per CMake option description. * Fixed nitpick and bug it discovered Changed testing/exception_safety.cu:237 so run_benchmark now iterates over every state from bench.get_states() and checks each one is skipped with a reason containing "requested". That exposed a real runner behavior gap, so I also made a minimal fix in nvbench/runner.cuh:120: after stop_runner_loop, remaining states are now explicitly marked skipped with a reason instead of only printing a skip notification. * Move static assertions (pertaining to cleanup guards) to testing/cleanup_guards.cu The CI failure with CTK 12.0 and certain version of GCC is caused by OOM in cudafe++ process tripped by compiling instantiation of contract verification on cold_launch_timer_probe struct. As a work-around, this instantiation is excluded for CTK 12.0-12.6
204 lines
4.8 KiB
Plaintext
204 lines
4.8 KiB
Plaintext
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
#include <nvbench/benchmark.cuh>
|
|
#include <nvbench/cuda_call.cuh>
|
|
#include <nvbench/exec_tag.cuh>
|
|
#include <nvbench/launch.cuh>
|
|
#include <nvbench/runner.cuh>
|
|
#include <nvbench/state.cuh>
|
|
#include <nvbench/type_list.cuh>
|
|
#include <nvbench/types.cuh>
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
#include <fmt/format.h>
|
|
|
|
#include <chrono>
|
|
#include <stdexcept>
|
|
#include <string>
|
|
|
|
#include "test_asserts.cuh"
|
|
|
|
namespace
|
|
{
|
|
|
|
__global__ void spin_kernel(nvbench::uint64_t target_cycles)
|
|
{
|
|
const auto start = static_cast<nvbench::uint64_t>(clock64());
|
|
while (static_cast<nvbench::uint64_t>(clock64()) - start < target_cycles)
|
|
{
|
|
}
|
|
}
|
|
|
|
constexpr nvbench::uint64_t spin_cycles = 100000;
|
|
|
|
enum class measurement_kind
|
|
{
|
|
cold,
|
|
hot,
|
|
};
|
|
|
|
enum class exception_kind
|
|
{
|
|
runtime_error,
|
|
stop_runner_loop,
|
|
};
|
|
|
|
struct test_control
|
|
{
|
|
measurement_kind measurement{measurement_kind::cold};
|
|
exception_kind exception{exception_kind::runtime_error};
|
|
int generator_calls{0};
|
|
int launcher_calls{0};
|
|
};
|
|
|
|
void synchronize_with_timeout_guard()
|
|
{
|
|
const auto start = std::chrono::steady_clock::now();
|
|
NVBENCH_CUDA_CALL(cudaDeviceSynchronize());
|
|
const auto elapsed = std::chrono::steady_clock::now() - start;
|
|
|
|
ASSERT_MSG(elapsed < std::chrono::seconds{5},
|
|
"cudaDeviceSynchronize took {} ms; stream cleanup may have leaked blocked work",
|
|
std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count());
|
|
}
|
|
|
|
void throw_requested_exception(exception_kind exception)
|
|
{
|
|
if (exception == exception_kind::stop_runner_loop)
|
|
{
|
|
throw nvbench::stop_runner_loop{"requested stop from exception-safety test"};
|
|
}
|
|
|
|
throw std::runtime_error{"requested throw from exception-safety test"};
|
|
}
|
|
|
|
void configure_state(nvbench::state &state)
|
|
{
|
|
state.set_min_samples(1);
|
|
state.set_timeout(0.01);
|
|
|
|
// Keep this below the CTest timeout. If cleanup fails to unblock the
|
|
// blocking kernel, the device-side timeout lets the elapsed-time assertion
|
|
// report the leak before CTest has to kill the process.
|
|
state.set_blocking_kernel_timeout(10.0);
|
|
}
|
|
|
|
void run_throwing_measurement(nvbench::state &state, test_control &control)
|
|
{
|
|
configure_state(state);
|
|
|
|
auto launcher = [&control](nvbench::launch &launch) {
|
|
++control.launcher_calls;
|
|
spin_kernel<<<1, 1, 0, launch.get_stream()>>>(spin_cycles);
|
|
|
|
// Let the warmup complete. The next launcher call happens under the cold
|
|
// or hot measurement cleanup path that this test is exercising.
|
|
if (control.launcher_calls > 1)
|
|
{
|
|
throw_requested_exception(control.exception);
|
|
}
|
|
};
|
|
|
|
if (control.measurement == measurement_kind::hot)
|
|
{
|
|
state.exec(nvbench::exec_tag::impl::hot, launcher);
|
|
}
|
|
else
|
|
{
|
|
state.exec(nvbench::exec_tag::impl::cold, launcher);
|
|
}
|
|
}
|
|
|
|
struct throwing_generator
|
|
{
|
|
test_control *control{};
|
|
|
|
void operator()(nvbench::state &state, nvbench::type_list<>) const
|
|
{
|
|
++control->generator_calls;
|
|
run_throwing_measurement(state, *control);
|
|
}
|
|
};
|
|
|
|
using benchmark_type = nvbench::benchmark<throwing_generator>;
|
|
|
|
void run_benchmark(test_control &control, bool add_axis = false)
|
|
{
|
|
benchmark_type bench{throwing_generator{&control}};
|
|
bench.add_device(0);
|
|
bench.set_min_samples(1);
|
|
bench.set_timeout(0.01);
|
|
bench.set_criterion_param_float64("min-time", 1e-6);
|
|
if (add_axis)
|
|
{
|
|
bench.add_int64_axis("Case", {0, 1, 2});
|
|
}
|
|
|
|
bench.run();
|
|
|
|
synchronize_with_timeout_guard();
|
|
|
|
const auto &states = bench.get_states();
|
|
ASSERT(!states.empty());
|
|
for (const auto &state : states)
|
|
{
|
|
ASSERT(state.is_skipped());
|
|
ASSERT(state.get_skip_reason().find("requested") != std::string::npos);
|
|
}
|
|
}
|
|
|
|
void test_cold_runtime_error_cleanup()
|
|
{
|
|
test_control control;
|
|
control.measurement = measurement_kind::cold;
|
|
control.exception = exception_kind::runtime_error;
|
|
|
|
run_benchmark(control);
|
|
|
|
ASSERT(control.generator_calls == 1);
|
|
ASSERT(control.launcher_calls == 2);
|
|
}
|
|
|
|
void test_hot_runtime_error_cleanup()
|
|
{
|
|
test_control control;
|
|
control.measurement = measurement_kind::hot;
|
|
control.exception = exception_kind::runtime_error;
|
|
|
|
run_benchmark(control);
|
|
|
|
ASSERT(control.generator_calls == 1);
|
|
ASSERT(control.launcher_calls == 2);
|
|
}
|
|
|
|
void test_stop_runner_loop_cleanup_and_skip_remaining()
|
|
{
|
|
test_control control;
|
|
control.measurement = measurement_kind::cold;
|
|
control.exception = exception_kind::stop_runner_loop;
|
|
|
|
run_benchmark(control, true);
|
|
|
|
ASSERT(control.generator_calls == 1);
|
|
ASSERT(control.launcher_calls == 2);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
int main()
|
|
try
|
|
{
|
|
test_cold_runtime_error_cleanup();
|
|
test_hot_runtime_error_cleanup();
|
|
test_stop_runner_loop_cleanup_and_skip_remaining();
|
|
|
|
return 0;
|
|
}
|
|
catch (std::exception &e)
|
|
{
|
|
fmt::print("{}\n", e.what());
|
|
return 1;
|
|
}
|