From 84af4ea455b7a70e9ff43ca9383a865cff38d98d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 30 Apr 2026 13:32:02 -0500 Subject: [PATCH] Ensure that measure_cold::run_warmup instantiates blocking kernel Because warm-up runs are executed without use of the blocking kernel, the blocking kernel was not jitted until actual measurements were collected. The module loading cost incurred during the first run shows up as an elevated CPU time noise value for the first measurement, as noted in https://github.com/NVIDIA/nvbench/pull/339 This PR adds `this->block_stream(); this->unblock_stream();` prior to executing the warm-up loop with use of the blocking kernel disabled. This ensures that the blocking kernel is instantiated during the warm-up, but no other kernel is launched between its launch and the stream sync, thus avoiding a deadlock. --- nvbench/detail/measure_cold.cuh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh index f7dc1fc..f37d207 100644 --- a/nvbench/detail/measure_cold.cuh +++ b/nvbench/detail/measure_cold.cuh @@ -249,6 +249,11 @@ private: return; } + // Ensure blocking kernel is loaded during the warmup + // Ref: https://github.com/NVIDIA/nvbench/issues/339 + this->block_stream(); + this->unblock_stream(); + // disable use of blocking kernel for warm-up run // see https://github.com/NVIDIA/nvbench/issues/240 constexpr bool disable_blocking_kernel = true;