Make use of the blocking kernel a runtime option.

It's not worth instantiating multiple instances of the measurement class to handle this. Since there's already a runtime option to disable the blocking kernel, the current implementation by default instantiates both the blocking and non-blocking versions of the algorithm for dynamic dispatch.
2026-04-20 06:48:53 +00:00 · 2025-04-08 12:56:04 -04:00
parent 52028be94f
commit 851d7aadd0
5 changed files with 33 additions and 41 deletions
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -34,9 +34,10 @@ measure_cold_base::measure_cold_base(state &exec_state)
    : m_state{exec_state}
    , m_launch{m_state.get_cuda_stream()}
    , m_criterion_params{exec_state.get_criterion_params()}
-    , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
+    , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(
+        exec_state.get_stopping_criterion())}
+    , m_disable_blocking_kernel{exec_state.get_disable_blocking_kernel()}
    , m_run_once{exec_state.get_run_once()}
-    , m_no_block{exec_state.get_disable_blocking_kernel()}
    , m_min_samples{exec_state.get_min_samples()}
    , m_skip_time{exec_state.get_skip_time()}
    , m_timeout{exec_state.get_timeout()}
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -54,8 +54,8 @@ struct measure_cold_base
  measure_cold_base &operator=(measure_cold_base &&)      = delete;

 protected:
-  template <bool use_blocking_kernel>
  struct kernel_launch_timer;
+  friend struct kernel_launch_timer;

  void check();
  void initialize();
@@ -89,8 +89,8 @@ protected:
  nvbench::criterion_params m_criterion_params;
  nvbench::stopping_criterion_base& m_stopping_criterion;

+  bool m_disable_blocking_kernel{false};
  bool m_run_once{false};
-  bool m_no_block{false};

  nvbench::int64_t m_min_samples{};

@@ -108,23 +108,23 @@ protected:
  bool m_max_time_exceeded{};
 };

-template <bool use_blocking_kernel>
 struct measure_cold_base::kernel_launch_timer
 {
  kernel_launch_timer(measure_cold_base &measure)
      : m_measure{measure}
+      , m_disable_blocking_kernel{measure.m_disable_blocking_kernel}
  {}

  __forceinline__ void start()
  {
    m_measure.flush_device_l2();
    m_measure.sync_stream();
-    if constexpr (use_blocking_kernel)
+    if (!m_disable_blocking_kernel)
    {
      m_measure.block_stream();
    }
    m_measure.m_cuda_timer.start(m_measure.m_launch.get_stream());
-    if constexpr (!use_blocking_kernel)
+    if (m_disable_blocking_kernel)
    {
      m_measure.m_cpu_timer.start();
    }
@@ -133,7 +133,7 @@ struct measure_cold_base::kernel_launch_timer
  __forceinline__ void stop()
  {
    m_measure.m_cuda_timer.stop(m_measure.m_launch.get_stream());
-    if constexpr (use_blocking_kernel)
+    if (!m_disable_blocking_kernel)
    {
      m_measure.m_cpu_timer.start();
      m_measure.unblock_stream();
@@ -144,9 +144,10 @@ struct measure_cold_base::kernel_launch_timer

 private:
  measure_cold_base &m_measure;
+  bool m_disable_blocking_kernel;
 };

-template <typename KernelLauncher, bool use_blocking_kernel>
+template <typename KernelLauncher>
 struct measure_cold : public measure_cold_base
 {
  measure_cold(nvbench::state &state, KernelLauncher &kernel_launcher)
@@ -177,7 +178,7 @@ private:
      return;
    }

-    kernel_launch_timer<use_blocking_kernel> timer(*this);
+    kernel_launch_timer timer(*this);

    this->launch_kernel(timer);
    this->check_skip_time(m_cuda_timer.get_duration());
@@ -185,7 +186,7 @@ private:

  void run_trials()
  {
-    kernel_launch_timer<use_blocking_kernel> timer(*this);
+    kernel_launch_timer timer(*this);
    do
    {
      this->launch_kernel(timer);
--- a/nvbench/detail/measure_hot.cuh
+++ b/nvbench/detail/measure_hot.cuh
@@ -80,10 +80,11 @@ protected:
  nvbench::int64_t m_total_samples{};
  nvbench::float64_t m_total_cuda_time{};

+  bool m_disable_blocking_kernel{false};
  bool m_max_time_exceeded{false};
 };

-template <typename KernelLauncher, bool use_blocking_kernel>
+template <typename KernelLauncher>
 struct measure_hot : public measure_hot_base
 {
  measure_hot(nvbench::state &state, KernelLauncher &kernel_launcher)
@@ -105,7 +106,7 @@ private:
  // measurement.
  void run_warmup()
  {
-    if constexpr (use_blocking_kernel)
+    if (!m_disable_blocking_kernel)
    {
      this->block_stream();
    }
@@ -114,7 +115,7 @@ private:
    this->launch_kernel();
    m_cuda_timer.stop(m_launch.get_stream());

-    if constexpr (use_blocking_kernel)
+    if (!m_disable_blocking_kernel)
    {
      this->unblock_stream();
    }
@@ -137,7 +138,7 @@ private:
    {
      batch_size = std::max(batch_size, nvbench::int64_t{1});

-      if constexpr (use_blocking_kernel)
+      if (!m_disable_blocking_kernel)
      {
        // Block stream until some work is queued.
        // Limit the number of kernel executions while blocked to prevent
--- a/nvbench/detail/state_exec.cuh
+++ b/nvbench/detail/state_exec.cuh
@@ -79,17 +79,6 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
    return;
  }

-  // TODO The `no_block` tag should be removed and replaced with a runtime branch in measure_cold
-  // and measure_hot. Currently this causes unnecesaary codegen. Note that the `sync` exec_tag
-  // implies `no_block` when refactoring.
-  if (!(measure_tags & cpu_only) && !(modifier_tags & no_block) &&
-      this->get_disable_blocking_kernel())
-  {
-    constexpr auto no_block_tags = tags | no_block;
-    this->exec(no_block_tags, std::forward<KernelLauncher>(kernel_launcher));
-    return;
-  }
-
  // If no measurements selected, pick some defaults based on the modifiers:
  if constexpr (!measure_tags)
  {
@@ -123,6 +112,12 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
    return;
  }

+  // Syncing will cause the blocking kernel pattern to deadlock:
+  if constexpr (modifier_tags & sync)
+  {
+    this->set_disable_blocking_kernel(true);
+  }
+
  if (this->is_skipped())
  {
    return;
@@ -157,7 +152,6 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
    {
      static_assert(!(tags & no_gpu), "Cold measurement doesn't support the `no_gpu` exec_tag.");

-      constexpr bool use_blocking_kernel = !(tags & no_block);
      if constexpr (tags & timer)
      {
 // Estimate bandwidth here
@@ -173,7 +167,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
        }
 #endif

-        using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
+        using measure_t = nvbench::detail::measure_cold<KL>;
        measure_t measure{*this, kernel_launcher};
        measure();
      }
@@ -195,7 +189,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
        }
 #endif

-        using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
+        using measure_t = nvbench::detail::measure_cold<wrapper_t>;
        measure_t measure(*this, wrapper);
        measure();
      }
@@ -207,8 +201,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
      static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
      static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag.");
      static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag.");
-      constexpr bool use_blocking_kernel = !(tags & no_block);
-      using measure_t                    = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
+      using measure_t = nvbench::detail::measure_hot<KL>;
      measure_t measure{*this, kernel_launcher};
      measure();
    }
--- a/nvbench/exec_tag.cuh
+++ b/nvbench/exec_tag.cuh
@@ -32,12 +32,11 @@ enum class exec_flag

  // Modifiers:
  timer         = 0x01, // KernelLauncher uses manual timing
-  no_block      = 0x02, // Disables use of `blocking_kernel`.
-  sync          = 0x04, // KernelLauncher has indicated that it will sync
-  run_once      = 0x08, // Only run the benchmark once (for profiling).
-  gpu           = 0x10, // Don't instantiate `measure_cpu_only`.
-  no_gpu        = 0x20, // No GPU measurements should be instantiated.
-  no_batch      = 0x40, // `measure_hot` will not be used.
+  sync          = 0x02, // KernelLauncher has indicated that it will sync
+  run_once      = 0x04, // Only run the benchmark once (for profiling).
+  gpu           = 0x08, // Don't instantiate `measure_cpu_only`.
+  no_gpu        = 0x10, // No GPU measurements should be instantiated.
+  no_batch      = 0x20, // `measure_hot` will not be used.
  modifier_mask = 0xFF,

  // Measurement types to instantiate. Derived from modifiers.
@@ -97,7 +96,6 @@ struct tag

 using none_t          = tag<nvbench::detail::exec_flag::none>;
 using timer_t         = tag<nvbench::detail::exec_flag::timer>;
-using no_block_t      = tag<nvbench::detail::exec_flag::no_block>;
 using sync_t          = tag<nvbench::detail::exec_flag::sync>;
 using run_once_t      = tag<nvbench::detail::exec_flag::run_once>;
 using gpu_t           = tag<nvbench::detail::exec_flag::gpu>;
@@ -112,7 +110,6 @@ using measure_mask_t  = tag<nvbench::detail::exec_flag::measure_mask>;

 constexpr inline none_t none;
 constexpr inline timer_t timer;
-constexpr inline no_block_t no_block;
 constexpr inline sync_t sync;
 constexpr inline run_once_t run_once;
 constexpr inline gpu_t gpu;
@@ -136,8 +133,7 @@ constexpr inline auto timer = nvbench::exec_tag::impl::timer | //

 /// Modifier used to indicate that the KernelGenerator will perform CUDA
 /// synchronizations. Without this flag such benchmarks will deadlock.
-constexpr inline auto sync = nvbench::exec_tag::impl::no_block | //
-                             nvbench::exec_tag::impl::sync |     //
+constexpr inline auto sync = nvbench::exec_tag::impl::sync | //
                             nvbench::exec_tag::impl::no_batch;

 /// Modifier used to indicate that batched measurements should be disabled