Remove run_once exec_tag.

Similar to `no_block`, this is a runtime variable that doesn't need to be encoded statically. It was not exposed publicly and existed solely as an implementation detail of `state::exec`, where it introduced unnecessary complexity.
2026-03-14 20:27:24 +00:00 · 2025-04-08 17:11:09 +00:00
parent 851d7aadd0
commit 35360614ed
2 changed files with 31 additions and 49 deletions
--- a/nvbench/detail/state_exec.cuh
+++ b/nvbench/detail/state_exec.cuh
@@ -45,6 +45,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
 {
  using KL = typename std::remove_reference<KernelLauncher>::type;
  using namespace nvbench::exec_tag::impl;
+
  static_assert(is_exec_tag_v<ExecTags>,
                "`ExecTags` argument must be a member (or combination of members) from "
                "`nvbench::exec_tag`.");
@@ -55,30 +56,6 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
  constexpr auto modifier_tags = tags & modifier_mask;
  constexpr auto measure_tags  = tags & measure_mask;

-  if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
-  {
-    throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
-                             "`set_is_cpu_only(true)` is called when defining the benchmark.");
-  }
-
-  if ((modifier_tags & gpu) && this->get_is_cpu_only())
-  {
-    throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
-                             "`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
-  }
-
-  // "run once" should disable batch measurements:
-  // TODO This should just be a runtime branch in measure_cold. Currently this causes two versions
-  // of measure_cold to be compiled. We don't expose the `run_once` tag to users, it should be
-  // removed.
-  // TODO CPU measurements should support run_once as well.
-  if (!(modifier_tags & run_once) && this->get_run_once())
-  {
-    constexpr auto run_once_tags = modifier_tags | run_once | (measure_tags & ~hot);
-    this->exec(run_once_tags, std::forward<KernelLauncher>(kernel_launcher));
-    return;
-  }
-
  // If no measurements selected, pick some defaults based on the modifiers:
  if constexpr (!measure_tags)
  {
@@ -112,6 +89,18 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
    return;
  }

+  if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
+  {
+    throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
+                             "`set_is_cpu_only(true)` is called when defining the benchmark.");
+  }
+
+  if ((modifier_tags & gpu) && this->get_is_cpu_only())
+  {
+    throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
+                             "`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
+  }
+
  // Syncing will cause the blocking kernel pattern to deadlock:
  if constexpr (modifier_tags & sync)
  {
@@ -154,16 +143,12 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)

      if constexpr (tags & timer)
      {
-// Estimate bandwidth here
 #ifdef NVBENCH_HAS_CUPTI
-        if constexpr (!(modifier_tags & run_once))
+        if (this->is_cupti_required() && !this->get_run_once())
        {
-          if (this->is_cupti_required())
-          {
-            using measure_t = nvbench::detail::measure_cupti<KL>;
-            measure_t measure{*this, kernel_launcher};
-            measure();
-          }
+          using measure_t = nvbench::detail::measure_cupti<KL>;
+          measure_t measure{*this, kernel_launcher};
+          measure();
        }
 #endif

@@ -176,16 +161,12 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
        using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
        wrapper_t wrapper{kernel_launcher};

-// Estimate bandwidth here
 #ifdef NVBENCH_HAS_CUPTI
-        if constexpr (!(modifier_tags & run_once))
+        if (this->is_cupti_required() && !this->get_run_once())
        {
-          if (this->is_cupti_required())
-          {
-            using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
-            measure_t measure{*this, wrapper};
-            measure();
-          }
+          using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
+          measure_t measure{*this, wrapper};
+          measure();
        }
 #endif

@@ -201,9 +182,13 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
      static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
      static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag.");
      static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag.");
-      using measure_t = nvbench::detail::measure_hot<KL>;
-      measure_t measure{*this, kernel_launcher};
-      measure();
+
+      if (!this->get_run_once())
+      {
+        using measure_t = nvbench::detail::measure_hot<KL>;
+        measure_t measure{*this, kernel_launcher};
+        measure();
+      }
    }
  }
 }
--- a/nvbench/exec_tag.cuh
+++ b/nvbench/exec_tag.cuh
@@ -33,10 +33,9 @@ enum class exec_flag
  // Modifiers:
  timer         = 0x01, // KernelLauncher uses manual timing
  sync          = 0x02, // KernelLauncher has indicated that it will sync
-  run_once      = 0x04, // Only run the benchmark once (for profiling).
-  gpu           = 0x08, // Don't instantiate `measure_cpu_only`.
-  no_gpu        = 0x10, // No GPU measurements should be instantiated.
-  no_batch      = 0x20, // `measure_hot` will not be used.
+  gpu           = 0x04, // Don't instantiate `measure_cpu_only`.
+  no_gpu        = 0x08, // No GPU measurements should be instantiated.
+  no_batch      = 0x10, // `measure_hot` will not be used.
  modifier_mask = 0xFF,

  // Measurement types to instantiate. Derived from modifiers.
@@ -97,7 +96,6 @@ struct tag
 using none_t          = tag<nvbench::detail::exec_flag::none>;
 using timer_t         = tag<nvbench::detail::exec_flag::timer>;
 using sync_t          = tag<nvbench::detail::exec_flag::sync>;
-using run_once_t      = tag<nvbench::detail::exec_flag::run_once>;
 using gpu_t           = tag<nvbench::detail::exec_flag::gpu>;
 using no_gpu_t        = tag<nvbench::detail::exec_flag::no_gpu>;
 using no_batch_t      = tag<nvbench::detail::exec_flag::no_batch>;
@@ -111,7 +109,6 @@ using measure_mask_t  = tag<nvbench::detail::exec_flag::measure_mask>;
 constexpr inline none_t none;
 constexpr inline timer_t timer;
 constexpr inline sync_t sync;
-constexpr inline run_once_t run_once;
 constexpr inline gpu_t gpu;
 constexpr inline no_gpu_t no_gpu;
 constexpr inline no_batch_t no_batch;