Add initial implementation of exec_tag dispatching.

nvbench::exec_tags are used to request measurement types and share information about the kernel. They are used to ensure that templated measurement code is not instantiated unless actually used. Replaces the nvbench::exec(state, launcher, tags) pattern with: state.exec(tags, launcher); state.exec(launcher); // defaults to hot/cold cuda measurements
2026-04-20 14:58:54 +00:00 · 2021-02-16 23:47:36 -05:00
parent 37e753f7b6
commit f61be70a93
8 changed files with 243 additions and 52 deletions
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -5,8 +5,8 @@
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_timer.cuh>
 #include <nvbench/device_info.cuh>
+#include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
-#include <nvbench/state.cuh>

 #include <nvbench/detail/l2flush.cuh>
 #include <nvbench/detail/statistics.cuh>
@@ -17,7 +17,12 @@
 #include <utility>
 #include <vector>

-namespace nvbench::detail
+namespace nvbench
+{
+
+struct state;
+
+namespace detail
 {

 // non-templated code goes here:
@@ -75,9 +80,16 @@ protected:
  bool m_max_time_exceeded{};
 };

-template <typename KernelLauncher, bool DelayEventRecording = true>
+template <typename KernelLauncher, nvbench::detail::exec_flag ExecTagModifiers>
 struct measure_cold : public measure_cold_base
 {
+  static constexpr bool needs_timer_wrapper =
+    (ExecTagModifiers & nvbench::detail::exec_flag::timer) ==
+    nvbench::detail::exec_flag::none;
+  static constexpr bool use_blocking_kernel =
+    (ExecTagModifiers & nvbench::detail::exec_flag::no_block) ==
+    nvbench::detail::exec_flag::none;
+
  measure_cold(nvbench::state &state, KernelLauncher &kernel_launcher)
      : measure_cold_base(state)
      , m_kernel_launcher{kernel_launcher}
@@ -101,7 +113,7 @@ private:
    this->sync_stream();

    nvbench::blocking_kernel blocker;
-    if constexpr (DelayEventRecording)
+    if constexpr (use_blocking_kernel)
    {
      blocker.block(m_launch.get_stream());
    }
@@ -110,7 +122,7 @@ private:
    this->launch_kernel();
    m_cuda_timer.stop(m_launch.get_stream());

-    if constexpr (DelayEventRecording)
+    if constexpr (use_blocking_kernel)
    {
      blocker.unblock();
    }
@@ -128,7 +140,7 @@ private:
      this->flush_device_l2();
      this->sync_stream();

-      if constexpr (DelayEventRecording)
+      if constexpr (use_blocking_kernel)
      {
        blocker.block(m_launch.get_stream());
      }
@@ -141,7 +153,7 @@ private:
      this->launch_kernel();
      m_cuda_timer.stop(m_launch.get_stream());

-      if constexpr (DelayEventRecording)
+      if constexpr (use_blocking_kernel)
      {
        m_cpu_timer.start();
        blocker.unblock();
@@ -195,4 +207,5 @@ private:
  KernelLauncher &m_kernel_launcher;
 };

-} // namespace nvbench::detail
+} // namespace detail
+} // namespace nvbench
--- a/nvbench/detail/measure_hot.cuh
+++ b/nvbench/detail/measure_hot.cuh
@@ -4,14 +4,19 @@
 #include <nvbench/cpu_timer.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_timer.cuh>
+#include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
-#include <nvbench/state.cuh>

 #include <cuda_runtime.h>

 #include <utility>

-namespace nvbench::detail
+namespace nvbench
+{
+
+struct state;
+
+namespace detail
 {

 // non-templated code goes here to keep instantiation cost down:
@@ -58,9 +63,16 @@ protected:
  bool m_max_time_exceeded{false};
 };

-template <typename KernelLauncher, bool DelayEventRecording = true>
+template <typename KernelLauncher, nvbench::detail::exec_flag ExecTagModifiers>
 struct measure_hot : public measure_hot_base
 {
+  static constexpr bool needs_timer_wrapper =
+    (ExecTagModifiers & nvbench::detail::exec_flag::timer) ==
+    nvbench::detail::exec_flag::none;
+  static constexpr bool use_blocking_kernel =
+    (ExecTagModifiers & nvbench::detail::exec_flag::no_block) ==
+    nvbench::detail::exec_flag::none;
+
  measure_hot(nvbench::state &state, KernelLauncher &kernel_launcher)
      : measure_hot_base(state)
      , m_kernel_launcher{kernel_launcher}
@@ -82,7 +94,7 @@ private:
  {
    nvbench::blocking_kernel blocker;

-    if constexpr (DelayEventRecording)
+    if constexpr (use_blocking_kernel)
    {
      blocker.block(m_launch.get_stream());
    }
@@ -91,7 +103,7 @@ private:
    this->launch_kernel();
    m_cuda_timer.stop(m_launch.get_stream());

-    if constexpr (DelayEventRecording)
+    if constexpr (use_blocking_kernel)
    {
      blocker.unblock();
    }
@@ -116,7 +128,7 @@ private:
    {
      batch_size = std::max(batch_size, nvbench::int64_t{1});

-      if constexpr (DelayEventRecording)
+      if constexpr (use_blocking_kernel)
      {
        // Block stream until some work is queued.
        // Limit the number of kernel executions while blocked to prevent
@@ -192,4 +204,5 @@ private:
  KernelLauncher &m_kernel_launcher;
 };

-} // namespace nvbench::detail
+} // namespace detail
+} // namespace nvbench
--- a/nvbench/detail/state_exec.cuh
+++ b/nvbench/detail/state_exec.cuh
@@ -0,0 +1,61 @@
+#pragma once
+
+#ifndef NVBENCH_STATE_EXEC_GUARD
+#error "This is a private implementation header for state.cuh. " \
+       "Do not include it directly."
+#endif // NVBENCH_STATE_EXEC_GUARD
+
+#include <nvbench/exec_tag.cuh>
+#include <nvbench/state.cuh>
+
+#include <nvbench/detail/measure_cold.cuh>
+#include <nvbench/detail/measure_hot.cuh>
+
+#include <type_traits>
+
+namespace nvbench
+{
+
+template <typename ExecTags, typename KernelLauncher>
+void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
+{
+  using namespace nvbench::exec_tag::impl;
+  static_assert(is_exec_tag_v<ExecTags>,
+                "`ExecTags` argument must be a member (or combination of "
+                "members) from nvbench::exec_tag.");
+  if constexpr (!(tags & measure_mask))
+  { // No measurements requested -- add the default and rerun:
+    this->exec(tags | nvbench::exec_tag::default_tag,
+               std::forward<KernelLauncher>(kernel_launcher));
+    return;
+  }
+
+  if (this->is_skipped())
+  {
+    return;
+  }
+
+  static_assert(!(tags & timer), "Manual timer mode not implemented.");
+  static_assert(!(tags & cpu), "CPU-only measurements not implemented.");
+
+  using KL = std::remove_reference_t<KernelLauncher>;
+  constexpr auto modifiers = (tags & modifier_mask).flags;
+
+  // Each measurement is deliberately isolated in constexpr branches to
+  // avoid instantiating unused measurements.
+  if constexpr (tags & cold)
+  {
+    using measure_t = nvbench::detail::measure_cold<KL, modifiers>;
+    measure_t measure{*this, kernel_launcher};
+    measure();
+  }
+
+  if constexpr (tags & hot)
+  {
+    using measure_t = nvbench::detail::measure_hot<KL, modifiers>;
+    measure_t measure{*this, kernel_launcher};
+    measure();
+  }
+}
+
+} // namespace nvbench
--- a/nvbench/exec.cuh
+++ b/nvbench/exec.cuh
@@ -1,32 +0,0 @@
-#pragma once
-
-#include <nvbench/state.cuh>
-
-#include <nvbench/detail/measure_cold.cuh>
-#include <nvbench/detail/measure_hot.cuh>
-
-namespace nvbench
-{
-
-template <typename KernelLauncher>
-void exec(nvbench::state &exec_state, KernelLauncher &&kernel_launcher)
-{
-  using KL = std::remove_reference_t<KernelLauncher>;
-
-  if (exec_state.is_skipped())
-  {
-    return;
-  }
-
-  {
-    nvbench::detail::measure_cold<KL> cold{exec_state, kernel_launcher};
-    cold();
-  }
-
-  {
-    nvbench::detail::measure_hot<KL> hot{exec_state, kernel_launcher};
-    hot();
-  }
-}
-
-} // namespace nvbench
--- a/nvbench/exec_tag.cuh
+++ b/nvbench/exec_tag.cuh
@@ -0,0 +1,119 @@
+#pragma once
+
+#include <nvbench/flags.cuh>
+
+#include <type_traits>
+
+namespace nvbench::detail
+{
+
+// See the similarly named tags in nvbench::exec_tag:: for documentation.
+enum class exec_flag
+{
+  none = 0x0,
+
+  // Modifiers:
+  timer    = 0x1, // KernelLauncher uses manual timing
+  no_block = 0x2, // Disables use of `blocking_kernel`. Needed when KL syncs.
+
+  // Measurement types:
+  cold = 0x4,  // measure_cold
+  hot  = 0x8,  // measure_hot
+  cpu  = 0x10, // measure_cpu
+
+  // Masks:
+  modifier_mask = timer | no_block,
+  measure_mask  = cold | hot | cpu
+};
+
+} // namespace nvbench::detail
+
+NVBENCH_DECLARE_FLAGS(nvbench::detail::exec_flag)
+
+namespace nvbench::exec_tag
+{
+
+namespace impl
+{
+
+struct tag_base
+{};
+
+template <typename ExecTag>
+constexpr inline bool is_exec_tag_v = std::is_base_of_v<tag_base, ExecTag>;
+
+/// Base class for exec_tag functionality.
+/// This exists so that the `exec_flag`s can be embedded in a type with flag
+/// semantics. This allows state::exec to only instantiate the measurements
+/// that are actually used.
+template <nvbench::detail::exec_flag Flags>
+struct tag
+    : std::integral_constant<nvbench::detail::exec_flag, Flags>
+    , tag_base
+{
+  static constexpr nvbench::detail::exec_flag flags = Flags;
+
+  template <nvbench::detail::exec_flag OFlags>
+  constexpr auto operator|(tag<OFlags>) const
+  {
+    return tag<Flags | OFlags>{};
+  }
+
+  template <nvbench::detail::exec_flag OFlags>
+  constexpr auto operator&(tag<OFlags>) const
+  {
+    return tag<Flags & OFlags>{};
+  }
+
+  constexpr auto operator~() const { return tag<~Flags>{}; }
+
+  constexpr operator bool() const // NOLINT(google-explicit-constructor)
+  {
+    return Flags != nvbench::detail::exec_flag::none;
+  }
+};
+
+using none_t          = tag<nvbench::detail::exec_flag::none>;
+using timer_t         = tag<nvbench::detail::exec_flag::timer>;
+using no_block_t      = tag<nvbench::detail::exec_flag::no_block>;
+using hot_t           = tag<nvbench::detail::exec_flag::hot>;
+using cold_t          = tag<nvbench::detail::exec_flag::cold>;
+using cpu_t           = tag<nvbench::detail::exec_flag::cpu>;
+using modifier_mask_t = tag<nvbench::detail::exec_flag::modifier_mask>;
+using measure_mask_t  = tag<nvbench::detail::exec_flag::measure_mask>;
+
+constexpr inline none_t none;
+constexpr inline timer_t timer;
+constexpr inline no_block_t no_block;
+constexpr inline cold_t cold;
+constexpr inline hot_t hot;
+constexpr inline cpu_t cpu;
+constexpr inline modifier_mask_t modifier_mask;
+constexpr inline measure_mask_t measure_mask;
+
+} // namespace impl
+
+/// Modifier used when only a portion of the KernelLauncher needs to be timed.
+/// Useful for resetting state in-between timed kernel launches.
+constexpr inline auto timer = nvbench::exec_tag::impl::timer;
+
+/// Modifier used to indicate that the KernelLauncher will perform CUDA
+/// synchronizations. Without this flag such benchmarks will deadlock.
+constexpr inline auto sync = nvbench::exec_tag::impl::no_block;
+
+/// Request Cold measurements.
+constexpr inline auto cold = nvbench::exec_tag::impl::cold;
+
+/// Request Hot measurements.
+constexpr inline auto hot = nvbench::exec_tag::impl::hot;
+
+/// Request CPU-only measurements.
+constexpr inline auto cpu = nvbench::exec_tag::impl::cpu;
+
+/// Requests hot and cold CUDA measurements with no modifiers.
+constexpr inline auto cuda = hot | cold;
+
+/// The default tag; used when none specified.
+constexpr inline auto default_tag = cuda;
+
+} // namespace nvbench::exec_tag
--- a/nvbench/flags.cuh
+++ b/nvbench/flags.cuh
@@ -3,22 +3,22 @@
 #include <type_traits>

 #define NVBENCH_DECLARE_FLAGS(T)                                               \
-  inline T operator|(T v1, T v2)                                               \
+  constexpr inline T operator|(T v1, T v2)                                     \
  {                                                                            \
    using UT = std::underlying_type_t<T>;                                      \
    return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2));          \
  }                                                                            \
-  inline T operator&(T v1, T v2)                                               \
+  constexpr inline T operator&(T v1, T v2)                                     \
  {                                                                            \
    using UT = std::underlying_type_t<T>;                                      \
    return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2));          \
  }                                                                            \
-  inline T operator^(T v1, T v2)                                               \
+  constexpr inline T operator^(T v1, T v2)                                     \
  {                                                                            \
    using UT = std::underlying_type_t<T>;                                      \
    return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2));          \
  }                                                                            \
-  inline T operator~(T v1)                                                     \
+  constexpr inline T operator~(T v1)                                           \
  {                                                                            \
    using UT = std::underlying_type_t<T>;                                      \
    return static_cast<T>(~static_cast<UT>(v1));                               \
--- a/nvbench/nvbench.cuh
+++ b/nvbench/nvbench.cuh
@@ -9,7 +9,7 @@
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_stream.cuh>
 #include <nvbench/cuda_timer.cuh>
-#include <nvbench/exec.cuh>
+#include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
 #include <nvbench/main.cuh>
 #include <nvbench/range.cuh>
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -1,6 +1,7 @@
 #pragma once

 #include <nvbench/device_info.cuh>
+#include <nvbench/exec_tag.cuh>
 #include <nvbench/named_values.cuh>
 #include <nvbench/summary.cuh>
 #include <nvbench/types.cuh>
@@ -158,6 +159,18 @@ struct state
  /// ```
  [[nodiscard]] std::string get_short_description() const;

+  // TODO This will need detailed docs and include a reference to an appropriate
+  // section of the user's guide
+  template <typename ExecTags, typename KernelLauncher>
+  void exec(ExecTags, KernelLauncher &&kernel_launcher);
+
+  template <typename KernelLauncher>
+  void exec(KernelLauncher &&kernel_launcher)
+  {
+    this->exec(nvbench::exec_tag::default_tag,
+               std::forward<KernelLauncher>(kernel_launcher));
+  }
+
 private:
  friend struct nvbench::detail::state_generator;
  friend struct nvbench::detail::state_tester;
@@ -188,3 +201,7 @@ private:
 };

 } // namespace nvbench
+
+#define NVBENCH_STATE_EXEC_GUARD
+#include <nvbench/detail/state_exec.cuh>
+#undef NVBENCH_STATE_EXEC_GUARD