Remove run_once exec_tag.

Similar to `no_block`, this is a runtime variable that doesn't need to be encoded statically.
It was not exposed publicly and existed solely as an implementation detail of `state::exec`, introducing unnecessary complexity there.
This commit is contained in:
Allison Piper
2025-04-08 17:11:09 +00:00
parent 851d7aadd0
commit 35360614ed
2 changed files with 31 additions and 49 deletions

View File

@@ -45,6 +45,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
{
using KL = typename std::remove_reference<KernelLauncher>::type;
using namespace nvbench::exec_tag::impl;
static_assert(is_exec_tag_v<ExecTags>,
"`ExecTags` argument must be a member (or combination of members) from "
"`nvbench::exec_tag`.");
@@ -55,30 +56,6 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
constexpr auto modifier_tags = tags & modifier_mask;
constexpr auto measure_tags = tags & measure_mask;
if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
{
throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
"`set_is_cpu_only(true)` is called when defining the benchmark.");
}
if ((modifier_tags & gpu) && this->get_is_cpu_only())
{
throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
"`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
}
// "run once" should disable batch measurements:
// TODO This should just be a runtime branch in measure_cold. Currently this causes two versions
// of measure_cold to be compiled. We don't expose the `run_once` tag to users, it should be
// removed.
// TODO CPU measurements should support run_once as well.
if (!(modifier_tags & run_once) && this->get_run_once())
{
constexpr auto run_once_tags = modifier_tags | run_once | (measure_tags & ~hot);
this->exec(run_once_tags, std::forward<KernelLauncher>(kernel_launcher));
return;
}
// If no measurements selected, pick some defaults based on the modifiers:
if constexpr (!measure_tags)
{
@@ -112,6 +89,18 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
return;
}
if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
{
throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
"`set_is_cpu_only(true)` is called when defining the benchmark.");
}
if ((modifier_tags & gpu) && this->get_is_cpu_only())
{
throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
"`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
}
// Syncing will cause the blocking kernel pattern to deadlock:
if constexpr (modifier_tags & sync)
{
@@ -154,16 +143,12 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
if constexpr (tags & timer)
{
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
if constexpr (!(modifier_tags & run_once))
if (this->is_cupti_required() && !this->get_run_once())
{
if (this->is_cupti_required())
{
using measure_t = nvbench::detail::measure_cupti<KL>;
measure_t measure{*this, kernel_launcher};
measure();
}
using measure_t = nvbench::detail::measure_cupti<KL>;
measure_t measure{*this, kernel_launcher};
measure();
}
#endif
@@ -176,16 +161,12 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
wrapper_t wrapper{kernel_launcher};
// Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
if constexpr (!(modifier_tags & run_once))
if (this->is_cupti_required() && !this->get_run_once())
{
if (this->is_cupti_required())
{
using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
measure_t measure{*this, wrapper};
measure();
}
using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
measure_t measure{*this, wrapper};
measure();
}
#endif
@@ -201,9 +182,13 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag.");
static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag.");
using measure_t = nvbench::detail::measure_hot<KL>;
measure_t measure{*this, kernel_launcher};
measure();
if (!this->get_run_once())
{
using measure_t = nvbench::detail::measure_hot<KL>;
measure_t measure{*this, kernel_launcher};
measure();
}
}
}
}

View File

@@ -33,10 +33,9 @@ enum class exec_flag
// Modifiers:
timer = 0x01, // KernelLauncher uses manual timing
sync = 0x02, // KernelLauncher has indicated that it will sync
run_once = 0x04, // Only run the benchmark once (for profiling).
gpu = 0x08, // Don't instantiate `measure_cpu_only`.
no_gpu = 0x10, // No GPU measurements should be instantiated.
no_batch = 0x20, // `measure_hot` will not be used.
gpu = 0x04, // Don't instantiate `measure_cpu_only`.
no_gpu = 0x08, // No GPU measurements should be instantiated.
no_batch = 0x10, // `measure_hot` will not be used.
modifier_mask = 0xFF,
// Measurement types to instantiate. Derived from modifiers.
@@ -97,7 +96,6 @@ struct tag
using none_t = tag<nvbench::detail::exec_flag::none>;
using timer_t = tag<nvbench::detail::exec_flag::timer>;
using sync_t = tag<nvbench::detail::exec_flag::sync>;
using run_once_t = tag<nvbench::detail::exec_flag::run_once>;
using gpu_t = tag<nvbench::detail::exec_flag::gpu>;
using no_gpu_t = tag<nvbench::detail::exec_flag::no_gpu>;
using no_batch_t = tag<nvbench::detail::exec_flag::no_batch>;
@@ -111,7 +109,6 @@ using measure_mask_t = tag<nvbench::detail::exec_flag::measure_mask>;
constexpr inline none_t none;
constexpr inline timer_t timer;
constexpr inline sync_t sync;
constexpr inline run_once_t run_once;
constexpr inline gpu_t gpu;
constexpr inline no_gpu_t no_gpu;
constexpr inline no_batch_t no_batch;