Add support for CPU-only benchmarking.

Fixes #95.

CPU-only mode is enabled by setting the `is_cpu_only` property while
defining a benchmark, e.g. `NVBENCH_BENCH(foo).set_is_cpu_only(true)`.

An optional `nvbench::exec_tag::no_gpu` hint can also be passed to
`state.exec` to avoid instantiating GPU benchmarking backends. Note that
a CUDA compiler and CUDA runtime are always required, even if all benchmarks
in a translation unit are CPU-only.

Similarly, a new `nvbench::exec_tag::gpu` hint can be used to avoid
compiling CPU-only backends for GPU benchmarks.
Allison Piper
2025-04-04 20:03:49 +00:00
parent 1efed5f8e1
commit a6df59a9b5
16 changed files with 780 additions and 89 deletions


@@ -25,6 +25,9 @@ features:
* Batch Measurements:
* Executes the benchmark multiple times back-to-back and records total time.
* Reports the average execution time (total time / number of executions).
* [CPU-only Measurements](docs/benchmarks.md#cpu-only-benchmarks)
* Measures the host-side execution time of a non-GPU benchmark.
* Not suitable for microbenchmarking.
# Supported Compilers and Tools
@@ -65,6 +68,7 @@ This repository provides a number of [examples](examples/) that demonstrate
various NVBench features and usecases:
- [Runtime and compile-time parameter sweeps](examples/axes.cu)
- [CPU-only benchmarking](examples/cpu_only.cu)
- [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
- [Skipping benchmark configurations](examples/skip.cu)
@@ -171,6 +175,7 @@ testing and parameter tuning of individual kernels. For in-depth analysis of
end-to-end performance of multiple applications, the NVIDIA Nsight tools are
more appropriate.
NVBench is focused on evaluating the performance of CUDA kernels. It also provides
CPU-only benchmarking facilities intended for non-trivial CPU workloads, but is
not optimized for CPU microbenchmarks. This may change in the future, but for now,
consider using Google Benchmark for high resolution CPU benchmarks.


@@ -4,7 +4,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++:
```cpp
void my_benchmark(nvbench::state& state) {
state.exec([](nvbench::launch& launch) {
my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
});
}
@@ -97,7 +97,7 @@ void benchmark(nvbench::state& state)
const auto num_inputs = state.get_int64("NumInputs");
thrust::device_vector<int> data = generate_input(num_inputs);
state.exec([&data](nvbench::launch& launch) {
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
});
}
@@ -134,7 +134,7 @@ void benchmark(nvbench::state& state)
const auto quality = state.get_float64("Quality");
state.exec([&quality](nvbench::launch& launch)
{
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(quality);
});
}
@@ -153,7 +153,7 @@ void benchmark(nvbench::state& state)
thrust::device_vector<int> data = generate_input(rng_dist);
state.exec([&data](nvbench::launch& launch)
{
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
});
}
@@ -182,7 +182,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T>)
thrust::device_vector<T> data = generate_input<T>();
state.exec([&data](nvbench::launch& launch)
{
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
});
}
@@ -266,7 +266,6 @@ In general::
More examples can be found in [examples/throughput.cu](../examples/throughput.cu).
# Skip Uninteresting / Invalid Benchmarks
Sometimes particular combinations of parameters aren't useful or interesting —
@@ -294,7 +293,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T, U>)
// Skip benchmarks at compile time -- for example, always skip when T == U
// (Note that the `type_list` argument defines the same type twice).
template <typename SameType>
void my_benchmark(nvbench::state& state,
nvbench::type_list<SameType, SameType>)
{
state.skip("T must not be the same type as U.");
@@ -320,6 +319,15 @@ true:
synchronize internally.
- `nvbench::exec_tag::timer` requests a timer object that can be used to
restrict the timed region.
- `nvbench::exec_tag::no_batch` disables batch measurements. This both disables
them during execution to reduce runtime, and prevents their compilation to
reduce compile-time and binary size.
- `nvbench::exec_tag::gpu` is an optional hint that prevents non-GPU benchmarking
code from being compiled for a particular benchmark. A runtime error is emitted
if the benchmark is defined with `set_is_cpu_only(true)`.
- `nvbench::exec_tag::no_gpu` is an optional hint that prevents GPU benchmarking
code from being compiled for a particular benchmark. A runtime error is emitted
if the benchmark does not also define `set_is_cpu_only(true)`.
Multiple execution tags may be combined using `operator|`, e.g.
@@ -370,7 +378,7 @@ Note that using manual timer mode disables batch measurements.
void timer_example(nvbench::state& state)
{
// Pass the `timer` exec tag to request a timer:
state.exec(nvbench::exec_tag::timer,
// Lambda now accepts a timer:
[](nvbench::launch& launch, auto& timer)
{
@@ -391,6 +399,79 @@ NVBENCH_BENCH(timer_example);
See [examples/exec_tag_timer.cu](../examples/exec_tag_timer.cu) for a complete
example.
## Compilation hints: `nvbench::exec_tag::no_batch`, `gpu`, and `no_gpu`
These execution tags are optional hints that disable the compilation of various
code paths when they are not needed. They apply only to a single benchmark.
- `nvbench::exec_tag::no_batch` prevents the execution and instantiation of the batch measurement backend.
- `nvbench::exec_tag::gpu` prevents the instantiation of CPU-only benchmarking backends.
- Requires that the benchmark does not define `set_is_cpu_only(true)`.
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
- Host-side CPU measurements of GPU kernel execution time are still provided.
- `nvbench::exec_tag::no_gpu` prevents the instantiation of GPU benchmarking backends.
- Requires that the benchmark defines `set_is_cpu_only(true)`.
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
- See also [CPU-only Benchmarks](#cpu-only-benchmarks).
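The `gpu` hint is not demonstrated elsewhere in this document; a minimal sketch might look like the following (the kernel name and launch configuration are placeholders, not part of the NVBench API):
```cpp
void my_gpu_benchmark(nvbench::state &state)
{
  state.exec(nvbench::exec_tag::gpu, // Skip compiling CPU-only backends
             [](nvbench::launch &launch) {
               my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
             });
}
NVBENCH_BENCH(my_gpu_benchmark); // Must NOT call set_is_cpu_only(true).
```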
# CPU-only Benchmarks
NVBench provides CPU-only benchmarking facilities that are intended for measuring
significant CPU workloads. We do not recommend using these features for high-resolution
CPU benchmarking -- other libraries (such as Google Benchmark) are more appropriate for
such applications. Examples are provided in [examples/cpu_only.cu](../examples/cpu_only.cu).
Note that NVBench still requires a CUDA compiler and runtime even if a project only contains
CPU-only benchmarks.
The `is_cpu_only` property of the benchmark toggles between GPU and CPU-only measurements:
```cpp
void my_cpu_benchmark(nvbench::state &state)
{
state.exec([](nvbench::launch &) { /* workload */ });
}
NVBENCH_BENCH(my_cpu_benchmark)
.set_is_cpu_only(true); // Mark as CPU-only.
```
The optional `nvbench::exec_tag::no_gpu` hint may be used to reduce the compilation time and
binary size of CPU-only benchmarks. An error is emitted at runtime if this tag is used while
`is_cpu_only` is false.
```cpp
void my_cpu_benchmark(nvbench::state &state)
{
state.exec(nvbench::exec_tag::no_gpu, // Prevent compilation of GPU backends
[](nvbench::launch &) { /* workload */ });
}
NVBENCH_BENCH(my_cpu_benchmark)
.set_is_cpu_only(true); // Mark as CPU-only.
```
The `nvbench::exec_tag::timer` execution tag is also supported by CPU-only benchmarks. This
is useful for benchmarks that require additional per-sample setup/teardown. See the
[`nvbench::exec_tag::timer`](#explicit-timer-mode-nvbenchexec_tagtimer) section for more
details.
```cpp
void my_cpu_benchmark(nvbench::state &state)
{
state.exec(nvbench::exec_tag::no_gpu | // Prevent compilation of GPU backends
nvbench::exec_tag::timer, // Request a timer object
[](nvbench::launch &, auto &timer)
{
// Setup here
timer.start();
// timed workload
timer.stop();
// teardown here
});
}
NVBENCH_BENCH(my_cpu_benchmark)
.set_is_cpu_only(true); // Mark as CPU-only.
```
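Under the hood, the CPU-only measurement keeps taking samples until enough have been collected or a walltime budget is exhausted. The following self-contained sketch mirrors that control flow in standard C++ (illustrative names only; the actual NVBench implementation also consults a configurable noise/stopping criterion):

```cpp
#include <chrono>
#include <vector>

// Simplified stand-in for NVBench's CPU-only measurement loop: sample a
// workload until `min_samples` samples are collected, or bail out once
// `timeout` seconds of walltime have elapsed.
struct cpu_only_loop
{
  int min_samples{10};
  double timeout{15.0}; // seconds of walltime
  std::vector<double> sample_times;
  bool max_time_exceeded{false};

  template <typename Workload>
  void run(Workload &&workload)
  {
    using clock = std::chrono::steady_clock;
    const auto wall_start = clock::now();
    do
    {
      const auto t0 = clock::now();
      workload();
      const auto t1 = clock::now();
      sample_times.push_back(std::chrono::duration<double>(t1 - t0).count());
    } while (!is_finished(wall_start));
  }

  bool is_finished(std::chrono::steady_clock::time_point wall_start)
  {
    if (static_cast<int>(sample_times.size()) >= min_samples)
    {
      return true; // a real implementation would also check a noise criterion
    }
    const double wall =
      std::chrono::duration<double>(std::chrono::steady_clock::now() - wall_start).count();
    if (wall > timeout)
    {
      max_time_exceeded = true; // triggers the timeout warnings described above
      return true;
    }
    return false;
  }
};
```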
# Beware: Combinatorial Explosion Is Lurking
Be very careful of how quickly the configuration space can grow. The following
@@ -403,7 +484,7 @@ using value_types = nvbench::type_list<nvbench::uint8_t,
nvbench::int32_t,
nvbench::float32_t,
nvbench::float64_t>;
using op_types = nvbench::type_list<thrust::plus<>,
thrust::multiplies<>,
thrust::maximum<>>;
@@ -418,7 +499,7 @@ NVBENCH_BENCH_TYPES(my_benchmark,
```
960 total configs
= 4 [T=(U8, I32, F32, F64)]
* 4 [U=(U8, I32, F32, F64)]
* 4 [V=(U8, I32, F32, F64)]
* 3 [Op=(plus, multiplies, max)]
@@ -427,8 +508,8 @@ NVBENCH_BENCH_TYPES(my_benchmark,
For large configuration spaces like this, pruning some of the less useful
combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques
described in the "Skip Uninteresting / Invalid Benchmarks" section can help
immensely with keeping compile / run times manageable.
described in the [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting--invalid-benchmarks)
section can help immensely with keeping compile / run times manageable.
Splitting a single large configuration space into multiple, more focused
benchmarks with reduced dimensionality will likely be worth the effort as well.


@@ -2,6 +2,7 @@ set(example_srcs
auto_throughput.cu
axes.cu
custom_criterion.cu
cpu_only.cu
enums.cu
exec_tag_sync.cu
exec_tag_timer.cu

examples/cpu_only.cu (new file, 83 lines)

@@ -0,0 +1,83 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
#include <chrono>
#include <thread>
// Block execution of the current CPU thread for `seconds` seconds.
void sleep_host(double seconds)
{
std::this_thread::sleep_for(
std::chrono::milliseconds(static_cast<nvbench::int64_t>(seconds * 1000)));
}
//=============================================================================
// Simple CPU-only benchmark that sleeps on host for a specified duration.
void simple(nvbench::state &state)
{
const auto duration = state.get_float64("Duration");
state.exec([duration](nvbench::launch &) { sleep_host(duration); });
}
NVBENCH_BENCH(simple)
// 100 -> 500 ms in 100 ms increments.
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
// Mark as CPU-only.
.set_is_cpu_only(true);
//=============================================================================
// Simple CPU-only benchmark that sleeps on host for a specified duration and
// uses a custom timed region.
void simple_timer(nvbench::state &state)
{
const auto duration = state.get_float64("Duration");
state.exec(nvbench::exec_tag::timer, [duration](nvbench::launch &, auto &timer) {
// Do any setup work before starting the timer here...
timer.start();
// The region of code to be timed:
sleep_host(duration);
timer.stop();
// Any per-run cleanup here...
});
}
NVBENCH_BENCH(simple_timer)
// 100 -> 500 ms in 100 ms increments.
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
// Mark as CPU-only.
.set_is_cpu_only(true);
//=============================================================================
// Simple CPU-only benchmark that uses the optional `nvbench::exec_tag::no_gpu`
// hint to prevent GPU measurement code from being instantiated. Note that
// `set_is_cpu_only(true)` is still required when using this hint.
void simple_no_gpu(nvbench::state &state)
{
const auto duration = state.get_float64("Duration");
state.exec(nvbench::exec_tag::no_gpu, [duration](nvbench::launch &) { sleep_host(duration); });
}
NVBENCH_BENCH(simple_no_gpu)
// 100 -> 500 ms in 100 ms increments.
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
// Mark as CPU-only.
.set_is_cpu_only(true);


@@ -25,6 +25,7 @@ set(srcs
detail/entropy_criterion.cxx
detail/measure_cold.cu
detail/measure_cpu_only.cxx
detail/measure_hot.cu
detail/state_generator.cxx
detail/stdrel_criterion.cxx


@@ -159,6 +159,16 @@ struct benchmark_base
}
/// @}
/// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
/// @{
[[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
benchmark_base &set_is_cpu_only(bool is_cpu_only)
{
m_is_cpu_only = is_cpu_only;
return *this;
}
/// @}
/// If true, the benchmark is only run once, skipping all warmup runs and only
/// executing a single non-batched measurement. This is intended for use with
/// external profiling tools. @{
@@ -263,6 +273,7 @@ protected:
optional_ref<nvbench::printer_base> m_printer;
bool m_is_cpu_only{false};
bool m_run_once{false};
bool m_disable_blocking_kernel{false};


@@ -34,12 +34,18 @@ std::unique_ptr<benchmark_base> benchmark_base::clone() const
result->m_axes = m_axes;
result->m_devices = m_devices;
result->m_printer = m_printer;
result->m_is_cpu_only = m_is_cpu_only;
result->m_run_once = m_run_once;
result->m_disable_blocking_kernel = m_disable_blocking_kernel;
result->m_min_samples = m_min_samples;
result->m_skip_time = m_skip_time;
result->m_timeout = m_timeout;
result->m_criterion_params = m_criterion_params;
result->m_stopping_criterion = m_stopping_criterion;
return result;


@@ -40,7 +40,10 @@ void benchmark_manager::initialize()
const auto& mgr = device_manager::get();
for (auto& bench : m_benchmarks)
{
bench->set_devices(mgr.get_devices());
if (!bench->get_is_cpu_only())
{
bench->set_devices(mgr.get_devices());
}
}
}


@@ -0,0 +1,141 @@
/*
* Copyright 2021-2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <nvbench/cpu_timer.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/statistics.cuh>
#include <utility>
#include <vector>
namespace nvbench
{
struct state;
namespace detail
{
// non-templated code goes here:
struct measure_cpu_only_base
{
explicit measure_cpu_only_base(nvbench::state &exec_state);
measure_cpu_only_base(const measure_cpu_only_base &) = delete;
measure_cpu_only_base(measure_cpu_only_base &&) = delete;
measure_cpu_only_base &operator=(const measure_cpu_only_base &) = delete;
measure_cpu_only_base &operator=(measure_cpu_only_base &&) = delete;
protected:
void check();
void initialize();
void run_trials_prologue();
void record_measurements();
bool is_finished();
void run_trials_epilogue();
void generate_summaries();
void check_skip_time(nvbench::float64_t warmup_time);
nvbench::state &m_state;
// Required to satisfy the KernelLauncher interface:
nvbench::launch m_launch;
nvbench::cpu_timer m_cpu_timer;
nvbench::cpu_timer m_walltime_timer;
nvbench::criterion_params m_criterion_params;
nvbench::stopping_criterion_base& m_stopping_criterion;
bool m_run_once{false};
nvbench::int64_t m_min_samples{};
nvbench::float64_t m_skip_time{};
nvbench::float64_t m_timeout{};
nvbench::int64_t m_total_samples{};
nvbench::float64_t m_total_cpu_time{};
nvbench::float64_t m_cpu_noise{}; // rel stdev
std::vector<nvbench::float64_t> m_cpu_times;
bool m_max_time_exceeded{};
};
template <typename KernelLauncher>
struct measure_cpu_only : public measure_cpu_only_base
{
measure_cpu_only(nvbench::state &state, KernelLauncher &kernel_launcher)
: measure_cpu_only_base(state)
, m_kernel_launcher{kernel_launcher}
{}
void operator()()
{
this->check();
this->initialize();
this->run_warmup();
this->run_trials_prologue();
this->run_trials();
this->run_trials_epilogue();
this->generate_summaries();
}
private:
// Run the kernel once, measuring the CPU time. If under skip_time, skip the
// measurement.
void run_warmup()
{
if (m_run_once)
{ // Skip warmups
return;
}
this->launch_kernel(m_cpu_timer);
this->check_skip_time(m_cpu_timer.get_duration());
}
void run_trials()
{
do
{
this->launch_kernel(m_cpu_timer);
this->record_measurements();
} while (!this->is_finished());
}
template <typename TimerT>
__forceinline__ void launch_kernel(TimerT &timer)
{
m_kernel_launcher(m_launch, timer);
}
KernelLauncher &m_kernel_launcher;
};
} // namespace detail
} // namespace nvbench


@@ -0,0 +1,246 @@
/*
* Copyright 2021-2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/benchmark_base.cuh>
#include <nvbench/criterion_manager.cuh>
#include <nvbench/detail/measure_cpu_only.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/printer_base.cuh>
#include <nvbench/state.cuh>
#include <nvbench/summary.cuh>
#include <fmt/format.h>
namespace nvbench::detail
{
measure_cpu_only_base::measure_cpu_only_base(state &exec_state)
: m_state{exec_state}
, m_launch(m_state.get_cuda_stream())
, m_criterion_params{exec_state.get_criterion_params()}
, m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
, m_run_once{exec_state.get_run_once()}
, m_min_samples{exec_state.get_min_samples()}
, m_skip_time{exec_state.get_skip_time()}
, m_timeout{exec_state.get_timeout()}
{
if (m_min_samples > 0)
{
m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
}
}
void measure_cpu_only_base::check()
{
// no-op
}
void measure_cpu_only_base::initialize()
{
m_total_cpu_time = 0.;
m_cpu_noise = 0.;
m_total_samples = 0;
m_cpu_times.clear();
m_max_time_exceeded = false;
m_stopping_criterion.initialize(m_criterion_params);
}
void measure_cpu_only_base::run_trials_prologue() { m_walltime_timer.start(); }
void measure_cpu_only_base::record_measurements()
{
// Update and record timers and counters:
const auto cur_cpu_time = m_cpu_timer.get_duration();
m_cpu_times.push_back(cur_cpu_time);
m_total_cpu_time += cur_cpu_time;
++m_total_samples;
m_stopping_criterion.add_measurement(cur_cpu_time);
}
bool measure_cpu_only_base::is_finished()
{
if (m_run_once)
{
return true;
}
// Check that we've gathered enough samples:
if (m_total_samples > m_min_samples)
{
if (m_stopping_criterion.is_finished())
{
return true;
}
}
// Check for timeouts:
m_walltime_timer.stop();
if (m_walltime_timer.get_duration() > m_timeout)
{
m_max_time_exceeded = true;
return true;
}
return false;
}
void measure_cpu_only_base::run_trials_epilogue()
{
// Only need to compute this at the end, not per iteration.
const auto cpu_mean = m_total_cpu_time / static_cast<nvbench::float64_t>(m_total_samples);
const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
m_cpu_times.cend(),
cpu_mean);
m_cpu_noise = cpu_stdev / cpu_mean;
m_walltime_timer.stop();
}
void measure_cpu_only_base::generate_summaries()
{
{
auto &summ = m_state.add_summary("nv/cpu_only/sample_size");
summ.set_string("name", "Samples");
summ.set_string("hint", "sample_size");
summ.set_string("description", "Number of isolated kernel executions");
summ.set_int64("value", m_total_samples);
}
const auto d_samples = static_cast<nvbench::float64_t>(m_total_samples);
const auto avg_cpu_time = m_total_cpu_time / d_samples;
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/mean");
summ.set_string("name", "CPU Time");
summ.set_string("hint", "duration");
summ.set_string("description",
"Mean isolated kernel execution time "
"(measured on host CPU)");
summ.set_float64("value", avg_cpu_time);
}
{
auto &summ = m_state.add_summary("nv/cpu_only/time/cpu/stdev/relative");
summ.set_string("name", "Noise");
summ.set_string("hint", "percentage");
summ.set_string("description", "Relative standard deviation of isolated CPU times");
summ.set_float64("value", m_cpu_noise);
}
if (const auto items = m_state.get_element_count(); items != 0)
{
auto &summ = m_state.add_summary("nv/cpu_only/bw/item_rate");
summ.set_string("name", "Elem/s");
summ.set_string("hint", "item_rate");
summ.set_string("description", "Number of input elements processed per second");
summ.set_float64("value", static_cast<double>(items) / avg_cpu_time);
}
if (const auto bytes = m_state.get_global_memory_rw_bytes(); bytes != 0)
{
const auto avg_used_gmem_bw = static_cast<double>(bytes) / avg_cpu_time;
{
auto &summ = m_state.add_summary("nv/cpu_only/bw/global/bytes_per_second");
summ.set_string("name", "GlobalMem BW");
summ.set_string("hint", "byte_rate");
summ.set_string("description",
"Number of bytes read/written per second.");
summ.set_float64("value", avg_used_gmem_bw);
}
} // bandwidth
{
auto &summ = m_state.add_summary("nv/cpu_only/walltime");
summ.set_string("name", "Walltime");
summ.set_string("hint", "duration");
summ.set_string("description", "Walltime used for isolated measurements");
summ.set_float64("value", m_walltime_timer.get_duration());
summ.set_string("hide", "Hidden by default.");
}
// Log if a printer exists:
if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
{
auto &printer = printer_opt_ref.value().get();
if (m_max_time_exceeded)
{
const auto timeout = m_walltime_timer.get_duration();
const auto max_noise = m_criterion_params.get_float64("max-noise");
const auto min_time = m_criterion_params.get_float64("min-time");
if (m_cpu_noise > max_noise)
{
printer.log(nvbench::log_level::warn,
fmt::format("Current measurement timed out ({:0.2f}s) "
"while over noise threshold ({:0.2f}% > "
"{:0.2f}%)",
timeout,
m_cpu_noise * 100,
max_noise * 100));
}
if (m_total_samples < m_min_samples)
{
printer.log(nvbench::log_level::warn,
fmt::format("Current measurement timed out ({:0.2f}s) "
"before accumulating min_samples ({} < {})",
timeout,
m_total_samples,
m_min_samples));
}
if (m_total_cpu_time < min_time)
{
printer.log(nvbench::log_level::warn,
fmt::format("Current measurement timed out ({:0.2f}s) "
"before accumulating min_time ({:0.2f}s < "
"{:0.2f}s)",
timeout,
m_total_cpu_time,
min_time));
}
}
// Log to stdout:
printer.log(nvbench::log_level::pass,
fmt::format("CpuOnly: {:0.6f}ms mean CPU, {:0.2f}s total CPU, "
"{:0.2f}s total wall, {}x ",
avg_cpu_time * 1e3,
m_total_cpu_time,
m_walltime_timer.get_duration(),
m_total_samples));
printer.process_bulk_data(m_state, "nv/cpu_only/sample_times", "sample_times", m_cpu_times);
}
}
void measure_cpu_only_base::check_skip_time(nvbench::float64_t warmup_time)
{
if (m_skip_time > 0. && warmup_time < m_skip_time)
{
auto reason = fmt::format("Warmup time did not meet skip_time limit: "
"{:0.3f}us < {:0.3f}us.",
warmup_time * 1e6,
m_skip_time * 1e6);
m_state.skip(reason);
NVBENCH_THROW(std::runtime_error, "{}", std::move(reason));
}
}
} // namespace nvbench::detail


@@ -24,15 +24,16 @@
#endif // NVBENCH_STATE_EXEC_GUARD
#include <nvbench/config.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/measure_cold.cuh>
#include <nvbench/detail/measure_cpu_only.cuh>
#include <nvbench/detail/measure_hot.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/state.cuh>
#ifdef NVBENCH_HAS_CUPTI
#include <nvbench/detail/measure_cupti.cuh>
#endif // NVBENCH_HAS_CUPTI
#include <type_traits>
@@ -45,21 +46,44 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
using KL = typename std::remove_reference<KernelLauncher>::type;
using namespace nvbench::exec_tag::impl;
static_assert(is_exec_tag_v<ExecTags>,
"`ExecTags` argument must be a member (or combination of "
"members) from nvbench::exec_tag.");
"`ExecTags` argument must be a member (or combination of members) from "
"`nvbench::exec_tag`.");
static_assert(!((tags & gpu) && (tags & no_gpu)),
"`nvbench::exec_tag::gpu` and `nvbench::exec_tag::no_gpu` are mutually "
"exclusive.");
constexpr auto modifier_tags = tags & modifier_mask;
constexpr auto measure_tags = tags & measure_mask;
if ((modifier_tags & no_gpu) && !this->get_is_cpu_only())
{
throw std::runtime_error("The `nvbench::exec_tag::no_gpu` tag requires that "
"`set_is_cpu_only(true)` is called when defining the benchmark.");
}
if ((modifier_tags & gpu) && this->get_is_cpu_only())
{
throw std::runtime_error("The `nvbench::exec_tag::gpu` tag requires that "
"`set_is_cpu_only(true)` is NOT called when defining the benchmark.");
}
// "run once" should disable batch measurements:
// TODO This should just be a runtime branch in measure_cold. Currently this causes two versions
// of measure_cold to be compiled. We don't expose the `run_once` tag to users, it should be
// removed.
// TODO CPU measurements should support run_once as well.
if (!(modifier_tags & run_once) && this->get_run_once())
{
constexpr auto run_once_tags = modifier_tags | run_once | (measure_tags & ~hot);
this->exec(run_once_tags, std::forward<KernelLauncher>(kernel_launcher));
return;
}
if (!(modifier_tags & no_block) && this->get_disable_blocking_kernel())
// TODO The `no_block` tag should be removed and replaced with a runtime branch in measure_cold
// and measure_hot. Currently this causes unnecesaary codegen. Note that the `sync` exec_tag
// implies `no_block` when refactoring.
if (!(measure_tags & cpu_only) && !(modifier_tags & no_block) &&
this->get_disable_blocking_kernel())
{
constexpr auto no_block_tags = tags | no_block;
this->exec(no_block_tags, std::forward<KernelLauncher>(kernel_launcher));
@@ -69,14 +93,32 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
// If no measurements selected, pick some defaults based on the modifiers:
if constexpr (!measure_tags)
{
if constexpr (modifier_tags & gpu)
{
if constexpr (modifier_tags & no_batch)
{
this->exec(cold | modifier_tags, std::forward<KernelLauncher>(kernel_launcher));
}
else
{
this->exec(cold | hot | modifier_tags, std::forward<KernelLauncher>(kernel_launcher));
}
}
else if constexpr (modifier_tags & no_gpu)
{
this->exec(cpu_only | modifier_tags, std::forward<KernelLauncher>(kernel_launcher));
}
else // Instantiate both CPU and GPU measurement code:
{
if constexpr (modifier_tags & no_batch)
{
this->exec(cold | cpu_only | modifier_tags, std::forward<KernelLauncher>(kernel_launcher));
}
else
{
this->exec(cold | hot | cpu_only | modifier_tags,
std::forward<KernelLauncher>(kernel_launcher));
}
}
return;
}
@@ -86,62 +128,90 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
return;
}
// Each measurement is deliberately isolated in constexpr branches to
// avoid instantiating unused measurements.
if (this->get_is_cpu_only())
{
  if constexpr (tags & cpu_only) // Prevent instantiation when not needed
  {
    static_assert(!(tags & gpu), "CPU-only measurement doesn't support the `gpu` exec_tag.");
    if constexpr (tags & timer)
    {
      using measure_t = nvbench::detail::measure_cpu_only<KL>;
      measure_t measure{*this, kernel_launcher};
      measure();
    }
    else
    { // Need to wrap the kernel launcher with a timer wrapper:
      using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
      wrapper_t wrapper{kernel_launcher};
      using measure_t = nvbench::detail::measure_cpu_only<wrapper_t>;
      measure_t measure(*this, wrapper);
      measure();
    }
  }
}
else
{
  if constexpr (tags & cold) // Prevent instantiation when not needed
  {
    static_assert(!(tags & no_gpu), "Cold measurement doesn't support the `no_gpu` exec_tag.");
    constexpr bool use_blocking_kernel = !(tags & no_block);
    if constexpr (tags & timer)
    {
      // Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
      if constexpr (!(modifier_tags & run_once))
      {
        if (this->is_cupti_required())
        {
          using measure_t = nvbench::detail::measure_cupti<KL>;
          measure_t measure{*this, kernel_launcher};
          measure();
        }
      }
#endif
      using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
      measure_t measure{*this, kernel_launcher};
      measure();
    }
    else
    { // Need to wrap the kernel launcher with a timer wrapper:
      using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
      wrapper_t wrapper{kernel_launcher};
      // Estimate bandwidth here
#ifdef NVBENCH_HAS_CUPTI
      if constexpr (!(modifier_tags & run_once))
      {
        if (this->is_cupti_required())
        {
          using measure_t = nvbench::detail::measure_cupti<wrapper_t>;
          measure_t measure{*this, wrapper};
          measure();
        }
      }
#endif
      using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
      measure_t measure(*this, wrapper);
      measure();
    }
  }
  if constexpr (tags & hot) // Prevent instantiation when not needed
  {
    static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag.");
    static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
    static_assert(!(tags & no_batch), "Hot measurement doesn't support the `no_batch` exec_tag.");
    static_assert(!(tags & no_gpu), "Hot measurement doesn't support the `no_gpu` exec_tag.");
    constexpr bool use_blocking_kernel = !(tags & no_block);
    using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
    measure_t measure{*this, kernel_launcher};
    measure();
  }
}


@@ -35,12 +35,17 @@ enum class exec_flag
no_block = 0x02, // Disables use of `blocking_kernel`.
sync = 0x04, // KernelLauncher has indicated that it will sync
run_once = 0x08, // Only run the benchmark once (for profiling).
gpu = 0x10, // Don't instantiate `measure_cpu_only`.
no_gpu = 0x20, // No GPU measurements should be instantiated.
no_batch = 0x40, // `measure_hot` will not be used.
modifier_mask = 0xFF,
// Measurement types to instantiate. Derived from modifiers.
// Should not be exposed directly via nvbench::exec_tag::<...>.
cold = 0x0100, // measure_cold
hot = 0x0200, // measure_hot
cpu_only = 0x0400, // measure_cpu_only
measure_mask = 0xFF00,
};
} // namespace nvbench::detail
@@ -95,9 +100,14 @@ using timer_t = tag<nvbench::detail::exec_flag::timer>;
using no_block_t = tag<nvbench::detail::exec_flag::no_block>;
using sync_t = tag<nvbench::detail::exec_flag::sync>;
using run_once_t = tag<nvbench::detail::exec_flag::run_once>;
using gpu_t = tag<nvbench::detail::exec_flag::gpu>;
using no_gpu_t = tag<nvbench::detail::exec_flag::no_gpu>;
using no_batch_t = tag<nvbench::detail::exec_flag::no_batch>;
using hot_t = tag<nvbench::detail::exec_flag::hot>;
using cold_t = tag<nvbench::detail::exec_flag::cold>;
using modifier_mask_t = tag<nvbench::detail::exec_flag::modifier_mask>;
using cpu_only_t = tag<nvbench::detail::exec_flag::cpu_only>;
using measure_mask_t = tag<nvbench::detail::exec_flag::measure_mask>;
constexpr inline none_t none;
@@ -105,9 +115,14 @@ constexpr inline timer_t timer;
constexpr inline no_block_t no_block;
constexpr inline sync_t sync;
constexpr inline run_once_t run_once;
constexpr inline gpu_t gpu;
constexpr inline no_gpu_t no_gpu;
constexpr inline no_batch_t no_batch;
constexpr inline cold_t cold;
constexpr inline hot_t hot;
constexpr inline modifier_mask_t modifier_mask;
constexpr inline cpu_only_t cpu_only;
constexpr inline measure_mask_t measure_mask;
} // namespace impl
@@ -116,13 +131,26 @@ constexpr inline auto none = nvbench::exec_tag::impl::none;
/// Modifier used when only a portion of the KernelLauncher needs to be timed.
/// Useful for resetting state in-between timed kernel launches.
constexpr inline auto timer = nvbench::exec_tag::impl::timer | //
nvbench::exec_tag::impl::no_batch;
/// Modifier used to indicate that the KernelGenerator will perform CUDA
/// synchronizations. Without this flag such benchmarks will deadlock.
constexpr inline auto sync = nvbench::exec_tag::impl::no_block | //
nvbench::exec_tag::impl::sync | //
nvbench::exec_tag::impl::no_batch;
/// Modifier used to indicate that batched measurements should be disabled
constexpr inline auto no_batch = nvbench::exec_tag::impl::no_batch;
/// Optional optimization for CPU-only benchmarks. Requires that `set_is_cpu_only(true)`
/// is called when defining the benchmark. Passing this exec_tag will ensure that
/// GPU measurement code is not instantiated.
constexpr inline auto no_gpu = nvbench::exec_tag::impl::no_gpu;
/// Optional optimization for GPU benchmarks. Requires that `set_is_cpu_only(true)`
/// is NOT called when defining the benchmark. Passing this exec_tag will prevent unused CPU-only
/// measurement code from being instantiated.
constexpr inline auto gpu = nvbench::exec_tag::impl::gpu;
} // namespace nvbench::exec_tag
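Taken together with the `set_is_cpu_only` property, the intended usage of the new hints looks roughly like this. This is a sketch based on the descriptions above, not compiled here; `sort_bench` and its body are made up:

```cpp
#include <algorithm>
#include <vector>
#include <nvbench/nvbench.cuh>

// Hypothetical CPU-only benchmark. `no_gpu` keeps the GPU measurement
// backends from being instantiated; it pairs with set_is_cpu_only(true).
void sort_bench(nvbench::state &state)
{
  std::vector<int> data(1 << 20, 42); // made-up input
  state.exec(nvbench::exec_tag::no_gpu, [&](nvbench::launch &) {
    std::stable_sort(data.begin(), data.end());
  });
}
NVBENCH_BENCH(sort_bench).set_is_cpu_only(true);
```

Conversely, a GPU benchmark can pass `nvbench::exec_tag::gpu` to keep the unused CPU-only backend from being compiled.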


@@ -227,7 +227,7 @@ void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_
const auto &devices = bench.get_devices();
const auto &axes = bench.get_axes();
fmt::format_to(std::back_inserter(buffer), "\n## {}\n\n", bench.get_name());
// Do a single pass when no devices are specified. This happens for
// benchmarks marked CPU-only via `set_is_cpu_only(true)`.
@@ -240,7 +240,10 @@ void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_
if (device)
{
fmt::format_to(std::back_inserter(buffer),
"### [{}] {}\n\n",
device->get_id(),
device->get_name());
}
std::size_t row = 0;


@@ -799,7 +799,10 @@ try
else
{
benchmark_base &bench = *m_benchmarks.back();
if (!bench.get_is_cpu_only())
{
bench.set_devices(device_vec);
}
}
m_recent_devices = std::move(device_vec);


@@ -71,6 +71,12 @@ struct state
/// nullopt for CPU-only benchmarks.
[[nodiscard]] const std::optional<nvbench::device_info> &get_device() const { return m_device; }
/// If true, the benchmark measurements only record CPU time and assume no GPU work is performed.
/// @{
// No setter, this should not be modified after construction, as it is a benchmark-wide property.
[[nodiscard]] bool get_is_cpu_only() const { return m_is_cpu_only; }
/// @}
/// An index into a benchmark::type_configs type_list. Returns 0 if no type
/// axes in the associated benchmark.
[[nodiscard]] std::size_t get_type_config_index() const { return m_type_config_index; }
@@ -293,6 +299,7 @@ private:
std::optional<nvbench::device_info> m_device;
std::size_t m_type_config_index{};
bool m_is_cpu_only{false};
bool m_run_once{false};
bool m_disable_blocking_kernel{false};


@@ -34,6 +34,7 @@ namespace nvbench
state::state(const benchmark_base &bench)
: m_benchmark{bench}
, m_is_cpu_only{bench.get_is_cpu_only()}
, m_run_once{bench.get_run_once()}
, m_disable_blocking_kernel{bench.get_disable_blocking_kernel()}
, m_criterion_params{bench.get_criterion_params()}
@@ -51,6 +52,7 @@ state::state(const benchmark_base &bench,
, m_axis_values{std::move(values)}
, m_device{std::move(device)}
, m_type_config_index{type_config_index}
, m_is_cpu_only{bench.get_is_cpu_only()}
, m_run_once{bench.get_run_once()}
, m_disable_blocking_kernel{bench.get_disable_blocking_kernel()}
, m_criterion_params{bench.get_criterion_params()}