mirror of
https://github.com/NVIDIA/nvbench.git
synced 2026-04-20 06:48:53 +00:00
Polishing up the proof of concept.
- Enable throughput stats
- Add cold measurements
- Print liveness/log messages while running trials.
- Improve markdown formatting
- Add nvbench::range
This commit is contained in:
@@ -12,6 +12,7 @@ set(srcs
|
||||
type_axis.cu
|
||||
|
||||
detail/markdown_format.cu
|
||||
detail/measure_cold.cu
|
||||
detail/measure_hot.cu
|
||||
detail/state_generator.cu
|
||||
)
|
||||
|
||||
49
nvbench/detail/l2flush.cuh
Normal file
49
nvbench/detail/l2flush.cuh
Normal file
@@ -0,0 +1,49 @@
|
||||
#pragma once
|
||||
|
||||
#include <nvbench/cuda_call.cuh>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
|
||||
struct l2flush
|
||||
{
|
||||
l2flush()
|
||||
: m_l2_buffer{nullptr}
|
||||
{
|
||||
int dev_id{};
|
||||
NVBENCH_CUDA_CALL(cudaGetDevice(&dev_id));
|
||||
NVBENCH_CUDA_CALL(
|
||||
cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
|
||||
if (m_l2_size > 0)
|
||||
{
|
||||
NVBENCH_CUDA_CALL(cudaMalloc(&m_l2_buffer, m_l2_size));
|
||||
}
|
||||
}
|
||||
|
||||
~l2flush()
|
||||
{
|
||||
if (m_l2_buffer)
|
||||
{
|
||||
NVBENCH_CUDA_CALL(cudaFree(m_l2_buffer));
|
||||
}
|
||||
}
|
||||
|
||||
void flush(cudaStream_t stream)
|
||||
{
|
||||
if (m_l2_size > 0)
|
||||
{
|
||||
NVBENCH_CUDA_CALL(cudaMemsetAsync(m_l2_buffer, 0, m_l2_size, stream));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int m_l2_size;
|
||||
int *m_l2_buffer;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
} // namespace nvbench
|
||||
@@ -124,7 +124,7 @@ void markdown_format::print()
|
||||
using T = std::decay_t<decltype(v)>;
|
||||
if constexpr (std::is_same_v<T, nvbench::float64_t>)
|
||||
{
|
||||
return fmt::format("{:7.5g}", v);
|
||||
return fmt::format("{:.5g}", v);
|
||||
}
|
||||
else if constexpr (std::is_same_v<T, std::string>)
|
||||
{
|
||||
@@ -136,22 +136,50 @@ void markdown_format::print()
|
||||
auto format_duration = [](nvbench::float64_t seconds) {
|
||||
if (seconds >= 1.) // 1+ sec
|
||||
{
|
||||
return fmt::format("{:5.2f} s", seconds);
|
||||
return fmt::format("{:.2f} s", seconds);
|
||||
}
|
||||
else if (seconds >= 1e-1) // 100+ ms.
|
||||
else if (seconds >= 1e-2) // 10+ ms.
|
||||
{
|
||||
return fmt::format("{:5.2f} ms", seconds * 1e3);
|
||||
return fmt::format("{:.2f} ms", seconds * 1e3);
|
||||
}
|
||||
else if (seconds >= 1e-4) // 100+ us.
|
||||
else if (seconds >= 1e-5) // 10+ us.
|
||||
{
|
||||
return fmt::format("{:5.2f} us", seconds * 1e6);
|
||||
return fmt::format("{:.2f} us", seconds * 1e6);
|
||||
}
|
||||
else
|
||||
{
|
||||
return fmt::format("{:5.2f} ns", seconds * 1e9);
|
||||
return fmt::format("{:.2f} ns", seconds * 1e9);
|
||||
}
|
||||
};
|
||||
|
||||
auto format_item_rate = [](nvbench::float64_t items_per_second) {
|
||||
return fmt::format("{:.3g}/sec", items_per_second);
|
||||
};
|
||||
|
||||
auto format_byte_rate = [](nvbench::float64_t bytes_per_second) {
|
||||
if (bytes_per_second >= 10. * 1024. * 1024. * 1024.) // 10 GiB/s
|
||||
{
|
||||
return fmt::format("{:.2f} GiB/s",
|
||||
bytes_per_second / (1024. * 1024. * 1024.));
|
||||
}
|
||||
else if (bytes_per_second >= 10. * 1024. * 1024.) // 10 MiB/s
|
||||
{
|
||||
return fmt::format("{:.2f} MiB/s", bytes_per_second / (1024. * 1024.));
|
||||
}
|
||||
else if (bytes_per_second >= 10. * 1024.) // 10 KiB/s.
|
||||
{
|
||||
return fmt::format("{:.2f} KiB/s", bytes_per_second / 1024.);
|
||||
}
|
||||
else
|
||||
{
|
||||
return fmt::format("{:.2f} B/s", bytes_per_second);
|
||||
}
|
||||
};
|
||||
|
||||
auto format_percentage = [](nvbench::float64_t percentage) {
|
||||
return fmt::format("{:.2f}%", percentage);
|
||||
};
|
||||
|
||||
auto &mgr = nvbench::benchmark_manager::get();
|
||||
for (const auto &bench_ptr : mgr.get_benchmarks())
|
||||
{
|
||||
@@ -178,12 +206,32 @@ void markdown_format::print()
|
||||
? summ.get_string("short_name")
|
||||
: summ.get_name();
|
||||
|
||||
if (summ.has_value("hint") && summ.get_string("hint") == "duration")
|
||||
std::string hint = summ.has_value("hint") ? summ.get_string("hint")
|
||||
: std::string{};
|
||||
if (hint == "duration")
|
||||
{
|
||||
table.add_cell(row,
|
||||
name,
|
||||
format_duration(summ.get_float64("value")));
|
||||
}
|
||||
else if (hint == "item_rate")
|
||||
{
|
||||
table.add_cell(row,
|
||||
name,
|
||||
format_item_rate(summ.get_float64("value")));
|
||||
}
|
||||
else if (hint == "byte_rate")
|
||||
{
|
||||
table.add_cell(row,
|
||||
name,
|
||||
format_byte_rate(summ.get_float64("value")));
|
||||
}
|
||||
else if (hint == "percentage")
|
||||
{
|
||||
table.add_cell(row,
|
||||
name,
|
||||
format_percentage(summ.get_float64("value")));
|
||||
}
|
||||
else
|
||||
{
|
||||
table.add_cell(row,
|
||||
|
||||
81
nvbench/detail/measure_cold.cu
Normal file
81
nvbench/detail/measure_cold.cu
Normal file
@@ -0,0 +1,81 @@
|
||||
#include <nvbench/detail/measure_cold.cuh>
|
||||
|
||||
#include <nvbench/benchmark_base.cuh>
|
||||
#include <nvbench/state.cuh>
|
||||
#include <nvbench/summary.cuh>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <variant>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
namespace detail
|
||||
{
|
||||
|
||||
void measure_cold_base::initialize()
|
||||
{
|
||||
m_cuda_time = 0.;
|
||||
m_cpu_time = 0.;
|
||||
m_num_trials = 0;
|
||||
}
|
||||
|
||||
void measure_cold_base::generate_summaries()
|
||||
{
|
||||
{
|
||||
auto &summ = m_state.add_summary("Number of Trials (Cold)");
|
||||
summ.set_string("short_name", "Cold Trials");
|
||||
summ.set_string("description",
|
||||
"Number of kernel executions in cold time measurements.");
|
||||
summ.set_int64("value", m_num_trials);
|
||||
}
|
||||
|
||||
const auto avg_cuda_time = m_cuda_time / m_num_trials;
|
||||
{
|
||||
auto &summ = m_state.add_summary("Average GPU Time (Cold)");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("short_name", "Cold GPU");
|
||||
summ.set_string("description",
|
||||
"Average isolated kernel execution time as measured "
|
||||
"by CUDA events.");
|
||||
summ.set_float64("value", avg_cuda_time);
|
||||
}
|
||||
|
||||
const auto avg_cpu_time = m_cpu_time / m_num_trials;
|
||||
{
|
||||
auto &summ = m_state.add_summary("Average CPU Time (Cold)");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("short_name", "Cold CPU");
|
||||
summ.set_string("description",
|
||||
"Average isolated kernel execution time observed "
|
||||
"from host.");
|
||||
summ.set_float64("value", avg_cpu_time);
|
||||
}
|
||||
|
||||
// Log to stdout:
|
||||
fmt::memory_buffer param_buffer;
|
||||
fmt::format_to(param_buffer, "");
|
||||
const auto &axis_values = m_state.get_axis_values();
|
||||
for (const auto &name : axis_values.get_names())
|
||||
{
|
||||
fmt::format_to(param_buffer, "{}=", name);
|
||||
std::visit([¶m_buffer](
|
||||
const auto &val) { fmt::format_to(param_buffer, "{} ", val); },
|
||||
axis_values.get_value(name));
|
||||
}
|
||||
|
||||
fmt::print("Benchmark {} Params: [ {}] Cold {:.6f} ms GPU, {:.6f} ms CPU, "
|
||||
"{}x\n",
|
||||
m_state.get_benchmark().get_name(),
|
||||
fmt::to_string(param_buffer),
|
||||
avg_cuda_time * 1e3,
|
||||
avg_cpu_time * 1e3,
|
||||
m_num_trials);
|
||||
std::fflush(stdout);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
} // namespace nvbench
|
||||
108
nvbench/detail/measure_cold.cuh
Normal file
108
nvbench/detail/measure_cold.cuh
Normal file
@@ -0,0 +1,108 @@
|
||||
#pragma once
|
||||
|
||||
#include <nvbench/cpu_timer.cuh>
|
||||
#include <nvbench/cuda_call.cuh>
|
||||
#include <nvbench/cuda_timer.cuh>
|
||||
#include <nvbench/launch.cuh>
|
||||
#include <nvbench/state.cuh>
|
||||
|
||||
#include <nvbench/detail/l2flush.cuh>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <utility>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
namespace detail
|
||||
{
|
||||
|
||||
// non-templated code goes here:
|
||||
struct measure_cold_base
|
||||
{
|
||||
explicit measure_cold_base(nvbench::state &exec_state)
|
||||
: m_state(exec_state)
|
||||
{}
|
||||
measure_cold_base(const measure_cold_base &) = delete;
|
||||
measure_cold_base(measure_cold_base &&) = delete;
|
||||
measure_cold_base &operator=(const measure_cold_base &) = delete;
|
||||
measure_cold_base &operator=(measure_cold_base &&) = delete;
|
||||
|
||||
protected:
|
||||
void initialize();
|
||||
|
||||
void generate_summaries();
|
||||
|
||||
nvbench::launch m_launch{};
|
||||
nvbench::cuda_timer m_cuda_timer{};
|
||||
nvbench::cpu_timer m_cpu_timer{};
|
||||
nvbench::detail::l2flush m_l2flush{};
|
||||
|
||||
// seconds:
|
||||
nvbench::float64_t m_min_time{1.};
|
||||
nvbench::float64_t m_cuda_time{};
|
||||
nvbench::float64_t m_cpu_time{};
|
||||
|
||||
nvbench::int64_t m_num_trials{};
|
||||
|
||||
nvbench::state &m_state;
|
||||
};
|
||||
|
||||
template <typename KernelLauncher>
|
||||
struct measure_cold : public measure_cold_base
|
||||
{
|
||||
measure_cold(nvbench::state &state, KernelLauncher &kernel_launcher)
|
||||
: measure_cold_base(state)
|
||||
, m_kernel_launcher{kernel_launcher}
|
||||
{}
|
||||
|
||||
void operator()()
|
||||
{
|
||||
this->initialize();
|
||||
this->run_warmup();
|
||||
this->run_trials();
|
||||
this->generate_summaries();
|
||||
}
|
||||
|
||||
private:
|
||||
void run_warmup()
|
||||
{
|
||||
m_l2flush.flush(m_launch.get_stream());
|
||||
this->launch_kernel();
|
||||
NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream()));
|
||||
}
|
||||
|
||||
void run_trials()
|
||||
{
|
||||
do
|
||||
{
|
||||
m_l2flush.flush(m_launch.get_stream());
|
||||
NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream()));
|
||||
|
||||
m_cuda_timer.start(m_launch.get_stream());
|
||||
m_cpu_timer.start();
|
||||
|
||||
this->launch_kernel();
|
||||
|
||||
m_cuda_timer.stop(m_launch.get_stream());
|
||||
|
||||
NVBENCH_CUDA_CALL(cudaStreamSynchronize(m_launch.get_stream()));
|
||||
m_cpu_timer.stop();
|
||||
|
||||
// TODO eventually these should also get logged in a vector for
|
||||
// statistical analysis.
|
||||
m_cuda_time += m_cuda_timer.get_duration();
|
||||
m_cpu_time += m_cpu_timer.get_duration();
|
||||
++m_num_trials;
|
||||
} while (std::max(m_cuda_time, m_cpu_time) < m_min_time);
|
||||
}
|
||||
|
||||
// TODO forceinline
|
||||
void launch_kernel() { m_kernel_launcher(m_launch); }
|
||||
|
||||
KernelLauncher &m_kernel_launcher;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
} // namespace nvbench
|
||||
@@ -1,8 +1,18 @@
|
||||
#include <nvbench/detail/measure_hot.cuh>

#include <nvbench/benchmark_base.cuh>
#include <nvbench/state.cuh>
#include <nvbench/summary.cuh>

#include <fmt/format.h>

#include <climits> // CHAR_BIT, used when computing peak memory bandwidth.
#include <cstdio>
#include <variant>

// note that these can be removed once there's a device_manager or some such:
#include <cuda_runtime_api.h>
#include <nvbench/cuda_call.cuh>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
@@ -12,32 +22,98 @@ namespace detail
|
||||
void measure_hot_base::generate_summaries()
|
||||
{
|
||||
{
|
||||
auto &summary = m_state.add_summary("Number of Trials (Hot)");
|
||||
summary.set_string("short_name", "Hot Trials");
|
||||
summary.set_string("description",
|
||||
"Number of kernel executions in hot time measurements.");
|
||||
summary.set_int64("value", m_num_trials);
|
||||
auto &summ = m_state.add_summary("Number of Trials (Hot)");
|
||||
summ.set_string("short_name", "Hot Trials");
|
||||
summ.set_string("description",
|
||||
"Number of kernel executions in hot time measurements.");
|
||||
summ.set_int64("value", m_num_trials);
|
||||
}
|
||||
|
||||
const auto avg_cuda_time = m_cuda_time / m_num_trials;
|
||||
{
|
||||
auto &summary = m_state.add_summary("Average GPU Time (Hot)");
|
||||
summary.set_string("hint", "duration");
|
||||
summary.set_string("short_name", "Hot GPU");
|
||||
summary.set_string("description",
|
||||
"Average back-to-back kernel execution time as measured "
|
||||
"by CUDA events.");
|
||||
summary.set_float64("value", m_cuda_time / m_num_trials);
|
||||
auto &summ = m_state.add_summary("Average GPU Time (Hot)");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("short_name", "Hot GPU");
|
||||
summ.set_string("description",
|
||||
"Average back-to-back kernel execution time as measured "
|
||||
"by CUDA events.");
|
||||
summ.set_float64("value", avg_cuda_time);
|
||||
}
|
||||
|
||||
const auto avg_cpu_time = m_cpu_time / m_num_trials;
|
||||
{
|
||||
auto &summary = m_state.add_summary("Average CPU Time (Hot)");
|
||||
summary.set_string("hint", "duration");
|
||||
summary.set_string("short_name", "Hot CPU");
|
||||
summary.set_string("description",
|
||||
"Average back-to-back kernel execution time observed "
|
||||
"from host.");
|
||||
summary.set_float64("value", m_cpu_time / m_num_trials);
|
||||
auto &summ = m_state.add_summary("Average CPU Time (Hot)");
|
||||
summ.set_string("hint", "duration");
|
||||
summ.set_string("short_name", "Hot CPU");
|
||||
summ.set_string("description",
|
||||
"Average back-to-back kernel execution time observed "
|
||||
"from host.");
|
||||
summ.set_float64("value", avg_cpu_time);
|
||||
}
|
||||
|
||||
if (const auto items = m_state.get_items_processed_per_launch(); items != 0)
|
||||
{
|
||||
auto &summ = m_state.add_summary("Item Throughput");
|
||||
summ.set_string("hint", "item_rate");
|
||||
summ.set_string("short_name", "Item Rate");
|
||||
summ.set_string("description", "Number of input items handled per second.");
|
||||
summ.set_float64("value", items / avg_cuda_time);
|
||||
}
|
||||
|
||||
if (const auto bytes = m_state.get_global_bytes_accessed_per_launch();
|
||||
bytes != 0)
|
||||
{
|
||||
const auto avg_used_gmem_bw = bytes / avg_cuda_time;
|
||||
{
|
||||
auto &summ = m_state.add_summary("Average Global Memory Throughput");
|
||||
summ.set_string("hint", "byte_rate");
|
||||
summ.set_string("short_name", "GlobalMemUse");
|
||||
summ.set_string("description",
|
||||
"Number of bytes read/written per second to the CUDA "
|
||||
"device's global memory.");
|
||||
summ.set_float64("value", avg_used_gmem_bw);
|
||||
}
|
||||
|
||||
// TODO cache this in a singleton somewhere.
|
||||
int dev_id{};
|
||||
cudaDeviceProp prop{};
|
||||
NVBENCH_CUDA_CALL(cudaGetDevice(&dev_id));
|
||||
NVBENCH_CUDA_CALL(cudaGetDeviceProperties(&prop, dev_id));
|
||||
// clock rate in khz, width in bits. Result in bytes/sec.
|
||||
const auto peak_gmem_bw = 2 * 1000. * prop.memoryClockRate * // (sec^-1)
|
||||
prop.memoryBusWidth / CHAR_BIT; // bytes
|
||||
|
||||
{
|
||||
auto &summ = m_state.add_summary("Percent Peak Global Memory Throughput");
|
||||
summ.set_string("hint", "percentage");
|
||||
summ.set_string("short_name", "PeakGMem");
|
||||
summ.set_string("description",
|
||||
"Global device memory throughput as a percentage of the "
|
||||
"device's peak bandwidth.");
|
||||
summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw * 100.);
|
||||
}
|
||||
}
|
||||
|
||||
// Log to stdout:
|
||||
fmt::memory_buffer param_buffer;
|
||||
fmt::format_to(param_buffer, "");
|
||||
const auto &axis_values = m_state.get_axis_values();
|
||||
for (const auto &name : axis_values.get_names())
|
||||
{
|
||||
fmt::format_to(param_buffer, "{}=", name);
|
||||
std::visit([¶m_buffer](
|
||||
const auto &val) { fmt::format_to(param_buffer, "{} ", val); },
|
||||
axis_values.get_value(name));
|
||||
}
|
||||
|
||||
fmt::print("Benchmark {} Params: [ {}] Hot {:.6f} ms GPU, {:.6f} ms CPU, "
|
||||
"{}x\n",
|
||||
m_state.get_benchmark().get_name(),
|
||||
fmt::to_string(param_buffer),
|
||||
avg_cuda_time * 1e3,
|
||||
avg_cpu_time * 1e3,
|
||||
m_num_trials);
|
||||
std::fflush(stdout);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include <nvbench/detail/state_generator.cuh>
|
||||
|
||||
#include <nvbench/benchmark_base.cuh>
|
||||
#include <nvbench/named_values.cuh>
|
||||
#include <nvbench/type_axis.cuh>
|
||||
|
||||
@@ -18,7 +19,7 @@ namespace detail
|
||||
{
|
||||
|
||||
std::vector<std::vector<nvbench::state>>
|
||||
state_generator::create(const axes_metadata &axes)
|
||||
state_generator::create(const benchmark_base &bench)
|
||||
{
|
||||
// Assemble states into a std::vector<std::vector<nvbench::state>>, where the
|
||||
// outer vector has one inner vector per type_config, and all configs in an
|
||||
@@ -27,6 +28,7 @@ state_generator::create(const axes_metadata &axes)
|
||||
// matching up states to kernel_generator instantiations much easier during
|
||||
// dispatch.
|
||||
|
||||
const axes_metadata& axes = bench.get_axes();
|
||||
// vector of all axes:
|
||||
const std::vector<std::unique_ptr<axis_base>> &axes_vec = axes.get_axes();
|
||||
|
||||
@@ -99,7 +101,7 @@ state_generator::create(const axes_metadata &axes)
|
||||
for (non_type_sg.init(); non_type_sg.iter_valid(); non_type_sg.next())
|
||||
{
|
||||
// Initialize each state with the current type_config:
|
||||
nvbench::state state{type_config};
|
||||
nvbench::state state{bench, type_config};
|
||||
// Add non-type parameters to state:
|
||||
for (const axis_index &axis_info : non_type_sg.get_current_indices())
|
||||
{
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
struct benchmark_base;
|
||||
namespace detail
|
||||
{
|
||||
|
||||
@@ -17,7 +17,7 @@ struct state_generator
|
||||
{
|
||||
|
||||
static std::vector<std::vector<nvbench::state>>
|
||||
create(const axes_metadata &axes);
|
||||
create(const benchmark_base &bench);
|
||||
|
||||
protected:
|
||||
struct axis_index
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <nvbench/detail/measure_cold.cuh>
|
||||
#include <nvbench/detail/measure_hot.cuh>
|
||||
|
||||
namespace nvbench
|
||||
@@ -9,8 +10,16 @@ template <typename KernelLauncher>
|
||||
void exec(nvbench::state &exec_state, KernelLauncher &&kernel_launcher)
|
||||
{
|
||||
using KL = std::remove_reference_t<KernelLauncher>;
|
||||
nvbench::detail::measure_hot<KL> hot{exec_state, kernel_launcher};
|
||||
hot();
|
||||
|
||||
{
|
||||
nvbench::detail::measure_cold<KL> cold{exec_state, kernel_launcher};
|
||||
cold();
|
||||
}
|
||||
|
||||
{
|
||||
nvbench::detail::measure_hot<KL> hot{exec_state, kernel_launcher};
|
||||
hot();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
#include <nvbench/exec.cuh>
|
||||
#include <nvbench/launch.cuh>
|
||||
#include <nvbench/main.cuh>
|
||||
#include <nvbench/range.cuh>
|
||||
#include <nvbench/state.cuh>
|
||||
#include <nvbench/type_list.cuh>
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
32
nvbench/range.cuh
Normal file
32
nvbench/range.cuh
Normal file
@@ -0,0 +1,32 @@
|
||||
#pragma once
|
||||
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <typename T>
|
||||
using range_output_t = std::conditional_t<std::is_floating_point_v<T>,
|
||||
nvbench::float64_t,
|
||||
nvbench::int64_t>;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
auto range(T start, T end, T stride = T{1})
|
||||
{
|
||||
using output_t = detail::range_output_t<T>;
|
||||
using result_t = std::vector<output_t>;
|
||||
result_t result;
|
||||
for (; start <= end; start += stride)
|
||||
{
|
||||
result.push_back(static_cast<output_t>(start));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace nvbench
|
||||
@@ -24,7 +24,7 @@ struct runner
|
||||
void generate_states()
|
||||
{
|
||||
m_benchmark.m_states =
|
||||
nvbench::detail::state_generator::create(m_benchmark.m_axes);
|
||||
nvbench::detail::state_generator::create(m_benchmark);
|
||||
}
|
||||
|
||||
void run()
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
namespace nvbench
|
||||
{
|
||||
|
||||
struct benchmark_base;
|
||||
|
||||
namespace detail
|
||||
{
|
||||
struct state_generator;
|
||||
@@ -45,6 +47,24 @@ struct state
|
||||
[[nodiscard]] const std::string &
|
||||
get_string(const std::string &axis_name) const;
|
||||
|
||||
void set_items_processed_per_launch(nvbench::int64_t items)
|
||||
{
|
||||
m_items_processed_per_launch = items;
|
||||
}
|
||||
nvbench::int64_t get_items_processed_per_launch() const
|
||||
{
|
||||
return m_items_processed_per_launch;
|
||||
}
|
||||
|
||||
void set_global_bytes_accessed_per_launch(nvbench::int64_t bytes)
|
||||
{
|
||||
m_global_bytes_accessed_per_launch = bytes;
|
||||
}
|
||||
nvbench::int64_t get_global_bytes_accessed_per_launch() const
|
||||
{
|
||||
return m_global_bytes_accessed_per_launch;
|
||||
}
|
||||
|
||||
void skip(std::string reason) { m_skip_reason = std::move(reason); }
|
||||
[[nodiscard]] bool is_skipped() const { return !m_skip_reason.empty(); }
|
||||
[[nodiscard]] const std::string &get_skip_reason() const
|
||||
@@ -57,6 +77,8 @@ struct state
|
||||
return m_axis_values;
|
||||
}
|
||||
|
||||
const benchmark_base &get_benchmark() const { return m_benchmark; }
|
||||
|
||||
summary &add_summary(std::string summary_name);
|
||||
summary &add_summary(summary s);
|
||||
[[nodiscard]] const summary &get_summary(std::string_view name) const;
|
||||
@@ -67,15 +89,21 @@ struct state
|
||||
protected:
|
||||
friend struct nvbench::detail::state_generator;
|
||||
|
||||
state() = default;
|
||||
|
||||
state(nvbench::named_values values)
|
||||
: m_axis_values{std::move(values)}
|
||||
explicit state(const benchmark_base &bench)
|
||||
: m_benchmark{bench}
|
||||
{}
|
||||
|
||||
state(const benchmark_base &bench, nvbench::named_values values)
|
||||
: m_benchmark{bench}
|
||||
, m_axis_values{std::move(values)}
|
||||
{}
|
||||
|
||||
const nvbench::benchmark_base &m_benchmark;
|
||||
nvbench::named_values m_axis_values;
|
||||
std::vector<nvbench::summary> m_summaries;
|
||||
std::string m_skip_reason;
|
||||
nvbench::int64_t m_items_processed_per_launch{};
|
||||
nvbench::int64_t m_global_bytes_accessed_per_launch{};
|
||||
};
|
||||
|
||||
} // namespace nvbench
|
||||
|
||||
@@ -1,15 +1,22 @@
|
||||
#include <nvbench/state.cuh>
|
||||
|
||||
#include <nvbench/benchmark.cuh>
|
||||
#include <nvbench/callable.cuh>
|
||||
#include <nvbench/summary.cuh>
|
||||
#include <nvbench/types.cuh>
|
||||
|
||||
#include "test_asserts.cuh"
|
||||
|
||||
// Mock up a benchmark for testing:
|
||||
void dummy_generator(nvbench::state &) {}
|
||||
NVBENCH_DEFINE_CALLABLE(dummy_generator, dummy_callable);
|
||||
using dummy_bench = nvbench::benchmark<dummy_callable>;
|
||||
|
||||
// Subclass to gain access to protected members for testing:
|
||||
struct state_tester : public nvbench::state
|
||||
{
|
||||
state_tester()
|
||||
: nvbench::state()
|
||||
state_tester(const nvbench::benchmark_base& bench)
|
||||
: nvbench::state{bench}
|
||||
{}
|
||||
|
||||
template <typename T>
|
||||
@@ -23,8 +30,10 @@ struct state_tester : public nvbench::state
|
||||
|
||||
void test_params()
|
||||
{
|
||||
dummy_bench bench;
|
||||
|
||||
// Build a state param by param
|
||||
state_tester state;
|
||||
state_tester state{bench};
|
||||
state.set_param("TestInt", nvbench::int64_t{22});
|
||||
state.set_param("TestFloat", nvbench::float64_t{3.14});
|
||||
state.set_param("TestString", "A String!");
|
||||
@@ -36,7 +45,8 @@ void test_params()
|
||||
|
||||
void test_summaries()
|
||||
{
|
||||
state_tester state;
|
||||
dummy_bench bench;
|
||||
state_tester state{bench};
|
||||
ASSERT(state.get_summaries().size() == 0);
|
||||
|
||||
{
|
||||
|
||||
@@ -2,11 +2,27 @@
|
||||
|
||||
#include <nvbench/axes_metadata.cuh>
|
||||
#include <nvbench/axis_base.cuh>
|
||||
#include <nvbench/benchmark.cuh>
|
||||
#include <nvbench/callable.cuh>
|
||||
|
||||
#include "test_asserts.cuh"
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
// Mock up a benchmark for testing:
|
||||
void dummy_generator(nvbench::state &) {}
|
||||
NVBENCH_DEFINE_CALLABLE(dummy_generator, dummy_callable);
|
||||
using dummy_bench = nvbench::benchmark<dummy_callable>;
|
||||
|
||||
using floats = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
|
||||
using ints = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
|
||||
using misc = nvbench::type_list<void, bool>;
|
||||
using type_axes = nvbench::type_list<floats, ints, misc>;
|
||||
template <typename F, typename I, typename M>
|
||||
void template_generator(nvbench::state &, nvbench::type_list<F, I, M>){};
|
||||
NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_generator, template_callable);
|
||||
using template_bench = nvbench::benchmark<template_callable, type_axes>;
|
||||
|
||||
struct state_generator_tester : nvbench::detail::state_generator
|
||||
{
|
||||
using nvbench::detail::state_generator::add_axis;
|
||||
@@ -124,16 +140,16 @@ void test_basic()
|
||||
|
||||
void test_create()
|
||||
{
|
||||
nvbench::axes_metadata axes;
|
||||
axes.add_float64_axis("Radians", {3.14, 6.28});
|
||||
axes.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
|
||||
axes.add_int64_axis("NumInputs",
|
||||
{10, 15, 20},
|
||||
nvbench::int64_axis_flags::power_of_two);
|
||||
axes.add_string_axis("Strategy", {"Recursive", "Iterative"});
|
||||
dummy_bench bench;
|
||||
bench.add_float64_axis("Radians", {3.14, 6.28});
|
||||
bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
|
||||
bench.add_int64_axis("NumInputs",
|
||||
{10, 15, 20},
|
||||
nvbench::int64_axis_flags::power_of_two);
|
||||
bench.add_string_axis("Strategy", {"Recursive", "Iterative"});
|
||||
|
||||
const std::vector<std::vector<nvbench::state>> states =
|
||||
nvbench::detail::state_generator::create(axes);
|
||||
nvbench::detail::state_generator::create(bench);
|
||||
|
||||
// Outer vector has one entry per type_config. There are no type axes, so
|
||||
// there's only one type_config:
|
||||
@@ -161,7 +177,7 @@ void test_create()
|
||||
"Strategy");
|
||||
|
||||
std::size_t type_config = 0;
|
||||
std::size_t config = 0;
|
||||
std::size_t config = 0;
|
||||
for (const auto &inner_states : states)
|
||||
{
|
||||
for (const nvbench::state &state : inner_states)
|
||||
@@ -224,26 +240,19 @@ void test_create()
|
||||
fmt::format("Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test));
|
||||
}
|
||||
|
||||
|
||||
void test_create_with_types()
|
||||
{
|
||||
using floats = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
|
||||
using ints = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
|
||||
using misc = nvbench::type_list<void, bool>;
|
||||
|
||||
using type_axes = nvbench::type_list<floats, ints, misc>;
|
||||
|
||||
nvbench::axes_metadata axes;
|
||||
axes.set_type_axes_names<type_axes>({"Floats", "Ints", "Misc"});
|
||||
axes.add_float64_axis("Radians", {3.14, 6.28});
|
||||
axes.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
|
||||
axes.add_int64_axis("NumInputs",
|
||||
{10, 15, 20},
|
||||
nvbench::int64_axis_flags::power_of_two);
|
||||
axes.add_string_axis("Strategy", {"Recursive", "Iterative"});
|
||||
template_bench bench;
|
||||
bench.set_type_axes_names({"Floats", "Ints", "Misc"});
|
||||
bench.add_float64_axis("Radians", {3.14, 6.28});
|
||||
bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
|
||||
bench.add_int64_axis("NumInputs",
|
||||
{10, 15, 20},
|
||||
nvbench::int64_axis_flags::power_of_two);
|
||||
bench.add_string_axis("Strategy", {"Recursive", "Iterative"});
|
||||
|
||||
const std::vector<std::vector<nvbench::state>> states =
|
||||
nvbench::detail::state_generator::create(axes);
|
||||
nvbench::detail::state_generator::create(bench);
|
||||
|
||||
// Outer vector has one entry per type_config
|
||||
// 2 (Floats) * 2 (Ints) * 2 (Misc) = 8 total type_configs
|
||||
@@ -257,8 +266,8 @@ void test_create_with_types()
|
||||
}
|
||||
|
||||
fmt::memory_buffer buffer;
|
||||
std::string table_format =
|
||||
"| {:^5} | {:^10} | {:^6} | {:^4} | {:^4} | {:^7} | {:^7} | {:^9} | {:^9} |\n";
|
||||
std::string table_format = "| {:^5} | {:^10} | {:^6} | {:^4} | {:^4} | {:^7} "
|
||||
"| {:^7} | {:^9} | {:^9} |\n";
|
||||
|
||||
fmt::format_to(buffer, "\n");
|
||||
fmt::format_to(buffer,
|
||||
@@ -274,7 +283,7 @@ void test_create_with_types()
|
||||
"Strategy");
|
||||
|
||||
std::size_t type_config = 0;
|
||||
std::size_t config = 0;
|
||||
std::size_t config = 0;
|
||||
for (const auto &inner_states : states)
|
||||
{
|
||||
for (const nvbench::state &state : inner_states)
|
||||
|
||||
Reference in New Issue
Block a user