Auto format

2026-05-11 08:50:03 +00:00 · 2022-08-18 22:09:24 +04:00
parent 87ce9ee576
commit 8f1152d4a2
74 changed files with 835 additions and 1479 deletions
--- a/nvbench/axes_metadata.cuh
+++ b/nvbench/axes_metadata.cuh
@@ -41,8 +41,8 @@ struct axes_metadata
  template <typename... TypeAxes>
  explicit axes_metadata(nvbench::type_list<TypeAxes...>);

-  axes_metadata()                 = default;
-  axes_metadata(axes_metadata &&) = default;
+  axes_metadata()                            = default;
+  axes_metadata(axes_metadata &&)            = default;
  axes_metadata &operator=(axes_metadata &&) = default;

  axes_metadata(const axes_metadata &);
@@ -58,20 +58,16 @@ struct axes_metadata

  void add_string_axis(std::string name, std::vector<std::string> data);

-  [[nodiscard]] const nvbench::int64_axis &
-  get_int64_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::int64_axis &get_int64_axis(std::string_view name) const;
  [[nodiscard]] nvbench::int64_axis &get_int64_axis(std::string_view name);

-  [[nodiscard]] const nvbench::float64_axis &
-  get_float64_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::float64_axis &get_float64_axis(std::string_view name) const;
  [[nodiscard]] nvbench::float64_axis &get_float64_axis(std::string_view name);

-  [[nodiscard]] const nvbench::string_axis &
-  get_string_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::string_axis &get_string_axis(std::string_view name) const;
  [[nodiscard]] nvbench::string_axis &get_string_axis(std::string_view name);

-  [[nodiscard]] const nvbench::type_axis &
-  get_type_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::type_axis &get_type_axis(std::string_view name) const;
  [[nodiscard]] nvbench::type_axis &get_type_axis(std::string_view name);

  [[nodiscard]] const nvbench::type_axis &get_type_axis(std::size_t index) const;
@@ -83,10 +79,9 @@ struct axes_metadata
  [[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name) const;
  [[nodiscard]] nvbench::axis_base &get_axis(std::string_view name);

-  [[nodiscard]] const nvbench::axis_base &
-  get_axis(std::string_view name, nvbench::axis_type type) const;
-  [[nodiscard]] nvbench::axis_base &get_axis(std::string_view name,
-                                             nvbench::axis_type type);
+  [[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name,
+                                                   nvbench::axis_type type) const;
+  [[nodiscard]] nvbench::axis_base &get_axis(std::string_view name, nvbench::axis_type type);

  [[nodiscard]] static std::vector<std::string>
  generate_default_type_axis_names(std::size_t num_type_axes);
@@ -101,7 +96,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
 {
  using type_axes_list         = nvbench::type_list<TypeAxes...>;
  constexpr auto num_type_axes = nvbench::tl::size<type_axes_list>::value;
-  auto names = axes_metadata::generate_default_type_axis_names(num_type_axes);
+  auto names                   = axes_metadata::generate_default_type_axis_names(num_type_axes);

  auto names_iter = names.begin(); // contents will be moved from
  nvbench::tl::foreach<type_axes_list>(
@@ -114,8 +109,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
      // The word "type" appears 6 times in the next line.
      // Every. Single. Token.
      typedef typename decltype(wrapped_type)::type type_list;
-      auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++),
-                                                       type_axis_index);
+      auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++), type_axis_index);
      axis->template set_inputs<type_list>();
      axes.push_back(std::move(axis));
    });
--- a/nvbench/axes_metadata.cxx
+++ b/nvbench/axes_metadata.cxx
@@ -64,9 +64,7 @@ try
    auto &axis = *m_axes[i];
    if (axis.get_type() != nvbench::axis_type::type)
    {
-      NVBENCH_THROW(std::runtime_error,
-                    "Number of names exceeds number of type axes ({})",
-                    i);
+      NVBENCH_THROW(std::runtime_error, "Number of names exceeds number of type axes ({})", i);
    }

    axis.set_name(std::move(names[i]));
@@ -81,8 +79,7 @@ catch (std::exception &e)
                names);
 }

-void axes_metadata::add_float64_axis(std::string name,
-                                     std::vector<nvbench::float64_t> data)
+void axes_metadata::add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
 {
  auto axis = std::make_unique<nvbench::float64_axis>(std::move(name));
  axis->set_inputs(std::move(data));
@@ -98,8 +95,7 @@ void axes_metadata::add_int64_axis(std::string name,
  m_axes.push_back(std::move(axis));
 }

-void axes_metadata::add_string_axis(std::string name,
-                                    std::vector<std::string> data)
+void axes_metadata::add_string_axis(std::string name, std::vector<std::string> data)
 {
  auto axis = std::make_unique<nvbench::string_axis>(std::move(name));
  axis->set_inputs(std::move(data));
@@ -188,10 +184,9 @@ nvbench::type_axis &axes_metadata::get_type_axis(std::size_t index)

 const axis_base &axes_metadata::get_axis(std::string_view name) const
 {
-  auto iter =
-    std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
-      return axis->get_name() == name;
-    });
+  auto iter = std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
+    return axis->get_name() == name;
+  });

  if (iter == m_axes.cend())
  {
@@ -203,10 +198,9 @@ const axis_base &axes_metadata::get_axis(std::string_view name) const

 axis_base &axes_metadata::get_axis(std::string_view name)
 {
-  auto iter =
-    std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
-      return axis->get_name() == name;
-    });
+  auto iter = std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
+    return axis->get_name() == name;
+  });

  if (iter == m_axes.end())
  {
@@ -216,8 +210,7 @@ axis_base &axes_metadata::get_axis(std::string_view name)
  return **iter;
 }

-const axis_base &axes_metadata::get_axis(std::string_view name,
-                                         nvbench::axis_type type) const
+const axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type) const
 {
  const auto &axis = this->get_axis(name);
  if (axis.get_type() != type)
@@ -231,8 +224,7 @@ const axis_base &axes_metadata::get_axis(std::string_view name,
  return axis;
 }

-axis_base &axes_metadata::get_axis(std::string_view name,
-                                   nvbench::axis_type type)
+axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type)
 {
  auto &axis = this->get_axis(name);
  if (axis.get_type() != type)
@@ -246,8 +238,7 @@ axis_base &axes_metadata::get_axis(std::string_view name,
  return axis;
 }

-std::vector<std::string>
-axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
+std::vector<std::string> axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
 {
  switch (num_type_axes)
  {
--- a/nvbench/axis_base.cuh
+++ b/nvbench/axis_base.cuh
@@ -47,10 +47,7 @@ struct axis_base

  [[nodiscard]] axis_type get_type() const { return m_type; }

-  [[nodiscard]] std::string_view get_type_as_string() const
-  {
-    return axis_type_to_string(m_type);
-  }
+  [[nodiscard]] std::string_view get_type_as_string() const { return axis_type_to_string(m_type); }

  [[nodiscard]] std::string_view get_flags_as_string() const
  {
--- a/nvbench/axis_base.cxx
+++ b/nvbench/axis_base.cxx
@@ -23,9 +23,6 @@ namespace nvbench

 axis_base::~axis_base() = default;

-std::unique_ptr<axis_base> axis_base::clone() const
-{
-  return this->do_clone();
-}
+std::unique_ptr<axis_base> axis_base::clone() const { return this->do_clone(); }

 } // namespace nvbench
--- a/nvbench/benchmark.cuh
+++ b/nvbench/benchmark.cuh
@@ -57,18 +57,14 @@ struct benchmark final : public benchmark_base
  using type_axes        = TypeAxes;
  using type_configs     = nvbench::tl::cartesian_product<type_axes>;

-  static constexpr std::size_t num_type_configs =
-    nvbench::tl::size<type_configs>{};
+  static constexpr std::size_t num_type_configs = nvbench::tl::size<type_configs>{};

  benchmark()
      : benchmark_base(type_axes{})
  {}

 private:
-  std::unique_ptr<benchmark_base> do_clone() const final
-  {
-    return std::make_unique<benchmark>();
-  }
+  std::unique_ptr<benchmark_base> do_clone() const final { return std::make_unique<benchmark>(); }

  void do_set_type_axes_names(std::vector<std::string> names) final
  {
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -80,32 +80,28 @@ struct benchmark_base
    return *this;
  }

-  benchmark_base &add_float64_axis(std::string name,
-                                   std::vector<nvbench::float64_t> data)
+  benchmark_base &add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
  {
    m_axes.add_float64_axis(std::move(name), std::move(data));
    return *this;
  }

-  benchmark_base &add_int64_axis(
-    std::string name,
-    std::vector<nvbench::int64_t> data,
-    nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
+  benchmark_base &add_int64_axis(std::string name,
+                                 std::vector<nvbench::int64_t> data,
+                                 nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
  {
    m_axes.add_int64_axis(std::move(name), std::move(data), flags);
    return *this;
  }

-  benchmark_base &add_int64_power_of_two_axis(std::string name,
-                                              std::vector<nvbench::int64_t> data)
+  benchmark_base &add_int64_power_of_two_axis(std::string name, std::vector<nvbench::int64_t> data)
  {
    return this->add_int64_axis(std::move(name),
                                std::move(data),
                                nvbench::int64_axis_flags::power_of_two);
  }

-  benchmark_base &add_string_axis(std::string name,
-                                  std::vector<std::string> data)
+  benchmark_base &add_string_axis(std::string name, std::vector<std::string> data)
  {
    m_axes.add_string_axis(std::move(name), std::move(data));
    return *this;
@@ -133,48 +129,30 @@ struct benchmark_base
    return *this;
  }

-  [[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const
-  {
-    return m_devices;
-  }
+  [[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const { return m_devices; }

  [[nodiscard]] nvbench::axes_metadata &get_axes() { return m_axes; }

-  [[nodiscard]] const nvbench::axes_metadata &get_axes() const
-  {
-    return m_axes;
-  }
+  [[nodiscard]] const nvbench::axes_metadata &get_axes() const { return m_axes; }

  // Computes the number of configs in the benchmark.
  // Unlike get_states().size(), this method may be used prior to calling run().
  [[nodiscard]] std::size_t get_config_count() const;

  // Is empty until run() is called.
-  [[nodiscard]] const std::vector<nvbench::state> &get_states() const
-  {
-    return m_states;
-  }
+  [[nodiscard]] const std::vector<nvbench::state> &get_states() const { return m_states; }
  [[nodiscard]] std::vector<nvbench::state> &get_states() { return m_states; }

  void run() { this->do_run(); }

-  void set_printer(nvbench::printer_base &printer)
-  {
-    m_printer = std::ref(printer);
-  }
+  void set_printer(nvbench::printer_base &printer) { m_printer = std::ref(printer); }

  void clear_printer() { m_printer = std::nullopt; }

-  [[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const
-  {
-    return m_printer;
-  }
+  [[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const { return m_printer; }

  /// Execute at least this many trials per measurement. @{
-  [[nodiscard]] nvbench::int64_t get_min_samples() const
-  {
-    return m_min_samples;
-  }
+  [[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
  benchmark_base &set_min_samples(nvbench::int64_t min_samples)
  {
    m_min_samples = min_samples;
@@ -193,7 +171,7 @@ struct benchmark_base
  }
  /// @}

-  /// If true, the benchmark does not use the blocking_kernel. This is intended 
+  /// If true, the benchmark does not use the blocking_kernel. This is intended
  /// for use with external profiling tools. @{
  [[nodiscard]] bool get_disable_blocking_kernel() const { return m_disable_blocking_kernel; }
  benchmark_base &set_disable_blocking_kernel(bool v)
--- a/nvbench/benchmark_base.cxx
+++ b/nvbench/benchmark_base.cxx
@@ -68,8 +68,7 @@ std::size_t benchmark_base::get_config_count() const
    std::size_t{1},
    std::multiplies<>{},
    [](const auto &axis_ptr) {
-      if (const auto *type_axis_ptr =
-            dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
+      if (const auto *type_axis_ptr = dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
          type_axis_ptr != nullptr)
      {
        return type_axis_ptr->get_active_count();
--- a/nvbench/benchmark_manager.cuh
+++ b/nvbench/benchmark_manager.cuh
@@ -31,8 +31,7 @@ namespace nvbench
 */
 struct benchmark_manager
 {
-  using benchmark_vector =
-    std::vector<std::unique_ptr<nvbench::benchmark_base>>;
+  using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;

  /**
   * @return The singleton benchmark_manager instance.
@@ -53,25 +52,21 @@ struct benchmark_manager
   * Get a non-mutable reference to benchmark with the specified name/index.
   * @{
   */
-  [[nodiscard]] const benchmark_base &
-  get_benchmark(const std::string &name) const;
+  [[nodiscard]] const benchmark_base &get_benchmark(const std::string &name) const;
  [[nodiscard]] const benchmark_base &get_benchmark(std::size_t idx) const
  {
    return *m_benchmarks.at(idx);
  }
  /**@}*/

-  [[nodiscard]] const benchmark_vector &get_benchmarks() const
-  {
-    return m_benchmarks;
-  };
+  [[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };

 private:
-  benchmark_manager()                          = default;
-  benchmark_manager(const benchmark_manager &) = delete;
-  benchmark_manager(benchmark_manager &&)      = delete;
+  benchmark_manager()                                     = default;
+  benchmark_manager(const benchmark_manager &)            = delete;
+  benchmark_manager(benchmark_manager &&)                 = delete;
  benchmark_manager &operator=(const benchmark_manager &) = delete;
-  benchmark_manager &operator=(benchmark_manager &&) = delete;
+  benchmark_manager &operator=(benchmark_manager &&)      = delete;

  benchmark_vector m_benchmarks;
 };
--- a/nvbench/benchmark_manager.cxx
+++ b/nvbench/benchmark_manager.cxx
@@ -43,21 +43,18 @@ benchmark_base &benchmark_manager::add(std::unique_ptr<benchmark_base> bench)
 benchmark_manager::benchmark_vector benchmark_manager::clone_benchmarks() const
 {
  benchmark_vector result(m_benchmarks.size());
-  std::transform(m_benchmarks.cbegin(),
-                 m_benchmarks.cend(),
-                 result.begin(),
-                 [](const auto &bench) { return bench->clone(); });
+  std::transform(m_benchmarks.cbegin(), m_benchmarks.cend(), result.begin(), [](const auto &bench) {
+    return bench->clone();
+  });
  return result;
 }

-const benchmark_base &
-benchmark_manager::get_benchmark(const std::string &name) const
+const benchmark_base &benchmark_manager::get_benchmark(const std::string &name) const
 {
-  auto iter = std::find_if(m_benchmarks.cbegin(),
-                           m_benchmarks.cend(),
-                           [&name](const auto &bench_ptr) {
-                             return bench_ptr->get_name() == name;
-                           });
+  auto iter =
+    std::find_if(m_benchmarks.cbegin(), m_benchmarks.cend(), [&name](const auto &bench_ptr) {
+      return bench_ptr->get_name() == name;
+    });
  if (iter == m_benchmarks.cend())
  {
    NVBENCH_THROW(std::out_of_range, "No benchmark named '{}'.", name);
--- a/nvbench/blocking_kernel.cu
+++ b/nvbench/blocking_kernel.cu
@@ -42,8 +42,8 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
                             nvbench::float64_t timeout)
 {
  const auto start_point = cuda::std::chrono::high_resolution_clock::now();
-  const auto timeout_ns  = cuda::std::chrono::nanoseconds(
-    static_cast<nvbench::int64_t>(timeout * 1e9));
+  const auto timeout_ns =
+    cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(timeout * 1e9));
  const auto timeout_point = start_point + timeout_ns;

  const bool use_timeout = timeout >= 0.;
@@ -57,41 +57,40 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
  {
    *timeout_flag = 1;
    __threadfence_system(); // Ensure timeout flag visibility on host.
-    printf(
-      "\n"
-      "######################################################################\n"
-      "##################### Possible Deadlock Detected #####################\n"
-      "######################################################################\n"
-      "\n"
-      "Forcing unblock: The current measurement appears to have deadlocked\n"
-      "and the results cannot be trusted.\n"
-      "\n"
-      "This happens when the KernelLauncher synchronizes the CUDA device.\n"
-      "If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
-      "\n"
-      "    state.exec(<KernelLauncher>); // Deadlock\n"
-      "    state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
-      "\n"
-      "This tells NVBench about the sync so it can run the benchmark safely.\n"
-      "\n"
-      "If the KernelLauncher does not synchronize but has a very long \n"
-      "execution time, this may be a false positive. If so, disable this\n"
-      "check with:\n"
-      "\n"
-      "    state.set_blocking_kernel_timeout(-1);\n"
-      "\n"
-      "The current timeout is set to %0.5g seconds.\n"
-      "\n"
-      "For more information, see the 'Benchmarks that sync' section of the\n"
-      "NVBench documentation.\n"
-      "\n"
-      "If this happens while profiling with an external tool,\n"
-      "pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
-      "(to also only run the benchmark once) to the executable.\n"
-      "\n"
-      "For more information, see the 'Benchmark Properties' section of the\n"
-      "NVBench documentation.\n\n",
-      timeout);
+    printf("\n"
+           "######################################################################\n"
+           "##################### Possible Deadlock Detected #####################\n"
+           "######################################################################\n"
+           "\n"
+           "Forcing unblock: The current measurement appears to have deadlocked\n"
+           "and the results cannot be trusted.\n"
+           "\n"
+           "This happens when the KernelLauncher synchronizes the CUDA device.\n"
+           "If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
+           "\n"
+           "    state.exec(<KernelLauncher>); // Deadlock\n"
+           "    state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
+           "\n"
+           "This tells NVBench about the sync so it can run the benchmark safely.\n"
+           "\n"
+           "If the KernelLauncher does not synchronize but has a very long \n"
+           "execution time, this may be a false positive. If so, disable this\n"
+           "check with:\n"
+           "\n"
+           "    state.set_blocking_kernel_timeout(-1);\n"
+           "\n"
+           "The current timeout is set to %0.5g seconds.\n"
+           "\n"
+           "For more information, see the 'Benchmarks that sync' section of the\n"
+           "NVBench documentation.\n"
+           "\n"
+           "If this happens while profiling with an external tool,\n"
+           "pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
+           "(to also only run the benchmark once) to the executable.\n"
+           "\n"
+           "For more information, see the 'Benchmark Properties' section of the\n"
+           "NVBench documentation.\n\n",
+           timeout);
  }
 }

@@ -102,15 +101,11 @@ namespace nvbench

 blocking_kernel::blocking_kernel()
 {
-  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag,
-                                     sizeof(m_host_flag),
-                                     cudaHostRegisterMapped));
+  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag, sizeof(m_host_flag), cudaHostRegisterMapped));
  NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_flag, &m_host_flag, 0));
-  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_timeout_flag,
-                                     sizeof(m_host_timeout_flag),
-                                     cudaHostRegisterMapped));
  NVBENCH_CUDA_CALL(
-    cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
+    cudaHostRegister(&m_host_timeout_flag, sizeof(m_host_timeout_flag), cudaHostRegisterMapped));
+  NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
 }

 blocking_kernel::~blocking_kernel()
@@ -119,14 +114,11 @@ blocking_kernel::~blocking_kernel()
  NVBENCH_CUDA_CALL_NOEXCEPT(cudaHostUnregister(&m_host_timeout_flag));
 }

-void blocking_kernel::block(const nvbench::cuda_stream &stream,
-                            nvbench::float64_t timeout)
+void blocking_kernel::block(const nvbench::cuda_stream &stream, nvbench::float64_t timeout)
 {
  m_host_flag         = 0;
  m_host_timeout_flag = 0;
-  block_stream<<<1, 1, 0, stream>>>(m_device_flag,
-                                    m_device_timeout_flag,
-                                    timeout);
+  block_stream<<<1, 1, 0, stream>>>(m_device_flag, m_device_timeout_flag, timeout);
 }

 void blocking_kernel::timeout_detected()
--- a/nvbench/blocking_kernel.cuh
+++ b/nvbench/blocking_kernel.cuh
@@ -97,10 +97,10 @@ struct blocking_kernel
  }

  // move-only
-  blocking_kernel(const blocking_kernel &) = delete;
-  blocking_kernel(blocking_kernel &&)      = default;
+  blocking_kernel(const blocking_kernel &)            = delete;
+  blocking_kernel(blocking_kernel &&)                 = default;
  blocking_kernel &operator=(const blocking_kernel &) = delete;
-  blocking_kernel &operator=(blocking_kernel &&) = default;
+  blocking_kernel &operator=(blocking_kernel &&)      = default;

 private:
  nvbench::int32_t m_host_flag{};
--- a/nvbench/callable.cuh
+++ b/nvbench/callable.cuh
@@ -30,35 +30,29 @@ struct state;
 // Define a simple callable wrapper around a function. This allows the function
 // to be used as a class template parameter. Intended for use with kernel
 // generators and `NVBENCH_BENCH` macros.
-#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function)                               \
+#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function)                                                   \
  NVBENCH_DEFINE_CALLABLE(function, NVBENCH_UNIQUE_IDENTIFIER(function))

-#define NVBENCH_DEFINE_CALLABLE(function, callable_name)                       \
-  struct callable_name                                                         \
-  {                                                                            \
-    void operator()(nvbench::state &state, nvbench::type_list<>)               \
-    {                                                                          \
-      function(state);                                                         \
-    }                                                                          \
+#define NVBENCH_DEFINE_CALLABLE(function, callable_name)                                           \
+  struct callable_name                                                                             \
+  {                                                                                                \
+    void operator()(nvbench::state &state, nvbench::type_list<>) { function(state); }              \
  }

-#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function)                      \
-  NVBENCH_DEFINE_CALLABLE_TEMPLATE(function,                                   \
-                                   NVBENCH_UNIQUE_IDENTIFIER(function))
+#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function)                                          \
+  NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, NVBENCH_UNIQUE_IDENTIFIER(function))

-#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name)              \
-  struct callable_name                                                         \
-  {                                                                            \
-    template <typename... Ts>                                                  \
-    void operator()(nvbench::state &state, nvbench::type_list<Ts...>)          \
-    {                                                                          \
-      function(state, nvbench::type_list<Ts...>{});                            \
-    }                                                                          \
+#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name)                                  \
+  struct callable_name                                                                             \
+  {                                                                                                \
+    template <typename... Ts>                                                                      \
+    void operator()(nvbench::state &state, nvbench::type_list<Ts...>)                              \
+    {                                                                                              \
+      function(state, nvbench::type_list<Ts...>{});                                                \
+    }                                                                                              \
  }

-#define NVBENCH_UNIQUE_IDENTIFIER(prefix)                                      \
-  NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
-#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id)                     \
+#define NVBENCH_UNIQUE_IDENTIFIER(prefix) NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
+#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id)                                         \
  NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id)
-#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id)                     \
-  prefix##_line_##unique_id
+#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id) prefix##_line_##unique_id
--- a/nvbench/cpu_timer.cuh
+++ b/nvbench/cpu_timer.cuh
@@ -30,27 +30,20 @@ struct cpu_timer
  __forceinline__ cpu_timer() = default;

  // move-only
-  cpu_timer(const cpu_timer &) = delete;
-  cpu_timer(cpu_timer &&)      = default;
+  cpu_timer(const cpu_timer &)            = delete;
+  cpu_timer(cpu_timer &&)                 = default;
  cpu_timer &operator=(const cpu_timer &) = delete;
-  cpu_timer &operator=(cpu_timer &&) = default;
+  cpu_timer &operator=(cpu_timer &&)      = default;

-  __forceinline__ void start()
-  {
-    m_start = std::chrono::high_resolution_clock::now();
-  }
+  __forceinline__ void start() { m_start = std::chrono::high_resolution_clock::now(); }

-  __forceinline__ void stop()
-  {
-    m_stop = std::chrono::high_resolution_clock::now();
-  }
+  __forceinline__ void stop() { m_stop = std::chrono::high_resolution_clock::now(); }

  // In seconds:
  [[nodiscard]] __forceinline__ nvbench::float64_t get_duration()
  {
    const auto duration = m_stop - m_start;
-    const auto ns =
-      std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
+    const auto ns       = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
    return static_cast<nvbench::float64_t>(ns) * (1e-9);
  }

--- a/nvbench/create.cuh
+++ b/nvbench/create.cuh
@@ -27,19 +27,17 @@

 #define NVBENCH_TYPE_AXES(...) nvbench::type_list<__VA_ARGS__>

-#define NVBENCH_BENCH(KernelGenerator)                                         \
-  NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator);                             \
-  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =  \
-    nvbench::benchmark_manager::get()                                          \
-      .add(std::make_unique<                                                   \
-           nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>())  \
+#define NVBENCH_BENCH(KernelGenerator)                                                             \
+  NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator);                                                 \
+  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =                      \
+    nvbench::benchmark_manager::get()                                                              \
+      .add(std::make_unique<nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>())     \
      .set_name(#KernelGenerator)

-#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes)                         \
-  NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator);                    \
-  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =  \
-    nvbench::benchmark_manager::get()                                          \
-      .add(std::make_unique<                                                   \
-           nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator),      \
-                              TypeAxes>>())                                    \
+#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes)                                             \
+  NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator);                                        \
+  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =                      \
+    nvbench::benchmark_manager::get()                                                              \
+      .add(std::make_unique<                                                                       \
+           nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator), TypeAxes>>())            \
      .set_name(#KernelGenerator)
--- a/nvbench/csv_printer.cu
+++ b/nvbench/csv_printer.cu
@@ -66,10 +66,8 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
    {
      std::optional<nvbench::device_info> device = cur_state.get_device();

-      std::string device_id   = device ? fmt::to_string(device->get_id())
-                                       : std::string{};
-      std::string device_name = device ? std::string{device->get_name()}
-                                       : std::string{};
+      std::string device_id   = device ? fmt::to_string(device->get_id()) : std::string{};
+      std::string device_name = device ? std::string{device->get_name()} : std::string{};

      table.add_cell(row, "_bench_name", "Benchmark", bench_name);
      table.add_cell(row, "_device_id", "Device", std::move(device_id));
@@ -88,15 +86,11 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
                         name + "_axis_pow2_pretty",
                         name + " (pow2)",
                         fmt::format("2^{}", exponent));
-          table.add_cell(row,
-                         name + "_axis_plain",
-                         fmt::format("{}", name),
-                         fmt::to_string(value));
+          table.add_cell(row, name + "_axis_plain", fmt::format("{}", name), fmt::to_string(value));
        }
        else
        {
-          std::string value = std::visit(format_visitor,
-                                         axis_values.get_value(name));
+          std::string value = std::visit(format_visitor, axis_values.get_value(name));
          table.add_cell(row, name + "_axis", name, std::move(value));
        }
      }
@@ -117,14 +111,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
          continue;
        }
        const std::string &tag    = summ.get_tag();
-        const std::string &header = summ.has_value("name")
-                                      ? summ.get_string("name")
-                                      : tag;
+        const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;

-        const std::string hint = summ.has_value("hint")
-                                   ? summ.get_string("hint")
-                                   : std::string{};
-        std::string value = std::visit(format_visitor, summ.get_value("value"));
+        const std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
+        std::string value      = std::visit(format_visitor, summ.get_value("value"));
        if (hint == "duration")
        {
          table.add_cell(row, tag, header + " (sec)", std::move(value));
@@ -182,10 +172,7 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
      std::size_t remaining = table.m_columns.size();
      for (const auto &col : table.m_columns)
      {
-        fmt::format_to(buffer,
-                       "{}{}",
-                       col.rows[i],
-                       (--remaining == 0) ? "" : ",");
+        fmt::format_to(buffer, "{}{}", col.rows[i], (--remaining == 0) ? "" : ",");
      }
      fmt::format_to(buffer, "\n");
    }
--- a/nvbench/cuda_call.cu
+++ b/nvbench/cuda_call.cu
@@ -16,8 +16,8 @@
 *  limitations under the License.
 */

-#include <nvbench/cuda_call.cuh>
 #include <nvbench/config.cuh>
+#include <nvbench/cuda_call.cuh>

 #include <fmt/format.h>

@@ -66,11 +66,7 @@ void throw_error(const std::string &filename,
                                       command));
 }
 #else
-void throw_error(const std::string &,
-                 std::size_t,
-                 const std::string &,
-                 CUresult)
-{}
+void throw_error(const std::string &, std::size_t, const std::string &, CUresult) {}
 #endif

 void exit_error(const std::string &filename,
--- a/nvbench/cuda_call.cuh
+++ b/nvbench/cuda_call.cuh
@@ -18,52 +18,43 @@

 #pragma once

-#include <cuda_runtime_api.h>
 #include <cuda.h>
+#include <cuda_runtime_api.h>

 #include <string>

 /// Throws a std::runtime_error if `call` doesn't return `cudaSuccess`.
-#define NVBENCH_CUDA_CALL(call)                                                \
-  do                                                                           \
-  {                                                                            \
-    const cudaError_t nvbench_cuda_call_error = call;                          \
-    if (nvbench_cuda_call_error != cudaSuccess)                                \
-    {                                                                          \
-      nvbench::cuda_call::throw_error(__FILE__,                                \
-                                      __LINE__,                                \
-                                      #call,                                   \
-                                      nvbench_cuda_call_error);                \
-    }                                                                          \
+#define NVBENCH_CUDA_CALL(call)                                                                    \
+  do                                                                                               \
+  {                                                                                                \
+    const cudaError_t nvbench_cuda_call_error = call;                                              \
+    if (nvbench_cuda_call_error != cudaSuccess)                                                    \
+    {                                                                                              \
+      nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);         \
+    }                                                                                              \
  } while (false)

 /// Throws a std::runtime_error if `call` doesn't return `CUDA_SUCCESS`.
-#define NVBENCH_DRIVER_API_CALL(call)                                          \
-  do                                                                           \
-  {                                                                            \
-    const CUresult nvbench_cuda_call_error = call;                             \
-    if (nvbench_cuda_call_error != CUDA_SUCCESS)                               \
-    {                                                                          \
-      nvbench::cuda_call::throw_error(__FILE__,                                \
-                                      __LINE__,                                \
-                                      #call,                                   \
-                                      nvbench_cuda_call_error);                \
-    }                                                                          \
+#define NVBENCH_DRIVER_API_CALL(call)                                                              \
+  do                                                                                               \
+  {                                                                                                \
+    const CUresult nvbench_cuda_call_error = call;                                                 \
+    if (nvbench_cuda_call_error != CUDA_SUCCESS)                                                   \
+    {                                                                                              \
+      nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);         \
+    }                                                                                              \
  } while (false)

 /// Terminates process with failure status if `call` doesn't return
 /// `cudaSuccess`.
-#define NVBENCH_CUDA_CALL_NOEXCEPT(call)                                       \
-  do                                                                           \
-  {                                                                            \
-    const cudaError_t nvbench_cuda_call_error = call;                          \
-    if (nvbench_cuda_call_error != cudaSuccess)                                \
-    {                                                                          \
-      nvbench::cuda_call::exit_error(__FILE__,                                 \
-                                     __LINE__,                                 \
-                                     #call,                                    \
-                                     nvbench_cuda_call_error);                 \
-    }                                                                          \
+#define NVBENCH_CUDA_CALL_NOEXCEPT(call)                                                           \
+  do                                                                                               \
+  {                                                                                                \
+    const cudaError_t nvbench_cuda_call_error = call;                                              \
+    if (nvbench_cuda_call_error != cudaSuccess)                                                    \
+    {                                                                                              \
+      nvbench::cuda_call::exit_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);          \
+    }                                                                                              \
  } while (false)

 namespace nvbench::cuda_call
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -66,10 +66,10 @@ struct cuda_stream
  ~cuda_stream() = default;

  // move-only
-  cuda_stream(const cuda_stream &) = delete;
+  cuda_stream(const cuda_stream &)            = delete;
  cuda_stream &operator=(const cuda_stream &) = delete;
  cuda_stream(cuda_stream &&)                 = default;
-  cuda_stream &operator=(cuda_stream &&) = default;
+  cuda_stream &operator=(cuda_stream &&)      = default;

  /**
   * @return The `cudaStream_t` managed by this `cuda_stream`.
--- a/nvbench/cuda_timer.cuh
+++ b/nvbench/cuda_timer.cuh
@@ -42,10 +42,10 @@ struct cuda_timer
  }

  // move-only
-  cuda_timer(const cuda_timer &) = delete;
-  cuda_timer(cuda_timer &&)      = default;
+  cuda_timer(const cuda_timer &)            = delete;
+  cuda_timer(cuda_timer &&)                 = default;
  cuda_timer &operator=(const cuda_timer &) = delete;
-  cuda_timer &operator=(cuda_timer &&) = default;
+  cuda_timer &operator=(cuda_timer &&)      = default;

  __forceinline__ void start(cudaStream_t stream)
  {
--- a/nvbench/cupti_profiler.cuh
+++ b/nvbench/cupti_profiler.cuh
@@ -21,14 +21,13 @@
 #include <nvbench/config.cuh>
 #include <nvbench/device_info.cuh>

+#include <optional>
 #include <string>
 #include <vector>
-#include <optional>

 namespace nvbench::detail
 {

-
 #ifdef NVBENCH_HAS_CUPTI
 /**
 * Pass required metrics in the constructor and organize your code as follows
@@ -62,7 +61,7 @@ namespace nvbench::detail
 */
 class cupti_profiler
 {
-  bool m_available {};
+  bool m_available{};
  std::string m_chip_name;

  // Counter data
@@ -87,11 +86,10 @@ public:
  cupti_profiler(cupti_profiler &&) noexcept;
  cupti_profiler &operator=(cupti_profiler &&) noexcept;

-  cupti_profiler(const cupti_profiler &) = delete;
+  cupti_profiler(const cupti_profiler &)            = delete;
  cupti_profiler &operator=(const cupti_profiler &) = delete;

-  cupti_profiler(nvbench::device_info device,
-                 std::vector<std::string> &&metric_names);
+  cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names);
  ~cupti_profiler();

  [[nodiscard]] bool is_initialized() const;
@@ -125,5 +123,4 @@ private:
 };
 #endif

-
 } // namespace nvbench::detail
--- a/nvbench/cupti_profiler.cxx
+++ b/nvbench/cupti_profiler.cxx
@@ -59,8 +59,7 @@ void nvpw_call(const NVPA_Status status)

 } // namespace

-cupti_profiler::cupti_profiler(nvbench::device_info device,
-                               std::vector<std::string> &&metric_names)
+cupti_profiler::cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names)
    : m_metric_names(metric_names)
    , m_device(device)
 {
@@ -154,12 +153,10 @@ class eval_request
  NVPW_MetricsEvaluator *evaluator_ptr;

 public:
-  eval_request(NVPW_MetricsEvaluator *evaluator_ptr,
-               const std::string &metric_name)
+  eval_request(NVPW_MetricsEvaluator *evaluator_ptr, const std::string &metric_name)
      : evaluator_ptr(evaluator_ptr)
  {
-    NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params =
-      {};
+    NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params = {};

    params.structSize =
      NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE;
@@ -168,8 +165,7 @@ public:
    params.pMetricEvalRequest          = &request;
    params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;

-    nvpw_call(
-      NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
+    nvpw_call(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
  }

  [[nodiscard]] std::vector<const char *> get_raw_dependencies()
@@ -178,10 +174,9 @@ public:

    NVPW_MetricsEvaluator_GetMetricRawDependencies_Params params{};

-    params.structSize =
-      NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
-    params.pMetricsEvaluator           = evaluator_ptr;
-    params.pMetricEvalRequests         = &request;
+    params.structSize          = NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
+    params.pMetricsEvaluator   = evaluator_ptr;
+    params.pMetricEvalRequests = &request;
    params.numMetricEvalRequests       = 1;
    params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
    params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
@@ -211,26 +206,23 @@ public:
                   const std::uint8_t *counter_data_image         = nullptr,
                   const std::size_t counter_data_image_size      = 0)
  {
-    NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
-      scratch_buffer_param{};
+    NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params scratch_buffer_param{};

    scratch_buffer_param.structSize =
      NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE;
    scratch_buffer_param.pChipName                 = chip_name.c_str();
    scratch_buffer_param.pCounterAvailabilityImage = counter_availability_image;

-    nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(
-      &scratch_buffer_param));
+    nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(&scratch_buffer_param));

    scratch_buffer.resize(scratch_buffer_param.scratchBufferSize);

    NVPW_CUDA_MetricsEvaluator_Initialize_Params evaluator_params{};

-    evaluator_params.structSize =
-      NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
-    evaluator_params.scratchBufferSize         = scratch_buffer.size();
-    evaluator_params.pScratchBuffer            = scratch_buffer.data();
-    evaluator_params.pChipName                 = chip_name.c_str();
+    evaluator_params.structSize        = NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
+    evaluator_params.scratchBufferSize = scratch_buffer.size();
+    evaluator_params.pScratchBuffer    = scratch_buffer.data();
+    evaluator_params.pChipName         = chip_name.c_str();
    evaluator_params.pCounterAvailabilityImage = counter_availability_image;
    evaluator_params.pCounterDataImage         = counter_data_image;
    evaluator_params.counterDataImageSize      = counter_data_image_size;
@@ -247,7 +239,7 @@ public:
    {
      NVPW_MetricsEvaluator_Destroy_Params params{};

-      params.structSize = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
+      params.structSize        = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
      params.pMetricsEvaluator = evaluator_ptr;

      nvpw_call(NVPW_MetricsEvaluator_Destroy(&params));
@@ -259,10 +251,7 @@ public:
    return {evaluator_ptr, metric_name};
  }

-  [[nodiscard]] operator NVPW_MetricsEvaluator *() const
-  {
-    return evaluator_ptr;
-  }
+  [[nodiscard]] operator NVPW_MetricsEvaluator *() const { return evaluator_ptr; }
 };

 } // namespace
@@ -270,10 +259,10 @@ public:
 namespace
 {

-[[nodiscard]] std::vector<NVPA_RawMetricRequest> get_raw_metric_requests(
-  const std::string &chip_name,
-  const std::vector<std::string> &metric_names,
-  const std::uint8_t *counter_availability_image = nullptr)
+[[nodiscard]] std::vector<NVPA_RawMetricRequest>
+get_raw_metric_requests(const std::string &chip_name,
+                        const std::vector<std::string> &metric_names,
+                        const std::uint8_t *counter_availability_image = nullptr)
 {
  metric_evaluator evaluator(chip_name, counter_availability_image);

@@ -282,8 +271,7 @@ namespace

  for (auto &metric_name : metric_names)
  {
-    for (auto &raw_dependency :
-         evaluator.create_request(metric_name).get_raw_dependencies())
+    for (auto &raw_dependency : evaluator.create_request(metric_name).get_raw_dependencies())
    {
      raw_metric_names.push_back(raw_dependency);
    }
@@ -295,10 +283,10 @@ namespace
  for (auto &raw_name : raw_metric_names)
  {
    NVPA_RawMetricRequest metricRequest{};
-    metricRequest.structSize            = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
-    metricRequest.pMetricName           = raw_name;
-    metricRequest.isolated              = true;
-    metricRequest.keepInstances         = true;
+    metricRequest.structSize    = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
+    metricRequest.pMetricName   = raw_name;
+    metricRequest.isolated      = true;
+    metricRequest.keepInstances = true;
    raw_requests.push_back(metricRequest);
  }

@@ -309,12 +297,11 @@ class metrics_config
 {
  bool initialized{};

-  void create(const std::string &chip_name,
-              const std::uint8_t *availability_image)
+  void create(const std::string &chip_name, const std::uint8_t *availability_image)
  {
    NVPW_CUDA_RawMetricsConfig_Create_V2_Params params{};

-    params.structSize = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
+    params.structSize                = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
    params.activityKind              = NVPA_ACTIVITY_KIND_PROFILER;
    params.pChipName                 = chip_name.c_str();
    params.pCounterAvailabilityImage = availability_image;
@@ -329,9 +316,8 @@ class metrics_config
  {
    NVPW_RawMetricsConfig_SetCounterAvailability_Params params{};

-    params.structSize =
-      NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
-    params.pRawMetricsConfig         = raw_metrics_config;
+    params.structSize        = NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
+    params.pRawMetricsConfig = raw_metrics_config;
    params.pCounterAvailabilityImage = availability_image;

    nvpw_call(NVPW_RawMetricsConfig_SetCounterAvailability(&params));
@@ -341,7 +327,7 @@ class metrics_config
  {
    NVPW_RawMetricsConfig_BeginPassGroup_Params params{};

-    params.structSize = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
    params.pRawMetricsConfig = raw_metrics_config;

    nvpw_call(NVPW_RawMetricsConfig_BeginPassGroup(&params));
@@ -351,7 +337,7 @@ class metrics_config
  {
    NVPW_RawMetricsConfig_AddMetrics_Params params{};

-    params.structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
+    params.structSize         = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
    params.pRawMetricsConfig  = raw_metrics_config;
    params.pRawMetricRequests = raw_metric_requests.data();
    params.numMetricRequests  = raw_metric_requests.size();
@@ -363,7 +349,7 @@ class metrics_config
  {
    NVPW_RawMetricsConfig_EndPassGroup_Params params{};

-    params.structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
    params.pRawMetricsConfig = raw_metrics_config;

    nvpw_call(NVPW_RawMetricsConfig_EndPassGroup(&params));
@@ -373,8 +359,7 @@ class metrics_config
  {
    NVPW_RawMetricsConfig_GenerateConfigImage_Params params{};

-    params.structSize =
-      NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
    params.pRawMetricsConfig = raw_metrics_config;

    nvpw_call(NVPW_RawMetricsConfig_GenerateConfigImage(&params));
@@ -398,7 +383,7 @@ public:
  {
    NVPW_RawMetricsConfig_GetConfigImage_Params params{};

-    params.structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
    params.pRawMetricsConfig = raw_metrics_config;
    params.bytesAllocated    = 0;
    params.pBuffer           = nullptr;
@@ -419,7 +404,7 @@ public:
    {
      NVPW_RawMetricsConfig_Destroy_Params params{};

-      params.structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
+      params.structSize        = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
      params.pRawMetricsConfig = raw_metrics_config;

      NVPW_RawMetricsConfig_Destroy(&params);
@@ -433,13 +418,12 @@ public:

 void cupti_profiler::initialize_config_image()
 {
-  m_config_image =
-    metrics_config(m_chip_name,
-                   get_raw_metric_requests(m_chip_name,
-                                           m_metric_names,
-                                           m_availability_image.data()),
-                   m_availability_image.data())
-      .get_config_image();
+  m_config_image = metrics_config(m_chip_name,
+                                  get_raw_metric_requests(m_chip_name,
+                                                          m_metric_names,
+                                                          m_availability_image.data()),
+                                  m_availability_image.data())
+                     .get_config_image();
 }

 namespace
@@ -450,12 +434,11 @@ class counter_data_builder
  bool initialized{};

 public:
-  counter_data_builder(const std::string &chip_name,
-                       const std::uint8_t *pCounterAvailabilityImage)
+  counter_data_builder(const std::string &chip_name, const std::uint8_t *pCounterAvailabilityImage)
  {
    NVPW_CUDA_CounterDataBuilder_Create_Params params{};

-    params.structSize = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
+    params.structSize                = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
    params.pChipName                 = chip_name.c_str();
    params.pCounterAvailabilityImage = pCounterAvailabilityImage;

@@ -471,7 +454,7 @@ public:
    {
      NVPW_CounterDataBuilder_Destroy_Params params{};

-      params.structSize = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
+      params.structSize          = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
      params.pCounterDataBuilder = builder;

      NVPW_CounterDataBuilder_Destroy(&params);
@@ -488,16 +471,14 @@ void cupti_profiler::initialize_counter_data_prefix_image()
  const std::uint8_t *counter_availability_image = nullptr;

  std::vector<NVPA_RawMetricRequest> raw_metric_requests =
-    get_raw_metric_requests(m_chip_name,
-                            m_metric_names,
-                            counter_availability_image);
+    get_raw_metric_requests(m_chip_name, m_metric_names, counter_availability_image);

  counter_data_builder data_builder(m_chip_name, counter_availability_image);

  {
    NVPW_CounterDataBuilder_AddMetrics_Params params{};

-    params.structSize = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
+    params.structSize          = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
    params.pCounterDataBuilder = data_builder.builder;
    params.pRawMetricRequests  = raw_metric_requests.data();
    params.numMetricRequests   = raw_metric_requests.size();
@@ -508,8 +489,7 @@ void cupti_profiler::initialize_counter_data_prefix_image()
  {
    NVPW_CounterDataBuilder_GetCounterDataPrefix_Params params{};

-    params.structSize =
-      NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
+    params.structSize          = NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
    params.pCounterDataBuilder = data_builder.builder;
    params.bytesAllocated      = 0;
    params.pBuffer             = nullptr;
@@ -532,11 +512,9 @@ get_counter_data_image_size(CUpti_Profiler_CounterDataImageOptions *options)
 {
  CUpti_Profiler_CounterDataImage_CalculateSize_Params params{};

-  params.structSize =
-    CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
-  params.pOptions = options;
-  params.sizeofCounterDataImageOptions =
-    CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
+  params.structSize = CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
+  params.pOptions   = options;
+  params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;

  cupti_call(cuptiProfilerCounterDataImageCalculateSize(&params));
  return params.counterDataImageSize;
@@ -559,12 +537,10 @@ void cupti_profiler::initialize_counter_data_image()
  {
    CUpti_Profiler_CounterDataImage_Initialize_Params params{};

-    params.structSize =
-      CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
-    params.sizeofCounterDataImageOptions =
-      CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
-    params.pOptions             = &counter_data_image_options;
-    params.counterDataImageSize = m_data_image.size();
+    params.structSize = CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
+    params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
+    params.pOptions                      = &counter_data_image_options;
+    params.counterDataImageSize          = m_data_image.size();

    params.pCounterDataImage = &m_data_image[0];
    cupti_call(cuptiProfilerCounterDataImageInitialize(&params));
@@ -578,8 +554,7 @@ void cupti_profiler::initialize_counter_data_image()
    params.counterDataImageSize = m_data_image.size();
    params.pCounterDataImage    = &m_data_image[0];

-    cupti_call(
-      cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));
+    cupti_call(cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));

    m_data_scratch_buffer.resize(params.counterDataScratchBufferSize);
  }
@@ -587,8 +562,7 @@ void cupti_profiler::initialize_counter_data_image()
  {
    CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params params{};

-    params.structSize =
-      CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
+    params.structSize = CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
    params.counterDataImageSize         = m_data_image.size();
    params.pCounterDataImage            = &m_data_image[0];
    params.counterDataScratchBufferSize = m_data_scratch_buffer.size();
@@ -608,17 +582,14 @@ cupti_profiler::~cupti_profiler()
  }
 }

-bool cupti_profiler::is_initialized() const
-{
-  return m_available;
-}
+bool cupti_profiler::is_initialized() const { return m_available; }

 void cupti_profiler::prepare_user_loop()
 {
  {
    CUpti_Profiler_BeginSession_Params params{};

-    params.structSize = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
+    params.structSize                   = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
    params.ctx                          = nullptr;
    params.counterDataImageSize         = m_data_image.size();
    params.pCounterDataImage            = &m_data_image[0];
@@ -735,9 +706,7 @@ std::vector<double> cupti_profiler::get_counter_values()

    if (params.numRanges != 1)
    {
-      NVBENCH_THROW(std::runtime_error,
-                    "{}",
-                    "Something's gone wrong, one range is expected");
+      NVBENCH_THROW(std::runtime_error, "{}", "Something's gone wrong, one range is expected");
    }
  }

@@ -752,8 +721,7 @@ std::vector<double> cupti_profiler::get_counter_values()
    {
      NVPW_MetricsEvaluator_SetDeviceAttributes_Params params{};

-      params.structSize =
-        NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
+      params.structSize           = NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
      params.pMetricsEvaluator    = evaluator;
      params.pCounterDataImage    = m_data_image.data();
      params.counterDataImageSize = m_data_image.size();
@@ -764,11 +732,10 @@ std::vector<double> cupti_profiler::get_counter_values()
    {
      NVPW_MetricsEvaluator_EvaluateToGpuValues_Params params{};

-      params.structSize =
-        NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
-      params.pMetricsEvaluator           = evaluator;
-      params.pMetricEvalRequests         = &request.request;
-      params.numMetricEvalRequests       = 1;
+      params.structSize            = NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
+      params.pMetricsEvaluator     = evaluator;
+      params.pMetricEvalRequests   = &request.request;
+      params.numMetricEvalRequests = 1;
      params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
      params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
      params.pCounterDataImage           = m_data_image.data();
--- a/nvbench/detail/device_scope.cuh
+++ b/nvbench/detail/device_scope.cuh
@@ -39,9 +39,9 @@ struct [[maybe_unused]] device_scope
  ~device_scope() { NVBENCH_CUDA_CALL(cudaSetDevice(m_old_device_id)); }

  // move-only
-  device_scope(device_scope &&) = default;
-  device_scope &operator=(device_scope &&) = default;
-  device_scope(const device_scope &)       = delete;
+  device_scope(device_scope &&)                 = default;
+  device_scope &operator=(device_scope &&)      = default;
+  device_scope(const device_scope &)            = delete;
  device_scope &operator=(const device_scope &) = delete;

 private:
--- a/nvbench/detail/kernel_launcher_timer_wrapper.cuh
+++ b/nvbench/detail/kernel_launcher_timer_wrapper.cuh
@@ -33,7 +33,7 @@ namespace detail
 template <typename KernelLauncher>
 struct kernel_launch_timer_wrapper
 {
- explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
+  explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
      : m_kernel_launcher{launcher}
  {}

--- a/nvbench/detail/l2flush.cuh
+++ b/nvbench/detail/l2flush.cuh
@@ -31,13 +31,12 @@ struct l2flush
  {
    int dev_id{};
    NVBENCH_CUDA_CALL(cudaGetDevice(&dev_id));
-    NVBENCH_CUDA_CALL(
-      cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
+    NVBENCH_CUDA_CALL(cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
    if (m_l2_size > 0)
    {
-      void* buffer = m_l2_buffer;
+      void *buffer = m_l2_buffer;
      NVBENCH_CUDA_CALL(cudaMalloc(&buffer, m_l2_size));
-      m_l2_buffer = reinterpret_cast<int*>(buffer);
+      m_l2_buffer = reinterpret_cast<int *>(buffer);
    }
  }

--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -54,15 +54,11 @@ void measure_cold_base::check()
  const auto device = m_state.get_device();
  if (!device)
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Device required for `cold` measurement.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cold` measurement.");
  }
  if (!device->is_active())
  { // This means something went wrong higher up. Throw an error.
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Internal error: Current device is not active.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
  }
 }

@@ -92,13 +88,11 @@ void measure_cold_base::record_measurements()
  ++m_total_samples;

  // Compute convergence statistics using CUDA timings:
-  const auto mean_cuda_time = m_total_cuda_time /
-                              static_cast<nvbench::float64_t>(m_total_samples);
-  const auto cuda_stdev =
-    nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
-                                                    m_cuda_times.cend(),
-                                                    mean_cuda_time);
-  auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
+  const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cuda_stdev     = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
+                                                                          m_cuda_times.cend(),
+                                                                          mean_cuda_time);
+  auto cuda_rel_stdev       = cuda_stdev / mean_cuda_time;
  if (std::isfinite(cuda_rel_stdev))
  {
    m_noise_tracker.push_back(cuda_rel_stdev);
@@ -132,10 +126,10 @@ bool measure_cold_base::is_finished()
    {
      // Use the current noise as the stdev reference.
      const auto current_noise = m_noise_tracker.back();
-      const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
-        m_noise_tracker.cbegin(),
-        m_noise_tracker.cend(),
-        current_noise);
+      const auto noise_stdev =
+        nvbench::detail::statistics::standard_deviation(m_noise_tracker.cbegin(),
+                                                        m_noise_tracker.cend(),
+                                                        current_noise);
      const auto noise_rel_stdev = noise_stdev / current_noise;

      // If the rel stdev of the last N cuda noise measurements is less than
@@ -162,13 +156,11 @@ bool measure_cold_base::is_finished()
 void measure_cold_base::run_trials_epilogue()
 {
  // Only need to compute this at the end, not per iteration.
-  const auto cpu_mean = m_total_cuda_time /
-                        static_cast<nvbench::float64_t>(m_total_samples);
-  const auto cpu_stdev =
-    nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
-                                                    m_cpu_times.cend(),
-                                                    cpu_mean);
-  m_cpu_noise = cpu_stdev / cpu_mean;
+  const auto cpu_mean  = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
+                                                                         m_cpu_times.cend(),
+                                                                         cpu_mean);
+  m_cpu_noise          = cpu_stdev / cpu_mean;

  m_walltime_timer.stop();
 }
@@ -199,8 +191,7 @@ void measure_cold_base::generate_summaries()
    auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
    summ.set_string("name", "Noise");
    summ.set_string("hint", "percentage");
-    summ.set_string("description",
-                    "Relative standard deviation of isolated CPU times");
+    summ.set_string("description", "Relative standard deviation of isolated CPU times");
    summ.set_float64("value", m_cpu_noise);
  }

@@ -219,12 +210,10 @@ void measure_cold_base::generate_summaries()
    auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
    summ.set_string("name", "Noise");
    summ.set_string("hint", "percentage");
-    summ.set_string("description",
-                    "Relative standard deviation of isolated GPU times");
+    summ.set_string("description", "Relative standard deviation of isolated GPU times");
    summ.set_float64("value",
-                     m_noise_tracker.empty()
-                       ? std::numeric_limits<nvbench::float64_t>::infinity()
-                       : m_noise_tracker.back());
+                     m_noise_tracker.empty() ? std::numeric_limits<nvbench::float64_t>::infinity()
+                                             : m_noise_tracker.back());
  }

  if (const auto items = m_state.get_element_count(); items != 0)
@@ -232,8 +221,7 @@ void measure_cold_base::generate_summaries()
    auto &summ = m_state.add_summary("nv/cold/bw/item_rate");
    summ.set_string("name", "Elem/s");
    summ.set_string("hint", "item_rate");
-    summ.set_string("description",
-                    "Number of input elements processed per second");
+    summ.set_string("description", "Number of input elements processed per second");
    summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
  }

@@ -251,8 +239,8 @@ void measure_cold_base::generate_summaries()
    }

    {
-      const auto peak_gmem_bw = static_cast<double>(
-        m_state.get_device()->get_global_memory_bus_bandwidth());
+      const auto peak_gmem_bw =
+        static_cast<double>(m_state.get_device()->get_global_memory_bus_bandwidth());

      auto &summ = m_state.add_summary("nv/cold/bw/global/utilization");
      summ.set_string("name", "BWUtil");
@@ -274,8 +262,7 @@ void measure_cold_base::generate_summaries()
  }

  // Log if a printer exists:
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
  {
    auto &printer = printer_opt_ref.value().get();

@@ -324,10 +311,7 @@ void measure_cold_base::generate_summaries()
                            m_walltime_timer.get_duration(),
                            m_total_samples));

-    printer.process_bulk_data(m_state,
-                              "nv/cold/sample_times",
-                              "sample_times",
-                              m_cuda_times);
+    printer.process_bulk_data(m_state, "nv/cold/sample_times", "sample_times", m_cuda_times);
  }
 }

--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -49,10 +49,10 @@ namespace detail
 struct measure_cold_base
 {
  explicit measure_cold_base(nvbench::state &exec_state);
-  measure_cold_base(const measure_cold_base &) = delete;
-  measure_cold_base(measure_cold_base &&)      = delete;
+  measure_cold_base(const measure_cold_base &)            = delete;
+  measure_cold_base(measure_cold_base &&)                 = delete;
  measure_cold_base &operator=(const measure_cold_base &) = delete;
-  measure_cold_base &operator=(measure_cold_base &&) = delete;
+  measure_cold_base &operator=(measure_cold_base &&)      = delete;

 protected:
  template <bool use_blocking_kernel>
@@ -68,10 +68,7 @@ protected:

  void check_skip_time(nvbench::float64_t warmup_time);

-  __forceinline__ void flush_device_l2()
-  {
-    m_l2flush.flush(m_launch.get_stream());
-  }
+  __forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }

  __forceinline__ void sync_stream() const
  {
--- a/nvbench/detail/measure_cupti.cu
+++ b/nvbench/detail/measure_cupti.cu
@@ -50,8 +50,7 @@ struct metric_traits;
 template <>
 struct metric_traits<metric_id::dram_peak_sustained_throughput>
 {
-  static constexpr const char *metric_name =
-    "dram__throughput.avg.pct_of_peak_sustained_elapsed";
+  static constexpr const char *metric_name = "dram__throughput.avg.pct_of_peak_sustained_elapsed";

  static constexpr const char *name = "HBWPeak";
  static constexpr const char *hint = "percentage";
@@ -119,10 +118,7 @@ struct metric_traits<metric_id::l1_hit_rate>
  static constexpr const char *description = "Hit rate at L1 cache.";
  static constexpr double divider          = 100.0;

-  static bool is_collected(nvbench::state &m_state)
-  {
-    return m_state.is_l1_hit_rate_collected();
-  };
+  static bool is_collected(nvbench::state &m_state) { return m_state.is_l1_hit_rate_collected(); };
 };

 template <>
@@ -134,10 +130,7 @@ struct metric_traits<metric_id::l2_hit_rate>
  static constexpr const char *description = "Hit rate at L2 cache.";
  static constexpr double divider          = 100.0;

-  static bool is_collected(nvbench::state &m_state)
-  {
-    return m_state.is_l2_hit_rate_collected();
-  };
+  static bool is_collected(nvbench::state &m_state) { return m_state.is_l2_hit_rate_collected(); };
 };

 template <metric_id id = metric_id::dram_peak_sustained_throughput>
@@ -153,8 +146,7 @@ void add_metrics_impl(nvbench::state &state, std::vector<std::string> &metrics)
 }

 template <>
-void add_metrics_impl<metric_id::count>(nvbench::state &,
-                                        std::vector<std::string> &)
+void add_metrics_impl<metric_id::count>(nvbench::state &, std::vector<std::string> &)
 {}

 std::vector<std::string> add_metrics(nvbench::state &state)
@@ -179,13 +171,11 @@ try
 // clang-format on
 catch (const std::exception &ex)
 {
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref)
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref)
  {
    auto &printer = printer_opt_ref.value().get();
    printer.log(nvbench::log_level::warn,
-                fmt::format("CUPTI failed to construct profiler: {}",
-                            ex.what()));
+                fmt::format("CUPTI failed to construct profiler: {}", ex.what()));
  }
 }

@@ -194,15 +184,11 @@ void measure_cupti_base::check()
  const auto device = m_state.get_device();
  if (!device)
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Device required for `cupti` measurement.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cupti` measurement.");
  }
  if (!device->is_active())
  { // This means something went wrong higher up. Throw an error.
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Internal error: Current device is not active.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
  }
 }

@@ -210,16 +196,13 @@ namespace
 {

 template <metric_id id = metric_id::dram_peak_sustained_throughput>
-void gen_summary(std::size_t result_id,
-                 nvbench::state &m_state,
-                 const std::vector<double> &result)
+void gen_summary(std::size_t result_id, nvbench::state &m_state, const std::vector<double> &result)
 {
  using metric = metric_traits<id>;

  if (metric::is_collected(m_state))
  {
-    auto &summ =
-      m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
+    auto &summ = m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
    summ.set_string("name", metric::name);
    summ.set_string("hint", metric::hint);
    summ.set_string("description", metric::description);
@@ -231,9 +214,7 @@ void gen_summary(std::size_t result_id,
 }

 template <>
-void gen_summary<metric_id::count>(std::size_t,
-                                   nvbench::state &,
-                                   const std::vector<double> &)
+void gen_summary<metric_id::count>(std::size_t, nvbench::state &, const std::vector<double> &)
 {}

 void gen_summaries(nvbench::state &state, const std::vector<double> &result)
@@ -266,8 +247,7 @@ try
  }

  // Log if a printer exists:
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
  {
    auto &printer = printer_opt_ref.value().get();
    printer.log(nvbench::log_level::pass,
@@ -278,13 +258,11 @@ try
 }
 catch (const std::exception &ex)
 {
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref)
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref)
  {
    auto &printer = printer_opt_ref.value().get();
    printer.log(nvbench::log_level::warn,
-                fmt::format("CUPTI failed to generate the summary: {}",
-                            ex.what()));
+                fmt::format("CUPTI failed to generate the summary: {}", ex.what()));
  }
 }

--- a/nvbench/detail/measure_cupti.cuh
+++ b/nvbench/detail/measure_cupti.cuh
@@ -50,10 +50,10 @@ namespace detail
 struct measure_cupti_base
 {
  explicit measure_cupti_base(nvbench::state &exec_state);
-  measure_cupti_base(const measure_cupti_base &) = delete;
-  measure_cupti_base(measure_cupti_base &&)      = delete;
+  measure_cupti_base(const measure_cupti_base &)            = delete;
+  measure_cupti_base(measure_cupti_base &&)                 = delete;
  measure_cupti_base &operator=(const measure_cupti_base &) = delete;
-  measure_cupti_base &operator=(measure_cupti_base &&) = delete;
+  measure_cupti_base &operator=(measure_cupti_base &&)      = delete;

 protected:
  struct kernel_launch_timer;
@@ -61,10 +61,7 @@ protected:
  void check();
  void generate_summaries();

-  __forceinline__ void flush_device_l2()
-  {
-    m_l2flush.flush(m_launch.get_stream());
-  }
+  __forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }

  __forceinline__ void sync_stream() const
  {
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -47,9 +47,8 @@ measure_hot_base::measure_hot_base(state &exec_state)
  // to match the cold result if available.
  try
  {
-    nvbench::int64_t cold_samples =
-      m_state.get_summary("nv/cold/sample_size").get_int64("value");
-    m_min_samples = std::max(m_min_samples, cold_samples);
+    nvbench::int64_t cold_samples = m_state.get_summary("nv/cold/sample_size").get_int64("value");
+    m_min_samples                 = std::max(m_min_samples, cold_samples);

    // If the cold measurement ran successfully, disable skip_time. It'd just
    // be annoying to skip now.
@@ -72,15 +71,11 @@ void measure_hot_base::check()
  const auto device = m_state.get_device();
  if (!device)
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Device required for `hot` measurement.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Device required for `hot` measurement.");
  }
  if (!device->is_active())
  { // This means something went wrong higher up. Throw an error.
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Internal error: Current device is not active.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
  }
 }

@@ -116,8 +111,7 @@ void measure_hot_base::generate_summaries()
  }

  // Log if a printer exists:
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
  {
    auto &printer = printer_opt_ref.value().get();

--- a/nvbench/detail/measure_hot.cuh
+++ b/nvbench/detail/measure_hot.cuh
@@ -41,10 +41,10 @@ namespace detail
 struct measure_hot_base
 {
  explicit measure_hot_base(nvbench::state &exec_state);
-  measure_hot_base(const measure_hot_base &) = delete;
-  measure_hot_base(measure_hot_base &&)      = delete;
+  measure_hot_base(const measure_hot_base &)            = delete;
+  measure_hot_base(measure_hot_base &&)                 = delete;
  measure_hot_base &operator=(const measure_hot_base &) = delete;
-  measure_hot_base &operator=(measure_hot_base &&) = delete;
+  measure_hot_base &operator=(measure_hot_base &&)      = delete;

 protected:
  void check();
@@ -131,7 +131,7 @@ private:
    // The .95 factor here pads the batch_size a bit to avoid needing a second
    // batch due to noise.
    const auto time_estimate = m_cuda_timer.get_duration() * 0.95;
-    auto batch_size = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
+    auto batch_size          = static_cast<nvbench::int64_t>(m_min_time / time_estimate);

    do
    {
@@ -142,7 +142,7 @@ private:
        // Block stream until some work is queued.
        // Limit the number of kernel executions while blocked to prevent
        // deadlocks. See warnings on blocking_kernel.
-        const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
+        const auto blocked_launches   = std::min(batch_size, nvbench::int64_t{2});
        const auto unblocked_launches = batch_size - blocked_launches;

        this->block_stream();
@@ -189,7 +189,6 @@ private:
        break; // Stop iterating
      }

-
      m_walltime_timer.stop();
      if (m_walltime_timer.get_duration() > m_timeout)
      {
--- a/nvbench/detail/ring_buffer.cuh
+++ b/nvbench/detail/ring_buffer.cuh
@@ -58,18 +58,12 @@ struct ring_buffer
  /**
   * The number of valid values in the ring buffer. Always <= capacity().
   */
-  [[nodiscard]] std::size_t size() const
-  {
-    return m_full ? m_buffer.size() : m_index;
-  }
+  [[nodiscard]] std::size_t size() const { return m_full ? m_buffer.size() : m_index; }

  /**
   * The maximum size of the ring buffer.
   */
-  [[nodiscard]] std::size_t capacity() const
-  {
-    return m_buffer.size();
-  }
+  [[nodiscard]] std::size_t capacity() const { return m_buffer.size(); }

  /**
   * @return True if the ring buffer is empty.
--- a/nvbench/detail/state_exec.cuh
+++ b/nvbench/detail/state_exec.cuh
@@ -53,7 +53,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
                "`ExecTags` argument must be a member (or combination of "
                "members) from nvbench::exec_tag.");

-  constexpr auto measure_tags = tags & measure_mask;
+  constexpr auto measure_tags  = tags & measure_mask;
  constexpr auto modifier_tags = tags & modifier_mask;

  // "run once" is handled by the cold measurement:
@@ -81,8 +81,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
    }
    else
    {
-      this->exec(cold | hot | tags,
-                 std::forward<KernelLauncher>(kernel_launcher));
+      this->exec(cold | hot | tags, std::forward<KernelLauncher>(kernel_launcher));
    }
    return;
  }
@@ -99,8 +98,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
    constexpr bool use_blocking_kernel = !(tags & no_block);
    if constexpr (tags & timer)
    {
-      // Estimate bandwidth here
-      #ifdef NVBENCH_HAS_CUPTI
+// Estimate bandwidth here
+#ifdef NVBENCH_HAS_CUPTI
      if constexpr (!(modifier_tags & run_once))
      {
        if (this->is_cupti_required())
@@ -110,7 +109,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
          measure();
        }
      }
-      #endif
+#endif

      using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
      measure_t measure{*this, kernel_launcher};
@@ -121,8 +120,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
      using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
      wrapper_t wrapper{kernel_launcher};

-      // Estimate bandwidth here
-      #ifdef NVBENCH_HAS_CUPTI
+// Estimate bandwidth here
+#ifdef NVBENCH_HAS_CUPTI
      if constexpr (!(modifier_tags & run_once))
      {
        if (this->is_cupti_required())
@@ -132,10 +131,9 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
          measure();
        }
      }
-      #endif
+#endif

-      using measure_t =
-        nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
+      using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
      measure_t measure(*this, wrapper);
      measure();
    }
@@ -143,12 +141,10 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)

  if constexpr (tags & hot)
  {
-    static_assert(!(tags & sync),
-                  "Hot measurement doesn't support the `sync` exec_tag.");
-    static_assert(!(tags & timer),
-                  "Hot measurement doesn't support the `timer` exec_tag.");
+    static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag.");
+    static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
    constexpr bool use_blocking_kernel = !(tags & no_block);
-    using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
+    using measure_t                    = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
    measure_t measure{*this, kernel_launcher};
    measure();
  }
--- a/nvbench/detail/state_generator.cxx
+++ b/nvbench/detail/state_generator.cxx
@@ -40,9 +40,7 @@ void state_iterator::add_axis(const nvbench::axis_base &axis)
  this->add_axis(axis.get_name(), axis.get_type(), axis.get_size());
 }

-void state_iterator::add_axis(std::string axis,
-                              nvbench::axis_type type,
-                              std::size_t size)
+void state_iterator::add_axis(std::string axis, nvbench::axis_type type, std::size_t size)
 {
  m_indices.push_back({std::move(axis), type, std::size_t{0}, size});
 }
@@ -74,10 +72,7 @@ state_iterator::get_current_indices() const
  return m_indices;
 }

-[[nodiscard]] bool state_iterator::iter_valid() const
-{
-  return m_current < m_total;
-}
+[[nodiscard]] bool state_iterator::iter_valid() const { return m_current < m_total; }

 void state_iterator::next()
 {
@@ -102,7 +97,7 @@ state_generator::state_generator(const benchmark_base &bench)

 void state_generator::build_axis_configs()
 {
-  const axes_metadata &axes = m_benchmark.get_axes();
+  const axes_metadata &axes                               = m_benchmark.get_axes();
  const std::vector<std::unique_ptr<axis_base>> &axes_vec = axes.get_axes();

  // Construct two state_generators:
@@ -118,35 +113,29 @@ void state_generator::build_axis_configs()
    type_axes.reserve(axes_vec.size());

    // Filter all axes by into type and non-type:
-    std::for_each(axes_vec.cbegin(),
-                  axes_vec.cend(),
-                  [&non_type_si, &type_axes](const auto &axis) {
-                    if (axis->get_type() == nvbench::axis_type::type)
-                    {
-                      type_axes.push_back(
-                        std::cref(static_cast<const type_axis &>(*axis)));
-                    }
-                    else
-                    {
-                      non_type_si.add_axis(*axis);
-                    }
-                  });
+    std::for_each(axes_vec.cbegin(), axes_vec.cend(), [&non_type_si, &type_axes](const auto &axis) {
+      if (axis->get_type() == nvbench::axis_type::type)
+      {
+        type_axes.push_back(std::cref(static_cast<const type_axis &>(*axis)));
+      }
+      else
+      {
+        non_type_si.add_axis(*axis);
+      }
+    });

    // Reverse sort type axes by index. This way the state_generator's cartesian
    // product of the type axes values will be enumerated in the same order as
    // nvbench::tl::cartesian_product<type_axes>. This is necessary to ensure
    // that the correct states are passed to the corresponding benchmark
    // instantiations.
-    std::sort(type_axes.begin(),
-              type_axes.end(),
-              [](const auto &axis_1, const auto &axis_2) {
-                return axis_1.get().get_axis_index() >
-                       axis_2.get().get_axis_index();
-              });
+    std::sort(type_axes.begin(), type_axes.end(), [](const auto &axis_1, const auto &axis_2) {
+      return axis_1.get().get_axis_index() > axis_2.get().get_axis_index();
+    });

-    std::for_each(type_axes.cbegin(),
-                  type_axes.cend(),
-                  [&type_si](const auto &axis) { type_si.add_axis(axis); });
+    std::for_each(type_axes.cbegin(), type_axes.cend(), [&type_si](const auto &axis) {
+      type_si.add_axis(axis);
+    });
  }

  // type_axis_configs generation:
@@ -157,8 +146,8 @@ void state_generator::build_axis_configs()
    // Build type_axis_configs
    for (type_si.init(); type_si.iter_valid(); type_si.next())
    {
-      auto &[config, active_mask] = m_type_axis_configs.emplace_back(
-        std::make_pair(nvbench::named_values{}, true));
+      auto &[config, active_mask] =
+        m_type_axis_configs.emplace_back(std::make_pair(nvbench::named_values{}, true));

      // Reverse the indices so they're once again in the same order as
      // specified:
@@ -173,8 +162,7 @@ void state_generator::build_axis_configs()
          active_mask = false;
        }

-        config.set_string(axis_info.axis,
-                          axis.get_input_string(axis_info.index));
+        config.set_string(axis_info.axis, axis.get_input_string(axis_info.index));
      }
    } // type_si
  }   // type_axis_config generation
@@ -199,21 +187,18 @@ void state_generator::build_axis_configs()
            break;

          case axis_type::int64:
-            config.set_int64(
-              axis_info.axis,
-              axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
+            config.set_int64(axis_info.axis,
+                             axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
            break;

          case axis_type::float64:
-            config.set_float64(
-              axis_info.axis,
-              axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
+            config.set_float64(axis_info.axis,
+                               axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
            break;

          case axis_type::string:
-            config.set_string(
-              axis_info.axis,
-              axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
+            config.set_string(axis_info.axis,
+                              axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
            break;
        } // switch (type)
      }   // for (axis_info : current_indices)
@@ -239,15 +224,12 @@ void state_generator::build_states()
  }
 }

-void state_generator::add_states_for_device(
-  const std::optional<device_info> &device)
+void state_generator::add_states_for_device(const std::optional<device_info> &device)
 {
  const auto num_type_configs = m_type_axis_configs.size();
-  for (std::size_t type_config_index = 0; type_config_index < num_type_configs;
-       ++type_config_index)
+  for (std::size_t type_config_index = 0; type_config_index < num_type_configs; ++type_config_index)
  {
-    const auto &[type_config,
-                 axis_mask] = m_type_axis_configs[type_config_index];
+    const auto &[type_config, axis_mask] = m_type_axis_configs[type_config_index];

    if (!axis_mask)
    { // Don't generate inner vector if the type config is masked out.
@@ -261,10 +243,7 @@ void state_generator::add_states_for_device(
      config.append(non_type_config);

      // Create benchmark:
-      m_states.push_back(nvbench::state{m_benchmark,
-                                        std::move(config),
-                                        device,
-                                        type_config_index});
+      m_states.push_back(nvbench::state{m_benchmark, std::move(config), device, type_config_index});
    }
  }
 }
--- a/nvbench/detail/statistics.cuh
+++ b/nvbench/detail/statistics.cuh
@@ -36,8 +36,7 @@ namespace nvbench::detail::statistics
 *
 * If the input has fewer than 5 sample, infinity is returned.
 */
-template <typename Iter,
-          typename ValueType = typename std::iterator_traits<Iter>::value_type>
+template <typename Iter, typename ValueType = typename std::iterator_traits<Iter>::value_type>
 ValueType standard_deviation(Iter first, Iter last, ValueType mean)
 {
  static_assert(std::is_floating_point_v<ValueType>);
--- a/nvbench/detail/throw.cuh
+++ b/nvbench/detail/throw.cuh
@@ -21,17 +21,15 @@
 #include <fmt/format.h>
 #include <stdexcept>

-#define NVBENCH_THROW(exception_type, format_str, ...)                         \
-  throw exception_type(fmt::format("{}:{}: {}",                                \
-                                   __FILE__,                                   \
-                                   __LINE__,                                   \
-                                   fmt::format(format_str, __VA_ARGS__)))
+#define NVBENCH_THROW(exception_type, format_str, ...)                                             \
+  throw exception_type(                                                                            \
+    fmt::format("{}:{}: {}", __FILE__, __LINE__, fmt::format(format_str, __VA_ARGS__)))

-#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...)           \
-  do                                                                           \
-  {                                                                            \
-    if (condition)                                                             \
-    {                                                                          \
-      NVBENCH_THROW(exception_type, format_str, __VA_ARGS__);                  \
-    }                                                                          \
+#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...)                               \
+  do                                                                                               \
+  {                                                                                                \
+    if (condition)                                                                                 \
+    {                                                                                              \
+      NVBENCH_THROW(exception_type, format_str, __VA_ARGS__);                                      \
+    }                                                                                              \
  } while (false)
--- a/nvbench/detail/transform_reduce.cuh
+++ b/nvbench/detail/transform_reduce.cuh
@@ -27,10 +27,7 @@
 namespace nvbench::detail
 {

-template <typename InIterT,
-          typename InitValueT,
-          typename ReduceOp,
-          typename TransformOp>
+template <typename InIterT, typename InitValueT, typename ReduceOp, typename TransformOp>
 InitValueT transform_reduce(InIterT first,
                            InIterT last,
                            InitValueT init,
--- a/nvbench/detail/type_list_impl.cuh
+++ b/nvbench/detail/type_list_impl.cuh
@@ -20,12 +20,10 @@ namespace tl::detail
 {

 template <typename... Ts>
-auto size(nvbench::type_list<Ts...>)
-  -> std::integral_constant<std::size_t, sizeof...(Ts)>;
+auto size(nvbench::type_list<Ts...>) -> std::integral_constant<std::size_t, sizeof...(Ts)>;

 template <std::size_t I, typename... Ts>
-auto get(nvbench::type_list<Ts...>)
-  -> std::tuple_element_t<I, std::tuple<Ts...>>;
+auto get(nvbench::type_list<Ts...>) -> std::tuple_element_t<I, std::tuple<Ts...>>;

 template <typename... Ts, typename... Us>
 auto concat(nvbench::type_list<Ts...>, nvbench::type_list<Us...>)
@@ -44,9 +42,8 @@ struct prepend_each<T, nvbench::type_list<>>
 template <typename T, typename TL, typename... TLTail>
 struct prepend_each<T, nvbench::type_list<TL, TLTail...>>
 {
-  using cur = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
-  using next =
-    typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
+  using cur  = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
+  using next = typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
  using type = decltype(detail::concat(nvbench::type_list<cur>{}, next{}));
 };

@@ -71,23 +68,20 @@ struct cartesian_product<nvbench::type_list<nvbench::type_list<>, TLTail...>>
 template <typename T, typename... Ts>
 struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Ts...>>>
 {
-  using cur = nvbench::type_list<nvbench::type_list<T>>;
-  using next =
-    std::conditional_t<sizeof...(Ts) != 0,
-                       typename detail::cartesian_product<
-                         nvbench::type_list<nvbench::type_list<Ts...>>>::type,
-                       nvbench::type_list<>>;
+  using cur  = nvbench::type_list<nvbench::type_list<T>>;
+  using next = std::conditional_t<
+    sizeof...(Ts) != 0,
+    typename detail::cartesian_product<nvbench::type_list<nvbench::type_list<Ts...>>>::type,
+    nvbench::type_list<>>;
  using type = decltype(detail::concat(cur{}, next{}));
 };

 template <typename T, typename... Tail, typename TL, typename... TLTail>
-struct cartesian_product<
-  nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
+struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
 {
-  using tail_prod =
-    typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
-  using cur  = typename detail::prepend_each<T, tail_prod>::type;
-  using next = typename detail::cartesian_product<
+  using tail_prod = typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
+  using cur       = typename detail::prepend_each<T, tail_prod>::type;
+  using next      = typename detail::cartesian_product<
    nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
  using type = decltype(detail::concat(cur{}, next{}));
 };
--- a/nvbench/device_info.cu
+++ b/nvbench/device_info.cu
@@ -65,17 +65,15 @@ void device_info::set_persistence_mode(bool state)
 #else  // NVBENCH_HAS_NVML
 try
 {
-  NVBENCH_NVML_CALL(nvmlDeviceSetPersistenceMode(
-    m_nvml_device,
-    state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
+  NVBENCH_NVML_CALL(
+    nvmlDeviceSetPersistenceMode(m_nvml_device,
+                                 state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
 }
 catch (nvml::call_failed &e)
 {
  if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED)
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Persistence mode is only supported on Linux.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Persistence mode is only supported on Linux.");
  }
  else if (e.get_error_code() == NVML_ERROR_NO_PERMISSION)
  {
@@ -104,30 +102,26 @@ try
      break;

    case clock_rate::base:
-      NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(
-        m_nvml_device,
-        static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
-        static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
+      NVBENCH_NVML_CALL(
+        nvmlDeviceSetGpuLockedClocks(m_nvml_device,
+                                     static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
+                                     static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
      break;

    case clock_rate::maximum: {
-      const auto max_mhz = static_cast<unsigned int>(
-        this->get_sm_default_clock_rate() / (1000 * 1000));
-      NVBENCH_NVML_CALL(
-        nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
+      const auto max_mhz =
+        static_cast<unsigned int>(this->get_sm_default_clock_rate() / (1000 * 1000));
+      NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
      break;
    }

    default:
-      NVBENCH_THROW(std::runtime_error,
-                    "Unrecognized clock rate: {}",
-                    static_cast<int>(rate));
+      NVBENCH_THROW(std::runtime_error, "Unrecognized clock rate: {}", static_cast<int>(rate));
  }
 }
 catch (nvml::call_failed &e)
 {
-  if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED &&
-      this->get_sm_version() < 700)
+  if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED && this->get_sm_version() < 700)
  {
    NVBENCH_THROW(std::runtime_error,
                  "GPU clock rates can only be modified for Volta and later. "
@@ -156,9 +150,7 @@ catch (nvml::call_failed &e)
 {
  if (!is_active())
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "get_context is called for inactive device");
+    NVBENCH_THROW(std::runtime_error, "{}", "get_context is called for inactive device");
  }

  CUcontext cu_context;
--- a/nvbench/device_info.cuh
+++ b/nvbench/device_info.cuh
@@ -54,10 +54,7 @@ struct device_info
  [[nodiscard]] int get_id() const { return m_id; }

  /// @return The name of the device.
-  [[nodiscard]] std::string_view get_name() const
-  {
-    return std::string_view(m_prop.name);
-  }
+  [[nodiscard]] std::string_view get_name() const { return std::string_view(m_prop.name); }

  [[nodiscard]] bool is_active() const
  {
@@ -83,7 +80,6 @@ struct device_info
  /// @note Requires root / admin privileges.
  void set_persistence_mode(bool state);

-
  /// Symbolic values for special clock rates
  enum class clock_rate
  {
@@ -101,10 +97,7 @@ struct device_info
  void lock_gpu_clocks(clock_rate rate);

  /// @return The SM version of the current device as (major*100) + (minor*10).
-  [[nodiscard]] int get_sm_version() const
-  {
-    return m_prop.major * 100 + m_prop.minor * 10;
-  }
+  [[nodiscard]] int get_sm_version() const { return m_prop.major * 100 + m_prop.minor * 10; }

  /// @return The PTX version of the current device, e.g. sm_80 returns 800.
  [[nodiscard]] __forceinline__ int get_ptx_version() const
@@ -119,46 +112,25 @@ struct device_info
  }

  /// @return The number of physical streaming multiprocessors on this device.
-  [[nodiscard]] int get_number_of_sms() const
-  {
-    return m_prop.multiProcessorCount;
-  }
+  [[nodiscard]] int get_number_of_sms() const { return m_prop.multiProcessorCount; }

  /// @return The maximum number of resident blocks per SM.
-  [[nodiscard]] int get_max_blocks_per_sm() const
-  {
-    return m_prop.maxBlocksPerMultiProcessor;
-  }
+  [[nodiscard]] int get_max_blocks_per_sm() const { return m_prop.maxBlocksPerMultiProcessor; }

  /// @return The maximum number of resident threads per SM.
-  [[nodiscard]] int get_max_threads_per_sm() const
-  {
-    return m_prop.maxThreadsPerMultiProcessor;
-  }
+  [[nodiscard]] int get_max_threads_per_sm() const { return m_prop.maxThreadsPerMultiProcessor; }

  /// @return The maximum number of threads per block.
-  [[nodiscard]] int get_max_threads_per_block() const
-  {
-    return m_prop.maxThreadsPerBlock;
-  }
+  [[nodiscard]] int get_max_threads_per_block() const { return m_prop.maxThreadsPerBlock; }

  /// @return The number of registers per SM.
-  [[nodiscard]] int get_registers_per_sm() const
-  {
-    return m_prop.regsPerMultiprocessor;
-  }
+  [[nodiscard]] int get_registers_per_sm() const { return m_prop.regsPerMultiprocessor; }

  /// @return The number of registers per block.
-  [[nodiscard]] int get_registers_per_block() const
-  {
-    return m_prop.regsPerBlock;
-  }
+  [[nodiscard]] int get_registers_per_block() const { return m_prop.regsPerBlock; }

  /// @return The total number of bytes available in global memory.
-  [[nodiscard]] std::size_t get_global_memory_size() const
-  {
-    return m_prop.totalGlobalMem;
-  }
+  [[nodiscard]] std::size_t get_global_memory_size() const { return m_prop.totalGlobalMem; }

  struct memory_info
  {
@@ -176,10 +148,7 @@ struct device_info
  }

  /// @return The width of the global memory bus in bits.
-  [[nodiscard]] int get_global_memory_bus_width() const
-  {
-    return m_prop.memoryBusWidth;
-  }
+  [[nodiscard]] int get_global_memory_bus_width() const { return m_prop.memoryBusWidth; }

  //// @return The global memory bus bandwidth in bytes/sec.
  [[nodiscard]] std::size_t get_global_memory_bus_bandwidth() const
@@ -201,10 +170,7 @@ struct device_info
  }

  /// @return The available amount of shared memory in bytes per block.
-  [[nodiscard]] std::size_t get_shared_memory_per_block() const
-  {
-    return m_prop.sharedMemPerBlock;
-  }
+  [[nodiscard]] std::size_t get_shared_memory_per_block() const { return m_prop.sharedMemPerBlock; }

  /// @return True if ECC is enabled on this device.
  [[nodiscard]] bool get_ecc_state() const { return m_prop.ECCEnabled; }
@@ -224,23 +190,11 @@ struct device_info
 #endif

  /// @return A cached copy of the device's cudaDeviceProp.
-  [[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const
-  {
-    return m_prop;
-  }
+  [[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const { return m_prop; }

-  [[nodiscard]] bool operator<(const device_info &o) const
-  {
-    return m_id < o.m_id;
-  }
-  [[nodiscard]] bool operator==(const device_info &o) const
-  {
-    return m_id == o.m_id;
-  }
-  [[nodiscard]] bool operator!=(const device_info &o) const
-  {
-    return m_id != o.m_id;
-  }
+  [[nodiscard]] bool operator<(const device_info &o) const { return m_id < o.m_id; }
+  [[nodiscard]] bool operator==(const device_info &o) const { return m_id == o.m_id; }
+  [[nodiscard]] bool operator!=(const device_info &o) const { return m_id != o.m_id; }

 private:
  int m_id;
@@ -267,11 +221,10 @@ try
 {
  nvbench::detail::device_scope _{dev_id};
  cudaFuncAttributes attr{};
-  NVBENCH_CUDA_CALL(
-    cudaFuncGetAttributes(&attr, ((const void*)nvbench::detail::noop_kernel_ptr) ));
+  NVBENCH_CUDA_CALL(cudaFuncGetAttributes(&attr, ((const void *)nvbench::detail::noop_kernel_ptr)));
  return attr.ptxVersion * 10;
 }
-catch(...)
+catch (...)
 { // Fail gracefully when no appropriate PTX is found for this device.
  return -1;
 }
--- a/nvbench/device_manager.cuh
+++ b/nvbench/device_manager.cuh
@@ -40,10 +40,7 @@ struct device_manager
  /**
   * @return The total number of detected CUDA devices.
   */
-  [[nodiscard]] int get_number_of_devices() const
-  {
-    return static_cast<int>(m_devices.size());
-  }
+  [[nodiscard]] int get_number_of_devices() const { return static_cast<int>(m_devices.size()); }

  /**
   * @return The number of devices actually used by all benchmarks.
@@ -57,39 +54,27 @@ struct device_manager
  /**
   * @return The device_info object corresponding to `id`.
   */
-  [[nodiscard]] const nvbench::device_info &get_device(int id)
-  {
-    return m_devices.at(id);
-  }
+  [[nodiscard]] const nvbench::device_info &get_device(int id) { return m_devices.at(id); }

  /**
   * @return A vector containing device_info objects for all detected CUDA
   * devices.
   */
-  [[nodiscard]] const device_info_vector &get_devices() const
-  {
-    return m_devices;
-  }
+  [[nodiscard]] const device_info_vector &get_devices() const { return m_devices; }

  /**
   * @return A vector containing device_info objects for devices that are
   * actively used by all benchmarks.
   * @note This is only valid after nvbench::option_parser::parse executes.
   */
-  [[nodiscard]] const device_info_vector &get_used_devices() const
-  {
-    return m_used_devices;
-  }
+  [[nodiscard]] const device_info_vector &get_used_devices() const { return m_used_devices; }

 private:
  device_manager();

  friend struct option_parser;

-  void set_used_devices(device_info_vector devices)
-  {
-    m_used_devices = std::move(devices);
-  }
+  void set_used_devices(device_info_vector devices) { m_used_devices = std::move(devices); }

  device_info_vector m_devices;
  device_info_vector m_used_devices;
--- a/nvbench/enum_type_list.cuh
+++ b/nvbench/enum_type_list.cuh
@@ -64,10 +64,7 @@ struct type_strings<nvbench::enum_type<Value, T>>
    return std::to_string(Value);
  }

-  static std::string description()
-  {
-    return nvbench::demangle<nvbench::enum_type<Value, T>>();
-  }
+  static std::string description() { return nvbench::demangle<nvbench::enum_type<Value, T>>(); }
 };

 } // namespace nvbench
@@ -86,15 +83,13 @@ struct type_strings<nvbench::enum_type<Value, T>>
 * \relatesalso enum_type_list
 * \relatesalso nvbench::enum_type_list
 */
-#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T,                                   \
-                                          input_generator,                     \
-                                          description_generator)               \
-  namespace nvbench                                                            \
-  {                                                                            \
-  template <T Value>                                                           \
-  struct type_strings<enum_type<Value, T>>                                     \
-  {                                                                            \
-    static std::string input_string() { return input_generator(Value); }       \
-    static std::string description() { return description_generator(Value); }  \
-  };                                                                           \
+#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T, input_generator, description_generator)               \
+  namespace nvbench                                                                                \
+  {                                                                                                \
+  template <T Value>                                                                               \
+  struct type_strings<enum_type<Value, T>>                                                         \
+  {                                                                                                \
+    static std::string input_string() { return input_generator(Value); }                           \
+    static std::string description() { return description_generator(Value); }                      \
+  };                                                                                               \
  }
--- a/nvbench/exec_tag.cuh
+++ b/nvbench/exec_tag.cuh
@@ -31,16 +31,16 @@ enum class exec_flag
  none = 0x0,

  // Modifiers:
-  timer    = 0x01, // KernelLauncher uses manual timing
-  no_block = 0x02, // Disables use of `blocking_kernel`.
-  sync     = 0x04, // KernelLauncher has indicated that it will sync
-  run_once = 0x08, // Only run the benchmark once (for profiling).
+  timer         = 0x01, // KernelLauncher uses manual timing
+  no_block      = 0x02, // Disables use of `blocking_kernel`.
+  sync          = 0x04, // KernelLauncher has indicated that it will sync
+  run_once      = 0x08, // Only run the benchmark once (for profiling).
  modifier_mask = timer | no_block | sync | run_once,

  // Measurement types:
-  cold = 0x0100, // measure_hot
-  hot  = 0x0200, // measure_cold
-  measure_mask  = cold | hot
+  cold         = 0x0100, // measure_cold
+  hot          = 0x0200, // measure_hot
+  measure_mask = cold | hot
 };

 } // namespace nvbench::detail
@@ -120,7 +120,6 @@ constexpr inline auto timer = nvbench::exec_tag::impl::timer;

 /// Modifier used to indicate that the KernelGenerator will perform CUDA
 /// synchronizations. Without this flag such benchmarks will deadlock.
-constexpr inline auto sync = nvbench::exec_tag::impl::no_block |
-                             nvbench::exec_tag::impl::sync;
+constexpr inline auto sync = nvbench::exec_tag::impl::no_block | nvbench::exec_tag::impl::sync;

 } // namespace nvbench::exec_tag
--- a/nvbench/flags.cuh
+++ b/nvbench/flags.cuh
@@ -20,24 +20,24 @@

 #include <type_traits>

-#define NVBENCH_DECLARE_FLAGS(T)                                               \
-  constexpr inline T operator|(T v1, T v2)                                     \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2));          \
-  }                                                                            \
-  constexpr inline T operator&(T v1, T v2)                                     \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2));          \
-  }                                                                            \
-  constexpr inline T operator^(T v1, T v2)                                     \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2));          \
-  }                                                                            \
-  constexpr inline T operator~(T v1)                                           \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(~static_cast<UT>(v1));                               \
+#define NVBENCH_DECLARE_FLAGS(T)                                                                   \
+  constexpr inline T operator|(T v1, T v2)                                                         \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2));                              \
+  }                                                                                                \
+  constexpr inline T operator&(T v1, T v2)                                                         \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2));                              \
+  }                                                                                                \
+  constexpr inline T operator^(T v1, T v2)                                                         \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2));                              \
+  }                                                                                                \
+  constexpr inline T operator~(T v1)                                                               \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(~static_cast<UT>(v1));                                                   \
  }
--- a/nvbench/float64_axis.cuh
+++ b/nvbench/float64_axis.cuh
@@ -36,20 +36,11 @@ struct float64_axis final : public axis_base

  ~float64_axis() final;

-  void set_inputs(std::vector<nvbench::float64_t> inputs)
-  {
-    m_values = std::move(inputs);
-  }
-  [[nodiscard]] nvbench::float64_t get_value(std::size_t i) const
-  {
-    return m_values[i];
-  }
+  void set_inputs(std::vector<nvbench::float64_t> inputs) { m_values = std::move(inputs); }
+  [[nodiscard]] nvbench::float64_t get_value(std::size_t i) const { return m_values[i]; }

 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<float64_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const { return std::make_unique<float64_axis>(*this); }
  std::size_t do_get_size() const final { return m_values.size(); }
  std::string do_get_input_string(std::size_t i) const final;
  std::string do_get_description(std::size_t i) const final;
--- a/nvbench/git_revision.cuh
+++ b/nvbench/git_revision.cuh
@@ -1,20 +1,20 @@
 /*
-*  Copyright 2021 NVIDIA Corporation
-*
-*  Licensed under the Apache License, Version 2.0 with the LLVM exception
-*  (the "License"); you may not use this file except in compliance with
-*  the License.
-*
-*  You may obtain a copy of the License at
-*
-*      http://llvm.org/foundation/relicensing/LICENSE.txt
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*/
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */

 #pragma once

@@ -25,7 +25,5 @@
 #define NVBENCH_GIT_SHA1 NVBench_GIT_SHA1
 #define NVBENCH_GIT_VERSION NVBench_GIT_VERSION
 #ifdef NVBench_GIT_IS_DIRTY
-#  define NVBENCH_GIT_IS_DIRTY
+#define NVBENCH_GIT_IS_DIRTY
 #endif
-
-
--- a/nvbench/int64_axis.cuh
+++ b/nvbench/int64_axis.cuh
@@ -58,28 +58,18 @@ struct int64_axis final : public axis_base
    return static_cast<bool>(m_flags & int64_axis_flags::power_of_two);
  }

-  void set_inputs(std::vector<int64_t> inputs,
-                  int64_axis_flags flags = int64_axis_flags::none);
+  void set_inputs(std::vector<int64_t> inputs, int64_axis_flags flags = int64_axis_flags::none);

-  [[nodiscard]] const std::vector<int64_t> &get_inputs() const
-  {
-    return m_inputs;
-  };
+  [[nodiscard]] const std::vector<int64_t> &get_inputs() const { return m_inputs; };

  [[nodiscard]] int64_t get_value(std::size_t i) const { return m_values[i]; };

-  [[nodiscard]] const std::vector<int64_t> &get_values() const
-  {
-    return m_values;
-  };
+  [[nodiscard]] const std::vector<int64_t> &get_values() const { return m_values; };

  int64_axis_flags get_flags() const { return m_flags; }

  // Helper functions for pow2 conversions:
-  static nvbench::int64_t compute_pow2(nvbench::int64_t exponent)
-  {
-    return 1ll << exponent;
-  }
+  static nvbench::int64_t compute_pow2(nvbench::int64_t exponent) { return 1ll << exponent; }

  // UB if value < 0.
  static nvbench::int64_t compute_log2(nvbench::int64_t value)
@@ -95,10 +85,7 @@ struct int64_axis final : public axis_base
  };

 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<int64_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const { return std::make_unique<int64_axis>(*this); }
  std::size_t do_get_size() const final { return m_inputs.size(); }
  std::string do_get_input_string(std::size_t) const final;
  std::string do_get_description(std::size_t) const final;
--- a/nvbench/int64_axis.cxx
+++ b/nvbench/int64_axis.cxx
@@ -66,9 +66,8 @@ std::string int64_axis::do_get_input_string(std::size_t i) const

 std::string int64_axis::do_get_description(std::size_t i) const
 {
-  return this->is_power_of_two()
-           ? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
-           : std::string{};
+  return this->is_power_of_two() ? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
+                                 : std::string{};
 }

 std::string_view int64_axis::do_get_flags_as_string() const
--- a/nvbench/internal/markdown_table.cuh
+++ b/nvbench/internal/markdown_table.cuh
@@ -85,8 +85,7 @@ private:
                            " {:^{}} ",
                            col.header,
                            col.max_width);
-      iter =
-        fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+      iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
    }
    return fmt::format_to(iter, "\n");
  }
@@ -102,8 +101,7 @@ private:
                            "{:-^{}}",
                            "",
                            col.max_width + 2);
-      iter =
-        fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+      iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
    }
    return fmt::format_to(iter, "\n");
  }
@@ -116,8 +114,7 @@ private:

    for (std::size_t row = 0; row < m_num_rows; ++row)
    {
-      iter =
-        fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+      iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
      for (const column &col : m_columns)
      {
        iter = fmt::format_to(iter,
@@ -125,8 +122,7 @@ private:
                              " {:>{}} ",
                              col.rows[row],
                              col.max_width);
-        iter =
-          fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+        iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
      } // cols

      iter = fmt::format_to(iter, "\n");
--- a/nvbench/internal/nvml.cuh
+++ b/nvbench/internal/nvml.cuh
@@ -74,10 +74,7 @@ struct call_failed : error

  [[nodiscard]] nvmlReturn_t get_error_code() const { return m_error_code; }

-  [[nodiscard]] const std::string &get_error_string() const
-  {
-    return m_error_string;
-  }
+  [[nodiscard]] const std::string &get_error_string() const { return m_error_string; }

 private:
  nvmlReturn_t m_error_code;
@@ -90,30 +87,26 @@ private:

 #ifdef NVBENCH_HAS_NVML

-#define NVBENCH_NVML_CALL(call)                                                \
-  do                                                                           \
-  {                                                                            \
-    const auto _rr = call;                                                     \
-    if (_rr != NVML_SUCCESS)                                                   \
-    {                                                                          \
-      throw nvbench::nvml::call_failed(__FILE__,                               \
-                                       __LINE__,                               \
-                                       #call,                                  \
-                                       _rr,                                    \
-                                       nvmlErrorString(_rr));                  \
-    }                                                                          \
+#define NVBENCH_NVML_CALL(call)                                                                    \
+  do                                                                                               \
+  {                                                                                                \
+    const auto _rr = call;                                                                         \
+    if (_rr != NVML_SUCCESS)                                                                       \
+    {                                                                                              \
+      throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, nvmlErrorString(_rr));      \
+    }                                                                                              \
  } while (false)

 // Same as above, but used for nvmlInit(), where a failure means that
 // nvmlErrorString is not available.
-#define NVBENCH_NVML_CALL_NO_API(call)                                         \
-  do                                                                           \
-  {                                                                            \
-    const auto _rr = call;                                                     \
-    if (_rr != NVML_SUCCESS)                                                   \
-    {                                                                          \
-      throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, "");    \
-    }                                                                          \
+#define NVBENCH_NVML_CALL_NO_API(call)                                                             \
+  do                                                                                               \
+  {                                                                                                \
+    const auto _rr = call;                                                                         \
+    if (_rr != NVML_SUCCESS)                                                                       \
+    {                                                                                              \
+      throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, "");                        \
+    }                                                                                              \
  } while (false)

 #endif // NVBENCH_HAS_NVML
--- a/nvbench/internal/table_builder.cuh
+++ b/nvbench/internal/table_builder.cuh
@@ -47,18 +47,14 @@ struct table_builder
                const std::string &header,
                std::string value)
  {
-    auto iter = std::find_if(m_columns.begin(),
-                             m_columns.end(),
-                             [&column_key](const column &col) {
-                               return col.key == column_key;
-                             });
+    auto iter = std::find_if(m_columns.begin(), m_columns.end(), [&column_key](const column &col) {
+      return col.key == column_key;
+    });

    auto &col = iter == m_columns.end()
-                ? m_columns.emplace_back(column{column_key,
-                                                header,
-                                                std::vector<std::string>{},
-                                                header.size()})
-                : *iter;
+                  ? m_columns.emplace_back(
+                      column{column_key, header, std::vector<std::string>{}, header.size()})
+                  : *iter;

    col.max_width = std::max(col.max_width, value.size());
    if (col.rows.size() <= row)
@@ -76,11 +72,9 @@ struct table_builder
      std::size_t{},
      [](const auto &a, const auto &b) { return a > b ? a : b; },
      [](const column &col) { return col.rows.size(); });
-    std::for_each(m_columns.begin(),
-                  m_columns.end(),
-                  [num_rows = m_num_rows](column &col) {
-                    col.rows.resize(num_rows);
-                  });
+    std::for_each(m_columns.begin(), m_columns.end(), [num_rows = m_num_rows](column &col) {
+      col.rows.resize(num_rows);
+    });
  }
 };

--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -126,11 +126,10 @@ std::string json_printer::version_t::get_string() const
  return fmt::format("{}.{}.{}", this->major, this->minor, this->patch);
 }

-void json_printer::do_process_bulk_data_float64(
-  state &state,
-  const std::string &tag,
-  const std::string &hint,
-  const std::vector<nvbench::float64_t> &data)
+void json_printer::do_process_bulk_data_float64(state &state,
+                                                const std::string &tag,
+                                                const std::string &hint,
+                                                const std::vector<nvbench::float64_t> &data)
 {
  printer_base::do_process_bulk_data_float64(state, tag, hint, data);

@@ -157,16 +156,12 @@ void json_printer::do_process_bulk_data_float64(
      {
        if (!fs::create_directory(result_path))
        {
-          NVBENCH_THROW(std::runtime_error,
-                        "{}",
-                        "Failed to create result directory '{}'.");
+          NVBENCH_THROW(std::runtime_error, "{}", "Failed to create result directory '{}'.");
        }
      }
      else if (!fs::is_directory(result_path))
      {
-        NVBENCH_THROW(std::runtime_error,
-                      "{}",
-                      "'{}' exists and is not a directory.");
+        NVBENCH_THROW(std::runtime_error, "{}", "'{}' exists and is not a directory.");
      }

      const auto file_id = m_num_jsonbin_files++;
@@ -197,16 +192,12 @@ void json_printer::do_process_bulk_data_float64(
    }
    catch (std::exception &e)
    {
-      if (auto printer_opt_ref = state.get_benchmark().get_printer();
-          printer_opt_ref.has_value())
+      if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
      {
        auto &printer = printer_opt_ref.value().get();
-        printer.log(nvbench::log_level::warn,
-                    fmt::format("Error writing {} ({}) to {}: {}",
-                                tag,
-                                hint,
-                                result_path.string(),
-                                e.what()));
+        printer.log(
+          nvbench::log_level::warn,
+          fmt::format("Error writing {} ({}) to {}: {}", tag, hint, result_path.string(), e.what()));
      }
    } // end catch

@@ -221,14 +212,12 @@ void json_printer::do_process_bulk_data_float64(
    summ.set_string("hide", "Not needed in table.");

    timer.stop();
-    if (auto printer_opt_ref = state.get_benchmark().get_printer();
-        printer_opt_ref.has_value())
+    if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
    {
      auto &printer = printer_opt_ref.value().get();
-      printer.log(nvbench::log_level::info,
-                  fmt::format("Wrote '{}' in {:>6.3f}ms",
-                              result_path.string(),
-                              timer.get_duration() * 1000));
+      printer.log(
+        nvbench::log_level::info,
+        fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000));
    }
  } // end hint == sample_times
 }
@@ -304,15 +293,12 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
      device["global_memory_size"]    = dev_info.get_global_memory_size();
      device["global_memory_bus_peak_clock_rate"] =
        dev_info.get_global_memory_bus_peak_clock_rate();
-      device["global_memory_bus_width"] =
-        dev_info.get_global_memory_bus_width();
-      device["global_memory_bus_bandwidth"] =
-        dev_info.get_global_memory_bus_bandwidth();
-      device["l2_cache_size"]        = dev_info.get_l2_cache_size();
-      device["shared_memory_per_sm"] = dev_info.get_shared_memory_per_sm();
-      device["shared_memory_per_block"] =
-        dev_info.get_shared_memory_per_block();
-      device["ecc_state"] = dev_info.get_ecc_state();
+      device["global_memory_bus_width"]     = dev_info.get_global_memory_bus_width();
+      device["global_memory_bus_bandwidth"] = dev_info.get_global_memory_bus_bandwidth();
+      device["l2_cache_size"]               = dev_info.get_l2_cache_size();
+      device["shared_memory_per_sm"]        = dev_info.get_shared_memory_per_sm();
+      device["shared_memory_per_block"]     = dev_info.get_shared_memory_per_block();
+      device["ecc_state"]                   = dev_info.get_ecc_state();
    }
  } // "devices"

@@ -358,23 +344,19 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
          switch (axis_ptr->get_type())
          {
            case nvbench::axis_type::type:
-              value["is_active"] =
-                static_cast<type_axis &>(*axis_ptr).get_is_active(i);
+              value["is_active"] = static_cast<type_axis &>(*axis_ptr).get_is_active(i);
              break;

            case nvbench::axis_type::int64:
-              value["value"] =
-                static_cast<int64_axis &>(*axis_ptr).get_value(i);
+              value["value"] = static_cast<int64_axis &>(*axis_ptr).get_value(i);
              break;

            case nvbench::axis_type::float64:
-              value["value"] =
-                static_cast<float64_axis &>(*axis_ptr).get_value(i);
+              value["value"] = static_cast<float64_axis &>(*axis_ptr).get_value(i);
              break;

            case nvbench::axis_type::string:
-              value["value"] =
-                static_cast<string_axis &>(*axis_ptr).get_value(i);
+              value["value"] = static_cast<string_axis &>(*axis_ptr).get_value(i);
              break;
            default:
              break;
--- a/nvbench/json_printer.cuh
+++ b/nvbench/json_printer.cuh
@@ -38,9 +38,7 @@ struct json_printer : nvbench::printer_base
 {
  using printer_base::printer_base;

-  json_printer(std::ostream &stream,
-               std::string stream_name,
-               bool enable_binary_output)
+  json_printer(std::ostream &stream, std::string stream_name, bool enable_binary_output)
      : printer_base(stream, std::move(stream_name))
      , m_enable_binary_output{enable_binary_output}
  {}
@@ -59,23 +57,16 @@ struct json_printer : nvbench::printer_base

  [[nodiscard]] static version_t get_json_file_version();

-  [[nodiscard]] bool get_enable_binary_output() const
-  {
-    return m_enable_binary_output;
-  }
+  [[nodiscard]] bool get_enable_binary_output() const { return m_enable_binary_output; }
  void set_enable_binary_output(bool b) { m_enable_binary_output = b; }

 protected:
  // Virtual API from printer_base:
-  void do_log_argv(const std::vector<std::string>& argv) override
-  {
-    m_argv = argv;
-  }
-  void do_process_bulk_data_float64(
-    nvbench::state &state,
-    const std::string &tag,
-    const std::string &hint,
-    const std::vector<nvbench::float64_t> &data) override;
+  void do_log_argv(const std::vector<std::string> &argv) override { m_argv = argv; }
+  void do_process_bulk_data_float64(nvbench::state &state,
+                                    const std::string &tag,
+                                    const std::string &hint,
+                                    const std::vector<nvbench::float64_t> &data) override;
  void do_print_benchmark_results(const benchmark_vector &benches) override;

  bool m_enable_binary_output{false};
--- a/nvbench/launch.cuh
+++ b/nvbench/launch.cuh
@@ -42,20 +42,17 @@ struct launch
  {}

  // move-only
-  launch(const launch &) = delete;
-  launch(launch &&)      = default;
+  launch(const launch &)            = delete;
+  launch(launch &&)                 = default;
  launch &operator=(const launch &) = delete;
-  launch &operator=(launch &&) = default;
+  launch &operator=(launch &&)      = default;

  /**
   * @return a CUDA stream that all kernels and other stream-ordered CUDA work
   * must use. This stream can be changed by the `KernelGenerator` using the
   * `nvbench::state::set_cuda_stream` method.
   */
-  __forceinline__ const nvbench::cuda_stream &get_stream() const
-  {
-    return m_stream;
-  };
+  __forceinline__ const nvbench::cuda_stream &get_stream() const { return m_stream; };

 private:
  // The stream is owned by the `nvbench::state` associated with this launch.
--- a/nvbench/main.cuh
+++ b/nvbench/main.cuh
@@ -27,23 +27,23 @@

 #include <iostream>

-#define NVBENCH_MAIN                                                           \
-  int main(int argc, char const *const *argv)                                  \
-  try                                                                          \
-  {                                                                            \
-    NVBENCH_MAIN_BODY(argc, argv);                                             \
-    NVBENCH_CUDA_CALL(cudaDeviceReset());                                      \
-    return 0;                                                                  \
-  }                                                                            \
-  catch (std::exception & e)                                                   \
-  {                                                                            \
-    std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";    \
-    return 1;                                                                  \
-  }                                                                            \
-  catch (...)                                                                  \
-  {                                                                            \
-    std::cerr << "\nNVBench encountered an unknown error.\n";                  \
-    return 1;                                                                  \
+#define NVBENCH_MAIN                                                                               \
+  int main(int argc, char const *const *argv)                                                      \
+  try                                                                                              \
+  {                                                                                                \
+    NVBENCH_MAIN_BODY(argc, argv);                                                                 \
+    NVBENCH_CUDA_CALL(cudaDeviceReset());                                                          \
+    return 0;                                                                                      \
+  }                                                                                                \
+  catch (std::exception & e)                                                                       \
+  {                                                                                                \
+    std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";                        \
+    return 1;                                                                                      \
+  }                                                                                                \
+  catch (...)                                                                                      \
+  {                                                                                                \
+    std::cerr << "\nNVBench encountered an unknown error.\n";                                      \
+    return 1;                                                                                      \
  }

 #ifdef NVBENCH_HAS_CUPTI
@@ -54,35 +54,35 @@
 // clang-format on
 #endif

-#define NVBENCH_MAIN_PARSE(argc, argv)                                         \
-  nvbench::option_parser parser;                                               \
+#define NVBENCH_MAIN_PARSE(argc, argv)                                                             \
+  nvbench::option_parser parser;                                                                   \
  parser.parse(argc, argv)

-#define NVBENCH_MAIN_BODY(argc, argv)                                          \
-  do                                                                           \
-  {                                                                            \
-    NVBENCH_INITIALIZE_DRIVER_API;                                             \
-    NVBENCH_MAIN_PARSE(argc, argv);                                            \
-    auto &printer = parser.get_printer();                                      \
-                                                                               \
-    printer.print_device_info();                                               \
-    printer.print_log_preamble();                                              \
-    auto &benchmarks = parser.get_benchmarks();                                \
-                                                                               \
-    std::size_t total_states = 0;                                              \
-    for (auto &bench_ptr : benchmarks)                                         \
-    {                                                                          \
-      total_states += bench_ptr->get_config_count();                           \
-    }                                                                          \
-    printer.set_total_state_count(total_states);                               \
-                                                                               \
-    printer.set_completed_state_count(0);                                      \
-    for (auto &bench_ptr : benchmarks)                                         \
-    {                                                                          \
-      bench_ptr->set_printer(printer);                                         \
-      bench_ptr->run();                                                        \
-      bench_ptr->clear_printer();                                              \
-    }                                                                          \
-    printer.print_log_epilogue();                                              \
-    printer.print_benchmark_results(benchmarks);                               \
+#define NVBENCH_MAIN_BODY(argc, argv)                                                              \
+  do                                                                                               \
+  {                                                                                                \
+    NVBENCH_INITIALIZE_DRIVER_API;                                                                 \
+    NVBENCH_MAIN_PARSE(argc, argv);                                                                \
+    auto &printer = parser.get_printer();                                                          \
+                                                                                                   \
+    printer.print_device_info();                                                                   \
+    printer.print_log_preamble();                                                                  \
+    auto &benchmarks = parser.get_benchmarks();                                                    \
+                                                                                                   \
+    std::size_t total_states = 0;                                                                  \
+    for (auto &bench_ptr : benchmarks)                                                             \
+    {                                                                                              \
+      total_states += bench_ptr->get_config_count();                                               \
+    }                                                                                              \
+    printer.set_total_state_count(total_states);                                                   \
+                                                                                                   \
+    printer.set_completed_state_count(0);                                                          \
+    for (auto &bench_ptr : benchmarks)                                                             \
+    {                                                                                              \
+      bench_ptr->set_printer(printer);                                                             \
+      bench_ptr->run();                                                                            \
+      bench_ptr->clear_printer();                                                                  \
+    }                                                                                              \
+    printer.print_log_epilogue();                                                                  \
+    printer.print_benchmark_results(benchmarks);                                                   \
  } while (false)
--- a/nvbench/markdown_printer.cu
+++ b/nvbench/markdown_printer.cu
@@ -44,9 +44,8 @@ void markdown_printer::do_print_device_info()
  fmt::format_to(buffer, "# Devices\n\n");

  const auto &device_mgr = nvbench::device_manager::get();
-  const auto &devices    = device_mgr.get_number_of_used_devices() > 0
-                             ? device_mgr.get_used_devices()
-                             : device_mgr.get_devices();
+  const auto &devices = device_mgr.get_number_of_used_devices() > 0 ? device_mgr.get_used_devices()
+                                                                    : device_mgr.get_devices();
  for (const auto &device : devices)
  {
    const auto [gmem_free, gmem_used] = device.get_global_memory_usage();
@@ -64,22 +63,17 @@ void markdown_printer::do_print_device_info()
                   "* Global Memory: {} MiB Free / {} MiB Total\n",
                   gmem_free / 1024 / 1024,
                   gmem_used / 1024 / 1024);
-    fmt::format_to(
-      buffer,
-      "* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
-      device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
-      device.get_global_memory_bus_width(),
-      device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
+    fmt::format_to(buffer,
+                   "* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
+                   device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
+                   device.get_global_memory_bus_width(),
+                   device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
    fmt::format_to(buffer,
                   "* Max Shared Memory: {} KiB/SM, {} KiB/Block\n",
                   device.get_shared_memory_per_sm() / 1024,
                   device.get_shared_memory_per_block() / 1024);
-    fmt::format_to(buffer,
-                   "* L2 Cache Size: {} KiB\n",
-                   device.get_l2_cache_size() / 1024);
-    fmt::format_to(buffer,
-                   "* Maximum Active Blocks: {}/SM\n",
-                   device.get_max_blocks_per_sm());
+    fmt::format_to(buffer, "* L2 Cache Size: {} KiB\n", device.get_l2_cache_size() / 1024);
+    fmt::format_to(buffer, "* Maximum Active Blocks: {}/SM\n", device.get_max_blocks_per_sm());
    fmt::format_to(buffer,
                   "* Maximum Active Threads: {}/SM, {}/Block\n",
                   device.get_max_threads_per_sm(),
@@ -88,18 +82,13 @@ void markdown_printer::do_print_device_info()
                   "* Available Registers: {}/SM, {}/Block\n",
                   device.get_registers_per_sm(),
                   device.get_registers_per_block());
-    fmt::format_to(buffer,
-                   "* ECC Enabled: {}\n",
-                   device.get_ecc_state() ? "Yes" : "No");
+    fmt::format_to(buffer, "* ECC Enabled: {}\n", device.get_ecc_state() ? "Yes" : "No");
    fmt::format_to(buffer, "\n");
  }
  m_ostream << fmt::to_string(buffer);
 }

-void markdown_printer::do_print_log_preamble()
-{
-  m_ostream << "# Log\n\n```\n";
-}
+void markdown_printer::do_print_log_preamble() { m_ostream << "# Log\n\n```\n"; }

 void markdown_printer::do_print_log_epilogue() { m_ostream << "```\n\n"; }

@@ -147,8 +136,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
 {
  if (m_total_state_count == 0)
  { // No progress info
-    this->log(nvbench::log_level::run,
-              exec_state.get_short_description(m_color));
+    this->log(nvbench::log_level::run, exec_state.get_short_description(m_color));
  }
  else
  { // Add progress
@@ -160,8 +148,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
  }
 }

-void markdown_printer::do_print_benchmark_list(
-  const printer_base::benchmark_vector &benches)
+void markdown_printer::do_print_benchmark_list(const printer_base::benchmark_vector &benches)
 {
  if (benches.empty())
  {
@@ -204,10 +191,7 @@ void markdown_printer::do_print_benchmark_list(
        {
          desc = fmt::format(" ({})", desc);
        }
-        fmt::format_to(buffer,
-                       "  * `{}`{}\n",
-                       axis_ptr->get_input_string(i),
-                       desc);
+        fmt::format_to(buffer, "  * `{}`{}\n", axis_ptr->get_input_string(i), desc);
      } // end foreach value
    }   // end foreach axis
    fmt::format_to(buffer, "\n");
@@ -216,8 +200,7 @@ void markdown_printer::do_print_benchmark_list(
  m_ostream << fmt::to_string(buffer);
 }

-void markdown_printer::do_print_benchmark_results(
-  const printer_base::benchmark_vector &benches)
+void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_vector &benches)
 {
  auto format_visitor = [](const auto &v) {
    using T = std::decay_t<decltype(v)>;
@@ -252,19 +235,15 @@ void markdown_printer::do_print_benchmark_results(
    // Do a single pass when no devices are specified. This happens for
    // benchmarks with `cpu` exec_tags.
    const std::size_t num_device_passes = devices.empty() ? 1 : devices.size();
-    for (std::size_t device_pass = 0; device_pass < num_device_passes;
-         ++device_pass)
+    for (std::size_t device_pass = 0; device_pass < num_device_passes; ++device_pass)
    {
-      std::optional<nvbench::device_info> device =
-        devices.empty() ? std::nullopt
-                        : std::make_optional(devices[device_pass]);
+      std::optional<nvbench::device_info> device = devices.empty()
+                                                     ? std::nullopt
+                                                     : std::make_optional(devices[device_pass]);

      if (device)
      {
-        fmt::format_to(buffer,
-                       "\n### [{}] {}\n\n",
-                       device->get_id(),
-                       device->get_name());
+        fmt::format_to(buffer, "\n### [{}] {}\n\n", device->get_id(), device->get_name());
      }

      std::size_t row = 0;
@@ -288,15 +267,11 @@ void markdown_printer::do_print_benchmark_results(
            {
              const nvbench::int64_t value    = axis_values.get_int64(name);
              const nvbench::int64_t exponent = int64_axis::compute_log2(value);
-              table.add_cell(row,
-                             name,
-                             name,
-                             fmt::format("2^{} = {}", exponent, value));
+              table.add_cell(row, name, name, fmt::format("2^{} = {}", exponent, value));
            }
            else
            {
-              std::string value = std::visit(format_visitor,
-                                             axis_values.get_value(name));
+              std::string value = std::visit(format_visitor, axis_values.get_value(name));
              table.add_cell(row, name + "_axis", name, std::move(value));
            }
          }
@@ -308,12 +283,9 @@ void markdown_printer::do_print_benchmark_results(
              continue;
            }
            const std::string &tag    = summ.get_tag();
-            const std::string &header = summ.has_value("name")
-                                          ? summ.get_string("name")
-                                          : tag;
+            const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;

-            std::string hint = summ.has_value("hint") ? summ.get_string("hint")
-                                                      : std::string{};
+            std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
            if (hint == "duration")
            {
              table.add_cell(row, tag, header, this->do_format_duration(summ));
@@ -332,10 +304,7 @@ void markdown_printer::do_print_benchmark_results(
            }
            else if (hint == "sample_size")
            {
-              table.add_cell(row,
-                             tag,
-                             header,
-                             this->do_format_sample_size(summ));
+              table.add_cell(row, tag, header, this->do_format_sample_size(summ));
            }
            else if (hint == "percentage")
            {
@@ -353,8 +322,7 @@ void markdown_printer::do_print_benchmark_results(
      auto table_str = table.to_string();
      fmt::format_to(buffer,
                     "{}",
-                     table_str.empty() ? "No data -- check log.\n"
-                                       : std::move(table_str));
+                     table_str.empty() ? "No data -- check log.\n" : std::move(table_str));
    } // end foreach device_pass
  }

--- a/nvbench/named_values.cuh
+++ b/nvbench/named_values.cuh
@@ -33,8 +33,7 @@ namespace nvbench
 */
 struct named_values
 {
-  using value_type =
-    std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;
+  using value_type = std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;

  enum class type
  {
@@ -43,7 +42,7 @@ struct named_values
    string
  };

-  void append(const named_values& other);
+  void append(const named_values &other);

  [[nodiscard]] std::size_t get_size() const;
  [[nodiscard]] std::vector<std::string> get_names() const;
@@ -60,11 +59,11 @@ struct named_values

  [[nodiscard]] type get_type(const std::string &name) const;
  [[nodiscard]] bool has_value(const std::string &name) const;
-  [[nodiscard]] const value_type& get_value(const std::string &name) const;
+  [[nodiscard]] const value_type &get_value(const std::string &name) const;

  void clear();

-  void remove_value(const std::string& name);
+  void remove_value(const std::string &name);

 private:
  struct named_value
--- a/nvbench/named_values.cxx
+++ b/nvbench/named_values.cxx
@@ -33,9 +33,7 @@ namespace nvbench

 void named_values::append(const named_values &other)
 {
-  m_storage.insert(m_storage.end(),
-                   other.m_storage.cbegin(),
-                   other.m_storage.cend());
+  m_storage.insert(m_storage.end(), other.m_storage.cbegin(), other.m_storage.cend());
 }

 void named_values::clear() { m_storage.clear(); }
@@ -55,20 +53,17 @@ std::vector<std::string> named_values::get_names() const

 bool named_values::has_value(const std::string &name) const
 {
-  auto iter =
-    std::find_if(m_storage.cbegin(),
-                 m_storage.cend(),
-                 [&name](const auto &val) { return val.name == name; });
+  auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
+    return val.name == name;
+  });
  return iter != m_storage.cend();
 }

-const named_values::value_type &
-named_values::get_value(const std::string &name) const
+const named_values::value_type &named_values::get_value(const std::string &name) const
 {
-  auto iter =
-    std::find_if(m_storage.cbegin(),
-                 m_storage.cend(),
-                 [&name](const auto &val) { return val.name == name; });
+  auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
+    return val.name == name;
+  });
  if (iter == m_storage.cend())
  {
    NVBENCH_THROW(std::runtime_error, "No value with name '{}'.", name);
@@ -96,9 +91,7 @@ named_values::type named_values::get_type(const std::string &name) const
      // warning C4702: unreachable code
      // This is a future-proofing check, it'll be reachable if something breaks
      NVBENCH_MSVC_PUSH_DISABLE_WARNING(4702)
-      NVBENCH_THROW(std::runtime_error,
-                    "Unknown variant type for entry '{}'.",
-                    name);
+      NVBENCH_THROW(std::runtime_error, "Unknown variant type for entry '{}'.", name);
    },
    this->get_value(name));
  NVBENCH_MSVC_POP_WARNING()
@@ -111,10 +104,7 @@ try
 }
 catch (std::exception &err)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error looking up int64 value `{}`:\n{}",
-                name,
-                err.what());
+  NVBENCH_THROW(std::runtime_error, "Error looking up int64 value `{}`:\n{}", name, err.what());
 }

 nvbench::float64_t named_values::get_float64(const std::string &name) const
@@ -124,10 +114,7 @@ try
 }
 catch (std::exception &err)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error looking up float64 value `{}`:\n{}",
-                name,
-                err.what());
+  NVBENCH_THROW(std::runtime_error, "Error looking up float64 value `{}`:\n{}", name, err.what());
 }

 const std::string &named_values::get_string(const std::string &name) const
@@ -137,10 +124,7 @@ try
 }
 catch (std::exception &err)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error looking up string value `{}`:\n{}",
-                name,
-                err.what());
+  NVBENCH_THROW(std::runtime_error, "Error looking up string value `{}`:\n{}", name, err.what());
 }

 void named_values::set_int64(std::string name, nvbench::int64_t value)
@@ -165,10 +149,9 @@ void named_values::set_value(std::string name, named_values::value_type value)

 void named_values::remove_value(const std::string &name)
 {
-  auto iter =
-    std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
-      return val.name == name;
-    });
+  auto iter = std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
+    return val.name == name;
+  });
  if (iter != m_storage.end())
  {
    m_storage.erase(iter);
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -82,20 +82,11 @@ std::string_view submatch_to_sv(const sv_submatch &in)
 //
 // So we're stuck with materializing a std::string and calling std::stoX(). Ah
 // well. At least it's not istream.
-void parse(std::string_view input, nvbench::int32_t &val)
-{
-  val = std::stoi(std::string(input));
-}
+void parse(std::string_view input, nvbench::int32_t &val) { val = std::stoi(std::string(input)); }

-void parse(std::string_view input, nvbench::int64_t &val)
-{
-  val = std::stoll(std::string(input));
-}
+void parse(std::string_view input, nvbench::int64_t &val) { val = std::stoll(std::string(input)); }

-void parse(std::string_view input, nvbench::float64_t &val)
-{
-  val = std::stod(std::string(input));
-}
+void parse(std::string_view input, nvbench::float64_t &val) { val = std::stod(std::string(input)); }

 void parse(std::string_view input, std::string &val) { val = input; }

@@ -112,9 +103,8 @@ std::vector<T> parse_list_values(std::string_view list_spec)
    "(?:,|$)"  // Delimiters
  };

-  auto values_begin =
-    sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
-  auto values_end = sv_regex_iterator{};
+  auto values_begin = sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
+  auto values_end   = sv_regex_iterator{};
  while (values_begin != values_end)
  {
    auto match          = *values_begin++;
@@ -131,8 +121,7 @@ std::vector<T> parse_list_values(std::string_view list_spec)
 // Parses a range specification "<start> : <stop> [ : <stride> ]" and returns
 // a vector filled with the specified range.
 template <typename T>
-std::vector<T> parse_range_values(std::string_view range_spec,
-                                  nvbench::wrapped_type<T>)
+std::vector<T> parse_range_values(std::string_view range_spec, nvbench::wrapped_type<T>)
 {
  std::vector<T> range_params;

@@ -143,9 +132,8 @@ std::vector<T> parse_range_values(std::string_view range_spec,
    "(?:$|:)"  // Delimiters
  };

-  auto values_begin =
-    sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
-  auto values_end = sv_regex_iterator{};
+  auto values_begin = sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
+  auto values_end   = sv_regex_iterator{};
  for (; values_begin != values_end; ++values_begin)
  {
    auto match          = *values_begin;
@@ -221,25 +209,15 @@ std::vector<T> parse_values(std::string_view value_spec)
                                       "$"};        // EOS

  sv_match match;
-  if (std::regex_search(value_spec.cbegin(),
-                        value_spec.cend(),
-                        match,
-                        list_regex))
+  if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, list_regex))
  {
    return parse_list_values<T>(submatch_to_sv(match[1]));
  }
-  else if (std::regex_search(value_spec.cbegin(),
-                             value_spec.cend(),
-                             match,
-                             range_regex))
+  else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, range_regex))
  {
-    return parse_range_values(submatch_to_sv(match[1]),
-                              nvbench::wrapped_type<T>{});
+    return parse_range_values(submatch_to_sv(match[1]), nvbench::wrapped_type<T>{});
  }
-  else if (std::regex_search(value_spec.cbegin(),
-                             value_spec.cend(),
-                             match,
-                             single_regex))
+  else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, single_regex))
  {
    T val;
    parse(submatch_to_sv(match[1]), val);
@@ -247,9 +225,7 @@ std::vector<T> parse_values(std::string_view value_spec)
  }
  else
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid axis value spec: {}",
-                  value_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid axis value spec: {}", value_spec);
  }
 }

@@ -514,8 +490,8 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
      this->update_int64_prop(first[0], first[1]);
      first += 2;
    }
-    else if (arg == "--min-time" || arg == "--max-noise" ||
-             arg == "--skip-time" || arg == "--timeout")
+    else if (arg == "--min-time" || arg == "--max-noise" || arg == "--skip-time" ||
+             arg == "--timeout")
    {
      check_params(1);
      this->update_float64_prop(first[0], first[1]);
@@ -523,9 +499,7 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
    }
    else
    {
-      NVBENCH_THROW(std::runtime_error,
-                    "Unrecognized command-line argument: `{}`.",
-                    arg);
+      NVBENCH_THROW(std::runtime_error, "Unrecognized command-line argument: `{}`.", arg);
    }
  }
 }
@@ -534,7 +508,7 @@ void option_parser::add_markdown_printer(const std::string &spec)
 try
 {
  std::ostream &stream = this->printer_spec_to_ostream(spec);
-  auto &printer = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
+  auto &printer        = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
  if (spec == "stdout")
  {
    printer.set_color(m_color_md_stdout_printer);
@@ -556,14 +530,10 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error while adding csv output for `{}`:\n{}",
-                spec,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error while adding csv output for `{}`:\n{}", spec, e.what());
 }

-void option_parser::add_json_printer(const std::string &spec,
-                                     bool enable_binary)
+void option_parser::add_json_printer(const std::string &spec, bool enable_binary)
 try
 {
  std::ostream &stream = this->printer_spec_to_ostream(spec);
@@ -624,10 +594,7 @@ void option_parser::print_help() const
  fmt::print("{}\n{}\n", ::cli_help_text, ::cli_help_axis_text);
 }

-void option_parser::print_help_axis() const
-{
-  fmt::print("{}\n", ::cli_help_axis_text);
-}
+void option_parser::print_help_axis() const { fmt::print("{}\n", ::cli_help_axis_text); }

 void option_parser::set_persistence_mode(const std::string &state)
 try
@@ -685,9 +652,7 @@ try
  {
    if (rate_val == nvbench::device_info::clock_rate::none)
    {
-      fmt::print("Unlocking clocks for device '{}' ({}).\n",
-                 device.get_name(),
-                 device.get_id());
+      fmt::print("Unlocking clocks for device '{}' ({}).\n", device.get_name(), device.get_id());
    }
    else
    {
@@ -757,16 +722,12 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error handling option --benchmark `{}`:\n{}",
-                name,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error handling option --benchmark `{}`:\n{}", name, e.what());
 }

 void option_parser::replay_global_args()
 {
-  this->parse_range(m_global_benchmark_args.cbegin(),
-                    m_global_benchmark_args.cend());
+  this->parse_range(m_global_benchmark_args.cbegin(), m_global_benchmark_args.cend());
 }

 void option_parser::update_devices(const std::string &devices)
@@ -790,10 +751,7 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error handling option --devices `{}`:\n{}",
-                devices,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error handling option --devices `{}`:\n{}", devices, e.what());
 }

 void option_parser::update_axis(const std::string &spec)
@@ -832,28 +790,20 @@ try
  switch (axis.get_type())
  {
    case axis_type::type:
-      this->update_type_axis(static_cast<nvbench::type_axis &>(axis),
-                             values,
-                             flags);
+      this->update_type_axis(static_cast<nvbench::type_axis &>(axis), values, flags);
      break;

    case axis_type::int64:
-      this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis),
-                              values,
-                              flags);
+      this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis), values, flags);
      break;

    case axis_type::float64:
-      this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis),
-                                values,
-                                flags);
+      this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis), values, flags);

      break;

    case axis_type::string:
-      this->update_string_axis(static_cast<nvbench::string_axis &>(axis),
-                               values,
-                               flags);
+      this->update_string_axis(static_cast<nvbench::string_axis &>(axis), values, flags);

      break;

@@ -866,10 +816,7 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error handling option --axis `{}`:\n{}",
-                spec,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error handling option --axis `{}`:\n{}", spec, e.what());
 }

 void option_parser::update_int64_axis(int64_axis &axis,
@@ -888,9 +835,7 @@ void option_parser::update_int64_axis(int64_axis &axis,
  }
  else
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for int64 axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for int64 axis: `{}`", flag_spec);
  }

  auto input_values = parse_values<nvbench::int64_t>(value_spec);
@@ -905,9 +850,7 @@ void option_parser::update_float64_axis(float64_axis &axis,
  // Validate flags:
  if (!flag_spec.empty())
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for float64 axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for float64 axis: `{}`", flag_spec);
  }

  auto input_values = parse_values<nvbench::float64_t>(value_spec);
@@ -922,9 +865,7 @@ void option_parser::update_string_axis(string_axis &axis,
  // Validate flags:
  if (!flag_spec.empty())
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for string axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for string axis: `{}`", flag_spec);
  }

  auto input_values = parse_values<std::string>(value_spec);
@@ -939,9 +880,7 @@ void option_parser::update_type_axis(type_axis &axis,
  // Validate flags:
  if (!flag_spec.empty())
  {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for type axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for type axis: `{}`", flag_spec);
  }

  auto input_values = parse_values<std::string>(value_spec);
@@ -949,8 +888,7 @@ void option_parser::update_type_axis(type_axis &axis,
  axis.set_active_inputs(input_values);
 }

-void option_parser::update_int64_prop(const std::string &prop_arg,
-                                      const std::string &prop_val)
+void option_parser::update_int64_prop(const std::string &prop_arg, const std::string &prop_val)
 try
 {
  // If no active benchmark, save args as global.
@@ -983,8 +921,7 @@ catch (std::exception &e)
                e.what());
 }

-void option_parser::update_float64_prop(const std::string &prop_arg,
-                                        const std::string &prop_val)
+void option_parser::update_float64_prop(const std::string &prop_arg, const std::string &prop_val)
 try
 {
  // If no active benchmark, save args as global.
--- a/nvbench/option_parser.cuh
+++ b/nvbench/option_parser.cuh
@@ -41,8 +41,7 @@ struct type_axis;
 */
 struct option_parser
 {
-  using benchmark_vector =
-    std::vector<std::unique_ptr<nvbench::benchmark_base>>;
+  using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;

  option_parser();
  ~option_parser();
@@ -51,15 +50,9 @@ struct option_parser
  void parse(std::vector<std::string> args);

  [[nodiscard]] benchmark_vector &get_benchmarks() { return m_benchmarks; };
-  [[nodiscard]] const benchmark_vector &get_benchmarks() const
-  {
-    return m_benchmarks;
-  };
+  [[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };

-  [[nodiscard]] const std::vector<std::string> &get_args() const
-  {
-    return m_args;
-  }
+  [[nodiscard]] const std::vector<std::string> &get_args() const { return m_args; }

  /*!
   * Returns the output format requested by the parse options.
@@ -115,10 +108,8 @@ private:
                               std::string_view value_spec,
                               std::string_view flag_spec);

-  void update_int64_prop(const std::string &prop_arg,
-                         const std::string &prop_val);
-  void update_float64_prop(const std::string &prop_arg,
-                           const std::string &prop_val);
+  void update_int64_prop(const std::string &prop_arg, const std::string &prop_val);
+  void update_float64_prop(const std::string &prop_arg, const std::string &prop_val);

  void update_used_device_state() const;

--- a/nvbench/printer_base.cuh
+++ b/nvbench/printer_base.cuh
@@ -76,19 +76,16 @@ struct printer_base
  virtual ~printer_base();

  // move-only
-  printer_base(const printer_base &) = delete;
-  printer_base(printer_base &&)      = default;
+  printer_base(const printer_base &)            = delete;
+  printer_base(printer_base &&)                 = default;
  printer_base &operator=(const printer_base &) = delete;
-  printer_base &operator=(printer_base &&) = default;
+  printer_base &operator=(printer_base &&)      = default;

  /*!
   * Called once with the command line arguments used to invoke the current
   * executable.
   */
-  void log_argv(const std::vector<std::string> &argv)
-  {
-    this->do_log_argv(argv);
-  }
+  void log_argv(const std::vector<std::string> &argv) { this->do_log_argv(argv); }

  /*!
   * Print a summary of all detected devices, if supported.
@@ -108,19 +105,13 @@ struct printer_base
  /*!
   * Print a log message at the specified log level.
   */
-  void log(nvbench::log_level level, const std::string &msg)
-  {
-    this->do_log(level, msg);
-  }
+  void log(nvbench::log_level level, const std::string &msg) { this->do_log(level, msg); }

  /*!
   * Called before running the measurements associated with state.
   * Implementations are expected to call `log(log_level::run, ...)`.
   */
-  void log_run_state(const nvbench::state &exec_state)
-  {
-    this->do_log_run_state(exec_state);
-  }
+  void log_run_state(const nvbench::state &exec_state) { this->do_log_run_state(exec_state); }

  /*!
   * Measurements may call this to allow a printer to perform extra processing
@@ -181,10 +172,7 @@ struct printer_base
    return this->do_get_completed_state_count();
  }

-  virtual void set_total_state_count(std::size_t states)
-  {
-    this->do_set_total_state_count(states);
-  }
+  virtual void set_total_state_count(std::size_t states) { this->do_set_total_state_count(states); }
  [[nodiscard]] virtual std::size_t get_total_state_count() const
  {
    return this->do_get_total_state_count();
@@ -193,17 +181,16 @@ struct printer_base

 protected:
  // Implementation hooks for subclasses:
-  virtual void do_log_argv(const std::vector<std::string>&) {}
+  virtual void do_log_argv(const std::vector<std::string> &) {}
  virtual void do_print_device_info() {}
  virtual void do_print_log_preamble() {}
  virtual void do_print_log_epilogue() {}
  virtual void do_log(nvbench::log_level, const std::string &) {}
  virtual void do_log_run_state(const nvbench::state &) {}
-  virtual void
-  do_process_bulk_data_float64(nvbench::state &,
-                               const std::string &,
-                               const std::string &,
-                               const std::vector<nvbench::float64_t> &){};
+  virtual void do_process_bulk_data_float64(nvbench::state &,
+                                            const std::string &,
+                                            const std::string &,
+                                            const std::vector<nvbench::float64_t> &){};
  virtual void do_print_benchmark_list(const benchmark_vector &) {}
  virtual void do_print_benchmark_results(const benchmark_vector &) {}

--- a/nvbench/printer_base.cxx
+++ b/nvbench/printer_base.cxx
@@ -38,19 +38,10 @@ void printer_base::do_set_completed_state_count(std::size_t states)

 void printer_base::do_add_completed_state() { ++m_completed_state_count; }

-std::size_t printer_base::do_get_completed_state_count() const
-{
-  return m_completed_state_count;
-}
+std::size_t printer_base::do_get_completed_state_count() const { return m_completed_state_count; }

-void printer_base::do_set_total_state_count(std::size_t states)
-{
-  m_total_state_count = states;
-}
+void printer_base::do_set_total_state_count(std::size_t states) { m_total_state_count = states; }

-std::size_t printer_base::do_get_total_state_count() const
-{
-  return m_total_state_count;
-}
+std::size_t printer_base::do_get_total_state_count() const { return m_total_state_count; }

 } // namespace nvbench
--- a/nvbench/printer_multiplex.cuh
+++ b/nvbench/printer_multiplex.cuh
@@ -40,10 +40,7 @@ struct printer_multiplex : nvbench::printer_base
    return static_cast<Format &>(*m_printers.back());
  }

-  [[nodiscard]] std::size_t get_printer_count() const
-  {
-    return m_printers.size();
-  }
+  [[nodiscard]] std::size_t get_printer_count() const { return m_printers.size(); }

 protected:
  void do_log_argv(const std::vector<std::string> &argv) override;
@@ -52,11 +49,10 @@ protected:
  void do_print_log_epilogue() override;
  void do_log(nvbench::log_level, const std::string &) override;
  void do_log_run_state(const nvbench::state &) override;
-  void do_process_bulk_data_float64(
-    nvbench::state &,
-    const std::string &,
-    const std::string &,
-    const std::vector<nvbench::float64_t> &) override;
+  void do_process_bulk_data_float64(nvbench::state &,
+                                    const std::string &,
+                                    const std::string &,
+                                    const std::vector<nvbench::float64_t> &) override;
  void do_print_benchmark_list(const benchmark_vector &benches) override;
  void do_print_benchmark_results(const benchmark_vector &benches) override;
  void do_set_completed_state_count(std::size_t states) override;
--- a/nvbench/printer_multiplex.cxx
+++ b/nvbench/printer_multiplex.cxx
@@ -67,11 +67,10 @@ void printer_multiplex::do_log_run_state(const nvbench::state &exec_state)
  }
 }

-void printer_multiplex::do_process_bulk_data_float64(
-  state &state,
-  const std::string &tag,
-  const std::string &hint,
-  const std::vector<nvbench::float64_t> &data)
+void printer_multiplex::do_process_bulk_data_float64(state &state,
+                                                     const std::string &tag,
+                                                     const std::string &hint,
+                                                     const std::vector<nvbench::float64_t> &data)
 {
  for (auto &format_ptr : m_printers)
  {
@@ -87,8 +86,7 @@ void printer_multiplex::do_print_benchmark_list(const benchmark_vector &benches)
  }
 }

-void printer_multiplex::do_print_benchmark_results(
-  const benchmark_vector &benches)
+void printer_multiplex::do_print_benchmark_results(const benchmark_vector &benches)
 {
  for (auto &format_ptr : m_printers)
  {
--- a/nvbench/range.cuh
+++ b/nvbench/range.cuh
@@ -29,13 +29,11 @@ namespace nvbench
 namespace detail
 {
 template <typename T>
-using range_output_t = std::conditional_t<std::is_floating_point_v<T>,
-                                          nvbench::float64_t,
-                                          nvbench::int64_t>;
+using range_output_t =
+  std::conditional_t<std::is_floating_point_v<T>, nvbench::float64_t, nvbench::int64_t>;
 }

-template <typename InT,
-          typename OutT = nvbench::detail::range_output_t<InT>>
+template <typename InT, typename OutT = nvbench::detail::range_output_t<InT>>
 auto range(InT start, InT end, InT stride = InT{1})
 {
  if constexpr (std::is_floating_point_v<InT>)
--- a/nvbench/runner.cuh
+++ b/nvbench/runner.cuh
@@ -37,8 +37,7 @@ struct runner_base

  void generate_states();

-  void handle_sampling_exception(const std::exception &e,
-                                 nvbench::state &exec_state) const;
+  void handle_sampling_exception(const std::exception &e, nvbench::state &exec_state) const;

  void run_state_prologue(state &exec_state) const;
  void run_state_epilogue(state &exec_state) const;
@@ -51,11 +50,10 @@ struct runner_base
 template <typename BenchmarkType>
 struct runner : public runner_base
 {
-  using benchmark_type   = BenchmarkType;
-  using kernel_generator = typename benchmark_type::kernel_generator;
-  using type_configs     = typename benchmark_type::type_configs;
-  static constexpr std::size_t num_type_configs =
-    benchmark_type::num_type_configs;
+  using benchmark_type                          = BenchmarkType;
+  using kernel_generator                        = typename benchmark_type::kernel_generator;
+  using type_configs                            = typename benchmark_type::type_configs;
+  static constexpr std::size_t num_type_configs = benchmark_type::num_type_configs;

  explicit runner(benchmark_type &bench)
      : runner_base{bench}
@@ -86,38 +84,37 @@ private:

    // Iterate through type_configs:
    std::size_t type_config_index = 0;
-    nvbench::tl::foreach<type_configs>([&self   = *this,
-                                        &states = m_benchmark.m_states,
-                                        &type_config_index,
-                                        &device](auto type_config_wrapper) {
-      // Get current type_config:
-      using type_config = typename decltype(type_config_wrapper)::type;
+    nvbench::tl::foreach<type_configs>(
+      [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device](
+        auto type_config_wrapper) {
+        // Get current type_config:
+        using type_config = typename decltype(type_config_wrapper)::type;

-      // Find states with the current device / type_config
-      for (nvbench::state &cur_state : states)
-      {
-        if (cur_state.get_device() == device &&
-            cur_state.get_type_config_index() == type_config_index)
+        // Find states with the current device / type_config
+        for (nvbench::state &cur_state : states)
        {
-          self.run_state_prologue(cur_state);
-          try
+          if (cur_state.get_device() == device &&
+              cur_state.get_type_config_index() == type_config_index)
          {
-            kernel_generator{}(cur_state, type_config{});
-            if (cur_state.is_skipped())
+            self.run_state_prologue(cur_state);
+            try
            {
-              self.print_skip_notification(cur_state);
+              kernel_generator{}(cur_state, type_config{});
+              if (cur_state.is_skipped())
+              {
+                self.print_skip_notification(cur_state);
+              }
            }
+            catch (std::exception &e)
+            {
+              self.handle_sampling_exception(e, cur_state);
+            }
+            self.run_state_epilogue(cur_state);
          }
-          catch (std::exception &e)
-          {
-            self.handle_sampling_exception(e, cur_state);
-          }
-          self.run_state_epilogue(cur_state);
        }
-      }

-      ++type_config_index;
-    });
+        ++type_config_index;
+      });
  }
 };

--- a/nvbench/runner.cxx
+++ b/nvbench/runner.cxx
@@ -35,8 +35,7 @@ void runner_base::generate_states()
  m_benchmark.m_states = nvbench::detail::state_generator::create(m_benchmark);
 }

-void runner_base::handle_sampling_exception(const std::exception &e,
-                                            state &exec_state) const
+void runner_base::handle_sampling_exception(const std::exception &e, state &exec_state) const
 {
  // If the state is skipped, that means the execution framework class handled
  // the error already.
@@ -62,8 +61,7 @@ void runner_base::handle_sampling_exception(const std::exception &e,
 void runner_base::run_state_prologue(nvbench::state &exec_state) const
 {
  // Log if a printer exists:
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
  {
    auto &printer = printer_opt_ref.value().get();
    printer.log_run_state(exec_state);
@@ -73,19 +71,16 @@ void runner_base::run_state_prologue(nvbench::state &exec_state) const
 void runner_base::run_state_epilogue(state &exec_state) const
 {
  // Notify the printer that the state has completed::
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
  {
    auto &printer = printer_opt_ref.value().get();
    printer.add_completed_state();
  }
 }

-
 void runner_base::print_skip_notification(state &exec_state) const
 {
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
  {
    auto &printer = printer_opt_ref.value().get();
    printer.log(nvbench::log_level::skip, exec_state.get_skip_reason());
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -58,106 +58,68 @@ struct state_tester;
 struct state
 {
  // move-only
-  state(const state &) = delete;
-  state(state &&)      = default;
+  state(const state &)            = delete;
+  state(state &&)                 = default;
  state &operator=(const state &) = delete;
-  state &operator=(state &&) = default;
+  state &operator=(state &&)      = default;

-  [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
-  {
-    return m_cuda_stream;
-  }
-  void set_cuda_stream(nvbench::cuda_stream &&stream)
-  {
-    m_cuda_stream = std::move(stream);
-  }
+  [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; }
+  void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }

  /// The CUDA device associated with with this benchmark state. May be
  /// nullopt for CPU-only benchmarks.
-  [[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
-  {
-    return m_device;
-  }
+  [[nodiscard]] const std::optional<nvbench::device_info> &get_device() const { return m_device; }

  /// An index into a benchmark::type_configs type_list. Returns 0 if no type
  /// axes in the associated benchmark.
-  [[nodiscard]] std::size_t get_type_config_index() const
-  {
-    return m_type_config_index;
-  }
+  [[nodiscard]] std::size_t get_type_config_index() const { return m_type_config_index; }

  [[nodiscard]] nvbench::int64_t get_int64(const std::string &axis_name) const;
-  [[nodiscard]] nvbench::int64_t
-  get_int64_or_default(const std::string &axis_name,
-                       nvbench::int64_t default_value) const;
+  [[nodiscard]] nvbench::int64_t get_int64_or_default(const std::string &axis_name,
+                                                      nvbench::int64_t default_value) const;

-  [[nodiscard]] nvbench::float64_t
-  get_float64(const std::string &axis_name) const;
-  [[nodiscard]] nvbench::float64_t
-  get_float64_or_default(const std::string &axis_name,
-                         nvbench::float64_t default_value) const;
+  [[nodiscard]] nvbench::float64_t get_float64(const std::string &axis_name) const;
+  [[nodiscard]] nvbench::float64_t get_float64_or_default(const std::string &axis_name,
+                                                          nvbench::float64_t default_value) const;

-  [[nodiscard]] const std::string &
-  get_string(const std::string &axis_name) const;
-  [[nodiscard]] const std::string &
-  get_string_or_default(const std::string &axis_name,
-                        const std::string &default_value) const;
+  [[nodiscard]] const std::string &get_string(const std::string &axis_name) const;
+  [[nodiscard]] const std::string &get_string_or_default(const std::string &axis_name,
+                                                         const std::string &default_value) const;

  void add_element_count(std::size_t elements, std::string column_name = {});

  void set_element_count(std::size_t elements) { m_element_count = elements; }
-  [[nodiscard]] std::size_t get_element_count() const
-  {
-    return m_element_count;
-  }
+  [[nodiscard]] std::size_t get_element_count() const { return m_element_count; }

  template <typename ElementType>
  void add_global_memory_reads(std::size_t count, std::string column_name = {})
  {
-    this->add_global_memory_reads(count * sizeof(ElementType),
-                                  std::move(column_name));
+    this->add_global_memory_reads(count * sizeof(ElementType), std::move(column_name));
  }
  void add_global_memory_reads(std::size_t bytes, std::string column_name = {});

  template <typename ElementType>
  void add_global_memory_writes(std::size_t count, std::string column_name = {})
  {
-    this->add_global_memory_writes(count * sizeof(ElementType),
-                                   std::move(column_name));
+    this->add_global_memory_writes(count * sizeof(ElementType), std::move(column_name));
  }
-  void add_global_memory_writes(std::size_t bytes,
-                                std::string column_name = {});
+  void add_global_memory_writes(std::size_t bytes, std::string column_name = {});

  void add_buffer_size(std::size_t num_bytes,
                       std::string summary_tag,
                       std::string column_name = {},
                       std::string description = {});

-  void set_global_memory_rw_bytes(std::size_t bytes)
-  {
-    m_global_memory_rw_bytes = bytes;
-  }
-  [[nodiscard]] std::size_t get_global_memory_rw_bytes() const
-  {
-    return m_global_memory_rw_bytes;
-  }
+  void set_global_memory_rw_bytes(std::size_t bytes) { m_global_memory_rw_bytes = bytes; }
+  [[nodiscard]] std::size_t get_global_memory_rw_bytes() const { return m_global_memory_rw_bytes; }

  void skip(std::string reason) { m_skip_reason = std::move(reason); }
  [[nodiscard]] bool is_skipped() const { return !m_skip_reason.empty(); }
-  [[nodiscard]] const std::string &get_skip_reason() const
-  {
-    return m_skip_reason;
-  }
+  [[nodiscard]] const std::string &get_skip_reason() const { return m_skip_reason; }

  /// Execute at least this many trials per measurement. @{
-  [[nodiscard]] nvbench::int64_t get_min_samples() const
-  {
-    return m_min_samples;
-  }
-  void set_min_samples(nvbench::int64_t min_samples)
-  {
-    m_min_samples = min_samples;
-  }
+  [[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
+  void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
  /// @}

  /// If true, the benchmark is only run once, skipping all warmup runs and only
@@ -222,20 +184,14 @@ struct state
  }
  ///@}

-  [[nodiscard]] const named_values &get_axis_values() const
-  {
-    return m_axis_values;
-  }
+  [[nodiscard]] const named_values &get_axis_values() const { return m_axis_values; }

  /*!
   * Return a string of "axis_name1=input_string1 axis_name2=input_string2 ..."
   */
  [[nodiscard]] std::string get_axis_values_as_string(bool color = false) const;

-  [[nodiscard]] const benchmark_base &get_benchmark() const
-  {
-    return m_benchmark;
-  }
+  [[nodiscard]] const benchmark_base &get_benchmark() const { return m_benchmark; }

  void collect_l1_hit_rates() { m_collect_l1_hit_rates = true; }
  void collect_l2_hit_rates() { m_collect_l2_hit_rates = true; }
@@ -252,26 +208,11 @@ struct state
    collect_dram_throughput();
  }

-  [[nodiscard]] bool is_l1_hit_rate_collected() const
-  {
-    return m_collect_l1_hit_rates;
-  }
-  [[nodiscard]] bool is_l2_hit_rate_collected() const
-  {
-    return m_collect_l2_hit_rates;
-  }
-  [[nodiscard]] bool is_stores_efficiency_collected() const
-  {
-    return m_collect_stores_efficiency;
-  }
-  [[nodiscard]] bool is_loads_efficiency_collected() const
-  {
-    return m_collect_loads_efficiency;
-  }
-  [[nodiscard]] bool is_dram_throughput_collected() const
-  {
-    return m_collect_dram_throughput;
-  }
+  [[nodiscard]] bool is_l1_hit_rate_collected() const { return m_collect_l1_hit_rates; }
+  [[nodiscard]] bool is_l2_hit_rate_collected() const { return m_collect_l2_hit_rates; }
+  [[nodiscard]] bool is_stores_efficiency_collected() const { return m_collect_stores_efficiency; }
+  [[nodiscard]] bool is_loads_efficiency_collected() const { return m_collect_loads_efficiency; }
+  [[nodiscard]] bool is_dram_throughput_collected() const { return m_collect_dram_throughput; }

  [[nodiscard]] bool is_cupti_required() const
  {
@@ -306,8 +247,7 @@ struct state
  template <typename KernelLauncher>
  void exec(KernelLauncher &&kernel_launcher)
  {
-    this->exec(nvbench::exec_tag::none,
-               std::forward<KernelLauncher>(kernel_launcher));
+    this->exec(nvbench::exec_tag::none, std::forward<KernelLauncher>(kernel_launcher));
  }

 private:
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -65,9 +65,8 @@ nvbench::int64_t state::get_int64(const std::string &axis_name) const
  return m_axis_values.get_int64(axis_name);
 }

-nvbench::int64_t
-state::get_int64_or_default(const std::string &axis_name,
-                            nvbench::int64_t default_value) const
+nvbench::int64_t state::get_int64_or_default(const std::string &axis_name,
+                                             nvbench::int64_t default_value) const
 try
 {
  return this->get_int64(axis_name);
@@ -82,9 +81,8 @@ nvbench::float64_t state::get_float64(const std::string &axis_name) const
  return m_axis_values.get_float64(axis_name);
 }

-nvbench::float64_t
-state::get_float64_or_default(const std::string &axis_name,
-                              nvbench::float64_t default_value) const
+nvbench::float64_t state::get_float64_or_default(const std::string &axis_name,
+                                                 nvbench::float64_t default_value) const
 try
 {
  return this->get_float64(axis_name);
@@ -99,9 +97,8 @@ const std::string &state::get_string(const std::string &axis_name) const
  return m_axis_values.get_string(axis_name);
 }

-const std::string &
-state::get_string_or_default(const std::string &axis_name,
-                             const std::string &default_value) const
+const std::string &state::get_string_or_default(const std::string &axis_name,
+                                                const std::string &default_value) const
 try
 {
  return this->get_string(axis_name);
@@ -125,20 +122,18 @@ summary &state::add_summary(summary s)
 const summary &state::get_summary(std::string_view tag) const
 {
  // Check tags first
-  auto iter =
-    std::find_if(m_summaries.cbegin(),
-                 m_summaries.cend(),
-                 [&tag](const auto &s) { return s.get_tag() == tag; });
+  auto iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
+    return s.get_tag() == tag;
+  });
  if (iter != m_summaries.cend())
  {
    return *iter;
  }

  // Then names:
-  iter =
-    std::find_if(m_summaries.cbegin(),
-                 m_summaries.cend(),
-                 [&tag](const auto &s) { return s.get_string("name") == tag; });
+  iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
+    return s.get_string("name") == tag;
+  });
  if (iter != m_summaries.cend())
  {
    return *iter;
@@ -150,20 +145,18 @@ const summary &state::get_summary(std::string_view tag) const
 summary &state::get_summary(std::string_view tag)
 {
  // Check tags first
-  auto iter =
-    std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
-      return s.get_tag() == tag;
-    });
+  auto iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
+    return s.get_tag() == tag;
+  });
  if (iter != m_summaries.end())
  {
    return *iter;
  }

  // Then names:
-  iter =
-    std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
-      return s.get_string("name") == tag;
-    });
+  iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
+    return s.get_string("name") == tag;
+  });
  if (iter != m_summaries.end())
  {
    return *iter;
@@ -187,18 +180,17 @@ std::string state::get_axis_values_as_string(bool color) const
  // Create a Key=Value list of all parameters:
  fmt::memory_buffer buffer;

-  auto append_key_value = [&buffer, &style](const std::string &key,
-                                            const auto &value,
-                                            std::string value_fmtstr = "{}") {
-    constexpr auto key_format   = fmt::emphasis::italic;
-    constexpr auto value_format = fmt::emphasis::bold;
+  auto append_key_value =
+    [&buffer, &style](const std::string &key, const auto &value, std::string value_fmtstr = "{}") {
+      constexpr auto key_format   = fmt::emphasis::italic;
+      constexpr auto value_format = fmt::emphasis::bold;

-    fmt::format_to(buffer,
-                   "{}{}={}",
-                   buffer.size() == 0 ? "" : " ",
-                   fmt::format(style(key_format), "{}", key),
-                   fmt::format(style(value_format), value_fmtstr, value));
-  };
+      fmt::format_to(buffer,
+                     "{}{}={}",
+                     buffer.size() == 0 ? "" : " ",
+                     fmt::format(style(key_format), "{}", key),
+                     fmt::format(style(value_format), value_fmtstr, value));
+    };

  if (m_device)
  {
@@ -211,8 +203,7 @@ std::string state::get_axis_values_as_string(bool color) const
    const auto axis_type = m_axis_values.get_type(name);

    // Handle power-of-two int64 axes differently:
-    if (axis_type == named_values::type::int64 &&
-        axes.get_int64_axis(name).is_power_of_two())
+    if (axis_type == named_values::type::int64 && axes.get_int64_axis(name).is_power_of_two())
    {
      const nvbench::int64_t value    = m_axis_values.get_int64(name);
      const nvbench::int64_t exponent = int64_axis::compute_log2(value);
@@ -242,10 +233,9 @@ std::string state::get_short_description(bool color) const
    return color ? fmt_style : no_style;
  };

-  return fmt::format(
-    "{} [{}]",
-    fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
-    this->get_axis_values_as_string(color));
+  return fmt::format("{} [{}]",
+                     fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
+                     this->get_axis_values_as_string(color));
 }

 void state::add_element_count(std::size_t elements, std::string column_name)
--- a/nvbench/string_axis.cuh
+++ b/nvbench/string_axis.cuh
@@ -36,25 +36,13 @@ struct string_axis final : public axis_base

  ~string_axis() final;

-  void set_inputs(std::vector<std::string> inputs)
-  {
-    m_values = std::move(inputs);
-  }
-  [[nodiscard]] const std::string &get_value(std::size_t i) const
-  {
-    return m_values[i];
-  }
+  void set_inputs(std::vector<std::string> inputs) { m_values = std::move(inputs); }
+  [[nodiscard]] const std::string &get_value(std::size_t i) const { return m_values[i]; }

 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<string_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const { return std::make_unique<string_axis>(*this); }
  std::size_t do_get_size() const final { return m_values.size(); }
-  std::string do_get_input_string(std::size_t i) const final
-  {
-    return m_values[i];
-  }
+  std::string do_get_input_string(std::size_t i) const final { return m_values[i]; }
  std::string do_get_description(std::size_t) const final { return {}; }

  std::vector<std::string> m_values;
--- a/nvbench/summary.cuh
+++ b/nvbench/summary.cuh
@@ -92,10 +92,10 @@ struct summary : public nvbench::named_values
  {}

  // move-only
-  summary(const summary &) = delete;
-  summary(summary &&)      = default;
+  summary(const summary &)            = delete;
+  summary(summary &&)                 = default;
  summary &operator=(const summary &) = delete;
-  summary &operator=(summary &&) = default;
+  summary &operator=(summary &&)      = default;

  void set_tag(std::string tag) { m_tag = std::move(tag); }
  [[nodiscard]] const std::string &get_tag() const { return m_tag; }
--- a/nvbench/test_kernels.cuh
+++ b/nvbench/test_kernels.cuh
@@ -38,8 +38,8 @@ namespace nvbench
 __global__ void sleep_kernel(double seconds)
 {
  const auto start = cuda::std::chrono::high_resolution_clock::now();
-  const auto ns    = cuda::std::chrono::nanoseconds(
-    static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
+  const auto ns =
+    cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
  const auto finish = start + ns;

  auto now = cuda::std::chrono::high_resolution_clock::now();
@@ -53,7 +53,7 @@ __global__ void sleep_kernel(double seconds)
 * Naive copy of `n` values from `in` -> `out`.
 */
 template <typename T, typename U>
-__global__ void copy_kernel(const T* in, U* out, std::size_t n)
+__global__ void copy_kernel(const T *in, U *out, std::size_t n)
 {
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;
@@ -68,7 +68,7 @@ __global__ void copy_kernel(const T* in, U* out, std::size_t n)
 * For `i <- [0,n)`, `out[i] = in[i] % 2`.
 */
 template <typename T, typename U>
-__global__ void mod2_kernel(const T* in, U* out, std::size_t n)
+__global__ void mod2_kernel(const T *in, U *out, std::size_t n)
 {
  const auto init = blockIdx.x * blockDim.x + threadIdx.x;
  const auto step = blockDim.x * gridDim.x;
@@ -79,4 +79,4 @@ __global__ void mod2_kernel(const T* in, U* out, std::size_t n)
  }
 }

-}
+} // namespace nvbench
--- a/nvbench/type_axis.cuh
+++ b/nvbench/type_axis.cuh
@@ -43,7 +43,7 @@ struct type_axis final : public axis_base
  template <typename TypeList>
  void set_inputs();

-  void set_active_inputs(const std::vector<std::string>& inputs);
+  void set_active_inputs(const std::vector<std::string> &inputs);

  [[nodiscard]] bool get_is_active(const std::string &input) const;
  [[nodiscard]] bool get_is_active(std::size_t index) const;
@@ -57,23 +57,13 @@ struct type_axis final : public axis_base
  /**
   * The index in this axis of the type with the specified `input_string`.
   */
-  [[nodiscard]] std::size_t
-  get_type_index(const std::string &input_string) const;
+  [[nodiscard]] std::size_t get_type_index(const std::string &input_string) const;

 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<type_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const { return std::make_unique<type_axis>(*this); }
  std::size_t do_get_size() const final { return m_input_strings.size(); }
-  std::string do_get_input_string(std::size_t i) const final
-  {
-    return m_input_strings[i];
-  }
-  std::string do_get_description(std::size_t i) const final
-  {
-    return m_descriptions[i];
-  }
+  std::string do_get_input_string(std::size_t i) const final { return m_input_strings[i]; }
+  std::string do_get_description(std::size_t i) const final { return m_descriptions[i]; }

  std::vector<std::string> m_input_strings;
  std::vector<std::string> m_descriptions;
--- a/nvbench/type_axis.cxx
+++ b/nvbench/type_axis.cxx
@@ -35,10 +35,10 @@ void type_axis::set_active_inputs(const std::vector<std::string> &inputs)
 {
  m_mask.clear();
  m_mask.resize(m_input_strings.size(), false);
-  for (const auto& input : inputs)
+  for (const auto &input : inputs)
  {
    const auto idx = this->get_type_index(input);
-    m_mask[idx] = true;
+    m_mask[idx]    = true;
  }
 }

@@ -47,21 +47,16 @@ bool type_axis::get_is_active(const std::string &input) const
  return this->get_is_active(this->get_type_index(input));
 }

-bool type_axis::get_is_active(std::size_t idx) const
-{
-  return m_mask.at(idx);
-}
+bool type_axis::get_is_active(std::size_t idx) const { return m_mask.at(idx); }

 std::size_t type_axis::get_active_count() const
 {
-  return static_cast<std::size_t>(
-    std::count(m_mask.cbegin(), m_mask.cend(), true));
+  return static_cast<std::size_t>(std::count(m_mask.cbegin(), m_mask.cend(), true));
 }

 std::size_t type_axis::get_type_index(const std::string &input_string) const
 {
-  auto it =
-    std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
+  auto it = std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
  if (it == m_input_strings.end())
  {
    NVBENCH_THROW(std::runtime_error,
--- a/nvbench/type_strings.cuh
+++ b/nvbench/type_strings.cuh
@@ -30,17 +30,17 @@ namespace nvbench
 std::string demangle(const std::string &str);

 template <typename T>
-std::string demangle() { return demangle(typeid(T).name()); }
+std::string demangle()
+{
+  return demangle(typeid(T).name());
+}

 template <typename T>
 struct type_strings
 {
  // The string used to identify the type in shorthand (e.g. output tables and
  // CLI options):
-  static std::string input_string()
-  {
-    return nvbench::demangle<T>();
-  }
+  static std::string input_string() { return nvbench::demangle<T>(); }

  // A more descriptive identifier for the type, if input_string is not a common
  // identifier. May be blank if `input_string` is obvious.
@@ -56,10 +56,7 @@ struct type_strings<std::integral_constant<T, Value>>

  // A more descriptive identifier for the type, if input_string is not a common
  // identifier. May be blank if `input_string` is obvious.
-  static std::string description()
-  {
-    return nvbench::demangle<std::integral_constant<T, Value>>();
-  }
+  static std::string description() { return nvbench::demangle<std::integral_constant<T, Value>>(); }
 };

 } // namespace nvbench
@@ -67,15 +64,15 @@ struct type_strings<std::integral_constant<T, Value>>
 /*!
 * Declare an `input_string` and `description` to use with a specific `type`.
 */
-#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description)           \
-  namespace nvbench                                                            \
-  {                                                                            \
-  template <>                                                                  \
-  struct type_strings<Type>                                                    \
-  {                                                                            \
-    static std::string input_string() { return {InputString}; }                \
-    static std::string description() { return {Description}; }                 \
-  };                                                                           \
+#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description)                               \
+  namespace nvbench                                                                                \
+  {                                                                                                \
+  template <>                                                                                      \
+  struct type_strings<Type>                                                                        \
+  {                                                                                                \
+    static std::string input_string() { return {InputString}; }                                    \
+    static std::string description() { return {Description}; }                                     \
+  };                                                                                               \
  }

 NVBENCH_DECLARE_TYPE_STRINGS(nvbench::int8_t, "I8", "int8_t");