From 8f1152d4a22287a35be2dde596e3cf86ace8054a Mon Sep 17 00:00:00 2001
From: clang-format <>
Date: Thu, 18 Aug 2022 22:09:24 +0400
Subject: [PATCH] Auto format

---
 nvbench/axes_metadata.cuh                     |  28 ++-
 nvbench/axes_metadata.cxx                     |  33 ++--
 nvbench/axis_base.cuh                         |   5 +-
 nvbench/axis_base.cxx                         |   5 +-
 nvbench/benchmark.cuh                         |   8 +-
 nvbench/benchmark_base.cuh                    |  48 ++----
 nvbench/benchmark_base.cxx                    |   3 +-
 nvbench/benchmark_manager.cuh                 |  19 +--
 nvbench/benchmark_manager.cxx                 |  19 +--
 nvbench/blocking_kernel.cu                    |  90 +++++-----
 nvbench/blocking_kernel.cuh                   |   6 +-
 nvbench/callable.cuh                          |  42 ++---
 nvbench/cpu_timer.cuh                         |  19 +--
 nvbench/create.cuh                            |  24 ++-
 nvbench/csv_printer.cu                        |  29 +---
 nvbench/cuda_call.cu                          |   8 +-
 nvbench/cuda_call.cuh                         |  59 +++----
 nvbench/cuda_stream.cuh                       |   4 +-
 nvbench/cuda_timer.cuh                        |   6 +-
 nvbench/cupti_profiler.cuh                    |  11 +-
 nvbench/cupti_profiler.cxx                    | 159 +++++++-----------
 nvbench/detail/device_scope.cuh               |   6 +-
 .../detail/kernel_launcher_timer_wrapper.cuh  |   2 +-
 nvbench/detail/l2flush.cuh                    |   7 +-
 nvbench/detail/measure_cold.cu                |  66 +++-----
 nvbench/detail/measure_cold.cuh               |  11 +-
 nvbench/detail/measure_cupti.cu               |  50 ++----
 nvbench/detail/measure_cupti.cuh              |  11 +-
 nvbench/detail/measure_hot.cu                 |  16 +-
 nvbench/detail/measure_hot.cuh                |  11 +-
 nvbench/detail/ring_buffer.cuh                |  10 +-
 nvbench/detail/state_exec.cuh                 |  28 ++-
 nvbench/detail/state_generator.cxx            |  85 ++++------
 nvbench/detail/statistics.cuh                 |   3 +-
 nvbench/detail/throw.cuh                      |  22 ++-
 nvbench/detail/transform_reduce.cuh           |   5 +-
 nvbench/detail/type_list_impl.cuh             |  32 ++--
 nvbench/device_info.cu                        |  36 ++--
 nvbench/device_info.cuh                       |  81 ++-------
 nvbench/device_manager.cuh                    |  25 +--
 nvbench/enum_type_list.cuh                    |  25 ++-
 nvbench/exec_tag.cuh                          |  17 +-
 nvbench/flags.cuh                             |  40 ++---
 nvbench/float64_axis.cuh                      |  15 +-
 nvbench/git_revision.cuh                      |  36 ++--
 nvbench/int64_axis.cuh                        |  23 +--
 nvbench/int64_axis.cxx                        |   5 +-
 nvbench/internal/markdown_table.cuh           |  12 +-
 nvbench/internal/nvml.cuh                     |  41 ++---
 nvbench/internal/table_builder.cuh            |  24 +--
 nvbench/json_printer.cu                       |  66 +++-----
 nvbench/json_printer.cuh                      |  23 +--
 nvbench/launch.cuh                            |  11 +-
 nvbench/main.cuh                              |  92 +++++-----
 nvbench/markdown_printer.cu                   |  84 +++------
 nvbench/named_values.cuh                      |   9 +-
 nvbench/named_values.cxx                      |  47 ++----
 nvbench/option_parser.cu                      | 133 ++++-----------
 nvbench/option_parser.cuh                     |  19 +--
 nvbench/printer_base.cuh                      |  37 ++--
 nvbench/printer_base.cxx                      |  15 +-
 nvbench/printer_multiplex.cuh                 |  14 +-
 nvbench/printer_multiplex.cxx                 |  12 +-
 nvbench/range.cuh                             |   8 +-
 nvbench/runner.cuh                            |  59 +++----
 nvbench/runner.cxx                            |  13 +-
 nvbench/state.cuh                             | 124 ++++----------
 nvbench/state.cxx                             |  74 ++++----
 nvbench/string_axis.cuh                       |  20 +--
 nvbench/summary.cuh                           |   6 +-
 nvbench/test_kernels.cuh                      |  10 +-
 nvbench/type_axis.cuh                         |  20 +--
 nvbench/type_axis.cxx                         |  15 +-
 nvbench/type_strings.cuh                      |  33 ++--
 74 files changed, 835 insertions(+), 1479 deletions(-)

diff --git a/nvbench/axes_metadata.cuh b/nvbench/axes_metadata.cuh
index 353855a..2663191 100644
--- a/nvbench/axes_metadata.cuh
+++ b/nvbench/axes_metadata.cuh
@@ -41,8 +41,8 @@ struct axes_metadata
   template <typename... TypeAxes>
   explicit axes_metadata(nvbench::type_list<TypeAxes...>);
 
-  axes_metadata()                 = default;
-  axes_metadata(axes_metadata &&) = default;
+  axes_metadata()                            = default;
+  axes_metadata(axes_metadata &&)            = default;
   axes_metadata &operator=(axes_metadata &&) = default;
 
   axes_metadata(const axes_metadata &);
@@ -58,20 +58,16 @@ struct axes_metadata
 
   void add_string_axis(std::string name, std::vector<std::string> data);
 
-  [[nodiscard]] const nvbench::int64_axis &
-  get_int64_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::int64_axis &get_int64_axis(std::string_view name) const;
   [[nodiscard]] nvbench::int64_axis &get_int64_axis(std::string_view name);
 
-  [[nodiscard]] const nvbench::float64_axis &
-  get_float64_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::float64_axis &get_float64_axis(std::string_view name) const;
   [[nodiscard]] nvbench::float64_axis &get_float64_axis(std::string_view name);
 
-  [[nodiscard]] const nvbench::string_axis &
-  get_string_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::string_axis &get_string_axis(std::string_view name) const;
   [[nodiscard]] nvbench::string_axis &get_string_axis(std::string_view name);
 
-  [[nodiscard]] const nvbench::type_axis &
-  get_type_axis(std::string_view name) const;
+  [[nodiscard]] const nvbench::type_axis &get_type_axis(std::string_view name) const;
   [[nodiscard]] nvbench::type_axis &get_type_axis(std::string_view name);
 
   [[nodiscard]] const nvbench::type_axis &get_type_axis(std::size_t index) const;
@@ -83,10 +79,9 @@ struct axes_metadata
   [[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name) const;
   [[nodiscard]] nvbench::axis_base &get_axis(std::string_view name);
 
-  [[nodiscard]] const nvbench::axis_base &
-  get_axis(std::string_view name, nvbench::axis_type type) const;
-  [[nodiscard]] nvbench::axis_base &get_axis(std::string_view name,
-                                             nvbench::axis_type type);
+  [[nodiscard]] const nvbench::axis_base &get_axis(std::string_view name,
+                                                   nvbench::axis_type type) const;
+  [[nodiscard]] nvbench::axis_base &get_axis(std::string_view name, nvbench::axis_type type);
 
   [[nodiscard]] static std::vector<std::string>
   generate_default_type_axis_names(std::size_t num_type_axes);
@@ -101,7 +96,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
 {
   using type_axes_list         = nvbench::type_list<TypeAxes...>;
   constexpr auto num_type_axes = nvbench::tl::size<type_axes_list>::value;
-  auto names = axes_metadata::generate_default_type_axis_names(num_type_axes);
+  auto names                   = axes_metadata::generate_default_type_axis_names(num_type_axes);
 
   auto names_iter = names.begin(); // contents will be moved from
   nvbench::tl::foreach<type_axes_list>(
@@ -114,8 +109,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
       // The word "type" appears 6 times in the next line.
       // Every. Single. Token.
       typedef typename decltype(wrapped_type)::type type_list;
-      auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++),
-                                                       type_axis_index);
+      auto axis = std::make_unique<nvbench::type_axis>(std::move(*names_iter++), type_axis_index);
       axis->template set_inputs<type_list>();
       axes.push_back(std::move(axis));
     });
diff --git a/nvbench/axes_metadata.cxx b/nvbench/axes_metadata.cxx
index 044bc91..ef51a96 100644
--- a/nvbench/axes_metadata.cxx
+++ b/nvbench/axes_metadata.cxx
@@ -64,9 +64,7 @@ try
     auto &axis = *m_axes[i];
     if (axis.get_type() != nvbench::axis_type::type)
     {
-      NVBENCH_THROW(std::runtime_error,
-                    "Number of names exceeds number of type axes ({})",
-                    i);
+      NVBENCH_THROW(std::runtime_error, "Number of names exceeds number of type axes ({})", i);
     }
 
     axis.set_name(std::move(names[i]));
@@ -81,8 +79,7 @@ catch (std::exception &e)
                 names);
 }
 
-void axes_metadata::add_float64_axis(std::string name,
-                                     std::vector<nvbench::float64_t> data)
+void axes_metadata::add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
 {
   auto axis = std::make_unique<nvbench::float64_axis>(std::move(name));
   axis->set_inputs(std::move(data));
@@ -98,8 +95,7 @@ void axes_metadata::add_int64_axis(std::string name,
   m_axes.push_back(std::move(axis));
 }
 
-void axes_metadata::add_string_axis(std::string name,
-                                    std::vector<std::string> data)
+void axes_metadata::add_string_axis(std::string name, std::vector<std::string> data)
 {
   auto axis = std::make_unique<nvbench::string_axis>(std::move(name));
   axis->set_inputs(std::move(data));
@@ -188,10 +184,9 @@ nvbench::type_axis &axes_metadata::get_type_axis(std::size_t index)
 
 const axis_base &axes_metadata::get_axis(std::string_view name) const
 {
-  auto iter =
-    std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
-      return axis->get_name() == name;
-    });
+  auto iter = std::find_if(m_axes.cbegin(), m_axes.cend(), [&name](const auto &axis) {
+    return axis->get_name() == name;
+  });
 
   if (iter == m_axes.cend())
   {
@@ -203,10 +198,9 @@ const axis_base &axes_metadata::get_axis(std::string_view name) const
 
 axis_base &axes_metadata::get_axis(std::string_view name)
 {
-  auto iter =
-    std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
-      return axis->get_name() == name;
-    });
+  auto iter = std::find_if(m_axes.begin(), m_axes.end(), [&name](const auto &axis) {
+    return axis->get_name() == name;
+  });
 
   if (iter == m_axes.end())
   {
@@ -216,8 +210,7 @@ axis_base &axes_metadata::get_axis(std::string_view name)
   return **iter;
 }
 
-const axis_base &axes_metadata::get_axis(std::string_view name,
-                                         nvbench::axis_type type) const
+const axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type) const
 {
   const auto &axis = this->get_axis(name);
   if (axis.get_type() != type)
@@ -231,8 +224,7 @@ const axis_base &axes_metadata::get_axis(std::string_view name,
   return axis;
 }
 
-axis_base &axes_metadata::get_axis(std::string_view name,
-                                   nvbench::axis_type type)
+axis_base &axes_metadata::get_axis(std::string_view name, nvbench::axis_type type)
 {
   auto &axis = this->get_axis(name);
   if (axis.get_type() != type)
@@ -246,8 +238,7 @@ axis_base &axes_metadata::get_axis(std::string_view name,
   return axis;
 }
 
-std::vector<std::string>
-axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
+std::vector<std::string> axes_metadata::generate_default_type_axis_names(std::size_t num_type_axes)
 {
   switch (num_type_axes)
   {
diff --git a/nvbench/axis_base.cuh b/nvbench/axis_base.cuh
index 712172f..85d92c7 100644
--- a/nvbench/axis_base.cuh
+++ b/nvbench/axis_base.cuh
@@ -47,10 +47,7 @@ struct axis_base
 
   [[nodiscard]] axis_type get_type() const { return m_type; }
 
-  [[nodiscard]] std::string_view get_type_as_string() const
-  {
-    return axis_type_to_string(m_type);
-  }
+  [[nodiscard]] std::string_view get_type_as_string() const { return axis_type_to_string(m_type); }
 
   [[nodiscard]] std::string_view get_flags_as_string() const
   {
diff --git a/nvbench/axis_base.cxx b/nvbench/axis_base.cxx
index 6d0bd4d..166f1ba 100644
--- a/nvbench/axis_base.cxx
+++ b/nvbench/axis_base.cxx
@@ -23,9 +23,6 @@ namespace nvbench
 
 axis_base::~axis_base() = default;
 
-std::unique_ptr<axis_base> axis_base::clone() const
-{
-  return this->do_clone();
-}
+std::unique_ptr<axis_base> axis_base::clone() const { return this->do_clone(); }
 
 } // namespace nvbench
diff --git a/nvbench/benchmark.cuh b/nvbench/benchmark.cuh
index 5e050d1..a226070 100644
--- a/nvbench/benchmark.cuh
+++ b/nvbench/benchmark.cuh
@@ -57,18 +57,14 @@ struct benchmark final : public benchmark_base
   using type_axes        = TypeAxes;
   using type_configs     = nvbench::tl::cartesian_product<type_axes>;
 
-  static constexpr std::size_t num_type_configs =
-    nvbench::tl::size<type_configs>{};
+  static constexpr std::size_t num_type_configs = nvbench::tl::size<type_configs>{};
 
   benchmark()
       : benchmark_base(type_axes{})
   {}
 
 private:
-  std::unique_ptr<benchmark_base> do_clone() const final
-  {
-    return std::make_unique<benchmark>();
-  }
+  std::unique_ptr<benchmark_base> do_clone() const final { return std::make_unique<benchmark>(); }
 
   void do_set_type_axes_names(std::vector<std::string> names) final
   {
diff --git a/nvbench/benchmark_base.cuh b/nvbench/benchmark_base.cuh
index 3a16408..723afcd 100644
--- a/nvbench/benchmark_base.cuh
+++ b/nvbench/benchmark_base.cuh
@@ -80,32 +80,28 @@ struct benchmark_base
     return *this;
   }
 
-  benchmark_base &add_float64_axis(std::string name,
-                                   std::vector<nvbench::float64_t> data)
+  benchmark_base &add_float64_axis(std::string name, std::vector<nvbench::float64_t> data)
   {
     m_axes.add_float64_axis(std::move(name), std::move(data));
     return *this;
   }
 
-  benchmark_base &add_int64_axis(
-    std::string name,
-    std::vector<nvbench::int64_t> data,
-    nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
+  benchmark_base &add_int64_axis(std::string name,
+                                 std::vector<nvbench::int64_t> data,
+                                 nvbench::int64_axis_flags flags = nvbench::int64_axis_flags::none)
   {
     m_axes.add_int64_axis(std::move(name), std::move(data), flags);
     return *this;
   }
 
-  benchmark_base &add_int64_power_of_two_axis(std::string name,
-                                              std::vector<nvbench::int64_t> data)
+  benchmark_base &add_int64_power_of_two_axis(std::string name, std::vector<nvbench::int64_t> data)
   {
     return this->add_int64_axis(std::move(name),
                                 std::move(data),
                                 nvbench::int64_axis_flags::power_of_two);
   }
 
-  benchmark_base &add_string_axis(std::string name,
-                                  std::vector<std::string> data)
+  benchmark_base &add_string_axis(std::string name, std::vector<std::string> data)
   {
     m_axes.add_string_axis(std::move(name), std::move(data));
     return *this;
@@ -133,48 +129,30 @@ struct benchmark_base
     return *this;
   }
 
-  [[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const
-  {
-    return m_devices;
-  }
+  [[nodiscard]] const std::vector<nvbench::device_info> &get_devices() const { return m_devices; }
 
   [[nodiscard]] nvbench::axes_metadata &get_axes() { return m_axes; }
 
-  [[nodiscard]] const nvbench::axes_metadata &get_axes() const
-  {
-    return m_axes;
-  }
+  [[nodiscard]] const nvbench::axes_metadata &get_axes() const { return m_axes; }
 
   // Computes the number of configs in the benchmark.
   // Unlike get_states().size(), this method may be used prior to calling run().
   [[nodiscard]] std::size_t get_config_count() const;
 
   // Is empty until run() is called.
-  [[nodiscard]] const std::vector<nvbench::state> &get_states() const
-  {
-    return m_states;
-  }
+  [[nodiscard]] const std::vector<nvbench::state> &get_states() const { return m_states; }
   [[nodiscard]] std::vector<nvbench::state> &get_states() { return m_states; }
 
   void run() { this->do_run(); }
 
-  void set_printer(nvbench::printer_base &printer)
-  {
-    m_printer = std::ref(printer);
-  }
+  void set_printer(nvbench::printer_base &printer) { m_printer = std::ref(printer); }
 
   void clear_printer() { m_printer = std::nullopt; }
 
-  [[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const
-  {
-    return m_printer;
-  }
+  [[nodiscard]] optional_ref<nvbench::printer_base> get_printer() const { return m_printer; }
 
   /// Execute at least this many trials per measurement. @{
-  [[nodiscard]] nvbench::int64_t get_min_samples() const
-  {
-    return m_min_samples;
-  }
+  [[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
   benchmark_base &set_min_samples(nvbench::int64_t min_samples)
   {
     m_min_samples = min_samples;
@@ -193,7 +171,7 @@ struct benchmark_base
   }
   /// @}
 
-  /// If true, the benchmark does not use the blocking_kernel. This is intended 
+  /// If true, the benchmark does not use the blocking_kernel. This is intended
   /// for use with external profiling tools. @{
   [[nodiscard]] bool get_disable_blocking_kernel() const { return m_disable_blocking_kernel; }
   benchmark_base &set_disable_blocking_kernel(bool v)
diff --git a/nvbench/benchmark_base.cxx b/nvbench/benchmark_base.cxx
index 2d08fdb..5121a52 100644
--- a/nvbench/benchmark_base.cxx
+++ b/nvbench/benchmark_base.cxx
@@ -68,8 +68,7 @@ std::size_t benchmark_base::get_config_count() const
     std::size_t{1},
     std::multiplies<>{},
     [](const auto &axis_ptr) {
-      if (const auto *type_axis_ptr =
-            dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
+      if (const auto *type_axis_ptr = dynamic_cast<const nvbench::type_axis *>(axis_ptr.get());
           type_axis_ptr != nullptr)
       {
         return type_axis_ptr->get_active_count();
diff --git a/nvbench/benchmark_manager.cuh b/nvbench/benchmark_manager.cuh
index 39b1717..7316445 100644
--- a/nvbench/benchmark_manager.cuh
+++ b/nvbench/benchmark_manager.cuh
@@ -31,8 +31,7 @@ namespace nvbench
  */
 struct benchmark_manager
 {
-  using benchmark_vector =
-    std::vector<std::unique_ptr<nvbench::benchmark_base>>;
+  using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;
 
   /**
    * @return The singleton benchmark_manager instance.
@@ -53,25 +52,21 @@ struct benchmark_manager
    * Get a non-mutable reference to benchmark with the specified name/index.
    * @{
    */
-  [[nodiscard]] const benchmark_base &
-  get_benchmark(const std::string &name) const;
+  [[nodiscard]] const benchmark_base &get_benchmark(const std::string &name) const;
   [[nodiscard]] const benchmark_base &get_benchmark(std::size_t idx) const
   {
     return *m_benchmarks.at(idx);
   }
   /**@}*/
 
-  [[nodiscard]] const benchmark_vector &get_benchmarks() const
-  {
-    return m_benchmarks;
-  };
+  [[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };
 
 private:
-  benchmark_manager()                          = default;
-  benchmark_manager(const benchmark_manager &) = delete;
-  benchmark_manager(benchmark_manager &&)      = delete;
+  benchmark_manager()                                     = default;
+  benchmark_manager(const benchmark_manager &)            = delete;
+  benchmark_manager(benchmark_manager &&)                 = delete;
   benchmark_manager &operator=(const benchmark_manager &) = delete;
-  benchmark_manager &operator=(benchmark_manager &&) = delete;
+  benchmark_manager &operator=(benchmark_manager &&)      = delete;
 
   benchmark_vector m_benchmarks;
 };
diff --git a/nvbench/benchmark_manager.cxx b/nvbench/benchmark_manager.cxx
index 2a0ca60..cd84f61 100644
--- a/nvbench/benchmark_manager.cxx
+++ b/nvbench/benchmark_manager.cxx
@@ -43,21 +43,18 @@ benchmark_base &benchmark_manager::add(std::unique_ptr<benchmark_base> bench)
 benchmark_manager::benchmark_vector benchmark_manager::clone_benchmarks() const
 {
   benchmark_vector result(m_benchmarks.size());
-  std::transform(m_benchmarks.cbegin(),
-                 m_benchmarks.cend(),
-                 result.begin(),
-                 [](const auto &bench) { return bench->clone(); });
+  std::transform(m_benchmarks.cbegin(), m_benchmarks.cend(), result.begin(), [](const auto &bench) {
+    return bench->clone();
+  });
   return result;
 }
 
-const benchmark_base &
-benchmark_manager::get_benchmark(const std::string &name) const
+const benchmark_base &benchmark_manager::get_benchmark(const std::string &name) const
 {
-  auto iter = std::find_if(m_benchmarks.cbegin(),
-                           m_benchmarks.cend(),
-                           [&name](const auto &bench_ptr) {
-                             return bench_ptr->get_name() == name;
-                           });
+  auto iter =
+    std::find_if(m_benchmarks.cbegin(), m_benchmarks.cend(), [&name](const auto &bench_ptr) {
+      return bench_ptr->get_name() == name;
+    });
   if (iter == m_benchmarks.cend())
   {
     NVBENCH_THROW(std::out_of_range, "No benchmark named '{}'.", name);
diff --git a/nvbench/blocking_kernel.cu b/nvbench/blocking_kernel.cu
index 1ee5855..f347833 100644
--- a/nvbench/blocking_kernel.cu
+++ b/nvbench/blocking_kernel.cu
@@ -42,8 +42,8 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
                              nvbench::float64_t timeout)
 {
   const auto start_point = cuda::std::chrono::high_resolution_clock::now();
-  const auto timeout_ns  = cuda::std::chrono::nanoseconds(
-    static_cast<nvbench::int64_t>(timeout * 1e9));
+  const auto timeout_ns =
+    cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(timeout * 1e9));
   const auto timeout_point = start_point + timeout_ns;
 
   const bool use_timeout = timeout >= 0.;
@@ -57,41 +57,40 @@ __global__ void block_stream(const volatile nvbench::int32_t *flag,
   {
     *timeout_flag = 1;
     __threadfence_system(); // Ensure timeout flag visibility on host.
-    printf(
-      "\n"
-      "######################################################################\n"
-      "##################### Possible Deadlock Detected #####################\n"
-      "######################################################################\n"
-      "\n"
-      "Forcing unblock: The current measurement appears to have deadlocked\n"
-      "and the results cannot be trusted.\n"
-      "\n"
-      "This happens when the KernelLauncher synchronizes the CUDA device.\n"
-      "If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
-      "\n"
-      "    state.exec(<KernelLauncher>); // Deadlock\n"
-      "    state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
-      "\n"
-      "This tells NVBench about the sync so it can run the benchmark safely.\n"
-      "\n"
-      "If the KernelLauncher does not synchronize but has a very long \n"
-      "execution time, this may be a false positive. If so, disable this\n"
-      "check with:\n"
-      "\n"
-      "    state.set_blocking_kernel_timeout(-1);\n"
-      "\n"
-      "The current timeout is set to %0.5g seconds.\n"
-      "\n"
-      "For more information, see the 'Benchmarks that sync' section of the\n"
-      "NVBench documentation.\n"
-      "\n"
-      "If this happens while profiling with an external tool,\n"
-      "pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
-      "(to also only run the benchmark once) to the executable.\n"
-      "\n"
-      "For more information, see the 'Benchmark Properties' section of the\n"
-      "NVBench documentation.\n\n",
-      timeout);
+    printf("\n"
+           "######################################################################\n"
+           "##################### Possible Deadlock Detected #####################\n"
+           "######################################################################\n"
+           "\n"
+           "Forcing unblock: The current measurement appears to have deadlocked\n"
+           "and the results cannot be trusted.\n"
+           "\n"
+           "This happens when the KernelLauncher synchronizes the CUDA device.\n"
+           "If this is the case, pass the `sync` exec_tag to the `exec` call:\n"
+           "\n"
+           "    state.exec(<KernelLauncher>); // Deadlock\n"
+           "    state.exec(nvbench::exec_tag::sync, <KernelLauncher>); // Safe\n"
+           "\n"
+           "This tells NVBench about the sync so it can run the benchmark safely.\n"
+           "\n"
+           "If the KernelLauncher does not synchronize but has a very long \n"
+           "execution time, this may be a false positive. If so, disable this\n"
+           "check with:\n"
+           "\n"
+           "    state.set_blocking_kernel_timeout(-1);\n"
+           "\n"
+           "The current timeout is set to %0.5g seconds.\n"
+           "\n"
+           "For more information, see the 'Benchmarks that sync' section of the\n"
+           "NVBench documentation.\n"
+           "\n"
+           "If this happens while profiling with an external tool,\n"
+           "pass the `--disable-blocking-kernel` flag or the `--profile` flag\n"
+           "(to also only run the benchmark once) to the executable.\n"
+           "\n"
+           "For more information, see the 'Benchmark Properties' section of the\n"
+           "NVBench documentation.\n\n",
+           timeout);
   }
 }
 
@@ -102,15 +101,11 @@ namespace nvbench
 
 blocking_kernel::blocking_kernel()
 {
-  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag,
-                                     sizeof(m_host_flag),
-                                     cudaHostRegisterMapped));
+  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_flag, sizeof(m_host_flag), cudaHostRegisterMapped));
   NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_flag, &m_host_flag, 0));
-  NVBENCH_CUDA_CALL(cudaHostRegister(&m_host_timeout_flag,
-                                     sizeof(m_host_timeout_flag),
-                                     cudaHostRegisterMapped));
   NVBENCH_CUDA_CALL(
-    cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
+    cudaHostRegister(&m_host_timeout_flag, sizeof(m_host_timeout_flag), cudaHostRegisterMapped));
+  NVBENCH_CUDA_CALL(cudaHostGetDevicePointer(&m_device_timeout_flag, &m_host_timeout_flag, 0));
 }
 
 blocking_kernel::~blocking_kernel()
@@ -119,14 +114,11 @@ blocking_kernel::~blocking_kernel()
   NVBENCH_CUDA_CALL_NOEXCEPT(cudaHostUnregister(&m_host_timeout_flag));
 }
 
-void blocking_kernel::block(const nvbench::cuda_stream &stream,
-                            nvbench::float64_t timeout)
+void blocking_kernel::block(const nvbench::cuda_stream &stream, nvbench::float64_t timeout)
 {
   m_host_flag         = 0;
   m_host_timeout_flag = 0;
-  block_stream<<<1, 1, 0, stream>>>(m_device_flag,
-                                    m_device_timeout_flag,
-                                    timeout);
+  block_stream<<<1, 1, 0, stream>>>(m_device_flag, m_device_timeout_flag, timeout);
 }
 
 void blocking_kernel::timeout_detected()
diff --git a/nvbench/blocking_kernel.cuh b/nvbench/blocking_kernel.cuh
index ecbfed8..13f737e 100644
--- a/nvbench/blocking_kernel.cuh
+++ b/nvbench/blocking_kernel.cuh
@@ -97,10 +97,10 @@ struct blocking_kernel
   }
 
   // move-only
-  blocking_kernel(const blocking_kernel &) = delete;
-  blocking_kernel(blocking_kernel &&)      = default;
+  blocking_kernel(const blocking_kernel &)            = delete;
+  blocking_kernel(blocking_kernel &&)                 = default;
   blocking_kernel &operator=(const blocking_kernel &) = delete;
-  blocking_kernel &operator=(blocking_kernel &&) = default;
+  blocking_kernel &operator=(blocking_kernel &&)      = default;
 
 private:
   nvbench::int32_t m_host_flag{};
diff --git a/nvbench/callable.cuh b/nvbench/callable.cuh
index ce7fff0..2cd1f15 100644
--- a/nvbench/callable.cuh
+++ b/nvbench/callable.cuh
@@ -30,35 +30,29 @@ struct state;
 // Define a simple callable wrapper around a function. This allows the function
 // to be used as a class template parameter. Intended for use with kernel
 // generators and `NVBENCH_BENCH` macros.
-#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function)                               \
+#define NVBENCH_DEFINE_UNIQUE_CALLABLE(function)                                                   \
   NVBENCH_DEFINE_CALLABLE(function, NVBENCH_UNIQUE_IDENTIFIER(function))
 
-#define NVBENCH_DEFINE_CALLABLE(function, callable_name)                       \
-  struct callable_name                                                         \
-  {                                                                            \
-    void operator()(nvbench::state &state, nvbench::type_list<>)               \
-    {                                                                          \
-      function(state);                                                         \
-    }                                                                          \
+#define NVBENCH_DEFINE_CALLABLE(function, callable_name)                                           \
+  struct callable_name                                                                             \
+  {                                                                                                \
+    void operator()(nvbench::state &state, nvbench::type_list<>) { function(state); }              \
   }
 
-#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function)                      \
-  NVBENCH_DEFINE_CALLABLE_TEMPLATE(function,                                   \
-                                   NVBENCH_UNIQUE_IDENTIFIER(function))
+#define NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(function)                                          \
+  NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, NVBENCH_UNIQUE_IDENTIFIER(function))
 
-#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name)              \
-  struct callable_name                                                         \
-  {                                                                            \
-    template <typename... Ts>                                                  \
-    void operator()(nvbench::state &state, nvbench::type_list<Ts...>)          \
-    {                                                                          \
-      function(state, nvbench::type_list<Ts...>{});                            \
-    }                                                                          \
+#define NVBENCH_DEFINE_CALLABLE_TEMPLATE(function, callable_name)                                  \
+  struct callable_name                                                                             \
+  {                                                                                                \
+    template <typename... Ts>                                                                      \
+    void operator()(nvbench::state &state, nvbench::type_list<Ts...>)                              \
+    {                                                                                              \
+      function(state, nvbench::type_list<Ts...>{});                                                \
+    }                                                                                              \
   }
 
-#define NVBENCH_UNIQUE_IDENTIFIER(prefix)                                      \
-  NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
-#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id)                     \
+#define NVBENCH_UNIQUE_IDENTIFIER(prefix) NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, __LINE__)
+#define NVBENCH_UNIQUE_IDENTIFIER_IMPL1(prefix, unique_id)                                         \
   NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id)
-#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id)                     \
-  prefix##_line_##unique_id
+#define NVBENCH_UNIQUE_IDENTIFIER_IMPL2(prefix, unique_id) prefix##_line_##unique_id
diff --git a/nvbench/cpu_timer.cuh b/nvbench/cpu_timer.cuh
index 09d3c54..d4ba655 100644
--- a/nvbench/cpu_timer.cuh
+++ b/nvbench/cpu_timer.cuh
@@ -30,27 +30,20 @@ struct cpu_timer
   __forceinline__ cpu_timer() = default;
 
   // move-only
-  cpu_timer(const cpu_timer &) = delete;
-  cpu_timer(cpu_timer &&)      = default;
+  cpu_timer(const cpu_timer &)            = delete;
+  cpu_timer(cpu_timer &&)                 = default;
   cpu_timer &operator=(const cpu_timer &) = delete;
-  cpu_timer &operator=(cpu_timer &&) = default;
+  cpu_timer &operator=(cpu_timer &&)      = default;
 
-  __forceinline__ void start()
-  {
-    m_start = std::chrono::high_resolution_clock::now();
-  }
+  __forceinline__ void start() { m_start = std::chrono::high_resolution_clock::now(); }
 
-  __forceinline__ void stop()
-  {
-    m_stop = std::chrono::high_resolution_clock::now();
-  }
+  __forceinline__ void stop() { m_stop = std::chrono::high_resolution_clock::now(); }
 
   // In seconds:
   [[nodiscard]] __forceinline__ nvbench::float64_t get_duration()
   {
     const auto duration = m_stop - m_start;
-    const auto ns =
-      std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
+    const auto ns       = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
     return static_cast<nvbench::float64_t>(ns) * (1e-9);
   }
 
diff --git a/nvbench/create.cuh b/nvbench/create.cuh
index 7aed1b7..902d6c3 100644
--- a/nvbench/create.cuh
+++ b/nvbench/create.cuh
@@ -27,19 +27,17 @@
 
 #define NVBENCH_TYPE_AXES(...) nvbench::type_list<__VA_ARGS__>
 
-#define NVBENCH_BENCH(KernelGenerator)                                         \
-  NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator);                             \
-  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =  \
-    nvbench::benchmark_manager::get()                                          \
-      .add(std::make_unique<                                                   \
-           nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>())  \
+#define NVBENCH_BENCH(KernelGenerator)                                                             \
+  NVBENCH_DEFINE_UNIQUE_CALLABLE(KernelGenerator);                                                 \
+  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =                      \
+    nvbench::benchmark_manager::get()                                                              \
+      .add(std::make_unique<nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator)>>())     \
       .set_name(#KernelGenerator)
 
-#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes)                         \
-  NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator);                    \
-  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =  \
-    nvbench::benchmark_manager::get()                                          \
-      .add(std::make_unique<                                                   \
-           nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator),      \
-                              TypeAxes>>())                                    \
+#define NVBENCH_BENCH_TYPES(KernelGenerator, TypeAxes)                                             \
+  NVBENCH_DEFINE_UNIQUE_CALLABLE_TEMPLATE(KernelGenerator);                                        \
+  nvbench::benchmark_base &NVBENCH_UNIQUE_IDENTIFIER(obj_##KernelGenerator) =                      \
+    nvbench::benchmark_manager::get()                                                              \
+      .add(std::make_unique<                                                                       \
+           nvbench::benchmark<NVBENCH_UNIQUE_IDENTIFIER(KernelGenerator), TypeAxes>>())            \
       .set_name(#KernelGenerator)
diff --git a/nvbench/csv_printer.cu b/nvbench/csv_printer.cu
index 6acb535..87fb88a 100644
--- a/nvbench/csv_printer.cu
+++ b/nvbench/csv_printer.cu
@@ -66,10 +66,8 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
     {
       std::optional<nvbench::device_info> device = cur_state.get_device();
 
-      std::string device_id   = device ? fmt::to_string(device->get_id())
-                                       : std::string{};
-      std::string device_name = device ? std::string{device->get_name()}
-                                       : std::string{};
+      std::string device_id   = device ? fmt::to_string(device->get_id()) : std::string{};
+      std::string device_name = device ? std::string{device->get_name()} : std::string{};
 
       table.add_cell(row, "_bench_name", "Benchmark", bench_name);
       table.add_cell(row, "_device_id", "Device", std::move(device_id));
@@ -88,15 +86,11 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
                          name + "_axis_pow2_pretty",
                          name + " (pow2)",
                          fmt::format("2^{}", exponent));
-          table.add_cell(row,
-                         name + "_axis_plain",
-                         fmt::format("{}", name),
-                         fmt::to_string(value));
+          table.add_cell(row, name + "_axis_plain", fmt::format("{}", name), fmt::to_string(value));
         }
         else
         {
-          std::string value = std::visit(format_visitor,
-                                         axis_values.get_value(name));
+          std::string value = std::visit(format_visitor, axis_values.get_value(name));
           table.add_cell(row, name + "_axis", name, std::move(value));
         }
       }
@@ -117,14 +111,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
           continue;
         }
         const std::string &tag    = summ.get_tag();
-        const std::string &header = summ.has_value("name")
-                                      ? summ.get_string("name")
-                                      : tag;
+        const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;
 
-        const std::string hint = summ.has_value("hint")
-                                   ? summ.get_string("hint")
-                                   : std::string{};
-        std::string value = std::visit(format_visitor, summ.get_value("value"));
+        const std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
+        std::string value      = std::visit(format_visitor, summ.get_value("value"));
         if (hint == "duration")
         {
           table.add_cell(row, tag, header + " (sec)", std::move(value));
@@ -182,10 +172,7 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
       std::size_t remaining = table.m_columns.size();
       for (const auto &col : table.m_columns)
       {
-        fmt::format_to(buffer,
-                       "{}{}",
-                       col.rows[i],
-                       (--remaining == 0) ? "" : ",");
+        fmt::format_to(buffer, "{}{}", col.rows[i], (--remaining == 0) ? "" : ",");
       }
       fmt::format_to(buffer, "\n");
     }
diff --git a/nvbench/cuda_call.cu b/nvbench/cuda_call.cu
index 6cb304b..662c759 100644
--- a/nvbench/cuda_call.cu
+++ b/nvbench/cuda_call.cu
@@ -16,8 +16,8 @@
  *  limitations under the License.
  */
 
-#include <nvbench/cuda_call.cuh>
 #include <nvbench/config.cuh>
+#include <nvbench/cuda_call.cuh>
 
 #include <fmt/format.h>
 
@@ -66,11 +66,7 @@ void throw_error(const std::string &filename,
                                        command));
 }
 #else
-void throw_error(const std::string &,
-                 std::size_t,
-                 const std::string &,
-                 CUresult)
-{}
+void throw_error(const std::string &, std::size_t, const std::string &, CUresult) {}
 #endif
 
 void exit_error(const std::string &filename,
diff --git a/nvbench/cuda_call.cuh b/nvbench/cuda_call.cuh
index f1d6c45..ca8e765 100644
--- a/nvbench/cuda_call.cuh
+++ b/nvbench/cuda_call.cuh
@@ -18,52 +18,43 @@
 
 #pragma once
 
-#include <cuda_runtime_api.h>
 #include <cuda.h>
+#include <cuda_runtime_api.h>
 
 #include <string>
 
 /// Throws a std::runtime_error if `call` doesn't return `cudaSuccess`.
-#define NVBENCH_CUDA_CALL(call)                                                \
-  do                                                                           \
-  {                                                                            \
-    const cudaError_t nvbench_cuda_call_error = call;                          \
-    if (nvbench_cuda_call_error != cudaSuccess)                                \
-    {                                                                          \
-      nvbench::cuda_call::throw_error(__FILE__,                                \
-                                      __LINE__,                                \
-                                      #call,                                   \
-                                      nvbench_cuda_call_error);                \
-    }                                                                          \
+#define NVBENCH_CUDA_CALL(call)                                                                    \
+  do                                                                                               \
+  {                                                                                                \
+    const cudaError_t nvbench_cuda_call_error = call;                                              \
+    if (nvbench_cuda_call_error != cudaSuccess)                                                    \
+    {                                                                                              \
+      nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);         \
+    }                                                                                              \
   } while (false)
 
 /// Throws a std::runtime_error if `call` doesn't return `CUDA_SUCCESS`.
-#define NVBENCH_DRIVER_API_CALL(call)                                          \
-  do                                                                           \
-  {                                                                            \
-    const CUresult nvbench_cuda_call_error = call;                             \
-    if (nvbench_cuda_call_error != CUDA_SUCCESS)                               \
-    {                                                                          \
-      nvbench::cuda_call::throw_error(__FILE__,                                \
-                                      __LINE__,                                \
-                                      #call,                                   \
-                                      nvbench_cuda_call_error);                \
-    }                                                                          \
+#define NVBENCH_DRIVER_API_CALL(call)                                                              \
+  do                                                                                               \
+  {                                                                                                \
+    const CUresult nvbench_cuda_call_error = call;                                                 \
+    if (nvbench_cuda_call_error != CUDA_SUCCESS)                                                   \
+    {                                                                                              \
+      nvbench::cuda_call::throw_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);         \
+    }                                                                                              \
   } while (false)
 
 /// Terminates process with failure status if `call` doesn't return
 /// `cudaSuccess`.
-#define NVBENCH_CUDA_CALL_NOEXCEPT(call)                                       \
-  do                                                                           \
-  {                                                                            \
-    const cudaError_t nvbench_cuda_call_error = call;                          \
-    if (nvbench_cuda_call_error != cudaSuccess)                                \
-    {                                                                          \
-      nvbench::cuda_call::exit_error(__FILE__,                                 \
-                                     __LINE__,                                 \
-                                     #call,                                    \
-                                     nvbench_cuda_call_error);                 \
-    }                                                                          \
+#define NVBENCH_CUDA_CALL_NOEXCEPT(call)                                                           \
+  do                                                                                               \
+  {                                                                                                \
+    const cudaError_t nvbench_cuda_call_error = call;                                              \
+    if (nvbench_cuda_call_error != cudaSuccess)                                                    \
+    {                                                                                              \
+      nvbench::cuda_call::exit_error(__FILE__, __LINE__, #call, nvbench_cuda_call_error);          \
+    }                                                                                              \
   } while (false)
 
 namespace nvbench::cuda_call
diff --git a/nvbench/cuda_stream.cuh b/nvbench/cuda_stream.cuh
index 6674c27..2c7536c 100644
--- a/nvbench/cuda_stream.cuh
+++ b/nvbench/cuda_stream.cuh
@@ -66,10 +66,10 @@ struct cuda_stream
   ~cuda_stream() = default;
 
   // move-only
-  cuda_stream(const cuda_stream &) = delete;
+  cuda_stream(const cuda_stream &)            = delete;
   cuda_stream &operator=(const cuda_stream &) = delete;
   cuda_stream(cuda_stream &&)                 = default;
-  cuda_stream &operator=(cuda_stream &&) = default;
+  cuda_stream &operator=(cuda_stream &&)      = default;
 
   /**
    * @return The `cudaStream_t` managed by this `cuda_stream`.
diff --git a/nvbench/cuda_timer.cuh b/nvbench/cuda_timer.cuh
index 0e022ce..e1c6e66 100644
--- a/nvbench/cuda_timer.cuh
+++ b/nvbench/cuda_timer.cuh
@@ -42,10 +42,10 @@ struct cuda_timer
   }
 
   // move-only
-  cuda_timer(const cuda_timer &) = delete;
-  cuda_timer(cuda_timer &&)      = default;
+  cuda_timer(const cuda_timer &)            = delete;
+  cuda_timer(cuda_timer &&)                 = default;
   cuda_timer &operator=(const cuda_timer &) = delete;
-  cuda_timer &operator=(cuda_timer &&) = default;
+  cuda_timer &operator=(cuda_timer &&)      = default;
 
   __forceinline__ void start(cudaStream_t stream)
   {
diff --git a/nvbench/cupti_profiler.cuh b/nvbench/cupti_profiler.cuh
index 6e0e255..214706a 100644
--- a/nvbench/cupti_profiler.cuh
+++ b/nvbench/cupti_profiler.cuh
@@ -21,14 +21,13 @@
 #include <nvbench/config.cuh>
 #include <nvbench/device_info.cuh>
 
+#include <optional>
 #include <string>
 #include <vector>
-#include <optional>
 
 namespace nvbench::detail
 {
 
-
 #ifdef NVBENCH_HAS_CUPTI
 /**
  * Pass required metrics in the constructor and organize your code as follows
@@ -62,7 +61,7 @@ namespace nvbench::detail
  */
 class cupti_profiler
 {
-  bool m_available {};
+  bool m_available{};
   std::string m_chip_name;
 
   // Counter data
@@ -87,11 +86,10 @@ public:
   cupti_profiler(cupti_profiler &&) noexcept;
   cupti_profiler &operator=(cupti_profiler &&) noexcept;
 
-  cupti_profiler(const cupti_profiler &) = delete;
+  cupti_profiler(const cupti_profiler &)            = delete;
   cupti_profiler &operator=(const cupti_profiler &) = delete;
 
-  cupti_profiler(nvbench::device_info device,
-                 std::vector<std::string> &&metric_names);
+  cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names);
   ~cupti_profiler();
 
   [[nodiscard]] bool is_initialized() const;
@@ -125,5 +123,4 @@ private:
 };
 #endif
 
-
 } // namespace nvbench::detail
diff --git a/nvbench/cupti_profiler.cxx b/nvbench/cupti_profiler.cxx
index 6dcd81d..a3c076b 100644
--- a/nvbench/cupti_profiler.cxx
+++ b/nvbench/cupti_profiler.cxx
@@ -59,8 +59,7 @@ void nvpw_call(const NVPA_Status status)
 
 } // namespace
 
-cupti_profiler::cupti_profiler(nvbench::device_info device,
-                               std::vector<std::string> &&metric_names)
+cupti_profiler::cupti_profiler(nvbench::device_info device, std::vector<std::string> &&metric_names)
     : m_metric_names(metric_names)
     , m_device(device)
 {
@@ -154,12 +153,10 @@ class eval_request
   NVPW_MetricsEvaluator *evaluator_ptr;
 
 public:
-  eval_request(NVPW_MetricsEvaluator *evaluator_ptr,
-               const std::string &metric_name)
+  eval_request(NVPW_MetricsEvaluator *evaluator_ptr, const std::string &metric_name)
       : evaluator_ptr(evaluator_ptr)
   {
-    NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params =
-      {};
+    NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params params = {};
 
     params.structSize =
       NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE;
@@ -168,8 +165,7 @@ public:
     params.pMetricEvalRequest          = &request;
     params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
 
-    nvpw_call(
-      NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
+    nvpw_call(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(&params));
   }
 
   [[nodiscard]] std::vector<const char *> get_raw_dependencies()
@@ -178,10 +174,9 @@ public:
 
     NVPW_MetricsEvaluator_GetMetricRawDependencies_Params params{};
 
-    params.structSize =
-      NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
-    params.pMetricsEvaluator           = evaluator_ptr;
-    params.pMetricEvalRequests         = &request;
+    params.structSize          = NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE;
+    params.pMetricsEvaluator   = evaluator_ptr;
+    params.pMetricEvalRequests = &request;
     params.numMetricEvalRequests       = 1;
     params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
     params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
@@ -211,26 +206,23 @@ public:
                    const std::uint8_t *counter_data_image         = nullptr,
                    const std::size_t counter_data_image_size      = 0)
   {
-    NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
-      scratch_buffer_param{};
+    NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params scratch_buffer_param{};
 
     scratch_buffer_param.structSize =
       NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE;
     scratch_buffer_param.pChipName                 = chip_name.c_str();
     scratch_buffer_param.pCounterAvailabilityImage = counter_availability_image;
 
-    nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(
-      &scratch_buffer_param));
+    nvpw_call(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(&scratch_buffer_param));
 
     scratch_buffer.resize(scratch_buffer_param.scratchBufferSize);
 
     NVPW_CUDA_MetricsEvaluator_Initialize_Params evaluator_params{};
 
-    evaluator_params.structSize =
-      NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
-    evaluator_params.scratchBufferSize         = scratch_buffer.size();
-    evaluator_params.pScratchBuffer            = scratch_buffer.data();
-    evaluator_params.pChipName                 = chip_name.c_str();
+    evaluator_params.structSize        = NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE;
+    evaluator_params.scratchBufferSize = scratch_buffer.size();
+    evaluator_params.pScratchBuffer    = scratch_buffer.data();
+    evaluator_params.pChipName         = chip_name.c_str();
     evaluator_params.pCounterAvailabilityImage = counter_availability_image;
     evaluator_params.pCounterDataImage         = counter_data_image;
     evaluator_params.counterDataImageSize      = counter_data_image_size;
@@ -247,7 +239,7 @@ public:
     {
       NVPW_MetricsEvaluator_Destroy_Params params{};
 
-      params.structSize = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
+      params.structSize        = NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE;
       params.pMetricsEvaluator = evaluator_ptr;
 
       nvpw_call(NVPW_MetricsEvaluator_Destroy(&params));
@@ -259,10 +251,7 @@ public:
     return {evaluator_ptr, metric_name};
   }
 
-  [[nodiscard]] operator NVPW_MetricsEvaluator *() const
-  {
-    return evaluator_ptr;
-  }
+  [[nodiscard]] operator NVPW_MetricsEvaluator *() const { return evaluator_ptr; }
 };
 
 } // namespace
@@ -270,10 +259,10 @@ public:
 namespace
 {
 
-[[nodiscard]] std::vector<NVPA_RawMetricRequest> get_raw_metric_requests(
-  const std::string &chip_name,
-  const std::vector<std::string> &metric_names,
-  const std::uint8_t *counter_availability_image = nullptr)
+[[nodiscard]] std::vector<NVPA_RawMetricRequest>
+get_raw_metric_requests(const std::string &chip_name,
+                        const std::vector<std::string> &metric_names,
+                        const std::uint8_t *counter_availability_image = nullptr)
 {
   metric_evaluator evaluator(chip_name, counter_availability_image);
 
@@ -282,8 +271,7 @@ namespace
 
   for (auto &metric_name : metric_names)
   {
-    for (auto &raw_dependency :
-         evaluator.create_request(metric_name).get_raw_dependencies())
+    for (auto &raw_dependency : evaluator.create_request(metric_name).get_raw_dependencies())
     {
       raw_metric_names.push_back(raw_dependency);
     }
@@ -295,10 +283,10 @@ namespace
   for (auto &raw_name : raw_metric_names)
   {
     NVPA_RawMetricRequest metricRequest{};
-    metricRequest.structSize            = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
-    metricRequest.pMetricName           = raw_name;
-    metricRequest.isolated              = true;
-    metricRequest.keepInstances         = true;
+    metricRequest.structSize    = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE;
+    metricRequest.pMetricName   = raw_name;
+    metricRequest.isolated      = true;
+    metricRequest.keepInstances = true;
     raw_requests.push_back(metricRequest);
   }
 
@@ -309,12 +297,11 @@ class metrics_config
 {
   bool initialized{};
 
-  void create(const std::string &chip_name,
-              const std::uint8_t *availability_image)
+  void create(const std::string &chip_name, const std::uint8_t *availability_image)
   {
     NVPW_CUDA_RawMetricsConfig_Create_V2_Params params{};
 
-    params.structSize = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
+    params.structSize                = NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE;
     params.activityKind              = NVPA_ACTIVITY_KIND_PROFILER;
     params.pChipName                 = chip_name.c_str();
     params.pCounterAvailabilityImage = availability_image;
@@ -329,9 +316,8 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_SetCounterAvailability_Params params{};
 
-    params.structSize =
-      NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
-    params.pRawMetricsConfig         = raw_metrics_config;
+    params.structSize        = NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE;
+    params.pRawMetricsConfig = raw_metrics_config;
     params.pCounterAvailabilityImage = availability_image;
 
     nvpw_call(NVPW_RawMetricsConfig_SetCounterAvailability(&params));
@@ -341,7 +327,7 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_BeginPassGroup_Params params{};
 
-    params.structSize = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE;
     params.pRawMetricsConfig = raw_metrics_config;
 
     nvpw_call(NVPW_RawMetricsConfig_BeginPassGroup(&params));
@@ -351,7 +337,7 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_AddMetrics_Params params{};
 
-    params.structSize = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
+    params.structSize         = NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE;
     params.pRawMetricsConfig  = raw_metrics_config;
     params.pRawMetricRequests = raw_metric_requests.data();
     params.numMetricRequests  = raw_metric_requests.size();
@@ -363,7 +349,7 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_EndPassGroup_Params params{};
 
-    params.structSize = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE;
     params.pRawMetricsConfig = raw_metrics_config;
 
     nvpw_call(NVPW_RawMetricsConfig_EndPassGroup(&params));
@@ -373,8 +359,7 @@ class metrics_config
   {
     NVPW_RawMetricsConfig_GenerateConfigImage_Params params{};
 
-    params.structSize =
-      NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE;
     params.pRawMetricsConfig = raw_metrics_config;
 
     nvpw_call(NVPW_RawMetricsConfig_GenerateConfigImage(&params));
@@ -398,7 +383,7 @@ public:
   {
     NVPW_RawMetricsConfig_GetConfigImage_Params params{};
 
-    params.structSize = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
+    params.structSize        = NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE;
     params.pRawMetricsConfig = raw_metrics_config;
     params.bytesAllocated    = 0;
     params.pBuffer           = nullptr;
@@ -419,7 +404,7 @@ public:
     {
       NVPW_RawMetricsConfig_Destroy_Params params{};
 
-      params.structSize = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
+      params.structSize        = NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE;
       params.pRawMetricsConfig = raw_metrics_config;
 
       NVPW_RawMetricsConfig_Destroy(&params);
@@ -433,13 +418,12 @@ public:
 
 void cupti_profiler::initialize_config_image()
 {
-  m_config_image =
-    metrics_config(m_chip_name,
-                   get_raw_metric_requests(m_chip_name,
-                                           m_metric_names,
-                                           m_availability_image.data()),
-                   m_availability_image.data())
-      .get_config_image();
+  m_config_image = metrics_config(m_chip_name,
+                                  get_raw_metric_requests(m_chip_name,
+                                                          m_metric_names,
+                                                          m_availability_image.data()),
+                                  m_availability_image.data())
+                     .get_config_image();
 }
 
 namespace
@@ -450,12 +434,11 @@ class counter_data_builder
   bool initialized{};
 
 public:
-  counter_data_builder(const std::string &chip_name,
-                       const std::uint8_t *pCounterAvailabilityImage)
+  counter_data_builder(const std::string &chip_name, const std::uint8_t *pCounterAvailabilityImage)
   {
     NVPW_CUDA_CounterDataBuilder_Create_Params params{};
 
-    params.structSize = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
+    params.structSize                = NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE;
     params.pChipName                 = chip_name.c_str();
     params.pCounterAvailabilityImage = pCounterAvailabilityImage;
 
@@ -471,7 +454,7 @@ public:
     {
       NVPW_CounterDataBuilder_Destroy_Params params{};
 
-      params.structSize = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
+      params.structSize          = NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE;
       params.pCounterDataBuilder = builder;
 
       NVPW_CounterDataBuilder_Destroy(&params);
@@ -488,16 +471,14 @@ void cupti_profiler::initialize_counter_data_prefix_image()
   const std::uint8_t *counter_availability_image = nullptr;
 
   std::vector<NVPA_RawMetricRequest> raw_metric_requests =
-    get_raw_metric_requests(m_chip_name,
-                            m_metric_names,
-                            counter_availability_image);
+    get_raw_metric_requests(m_chip_name, m_metric_names, counter_availability_image);
 
   counter_data_builder data_builder(m_chip_name, counter_availability_image);
 
   {
     NVPW_CounterDataBuilder_AddMetrics_Params params{};
 
-    params.structSize = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
+    params.structSize          = NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE;
     params.pCounterDataBuilder = data_builder.builder;
     params.pRawMetricRequests  = raw_metric_requests.data();
     params.numMetricRequests   = raw_metric_requests.size();
@@ -508,8 +489,7 @@ void cupti_profiler::initialize_counter_data_prefix_image()
   {
     NVPW_CounterDataBuilder_GetCounterDataPrefix_Params params{};
 
-    params.structSize =
-      NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
+    params.structSize          = NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE;
     params.pCounterDataBuilder = data_builder.builder;
     params.bytesAllocated      = 0;
     params.pBuffer             = nullptr;
@@ -532,11 +512,9 @@ get_counter_data_image_size(CUpti_Profiler_CounterDataImageOptions *options)
 {
   CUpti_Profiler_CounterDataImage_CalculateSize_Params params{};
 
-  params.structSize =
-    CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
-  params.pOptions = options;
-  params.sizeofCounterDataImageOptions =
-    CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
+  params.structSize = CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE;
+  params.pOptions   = options;
+  params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
 
   cupti_call(cuptiProfilerCounterDataImageCalculateSize(&params));
   return params.counterDataImageSize;
@@ -559,12 +537,10 @@ void cupti_profiler::initialize_counter_data_image()
   {
     CUpti_Profiler_CounterDataImage_Initialize_Params params{};
 
-    params.structSize =
-      CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
-    params.sizeofCounterDataImageOptions =
-      CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
-    params.pOptions             = &counter_data_image_options;
-    params.counterDataImageSize = m_data_image.size();
+    params.structSize = CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE;
+    params.sizeofCounterDataImageOptions = CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
+    params.pOptions                      = &counter_data_image_options;
+    params.counterDataImageSize          = m_data_image.size();
 
     params.pCounterDataImage = &m_data_image[0];
     cupti_call(cuptiProfilerCounterDataImageInitialize(&params));
@@ -578,8 +554,7 @@ void cupti_profiler::initialize_counter_data_image()
     params.counterDataImageSize = m_data_image.size();
     params.pCounterDataImage    = &m_data_image[0];
 
-    cupti_call(
-      cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));
+    cupti_call(cuptiProfilerCounterDataImageCalculateScratchBufferSize(&params));
 
     m_data_scratch_buffer.resize(params.counterDataScratchBufferSize);
   }
@@ -587,8 +562,7 @@ void cupti_profiler::initialize_counter_data_image()
   {
     CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params params{};
 
-    params.structSize =
-      CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
+    params.structSize = CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE;
     params.counterDataImageSize         = m_data_image.size();
     params.pCounterDataImage            = &m_data_image[0];
     params.counterDataScratchBufferSize = m_data_scratch_buffer.size();
@@ -608,17 +582,14 @@ cupti_profiler::~cupti_profiler()
   }
 }
 
-bool cupti_profiler::is_initialized() const
-{
-  return m_available;
-}
+bool cupti_profiler::is_initialized() const { return m_available; }
 
 void cupti_profiler::prepare_user_loop()
 {
   {
     CUpti_Profiler_BeginSession_Params params{};
 
-    params.structSize = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
+    params.structSize                   = CUpti_Profiler_BeginSession_Params_STRUCT_SIZE;
     params.ctx                          = nullptr;
     params.counterDataImageSize         = m_data_image.size();
     params.pCounterDataImage            = &m_data_image[0];
@@ -735,9 +706,7 @@ std::vector<double> cupti_profiler::get_counter_values()
 
     if (params.numRanges != 1)
     {
-      NVBENCH_THROW(std::runtime_error,
-                    "{}",
-                    "Something's gone wrong, one range is expected");
+      NVBENCH_THROW(std::runtime_error, "{}", "Something's gone wrong, one range is expected");
     }
   }
 
@@ -752,8 +721,7 @@ std::vector<double> cupti_profiler::get_counter_values()
     {
       NVPW_MetricsEvaluator_SetDeviceAttributes_Params params{};
 
-      params.structSize =
-        NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
+      params.structSize           = NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE;
       params.pMetricsEvaluator    = evaluator;
       params.pCounterDataImage    = m_data_image.data();
       params.counterDataImageSize = m_data_image.size();
@@ -764,11 +732,10 @@ std::vector<double> cupti_profiler::get_counter_values()
     {
       NVPW_MetricsEvaluator_EvaluateToGpuValues_Params params{};
 
-      params.structSize =
-        NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
-      params.pMetricsEvaluator           = evaluator;
-      params.pMetricEvalRequests         = &request.request;
-      params.numMetricEvalRequests       = 1;
+      params.structSize            = NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE;
+      params.pMetricsEvaluator     = evaluator;
+      params.pMetricEvalRequests   = &request.request;
+      params.numMetricEvalRequests = 1;
       params.metricEvalRequestStructSize = NVPW_MetricEvalRequest_STRUCT_SIZE;
       params.metricEvalRequestStrideSize = sizeof(NVPW_MetricEvalRequest);
       params.pCounterDataImage           = m_data_image.data();
diff --git a/nvbench/detail/device_scope.cuh b/nvbench/detail/device_scope.cuh
index de3a55a..c924bee 100644
--- a/nvbench/detail/device_scope.cuh
+++ b/nvbench/detail/device_scope.cuh
@@ -39,9 +39,9 @@ struct [[maybe_unused]] device_scope
   ~device_scope() { NVBENCH_CUDA_CALL(cudaSetDevice(m_old_device_id)); }
 
   // move-only
-  device_scope(device_scope &&) = default;
-  device_scope &operator=(device_scope &&) = default;
-  device_scope(const device_scope &)       = delete;
+  device_scope(device_scope &&)                 = default;
+  device_scope &operator=(device_scope &&)      = default;
+  device_scope(const device_scope &)            = delete;
   device_scope &operator=(const device_scope &) = delete;
 
 private:
diff --git a/nvbench/detail/kernel_launcher_timer_wrapper.cuh b/nvbench/detail/kernel_launcher_timer_wrapper.cuh
index 39a999e..1efdf6e 100644
--- a/nvbench/detail/kernel_launcher_timer_wrapper.cuh
+++ b/nvbench/detail/kernel_launcher_timer_wrapper.cuh
@@ -33,7 +33,7 @@ namespace detail
 template <typename KernelLauncher>
 struct kernel_launch_timer_wrapper
 {
- explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
+  explicit kernel_launch_timer_wrapper(KernelLauncher &launcher)
       : m_kernel_launcher{launcher}
   {}
 
diff --git a/nvbench/detail/l2flush.cuh b/nvbench/detail/l2flush.cuh
index 0e33f7e..aefbfef 100644
--- a/nvbench/detail/l2flush.cuh
+++ b/nvbench/detail/l2flush.cuh
@@ -31,13 +31,12 @@ struct l2flush
   {
     int dev_id{};
     NVBENCH_CUDA_CALL(cudaGetDevice(&dev_id));
-    NVBENCH_CUDA_CALL(
-      cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
+    NVBENCH_CUDA_CALL(cudaDeviceGetAttribute(&m_l2_size, cudaDevAttrL2CacheSize, dev_id));
     if (m_l2_size > 0)
     {
-      void* buffer = m_l2_buffer;
+      void *buffer = m_l2_buffer;
       NVBENCH_CUDA_CALL(cudaMalloc(&buffer, m_l2_size));
-      m_l2_buffer = reinterpret_cast<int*>(buffer);
+      m_l2_buffer = reinterpret_cast<int *>(buffer);
     }
   }
 
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 380d2cd..69ceb7e 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -54,15 +54,11 @@ void measure_cold_base::check()
   const auto device = m_state.get_device();
   if (!device)
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Device required for `cold` measurement.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cold` measurement.");
   }
   if (!device->is_active())
   { // This means something went wrong higher up. Throw an error.
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Internal error: Current device is not active.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
   }
 }
 
@@ -92,13 +88,11 @@ void measure_cold_base::record_measurements()
   ++m_total_samples;
 
   // Compute convergence statistics using CUDA timings:
-  const auto mean_cuda_time = m_total_cuda_time /
-                              static_cast<nvbench::float64_t>(m_total_samples);
-  const auto cuda_stdev =
-    nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
-                                                    m_cuda_times.cend(),
-                                                    mean_cuda_time);
-  auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
+  const auto mean_cuda_time = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cuda_stdev     = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
+                                                                          m_cuda_times.cend(),
+                                                                          mean_cuda_time);
+  auto cuda_rel_stdev       = cuda_stdev / mean_cuda_time;
   if (std::isfinite(cuda_rel_stdev))
   {
     m_noise_tracker.push_back(cuda_rel_stdev);
@@ -132,10 +126,10 @@ bool measure_cold_base::is_finished()
     {
       // Use the current noise as the stdev reference.
       const auto current_noise = m_noise_tracker.back();
-      const auto noise_stdev = nvbench::detail::statistics::standard_deviation(
-        m_noise_tracker.cbegin(),
-        m_noise_tracker.cend(),
-        current_noise);
+      const auto noise_stdev =
+        nvbench::detail::statistics::standard_deviation(m_noise_tracker.cbegin(),
+                                                        m_noise_tracker.cend(),
+                                                        current_noise);
       const auto noise_rel_stdev = noise_stdev / current_noise;
 
       // If the rel stdev of the last N cuda noise measurements is less than
@@ -162,13 +156,11 @@ bool measure_cold_base::is_finished()
 void measure_cold_base::run_trials_epilogue()
 {
   // Only need to compute this at the end, not per iteration.
-  const auto cpu_mean = m_total_cuda_time /
-                        static_cast<nvbench::float64_t>(m_total_samples);
-  const auto cpu_stdev =
-    nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
-                                                    m_cpu_times.cend(),
-                                                    cpu_mean);
-  m_cpu_noise = cpu_stdev / cpu_mean;
+  const auto cpu_mean  = m_total_cuda_time / static_cast<nvbench::float64_t>(m_total_samples);
+  const auto cpu_stdev = nvbench::detail::statistics::standard_deviation(m_cpu_times.cbegin(),
+                                                                         m_cpu_times.cend(),
+                                                                         cpu_mean);
+  m_cpu_noise          = cpu_stdev / cpu_mean;
 
   m_walltime_timer.stop();
 }
@@ -199,8 +191,7 @@ void measure_cold_base::generate_summaries()
     auto &summ = m_state.add_summary("nv/cold/time/cpu/stdev/relative");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
-    summ.set_string("description",
-                    "Relative standard deviation of isolated CPU times");
+    summ.set_string("description", "Relative standard deviation of isolated CPU times");
     summ.set_float64("value", m_cpu_noise);
   }
 
@@ -219,12 +210,10 @@ void measure_cold_base::generate_summaries()
     auto &summ = m_state.add_summary("nv/cold/time/gpu/stdev/relative");
     summ.set_string("name", "Noise");
     summ.set_string("hint", "percentage");
-    summ.set_string("description",
-                    "Relative standard deviation of isolated GPU times");
+    summ.set_string("description", "Relative standard deviation of isolated GPU times");
     summ.set_float64("value",
-                     m_noise_tracker.empty()
-                       ? std::numeric_limits<nvbench::float64_t>::infinity()
-                       : m_noise_tracker.back());
+                     m_noise_tracker.empty() ? std::numeric_limits<nvbench::float64_t>::infinity()
+                                             : m_noise_tracker.back());
   }
 
   if (const auto items = m_state.get_element_count(); items != 0)
@@ -232,8 +221,7 @@ void measure_cold_base::generate_summaries()
     auto &summ = m_state.add_summary("nv/cold/bw/item_rate");
     summ.set_string("name", "Elem/s");
     summ.set_string("hint", "item_rate");
-    summ.set_string("description",
-                    "Number of input elements processed per second");
+    summ.set_string("description", "Number of input elements processed per second");
     summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
   }
 
@@ -251,8 +239,8 @@ void measure_cold_base::generate_summaries()
     }
 
     {
-      const auto peak_gmem_bw = static_cast<double>(
-        m_state.get_device()->get_global_memory_bus_bandwidth());
+      const auto peak_gmem_bw =
+        static_cast<double>(m_state.get_device()->get_global_memory_bus_bandwidth());
 
       auto &summ = m_state.add_summary("nv/cold/bw/global/utilization");
       summ.set_string("name", "BWUtil");
@@ -274,8 +262,7 @@ void measure_cold_base::generate_summaries()
   }
 
   // Log if a printer exists:
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
 
@@ -324,10 +311,7 @@ void measure_cold_base::generate_summaries()
                             m_walltime_timer.get_duration(),
                             m_total_samples));
 
-    printer.process_bulk_data(m_state,
-                              "nv/cold/sample_times",
-                              "sample_times",
-                              m_cuda_times);
+    printer.process_bulk_data(m_state, "nv/cold/sample_times", "sample_times", m_cuda_times);
   }
 }
 
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 0cab36a..a5c2604 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -49,10 +49,10 @@ namespace detail
 struct measure_cold_base
 {
   explicit measure_cold_base(nvbench::state &exec_state);
-  measure_cold_base(const measure_cold_base &) = delete;
-  measure_cold_base(measure_cold_base &&)      = delete;
+  measure_cold_base(const measure_cold_base &)            = delete;
+  measure_cold_base(measure_cold_base &&)                 = delete;
   measure_cold_base &operator=(const measure_cold_base &) = delete;
-  measure_cold_base &operator=(measure_cold_base &&) = delete;
+  measure_cold_base &operator=(measure_cold_base &&)      = delete;
 
 protected:
   template <bool use_blocking_kernel>
@@ -68,10 +68,7 @@ protected:
 
   void check_skip_time(nvbench::float64_t warmup_time);
 
-  __forceinline__ void flush_device_l2()
-  {
-    m_l2flush.flush(m_launch.get_stream());
-  }
+  __forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }
 
   __forceinline__ void sync_stream() const
   {
diff --git a/nvbench/detail/measure_cupti.cu b/nvbench/detail/measure_cupti.cu
index 9e8de6c..e583cd5 100644
--- a/nvbench/detail/measure_cupti.cu
+++ b/nvbench/detail/measure_cupti.cu
@@ -50,8 +50,7 @@ struct metric_traits;
 template <>
 struct metric_traits<metric_id::dram_peak_sustained_throughput>
 {
-  static constexpr const char *metric_name =
-    "dram__throughput.avg.pct_of_peak_sustained_elapsed";
+  static constexpr const char *metric_name = "dram__throughput.avg.pct_of_peak_sustained_elapsed";
 
   static constexpr const char *name = "HBWPeak";
   static constexpr const char *hint = "percentage";
@@ -119,10 +118,7 @@ struct metric_traits<metric_id::l1_hit_rate>
   static constexpr const char *description = "Hit rate at L1 cache.";
   static constexpr double divider          = 100.0;
 
-  static bool is_collected(nvbench::state &m_state)
-  {
-    return m_state.is_l1_hit_rate_collected();
-  };
+  static bool is_collected(nvbench::state &m_state) { return m_state.is_l1_hit_rate_collected(); };
 };
 
 template <>
@@ -134,10 +130,7 @@ struct metric_traits<metric_id::l2_hit_rate>
   static constexpr const char *description = "Hit rate at L2 cache.";
   static constexpr double divider          = 100.0;
 
-  static bool is_collected(nvbench::state &m_state)
-  {
-    return m_state.is_l2_hit_rate_collected();
-  };
+  static bool is_collected(nvbench::state &m_state) { return m_state.is_l2_hit_rate_collected(); };
 };
 
 template <metric_id id = metric_id::dram_peak_sustained_throughput>
@@ -153,8 +146,7 @@ void add_metrics_impl(nvbench::state &state, std::vector<std::string> &metrics)
 }
 
 template <>
-void add_metrics_impl<metric_id::count>(nvbench::state &,
-                                        std::vector<std::string> &)
+void add_metrics_impl<metric_id::count>(nvbench::state &, std::vector<std::string> &)
 {}
 
 std::vector<std::string> add_metrics(nvbench::state &state)
@@ -179,13 +171,11 @@ try
 // clang-format on
 catch (const std::exception &ex)
 {
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref)
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref)
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log(nvbench::log_level::warn,
-                fmt::format("CUPTI failed to construct profiler: {}",
-                            ex.what()));
+                fmt::format("CUPTI failed to construct profiler: {}", ex.what()));
   }
 }
 
@@ -194,15 +184,11 @@ void measure_cupti_base::check()
   const auto device = m_state.get_device();
   if (!device)
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Device required for `cupti` measurement.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Device required for `cupti` measurement.");
   }
   if (!device->is_active())
   { // This means something went wrong higher up. Throw an error.
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Internal error: Current device is not active.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
   }
 }
 
@@ -210,16 +196,13 @@ namespace
 {
 
 template <metric_id id = metric_id::dram_peak_sustained_throughput>
-void gen_summary(std::size_t result_id,
-                 nvbench::state &m_state,
-                 const std::vector<double> &result)
+void gen_summary(std::size_t result_id, nvbench::state &m_state, const std::vector<double> &result)
 {
   using metric = metric_traits<id>;
 
   if (metric::is_collected(m_state))
   {
-    auto &summ =
-      m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
+    auto &summ = m_state.add_summary(fmt::format("nv/cupti/{}", metric::metric_name));
     summ.set_string("name", metric::name);
     summ.set_string("hint", metric::hint);
     summ.set_string("description", metric::description);
@@ -231,9 +214,7 @@ void gen_summary(std::size_t result_id,
 }
 
 template <>
-void gen_summary<metric_id::count>(std::size_t,
-                                   nvbench::state &,
-                                   const std::vector<double> &)
+void gen_summary<metric_id::count>(std::size_t, nvbench::state &, const std::vector<double> &)
 {}
 
 void gen_summaries(nvbench::state &state, const std::vector<double> &result)
@@ -266,8 +247,7 @@ try
   }
 
   // Log if a printer exists:
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log(nvbench::log_level::pass,
@@ -278,13 +258,11 @@ try
 }
 catch (const std::exception &ex)
 {
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref)
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref)
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log(nvbench::log_level::warn,
-                fmt::format("CUPTI failed to generate the summary: {}",
-                            ex.what()));
+                fmt::format("CUPTI failed to generate the summary: {}", ex.what()));
   }
 }
 
diff --git a/nvbench/detail/measure_cupti.cuh b/nvbench/detail/measure_cupti.cuh
index 736c375..ec7b212 100644
--- a/nvbench/detail/measure_cupti.cuh
+++ b/nvbench/detail/measure_cupti.cuh
@@ -50,10 +50,10 @@ namespace detail
 struct measure_cupti_base
 {
   explicit measure_cupti_base(nvbench::state &exec_state);
-  measure_cupti_base(const measure_cupti_base &) = delete;
-  measure_cupti_base(measure_cupti_base &&)      = delete;
+  measure_cupti_base(const measure_cupti_base &)            = delete;
+  measure_cupti_base(measure_cupti_base &&)                 = delete;
   measure_cupti_base &operator=(const measure_cupti_base &) = delete;
-  measure_cupti_base &operator=(measure_cupti_base &&) = delete;
+  measure_cupti_base &operator=(measure_cupti_base &&)      = delete;
 
 protected:
   struct kernel_launch_timer;
@@ -61,10 +61,7 @@ protected:
   void check();
   void generate_summaries();
 
-  __forceinline__ void flush_device_l2()
-  {
-    m_l2flush.flush(m_launch.get_stream());
-  }
+  __forceinline__ void flush_device_l2() { m_l2flush.flush(m_launch.get_stream()); }
 
   __forceinline__ void sync_stream() const
   {
diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu
index 25e2119..9497122 100644
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -47,9 +47,8 @@ measure_hot_base::measure_hot_base(state &exec_state)
   // to match the cold result if available.
   try
   {
-    nvbench::int64_t cold_samples =
-      m_state.get_summary("nv/cold/sample_size").get_int64("value");
-    m_min_samples = std::max(m_min_samples, cold_samples);
+    nvbench::int64_t cold_samples = m_state.get_summary("nv/cold/sample_size").get_int64("value");
+    m_min_samples                 = std::max(m_min_samples, cold_samples);
 
     // If the cold measurement ran successfully, disable skip_time. It'd just
     // be annoying to skip now.
@@ -72,15 +71,11 @@ void measure_hot_base::check()
   const auto device = m_state.get_device();
   if (!device)
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Device required for `hot` measurement.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Device required for `hot` measurement.");
   }
   if (!device->is_active())
   { // This means something went wrong higher up. Throw an error.
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Internal error: Current device is not active.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Internal error: Current device is not active.");
   }
 }
 
@@ -116,8 +111,7 @@ void measure_hot_base::generate_summaries()
   }
 
   // Log if a printer exists:
-  if (auto printer_opt_ref = m_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = m_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
 
diff --git a/nvbench/detail/measure_hot.cuh b/nvbench/detail/measure_hot.cuh
index 9c4c2ec..ae2b4f1 100644
--- a/nvbench/detail/measure_hot.cuh
+++ b/nvbench/detail/measure_hot.cuh
@@ -41,10 +41,10 @@ namespace detail
 struct measure_hot_base
 {
   explicit measure_hot_base(nvbench::state &exec_state);
-  measure_hot_base(const measure_hot_base &) = delete;
-  measure_hot_base(measure_hot_base &&)      = delete;
+  measure_hot_base(const measure_hot_base &)            = delete;
+  measure_hot_base(measure_hot_base &&)                 = delete;
   measure_hot_base &operator=(const measure_hot_base &) = delete;
-  measure_hot_base &operator=(measure_hot_base &&) = delete;
+  measure_hot_base &operator=(measure_hot_base &&)      = delete;
 
 protected:
   void check();
@@ -131,7 +131,7 @@ private:
     // The .95 factor here pads the batch_size a bit to avoid needing a second
     // batch due to noise.
     const auto time_estimate = m_cuda_timer.get_duration() * 0.95;
-    auto batch_size = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
+    auto batch_size          = static_cast<nvbench::int64_t>(m_min_time / time_estimate);
 
     do
     {
@@ -142,7 +142,7 @@ private:
         // Block stream until some work is queued.
         // Limit the number of kernel executions while blocked to prevent
         // deadlocks. See warnings on blocking_kernel.
-        const auto blocked_launches = std::min(batch_size, nvbench::int64_t{2});
+        const auto blocked_launches   = std::min(batch_size, nvbench::int64_t{2});
         const auto unblocked_launches = batch_size - blocked_launches;
 
         this->block_stream();
@@ -189,7 +189,6 @@ private:
         break; // Stop iterating
       }
 
-
       m_walltime_timer.stop();
       if (m_walltime_timer.get_duration() > m_timeout)
       {
diff --git a/nvbench/detail/ring_buffer.cuh b/nvbench/detail/ring_buffer.cuh
index fa86200..645fa03 100644
--- a/nvbench/detail/ring_buffer.cuh
+++ b/nvbench/detail/ring_buffer.cuh
@@ -58,18 +58,12 @@ struct ring_buffer
   /**
    * The number of valid values in the ring buffer. Always <= capacity().
    */
-  [[nodiscard]] std::size_t size() const
-  {
-    return m_full ? m_buffer.size() : m_index;
-  }
+  [[nodiscard]] std::size_t size() const { return m_full ? m_buffer.size() : m_index; }
 
   /**
    * The maximum size of the ring buffer.
    */
-  [[nodiscard]] std::size_t capacity() const
-  {
-    return m_buffer.size();
-  }
+  [[nodiscard]] std::size_t capacity() const { return m_buffer.size(); }
 
   /**
    * @return True if the ring buffer is empty.
diff --git a/nvbench/detail/state_exec.cuh b/nvbench/detail/state_exec.cuh
index 9352a5f..bab2daf 100644
--- a/nvbench/detail/state_exec.cuh
+++ b/nvbench/detail/state_exec.cuh
@@ -53,7 +53,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
                 "`ExecTags` argument must be a member (or combination of "
                 "members) from nvbench::exec_tag.");
 
-  constexpr auto measure_tags = tags & measure_mask;
+  constexpr auto measure_tags  = tags & measure_mask;
   constexpr auto modifier_tags = tags & modifier_mask;
 
   // "run once" is handled by the cold measurement:
@@ -81,8 +81,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
     }
     else
     {
-      this->exec(cold | hot | tags,
-                 std::forward<KernelLauncher>(kernel_launcher));
+      this->exec(cold | hot | tags, std::forward<KernelLauncher>(kernel_launcher));
     }
     return;
   }
@@ -99,8 +98,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
     constexpr bool use_blocking_kernel = !(tags & no_block);
     if constexpr (tags & timer)
     {
-      // Estimate bandwidth here
-      #ifdef NVBENCH_HAS_CUPTI
+// Estimate bandwidth here
+#ifdef NVBENCH_HAS_CUPTI
       if constexpr (!(modifier_tags & run_once))
       {
         if (this->is_cupti_required())
@@ -110,7 +109,7 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
           measure();
         }
       }
-      #endif
+#endif
 
       using measure_t = nvbench::detail::measure_cold<KL, use_blocking_kernel>;
       measure_t measure{*this, kernel_launcher};
@@ -121,8 +120,8 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
       using wrapper_t = nvbench::detail::kernel_launch_timer_wrapper<KL>;
       wrapper_t wrapper{kernel_launcher};
 
-      // Estimate bandwidth here
-      #ifdef NVBENCH_HAS_CUPTI
+// Estimate bandwidth here
+#ifdef NVBENCH_HAS_CUPTI
       if constexpr (!(modifier_tags & run_once))
       {
         if (this->is_cupti_required())
@@ -132,10 +131,9 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
           measure();
         }
       }
-      #endif
+#endif
 
-      using measure_t =
-        nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
+      using measure_t = nvbench::detail::measure_cold<wrapper_t, use_blocking_kernel>;
       measure_t measure(*this, wrapper);
       measure();
     }
@@ -143,12 +141,10 @@ void state::exec(ExecTags tags, KernelLauncher &&kernel_launcher)
 
   if constexpr (tags & hot)
   {
-    static_assert(!(tags & sync),
-                  "Hot measurement doesn't support the `sync` exec_tag.");
-    static_assert(!(tags & timer),
-                  "Hot measurement doesn't support the `timer` exec_tag.");
+    static_assert(!(tags & sync), "Hot measurement doesn't support the `sync` exec_tag.");
+    static_assert(!(tags & timer), "Hot measurement doesn't support the `timer` exec_tag.");
     constexpr bool use_blocking_kernel = !(tags & no_block);
-    using measure_t = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
+    using measure_t                    = nvbench::detail::measure_hot<KL, use_blocking_kernel>;
     measure_t measure{*this, kernel_launcher};
     measure();
   }
diff --git a/nvbench/detail/state_generator.cxx b/nvbench/detail/state_generator.cxx
index 8c153bf..26a897a 100644
--- a/nvbench/detail/state_generator.cxx
+++ b/nvbench/detail/state_generator.cxx
@@ -40,9 +40,7 @@ void state_iterator::add_axis(const nvbench::axis_base &axis)
   this->add_axis(axis.get_name(), axis.get_type(), axis.get_size());
 }
 
-void state_iterator::add_axis(std::string axis,
-                              nvbench::axis_type type,
-                              std::size_t size)
+void state_iterator::add_axis(std::string axis, nvbench::axis_type type, std::size_t size)
 {
   m_indices.push_back({std::move(axis), type, std::size_t{0}, size});
 }
@@ -74,10 +72,7 @@ state_iterator::get_current_indices() const
   return m_indices;
 }
 
-[[nodiscard]] bool state_iterator::iter_valid() const
-{
-  return m_current < m_total;
-}
+[[nodiscard]] bool state_iterator::iter_valid() const { return m_current < m_total; }
 
 void state_iterator::next()
 {
@@ -102,7 +97,7 @@ state_generator::state_generator(const benchmark_base &bench)
 
 void state_generator::build_axis_configs()
 {
-  const axes_metadata &axes = m_benchmark.get_axes();
+  const axes_metadata &axes                               = m_benchmark.get_axes();
   const std::vector<std::unique_ptr<axis_base>> &axes_vec = axes.get_axes();
 
   // Construct two state_generators:
@@ -118,35 +113,29 @@ void state_generator::build_axis_configs()
     type_axes.reserve(axes_vec.size());
 
     // Filter all axes by into type and non-type:
-    std::for_each(axes_vec.cbegin(),
-                  axes_vec.cend(),
-                  [&non_type_si, &type_axes](const auto &axis) {
-                    if (axis->get_type() == nvbench::axis_type::type)
-                    {
-                      type_axes.push_back(
-                        std::cref(static_cast<const type_axis &>(*axis)));
-                    }
-                    else
-                    {
-                      non_type_si.add_axis(*axis);
-                    }
-                  });
+    std::for_each(axes_vec.cbegin(), axes_vec.cend(), [&non_type_si, &type_axes](const auto &axis) {
+      if (axis->get_type() == nvbench::axis_type::type)
+      {
+        type_axes.push_back(std::cref(static_cast<const type_axis &>(*axis)));
+      }
+      else
+      {
+        non_type_si.add_axis(*axis);
+      }
+    });
 
     // Reverse sort type axes by index. This way the state_generator's cartesian
     // product of the type axes values will be enumerated in the same order as
     // nvbench::tl::cartesian_product<type_axes>. This is necessary to ensure
     // that the correct states are passed to the corresponding benchmark
     // instantiations.
-    std::sort(type_axes.begin(),
-              type_axes.end(),
-              [](const auto &axis_1, const auto &axis_2) {
-                return axis_1.get().get_axis_index() >
-                       axis_2.get().get_axis_index();
-              });
+    std::sort(type_axes.begin(), type_axes.end(), [](const auto &axis_1, const auto &axis_2) {
+      return axis_1.get().get_axis_index() > axis_2.get().get_axis_index();
+    });
 
-    std::for_each(type_axes.cbegin(),
-                  type_axes.cend(),
-                  [&type_si](const auto &axis) { type_si.add_axis(axis); });
+    std::for_each(type_axes.cbegin(), type_axes.cend(), [&type_si](const auto &axis) {
+      type_si.add_axis(axis);
+    });
   }
 
   // type_axis_configs generation:
@@ -157,8 +146,8 @@ void state_generator::build_axis_configs()
     // Build type_axis_configs
     for (type_si.init(); type_si.iter_valid(); type_si.next())
     {
-      auto &[config, active_mask] = m_type_axis_configs.emplace_back(
-        std::make_pair(nvbench::named_values{}, true));
+      auto &[config, active_mask] =
+        m_type_axis_configs.emplace_back(std::make_pair(nvbench::named_values{}, true));
 
       // Reverse the indices so they're once again in the same order as
       // specified:
@@ -173,8 +162,7 @@ void state_generator::build_axis_configs()
           active_mask = false;
         }
 
-        config.set_string(axis_info.axis,
-                          axis.get_input_string(axis_info.index));
+        config.set_string(axis_info.axis, axis.get_input_string(axis_info.index));
       }
     } // type_si
   }   // type_axis_config generation
@@ -199,21 +187,18 @@ void state_generator::build_axis_configs()
             break;
 
           case axis_type::int64:
-            config.set_int64(
-              axis_info.axis,
-              axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
+            config.set_int64(axis_info.axis,
+                             axes.get_int64_axis(axis_info.axis).get_value(axis_info.index));
             break;
 
           case axis_type::float64:
-            config.set_float64(
-              axis_info.axis,
-              axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
+            config.set_float64(axis_info.axis,
+                               axes.get_float64_axis(axis_info.axis).get_value(axis_info.index));
             break;
 
           case axis_type::string:
-            config.set_string(
-              axis_info.axis,
-              axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
+            config.set_string(axis_info.axis,
+                              axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
             break;
         } // switch (type)
       }   // for (axis_info : current_indices)
@@ -239,15 +224,12 @@ void state_generator::build_states()
   }
 }
 
-void state_generator::add_states_for_device(
-  const std::optional<device_info> &device)
+void state_generator::add_states_for_device(const std::optional<device_info> &device)
 {
   const auto num_type_configs = m_type_axis_configs.size();
-  for (std::size_t type_config_index = 0; type_config_index < num_type_configs;
-       ++type_config_index)
+  for (std::size_t type_config_index = 0; type_config_index < num_type_configs; ++type_config_index)
   {
-    const auto &[type_config,
-                 axis_mask] = m_type_axis_configs[type_config_index];
+    const auto &[type_config, axis_mask] = m_type_axis_configs[type_config_index];
 
     if (!axis_mask)
     { // Don't generate inner vector if the type config is masked out.
@@ -261,10 +243,7 @@ void state_generator::add_states_for_device(
       config.append(non_type_config);
 
       // Create benchmark:
-      m_states.push_back(nvbench::state{m_benchmark,
-                                        std::move(config),
-                                        device,
-                                        type_config_index});
+      m_states.push_back(nvbench::state{m_benchmark, std::move(config), device, type_config_index});
     }
   }
 }
diff --git a/nvbench/detail/statistics.cuh b/nvbench/detail/statistics.cuh
index 957bca4..ddafc43 100644
--- a/nvbench/detail/statistics.cuh
+++ b/nvbench/detail/statistics.cuh
@@ -36,8 +36,7 @@ namespace nvbench::detail::statistics
  *
  * If the input has fewer than 5 sample, infinity is returned.
  */
-template <typename Iter,
-          typename ValueType = typename std::iterator_traits<Iter>::value_type>
+template <typename Iter, typename ValueType = typename std::iterator_traits<Iter>::value_type>
 ValueType standard_deviation(Iter first, Iter last, ValueType mean)
 {
   static_assert(std::is_floating_point_v<ValueType>);
diff --git a/nvbench/detail/throw.cuh b/nvbench/detail/throw.cuh
index ffbe5bb..e3bb9fd 100644
--- a/nvbench/detail/throw.cuh
+++ b/nvbench/detail/throw.cuh
@@ -21,17 +21,15 @@
 #include <fmt/format.h>
 #include <stdexcept>
 
-#define NVBENCH_THROW(exception_type, format_str, ...)                         \
-  throw exception_type(fmt::format("{}:{}: {}",                                \
-                                   __FILE__,                                   \
-                                   __LINE__,                                   \
-                                   fmt::format(format_str, __VA_ARGS__)))
+#define NVBENCH_THROW(exception_type, format_str, ...)                                             \
+  throw exception_type(                                                                            \
+    fmt::format("{}:{}: {}", __FILE__, __LINE__, fmt::format(format_str, __VA_ARGS__)))
 
-#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...)           \
-  do                                                                           \
-  {                                                                            \
-    if (condition)                                                             \
-    {                                                                          \
-      NVBENCH_THROW(exception_type, format_str, __VA_ARGS__);                  \
-    }                                                                          \
+#define NVBENCH_THROW_IF(condition, exception_type, format_str, ...)                               \
+  do                                                                                               \
+  {                                                                                                \
+    if (condition)                                                                                 \
+    {                                                                                              \
+      NVBENCH_THROW(exception_type, format_str, __VA_ARGS__);                                      \
+    }                                                                                              \
   } while (false)
diff --git a/nvbench/detail/transform_reduce.cuh b/nvbench/detail/transform_reduce.cuh
index 8bc5db6..5625358 100644
--- a/nvbench/detail/transform_reduce.cuh
+++ b/nvbench/detail/transform_reduce.cuh
@@ -27,10 +27,7 @@
 namespace nvbench::detail
 {
 
-template <typename InIterT,
-          typename InitValueT,
-          typename ReduceOp,
-          typename TransformOp>
+template <typename InIterT, typename InitValueT, typename ReduceOp, typename TransformOp>
 InitValueT transform_reduce(InIterT first,
                             InIterT last,
                             InitValueT init,
diff --git a/nvbench/detail/type_list_impl.cuh b/nvbench/detail/type_list_impl.cuh
index d2e498c..8a18aa3 100644
--- a/nvbench/detail/type_list_impl.cuh
+++ b/nvbench/detail/type_list_impl.cuh
@@ -20,12 +20,10 @@ namespace tl::detail
 {
 
 template <typename... Ts>
-auto size(nvbench::type_list<Ts...>)
-  -> std::integral_constant<std::size_t, sizeof...(Ts)>;
+auto size(nvbench::type_list<Ts...>) -> std::integral_constant<std::size_t, sizeof...(Ts)>;
 
 template <std::size_t I, typename... Ts>
-auto get(nvbench::type_list<Ts...>)
-  -> std::tuple_element_t<I, std::tuple<Ts...>>;
+auto get(nvbench::type_list<Ts...>) -> std::tuple_element_t<I, std::tuple<Ts...>>;
 
 template <typename... Ts, typename... Us>
 auto concat(nvbench::type_list<Ts...>, nvbench::type_list<Us...>)
@@ -44,9 +42,8 @@ struct prepend_each<T, nvbench::type_list<>>
 template <typename T, typename TL, typename... TLTail>
 struct prepend_each<T, nvbench::type_list<TL, TLTail...>>
 {
-  using cur = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
-  using next =
-    typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
+  using cur  = decltype(detail::concat(nvbench::type_list<T>{}, TL{}));
+  using next = typename detail::prepend_each<T, nvbench::type_list<TLTail...>>::type;
   using type = decltype(detail::concat(nvbench::type_list<cur>{}, next{}));
 };
 
@@ -71,23 +68,20 @@ struct cartesian_product<nvbench::type_list<nvbench::type_list<>, TLTail...>>
 template <typename T, typename... Ts>
 struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Ts...>>>
 {
-  using cur = nvbench::type_list<nvbench::type_list<T>>;
-  using next =
-    std::conditional_t<sizeof...(Ts) != 0,
-                       typename detail::cartesian_product<
-                         nvbench::type_list<nvbench::type_list<Ts...>>>::type,
-                       nvbench::type_list<>>;
+  using cur  = nvbench::type_list<nvbench::type_list<T>>;
+  using next = std::conditional_t<
+    sizeof...(Ts) != 0,
+    typename detail::cartesian_product<nvbench::type_list<nvbench::type_list<Ts...>>>::type,
+    nvbench::type_list<>>;
   using type = decltype(detail::concat(cur{}, next{}));
 };
 
 template <typename T, typename... Tail, typename TL, typename... TLTail>
-struct cartesian_product<
-  nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
+struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Tail...>, TL, TLTail...>>
 {
-  using tail_prod =
-    typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
-  using cur  = typename detail::prepend_each<T, tail_prod>::type;
-  using next = typename detail::cartesian_product<
+  using tail_prod = typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
+  using cur       = typename detail::prepend_each<T, tail_prod>::type;
+  using next      = typename detail::cartesian_product<
     nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
   using type = decltype(detail::concat(cur{}, next{}));
 };
diff --git a/nvbench/device_info.cu b/nvbench/device_info.cu
index 02c6b97..8edad22 100644
--- a/nvbench/device_info.cu
+++ b/nvbench/device_info.cu
@@ -65,17 +65,15 @@ void device_info::set_persistence_mode(bool state)
 #else  // NVBENCH_HAS_NVML
 try
 {
-  NVBENCH_NVML_CALL(nvmlDeviceSetPersistenceMode(
-    m_nvml_device,
-    state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
+  NVBENCH_NVML_CALL(
+    nvmlDeviceSetPersistenceMode(m_nvml_device,
+                                 state ? NVML_FEATURE_ENABLED : NVML_FEATURE_DISABLED));
 }
 catch (nvml::call_failed &e)
 {
   if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED)
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "Persistence mode is only supported on Linux.");
+    NVBENCH_THROW(std::runtime_error, "{}", "Persistence mode is only supported on Linux.");
   }
   else if (e.get_error_code() == NVML_ERROR_NO_PERMISSION)
   {
@@ -104,30 +102,26 @@ try
       break;
 
     case clock_rate::base:
-      NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(
-        m_nvml_device,
-        static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
-        static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
+      NVBENCH_NVML_CALL(
+        nvmlDeviceSetGpuLockedClocks(m_nvml_device,
+                                     static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP),
+                                     static_cast<unsigned int>(NVML_CLOCK_LIMIT_ID_TDP)));
       break;
 
     case clock_rate::maximum: {
-      const auto max_mhz = static_cast<unsigned int>(
-        this->get_sm_default_clock_rate() / (1000 * 1000));
-      NVBENCH_NVML_CALL(
-        nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
+      const auto max_mhz =
+        static_cast<unsigned int>(this->get_sm_default_clock_rate() / (1000 * 1000));
+      NVBENCH_NVML_CALL(nvmlDeviceSetGpuLockedClocks(m_nvml_device, max_mhz, max_mhz));
       break;
     }
 
     default:
-      NVBENCH_THROW(std::runtime_error,
-                    "Unrecognized clock rate: {}",
-                    static_cast<int>(rate));
+      NVBENCH_THROW(std::runtime_error, "Unrecognized clock rate: {}", static_cast<int>(rate));
   }
 }
 catch (nvml::call_failed &e)
 {
-  if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED &&
-      this->get_sm_version() < 700)
+  if (e.get_error_code() == NVML_ERROR_NOT_SUPPORTED && this->get_sm_version() < 700)
   {
     NVBENCH_THROW(std::runtime_error,
                   "GPU clock rates can only be modified for Volta and later. "
@@ -156,9 +150,7 @@ catch (nvml::call_failed &e)
 {
   if (!is_active())
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "{}",
-                  "get_context is called for inactive device");
+    NVBENCH_THROW(std::runtime_error, "{}", "get_context is called for inactive device");
   }
 
   CUcontext cu_context;
diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh
index 296a2c2..f0694df 100644
--- a/nvbench/device_info.cuh
+++ b/nvbench/device_info.cuh
@@ -54,10 +54,7 @@ struct device_info
   [[nodiscard]] int get_id() const { return m_id; }
 
   /// @return The name of the device.
-  [[nodiscard]] std::string_view get_name() const
-  {
-    return std::string_view(m_prop.name);
-  }
+  [[nodiscard]] std::string_view get_name() const { return std::string_view(m_prop.name); }
 
   [[nodiscard]] bool is_active() const
   {
@@ -83,7 +80,6 @@ struct device_info
   /// @note Requires root / admin privileges.
   void set_persistence_mode(bool state);
 
-
   /// Symbolic values for special clock rates
   enum class clock_rate
   {
@@ -101,10 +97,7 @@ struct device_info
   void lock_gpu_clocks(clock_rate rate);
 
   /// @return The SM version of the current device as (major*100) + (minor*10).
-  [[nodiscard]] int get_sm_version() const
-  {
-    return m_prop.major * 100 + m_prop.minor * 10;
-  }
+  [[nodiscard]] int get_sm_version() const { return m_prop.major * 100 + m_prop.minor * 10; }
 
   /// @return The PTX version of the current device, e.g. sm_80 returns 800.
   [[nodiscard]] __forceinline__ int get_ptx_version() const
@@ -119,46 +112,25 @@ struct device_info
   }
 
   /// @return The number of physical streaming multiprocessors on this device.
-  [[nodiscard]] int get_number_of_sms() const
-  {
-    return m_prop.multiProcessorCount;
-  }
+  [[nodiscard]] int get_number_of_sms() const { return m_prop.multiProcessorCount; }
 
   /// @return The maximum number of resident blocks per SM.
-  [[nodiscard]] int get_max_blocks_per_sm() const
-  {
-    return m_prop.maxBlocksPerMultiProcessor;
-  }
+  [[nodiscard]] int get_max_blocks_per_sm() const { return m_prop.maxBlocksPerMultiProcessor; }
 
   /// @return The maximum number of resident threads per SM.
-  [[nodiscard]] int get_max_threads_per_sm() const
-  {
-    return m_prop.maxThreadsPerMultiProcessor;
-  }
+  [[nodiscard]] int get_max_threads_per_sm() const { return m_prop.maxThreadsPerMultiProcessor; }
 
   /// @return The maximum number of threads per block.
-  [[nodiscard]] int get_max_threads_per_block() const
-  {
-    return m_prop.maxThreadsPerBlock;
-  }
+  [[nodiscard]] int get_max_threads_per_block() const { return m_prop.maxThreadsPerBlock; }
 
   /// @return The number of registers per SM.
-  [[nodiscard]] int get_registers_per_sm() const
-  {
-    return m_prop.regsPerMultiprocessor;
-  }
+  [[nodiscard]] int get_registers_per_sm() const { return m_prop.regsPerMultiprocessor; }
 
   /// @return The number of registers per block.
-  [[nodiscard]] int get_registers_per_block() const
-  {
-    return m_prop.regsPerBlock;
-  }
+  [[nodiscard]] int get_registers_per_block() const { return m_prop.regsPerBlock; }
 
   /// @return The total number of bytes available in global memory.
-  [[nodiscard]] std::size_t get_global_memory_size() const
-  {
-    return m_prop.totalGlobalMem;
-  }
+  [[nodiscard]] std::size_t get_global_memory_size() const { return m_prop.totalGlobalMem; }
 
   struct memory_info
   {
@@ -176,10 +148,7 @@ struct device_info
   }
 
   /// @return The width of the global memory bus in bits.
-  [[nodiscard]] int get_global_memory_bus_width() const
-  {
-    return m_prop.memoryBusWidth;
-  }
+  [[nodiscard]] int get_global_memory_bus_width() const { return m_prop.memoryBusWidth; }
 
   //// @return The global memory bus bandwidth in bytes/sec.
   [[nodiscard]] std::size_t get_global_memory_bus_bandwidth() const
@@ -201,10 +170,7 @@ struct device_info
   }
 
   /// @return The available amount of shared memory in bytes per block.
-  [[nodiscard]] std::size_t get_shared_memory_per_block() const
-  {
-    return m_prop.sharedMemPerBlock;
-  }
+  [[nodiscard]] std::size_t get_shared_memory_per_block() const { return m_prop.sharedMemPerBlock; }
 
   /// @return True if ECC is enabled on this device.
   [[nodiscard]] bool get_ecc_state() const { return m_prop.ECCEnabled; }
@@ -224,23 +190,11 @@ struct device_info
 #endif
 
   /// @return A cached copy of the device's cudaDeviceProp.
-  [[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const
-  {
-    return m_prop;
-  }
+  [[nodiscard]] const cudaDeviceProp &get_cuda_device_prop() const { return m_prop; }
 
-  [[nodiscard]] bool operator<(const device_info &o) const
-  {
-    return m_id < o.m_id;
-  }
-  [[nodiscard]] bool operator==(const device_info &o) const
-  {
-    return m_id == o.m_id;
-  }
-  [[nodiscard]] bool operator!=(const device_info &o) const
-  {
-    return m_id != o.m_id;
-  }
+  [[nodiscard]] bool operator<(const device_info &o) const { return m_id < o.m_id; }
+  [[nodiscard]] bool operator==(const device_info &o) const { return m_id == o.m_id; }
+  [[nodiscard]] bool operator!=(const device_info &o) const { return m_id != o.m_id; }
 
 private:
   int m_id;
@@ -267,11 +221,10 @@ try
 {
   nvbench::detail::device_scope _{dev_id};
   cudaFuncAttributes attr{};
-  NVBENCH_CUDA_CALL(
-    cudaFuncGetAttributes(&attr, ((const void*)nvbench::detail::noop_kernel_ptr) ));
+  NVBENCH_CUDA_CALL(cudaFuncGetAttributes(&attr, ((const void *)nvbench::detail::noop_kernel_ptr)));
   return attr.ptxVersion * 10;
 }
-catch(...)
+catch (...)
 { // Fail gracefully when no appropriate PTX is found for this device.
   return -1;
 }
diff --git a/nvbench/device_manager.cuh b/nvbench/device_manager.cuh
index 9490775..8e6fe9d 100644
--- a/nvbench/device_manager.cuh
+++ b/nvbench/device_manager.cuh
@@ -40,10 +40,7 @@ struct device_manager
   /**
    * @return The total number of detected CUDA devices.
    */
-  [[nodiscard]] int get_number_of_devices() const
-  {
-    return static_cast<int>(m_devices.size());
-  }
+  [[nodiscard]] int get_number_of_devices() const { return static_cast<int>(m_devices.size()); }
 
   /**
    * @return The number of devices actually used by all benchmarks.
@@ -57,39 +54,27 @@ struct device_manager
   /**
    * @return The device_info object corresponding to `id`.
    */
-  [[nodiscard]] const nvbench::device_info &get_device(int id)
-  {
-    return m_devices.at(id);
-  }
+  [[nodiscard]] const nvbench::device_info &get_device(int id) { return m_devices.at(id); }
 
   /**
    * @return A vector containing device_info objects for all detected CUDA
    * devices.
    */
-  [[nodiscard]] const device_info_vector &get_devices() const
-  {
-    return m_devices;
-  }
+  [[nodiscard]] const device_info_vector &get_devices() const { return m_devices; }
 
   /**
    * @return A vector containing device_info objects for devices that are
    * actively used by all benchmarks.
    * @note This is only valid after nvbench::option_parser::parse executes.
    */
-  [[nodiscard]] const device_info_vector &get_used_devices() const
-  {
-    return m_used_devices;
-  }
+  [[nodiscard]] const device_info_vector &get_used_devices() const { return m_used_devices; }
 
 private:
   device_manager();
 
   friend struct option_parser;
 
-  void set_used_devices(device_info_vector devices)
-  {
-    m_used_devices = std::move(devices);
-  }
+  void set_used_devices(device_info_vector devices) { m_used_devices = std::move(devices); }
 
   device_info_vector m_devices;
   device_info_vector m_used_devices;
diff --git a/nvbench/enum_type_list.cuh b/nvbench/enum_type_list.cuh
index 6ec529b..614057f 100644
--- a/nvbench/enum_type_list.cuh
+++ b/nvbench/enum_type_list.cuh
@@ -64,10 +64,7 @@ struct type_strings<nvbench::enum_type<Value, T>>
     return std::to_string(Value);
   }
 
-  static std::string description()
-  {
-    return nvbench::demangle<nvbench::enum_type<Value, T>>();
-  }
+  static std::string description() { return nvbench::demangle<nvbench::enum_type<Value, T>>(); }
 };
 
 } // namespace nvbench
@@ -86,15 +83,13 @@ struct type_strings<nvbench::enum_type<Value, T>>
  * \relatesalso enum_type_list
  * \relatesalso nvbench::enum_type_list
  */
-#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T,                                   \
-                                          input_generator,                     \
-                                          description_generator)               \
-  namespace nvbench                                                            \
-  {                                                                            \
-  template <T Value>                                                           \
-  struct type_strings<enum_type<Value, T>>                                     \
-  {                                                                            \
-    static std::string input_string() { return input_generator(Value); }       \
-    static std::string description() { return description_generator(Value); }  \
-  };                                                                           \
+#define NVBENCH_DECLARE_ENUM_TYPE_STRINGS(T, input_generator, description_generator)               \
+  namespace nvbench                                                                                \
+  {                                                                                                \
+  template <T Value>                                                                               \
+  struct type_strings<enum_type<Value, T>>                                                         \
+  {                                                                                                \
+    static std::string input_string() { return input_generator(Value); }                           \
+    static std::string description() { return description_generator(Value); }                      \
+  };                                                                                               \
   }
diff --git a/nvbench/exec_tag.cuh b/nvbench/exec_tag.cuh
index b49ed36..466a441 100644
--- a/nvbench/exec_tag.cuh
+++ b/nvbench/exec_tag.cuh
@@ -31,16 +31,16 @@ enum class exec_flag
   none = 0x0,
 
   // Modifiers:
-  timer    = 0x01, // KernelLauncher uses manual timing
-  no_block = 0x02, // Disables use of `blocking_kernel`.
-  sync     = 0x04, // KernelLauncher has indicated that it will sync
-  run_once = 0x08, // Only run the benchmark once (for profiling).
+  timer         = 0x01, // KernelLauncher uses manual timing
+  no_block      = 0x02, // Disables use of `blocking_kernel`.
+  sync          = 0x04, // KernelLauncher has indicated that it will sync
+  run_once      = 0x08, // Only run the benchmark once (for profiling).
   modifier_mask = timer | no_block | sync | run_once,
 
   // Measurement types:
-  cold = 0x0100, // measure_hot
-  hot  = 0x0200, // measure_cold
-  measure_mask  = cold | hot
+  cold         = 0x0100, // measure_cold
+  hot          = 0x0200, // measure_hot
+  measure_mask = cold | hot
 };
 
 } // namespace nvbench::detail
@@ -120,7 +120,6 @@ constexpr inline auto timer = nvbench::exec_tag::impl::timer;
 
 /// Modifier used to indicate that the KernelGenerator will perform CUDA
 /// synchronizations. Without this flag such benchmarks will deadlock.
-constexpr inline auto sync = nvbench::exec_tag::impl::no_block |
-                             nvbench::exec_tag::impl::sync;
+constexpr inline auto sync = nvbench::exec_tag::impl::no_block | nvbench::exec_tag::impl::sync;
 
 } // namespace nvbench::exec_tag
diff --git a/nvbench/flags.cuh b/nvbench/flags.cuh
index 30ba84e..cefefa3 100644
--- a/nvbench/flags.cuh
+++ b/nvbench/flags.cuh
@@ -20,24 +20,24 @@
 
 #include <type_traits>
 
-#define NVBENCH_DECLARE_FLAGS(T)                                               \
-  constexpr inline T operator|(T v1, T v2)                                     \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2));          \
-  }                                                                            \
-  constexpr inline T operator&(T v1, T v2)                                     \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2));          \
-  }                                                                            \
-  constexpr inline T operator^(T v1, T v2)                                     \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2));          \
-  }                                                                            \
-  constexpr inline T operator~(T v1)                                           \
-  {                                                                            \
-    using UT = std::underlying_type_t<T>;                                      \
-    return static_cast<T>(~static_cast<UT>(v1));                               \
+#define NVBENCH_DECLARE_FLAGS(T)                                                                   \
+  constexpr inline T operator|(T v1, T v2)                                                         \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(static_cast<UT>(v1) | static_cast<UT>(v2));                              \
+  }                                                                                                \
+  constexpr inline T operator&(T v1, T v2)                                                         \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(static_cast<UT>(v1) & static_cast<UT>(v2));                              \
+  }                                                                                                \
+  constexpr inline T operator^(T v1, T v2)                                                         \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(static_cast<UT>(v1) ^ static_cast<UT>(v2));                              \
+  }                                                                                                \
+  constexpr inline T operator~(T v1)                                                               \
+  {                                                                                                \
+    using UT = std::underlying_type_t<T>;                                                          \
+    return static_cast<T>(~static_cast<UT>(v1));                                                   \
   }
diff --git a/nvbench/float64_axis.cuh b/nvbench/float64_axis.cuh
index 0d60651..947b581 100644
--- a/nvbench/float64_axis.cuh
+++ b/nvbench/float64_axis.cuh
@@ -36,20 +36,11 @@ struct float64_axis final : public axis_base
 
   ~float64_axis() final;
 
-  void set_inputs(std::vector<nvbench::float64_t> inputs)
-  {
-    m_values = std::move(inputs);
-  }
-  [[nodiscard]] nvbench::float64_t get_value(std::size_t i) const
-  {
-    return m_values[i];
-  }
+  void set_inputs(std::vector<nvbench::float64_t> inputs) { m_values = std::move(inputs); }
+  [[nodiscard]] nvbench::float64_t get_value(std::size_t i) const { return m_values[i]; }
 
 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<float64_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const { return std::make_unique<float64_axis>(*this); }
   std::size_t do_get_size() const final { return m_values.size(); }
   std::string do_get_input_string(std::size_t i) const final;
   std::string do_get_description(std::size_t i) const final;
diff --git a/nvbench/git_revision.cuh b/nvbench/git_revision.cuh
index 2b29e92..50fc9da 100644
--- a/nvbench/git_revision.cuh
+++ b/nvbench/git_revision.cuh
@@ -1,20 +1,20 @@
 /*
-*  Copyright 2021 NVIDIA Corporation
-*
-*  Licensed under the Apache License, Version 2.0 with the LLVM exception
-*  (the "License"); you may not use this file except in compliance with
-*  the License.
-*
-*  You may obtain a copy of the License at
-*
-*      http://llvm.org/foundation/relicensing/LICENSE.txt
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*/
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
 
 #pragma once
 
@@ -25,7 +25,5 @@
 #define NVBENCH_GIT_SHA1 NVBench_GIT_SHA1
 #define NVBENCH_GIT_VERSION NVBench_GIT_VERSION
 #ifdef NVBench_GIT_IS_DIRTY
-#  define NVBENCH_GIT_IS_DIRTY
+#define NVBENCH_GIT_IS_DIRTY
 #endif
-
-
diff --git a/nvbench/int64_axis.cuh b/nvbench/int64_axis.cuh
index a6cec2e..baa7641 100644
--- a/nvbench/int64_axis.cuh
+++ b/nvbench/int64_axis.cuh
@@ -58,28 +58,18 @@ struct int64_axis final : public axis_base
     return static_cast<bool>(m_flags & int64_axis_flags::power_of_two);
   }
 
-  void set_inputs(std::vector<int64_t> inputs,
-                  int64_axis_flags flags = int64_axis_flags::none);
+  void set_inputs(std::vector<int64_t> inputs, int64_axis_flags flags = int64_axis_flags::none);
 
-  [[nodiscard]] const std::vector<int64_t> &get_inputs() const
-  {
-    return m_inputs;
-  };
+  [[nodiscard]] const std::vector<int64_t> &get_inputs() const { return m_inputs; };
 
   [[nodiscard]] int64_t get_value(std::size_t i) const { return m_values[i]; };
 
-  [[nodiscard]] const std::vector<int64_t> &get_values() const
-  {
-    return m_values;
-  };
+  [[nodiscard]] const std::vector<int64_t> &get_values() const { return m_values; };
 
   int64_axis_flags get_flags() const { return m_flags; }
 
   // Helper functions for pow2 conversions:
-  static nvbench::int64_t compute_pow2(nvbench::int64_t exponent)
-  {
-    return 1ll << exponent;
-  }
+  static nvbench::int64_t compute_pow2(nvbench::int64_t exponent) { return 1ll << exponent; }
 
   // UB if value < 0.
   static nvbench::int64_t compute_log2(nvbench::int64_t value)
@@ -95,10 +85,7 @@ struct int64_axis final : public axis_base
   };
 
 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<int64_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const { return std::make_unique<int64_axis>(*this); }
   std::size_t do_get_size() const final { return m_inputs.size(); }
   std::string do_get_input_string(std::size_t) const final;
   std::string do_get_description(std::size_t) const final;
diff --git a/nvbench/int64_axis.cxx b/nvbench/int64_axis.cxx
index 24ff913..599c388 100644
--- a/nvbench/int64_axis.cxx
+++ b/nvbench/int64_axis.cxx
@@ -66,9 +66,8 @@ std::string int64_axis::do_get_input_string(std::size_t i) const
 
 std::string int64_axis::do_get_description(std::size_t i) const
 {
-  return this->is_power_of_two()
-           ? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
-           : std::string{};
+  return this->is_power_of_two() ? fmt::format("2^{} = {}", m_inputs[i], m_values[i])
+                                 : std::string{};
 }
 
 std::string_view int64_axis::do_get_flags_as_string() const
diff --git a/nvbench/internal/markdown_table.cuh b/nvbench/internal/markdown_table.cuh
index bb721c6..518f57b 100644
--- a/nvbench/internal/markdown_table.cuh
+++ b/nvbench/internal/markdown_table.cuh
@@ -85,8 +85,7 @@ private:
                             " {:^{}} ",
                             col.header,
                             col.max_width);
-      iter =
-        fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+      iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
     }
     return fmt::format_to(iter, "\n");
   }
@@ -102,8 +101,7 @@ private:
                             "{:-^{}}",
                             "",
                             col.max_width + 2);
-      iter =
-        fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+      iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
     }
     return fmt::format_to(iter, "\n");
   }
@@ -116,8 +114,7 @@ private:
 
     for (std::size_t row = 0; row < m_num_rows; ++row)
     {
-      iter =
-        fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+      iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
       for (const column &col : m_columns)
       {
         iter = fmt::format_to(iter,
@@ -125,8 +122,7 @@ private:
                               " {:>{}} ",
                               col.rows[row],
                               col.max_width);
-        iter =
-          fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
+        iter = fmt::format_to(iter, m_color ? (m_bg | m_vdiv_fg) : m_no_style, "|");
       } // cols
 
       iter = fmt::format_to(iter, "\n");
diff --git a/nvbench/internal/nvml.cuh b/nvbench/internal/nvml.cuh
index 497f31a..963c381 100644
--- a/nvbench/internal/nvml.cuh
+++ b/nvbench/internal/nvml.cuh
@@ -74,10 +74,7 @@ struct call_failed : error
 
   [[nodiscard]] nvmlReturn_t get_error_code() const { return m_error_code; }
 
-  [[nodiscard]] const std::string &get_error_string() const
-  {
-    return m_error_string;
-  }
+  [[nodiscard]] const std::string &get_error_string() const { return m_error_string; }
 
 private:
   nvmlReturn_t m_error_code;
@@ -90,30 +87,26 @@ private:
 
 #ifdef NVBENCH_HAS_NVML
 
-#define NVBENCH_NVML_CALL(call)                                                \
-  do                                                                           \
-  {                                                                            \
-    const auto _rr = call;                                                     \
-    if (_rr != NVML_SUCCESS)                                                   \
-    {                                                                          \
-      throw nvbench::nvml::call_failed(__FILE__,                               \
-                                       __LINE__,                               \
-                                       #call,                                  \
-                                       _rr,                                    \
-                                       nvmlErrorString(_rr));                  \
-    }                                                                          \
+#define NVBENCH_NVML_CALL(call)                                                                    \
+  do                                                                                               \
+  {                                                                                                \
+    const auto _rr = call;                                                                         \
+    if (_rr != NVML_SUCCESS)                                                                       \
+    {                                                                                              \
+      throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, nvmlErrorString(_rr));      \
+    }                                                                                              \
   } while (false)
 
 // Same as above, but used for nvmlInit(), where a failure means that
 // nvmlErrorString is not available.
-#define NVBENCH_NVML_CALL_NO_API(call)                                         \
-  do                                                                           \
-  {                                                                            \
-    const auto _rr = call;                                                     \
-    if (_rr != NVML_SUCCESS)                                                   \
-    {                                                                          \
-      throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, "");    \
-    }                                                                          \
+#define NVBENCH_NVML_CALL_NO_API(call)                                                             \
+  do                                                                                               \
+  {                                                                                                \
+    const auto _rr = call;                                                                         \
+    if (_rr != NVML_SUCCESS)                                                                       \
+    {                                                                                              \
+      throw nvbench::nvml::call_failed(__FILE__, __LINE__, #call, _rr, "");                        \
+    }                                                                                              \
   } while (false)
 
 #endif // NVBENCH_HAS_NVML
diff --git a/nvbench/internal/table_builder.cuh b/nvbench/internal/table_builder.cuh
index 81fca0a..ae029b4 100644
--- a/nvbench/internal/table_builder.cuh
+++ b/nvbench/internal/table_builder.cuh
@@ -47,18 +47,14 @@ struct table_builder
                 const std::string &header,
                 std::string value)
   {
-    auto iter = std::find_if(m_columns.begin(),
-                             m_columns.end(),
-                             [&column_key](const column &col) {
-                               return col.key == column_key;
-                             });
+    auto iter = std::find_if(m_columns.begin(), m_columns.end(), [&column_key](const column &col) {
+      return col.key == column_key;
+    });
 
     auto &col = iter == m_columns.end()
-                ? m_columns.emplace_back(column{column_key,
-                                                header,
-                                                std::vector<std::string>{},
-                                                header.size()})
-                : *iter;
+                  ? m_columns.emplace_back(
+                      column{column_key, header, std::vector<std::string>{}, header.size()})
+                  : *iter;
 
     col.max_width = std::max(col.max_width, value.size());
     if (col.rows.size() <= row)
@@ -76,11 +72,9 @@ struct table_builder
       std::size_t{},
       [](const auto &a, const auto &b) { return a > b ? a : b; },
       [](const column &col) { return col.rows.size(); });
-    std::for_each(m_columns.begin(),
-                  m_columns.end(),
-                  [num_rows = m_num_rows](column &col) {
-                    col.rows.resize(num_rows);
-                  });
+    std::for_each(m_columns.begin(), m_columns.end(), [num_rows = m_num_rows](column &col) {
+      col.rows.resize(num_rows);
+    });
   }
 };
 
diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index 7c99f2a..b78e5be 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -126,11 +126,10 @@ std::string json_printer::version_t::get_string() const
   return fmt::format("{}.{}.{}", this->major, this->minor, this->patch);
 }
 
-void json_printer::do_process_bulk_data_float64(
-  state &state,
-  const std::string &tag,
-  const std::string &hint,
-  const std::vector<nvbench::float64_t> &data)
+void json_printer::do_process_bulk_data_float64(state &state,
+                                                const std::string &tag,
+                                                const std::string &hint,
+                                                const std::vector<nvbench::float64_t> &data)
 {
   printer_base::do_process_bulk_data_float64(state, tag, hint, data);
 
@@ -157,16 +156,12 @@ void json_printer::do_process_bulk_data_float64(
       {
         if (!fs::create_directory(result_path))
         {
-          NVBENCH_THROW(std::runtime_error,
-                        "{}",
-                        "Failed to create result directory '{}'.");
+          NVBENCH_THROW(std::runtime_error, "{}", "Failed to create result directory '{}'.");
         }
       }
       else if (!fs::is_directory(result_path))
       {
-        NVBENCH_THROW(std::runtime_error,
-                      "{}",
-                      "'{}' exists and is not a directory.");
+        NVBENCH_THROW(std::runtime_error, "{}", "'{}' exists and is not a directory.");
       }
 
       const auto file_id = m_num_jsonbin_files++;
@@ -197,16 +192,12 @@ void json_printer::do_process_bulk_data_float64(
     }
     catch (std::exception &e)
     {
-      if (auto printer_opt_ref = state.get_benchmark().get_printer();
-          printer_opt_ref.has_value())
+      if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
       {
         auto &printer = printer_opt_ref.value().get();
-        printer.log(nvbench::log_level::warn,
-                    fmt::format("Error writing {} ({}) to {}: {}",
-                                tag,
-                                hint,
-                                result_path.string(),
-                                e.what()));
+        printer.log(
+          nvbench::log_level::warn,
+          fmt::format("Error writing {} ({}) to {}: {}", tag, hint, result_path.string(), e.what()));
       }
     } // end catch
 
@@ -221,14 +212,12 @@ void json_printer::do_process_bulk_data_float64(
     summ.set_string("hide", "Not needed in table.");
 
     timer.stop();
-    if (auto printer_opt_ref = state.get_benchmark().get_printer();
-        printer_opt_ref.has_value())
+    if (auto printer_opt_ref = state.get_benchmark().get_printer(); printer_opt_ref.has_value())
     {
       auto &printer = printer_opt_ref.value().get();
-      printer.log(nvbench::log_level::info,
-                  fmt::format("Wrote '{}' in {:>6.3f}ms",
-                              result_path.string(),
-                              timer.get_duration() * 1000));
+      printer.log(
+        nvbench::log_level::info,
+        fmt::format("Wrote '{}' in {:>6.3f}ms", result_path.string(), timer.get_duration() * 1000));
     }
   } // end hint == sample_times
 }
@@ -304,15 +293,12 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
       device["global_memory_size"]    = dev_info.get_global_memory_size();
       device["global_memory_bus_peak_clock_rate"] =
         dev_info.get_global_memory_bus_peak_clock_rate();
-      device["global_memory_bus_width"] =
-        dev_info.get_global_memory_bus_width();
-      device["global_memory_bus_bandwidth"] =
-        dev_info.get_global_memory_bus_bandwidth();
-      device["l2_cache_size"]        = dev_info.get_l2_cache_size();
-      device["shared_memory_per_sm"] = dev_info.get_shared_memory_per_sm();
-      device["shared_memory_per_block"] =
-        dev_info.get_shared_memory_per_block();
-      device["ecc_state"] = dev_info.get_ecc_state();
+      device["global_memory_bus_width"]     = dev_info.get_global_memory_bus_width();
+      device["global_memory_bus_bandwidth"] = dev_info.get_global_memory_bus_bandwidth();
+      device["l2_cache_size"]               = dev_info.get_l2_cache_size();
+      device["shared_memory_per_sm"]        = dev_info.get_shared_memory_per_sm();
+      device["shared_memory_per_block"]     = dev_info.get_shared_memory_per_block();
+      device["ecc_state"]                   = dev_info.get_ecc_state();
     }
   } // "devices"
 
@@ -358,23 +344,19 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
           switch (axis_ptr->get_type())
           {
             case nvbench::axis_type::type:
-              value["is_active"] =
-                static_cast<type_axis &>(*axis_ptr).get_is_active(i);
+              value["is_active"] = static_cast<type_axis &>(*axis_ptr).get_is_active(i);
               break;
 
             case nvbench::axis_type::int64:
-              value["value"] =
-                static_cast<int64_axis &>(*axis_ptr).get_value(i);
+              value["value"] = static_cast<int64_axis &>(*axis_ptr).get_value(i);
               break;
 
             case nvbench::axis_type::float64:
-              value["value"] =
-                static_cast<float64_axis &>(*axis_ptr).get_value(i);
+              value["value"] = static_cast<float64_axis &>(*axis_ptr).get_value(i);
               break;
 
             case nvbench::axis_type::string:
-              value["value"] =
-                static_cast<string_axis &>(*axis_ptr).get_value(i);
+              value["value"] = static_cast<string_axis &>(*axis_ptr).get_value(i);
               break;
             default:
               break;
diff --git a/nvbench/json_printer.cuh b/nvbench/json_printer.cuh
index a58448f..394efb0 100644
--- a/nvbench/json_printer.cuh
+++ b/nvbench/json_printer.cuh
@@ -38,9 +38,7 @@ struct json_printer : nvbench::printer_base
 {
   using printer_base::printer_base;
 
-  json_printer(std::ostream &stream,
-               std::string stream_name,
-               bool enable_binary_output)
+  json_printer(std::ostream &stream, std::string stream_name, bool enable_binary_output)
       : printer_base(stream, std::move(stream_name))
       , m_enable_binary_output{enable_binary_output}
   {}
@@ -59,23 +57,16 @@ struct json_printer : nvbench::printer_base
 
   [[nodiscard]] static version_t get_json_file_version();
 
-  [[nodiscard]] bool get_enable_binary_output() const
-  {
-    return m_enable_binary_output;
-  }
+  [[nodiscard]] bool get_enable_binary_output() const { return m_enable_binary_output; }
   void set_enable_binary_output(bool b) { m_enable_binary_output = b; }
 
 protected:
   // Virtual API from printer_base:
-  void do_log_argv(const std::vector<std::string>& argv) override
-  {
-    m_argv = argv;
-  }
-  void do_process_bulk_data_float64(
-    nvbench::state &state,
-    const std::string &tag,
-    const std::string &hint,
-    const std::vector<nvbench::float64_t> &data) override;
+  void do_log_argv(const std::vector<std::string> &argv) override { m_argv = argv; }
+  void do_process_bulk_data_float64(nvbench::state &state,
+                                    const std::string &tag,
+                                    const std::string &hint,
+                                    const std::vector<nvbench::float64_t> &data) override;
   void do_print_benchmark_results(const benchmark_vector &benches) override;
 
   bool m_enable_binary_output{false};
diff --git a/nvbench/launch.cuh b/nvbench/launch.cuh
index 4b973f3..0cb4961 100644
--- a/nvbench/launch.cuh
+++ b/nvbench/launch.cuh
@@ -42,20 +42,17 @@ struct launch
   {}
 
   // move-only
-  launch(const launch &) = delete;
-  launch(launch &&)      = default;
+  launch(const launch &)            = delete;
+  launch(launch &&)                 = default;
   launch &operator=(const launch &) = delete;
-  launch &operator=(launch &&) = default;
+  launch &operator=(launch &&)      = default;
 
   /**
    * @return a CUDA stream that all kernels and other stream-ordered CUDA work
    * must use. This stream can be changed by the `KernelGenerator` using the
    * `nvbench::state::set_cuda_stream` method.
    */
-  __forceinline__ const nvbench::cuda_stream &get_stream() const
-  {
-    return m_stream;
-  };
+  __forceinline__ const nvbench::cuda_stream &get_stream() const { return m_stream; };
 
 private:
   // The stream is owned by the `nvbench::state` associated with this launch.
diff --git a/nvbench/main.cuh b/nvbench/main.cuh
index 4c1588c..0ba82d7 100644
--- a/nvbench/main.cuh
+++ b/nvbench/main.cuh
@@ -27,23 +27,23 @@
 
 #include <iostream>
 
-#define NVBENCH_MAIN                                                           \
-  int main(int argc, char const *const *argv)                                  \
-  try                                                                          \
-  {                                                                            \
-    NVBENCH_MAIN_BODY(argc, argv);                                             \
-    NVBENCH_CUDA_CALL(cudaDeviceReset());                                      \
-    return 0;                                                                  \
-  }                                                                            \
-  catch (std::exception & e)                                                   \
-  {                                                                            \
-    std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";    \
-    return 1;                                                                  \
-  }                                                                            \
-  catch (...)                                                                  \
-  {                                                                            \
-    std::cerr << "\nNVBench encountered an unknown error.\n";                  \
-    return 1;                                                                  \
+#define NVBENCH_MAIN                                                                               \
+  int main(int argc, char const *const *argv)                                                      \
+  try                                                                                              \
+  {                                                                                                \
+    NVBENCH_MAIN_BODY(argc, argv);                                                                 \
+    NVBENCH_CUDA_CALL(cudaDeviceReset());                                                          \
+    return 0;                                                                                      \
+  }                                                                                                \
+  catch (std::exception & e)                                                                       \
+  {                                                                                                \
+    std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";                        \
+    return 1;                                                                                      \
+  }                                                                                                \
+  catch (...)                                                                                      \
+  {                                                                                                \
+    std::cerr << "\nNVBench encountered an unknown error.\n";                                      \
+    return 1;                                                                                      \
   }
 
 #ifdef NVBENCH_HAS_CUPTI
@@ -54,35 +54,35 @@
 // clang-format on
 #endif
 
-#define NVBENCH_MAIN_PARSE(argc, argv)                                         \
-  nvbench::option_parser parser;                                               \
+#define NVBENCH_MAIN_PARSE(argc, argv)                                                             \
+  nvbench::option_parser parser;                                                                   \
   parser.parse(argc, argv)
 
-#define NVBENCH_MAIN_BODY(argc, argv)                                          \
-  do                                                                           \
-  {                                                                            \
-    NVBENCH_INITIALIZE_DRIVER_API;                                             \
-    NVBENCH_MAIN_PARSE(argc, argv);                                            \
-    auto &printer = parser.get_printer();                                      \
-                                                                               \
-    printer.print_device_info();                                               \
-    printer.print_log_preamble();                                              \
-    auto &benchmarks = parser.get_benchmarks();                                \
-                                                                               \
-    std::size_t total_states = 0;                                              \
-    for (auto &bench_ptr : benchmarks)                                         \
-    {                                                                          \
-      total_states += bench_ptr->get_config_count();                           \
-    }                                                                          \
-    printer.set_total_state_count(total_states);                               \
-                                                                               \
-    printer.set_completed_state_count(0);                                      \
-    for (auto &bench_ptr : benchmarks)                                         \
-    {                                                                          \
-      bench_ptr->set_printer(printer);                                         \
-      bench_ptr->run();                                                        \
-      bench_ptr->clear_printer();                                              \
-    }                                                                          \
-    printer.print_log_epilogue();                                              \
-    printer.print_benchmark_results(benchmarks);                               \
+#define NVBENCH_MAIN_BODY(argc, argv)                                                              \
+  do                                                                                               \
+  {                                                                                                \
+    NVBENCH_INITIALIZE_DRIVER_API;                                                                 \
+    NVBENCH_MAIN_PARSE(argc, argv);                                                                \
+    auto &printer = parser.get_printer();                                                          \
+                                                                                                   \
+    printer.print_device_info();                                                                   \
+    printer.print_log_preamble();                                                                  \
+    auto &benchmarks = parser.get_benchmarks();                                                    \
+                                                                                                   \
+    std::size_t total_states = 0;                                                                  \
+    for (auto &bench_ptr : benchmarks)                                                             \
+    {                                                                                              \
+      total_states += bench_ptr->get_config_count();                                               \
+    }                                                                                              \
+    printer.set_total_state_count(total_states);                                                   \
+                                                                                                   \
+    printer.set_completed_state_count(0);                                                          \
+    for (auto &bench_ptr : benchmarks)                                                             \
+    {                                                                                              \
+      bench_ptr->set_printer(printer);                                                             \
+      bench_ptr->run();                                                                            \
+      bench_ptr->clear_printer();                                                                  \
+    }                                                                                              \
+    printer.print_log_epilogue();                                                                  \
+    printer.print_benchmark_results(benchmarks);                                                   \
   } while (false)
diff --git a/nvbench/markdown_printer.cu b/nvbench/markdown_printer.cu
index 276ca86..94151d4 100644
--- a/nvbench/markdown_printer.cu
+++ b/nvbench/markdown_printer.cu
@@ -44,9 +44,8 @@ void markdown_printer::do_print_device_info()
   fmt::format_to(buffer, "# Devices\n\n");
 
   const auto &device_mgr = nvbench::device_manager::get();
-  const auto &devices    = device_mgr.get_number_of_used_devices() > 0
-                             ? device_mgr.get_used_devices()
-                             : device_mgr.get_devices();
+  const auto &devices = device_mgr.get_number_of_used_devices() > 0 ? device_mgr.get_used_devices()
+                                                                    : device_mgr.get_devices();
   for (const auto &device : devices)
   {
     const auto [gmem_free, gmem_used] = device.get_global_memory_usage();
@@ -64,22 +63,17 @@ void markdown_printer::do_print_device_info()
                    "* Global Memory: {} MiB Free / {} MiB Total\n",
                    gmem_free / 1024 / 1024,
                    gmem_used / 1024 / 1024);
-    fmt::format_to(
-      buffer,
-      "* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
-      device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
-      device.get_global_memory_bus_width(),
-      device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
+    fmt::format_to(buffer,
+                   "* Global Memory Bus Peak: {} GB/sec ({}-bit DDR @{}MHz)\n",
+                   device.get_global_memory_bus_bandwidth() / 1000 / 1000 / 1000,
+                   device.get_global_memory_bus_width(),
+                   device.get_global_memory_bus_peak_clock_rate() / 1000 / 1000);
     fmt::format_to(buffer,
                    "* Max Shared Memory: {} KiB/SM, {} KiB/Block\n",
                    device.get_shared_memory_per_sm() / 1024,
                    device.get_shared_memory_per_block() / 1024);
-    fmt::format_to(buffer,
-                   "* L2 Cache Size: {} KiB\n",
-                   device.get_l2_cache_size() / 1024);
-    fmt::format_to(buffer,
-                   "* Maximum Active Blocks: {}/SM\n",
-                   device.get_max_blocks_per_sm());
+    fmt::format_to(buffer, "* L2 Cache Size: {} KiB\n", device.get_l2_cache_size() / 1024);
+    fmt::format_to(buffer, "* Maximum Active Blocks: {}/SM\n", device.get_max_blocks_per_sm());
     fmt::format_to(buffer,
                    "* Maximum Active Threads: {}/SM, {}/Block\n",
                    device.get_max_threads_per_sm(),
@@ -88,18 +82,13 @@ void markdown_printer::do_print_device_info()
                    "* Available Registers: {}/SM, {}/Block\n",
                    device.get_registers_per_sm(),
                    device.get_registers_per_block());
-    fmt::format_to(buffer,
-                   "* ECC Enabled: {}\n",
-                   device.get_ecc_state() ? "Yes" : "No");
+    fmt::format_to(buffer, "* ECC Enabled: {}\n", device.get_ecc_state() ? "Yes" : "No");
     fmt::format_to(buffer, "\n");
   }
   m_ostream << fmt::to_string(buffer);
 }
 
-void markdown_printer::do_print_log_preamble()
-{
-  m_ostream << "# Log\n\n```\n";
-}
+void markdown_printer::do_print_log_preamble() { m_ostream << "# Log\n\n```\n"; }
 
 void markdown_printer::do_print_log_epilogue() { m_ostream << "```\n\n"; }
 
@@ -147,8 +136,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
 {
   if (m_total_state_count == 0)
   { // No progress info
-    this->log(nvbench::log_level::run,
-              exec_state.get_short_description(m_color));
+    this->log(nvbench::log_level::run, exec_state.get_short_description(m_color));
   }
   else
   { // Add progress
@@ -160,8 +148,7 @@ void markdown_printer::do_log_run_state(const nvbench::state &exec_state)
   }
 }
 
-void markdown_printer::do_print_benchmark_list(
-  const printer_base::benchmark_vector &benches)
+void markdown_printer::do_print_benchmark_list(const printer_base::benchmark_vector &benches)
 {
   if (benches.empty())
   {
@@ -204,10 +191,7 @@ void markdown_printer::do_print_benchmark_list(
         {
           desc = fmt::format(" ({})", desc);
         }
-        fmt::format_to(buffer,
-                       "  * `{}`{}\n",
-                       axis_ptr->get_input_string(i),
-                       desc);
+        fmt::format_to(buffer, "  * `{}`{}\n", axis_ptr->get_input_string(i), desc);
       } // end foreach value
     }   // end foreach axis
     fmt::format_to(buffer, "\n");
@@ -216,8 +200,7 @@ void markdown_printer::do_print_benchmark_list(
   m_ostream << fmt::to_string(buffer);
 }
 
-void markdown_printer::do_print_benchmark_results(
-  const printer_base::benchmark_vector &benches)
+void markdown_printer::do_print_benchmark_results(const printer_base::benchmark_vector &benches)
 {
   auto format_visitor = [](const auto &v) {
     using T = std::decay_t<decltype(v)>;
@@ -252,19 +235,15 @@ void markdown_printer::do_print_benchmark_results(
     // Do a single pass when no devices are specified. This happens for
     // benchmarks with `cpu` exec_tags.
     const std::size_t num_device_passes = devices.empty() ? 1 : devices.size();
-    for (std::size_t device_pass = 0; device_pass < num_device_passes;
-         ++device_pass)
+    for (std::size_t device_pass = 0; device_pass < num_device_passes; ++device_pass)
     {
-      std::optional<nvbench::device_info> device =
-        devices.empty() ? std::nullopt
-                        : std::make_optional(devices[device_pass]);
+      std::optional<nvbench::device_info> device = devices.empty()
+                                                     ? std::nullopt
+                                                     : std::make_optional(devices[device_pass]);
 
       if (device)
       {
-        fmt::format_to(buffer,
-                       "\n### [{}] {}\n\n",
-                       device->get_id(),
-                       device->get_name());
+        fmt::format_to(buffer, "\n### [{}] {}\n\n", device->get_id(), device->get_name());
       }
 
       std::size_t row = 0;
@@ -288,15 +267,11 @@ void markdown_printer::do_print_benchmark_results(
             {
               const nvbench::int64_t value    = axis_values.get_int64(name);
               const nvbench::int64_t exponent = int64_axis::compute_log2(value);
-              table.add_cell(row,
-                             name,
-                             name,
-                             fmt::format("2^{} = {}", exponent, value));
+              table.add_cell(row, name, name, fmt::format("2^{} = {}", exponent, value));
             }
             else
             {
-              std::string value = std::visit(format_visitor,
-                                             axis_values.get_value(name));
+              std::string value = std::visit(format_visitor, axis_values.get_value(name));
               table.add_cell(row, name + "_axis", name, std::move(value));
             }
           }
@@ -308,12 +283,9 @@ void markdown_printer::do_print_benchmark_results(
               continue;
             }
             const std::string &tag    = summ.get_tag();
-            const std::string &header = summ.has_value("name")
-                                          ? summ.get_string("name")
-                                          : tag;
+            const std::string &header = summ.has_value("name") ? summ.get_string("name") : tag;
 
-            std::string hint = summ.has_value("hint") ? summ.get_string("hint")
-                                                      : std::string{};
+            std::string hint = summ.has_value("hint") ? summ.get_string("hint") : std::string{};
             if (hint == "duration")
             {
               table.add_cell(row, tag, header, this->do_format_duration(summ));
@@ -332,10 +304,7 @@ void markdown_printer::do_print_benchmark_results(
             }
             else if (hint == "sample_size")
             {
-              table.add_cell(row,
-                             tag,
-                             header,
-                             this->do_format_sample_size(summ));
+              table.add_cell(row, tag, header, this->do_format_sample_size(summ));
             }
             else if (hint == "percentage")
             {
@@ -353,8 +322,7 @@ void markdown_printer::do_print_benchmark_results(
       auto table_str = table.to_string();
       fmt::format_to(buffer,
                      "{}",
-                     table_str.empty() ? "No data -- check log.\n"
-                                       : std::move(table_str));
+                     table_str.empty() ? "No data -- check log.\n" : std::move(table_str));
     } // end foreach device_pass
   }
 
diff --git a/nvbench/named_values.cuh b/nvbench/named_values.cuh
index c11dab4..1ce5185 100644
--- a/nvbench/named_values.cuh
+++ b/nvbench/named_values.cuh
@@ -33,8 +33,7 @@ namespace nvbench
  */
 struct named_values
 {
-  using value_type =
-    std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;
+  using value_type = std::variant<nvbench::int64_t, nvbench::float64_t, std::string>;
 
   enum class type
   {
@@ -43,7 +42,7 @@ struct named_values
     string
   };
 
-  void append(const named_values& other);
+  void append(const named_values &other);
 
   [[nodiscard]] std::size_t get_size() const;
   [[nodiscard]] std::vector<std::string> get_names() const;
@@ -60,11 +59,11 @@ struct named_values
 
   [[nodiscard]] type get_type(const std::string &name) const;
   [[nodiscard]] bool has_value(const std::string &name) const;
-  [[nodiscard]] const value_type& get_value(const std::string &name) const;
+  [[nodiscard]] const value_type &get_value(const std::string &name) const;
 
   void clear();
 
-  void remove_value(const std::string& name);
+  void remove_value(const std::string &name);
 
 private:
   struct named_value
diff --git a/nvbench/named_values.cxx b/nvbench/named_values.cxx
index 1aeb4dc..605789e 100644
--- a/nvbench/named_values.cxx
+++ b/nvbench/named_values.cxx
@@ -33,9 +33,7 @@ namespace nvbench
 
 void named_values::append(const named_values &other)
 {
-  m_storage.insert(m_storage.end(),
-                   other.m_storage.cbegin(),
-                   other.m_storage.cend());
+  m_storage.insert(m_storage.end(), other.m_storage.cbegin(), other.m_storage.cend());
 }
 
 void named_values::clear() { m_storage.clear(); }
@@ -55,20 +53,17 @@ std::vector<std::string> named_values::get_names() const
 
 bool named_values::has_value(const std::string &name) const
 {
-  auto iter =
-    std::find_if(m_storage.cbegin(),
-                 m_storage.cend(),
-                 [&name](const auto &val) { return val.name == name; });
+  auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
+    return val.name == name;
+  });
   return iter != m_storage.cend();
 }
 
-const named_values::value_type &
-named_values::get_value(const std::string &name) const
+const named_values::value_type &named_values::get_value(const std::string &name) const
 {
-  auto iter =
-    std::find_if(m_storage.cbegin(),
-                 m_storage.cend(),
-                 [&name](const auto &val) { return val.name == name; });
+  auto iter = std::find_if(m_storage.cbegin(), m_storage.cend(), [&name](const auto &val) {
+    return val.name == name;
+  });
   if (iter == m_storage.cend())
   {
     NVBENCH_THROW(std::runtime_error, "No value with name '{}'.", name);
@@ -96,9 +91,7 @@ named_values::type named_values::get_type(const std::string &name) const
       // warning C4702: unreachable code
       // This is a future-proofing check, it'll be reachable if something breaks
       NVBENCH_MSVC_PUSH_DISABLE_WARNING(4702)
-      NVBENCH_THROW(std::runtime_error,
-                    "Unknown variant type for entry '{}'.",
-                    name);
+      NVBENCH_THROW(std::runtime_error, "Unknown variant type for entry '{}'.", name);
     },
     this->get_value(name));
   NVBENCH_MSVC_POP_WARNING()
@@ -111,10 +104,7 @@ try
 }
 catch (std::exception &err)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error looking up int64 value `{}`:\n{}",
-                name,
-                err.what());
+  NVBENCH_THROW(std::runtime_error, "Error looking up int64 value `{}`:\n{}", name, err.what());
 }
 
 nvbench::float64_t named_values::get_float64(const std::string &name) const
@@ -124,10 +114,7 @@ try
 }
 catch (std::exception &err)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error looking up float64 value `{}`:\n{}",
-                name,
-                err.what());
+  NVBENCH_THROW(std::runtime_error, "Error looking up float64 value `{}`:\n{}", name, err.what());
 }
 
 const std::string &named_values::get_string(const std::string &name) const
@@ -137,10 +124,7 @@ try
 }
 catch (std::exception &err)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error looking up string value `{}`:\n{}",
-                name,
-                err.what());
+  NVBENCH_THROW(std::runtime_error, "Error looking up string value `{}`:\n{}", name, err.what());
 }
 
 void named_values::set_int64(std::string name, nvbench::int64_t value)
@@ -165,10 +149,9 @@ void named_values::set_value(std::string name, named_values::value_type value)
 
 void named_values::remove_value(const std::string &name)
 {
-  auto iter =
-    std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
-      return val.name == name;
-    });
+  auto iter = std::find_if(m_storage.begin(), m_storage.end(), [&name](const auto &val) {
+    return val.name == name;
+  });
   if (iter != m_storage.end())
   {
     m_storage.erase(iter);
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index 55f7f1c..744479d 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -82,20 +82,11 @@ std::string_view submatch_to_sv(const sv_submatch &in)
 //
 // So we're stuck with materializing a std::string and calling std::stoX(). Ah
 // well. At least it's not istream.
-void parse(std::string_view input, nvbench::int32_t &val)
-{
-  val = std::stoi(std::string(input));
-}
+void parse(std::string_view input, nvbench::int32_t &val) { val = std::stoi(std::string(input)); }
 
-void parse(std::string_view input, nvbench::int64_t &val)
-{
-  val = std::stoll(std::string(input));
-}
+void parse(std::string_view input, nvbench::int64_t &val) { val = std::stoll(std::string(input)); }
 
-void parse(std::string_view input, nvbench::float64_t &val)
-{
-  val = std::stod(std::string(input));
-}
+void parse(std::string_view input, nvbench::float64_t &val) { val = std::stod(std::string(input)); }
 
 void parse(std::string_view input, std::string &val) { val = input; }
 
@@ -112,9 +103,8 @@ std::vector<T> parse_list_values(std::string_view list_spec)
     "(?:,|$)"  // Delimiters
   };
 
-  auto values_begin =
-    sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
-  auto values_end = sv_regex_iterator{};
+  auto values_begin = sv_regex_iterator(list_spec.cbegin(), list_spec.cend(), value_regex);
+  auto values_end   = sv_regex_iterator{};
   while (values_begin != values_end)
   {
     auto match          = *values_begin++;
@@ -131,8 +121,7 @@ std::vector<T> parse_list_values(std::string_view list_spec)
 // Parses a range specification "<start> : <stop> [ : <stride> ]" and returns
 // a vector filled with the specified range.
 template <typename T>
-std::vector<T> parse_range_values(std::string_view range_spec,
-                                  nvbench::wrapped_type<T>)
+std::vector<T> parse_range_values(std::string_view range_spec, nvbench::wrapped_type<T>)
 {
   std::vector<T> range_params;
 
@@ -143,9 +132,8 @@ std::vector<T> parse_range_values(std::string_view range_spec,
     "(?:$|:)"  // Delimiters
   };
 
-  auto values_begin =
-    sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
-  auto values_end = sv_regex_iterator{};
+  auto values_begin = sv_regex_iterator(range_spec.cbegin(), range_spec.cend(), value_regex);
+  auto values_end   = sv_regex_iterator{};
   for (; values_begin != values_end; ++values_begin)
   {
     auto match          = *values_begin;
@@ -221,25 +209,15 @@ std::vector<T> parse_values(std::string_view value_spec)
                                        "$"};        // EOS
 
   sv_match match;
-  if (std::regex_search(value_spec.cbegin(),
-                        value_spec.cend(),
-                        match,
-                        list_regex))
+  if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, list_regex))
   {
     return parse_list_values<T>(submatch_to_sv(match[1]));
   }
-  else if (std::regex_search(value_spec.cbegin(),
-                             value_spec.cend(),
-                             match,
-                             range_regex))
+  else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, range_regex))
   {
-    return parse_range_values(submatch_to_sv(match[1]),
-                              nvbench::wrapped_type<T>{});
+    return parse_range_values(submatch_to_sv(match[1]), nvbench::wrapped_type<T>{});
   }
-  else if (std::regex_search(value_spec.cbegin(),
-                             value_spec.cend(),
-                             match,
-                             single_regex))
+  else if (std::regex_search(value_spec.cbegin(), value_spec.cend(), match, single_regex))
   {
     T val;
     parse(submatch_to_sv(match[1]), val);
@@ -247,9 +225,7 @@ std::vector<T> parse_values(std::string_view value_spec)
   }
   else
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid axis value spec: {}",
-                  value_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid axis value spec: {}", value_spec);
   }
 }
 
@@ -514,8 +490,8 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
       this->update_int64_prop(first[0], first[1]);
       first += 2;
     }
-    else if (arg == "--min-time" || arg == "--max-noise" ||
-             arg == "--skip-time" || arg == "--timeout")
+    else if (arg == "--min-time" || arg == "--max-noise" || arg == "--skip-time" ||
+             arg == "--timeout")
     {
       check_params(1);
       this->update_float64_prop(first[0], first[1]);
@@ -523,9 +499,7 @@ void option_parser::parse_range(option_parser::arg_iterator_t first,
     }
     else
     {
-      NVBENCH_THROW(std::runtime_error,
-                    "Unrecognized command-line argument: `{}`.",
-                    arg);
+      NVBENCH_THROW(std::runtime_error, "Unrecognized command-line argument: `{}`.", arg);
     }
   }
 }
@@ -534,7 +508,7 @@ void option_parser::add_markdown_printer(const std::string &spec)
 try
 {
   std::ostream &stream = this->printer_spec_to_ostream(spec);
-  auto &printer = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
+  auto &printer        = m_printer.emplace<nvbench::markdown_printer>(stream, spec);
   if (spec == "stdout")
   {
     printer.set_color(m_color_md_stdout_printer);
@@ -556,14 +530,10 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error while adding csv output for `{}`:\n{}",
-                spec,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error while adding csv output for `{}`:\n{}", spec, e.what());
 }
 
-void option_parser::add_json_printer(const std::string &spec,
-                                     bool enable_binary)
+void option_parser::add_json_printer(const std::string &spec, bool enable_binary)
 try
 {
   std::ostream &stream = this->printer_spec_to_ostream(spec);
@@ -624,10 +594,7 @@ void option_parser::print_help() const
   fmt::print("{}\n{}\n", ::cli_help_text, ::cli_help_axis_text);
 }
 
-void option_parser::print_help_axis() const
-{
-  fmt::print("{}\n", ::cli_help_axis_text);
-}
+void option_parser::print_help_axis() const { fmt::print("{}\n", ::cli_help_axis_text); }
 
 void option_parser::set_persistence_mode(const std::string &state)
 try
@@ -685,9 +652,7 @@ try
   {
     if (rate_val == nvbench::device_info::clock_rate::none)
     {
-      fmt::print("Unlocking clocks for device '{}' ({}).\n",
-                 device.get_name(),
-                 device.get_id());
+      fmt::print("Unlocking clocks for device '{}' ({}).\n", device.get_name(), device.get_id());
     }
     else
     {
@@ -757,16 +722,12 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error handling option --benchmark `{}`:\n{}",
-                name,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error handling option --benchmark `{}`:\n{}", name, e.what());
 }
 
 void option_parser::replay_global_args()
 {
-  this->parse_range(m_global_benchmark_args.cbegin(),
-                    m_global_benchmark_args.cend());
+  this->parse_range(m_global_benchmark_args.cbegin(), m_global_benchmark_args.cend());
 }
 
 void option_parser::update_devices(const std::string &devices)
@@ -790,10 +751,7 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error handling option --devices `{}`:\n{}",
-                devices,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error handling option --devices `{}`:\n{}", devices, e.what());
 }
 
 void option_parser::update_axis(const std::string &spec)
@@ -832,28 +790,20 @@ try
   switch (axis.get_type())
   {
     case axis_type::type:
-      this->update_type_axis(static_cast<nvbench::type_axis &>(axis),
-                             values,
-                             flags);
+      this->update_type_axis(static_cast<nvbench::type_axis &>(axis), values, flags);
       break;
 
     case axis_type::int64:
-      this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis),
-                              values,
-                              flags);
+      this->update_int64_axis(static_cast<nvbench::int64_axis &>(axis), values, flags);
       break;
 
     case axis_type::float64:
-      this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis),
-                                values,
-                                flags);
+      this->update_float64_axis(static_cast<nvbench::float64_axis &>(axis), values, flags);
 
       break;
 
     case axis_type::string:
-      this->update_string_axis(static_cast<nvbench::string_axis &>(axis),
-                               values,
-                               flags);
+      this->update_string_axis(static_cast<nvbench::string_axis &>(axis), values, flags);
 
       break;
 
@@ -866,10 +816,7 @@ try
 }
 catch (std::exception &e)
 {
-  NVBENCH_THROW(std::runtime_error,
-                "Error handling option --axis `{}`:\n{}",
-                spec,
-                e.what());
+  NVBENCH_THROW(std::runtime_error, "Error handling option --axis `{}`:\n{}", spec, e.what());
 }
 
 void option_parser::update_int64_axis(int64_axis &axis,
@@ -888,9 +835,7 @@ void option_parser::update_int64_axis(int64_axis &axis,
   }
   else
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for int64 axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for int64 axis: `{}`", flag_spec);
   }
 
   auto input_values = parse_values<nvbench::int64_t>(value_spec);
@@ -905,9 +850,7 @@ void option_parser::update_float64_axis(float64_axis &axis,
   // Validate flags:
   if (!flag_spec.empty())
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for float64 axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for float64 axis: `{}`", flag_spec);
   }
 
   auto input_values = parse_values<nvbench::float64_t>(value_spec);
@@ -922,9 +865,7 @@ void option_parser::update_string_axis(string_axis &axis,
   // Validate flags:
   if (!flag_spec.empty())
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for string axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for string axis: `{}`", flag_spec);
   }
 
   auto input_values = parse_values<std::string>(value_spec);
@@ -939,9 +880,7 @@ void option_parser::update_type_axis(type_axis &axis,
   // Validate flags:
   if (!flag_spec.empty())
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Invalid flag for type axis: `{}`",
-                  flag_spec);
+    NVBENCH_THROW(std::runtime_error, "Invalid flag for type axis: `{}`", flag_spec);
   }
 
   auto input_values = parse_values<std::string>(value_spec);
@@ -949,8 +888,7 @@ void option_parser::update_type_axis(type_axis &axis,
   axis.set_active_inputs(input_values);
 }
 
-void option_parser::update_int64_prop(const std::string &prop_arg,
-                                      const std::string &prop_val)
+void option_parser::update_int64_prop(const std::string &prop_arg, const std::string &prop_val)
 try
 {
   // If no active benchmark, save args as global.
@@ -983,8 +921,7 @@ catch (std::exception &e)
                 e.what());
 }
 
-void option_parser::update_float64_prop(const std::string &prop_arg,
-                                        const std::string &prop_val)
+void option_parser::update_float64_prop(const std::string &prop_arg, const std::string &prop_val)
 try
 {
   // If no active benchmark, save args as global.
diff --git a/nvbench/option_parser.cuh b/nvbench/option_parser.cuh
index e35d724..c183764 100644
--- a/nvbench/option_parser.cuh
+++ b/nvbench/option_parser.cuh
@@ -41,8 +41,7 @@ struct type_axis;
  */
 struct option_parser
 {
-  using benchmark_vector =
-    std::vector<std::unique_ptr<nvbench::benchmark_base>>;
+  using benchmark_vector = std::vector<std::unique_ptr<nvbench::benchmark_base>>;
 
   option_parser();
   ~option_parser();
@@ -51,15 +50,9 @@ struct option_parser
   void parse(std::vector<std::string> args);
 
   [[nodiscard]] benchmark_vector &get_benchmarks() { return m_benchmarks; };
-  [[nodiscard]] const benchmark_vector &get_benchmarks() const
-  {
-    return m_benchmarks;
-  };
+  [[nodiscard]] const benchmark_vector &get_benchmarks() const { return m_benchmarks; };
 
-  [[nodiscard]] const std::vector<std::string> &get_args() const
-  {
-    return m_args;
-  }
+  [[nodiscard]] const std::vector<std::string> &get_args() const { return m_args; }
 
   /*!
    * Returns the output format requested by the parse options.
@@ -115,10 +108,8 @@ private:
                                std::string_view value_spec,
                                std::string_view flag_spec);
 
-  void update_int64_prop(const std::string &prop_arg,
-                         const std::string &prop_val);
-  void update_float64_prop(const std::string &prop_arg,
-                           const std::string &prop_val);
+  void update_int64_prop(const std::string &prop_arg, const std::string &prop_val);
+  void update_float64_prop(const std::string &prop_arg, const std::string &prop_val);
 
   void update_used_device_state() const;
 
diff --git a/nvbench/printer_base.cuh b/nvbench/printer_base.cuh
index 0e28a35..3de8874 100644
--- a/nvbench/printer_base.cuh
+++ b/nvbench/printer_base.cuh
@@ -76,19 +76,16 @@ struct printer_base
   virtual ~printer_base();
 
   // move-only
-  printer_base(const printer_base &) = delete;
-  printer_base(printer_base &&)      = default;
+  printer_base(const printer_base &)            = delete;
+  printer_base(printer_base &&)                 = default;
   printer_base &operator=(const printer_base &) = delete;
-  printer_base &operator=(printer_base &&) = default;
+  printer_base &operator=(printer_base &&)      = default;
 
   /*!
    * Called once with the command line arguments used to invoke the current
    * executable.
    */
-  void log_argv(const std::vector<std::string> &argv)
-  {
-    this->do_log_argv(argv);
-  }
+  void log_argv(const std::vector<std::string> &argv) { this->do_log_argv(argv); }
 
   /*!
    * Print a summary of all detected devices, if supported.
@@ -108,19 +105,13 @@ struct printer_base
   /*!
    * Print a log message at the specified log level.
    */
-  void log(nvbench::log_level level, const std::string &msg)
-  {
-    this->do_log(level, msg);
-  }
+  void log(nvbench::log_level level, const std::string &msg) { this->do_log(level, msg); }
 
   /*!
    * Called before running the measurements associated with state.
    * Implementations are expected to call `log(log_level::run, ...)`.
    */
-  void log_run_state(const nvbench::state &exec_state)
-  {
-    this->do_log_run_state(exec_state);
-  }
+  void log_run_state(const nvbench::state &exec_state) { this->do_log_run_state(exec_state); }
 
   /*!
    * Measurements may call this to allow a printer to perform extra processing
@@ -181,10 +172,7 @@ struct printer_base
     return this->do_get_completed_state_count();
   }
 
-  virtual void set_total_state_count(std::size_t states)
-  {
-    this->do_set_total_state_count(states);
-  }
+  virtual void set_total_state_count(std::size_t states) { this->do_set_total_state_count(states); }
   [[nodiscard]] virtual std::size_t get_total_state_count() const
   {
     return this->do_get_total_state_count();
@@ -193,17 +181,16 @@ struct printer_base
 
 protected:
   // Implementation hooks for subclasses:
-  virtual void do_log_argv(const std::vector<std::string>&) {}
+  virtual void do_log_argv(const std::vector<std::string> &) {}
   virtual void do_print_device_info() {}
   virtual void do_print_log_preamble() {}
   virtual void do_print_log_epilogue() {}
   virtual void do_log(nvbench::log_level, const std::string &) {}
   virtual void do_log_run_state(const nvbench::state &) {}
-  virtual void
-  do_process_bulk_data_float64(nvbench::state &,
-                               const std::string &,
-                               const std::string &,
-                               const std::vector<nvbench::float64_t> &){};
+  virtual void do_process_bulk_data_float64(nvbench::state &,
+                                            const std::string &,
+                                            const std::string &,
+                                            const std::vector<nvbench::float64_t> &){};
   virtual void do_print_benchmark_list(const benchmark_vector &) {}
   virtual void do_print_benchmark_results(const benchmark_vector &) {}
 
diff --git a/nvbench/printer_base.cxx b/nvbench/printer_base.cxx
index 66de795..639edc2 100644
--- a/nvbench/printer_base.cxx
+++ b/nvbench/printer_base.cxx
@@ -38,19 +38,10 @@ void printer_base::do_set_completed_state_count(std::size_t states)
 
 void printer_base::do_add_completed_state() { ++m_completed_state_count; }
 
-std::size_t printer_base::do_get_completed_state_count() const
-{
-  return m_completed_state_count;
-}
+std::size_t printer_base::do_get_completed_state_count() const { return m_completed_state_count; }
 
-void printer_base::do_set_total_state_count(std::size_t states)
-{
-  m_total_state_count = states;
-}
+void printer_base::do_set_total_state_count(std::size_t states) { m_total_state_count = states; }
 
-std::size_t printer_base::do_get_total_state_count() const
-{
-  return m_total_state_count;
-}
+std::size_t printer_base::do_get_total_state_count() const { return m_total_state_count; }
 
 } // namespace nvbench
diff --git a/nvbench/printer_multiplex.cuh b/nvbench/printer_multiplex.cuh
index f32a0e9..797b480 100644
--- a/nvbench/printer_multiplex.cuh
+++ b/nvbench/printer_multiplex.cuh
@@ -40,10 +40,7 @@ struct printer_multiplex : nvbench::printer_base
     return static_cast<Format &>(*m_printers.back());
   }
 
-  [[nodiscard]] std::size_t get_printer_count() const
-  {
-    return m_printers.size();
-  }
+  [[nodiscard]] std::size_t get_printer_count() const { return m_printers.size(); }
 
 protected:
   void do_log_argv(const std::vector<std::string> &argv) override;
@@ -52,11 +49,10 @@ protected:
   void do_print_log_epilogue() override;
   void do_log(nvbench::log_level, const std::string &) override;
   void do_log_run_state(const nvbench::state &) override;
-  void do_process_bulk_data_float64(
-    nvbench::state &,
-    const std::string &,
-    const std::string &,
-    const std::vector<nvbench::float64_t> &) override;
+  void do_process_bulk_data_float64(nvbench::state &,
+                                    const std::string &,
+                                    const std::string &,
+                                    const std::vector<nvbench::float64_t> &) override;
   void do_print_benchmark_list(const benchmark_vector &benches) override;
   void do_print_benchmark_results(const benchmark_vector &benches) override;
   void do_set_completed_state_count(std::size_t states) override;
diff --git a/nvbench/printer_multiplex.cxx b/nvbench/printer_multiplex.cxx
index 86d9954..89867c1 100644
--- a/nvbench/printer_multiplex.cxx
+++ b/nvbench/printer_multiplex.cxx
@@ -67,11 +67,10 @@ void printer_multiplex::do_log_run_state(const nvbench::state &exec_state)
   }
 }
 
-void printer_multiplex::do_process_bulk_data_float64(
-  state &state,
-  const std::string &tag,
-  const std::string &hint,
-  const std::vector<nvbench::float64_t> &data)
+void printer_multiplex::do_process_bulk_data_float64(state &state,
+                                                     const std::string &tag,
+                                                     const std::string &hint,
+                                                     const std::vector<nvbench::float64_t> &data)
 {
   for (auto &format_ptr : m_printers)
   {
@@ -87,8 +86,7 @@ void printer_multiplex::do_print_benchmark_list(const benchmark_vector &benches)
   }
 }
 
-void printer_multiplex::do_print_benchmark_results(
-  const benchmark_vector &benches)
+void printer_multiplex::do_print_benchmark_results(const benchmark_vector &benches)
 {
   for (auto &format_ptr : m_printers)
   {
diff --git a/nvbench/range.cuh b/nvbench/range.cuh
index f0e8255..7000f87 100644
--- a/nvbench/range.cuh
+++ b/nvbench/range.cuh
@@ -29,13 +29,11 @@ namespace nvbench
 namespace detail
 {
 template <typename T>
-using range_output_t = std::conditional_t<std::is_floating_point_v<T>,
-                                          nvbench::float64_t,
-                                          nvbench::int64_t>;
+using range_output_t =
+  std::conditional_t<std::is_floating_point_v<T>, nvbench::float64_t, nvbench::int64_t>;
 }
 
-template <typename InT,
-          typename OutT = nvbench::detail::range_output_t<InT>>
+template <typename InT, typename OutT = nvbench::detail::range_output_t<InT>>
 auto range(InT start, InT end, InT stride = InT{1})
 {
   if constexpr (std::is_floating_point_v<InT>)
diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh
index 9435906..f32b222 100644
--- a/nvbench/runner.cuh
+++ b/nvbench/runner.cuh
@@ -37,8 +37,7 @@ struct runner_base
 
   void generate_states();
 
-  void handle_sampling_exception(const std::exception &e,
-                                 nvbench::state &exec_state) const;
+  void handle_sampling_exception(const std::exception &e, nvbench::state &exec_state) const;
 
   void run_state_prologue(state &exec_state) const;
   void run_state_epilogue(state &exec_state) const;
@@ -51,11 +50,10 @@ struct runner_base
 template <typename BenchmarkType>
 struct runner : public runner_base
 {
-  using benchmark_type   = BenchmarkType;
-  using kernel_generator = typename benchmark_type::kernel_generator;
-  using type_configs     = typename benchmark_type::type_configs;
-  static constexpr std::size_t num_type_configs =
-    benchmark_type::num_type_configs;
+  using benchmark_type                          = BenchmarkType;
+  using kernel_generator                        = typename benchmark_type::kernel_generator;
+  using type_configs                            = typename benchmark_type::type_configs;
+  static constexpr std::size_t num_type_configs = benchmark_type::num_type_configs;
 
   explicit runner(benchmark_type &bench)
       : runner_base{bench}
@@ -86,38 +84,37 @@ private:
 
     // Iterate through type_configs:
     std::size_t type_config_index = 0;
-    nvbench::tl::foreach<type_configs>([&self   = *this,
-                                        &states = m_benchmark.m_states,
-                                        &type_config_index,
-                                        &device](auto type_config_wrapper) {
-      // Get current type_config:
-      using type_config = typename decltype(type_config_wrapper)::type;
+    nvbench::tl::foreach<type_configs>(
+      [&self = *this, &states = m_benchmark.m_states, &type_config_index, &device](
+        auto type_config_wrapper) {
+        // Get current type_config:
+        using type_config = typename decltype(type_config_wrapper)::type;
 
-      // Find states with the current device / type_config
-      for (nvbench::state &cur_state : states)
-      {
-        if (cur_state.get_device() == device &&
-            cur_state.get_type_config_index() == type_config_index)
+        // Find states with the current device / type_config
+        for (nvbench::state &cur_state : states)
         {
-          self.run_state_prologue(cur_state);
-          try
+          if (cur_state.get_device() == device &&
+              cur_state.get_type_config_index() == type_config_index)
           {
-            kernel_generator{}(cur_state, type_config{});
-            if (cur_state.is_skipped())
+            self.run_state_prologue(cur_state);
+            try
             {
-              self.print_skip_notification(cur_state);
+              kernel_generator{}(cur_state, type_config{});
+              if (cur_state.is_skipped())
+              {
+                self.print_skip_notification(cur_state);
+              }
             }
+            catch (std::exception &e)
+            {
+              self.handle_sampling_exception(e, cur_state);
+            }
+            self.run_state_epilogue(cur_state);
           }
-          catch (std::exception &e)
-          {
-            self.handle_sampling_exception(e, cur_state);
-          }
-          self.run_state_epilogue(cur_state);
         }
-      }
 
-      ++type_config_index;
-    });
+        ++type_config_index;
+      });
   }
 };
 
diff --git a/nvbench/runner.cxx b/nvbench/runner.cxx
index 3aba964..93cedf5 100644
--- a/nvbench/runner.cxx
+++ b/nvbench/runner.cxx
@@ -35,8 +35,7 @@ void runner_base::generate_states()
   m_benchmark.m_states = nvbench::detail::state_generator::create(m_benchmark);
 }
 
-void runner_base::handle_sampling_exception(const std::exception &e,
-                                            state &exec_state) const
+void runner_base::handle_sampling_exception(const std::exception &e, state &exec_state) const
 {
   // If the state is skipped, that means the execution framework class handled
   // the error already.
@@ -62,8 +61,7 @@ void runner_base::handle_sampling_exception(const std::exception &e,
 void runner_base::run_state_prologue(nvbench::state &exec_state) const
 {
   // Log if a printer exists:
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log_run_state(exec_state);
@@ -73,19 +71,16 @@ void runner_base::run_state_prologue(nvbench::state &exec_state) const
 void runner_base::run_state_epilogue(state &exec_state) const
 {
   // Notify the printer that the state has completed::
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
     printer.add_completed_state();
   }
 }
 
-
 void runner_base::print_skip_notification(state &exec_state) const
 {
-  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer();
-      printer_opt_ref.has_value())
+  if (auto printer_opt_ref = exec_state.get_benchmark().get_printer(); printer_opt_ref.has_value())
   {
     auto &printer = printer_opt_ref.value().get();
     printer.log(nvbench::log_level::skip, exec_state.get_skip_reason());
diff --git a/nvbench/state.cuh b/nvbench/state.cuh
index 336ba2b..53c7413 100644
--- a/nvbench/state.cuh
+++ b/nvbench/state.cuh
@@ -58,106 +58,68 @@ struct state_tester;
 struct state
 {
   // move-only
-  state(const state &) = delete;
-  state(state &&)      = default;
+  state(const state &)            = delete;
+  state(state &&)                 = default;
   state &operator=(const state &) = delete;
-  state &operator=(state &&) = default;
+  state &operator=(state &&)      = default;
 
-  [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const
-  {
-    return m_cuda_stream;
-  }
-  void set_cuda_stream(nvbench::cuda_stream &&stream)
-  {
-    m_cuda_stream = std::move(stream);
-  }
+  [[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; }
+  void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }
 
   /// The CUDA device associated with with this benchmark state. May be
   /// nullopt for CPU-only benchmarks.
-  [[nodiscard]] const std::optional<nvbench::device_info> &get_device() const
-  {
-    return m_device;
-  }
+  [[nodiscard]] const std::optional<nvbench::device_info> &get_device() const { return m_device; }
 
   /// An index into a benchmark::type_configs type_list. Returns 0 if no type
   /// axes in the associated benchmark.
-  [[nodiscard]] std::size_t get_type_config_index() const
-  {
-    return m_type_config_index;
-  }
+  [[nodiscard]] std::size_t get_type_config_index() const { return m_type_config_index; }
 
   [[nodiscard]] nvbench::int64_t get_int64(const std::string &axis_name) const;
-  [[nodiscard]] nvbench::int64_t
-  get_int64_or_default(const std::string &axis_name,
-                       nvbench::int64_t default_value) const;
+  [[nodiscard]] nvbench::int64_t get_int64_or_default(const std::string &axis_name,
+                                                      nvbench::int64_t default_value) const;
 
-  [[nodiscard]] nvbench::float64_t
-  get_float64(const std::string &axis_name) const;
-  [[nodiscard]] nvbench::float64_t
-  get_float64_or_default(const std::string &axis_name,
-                         nvbench::float64_t default_value) const;
+  [[nodiscard]] nvbench::float64_t get_float64(const std::string &axis_name) const;
+  [[nodiscard]] nvbench::float64_t get_float64_or_default(const std::string &axis_name,
+                                                          nvbench::float64_t default_value) const;
 
-  [[nodiscard]] const std::string &
-  get_string(const std::string &axis_name) const;
-  [[nodiscard]] const std::string &
-  get_string_or_default(const std::string &axis_name,
-                        const std::string &default_value) const;
+  [[nodiscard]] const std::string &get_string(const std::string &axis_name) const;
+  [[nodiscard]] const std::string &get_string_or_default(const std::string &axis_name,
+                                                         const std::string &default_value) const;
 
   void add_element_count(std::size_t elements, std::string column_name = {});
 
   void set_element_count(std::size_t elements) { m_element_count = elements; }
-  [[nodiscard]] std::size_t get_element_count() const
-  {
-    return m_element_count;
-  }
+  [[nodiscard]] std::size_t get_element_count() const { return m_element_count; }
 
   template <typename ElementType>
   void add_global_memory_reads(std::size_t count, std::string column_name = {})
   {
-    this->add_global_memory_reads(count * sizeof(ElementType),
-                                  std::move(column_name));
+    this->add_global_memory_reads(count * sizeof(ElementType), std::move(column_name));
   }
   void add_global_memory_reads(std::size_t bytes, std::string column_name = {});
 
   template <typename ElementType>
   void add_global_memory_writes(std::size_t count, std::string column_name = {})
   {
-    this->add_global_memory_writes(count * sizeof(ElementType),
-                                   std::move(column_name));
+    this->add_global_memory_writes(count * sizeof(ElementType), std::move(column_name));
   }
-  void add_global_memory_writes(std::size_t bytes,
-                                std::string column_name = {});
+  void add_global_memory_writes(std::size_t bytes, std::string column_name = {});
 
   void add_buffer_size(std::size_t num_bytes,
                        std::string summary_tag,
                        std::string column_name = {},
                        std::string description = {});
 
-  void set_global_memory_rw_bytes(std::size_t bytes)
-  {
-    m_global_memory_rw_bytes = bytes;
-  }
-  [[nodiscard]] std::size_t get_global_memory_rw_bytes() const
-  {
-    return m_global_memory_rw_bytes;
-  }
+  void set_global_memory_rw_bytes(std::size_t bytes) { m_global_memory_rw_bytes = bytes; }
+  [[nodiscard]] std::size_t get_global_memory_rw_bytes() const { return m_global_memory_rw_bytes; }
 
   void skip(std::string reason) { m_skip_reason = std::move(reason); }
   [[nodiscard]] bool is_skipped() const { return !m_skip_reason.empty(); }
-  [[nodiscard]] const std::string &get_skip_reason() const
-  {
-    return m_skip_reason;
-  }
+  [[nodiscard]] const std::string &get_skip_reason() const { return m_skip_reason; }
 
   /// Execute at least this many trials per measurement. @{
-  [[nodiscard]] nvbench::int64_t get_min_samples() const
-  {
-    return m_min_samples;
-  }
-  void set_min_samples(nvbench::int64_t min_samples)
-  {
-    m_min_samples = min_samples;
-  }
+  [[nodiscard]] nvbench::int64_t get_min_samples() const { return m_min_samples; }
+  void set_min_samples(nvbench::int64_t min_samples) { m_min_samples = min_samples; }
   /// @}
 
   /// If true, the benchmark is only run once, skipping all warmup runs and only
@@ -222,20 +184,14 @@ struct state
   }
   ///@}
 
-  [[nodiscard]] const named_values &get_axis_values() const
-  {
-    return m_axis_values;
-  }
+  [[nodiscard]] const named_values &get_axis_values() const { return m_axis_values; }
 
   /*!
    * Return a string of "axis_name1=input_string1 axis_name2=input_string2 ..."
    */
   [[nodiscard]] std::string get_axis_values_as_string(bool color = false) const;
 
-  [[nodiscard]] const benchmark_base &get_benchmark() const
-  {
-    return m_benchmark;
-  }
+  [[nodiscard]] const benchmark_base &get_benchmark() const { return m_benchmark; }
 
   void collect_l1_hit_rates() { m_collect_l1_hit_rates = true; }
   void collect_l2_hit_rates() { m_collect_l2_hit_rates = true; }
@@ -252,26 +208,11 @@ struct state
     collect_dram_throughput();
   }
 
-  [[nodiscard]] bool is_l1_hit_rate_collected() const
-  {
-    return m_collect_l1_hit_rates;
-  }
-  [[nodiscard]] bool is_l2_hit_rate_collected() const
-  {
-    return m_collect_l2_hit_rates;
-  }
-  [[nodiscard]] bool is_stores_efficiency_collected() const
-  {
-    return m_collect_stores_efficiency;
-  }
-  [[nodiscard]] bool is_loads_efficiency_collected() const
-  {
-    return m_collect_loads_efficiency;
-  }
-  [[nodiscard]] bool is_dram_throughput_collected() const
-  {
-    return m_collect_dram_throughput;
-  }
+  [[nodiscard]] bool is_l1_hit_rate_collected() const { return m_collect_l1_hit_rates; }
+  [[nodiscard]] bool is_l2_hit_rate_collected() const { return m_collect_l2_hit_rates; }
+  [[nodiscard]] bool is_stores_efficiency_collected() const { return m_collect_stores_efficiency; }
+  [[nodiscard]] bool is_loads_efficiency_collected() const { return m_collect_loads_efficiency; }
+  [[nodiscard]] bool is_dram_throughput_collected() const { return m_collect_dram_throughput; }
 
   [[nodiscard]] bool is_cupti_required() const
   {
@@ -306,8 +247,7 @@ struct state
   template <typename KernelLauncher>
   void exec(KernelLauncher &&kernel_launcher)
   {
-    this->exec(nvbench::exec_tag::none,
-               std::forward<KernelLauncher>(kernel_launcher));
+    this->exec(nvbench::exec_tag::none, std::forward<KernelLauncher>(kernel_launcher));
   }
 
 private:
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index 0774faa..d8e15ff 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -65,9 +65,8 @@ nvbench::int64_t state::get_int64(const std::string &axis_name) const
   return m_axis_values.get_int64(axis_name);
 }
 
-nvbench::int64_t
-state::get_int64_or_default(const std::string &axis_name,
-                            nvbench::int64_t default_value) const
+nvbench::int64_t state::get_int64_or_default(const std::string &axis_name,
+                                             nvbench::int64_t default_value) const
 try
 {
   return this->get_int64(axis_name);
@@ -82,9 +81,8 @@ nvbench::float64_t state::get_float64(const std::string &axis_name) const
   return m_axis_values.get_float64(axis_name);
 }
 
-nvbench::float64_t
-state::get_float64_or_default(const std::string &axis_name,
-                              nvbench::float64_t default_value) const
+nvbench::float64_t state::get_float64_or_default(const std::string &axis_name,
+                                                 nvbench::float64_t default_value) const
 try
 {
   return this->get_float64(axis_name);
@@ -99,9 +97,8 @@ const std::string &state::get_string(const std::string &axis_name) const
   return m_axis_values.get_string(axis_name);
 }
 
-const std::string &
-state::get_string_or_default(const std::string &axis_name,
-                             const std::string &default_value) const
+const std::string &state::get_string_or_default(const std::string &axis_name,
+                                                const std::string &default_value) const
 try
 {
   return this->get_string(axis_name);
@@ -125,20 +122,18 @@ summary &state::add_summary(summary s)
 const summary &state::get_summary(std::string_view tag) const
 {
   // Check tags first
-  auto iter =
-    std::find_if(m_summaries.cbegin(),
-                 m_summaries.cend(),
-                 [&tag](const auto &s) { return s.get_tag() == tag; });
+  auto iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
+    return s.get_tag() == tag;
+  });
   if (iter != m_summaries.cend())
   {
     return *iter;
   }
 
   // Then names:
-  iter =
-    std::find_if(m_summaries.cbegin(),
-                 m_summaries.cend(),
-                 [&tag](const auto &s) { return s.get_string("name") == tag; });
+  iter = std::find_if(m_summaries.cbegin(), m_summaries.cend(), [&tag](const auto &s) {
+    return s.get_string("name") == tag;
+  });
   if (iter != m_summaries.cend())
   {
     return *iter;
@@ -150,20 +145,18 @@ const summary &state::get_summary(std::string_view tag) const
 summary &state::get_summary(std::string_view tag)
 {
   // Check tags first
-  auto iter =
-    std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
-      return s.get_tag() == tag;
-    });
+  auto iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
+    return s.get_tag() == tag;
+  });
   if (iter != m_summaries.end())
   {
     return *iter;
   }
 
   // Then names:
-  iter =
-    std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
-      return s.get_string("name") == tag;
-    });
+  iter = std::find_if(m_summaries.begin(), m_summaries.end(), [&tag](const auto &s) {
+    return s.get_string("name") == tag;
+  });
   if (iter != m_summaries.end())
   {
     return *iter;
@@ -187,18 +180,17 @@ std::string state::get_axis_values_as_string(bool color) const
   // Create a Key=Value list of all parameters:
   fmt::memory_buffer buffer;
 
-  auto append_key_value = [&buffer, &style](const std::string &key,
-                                            const auto &value,
-                                            std::string value_fmtstr = "{}") {
-    constexpr auto key_format   = fmt::emphasis::italic;
-    constexpr auto value_format = fmt::emphasis::bold;
+  auto append_key_value =
+    [&buffer, &style](const std::string &key, const auto &value, std::string value_fmtstr = "{}") {
+      constexpr auto key_format   = fmt::emphasis::italic;
+      constexpr auto value_format = fmt::emphasis::bold;
 
-    fmt::format_to(buffer,
-                   "{}{}={}",
-                   buffer.size() == 0 ? "" : " ",
-                   fmt::format(style(key_format), "{}", key),
-                   fmt::format(style(value_format), value_fmtstr, value));
-  };
+      fmt::format_to(buffer,
+                     "{}{}={}",
+                     buffer.size() == 0 ? "" : " ",
+                     fmt::format(style(key_format), "{}", key),
+                     fmt::format(style(value_format), value_fmtstr, value));
+    };
 
   if (m_device)
   {
@@ -211,8 +203,7 @@ std::string state::get_axis_values_as_string(bool color) const
     const auto axis_type = m_axis_values.get_type(name);
 
     // Handle power-of-two int64 axes differently:
-    if (axis_type == named_values::type::int64 &&
-        axes.get_int64_axis(name).is_power_of_two())
+    if (axis_type == named_values::type::int64 && axes.get_int64_axis(name).is_power_of_two())
     {
       const nvbench::int64_t value    = m_axis_values.get_int64(name);
       const nvbench::int64_t exponent = int64_axis::compute_log2(value);
@@ -242,10 +233,9 @@ std::string state::get_short_description(bool color) const
     return color ? fmt_style : no_style;
   };
 
-  return fmt::format(
-    "{} [{}]",
-    fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
-    this->get_axis_values_as_string(color));
+  return fmt::format("{} [{}]",
+                     fmt::format(style(fmt::emphasis::bold), "{}", m_benchmark.get().get_name()),
+                     this->get_axis_values_as_string(color));
 }
 
 void state::add_element_count(std::size_t elements, std::string column_name)
diff --git a/nvbench/string_axis.cuh b/nvbench/string_axis.cuh
index 2f526e7..a4e8c62 100644
--- a/nvbench/string_axis.cuh
+++ b/nvbench/string_axis.cuh
@@ -36,25 +36,13 @@ struct string_axis final : public axis_base
 
   ~string_axis() final;
 
-  void set_inputs(std::vector<std::string> inputs)
-  {
-    m_values = std::move(inputs);
-  }
-  [[nodiscard]] const std::string &get_value(std::size_t i) const
-  {
-    return m_values[i];
-  }
+  void set_inputs(std::vector<std::string> inputs) { m_values = std::move(inputs); }
+  [[nodiscard]] const std::string &get_value(std::size_t i) const { return m_values[i]; }
 
 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<string_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const { return std::make_unique<string_axis>(*this); }
   std::size_t do_get_size() const final { return m_values.size(); }
-  std::string do_get_input_string(std::size_t i) const final
-  {
-    return m_values[i];
-  }
+  std::string do_get_input_string(std::size_t i) const final { return m_values[i]; }
   std::string do_get_description(std::size_t) const final { return {}; }
 
   std::vector<std::string> m_values;
diff --git a/nvbench/summary.cuh b/nvbench/summary.cuh
index 4576b15..66093c0 100644
--- a/nvbench/summary.cuh
+++ b/nvbench/summary.cuh
@@ -92,10 +92,10 @@ struct summary : public nvbench::named_values
   {}
 
   // move-only
-  summary(const summary &) = delete;
-  summary(summary &&)      = default;
+  summary(const summary &)            = delete;
+  summary(summary &&)                 = default;
   summary &operator=(const summary &) = delete;
-  summary &operator=(summary &&) = default;
+  summary &operator=(summary &&)      = default;
 
   void set_tag(std::string tag) { m_tag = std::move(tag); }
   [[nodiscard]] const std::string &get_tag() const { return m_tag; }
diff --git a/nvbench/test_kernels.cuh b/nvbench/test_kernels.cuh
index e08db31..f01305c 100644
--- a/nvbench/test_kernels.cuh
+++ b/nvbench/test_kernels.cuh
@@ -38,8 +38,8 @@ namespace nvbench
 __global__ void sleep_kernel(double seconds)
 {
   const auto start = cuda::std::chrono::high_resolution_clock::now();
-  const auto ns    = cuda::std::chrono::nanoseconds(
-    static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
+  const auto ns =
+    cuda::std::chrono::nanoseconds(static_cast<nvbench::int64_t>(seconds * 1000 * 1000 * 1000));
   const auto finish = start + ns;
 
   auto now = cuda::std::chrono::high_resolution_clock::now();
@@ -53,7 +53,7 @@ __global__ void sleep_kernel(double seconds)
  * Naive copy of `n` values from `in` -> `out`.
  */
 template <typename T, typename U>
-__global__ void copy_kernel(const T* in, U* out, std::size_t n)
+__global__ void copy_kernel(const T *in, U *out, std::size_t n)
 {
   const auto init = blockIdx.x * blockDim.x + threadIdx.x;
   const auto step = blockDim.x * gridDim.x;
@@ -68,7 +68,7 @@ __global__ void copy_kernel(const T* in, U* out, std::size_t n)
  * For `i <- [0,n)`, `out[i] = in[i] % 2`.
  */
 template <typename T, typename U>
-__global__ void mod2_kernel(const T* in, U* out, std::size_t n)
+__global__ void mod2_kernel(const T *in, U *out, std::size_t n)
 {
   const auto init = blockIdx.x * blockDim.x + threadIdx.x;
   const auto step = blockDim.x * gridDim.x;
@@ -79,4 +79,4 @@ __global__ void mod2_kernel(const T* in, U* out, std::size_t n)
   }
 }
 
-}
+} // namespace nvbench
diff --git a/nvbench/type_axis.cuh b/nvbench/type_axis.cuh
index 2ee9144..27c1cc1 100644
--- a/nvbench/type_axis.cuh
+++ b/nvbench/type_axis.cuh
@@ -43,7 +43,7 @@ struct type_axis final : public axis_base
   template <typename TypeList>
   void set_inputs();
 
-  void set_active_inputs(const std::vector<std::string>& inputs);
+  void set_active_inputs(const std::vector<std::string> &inputs);
 
   [[nodiscard]] bool get_is_active(const std::string &input) const;
   [[nodiscard]] bool get_is_active(std::size_t index) const;
@@ -57,23 +57,13 @@ struct type_axis final : public axis_base
   /**
    * The index in this axis of the type with the specified `input_string`.
    */
-  [[nodiscard]] std::size_t
-  get_type_index(const std::string &input_string) const;
+  [[nodiscard]] std::size_t get_type_index(const std::string &input_string) const;
 
 private:
-  std::unique_ptr<axis_base> do_clone() const
-  {
-    return std::make_unique<type_axis>(*this);
-  }
+  std::unique_ptr<axis_base> do_clone() const { return std::make_unique<type_axis>(*this); }
   std::size_t do_get_size() const final { return m_input_strings.size(); }
-  std::string do_get_input_string(std::size_t i) const final
-  {
-    return m_input_strings[i];
-  }
-  std::string do_get_description(std::size_t i) const final
-  {
-    return m_descriptions[i];
-  }
+  std::string do_get_input_string(std::size_t i) const final { return m_input_strings[i]; }
+  std::string do_get_description(std::size_t i) const final { return m_descriptions[i]; }
 
   std::vector<std::string> m_input_strings;
   std::vector<std::string> m_descriptions;
diff --git a/nvbench/type_axis.cxx b/nvbench/type_axis.cxx
index af436ad..e678ff9 100644
--- a/nvbench/type_axis.cxx
+++ b/nvbench/type_axis.cxx
@@ -35,10 +35,10 @@ void type_axis::set_active_inputs(const std::vector<std::string> &inputs)
 {
   m_mask.clear();
   m_mask.resize(m_input_strings.size(), false);
-  for (const auto& input : inputs)
+  for (const auto &input : inputs)
   {
     const auto idx = this->get_type_index(input);
-    m_mask[idx] = true;
+    m_mask[idx]    = true;
   }
 }
 
@@ -47,21 +47,16 @@ bool type_axis::get_is_active(const std::string &input) const
   return this->get_is_active(this->get_type_index(input));
 }
 
-bool type_axis::get_is_active(std::size_t idx) const
-{
-  return m_mask.at(idx);
-}
+bool type_axis::get_is_active(std::size_t idx) const { return m_mask.at(idx); }
 
 std::size_t type_axis::get_active_count() const
 {
-  return static_cast<std::size_t>(
-    std::count(m_mask.cbegin(), m_mask.cend(), true));
+  return static_cast<std::size_t>(std::count(m_mask.cbegin(), m_mask.cend(), true));
 }
 
 std::size_t type_axis::get_type_index(const std::string &input_string) const
 {
-  auto it =
-    std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
+  auto it = std::find(m_input_strings.cbegin(), m_input_strings.cend(), input_string);
   if (it == m_input_strings.end())
   {
     NVBENCH_THROW(std::runtime_error,
diff --git a/nvbench/type_strings.cuh b/nvbench/type_strings.cuh
index 287e0f9..b915854 100644
--- a/nvbench/type_strings.cuh
+++ b/nvbench/type_strings.cuh
@@ -30,17 +30,17 @@ namespace nvbench
 std::string demangle(const std::string &str);
 
 template <typename T>
-std::string demangle() { return demangle(typeid(T).name()); }
+std::string demangle()
+{
+  return demangle(typeid(T).name());
+}
 
 template <typename T>
 struct type_strings
 {
   // The string used to identify the type in shorthand (e.g. output tables and
   // CLI options):
-  static std::string input_string()
-  {
-    return nvbench::demangle<T>();
-  }
+  static std::string input_string() { return nvbench::demangle<T>(); }
 
   // A more descriptive identifier for the type, if input_string is not a common
   // identifier. May be blank if `input_string` is obvious.
@@ -56,10 +56,7 @@ struct type_strings<std::integral_constant<T, Value>>
 
   // A more descriptive identifier for the type, if input_string is not a common
   // identifier. May be blank if `input_string` is obvious.
-  static std::string description()
-  {
-    return nvbench::demangle<std::integral_constant<T, Value>>();
-  }
+  static std::string description() { return nvbench::demangle<std::integral_constant<T, Value>>(); }
 };
 
 } // namespace nvbench
@@ -67,15 +64,15 @@ struct type_strings<std::integral_constant<T, Value>>
 /*!
  * Declare an `input_string` and `description` to use with a specific `type`.
  */
-#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description)           \
-  namespace nvbench                                                            \
-  {                                                                            \
-  template <>                                                                  \
-  struct type_strings<Type>                                                    \
-  {                                                                            \
-    static std::string input_string() { return {InputString}; }                \
-    static std::string description() { return {Description}; }                 \
-  };                                                                           \
+#define NVBENCH_DECLARE_TYPE_STRINGS(Type, InputString, Description)                               \
+  namespace nvbench                                                                                \
+  {                                                                                                \
+  template <>                                                                                      \
+  struct type_strings<Type>                                                                        \
+  {                                                                                                \
+    static std::string input_string() { return {InputString}; }                                    \
+    static std::string description() { return {Description}; }                                     \
+  };                                                                                               \
   }
 
 NVBENCH_DECLARE_TYPE_STRINGS(nvbench::int8_t, "I8", "int8_t");