diff --git a/.clang-format b/.clang-format
index e593f29..61bfc27 100644
--- a/.clang-format
+++ b/.clang-format
@@ -41,16 +41,28 @@ CompactNamespaces: false
 ContinuationIndentWidth: 2
 IncludeBlocks:   Regroup
 IncludeCategories:
-  - Regex:           '^<cub'
-    Priority:        1
-  - Regex:           '^<thrust'
-    Priority:        2
-  - Regex:           '^<cuda'
-    Priority:        3
   - Regex:           '^<nvbench'
+    Priority:        1
+  - Regex:           '^<cub'
+    Priority:        2
+  - Regex:           '^<thrust'
+    Priority:        3
+  - Regex:           '^<cuda/'
     Priority:        4
-  - Regex:           '^<[a-z]*>$'
+  - Regex:           '^<cuda'
     Priority:        5
+  - Regex:           '^<nvml'
+    Priority:        6
+  - Regex:           '^<cupti'
+    Priority:        7
+  - Regex:           '^<nvperf'
+    Priority:        8
+  - Regex:           '^<nlohmann'
+    Priority:        9
+  - Regex:           '^<fmt'
+    Priority:        10
+  - Regex:           '^<[a-z_]*>$'
+    Priority:        11
 IndentCaseLabels: true
 IndentPPDirectives: None
 IndentWidth: 2
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
index e84b5f3..c359cc2 100644
--- a/.devcontainer/README.md
+++ b/.devcontainer/README.md
@@ -34,7 +34,7 @@ CCCL uses [Development Containers](https://containers.dev/) to provide consisten
 
    - Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. Type "Remote-Containers: Reopen in Container" and select it.
 
-     ![Shows "Reopen in Container" in command pallete.](./img/open_in_container_manual.png)
+     ![Shows "Reopen in Container" in command palette.](./img/open_in_container_manual.png)
 
 4. Select an environment with the desired CTK and host compiler from the list:
 
@@ -136,7 +136,7 @@ For more information, see the `.devcontainer/make_devcontainers.sh --help` messa
 
 2. Install WSL 2 by running:
 ```bash
-wsl --install 
+wsl --install
 ```
 This should probably install Ubuntu distro as a default.
 
@@ -182,14 +182,14 @@ then run `sudo systemctl restart docker.service`.
 10. Open the CCCL cloned repo in VS Code ( `Ctrl + Shift + P `, select `File: Open Folder...` and select the path where your CCCL clone is located).
 
 11. If prompted, choose `Reopen in Container`.
-    
+
     - If you are not prompted just type `Ctrl + Shift + P` and `Dev Containers: Open Folder in Container ...`.
 
 12. Verify that Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal. For a proper configuration it is important for the steps in [Install prerequisites and VS Code extensions](#prereqs) to be followed in a precise order.
 
 From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration:
 
-13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message. 
+13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message.
 
 > Failed opening a web browser at https://github.com/login/device
   exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH
diff --git a/.devcontainer/launch.sh b/.devcontainer/launch.sh
index a9ef143..dab6340 100755
--- a/.devcontainer/launch.sh
+++ b/.devcontainer/launch.sh
@@ -304,4 +304,3 @@ main() {
 }
 
 main "$@"
-
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 1113040..4306215 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -14,4 +14,4 @@
 #
 # Only add commits that are pure formatting changes (e.g. clang-format version changes, etc).
 8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
-
+3440855dbd405db614861885ad1577fffd882867 # Initial addition of pre-commit.ci formatting.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..8ba3419
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,70 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+ci:
+    autofix_commit_msg: |
+      [pre-commit.ci] auto code formatting
+    autofix_prs: false
+    autoupdate_branch: ''
+    autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
+    autoupdate_schedule: quarterly
+    skip: []
+    submodules: false
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+      - id: trailing-whitespace
+  - repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v19.1.6
+    hooks:
+      - id: clang-format
+        types_or: [file]
+        files: |
+          (?x)^(
+            ^.*\.c$|
+            ^.*\.cpp$|
+            ^.*\.cu$|
+            ^.*\.cuh$|
+            ^.*\.cxx$|
+            ^.*\.h$|
+            ^.*\.hpp$|
+            ^.*\.inl$|
+            ^.*\.mm$
+          )
+        args: ["-fallback-style=none", "-style=file", "-i"]
+
+  # TODO/REMINDER: add the Ruff vscode extension to the devcontainers
+  # Ruff, the Python auto-correcting linter/formatter written in Rust
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.8.6
+    hooks:
+    - id: ruff  # linter
+    - id: ruff-format  # formatter
+
+  # TOML lint & format
+  - repo: https://github.com/ComPWA/taplo-pre-commit
+    rev: v0.9.3
+    hooks:
+      # See https://github.com/NVIDIA/cccl/issues/3426
+      # - id: taplo-lint
+      #   exclude: "^docs/"
+      - id: taplo-format
+        exclude: "^docs/"
+
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.3.0
+    hooks:
+      - id: codespell
+        additional_dependencies: [tomli]
+        args: ["--toml", "pyproject.toml"]
+        exclude: |
+          (?x)^(
+            build|
+            CITATION.md
+          )
+
+
+default_language_version:
+  python: python3
diff --git a/ci/ninja_summary.py b/ci/ninja_summary.py
index f496db5..526a58b 100755
--- a/ci/ninja_summary.py
+++ b/ci/ninja_summary.py
@@ -65,6 +65,7 @@ long_ext_count = 10
 
 class Target:
     """Represents a single line read for a .ninja_log file."""
+
     def __init__(self, start, end):
         """Creates a target object by passing in the start/end times in seconds
         as a float."""
@@ -94,9 +95,9 @@ class Target:
         """
         # Allow for modest floating-point errors
         epsilon = 0.000002
-        if (self.weighted_duration > self.Duration() + epsilon):
-            print('%s > %s?' % (self.weighted_duration, self.Duration()))
-        assert (self.weighted_duration <= self.Duration() + epsilon)
+        if self.weighted_duration > self.Duration() + epsilon:
+            print("%s > %s?" % (self.weighted_duration, self.Duration()))
+        assert self.weighted_duration <= self.Duration() + epsilon
         return self.weighted_duration
 
     def DescribeTargets(self):
@@ -104,10 +105,10 @@ class Target:
         # Some build steps generate dozens of outputs - handle them sanely.
         # The max_length was chosen so that it can fit most of the long
         # single-target names, while minimizing word wrapping.
-        result = ', '.join(self.targets)
+        result = ", ".join(self.targets)
         max_length = 65
         if len(result) > max_length:
-            result = result[:max_length] + '...'
+            result = result[:max_length] + "..."
         return result
 
 
@@ -121,12 +122,11 @@ def ReadTargets(log, show_all):
     # targets.
     if not header:
         return []
-    assert header == '# ninja log v5\n', \
-           'unrecognized ninja log version %r' % header
+    assert header == "# ninja log v5\n", "unrecognized ninja log version %r" % header
     targets_dict = {}
     last_end_seen = 0.0
     for line in log:
-        parts = line.strip().split('\t')
+        parts = line.strip().split("\t")
         if len(parts) != 5:
             # If ninja.exe is rudely halted then the .ninja_log file may be
             # corrupt. Silently continue.
@@ -165,17 +165,17 @@ def ReadTargets(log, show_all):
 def GetExtension(target, extra_patterns):
     """Return the file extension that best represents a target.
 
-  For targets that generate multiple outputs it is important to return a
-  consistent 'canonical' extension. Ultimately the goal is to group build steps
-  by type."""
+    For targets that generate multiple outputs it is important to return a
+    consistent 'canonical' extension. Ultimately the goal is to group build steps
+    by type."""
     for output in target.targets:
         if extra_patterns:
-            for fn_pattern in extra_patterns.split(';'):
-                if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
+            for fn_pattern in extra_patterns.split(";"):
+                if fnmatch.fnmatch(output, "*" + fn_pattern + "*"):
                     return fn_pattern
         # Not a true extension, but a good grouping.
-        if output.endswith('type_mappings'):
-            extension = 'type_mappings'
+        if output.endswith("type_mappings"):
+            extension = "type_mappings"
             break
 
         # Capture two extensions if present. For example: file.javac.jar should
@@ -185,26 +185,26 @@ def GetExtension(target, extra_patterns):
         extension = ext2 + ext1  # Preserve the order in the file name.
 
         if len(extension) == 0:
-            extension = '(no extension found)'
+            extension = "(no extension found)"
 
-        if ext1 in ['.pdb', '.dll', '.exe']:
-            extension = 'PEFile (linking)'
+        if ext1 in [".pdb", ".dll", ".exe"]:
+            extension = "PEFile (linking)"
             # Make sure that .dll and .exe are grouped together and that the
             # .dll.lib files don't cause these to be listed as libraries
             break
-        if ext1 in ['.so', '.TOC']:
-            extension = '.so (linking)'
+        if ext1 in [".so", ".TOC"]:
+            extension = ".so (linking)"
             # Attempt to identify linking, avoid identifying as '.TOC'
             break
         # Make sure .obj files don't get categorized as mojo files
-        if ext1 in ['.obj', '.o']:
+        if ext1 in [".obj", ".o"]:
             break
         # Jars are the canonical output of java targets.
-        if ext1 == '.jar':
+        if ext1 == ".jar":
             break
         # Normalize all mojo related outputs to 'mojo'.
-        if output.count('.mojom') > 0:
-            extension = 'mojo'
+        if output.count(".mojom") > 0:
+            extension = "mojo"
             break
     return extension
 
@@ -229,8 +229,8 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
         if target.end > latest:
             latest = target.end
         total_cpu_time += target.Duration()
-        task_start_stop_times.append((target.start, 'start', target))
-        task_start_stop_times.append((target.end, 'stop', target))
+        task_start_stop_times.append((target.start, "start", target))
+        task_start_stop_times.append((target.end, "stop", target))
     length = latest - earliest
     weighted_total = 0.0
 
@@ -256,10 +256,10 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
         if num_running > 0:
             # Update the total weighted time up to this moment.
             last_weighted_time += (time - last_time) / float(num_running)
-        if action_name == 'start':
+        if action_name == "start":
             # Record the total weighted task time when this task starts.
             running_tasks[target] = last_weighted_time
-        if action_name == 'stop':
+        if action_name == "stop":
             # Record the change in the total weighted task time while this task
             # ran.
             weighted_duration = last_weighted_time - running_tasks[target]
@@ -267,24 +267,27 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
             weighted_total += weighted_duration
             del running_tasks[target]
         last_time = time
-    assert (len(running_tasks) == 0)
+    assert len(running_tasks) == 0
 
     # Warn if the sum of weighted times is off by more than half a second.
     if abs(length - weighted_total) > 500:
-        print('Warning: Possible corrupt ninja log, results may be '
-              'untrustworthy. Length = %.3f, weighted total = %.3f' %
-              (length, weighted_total))
+        print(
+            "Warning: Possible corrupt ninja log, results may be "
+            "untrustworthy. Length = %.3f, weighted total = %.3f"
+            % (length, weighted_total)
+        )
 
     # Print the slowest build steps:
-    print('    Longest build steps:')
+    print("    Longest build steps:")
     if elapsed_time_sorting:
         entries.sort(key=lambda x: x.Duration())
     else:
         entries.sort(key=lambda x: x.WeightedDuration())
     for target in entries[-long_count:]:
-        print('      %8.1f weighted s to build %s (%.1f s elapsed time)' %
-              (target.WeightedDuration(), target.DescribeTargets(),
-               target.Duration()))
+        print(
+            "      %8.1f weighted s to build %s (%.1f s elapsed time)"
+            % (target.WeightedDuration(), target.DescribeTargets(), target.Duration())
+        )
 
     # Sum up the time by file extension/type of the output file
     count_by_ext = {}
@@ -293,51 +296,56 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
     # Scan through all of the targets to build up per-extension statistics.
     for target in entries:
         extension = GetExtension(target, extra_step_types)
-        time_by_ext[extension] = time_by_ext.get(extension,
-                                                 0) + target.Duration()
-        weighted_time_by_ext[extension] = weighted_time_by_ext.get(
-            extension, 0) + target.WeightedDuration()
+        time_by_ext[extension] = time_by_ext.get(extension, 0) + target.Duration()
+        weighted_time_by_ext[extension] = (
+            weighted_time_by_ext.get(extension, 0) + target.WeightedDuration()
+        )
         count_by_ext[extension] = count_by_ext.get(extension, 0) + 1
 
-    print('    Time by build-step type:')
+    print("    Time by build-step type:")
     # Copy to a list with extension name and total time swapped, to (time, ext)
     if elapsed_time_sorting:
-        weighted_time_by_ext_sorted = sorted(
-            (y, x) for (x, y) in time_by_ext.items())
+        weighted_time_by_ext_sorted = sorted((y, x) for (x, y) in time_by_ext.items())
     else:
         weighted_time_by_ext_sorted = sorted(
-            (y, x) for (x, y) in weighted_time_by_ext.items())
+            (y, x) for (x, y) in weighted_time_by_ext.items()
+        )
     # Print the slowest build target types:
     for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]:
         print(
-            '      %8.1f s weighted time to generate %d %s files '
-            '(%1.1f s elapsed time sum)' %
-            (time, count_by_ext[extension], extension, time_by_ext[extension]))
+            "      %8.1f s weighted time to generate %d %s files "
+            "(%1.1f s elapsed time sum)"
+            % (time, count_by_ext[extension], extension, time_by_ext[extension])
+        )
 
-    print('    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
-          'parallelism)' %
-          (length, total_cpu_time, total_cpu_time * 1.0 / length))
-    print('    %d build steps completed, average of %1.2f/s' %
-          (len(entries), len(entries) / (length)))
+    print(
+        "    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx "
+        "parallelism)" % (length, total_cpu_time, total_cpu_time * 1.0 / length)
+    )
+    print(
+        "    %d build steps completed, average of %1.2f/s"
+        % (len(entries), len(entries) / (length))
+    )
 
 
 def main():
-    log_file = '.ninja_log'
-    metrics_file = 'siso_metrics.json'
+    log_file = ".ninja_log"
+    metrics_file = "siso_metrics.json"
     parser = argparse.ArgumentParser()
-    parser.add_argument('-C', dest='build_directory', help='Build directory.')
+    parser.add_argument("-C", dest="build_directory", help="Build directory.")
     parser.add_argument(
-        '-s',
-        '--step-types',
-        help='semicolon separated fnmatch patterns for build-step grouping')
+        "-s",
+        "--step-types",
+        help="semicolon separated fnmatch patterns for build-step grouping",
+    )
     parser.add_argument(
-        '-e',
-        '--elapsed_time_sorting',
+        "-e",
+        "--elapsed_time_sorting",
         default=False,
-        action='store_true',
-        help='Sort output by elapsed time instead of weighted time')
-    parser.add_argument('--log-file',
-                        help="specific ninja log file to analyze.")
+        action="store_true",
+        help="Sort output by elapsed time instead of weighted time",
+    )
+    parser.add_argument("--log-file", help="specific ninja log file to analyze.")
     args, _extra_args = parser.parse_known_args()
     if args.build_directory:
         log_file = os.path.join(args.build_directory, log_file)
@@ -348,34 +356,35 @@ def main():
         # Offer a convenient way to add extra step types automatically,
         # including when this script is run by autoninja. get() returns None if
         # the variable isn't set.
-        args.step_types = os.environ.get('chromium_step_types')
+        args.step_types = os.environ.get("chromium_step_types")
     if args.step_types:
         # Make room for the extra build types.
         global long_ext_count
-        long_ext_count += len(args.step_types.split(';'))
+        long_ext_count += len(args.step_types.split(";"))
 
     if os.path.exists(metrics_file):
         # Automatically handle summarizing siso builds.
-        cmd = ['siso.bat' if 'win32' in sys.platform else 'siso']
-        cmd.extend(['metrics', 'summary'])
+        cmd = ["siso.bat" if "win32" in sys.platform else "siso"]
+        cmd.extend(["metrics", "summary"])
         if args.build_directory:
-            cmd.extend(['-C', args.build_directory])
+            cmd.extend(["-C", args.build_directory])
         if args.step_types:
-            cmd.extend(['--step_types', args.step_types])
+            cmd.extend(["--step_types", args.step_types])
         if args.elapsed_time_sorting:
-            cmd.append('--elapsed_time_sorting')
+            cmd.append("--elapsed_time_sorting")
         subprocess.run(cmd)
     else:
         try:
-            with open(log_file, 'r') as log:
+            with open(log_file, "r") as log:
                 entries = ReadTargets(log, False)
                 if entries:
-                    SummarizeEntries(entries, args.step_types,
-                                     args.elapsed_time_sorting)
+                    SummarizeEntries(
+                        entries, args.step_types, args.elapsed_time_sorting
+                    )
         except IOError:
-            print('Log file %r not found, no build summary created.' % log_file)
+            print("Log file %r not found, no build summary created." % log_file)
             return errno.ENOENT
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())
diff --git a/ci/sccache_hit_rate.sh b/ci/sccache_hit_rate.sh
index de8ae46..3284aa4 100755
--- a/ci/sccache_hit_rate.sh
+++ b/ci/sccache_hit_rate.sh
@@ -37,5 +37,5 @@ if [ $requests_diff -eq 0 ]; then
 else
     hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}')
     echo "sccache hit rate: $hit_rate%" >&2
-    echo "$hit_rate" 
+    echo "$hit_rate"
 fi
diff --git a/cmake/DetectSupportedStandards.cmake b/cmake/DetectSupportedStandards.cmake
index 6a86d6a..7dd186a 100644
--- a/cmake/DetectSupportedStandards.cmake
+++ b/cmake/DetectSupportedStandards.cmake
@@ -1,4 +1,4 @@
-# Detect the langauge standards supported by the current compilers.
+# Detect the language standards supported by the current compilers.
 #
 # Usage: detect_supported_cxx_standards(<var_prefix> <lang> <standards>)
 #
diff --git a/cmake/NVBenchUtilities.cmake b/cmake/NVBenchUtilities.cmake
index caa79b8..ecef18e 100644
--- a/cmake/NVBenchUtilities.cmake
+++ b/cmake/NVBenchUtilities.cmake
@@ -14,7 +14,7 @@
 # limitations under the License.
 
 # Passes all args directly to execute_process while setting up the following
-# results variables and propogating them to the caller's scope:
+# results variables and propagating them to the caller's scope:
 #
 # - nvbench_process_exit_code
 # - nvbench_process_stdout
diff --git a/cmake/header_test.in.cxx b/cmake/header_test.in.cxx
index 8772c83..3cf2897 100644
--- a/cmake/header_test.in.cxx
+++ b/cmake/header_test.in.cxx
@@ -9,9 +9,9 @@
 // a potential macro collision and halts.
 //
 // Hacky way to build a string, but it works on all tested platforms.
-#define NVBench_MACRO_CHECK(MACRO, HEADER)                                      \
-  NVBench_MACRO_CHECK_IMPL(Identifier MACRO should not be used from NVBench      \
-                           headers due to conflicts with HEADER macros.)
+#define NVBench_MACRO_CHECK(MACRO, HEADER)                                                         \
+  NVBench_MACRO_CHECK_IMPL(                                                                        \
+    Identifier MACRO should not be used from NVBench headers due to conflicts with HEADER macros.)
 
 // Use raw platform checks instead of the NVBench_HOST_COMPILER macros since we
 // don't want to #include any headers other than the one being tested.
@@ -34,8 +34,8 @@
 // library implementations unconditionally `#undef` these macros, which then
 // causes random failures later.
 // Leaving these commented out as a warning: Here be dragons.
-//#define min(...) NVBench_MACRO_CHECK('min', windows.h)
-//#define max(...) NVBench_MACRO_CHECK('max', windows.h)
+// #define min(...) NVBench_MACRO_CHECK('min', windows.h)
+// #define max(...) NVBench_MACRO_CHECK('max', windows.h)
 
 // termios.h conflicts (NVIDIA/thrust#1547)
 #define B0 NVBench_MACRO_CHECK("B0", termios.h)
diff --git a/docs/cli_help.md b/docs/cli_help.md
index 424c1be..d6af964 100644
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -90,7 +90,7 @@
     before any `--benchmark` arguments.
 
 * `--stopping-criterion <criterion>`
-  * After `--min-samples` is satisfied, use `<criterion>` to detect if enough 
+  * After `--min-samples` is satisfied, use `<criterion>` to detect if enough
     samples were collected.
   * Only applies to Cold measurements.
   * Default is stdrel (`--stopping-criterion stdrel`)
diff --git a/examples/auto_throughput.cu b/examples/auto_throughput.cu
index 14b6b94..6ce641a 100644
--- a/examples/auto_throughput.cu
+++ b/examples/auto_throughput.cu
@@ -24,37 +24,33 @@
 template <int ItemsPerThread>
 __global__ void kernel(std::size_t stride,
                        std::size_t elements,
-                       const nvbench::int32_t * __restrict__ in,
+                       const nvbench::int32_t *__restrict__ in,
                        nvbench::int32_t *__restrict__ out)
 {
-  const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const std::size_t tid  = threadIdx.x + blockIdx.x * blockDim.x;
   const std::size_t step = gridDim.x * blockDim.x;
 
-  for (std::size_t i = stride * tid;
-       i < stride * elements;
-       i += stride * step)
+  for (std::size_t i = stride * tid; i < stride * elements; i += stride * step)
   {
     for (int j = 0; j < ItemsPerThread; j++)
     {
-      const auto read_id = (ItemsPerThread * i + j) % elements;
+      const auto read_id  = (ItemsPerThread * i + j) % elements;
       const auto write_id = tid + j * elements;
-      out[write_id] = in[read_id];
+      out[write_id]       = in[read_id];
     }
   }
 }
 
-
 // `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
 // and cache hit rates.
 //
 // Calling state.collect_*() enables particular metric collection if nvbench
 // was build with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
 template <int ItemsPerThread>
-void throughput_bench(nvbench::state &state,
-                      nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
+void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
 {
   // Allocate input data:
-  const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
+  const std::size_t stride   = static_cast<std::size_t>(state.get_int64("Stride"));
   const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
   thrust::device_vector<nvbench::int32_t> input(elements);
   thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
@@ -72,12 +68,11 @@ void throughput_bench(nvbench::state &state,
     static_cast<int>((elements + threads_in_block - 1) / threads_in_block);
 
   state.exec([&](nvbench::launch &launch) {
-    kernel<ItemsPerThread>
-      <<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
-        stride,
-        elements,
-        thrust::raw_pointer_cast(input.data()),
-        thrust::raw_pointer_cast(output.data()));
+    kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
+      stride,
+      elements,
+      thrust::raw_pointer_cast(input.data()),
+      thrust::raw_pointer_cast(output.data()));
   });
 }
 
diff --git a/examples/axes.cu b/examples/axes.cu
index 44ae598..59139b5 100644
--- a/examples/axes.cu
+++ b/examples/axes.cu
@@ -71,18 +71,16 @@ void copy_sweep_grid_shape(nvbench::state &state)
   thrust::device_vector<nvbench::int32_t> in(num_values, 0);
   thrust::device_vector<nvbench::int32_t> out(num_values, 0);
 
-  state.exec(
-    [block_size,
-     num_blocks,
-     num_values,
-     in_ptr  = thrust::raw_pointer_cast(in.data()),
-     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
-      (void) num_values; // clang thinks this is unused...
-      nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
-        in_ptr,
-        out_ptr,
-        num_values);
-    });
+  state.exec([block_size,
+              num_blocks,
+              num_values,
+              in_ptr  = thrust::raw_pointer_cast(in.data()),
+              out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(in_ptr,
+                                                                             out_ptr,
+                                                                             num_values);
+  });
 }
 NVBENCH_BENCH(copy_sweep_grid_shape)
   // Every second power of two from  64->1024:
@@ -107,15 +105,12 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
   thrust::device_vector<ValueType> in(num_values, 0);
   thrust::device_vector<ValueType> out(num_values, 0);
 
-  state.exec(
-    [num_values,
-     in_ptr  = thrust::raw_pointer_cast(in.data()),
-     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
-      (void) num_values; // clang thinks this is unused...
-      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
-                                                                 out_ptr,
-                                                                 num_values);
-    });
+  state.exec([num_values,
+              in_ptr  = thrust::raw_pointer_cast(in.data()),
+              out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
+  });
 }
 // Define a type_list to use for the type axis:
 using cts_types = nvbench::type_list<nvbench::uint8_t,
@@ -131,11 +126,10 @@ NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
 // Convert 64 MiB of InputTypes to OutputTypes, represented with various
 // value_types.
 template <typename InputType, typename OutputType>
-void copy_type_conversion_sweep(nvbench::state &state,
-                                nvbench::type_list<InputType, OutputType>)
+void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
 {
   // Optional: Skip narrowing conversions.
-  if constexpr(sizeof(InputType) > sizeof(OutputType))
+  if constexpr (sizeof(InputType) > sizeof(OutputType))
   {
     state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
     return;
@@ -154,15 +148,12 @@ void copy_type_conversion_sweep(nvbench::state &state,
   thrust::device_vector<InputType> in(num_values, 0);
   thrust::device_vector<OutputType> out(num_values, 0);
 
-  state.exec(
-    [num_values,
-     in_ptr  = thrust::raw_pointer_cast(in.data()),
-     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
-      (void) num_values; // clang thinks this is unused...
-      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
-                                                                 out_ptr,
-                                                                 num_values);
-    });
+  state.exec([num_values,
+              in_ptr  = thrust::raw_pointer_cast(in.data()),
+              out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
+  });
 }
 // Optional: Skip when InputType == OutputType. This approach avoids
 // instantiating the benchmark at all.
@@ -178,6 +169,5 @@ using ctcs_types = nvbench::type_list<nvbench::int8_t,
                                       nvbench::float32_t,
                                       nvbench::int64_t,
                                       nvbench::float64_t>;
-NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
-                    NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
+NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
   .set_type_axes_names({"In", "Out"});
diff --git a/examples/custom_criterion.cu b/examples/custom_criterion.cu
index 4661235..3be3743 100644
--- a/examples/custom_criterion.cu
+++ b/examples/custom_criterion.cu
@@ -36,10 +36,7 @@ public:
 
 protected:
   // Setup the criterion in the `do_initialize()` method:
-  virtual void do_initialize() override
-  {
-    m_num_samples = 0;
-  }
+  virtual void do_initialize() override { m_num_samples = 0; }
 
   // Process new measurements in the `add_measurement()` method:
   virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
@@ -52,7 +49,6 @@ protected:
   {
     return m_num_samples >= m_params.get_int64("max-samples");
   }
-
 };
 
 // Register the criterion with NVBench:
@@ -71,7 +67,7 @@ void throughput_bench(nvbench::state &state)
   state.add_global_memory_writes<nvbench::int32_t>(num_values);
 
   state.exec(nvbench::exec_tag::no_batch, [&input, &output, num_values](nvbench::launch &launch) {
-    (void) num_values; // clang thinks this is unused...
+    (void)num_values; // clang thinks this is unused...
     nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
       thrust::raw_pointer_cast(input.data()),
       thrust::raw_pointer_cast(output.data()),
diff --git a/examples/enums.cu b/examples/enums.cu
index fa149ac..ff3bef2 100644
--- a/examples/enums.cu
+++ b/examples/enums.cu
@@ -17,7 +17,6 @@
  */
 
 #include <nvbench/nvbench.cuh>
-
 #include <nvbench/test_kernels.cuh>
 
 // Enum to use as parameter axis:
@@ -68,12 +67,10 @@ void runtime_enum_sweep_string(nvbench::state &state)
   // Create inputs, etc, configure runtime kernel parameters, etc.
 
   // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
-NVBENCH_BENCH(runtime_enum_sweep_string)
-  .add_string_axis("MyEnum", {"A", "B", "C"});
+NVBENCH_BENCH(runtime_enum_sweep_string).add_string_axis("MyEnum", {"A", "B", "C"});
 
 //==============================================================================
 // Sweep through enum values at runtime using an int64 axis.
@@ -97,9 +94,8 @@ void runtime_enum_sweep_int64(nvbench::state &state)
   // Create inputs, etc, configure runtime kernel parameters, etc.
 
   // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
 NVBENCH_BENCH(runtime_enum_sweep_int64)
   .add_int64_axis("MyEnum",
@@ -178,12 +174,10 @@ void compile_time_enum_sweep(nvbench::state &state,
   // Template parameters, static dispatch, etc.
 
   // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
-using MyEnumList =
-  nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
+using MyEnumList = nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
 NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
   .set_type_axes_names({"MyEnum"});
 
@@ -199,16 +193,14 @@ NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
 //  * `-12` (struct std::integral_constant<int,-12>)
 // ```
 template <nvbench::int32_t IntValue>
-void compile_time_int_sweep(nvbench::state &state,
-                            nvbench::type_list<nvbench::enum_type<IntValue>>)
+void compile_time_int_sweep(nvbench::state &state, nvbench::type_list<nvbench::enum_type<IntValue>>)
 {
   // Use IntValue in compile time contexts.
   // Template parameters, static dispatch, etc.
 
   // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
 using MyInts = nvbench::enum_type_list<0, 16, 4096, -12>;
 NVBENCH_BENCH_TYPES(compile_time_int_sweep, NVBENCH_TYPE_AXES(MyInts))
diff --git a/examples/exec_tag_sync.cu b/examples/exec_tag_sync.cu
index 1366931..02c57c5 100644
--- a/examples/exec_tag_sync.cu
+++ b/examples/exec_tag_sync.cu
@@ -53,9 +53,7 @@ void sequence_bench(nvbench::state &state)
 
   // nvbench::exec_tag::sync indicates that this will implicitly sync:
   state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
-    thrust::sequence(thrust::device.on(launch.get_stream()),
-                     data.begin(),
-                     data.end());
+    thrust::sequence(thrust::device.on(launch.get_stream()), data.begin(), data.end());
   });
 }
 NVBENCH_BENCH(sequence_bench);
diff --git a/examples/exec_tag_timer.cu b/examples/exec_tag_timer.cu
index e283f43..2eda842 100644
--- a/examples/exec_tag_timer.cu
+++ b/examples/exec_tag_timer.cu
@@ -23,8 +23,8 @@
 
 // Thrust simplifies memory management, etc:
 #include <thrust/copy.h>
-#include <thrust/execution_policy.h>
 #include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
 #include <thrust/sequence.h>
 
 // mod2_inplace performs an in-place mod2 over every element in `data`. `data`
@@ -54,7 +54,7 @@ void mod2_inplace(nvbench::state &state)
   state.exec(nvbench::exec_tag::timer,
              // Lambda now takes a `timer` argument:
              [&input, &data, num_values](nvbench::launch &launch, auto &timer) {
-               (void) num_values; // clang thinks this is unused...
+               (void)num_values; // clang thinks this is unused...
 
                // Reset working data:
                thrust::copy(thrust::device.on(launch.get_stream()),
diff --git a/examples/skip.cu b/examples/skip.cu
index fc96656..2757fac 100644
--- a/examples/skip.cu
+++ b/examples/skip.cu
@@ -72,14 +72,12 @@ NVBENCH_BENCH(runtime_skip)
 // Two type axes are swept, but configurations where InputType == OutputType are
 // skipped.
 template <typename InputType, typename OutputType>
-void skip_overload(nvbench::state &state,
-                   nvbench::type_list<InputType, OutputType>)
+void skip_overload(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
 {
   // This is a contrived example that focuses on the skip overloads, so this is
   // just a sleep kernel:
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
 // Overload of skip_overload that is called when InputType == OutputType.
 template <typename T>
@@ -107,9 +105,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
 {
   // This is a contrived example that focuses on the skip overloads, so this is
   // just a sleep kernel:
-  state.exec([](nvbench::launch &launch) {
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
-  });
+  state.exec(
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
 }
 // Enable this overload if InputType is larger than OutputType
 template <typename InputType, typename OutputType>
@@ -119,10 +116,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
   state.skip("sizeof(InputType) > sizeof(OutputType).");
 }
 // The same type_list is used for both inputs/outputs.
-using sn_types = nvbench::type_list<nvbench::int8_t,
-                                    nvbench::int16_t,
-                                    nvbench::int32_t,
-                                    nvbench::int64_t>;
+using sn_types =
+  nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
 // Setup benchmark:
 NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
   .set_type_axes_names({"In", "Out"});
diff --git a/examples/stream.cu b/examples/stream.cu
index 20254e5..2373b32 100644
--- a/examples/stream.cu
+++ b/examples/stream.cu
@@ -52,7 +52,7 @@ void stream_bench(nvbench::state &state)
   state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
 
   state.exec([&input, &output, num_values](nvbench::launch &) {
-    (void) num_values; // clang thinks this is unused...
+    (void)num_values; // clang thinks this is unused...
     copy(thrust::raw_pointer_cast(input.data()),
          thrust::raw_pointer_cast(output.data()),
          num_values);
diff --git a/examples/summaries.cu b/examples/summaries.cu
index 57428c4..ad4fc06 100644
--- a/examples/summaries.cu
+++ b/examples/summaries.cu
@@ -26,8 +26,8 @@
 void summary_example(nvbench::state &state)
 {
   // Fetch parameters and compute duration in seconds:
-  const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
-  const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
+  const auto ms       = static_cast<nvbench::float64_t>(state.get_int64("ms"));
+  const auto us       = static_cast<nvbench::float64_t>(state.get_int64("us"));
   const auto duration = ms * 1e-3 + us * 1e-6;
 
   // Add a new column to the summary table with the derived duration used by the benchmark.
diff --git a/examples/throughput.cu b/examples/throughput.cu
index 24df6ee..5168a3f 100644
--- a/examples/throughput.cu
+++ b/examples/throughput.cu
@@ -51,7 +51,7 @@ void throughput_bench(nvbench::state &state)
   state.add_global_memory_writes<nvbench::int32_t>(num_values);
 
   state.exec([&input, &output, num_values](nvbench::launch &launch) {
-    (void) num_values; // clang thinks this is unused...
+    (void)num_values; // clang thinks this is unused...
     nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
       thrust::raw_pointer_cast(input.data()),
       thrust::raw_pointer_cast(output.data()),
diff --git a/exec/nvbench-ctl.cu b/exec/nvbench-ctl.cu
index 3968440..be26d6c 100644
--- a/exec/nvbench-ctl.cu
+++ b/exec/nvbench-ctl.cu
@@ -1,20 +1,20 @@
 /*
-*  Copyright 2021 NVIDIA Corporation
-*
-*  Licensed under the Apache License, Version 2.0 with the LLVM exception
-*  (the "License"); you may not use this file except in compliance with
-*  the License.
-*
-*  You may obtain a copy of the License at
-*
-*      http://llvm.org/foundation/relicensing/LICENSE.txt
-*
-*  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
-*  limitations under the License.
-*/
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
 
 #include <nvbench/main.cuh>
 
@@ -24,7 +24,7 @@ int main(int argc, char const *const *argv)
 try
 {
   // If no args, substitute a new argv that prints the version
-  std::vector<const char*> alt_argv;
+  std::vector<const char *> alt_argv;
   if (argc == 1)
   {
     alt_argv.push_back("--version");
@@ -36,7 +36,7 @@ try
   NVBENCH_CUDA_CALL(cudaDeviceReset());
   return 0;
 }
-catch (std::exception & e)
+catch (std::exception &e)
 {
   std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";
   return 1;
diff --git a/nvbench/axes_metadata.cxx b/nvbench/axes_metadata.cxx
index 188c689..7afc900 100644
--- a/nvbench/axes_metadata.cxx
+++ b/nvbench/axes_metadata.cxx
@@ -19,13 +19,13 @@
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/detail/throw.cuh>
 
+#include <fmt/format.h>
+#include <fmt/ranges.h>
+
 #include <algorithm>
 #include <cassert>
 #include <stdexcept>
 
-#include <fmt/format.h>
-#include <fmt/ranges.h>
-
 namespace nvbench
 {
 
diff --git a/nvbench/axis_base.cxx b/nvbench/axis_base.cxx
index 166f1ba..f2d4a39 100644
--- a/nvbench/axis_base.cxx
+++ b/nvbench/axis_base.cxx
@@ -16,7 +16,7 @@
  *  limitations under the License.
  */
 
-#include "axis_base.cuh"
+#include <nvbench/axis_base.cuh>
 
 namespace nvbench
 {
diff --git a/nvbench/benchmark.cuh b/nvbench/benchmark.cuh
index a226070..c985793 100644
--- a/nvbench/benchmark.cuh
+++ b/nvbench/benchmark.cuh
@@ -18,9 +18,8 @@
 
 #pragma once
 
-#include <nvbench/benchmark_base.cuh>
-
 #include <nvbench/axes_metadata.cuh>
+#include <nvbench/benchmark_base.cuh>
 #include <nvbench/runner.cuh>
 #include <nvbench/type_list.cuh>
 
diff --git a/nvbench/benchmark_manager.cuh b/nvbench/benchmark_manager.cuh
index 51fab18..19406f4 100644
--- a/nvbench/benchmark_manager.cuh
+++ b/nvbench/benchmark_manager.cuh
@@ -45,7 +45,7 @@ struct benchmark_manager
    * benchmarks should be done here to avoid creating a CUDA context before we configure the CUDA
    * environment in `main`.
    */
-   void initialize();
+  void initialize();
 
   /**
    * Register a new benchmark.
diff --git a/nvbench/benchmark_manager.cxx b/nvbench/benchmark_manager.cxx
index 4ff9fe4..7fb59e1 100644
--- a/nvbench/benchmark_manager.cxx
+++ b/nvbench/benchmark_manager.cxx
@@ -17,9 +17,8 @@
  */
 
 #include <nvbench/benchmark_manager.cuh>
-
-#include <nvbench/device_manager.cuh>
 #include <nvbench/detail/throw.cuh>
+#include <nvbench/device_manager.cuh>
 
 #include <fmt/format.h>
 
@@ -37,8 +36,8 @@ benchmark_manager &benchmark_manager::get()
 
 void benchmark_manager::initialize()
 {
-  const auto& mgr = device_manager::get();
-  for (auto& bench : m_benchmarks)
+  const auto &mgr = device_manager::get();
+  for (auto &bench : m_benchmarks)
   {
     if (!bench->get_is_cpu_only())
     {
diff --git a/nvbench/blocking_kernel.cu b/nvbench/blocking_kernel.cu
index f347833..9514ee6 100644
--- a/nvbench/blocking_kernel.cu
+++ b/nvbench/blocking_kernel.cu
@@ -17,12 +17,10 @@
  */
 
 #include <nvbench/blocking_kernel.cuh>
-
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_stream.cuh>
-#include <nvbench/types.cuh>
-
 #include <nvbench/detail/throw.cuh>
+#include <nvbench/types.cuh>
 
 #include <cuda/std/chrono>
 
diff --git a/nvbench/criterion_manager.cuh b/nvbench/criterion_manager.cuh
index 6c60993..65678b8 100644
--- a/nvbench/criterion_manager.cuh
+++ b/nvbench/criterion_manager.cuh
@@ -24,7 +24,6 @@
 #include <nvbench/types.cuh>
 
 #include <memory>
-
 #include <unordered_map>
 
 namespace nvbench
@@ -40,14 +39,14 @@ public:
   /**
    * @return The singleton criterion_manager instance.
    */
-  static criterion_manager& get();
+  static criterion_manager &get();
 
   /**
    * Register a new stopping criterion.
    */
-  nvbench::stopping_criterion_base& add(std::unique_ptr<nvbench::stopping_criterion_base> criterion);
-  nvbench::stopping_criterion_base& get_criterion(const std::string& name);
-  const nvbench::stopping_criterion_base& get_criterion(const std::string& name) const;
+  nvbench::stopping_criterion_base &add(std::unique_ptr<nvbench::stopping_criterion_base> criterion);
+  nvbench::stopping_criterion_base &get_criterion(const std::string &name);
+  const nvbench::stopping_criterion_base &get_criterion(const std::string &name) const;
 
   using params_description = std::vector<std::pair<std::string, nvbench::named_values::type>>;
   params_description get_params_description() const;
diff --git a/nvbench/criterion_manager.cxx b/nvbench/criterion_manager.cxx
index f4857e9..d2b480c 100644
--- a/nvbench/criterion_manager.cxx
+++ b/nvbench/criterion_manager.cxx
@@ -41,7 +41,7 @@ criterion_manager &criterion_manager::get()
   return registry;
 }
 
-stopping_criterion_base& criterion_manager::get_criterion(const std::string& name)
+stopping_criterion_base &criterion_manager::get_criterion(const std::string &name)
 {
   auto iter = m_map.find(name);
   if (iter == m_map.end())
@@ -51,7 +51,8 @@ stopping_criterion_base& criterion_manager::get_criterion(const std::string& nam
   return *iter->second.get();
 }
 
-const nvbench::stopping_criterion_base& criterion_manager::get_criterion(const std::string& name) const
+const nvbench::stopping_criterion_base &
+criterion_manager::get_criterion(const std::string &name) const
 {
   auto iter = m_map.find(name);
   if (iter == m_map.end())
@@ -69,8 +70,7 @@ stopping_criterion_base &criterion_manager::add(std::unique_ptr<stopping_criteri
 
   if (!success)
   {
-    NVBENCH_THROW(std::runtime_error,
-                  "Stopping criterion \"{}\" is already registered.", name);
+    NVBENCH_THROW(std::runtime_error, "Stopping criterion \"{}\" is already registered.", name);
   }
 
   return *it->second.get();
diff --git a/nvbench/csv_printer.cu b/nvbench/csv_printer.cu
index a0db080..eba4f74 100644
--- a/nvbench/csv_printer.cu
+++ b/nvbench/csv_printer.cu
@@ -16,14 +16,12 @@
  *  limitations under the License.
  */
 
-#include <nvbench/csv_printer.cuh>
-
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/benchmark_base.cuh>
+#include <nvbench/csv_printer.cuh>
 #include <nvbench/device_info.cuh>
-#include <nvbench/summary.cuh>
-
 #include <nvbench/internal/table_builder.cuh>
+#include <nvbench/summary.cuh>
 
 #include <fmt/format.h>
 
@@ -169,7 +167,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
       std::size_t remaining = table.m_columns.size();
       for (const auto &col : table.m_columns)
       {
-        fmt::format_to(std::back_inserter(buffer), "{}{}", col.rows[i], (--remaining == 0) ? "" : ",");
+        fmt::format_to(std::back_inserter(buffer),
+                       "{}{}",
+                       col.rows[i],
+                       (--remaining == 0) ? "" : ",");
       }
       fmt::format_to(std::back_inserter(buffer), "\n");
     }
diff --git a/nvbench/cuda_timer.cuh b/nvbench/cuda_timer.cuh
index e1c6e66..082cdca 100644
--- a/nvbench/cuda_timer.cuh
+++ b/nvbench/cuda_timer.cuh
@@ -19,7 +19,6 @@
 #pragma once
 
 #include <nvbench/cuda_call.cuh>
-
 #include <nvbench/types.cuh>
 
 #include <cuda_runtime_api.h>
diff --git a/nvbench/cupti_profiler.cxx b/nvbench/cupti_profiler.cxx
index 6233ef0..6ce1cf7 100644
--- a/nvbench/cupti_profiler.cxx
+++ b/nvbench/cupti_profiler.cxx
@@ -17,7 +17,6 @@
  */
 
 #include <nvbench/cupti_profiler.cuh>
-
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/device_info.cuh>
 
@@ -54,7 +53,9 @@ void nvpw_call(const NVPA_Status status)
 {
   if (status != NVPA_STATUS_SUCCESS)
   {
-    NVBENCH_THROW(std::runtime_error, "NVPW call returned error: {}", static_cast<std::underlying_type_t<NVPA_Status>>(status));
+    NVBENCH_THROW(std::runtime_error,
+                  "NVPW call returned error: {}",
+                  static_cast<std::underlying_type_t<NVPA_Status>>(status));
   }
 }
 
diff --git a/nvbench/detail/entropy_criterion.cuh b/nvbench/detail/entropy_criterion.cuh
index b0e4ebe..33d5634 100644
--- a/nvbench/detail/entropy_criterion.cuh
+++ b/nvbench/detail/entropy_criterion.cuh
@@ -18,9 +18,9 @@
 
 #pragma once
 
-#include <nvbench/types.cuh>
-#include <nvbench/stopping_criterion.cuh>
 #include <nvbench/detail/ring_buffer.cuh>
+#include <nvbench/stopping_criterion.cuh>
+#include <nvbench/types.cuh>
 
 #include <vector>
 
@@ -38,7 +38,7 @@ class entropy_criterion final : public stopping_criterion_base
   nvbench::detail::ring_buffer<nvbench::float64_t> m_entropy_tracker{299};
 
   // Used to avoid re-allocating temporary memory
-  std::vector<nvbench::float64_t> m_probabilities; 
+  std::vector<nvbench::float64_t> m_probabilities;
 
   nvbench::float64_t compute_entropy();
 
@@ -49,7 +49,6 @@ protected:
   virtual void do_initialize() override;
   virtual void do_add_measurement(nvbench::float64_t measurement) override;
   virtual bool do_is_finished() override;
-  
 };
 
 } // namespace nvbench::detail
diff --git a/nvbench/detail/entropy_criterion.cxx b/nvbench/detail/entropy_criterion.cxx
index 6d9ba8c..4cc0668 100644
--- a/nvbench/detail/entropy_criterion.cxx
+++ b/nvbench/detail/entropy_criterion.cxx
@@ -21,7 +21,6 @@
 
 #include <cmath>
 
-
 namespace nvbench::detail
 {
 
@@ -40,7 +39,7 @@ void entropy_criterion::do_initialize()
   m_freq_tracker.clear();
 }
 
-nvbench::float64_t entropy_criterion::compute_entropy() 
+nvbench::float64_t entropy_criterion::compute_entropy()
 {
   const std::size_t n = m_freq_tracker.size();
   if (n == 0)
@@ -70,15 +69,15 @@ void entropy_criterion::do_add_measurement(nvbench::float64_t measurement)
   m_total_cuda_time += measurement;
 
   {
-    auto key = measurement;
+    auto key                = measurement;
     constexpr bool bin_keys = false;
 
-    if (bin_keys) 
+    if (bin_keys)
     {
       const auto resolution_us = 0.5;
-      const auto resulution_s = resolution_us / 1'000'000;
-      const auto epsilon = resulution_s * 2;
-      key = std::round(key / epsilon) * epsilon;
+      const auto resolution_s  = resolution_us / 1000000;
+      const auto epsilon       = resolution_s * 2;
+      key                      = std::round(key / epsilon) * epsilon;
     }
 
     // This approach is about 3x faster than `std::{unordered_,}map`
@@ -120,7 +119,7 @@ bool entropy_criterion::do_is_finished()
 
   const auto [slope, intercept] = statistics::compute_linear_regression(begin, end, mean);
 
-  if (statistics::slope2deg(slope) > m_params.get_float64("max-angle")) 
+  if (statistics::slope2deg(slope) > m_params.get_float64("max-angle"))
   {
     return false;
   }
diff --git a/nvbench/detail/measure_cold.cu b/nvbench/detail/measure_cold.cu
index 1421241..2bbf0e0 100644
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -25,13 +25,13 @@
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 
+#include <fmt/format.h>
+
 #include <algorithm>
 #include <chrono>
 #include <limits>
 #include <thread>
 
-#include <fmt/format.h>
-
 namespace nvbench::detail
 {
 
diff --git a/nvbench/detail/measure_cold.cuh b/nvbench/detail/measure_cold.cuh
index 80f8e5a..2138b3d 100644
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -18,8 +18,6 @@
 
 #pragma once
 
-#include <cuda_runtime.h>
-
 #include <nvbench/blocking_kernel.cuh>
 #include <nvbench/cpu_timer.cuh>
 #include <nvbench/cuda_call.cuh>
@@ -32,12 +30,13 @@
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
 #include <nvbench/stopping_criterion.cuh>
+#include <nvbench/types.cuh>
+
+#include <cuda_runtime.h>
 
 #include <utility>
 #include <vector>
 
-#include "nvbench/types.cuh"
-
 namespace nvbench
 {
 
diff --git a/nvbench/detail/measure_cpu_only.cuh b/nvbench/detail/measure_cpu_only.cuh
index a42c604..39580bc 100644
--- a/nvbench/detail/measure_cpu_only.cuh
+++ b/nvbench/detail/measure_cpu_only.cuh
@@ -19,13 +19,12 @@
 #pragma once
 
 #include <nvbench/cpu_timer.cuh>
+#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
+#include <nvbench/detail/statistics.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
 #include <nvbench/stopping_criterion.cuh>
 
-#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
-#include <nvbench/detail/statistics.cuh>
-
 #include <utility>
 #include <vector>
 
@@ -66,7 +65,7 @@ protected:
   nvbench::cpu_timer m_walltime_timer;
 
   nvbench::criterion_params m_criterion_params;
-  nvbench::stopping_criterion_base& m_stopping_criterion;
+  nvbench::stopping_criterion_base &m_stopping_criterion;
 
   bool m_run_once{false};
 
diff --git a/nvbench/detail/measure_cpu_only.cxx b/nvbench/detail/measure_cpu_only.cxx
index 69095a2..4e95dae 100644
--- a/nvbench/detail/measure_cpu_only.cxx
+++ b/nvbench/detail/measure_cpu_only.cxx
@@ -24,11 +24,11 @@
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 
+#include <fmt/format.h>
+
 #include <algorithm>
 #include <limits>
 
-#include <fmt/format.h>
-
 namespace nvbench::detail
 {
 
@@ -36,7 +36,8 @@ measure_cpu_only_base::measure_cpu_only_base(state &exec_state)
     : m_state{exec_state}
     , m_launch(m_state.get_cuda_stream())
     , m_criterion_params{exec_state.get_criterion_params()}
-    , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
+    , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(
+        exec_state.get_stopping_criterion())}
     , m_run_once{exec_state.get_run_once()}
     , m_min_samples{exec_state.get_min_samples()}
     , m_skip_time{exec_state.get_skip_time()}
@@ -72,7 +73,7 @@ void measure_cpu_only_base::run_trials_prologue() { m_walltime_timer.start(); }
 void measure_cpu_only_base::record_measurements()
 {
   // Update and record timers and counters:
-  const auto cur_cpu_time  = m_cpu_timer.get_duration();
+  const auto cur_cpu_time = m_cpu_timer.get_duration();
 
   m_min_cpu_time = std::min(m_min_cpu_time, cur_cpu_time);
   m_max_cpu_time = std::max(m_max_cpu_time, cur_cpu_time);
@@ -188,8 +189,7 @@ void measure_cpu_only_base::generate_summaries()
       auto &summ = m_state.add_summary("nv/cpu_only/bw/global/bytes_per_second");
       summ.set_string("name", "GlobalMem BW");
       summ.set_string("hint", "byte_rate");
-      summ.set_string("description",
-                      "Number of bytes read/written per second.");
+      summ.set_string("description", "Number of bytes read/written per second.");
       summ.set_float64("value", avg_used_gmem_bw);
     }
   } // bandwidth
@@ -210,9 +210,9 @@ void measure_cpu_only_base::generate_summaries()
 
     if (m_max_time_exceeded)
     {
-      const auto timeout = m_walltime_timer.get_duration();
+      const auto timeout   = m_walltime_timer.get_duration();
       const auto max_noise = m_criterion_params.get_float64("max-noise");
-      const auto min_time = m_criterion_params.get_float64("min-time");
+      const auto min_time  = m_criterion_params.get_float64("min-time");
 
       if (cpu_noise > max_noise)
       {
diff --git a/nvbench/detail/measure_cupti.cuh b/nvbench/detail/measure_cupti.cuh
index ec7b212..9d9f689 100644
--- a/nvbench/detail/measure_cupti.cuh
+++ b/nvbench/detail/measure_cupti.cuh
@@ -24,13 +24,12 @@
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_timer.cuh>
 #include <nvbench/cupti_profiler.cuh>
-#include <nvbench/device_info.cuh>
-#include <nvbench/exec_tag.cuh>
-#include <nvbench/launch.cuh>
-
 #include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
 #include <nvbench/detail/l2flush.cuh>
 #include <nvbench/detail/statistics.cuh>
+#include <nvbench/device_info.cuh>
+#include <nvbench/exec_tag.cuh>
+#include <nvbench/launch.cuh>
 
 #include <cuda_runtime.h>
 
diff --git a/nvbench/detail/measure_hot.cu b/nvbench/detail/measure_hot.cu
index 9497122..2a38f16 100644
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -16,9 +16,8 @@
  *  limitations under the License.
  */
 
-#include <nvbench/detail/measure_hot.cuh>
-
 #include <nvbench/benchmark_base.cuh>
+#include <nvbench/detail/measure_hot.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/printer_base.cuh>
diff --git a/nvbench/detail/ring_buffer.cuh b/nvbench/detail/ring_buffer.cuh
index 5c00b24..e478eaa 100644
--- a/nvbench/detail/ring_buffer.cuh
+++ b/nvbench/detail/ring_buffer.cuh
@@ -19,12 +19,11 @@
 #pragma once
 
 #include <nvbench/config.cuh>
-
 #include <nvbench/detail/statistics.cuh>
 
+#include <cassert>
 #include <cstddef>
 #include <iterator>
-#include <cassert>
 #include <vector>
 
 namespace nvbench::detail
@@ -76,14 +75,14 @@ public:
     return temp;
   }
 
-  ring_buffer_iterator operator+(difference_type n) const 
-  { 
-    return ring_buffer_iterator(m_index + n, m_capacity, m_ptr); 
+  ring_buffer_iterator operator+(difference_type n) const
+  {
+    return ring_buffer_iterator(m_index + n, m_capacity, m_ptr);
   }
 
-  ring_buffer_iterator operator-(difference_type n) const 
-  { 
-    return ring_buffer_iterator(m_index - n, m_capacity, m_ptr); 
+  ring_buffer_iterator operator-(difference_type n) const
+  {
+    return ring_buffer_iterator(m_index - n, m_capacity, m_ptr);
   }
 
   difference_type operator-(const ring_buffer_iterator &other) const
@@ -121,13 +120,9 @@ private:
   std::size_t m_index{0};
   bool m_full{false};
 
-  std::size_t get_front_index() const 
-  {
-    return m_full ? m_index : 0;
-  }
+  std::size_t get_front_index() const { return m_full ? m_index : 0; }
 
 public:
-
   /**
    * Create a new ring buffer with the requested capacity.
    */
diff --git a/nvbench/detail/state_generator.cxx b/nvbench/detail/state_generator.cxx
index 26a897a..8a5ef82 100644
--- a/nvbench/detail/state_generator.cxx
+++ b/nvbench/detail/state_generator.cxx
@@ -16,15 +16,13 @@
  *  limitations under the License.
  */
 
-#include <nvbench/detail/state_generator.cuh>
-
 #include <nvbench/benchmark_base.cuh>
+#include <nvbench/detail/state_generator.cuh>
+#include <nvbench/detail/transform_reduce.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/named_values.cuh>
 #include <nvbench/type_axis.cuh>
 
-#include <nvbench/detail/transform_reduce.cuh>
-
 #include <algorithm>
 #include <cassert>
 #include <functional>
@@ -165,7 +163,7 @@ void state_generator::build_axis_configs()
         config.set_string(axis_info.axis, axis.get_input_string(axis_info.index));
       }
     } // type_si
-  }   // type_axis_config generation
+  } // type_axis_config generation
 
   // non_type_axis_config generation
   {
@@ -201,9 +199,9 @@ void state_generator::build_axis_configs()
                               axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
             break;
         } // switch (type)
-      }   // for (axis_info : current_indices)
-    }     // for non_type_sg configs
-  }       // non_type_axis_config generation
+      } // for (axis_info : current_indices)
+    } // for non_type_sg configs
+  } // non_type_axis_config generation
 }
 
 void state_generator::build_states()
diff --git a/nvbench/detail/statistics.cuh b/nvbench/detail/statistics.cuh
index 522b4f2..719d6f8 100644
--- a/nvbench/detail/statistics.cuh
+++ b/nvbench/detail/statistics.cuh
@@ -26,12 +26,10 @@
 #include <iterator>
 #include <limits>
 #include <numeric>
-#include <cmath>
-
 #include <type_traits>
 
 #ifndef M_PI
-  #define M_PI 3.14159265358979323846
+#define M_PI 3.14159265358979323846
 #endif
 
 namespace nvbench::detail::statistics
@@ -154,7 +152,7 @@ nvbench::float64_t compute_r2(It first,
 
   for (std::size_t i = 0; i < n; ++i, ++first)
   {
-    const nvbench::float64_t y = *first;
+    const nvbench::float64_t y      = *first;
     const nvbench::float64_t y_pred = slope * static_cast<nvbench::float64_t>(i) + intercept;
 
     ss_tot += (y - mean_y) * (y - mean_y);
@@ -179,19 +177,10 @@ compute_r2(It first, It last, nvbench::float64_t slope, nvbench::float64_t inter
   return compute_r2(first, last, compute_mean(first, last), slope, intercept);
 }
 
-inline nvbench::float64_t rad2deg(nvbench::float64_t rad)
-{
-  return rad * 180.0 / M_PI;
-}
+inline nvbench::float64_t rad2deg(nvbench::float64_t rad) { return rad * 180.0 / M_PI; }
 
-inline nvbench::float64_t slope2rad(nvbench::float64_t slope)
-{
-  return std::atan2(slope, 1.0);
-}
+inline nvbench::float64_t slope2rad(nvbench::float64_t slope) { return std::atan2(slope, 1.0); }
 
-inline nvbench::float64_t slope2deg(nvbench::float64_t slope)
-{
-  return rad2deg(slope2rad(slope));
-}
+inline nvbench::float64_t slope2deg(nvbench::float64_t slope) { return rad2deg(slope2rad(slope)); }
 
 } // namespace nvbench::detail::statistics
diff --git a/nvbench/detail/stdrel_criterion.cuh b/nvbench/detail/stdrel_criterion.cuh
index 5f87e84..8abdb73 100644
--- a/nvbench/detail/stdrel_criterion.cuh
+++ b/nvbench/detail/stdrel_criterion.cuh
@@ -18,9 +18,9 @@
 
 #pragma once
 
-#include <nvbench/types.cuh>
-#include <nvbench/stopping_criterion.cuh>
 #include <nvbench/detail/ring_buffer.cuh>
+#include <nvbench/stopping_criterion.cuh>
+#include <nvbench/types.cuh>
 
 #include <vector>
 
diff --git a/nvbench/detail/stdrel_criterion.cxx b/nvbench/detail/stdrel_criterion.cxx
index a6c5ea8..c0f7ef8 100644
--- a/nvbench/detail/stdrel_criterion.cxx
+++ b/nvbench/detail/stdrel_criterion.cxx
@@ -29,7 +29,7 @@ stdrel_criterion::stdrel_criterion()
 
 void stdrel_criterion::do_initialize()
 {
-  m_total_samples = 0;
+  m_total_samples   = 0;
   m_total_cuda_time = 0.0;
   m_cuda_times.clear();
   m_noise_tracker.clear();
@@ -46,7 +46,7 @@ void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement)
   const auto cuda_stdev     = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
                                                                           m_cuda_times.cend(),
                                                                           mean_cuda_time);
-  const auto cuda_rel_stdev       = cuda_stdev / mean_cuda_time;
+  const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
   if (std::isfinite(cuda_rel_stdev))
   {
     m_noise_tracker.push_back(cuda_rel_stdev);
diff --git a/nvbench/detail/throw.cuh b/nvbench/detail/throw.cuh
index e3bb9fd..00d05f4 100644
--- a/nvbench/detail/throw.cuh
+++ b/nvbench/detail/throw.cuh
@@ -19,6 +19,7 @@
 #pragma once
 
 #include <fmt/format.h>
+
 #include <stdexcept>
 
 #define NVBENCH_THROW(exception_type, format_str, ...)                                             \
diff --git a/nvbench/detail/timestamps_kernel.cu b/nvbench/detail/timestamps_kernel.cu
index 8fa7a64..31b138d 100644
--- a/nvbench/detail/timestamps_kernel.cu
+++ b/nvbench/detail/timestamps_kernel.cu
@@ -16,13 +16,13 @@
  *  limitations under the License.
  */
 
-#include <cuda_runtime.h>
-
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_stream.cuh>
 #include <nvbench/detail/timestamps_kernel.cuh>
 #include <nvbench/types.cuh>
 
+#include <cuda_runtime.h>
+
 #include <cstdio>
 #include <cstdlib>
 
@@ -71,12 +71,11 @@ void timestamps_kernel::record(const nvbench::cuda_stream &stream)
   int num_sms   = 0;
 
   NVBENCH_CUDA_CALL(cudaGetDevice(&device_id));
-  NVBENCH_CUDA_CALL(
-    cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id));
+  NVBENCH_CUDA_CALL(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id));
 
   get_timestamps_kernel<<<static_cast<unsigned int>(num_sms), 1, 0, stream.get_stream()>>>(
     m_device_timestamps,
     m_device_timestamps + 1);
 }
 
-} // namespace nvbench
+} // namespace nvbench::detail
diff --git a/nvbench/detail/type_list_impl.cuh b/nvbench/detail/type_list_impl.cuh
index e97aaaa..96ca901 100644
--- a/nvbench/detail/type_list_impl.cuh
+++ b/nvbench/detail/type_list_impl.cuh
@@ -82,7 +82,7 @@ struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Tail...>, TL,
   using tail_prod = typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
   using cur       = typename detail::prepend_each<T, tail_prod>::type;
   using next      = typename detail::cartesian_product<
-    nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
+         nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
   using type = decltype(detail::concat(cur{}, next{}));
 };
 
diff --git a/nvbench/device_info.cu b/nvbench/device_info.cu
index 3b26cdb..b7a6c05 100644
--- a/nvbench/device_info.cu
+++ b/nvbench/device_info.cu
@@ -16,11 +16,10 @@
  *  limitations under the License.
  */
 
-#include <nvbench/device_info.cuh>
-
 #include <nvbench/config.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/detail/device_scope.cuh>
+#include <nvbench/device_info.cuh>
 #include <nvbench/internal/nvml.cuh>
 
 #include <cuda_runtime_api.h>
diff --git a/nvbench/device_info.cuh b/nvbench/device_info.cuh
index 1bb5262..b31a096 100644
--- a/nvbench/device_info.cuh
+++ b/nvbench/device_info.cuh
@@ -18,17 +18,16 @@
 
 #pragma once
 
-#include <cuda_runtime_api.h>
-
 #include <nvbench/config.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/detail/device_scope.cuh>
 
+#include <cuda_runtime_api.h>
+
 #include <cstdint> // CHAR_BIT
 #include <stdexcept>
-#include <utility>
-
 #include <string_view>
+#include <utility>
 
 // forward declare this for internal storage
 struct nvmlDevice_st;
diff --git a/nvbench/device_manager.cu b/nvbench/device_manager.cu
index a70a18c..4ba40f7 100644
--- a/nvbench/device_manager.cu
+++ b/nvbench/device_manager.cu
@@ -16,13 +16,12 @@
  *  limitations under the License.
  */
 
-#include <nvbench/device_manager.cuh>
-
-#include <cuda_runtime_api.h>
-
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/detail/device_scope.cuh>
 #include <nvbench/detail/throw.cuh>
+#include <nvbench/device_manager.cuh>
+
+#include <cuda_runtime_api.h>
 
 namespace nvbench
 {
@@ -45,13 +44,13 @@ device_manager::device_manager()
   }
 }
 
-const nvbench::device_info &device_manager::get_device(int id) 
-{ 
-  if (id < 0) 
+const nvbench::device_info &device_manager::get_device(int id)
+{
+  if (id < 0)
   {
     NVBENCH_THROW(std::runtime_error, "Negative index: {}.", id);
   }
-  return m_devices.at(static_cast<std::size_t>(id)); 
+  return m_devices.at(static_cast<std::size_t>(id));
 }
 
 } // namespace nvbench
diff --git a/nvbench/exec_tag.cuh b/nvbench/exec_tag.cuh
index 26691b0..83bf271 100644
--- a/nvbench/exec_tag.cuh
+++ b/nvbench/exec_tag.cuh
@@ -101,10 +101,10 @@ using no_gpu_t        = tag<nvbench::detail::exec_flag::no_gpu>;
 using no_batch_t      = tag<nvbench::detail::exec_flag::no_batch>;
 using modifier_mask_t = tag<nvbench::detail::exec_flag::modifier_mask>;
 
-using hot_t           = tag<nvbench::detail::exec_flag::hot>;
-using cold_t          = tag<nvbench::detail::exec_flag::cold>;
-using cpu_only_t      = tag<nvbench::detail::exec_flag::cpu_only>;
-using measure_mask_t  = tag<nvbench::detail::exec_flag::measure_mask>;
+using hot_t          = tag<nvbench::detail::exec_flag::hot>;
+using cold_t         = tag<nvbench::detail::exec_flag::cold>;
+using cpu_only_t     = tag<nvbench::detail::exec_flag::cpu_only>;
+using measure_mask_t = tag<nvbench::detail::exec_flag::measure_mask>;
 
 constexpr inline none_t none;
 constexpr inline timer_t timer;
diff --git a/nvbench/float64_axis.cuh b/nvbench/float64_axis.cuh
index ef7b089..5e07b55 100644
--- a/nvbench/float64_axis.cuh
+++ b/nvbench/float64_axis.cuh
@@ -19,7 +19,6 @@
 #pragma once
 
 #include <nvbench/axis_base.cuh>
-
 #include <nvbench/types.cuh>
 
 #include <vector>
@@ -40,7 +39,10 @@ struct float64_axis final : public axis_base
   [[nodiscard]] nvbench::float64_t get_value(std::size_t i) const { return m_values[i]; }
 
 private:
-  std::unique_ptr<axis_base> do_clone() const final { return std::make_unique<float64_axis>(*this); }
+  std::unique_ptr<axis_base> do_clone() const final
+  {
+    return std::make_unique<float64_axis>(*this);
+  }
   std::size_t do_get_size() const final { return m_values.size(); }
   std::string do_get_input_string(std::size_t i) const final;
   std::string do_get_description(std::size_t i) const final;
diff --git a/nvbench/int64_axis.cuh b/nvbench/int64_axis.cuh
index adc95d1..d646309 100644
--- a/nvbench/int64_axis.cuh
+++ b/nvbench/int64_axis.cuh
@@ -19,7 +19,6 @@
 #pragma once
 
 #include <nvbench/axis_base.cuh>
-
 #include <nvbench/flags.cuh>
 #include <nvbench/types.cuh>
 
diff --git a/nvbench/int64_axis.cxx b/nvbench/int64_axis.cxx
index 599c388..16ca2e7 100644
--- a/nvbench/int64_axis.cxx
+++ b/nvbench/int64_axis.cxx
@@ -16,9 +16,8 @@
  *  limitations under the License.
  */
 
-#include <nvbench/int64_axis.cuh>
-
 #include <nvbench/detail/throw.cuh>
+#include <nvbench/int64_axis.cuh>
 
 #include <fmt/format.h>
 
diff --git a/nvbench/internal/markdown_table.cuh b/nvbench/internal/markdown_table.cuh
index 518f57b..c484dbe 100644
--- a/nvbench/internal/markdown_table.cuh
+++ b/nvbench/internal/markdown_table.cuh
@@ -19,7 +19,6 @@
 #pragma once
 
 #include <nvbench/detail/transform_reduce.cuh>
-
 #include <nvbench/internal/table_builder.cuh>
 
 #include <fmt/color.h>
diff --git a/nvbench/internal/nvml.cuh b/nvbench/internal/nvml.cuh
index 05c6764..656f5e1 100644
--- a/nvbench/internal/nvml.cuh
+++ b/nvbench/internal/nvml.cuh
@@ -21,12 +21,12 @@
 #include <nvbench/config.cuh>
 #include <nvbench/detail/throw.cuh>
 
-#include <fmt/format.h>
-
 #ifdef NVBENCH_HAS_NVML
 #include <nvml.h>
 #endif // NVBENCH_HAS_NVML
 
+#include <fmt/format.h>
+
 #include <stdexcept>
 
 namespace nvbench::nvml
@@ -38,6 +38,7 @@ struct NVMLLifetimeManager
 {
   NVMLLifetimeManager();
   ~NVMLLifetimeManager();
+
 private:
   bool m_inited{false};
 };
diff --git a/nvbench/json_printer.cu b/nvbench/json_printer.cu
index f7e337a..f3010a1 100644
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -16,24 +16,22 @@
  *  limitations under the License.
  */
 
-#include <nvbench/json_printer.cuh>
-
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/config.cuh>
+#include <nvbench/detail/throw.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/device_manager.cuh>
 #include <nvbench/git_revision.cuh>
+#include <nvbench/json_printer.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 #include <nvbench/version.cuh>
 
-#include <nvbench/detail/throw.cuh>
+#include <nlohmann/json.hpp>
 
 #include <fmt/format.h>
 
-#include <nlohmann/json.hpp>
-
 #include <cstdint>
 #include <fstream>
 #include <iterator>
@@ -105,7 +103,7 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values)
       default:
         NVBENCH_THROW(std::runtime_error, "{}", "Unrecognized value type.");
     } // end switch (value type)
-  }   // end foreach value name
+  } // end foreach value name
 }
 
 } // end namespace
@@ -225,27 +223,26 @@ static void add_devices_section(nlohmann::ordered_json &root)
   auto &devices = root["devices"];
   for (const auto &dev_info : nvbench::device_manager::get().get_devices())
   {
-    auto &device                    = devices.emplace_back();
-    device["id"]                    = dev_info.get_id();
-    device["name"]                  = dev_info.get_name();
-    device["sm_version"]            = dev_info.get_sm_version();
-    device["ptx_version"]           = dev_info.get_ptx_version();
-    device["sm_default_clock_rate"] = dev_info.get_sm_default_clock_rate();
-    device["number_of_sms"]         = dev_info.get_number_of_sms();
-    device["max_blocks_per_sm"]     = dev_info.get_max_blocks_per_sm();
-    device["max_threads_per_sm"]    = dev_info.get_max_threads_per_sm();
-    device["max_threads_per_block"] = dev_info.get_max_threads_per_block();
-    device["registers_per_sm"]      = dev_info.get_registers_per_sm();
-    device["registers_per_block"]   = dev_info.get_registers_per_block();
-    device["global_memory_size"]    = dev_info.get_global_memory_size();
-    device["global_memory_bus_peak_clock_rate"] =
-      dev_info.get_global_memory_bus_peak_clock_rate();
-    device["global_memory_bus_width"]     = dev_info.get_global_memory_bus_width();
-    device["global_memory_bus_bandwidth"] = dev_info.get_global_memory_bus_bandwidth();
-    device["l2_cache_size"]               = dev_info.get_l2_cache_size();
-    device["shared_memory_per_sm"]        = dev_info.get_shared_memory_per_sm();
-    device["shared_memory_per_block"]     = dev_info.get_shared_memory_per_block();
-    device["ecc_state"]                   = dev_info.get_ecc_state();
+    auto &device                                = devices.emplace_back();
+    device["id"]                                = dev_info.get_id();
+    device["name"]                              = dev_info.get_name();
+    device["sm_version"]                        = dev_info.get_sm_version();
+    device["ptx_version"]                       = dev_info.get_ptx_version();
+    device["sm_default_clock_rate"]             = dev_info.get_sm_default_clock_rate();
+    device["number_of_sms"]                     = dev_info.get_number_of_sms();
+    device["max_blocks_per_sm"]                 = dev_info.get_max_blocks_per_sm();
+    device["max_threads_per_sm"]                = dev_info.get_max_threads_per_sm();
+    device["max_threads_per_block"]             = dev_info.get_max_threads_per_block();
+    device["registers_per_sm"]                  = dev_info.get_registers_per_sm();
+    device["registers_per_block"]               = dev_info.get_registers_per_block();
+    device["global_memory_size"]                = dev_info.get_global_memory_size();
+    device["global_memory_bus_peak_clock_rate"] = dev_info.get_global_memory_bus_peak_clock_rate();
+    device["global_memory_bus_width"]           = dev_info.get_global_memory_bus_width();
+    device["global_memory_bus_bandwidth"]       = dev_info.get_global_memory_bus_bandwidth();
+    device["l2_cache_size"]                     = dev_info.get_l2_cache_size();
+    device["shared_memory_per_sm"]              = dev_info.get_shared_memory_per_sm();
+    device["shared_memory_per_block"]           = dev_info.get_shared_memory_per_block();
+    device["ecc_state"]                         = dev_info.get_ecc_state();
   }
 }
 
@@ -298,8 +295,8 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
           false;
 #endif
       } // "nvbench"
-    }   // "version"
-  }     // "meta"
+    } // "version"
+  } // "meta"
 
   add_devices_section(root);
 
@@ -362,8 +359,8 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
             default:
               break;
           } // end switch (axis type)
-        }   // end foreach axis value
-      }     // end foreach axis
+        } // end foreach axis value
+      } // end foreach axis
 
       auto &states = bench["states"];
       for (const auto &exec_state : bench_ptr->get_states())
@@ -431,8 +428,8 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
           continue;
         }
       } // end foreach exec_state
-    }   // end foreach benchmark
-  }     // "benchmarks"
+    } // end foreach benchmark
+  } // "benchmarks"
 
   m_ostream << root.dump(2) << "\n";
 }
@@ -492,7 +489,7 @@ void json_printer::do_print_benchmark_list(const benchmark_vector &benches)
           default:
             break;
         } // end switch (axis type)
-      }   // end foreach axis value
+      } // end foreach axis value
     }
   } // end foreach bench
 
diff --git a/nvbench/json_printer.cuh b/nvbench/json_printer.cuh
index 8457687..ae361ff 100644
--- a/nvbench/json_printer.cuh
+++ b/nvbench/json_printer.cuh
@@ -19,7 +19,6 @@
 #pragma once
 
 #include <nvbench/printer_base.cuh>
-
 #include <nvbench/types.cuh>
 
 #include <string>
diff --git a/nvbench/markdown_printer.cu b/nvbench/markdown_printer.cu
index db3d871..31c0bcf 100644
--- a/nvbench/markdown_printer.cu
+++ b/nvbench/markdown_printer.cu
@@ -16,15 +16,13 @@
  *  limitations under the License.
  */
 
-#include <nvbench/markdown_printer.cuh>
-
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/device_manager.cuh>
+#include <nvbench/internal/markdown_table.cuh>
+#include <nvbench/markdown_printer.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 
-#include <nvbench/internal/markdown_table.cuh>
-
 #include <fmt/color.h>
 #include <fmt/format.h>
 
@@ -72,8 +70,12 @@ void markdown_printer::do_print_device_info()
                    "* Max Shared Memory: {} KiB/SM, {} KiB/Block\n",
                    device.get_shared_memory_per_sm() / 1024,
                    device.get_shared_memory_per_block() / 1024);
-    fmt::format_to(std::back_inserter(buffer), "* L2 Cache Size: {} KiB\n", device.get_l2_cache_size() / 1024);
-    fmt::format_to(std::back_inserter(buffer), "* Maximum Active Blocks: {}/SM\n", device.get_max_blocks_per_sm());
+    fmt::format_to(std::back_inserter(buffer),
+                   "* L2 Cache Size: {} KiB\n",
+                   device.get_l2_cache_size() / 1024);
+    fmt::format_to(std::back_inserter(buffer),
+                   "* Maximum Active Blocks: {}/SM\n",
+                   device.get_max_blocks_per_sm());
     fmt::format_to(std::back_inserter(buffer),
                    "* Maximum Active Threads: {}/SM, {}/Block\n",
                    device.get_max_threads_per_sm(),
@@ -82,7 +84,9 @@ void markdown_printer::do_print_device_info()
                    "* Available Registers: {}/SM, {}/Block\n",
                    device.get_registers_per_sm(),
                    device.get_registers_per_block());
-    fmt::format_to(std::back_inserter(buffer), "* ECC Enabled: {}\n", device.get_ecc_state() ? "Yes" : "No");
+    fmt::format_to(std::back_inserter(buffer),
+                   "* ECC Enabled: {}\n",
+                   device.get_ecc_state() ? "Yes" : "No");
     fmt::format_to(std::back_inserter(buffer), "\n");
   }
   m_ostream << fmt::to_string(buffer);
@@ -191,9 +195,12 @@ void markdown_printer::do_print_benchmark_list(const printer_base::benchmark_vec
         {
           desc = fmt::format(" ({})", desc);
         }
-        fmt::format_to(std::back_inserter(buffer), "  * `{}`{}\n", axis_ptr->get_input_string(i), desc);
+        fmt::format_to(std::back_inserter(buffer),
+                       "  * `{}`{}\n",
+                       axis_ptr->get_input_string(i),
+                       desc);
       } // end foreach value
-    }   // end foreach axis
+    } // end foreach axis
     fmt::format_to(std::back_inserter(buffer), "\n");
   } // end foreach bench
 
diff --git a/nvbench/named_values.cxx b/nvbench/named_values.cxx
index e672301..f5042ab 100644
--- a/nvbench/named_values.cxx
+++ b/nvbench/named_values.cxx
@@ -16,10 +16,9 @@
  *  limitations under the License.
  */
 
-#include <nvbench/named_values.cuh>
-
 #include <nvbench/config.cuh>
 #include <nvbench/detail/throw.cuh>
+#include <nvbench/named_values.cuh>
 
 #include <fmt/format.h>
 
diff --git a/nvbench/nvbench.cuh b/nvbench/nvbench.cuh
index 3fb933f..bf1eb21 100644
--- a/nvbench/nvbench.cuh
+++ b/nvbench/nvbench.cuh
@@ -24,8 +24,8 @@
 #include <nvbench/callable.cuh>
 #include <nvbench/config.cuh>
 #include <nvbench/cpu_timer.cuh>
-#include <nvbench/criterion_manager.cuh>
 #include <nvbench/create.cuh>
+#include <nvbench/criterion_manager.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_stream.cuh>
 #include <nvbench/cuda_timer.cuh>
diff --git a/nvbench/option_parser.cu b/nvbench/option_parser.cu
index eef46f8..0341075 100644
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -34,6 +34,8 @@
 #include <nvbench/internal/cli_help.cuh>
 #include <nvbench/internal/cli_help_axis.cuh>
 
+#include <fmt/format.h>
+
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
@@ -44,12 +46,10 @@
 #include <regex>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <tuple>
 #include <vector>
 
-#include <fmt/format.h>
-#include <string_view>
-
 namespace
 {
 
diff --git a/nvbench/printer_base.cuh b/nvbench/printer_base.cuh
index 13cf803..444752e 100644
--- a/nvbench/printer_base.cuh
+++ b/nvbench/printer_base.cuh
@@ -191,9 +191,9 @@ protected:
   virtual void do_process_bulk_data_float64(nvbench::state &,
                                             const std::string &,
                                             const std::string &,
-                                            const std::vector<nvbench::float64_t> &){};
+                                            const std::vector<nvbench::float64_t> &) {};
 
-  virtual void do_print_benchmark_list(const benchmark_vector &) 
+  virtual void do_print_benchmark_list(const benchmark_vector &)
   {
     throw std::runtime_error{"nvbench::do_print_benchmark_list is not supported by this printer."};
   }
diff --git a/nvbench/runner.cuh b/nvbench/runner.cuh
index f32b222..3924296 100644
--- a/nvbench/runner.cuh
+++ b/nvbench/runner.cuh
@@ -19,7 +19,6 @@
 #pragma once
 
 #include <nvbench/benchmark_base.cuh>
-
 #include <nvbench/detail/state_generator.cuh>
 
 #include <stdexcept>
diff --git a/nvbench/runner.cxx b/nvbench/runner.cxx
index 93cedf5..09ddb46 100644
--- a/nvbench/runner.cxx
+++ b/nvbench/runner.cxx
@@ -16,10 +16,9 @@
  *  limitations under the License.
  */
 
-#include <nvbench/runner.cuh>
-
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/printer_base.cuh>
+#include <nvbench/runner.cuh>
 #include <nvbench/state.cuh>
 
 #include <fmt/format.h>
diff --git a/nvbench/state.cxx b/nvbench/state.cxx
index 49af11b..df4ec91 100644
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -20,13 +20,13 @@
 #include <nvbench/state.cuh>
 #include <nvbench/types.cuh>
 
+#include <fmt/color.h>
+#include <fmt/format.h>
+
 #include <algorithm>
 #include <stdexcept>
 #include <string>
 
-#include <fmt/color.h>
-#include <fmt/format.h>
-
 namespace nvbench
 {
 
diff --git a/nvbench/stopping_criterion.cuh b/nvbench/stopping_criterion.cuh
index 006a699..5daaaa5 100644
--- a/nvbench/stopping_criterion.cuh
+++ b/nvbench/stopping_criterion.cuh
@@ -21,19 +21,21 @@
 #include <nvbench/named_values.cuh>
 #include <nvbench/types.cuh>
 
-#include <string>
-
 #include <initializer_list>
+#include <string>
 #include <unordered_map>
 
 namespace nvbench
 {
 
-namespace detail 
+namespace detail
 {
 
-constexpr nvbench::float64_t compat_min_time() { return 0.5; }    // 0.5 seconds
-constexpr nvbench::float64_t compat_max_noise() { return 0.005; } // 0.5% relative standard deviation
+// 0.5 seconds
+constexpr nvbench::float64_t compat_min_time() { return 0.5; }
+
+// 0.5% relative standard deviation
+constexpr nvbench::float64_t compat_max_noise() { return 0.005; }
 
 } // namespace detail
 
@@ -43,6 +45,7 @@ constexpr nvbench::float64_t compat_max_noise() { return 0.005; } // 0.5% relati
 class criterion_params
 {
   nvbench::named_values m_named_values;
+
 public:
   criterion_params();
   criterion_params(std::initializer_list<std::pair<std::string, nvbench::named_values::value_type>>);
@@ -96,7 +99,7 @@ public:
    *
    * This method is called once per benchmark run, before any measurements are provided.
    */
-  void initialize(const criterion_params &params) 
+  void initialize(const criterion_params &params)
   {
     m_params.set_from(params);
     this->do_initialize();
@@ -105,18 +108,12 @@ public:
   /**
    * Add the latest measurement to the criterion
    */
-  void add_measurement(nvbench::float64_t measurement)
-  {
-    this->do_add_measurement(measurement);
-  }
+  void add_measurement(nvbench::float64_t measurement) { this->do_add_measurement(measurement); }
 
   /**
    * Check if the criterion has been met for all measurements processed by `add_measurement`
    */
-  bool is_finished()
-  {
-    return this->do_is_finished();
-  }
+  bool is_finished() { return this->do_is_finished(); }
 
 protected:
   /**
diff --git a/nvbench/stopping_criterion.cxx b/nvbench/stopping_criterion.cxx
index 976a1a7..f6a4ae5 100644
--- a/nvbench/stopping_criterion.cxx
+++ b/nvbench/stopping_criterion.cxx
@@ -16,10 +16,8 @@
  *  limitations under the License.
  */
 
-#include <nvbench/stopping_criterion.cuh>
-
 #include <nvbench/detail/throw.cuh>
-
+#include <nvbench/stopping_criterion.cuh>
 
 namespace nvbench
 {
@@ -62,7 +60,7 @@ void criterion_params::set_from(const criterion_params &other)
 
 void criterion_params::set_int64(std::string name, nvbench::int64_t value)
 {
-  if (m_named_values.has_value(name)) 
+  if (m_named_values.has_value(name))
   {
     m_named_values.remove_value(name);
   }
@@ -72,7 +70,7 @@ void criterion_params::set_int64(std::string name, nvbench::int64_t value)
 
 void criterion_params::set_float64(std::string name, nvbench::float64_t value)
 {
-  if (m_named_values.has_value(name)) 
+  if (m_named_values.has_value(name))
   {
     m_named_values.remove_value(name);
   }
@@ -82,7 +80,7 @@ void criterion_params::set_float64(std::string name, nvbench::float64_t value)
 
 void criterion_params::set_string(std::string name, std::string value)
 {
-  if (m_named_values.has_value(name)) 
+  if (m_named_values.has_value(name))
   {
     m_named_values.remove_value(name);
   }
@@ -110,15 +108,11 @@ std::string criterion_params::get_string(const std::string &name) const
   return m_named_values.get_string(name);
 }
 
-std::vector<std::string> criterion_params::get_names() const
-{
-  return m_named_values.get_names();
-}
+std::vector<std::string> criterion_params::get_names() const { return m_named_values.get_names(); }
 
 nvbench::named_values::type criterion_params::get_type(const std::string &name) const
 {
   return m_named_values.get_type(name);
 }
 
-
-} // namespace nvbench::detail
+} // namespace nvbench
diff --git a/nvbench/string_axis.cuh b/nvbench/string_axis.cuh
index a8af16e..7491870 100644
--- a/nvbench/string_axis.cuh
+++ b/nvbench/string_axis.cuh
@@ -19,7 +19,6 @@
 #pragma once
 
 #include <nvbench/axis_base.cuh>
-
 #include <nvbench/types.cuh>
 
 #include <vector>
diff --git a/nvbench/type_axis.cuh b/nvbench/type_axis.cuh
index 3a4c59a..3349ed9 100644
--- a/nvbench/type_axis.cuh
+++ b/nvbench/type_axis.cuh
@@ -19,7 +19,6 @@
 #pragma once
 
 #include <nvbench/axis_base.cuh>
-
 #include <nvbench/type_list.cuh>
 #include <nvbench/type_strings.cuh>
 
diff --git a/nvbench/type_axis.cxx b/nvbench/type_axis.cxx
index f89ec1d..b706971 100644
--- a/nvbench/type_axis.cxx
+++ b/nvbench/type_axis.cxx
@@ -16,9 +16,8 @@
  *  limitations under the License.
  */
 
-#include <nvbench/type_axis.cuh>
-
 #include <nvbench/detail/throw.cuh>
+#include <nvbench/type_axis.cuh>
 
 #include <fmt/format.h>
 #include <fmt/ranges.h>
diff --git a/nvbench/type_list.cuh b/nvbench/type_list.cuh
index e52425a..91118a8 100644
--- a/nvbench/type_list.cuh
+++ b/nvbench/type_list.cuh
@@ -18,7 +18,7 @@
 
 #pragma once
 
-#include "detail/type_list_impl.cuh"
+#include <nvbench/detail/type_list_impl.cuh>
 
 #include <tuple>
 #include <type_traits>
diff --git a/nvbench/type_strings.cxx b/nvbench/type_strings.cxx
index 5046db3..2aa9ac8 100644
--- a/nvbench/type_strings.cxx
+++ b/nvbench/type_strings.cxx
@@ -27,11 +27,11 @@
 #endif
 
 #ifdef NVBENCH_CXXABI_DEMANGLE
-#include <cxxabi.h>
-
 #include <cstdlib>
 #include <memory>
 
+#include <cxxabi.h>
+
 namespace
 {
 struct free_wrapper
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..86f9909
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,24 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[tool.ruff]
+target-version = "py310"
+fix = true
+show-fixes = true
+exclude = ["docs/tools"]
+
+[tool.ruff.lint]
+extend-select = ["I"]
+
+[tool.codespell]
+# To run codespell interactively and fix errors that pre-commit reports, try
+# `codespell -i 3 -w -H`. This runs with interactive review (-i 3), writes
+# changes to disk (-w), and includes hidden files (-H).
+# Note: pre-commit passes explicit lists of files here, which this skip list
+# doesn't override - the skip list exists only so that you can run codespell
+# interactively.
+skip = "./.git,./build,./CITATION.md"
+# ignore short words, and typename parameters like OffsetT
+ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
+ignore-words-list = "inout,imovable,optionN,aCount,quitted,Invokable,countr,unexpect,numer,euclidian,couldn,OffsetT,FromM"
+builtin = "clear"
+quiet-level = 3
diff --git a/scripts/nvbench_compare.py b/scripts/nvbench_compare.py
index 2f4fae1..d5ff0b5 100755
--- a/scripts/nvbench_compare.py
+++ b/scripts/nvbench_compare.py
@@ -5,12 +5,11 @@ import math
 import os
 import sys
 
-from colorama import Fore
-
 import tabulate
-
+from colorama import Fore
 from nvbench_json import reader
 
+
 # Parse version string into tuple, "x.y.z" -> (x, y, z)
 def version_tuple(v):
     return tuple(map(int, (v.split("."))))
@@ -139,15 +138,14 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
         colalign.append("center")
 
         for device_id in device_ids:
-
             rows = []
-            plot_data = {'cmp': {}, 'ref': {}, 'cmp_noise': {}, 'ref_noise': {}}
+            plot_data = {"cmp": {}, "ref": {}, "cmp_noise": {}, "ref_noise": {}}
 
             for cmp_state in cmp_states:
                 cmp_state_name = cmp_state["name"]
-                ref_state = next(filter(lambda st: st["name"] == cmp_state_name,
-                                        ref_states),
-                                 None)
+                ref_state = next(
+                    filter(lambda st: st["name"] == cmp_state_name, ref_states), None
+                )
                 if not ref_state:
                     continue
 
@@ -158,9 +156,7 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
                 row = []
                 for axis_value in axis_values:
                     axis_value_name = axis_value["name"]
-                    row.append(format_axis_value(axis_value_name,
-                                                 axis_value,
-                                                 axes))
+                    row.append(format_axis_value(axis_value_name, axis_value, axes))
 
                 cmp_summaries = cmp_state["summaries"]
                 ref_summaries = ref_state["summaries"]
@@ -171,23 +167,37 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
                 def lookup_summary(summaries, tag):
                     return next(filter(lambda s: s["tag"] == tag, summaries), None)
 
-                cmp_time_summary = lookup_summary(cmp_summaries, "nv/cold/time/gpu/mean")
-                ref_time_summary = lookup_summary(ref_summaries, "nv/cold/time/gpu/mean")
-                cmp_noise_summary = lookup_summary(cmp_summaries, "nv/cold/time/gpu/stdev/relative")
-                ref_noise_summary = lookup_summary(ref_summaries, "nv/cold/time/gpu/stdev/relative")
+                cmp_time_summary = lookup_summary(
+                    cmp_summaries, "nv/cold/time/gpu/mean"
+                )
+                ref_time_summary = lookup_summary(
+                    ref_summaries, "nv/cold/time/gpu/mean"
+                )
+                cmp_noise_summary = lookup_summary(
+                    cmp_summaries, "nv/cold/time/gpu/stdev/relative"
+                )
+                ref_noise_summary = lookup_summary(
+                    ref_summaries, "nv/cold/time/gpu/stdev/relative"
+                )
 
                 # TODO: Use other timings, too. Maybe multiple rows, with a
                 # "Timing" column + values "CPU/GPU/Batch"?
-                if not all([cmp_time_summary,
-                            ref_time_summary,
-                            cmp_noise_summary,
-                            ref_noise_summary]):
+                if not all(
+                    [
+                        cmp_time_summary,
+                        ref_time_summary,
+                        cmp_noise_summary,
+                        ref_noise_summary,
+                    ]
+                ):
                     continue
 
                 def extract_value(summary):
                     summary_data = summary["data"]
-                    value_data = next(filter(lambda v: v["name"] == "value", summary_data))
-                    assert(value_data["type"] == "float64")
+                    value_data = next(
+                        filter(lambda v: v["name"] == "value", summary_data)
+                    )
+                    assert value_data["type"] == "float64"
                     return value_data["value"]
 
                 cmp_time = extract_value(cmp_time_summary)
@@ -218,23 +228,27 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
                 if plot:
                     axis_name = []
                     axis_value = "--"
-                    for aid in range(len(axis_values)): 
+                    for aid in range(len(axis_values)):
                         if axis_values[aid]["name"] != plot:
-                           axis_name.append("{} = {}".format(axis_values[aid]["name"], axis_values[aid]["value"]))
+                            axis_name.append(
+                                "{} = {}".format(
+                                    axis_values[aid]["name"], axis_values[aid]["value"]
+                                )
+                            )
                         else:
-                           axis_value = float(axis_values[aid]["value"])
-                    axis_name = ', '.join(axis_name)
+                            axis_value = float(axis_values[aid]["value"])
+                    axis_name = ", ".join(axis_name)
 
-                    if axis_name not in plot_data['cmp']:
-                        plot_data['cmp'][axis_name] = {}
-                        plot_data['ref'][axis_name] = {}
-                        plot_data['cmp_noise'][axis_name] = {}
-                        plot_data['ref_noise'][axis_name] = {}
+                    if axis_name not in plot_data["cmp"]:
+                        plot_data["cmp"][axis_name] = {}
+                        plot_data["ref"][axis_name] = {}
+                        plot_data["cmp_noise"][axis_name] = {}
+                        plot_data["ref_noise"][axis_name] = {}
 
-                    plot_data['cmp'][axis_name][axis_value] = cmp_time
-                    plot_data['ref'][axis_name][axis_value] = ref_time
-                    plot_data['cmp_noise'][axis_name][axis_value] = cmp_noise
-                    plot_data['ref_noise'][axis_name][axis_value] = ref_noise
+                    plot_data["cmp"][axis_name][axis_value] = cmp_time
+                    plot_data["ref"][axis_name][axis_value] = ref_time
+                    plot_data["cmp_noise"][axis_name][axis_value] = cmp_noise
+                    plot_data["ref_noise"][axis_name][axis_value] = ref_noise
 
                 global config_count
                 global unknown_count
@@ -273,14 +287,13 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
             print("## [%d] %s\n" % (device["id"], device["name"]))
             # colalign and github format require tabulate 0.8.3
             if tabulate_version >= (0, 8, 3):
-                print(tabulate.tabulate(rows,
-                                        headers=headers,
-                                        colalign=colalign,
-                                        tablefmt="github"))
+                print(
+                    tabulate.tabulate(
+                        rows, headers=headers, colalign=colalign, tablefmt="github"
+                    )
+                )
             else:
-                print(tabulate.tabulate(rows,
-                                        headers=headers,
-                                        tablefmt="markdown"))
+                print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
 
             print("")
 
@@ -295,18 +308,17 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
                     x = [float(x) for x in plot_data[key][axis].keys()]
                     y = list(plot_data[key][axis].values())
 
-                    noise = list(plot_data[key + '_noise'][axis].values())
+                    noise = list(plot_data[key + "_noise"][axis].values())
 
                     top = [y[i] + y[i] * noise[i] for i in range(len(x))]
                     bottom = [y[i] - y[i] * noise[i] for i in range(len(x))]
 
-                    p = plt.plot(x, y, shape, marker='o', label=label)
+                    p = plt.plot(x, y, shape, marker="o", label=label)
                     plt.fill_between(x, bottom, top, color=p[0].get_color(), alpha=0.1)
 
-
-                for axis in plot_data['cmp'].keys():
-                    plot_line('cmp', '-', axis)
-                    plot_line('ref', '--', axis + ' ref')
+                for axis in plot_data["cmp"].keys():
+                    plot_line("cmp", "-", axis)
+                    plot_line("ref", "--", axis + " ref")
 
                 plt.legend()
                 plt.show()
@@ -314,11 +326,17 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
 
 def main():
     help_text = "%(prog)s [reference.json compare.json | reference_dir/ compare_dir/]"
-    parser = argparse.ArgumentParser(prog='nvbench_compare', usage=help_text)
-    parser.add_argument('--threshold-diff', type=float, dest='threshold', default=0.0,
-                        help='only show benchmarks where percentage diff is >= THRESHOLD')
-    parser.add_argument('--plot-along', type=str, dest='plot', default=None,
-                        help='plot results')
+    parser = argparse.ArgumentParser(prog="nvbench_compare", usage=help_text)
+    parser.add_argument(
+        "--threshold-diff",
+        type=float,
+        dest="threshold",
+        default=0.0,
+        help="only show benchmarks where percentage diff is >= THRESHOLD",
+    )
+    parser.add_argument(
+        "--plot-along", type=str, dest="plot", default=None, help="plot results"
+    )
 
     args, files_or_dirs = parser.parse_known_args()
     print(files_or_dirs)
@@ -336,14 +354,17 @@ def main():
                 continue
             r = os.path.join(files_or_dirs[0], f)
             c = os.path.join(files_or_dirs[1], f)
-            if os.path.isfile(r) and os.path.isfile(c) and \
-               os.path.getsize(r) > 0 and os.path.getsize(c) > 0:
+            if (
+                os.path.isfile(r)
+                and os.path.isfile(c)
+                and os.path.getsize(r) > 0
+                and os.path.getsize(c) > 0
+            ):
                 to_compare.append((r, c))
     else:
         to_compare = [(files_or_dirs[0], files_or_dirs[1])]
 
     for ref, comp in to_compare:
-
         ref_root = reader.read_file(ref)
         cmp_root = reader.read_file(comp)
 
@@ -355,7 +376,9 @@ def main():
             print("Device sections do not match.")
             sys.exit(1)
 
-        compare_benches(ref_root["benchmarks"], cmp_root["benchmarks"], args.threshold, args.plot)
+        compare_benches(
+            ref_root["benchmarks"], cmp_root["benchmarks"], args.threshold, args.plot
+        )
 
     print("# Summary\n")
     print("- Total Matches: %d" % config_count)
@@ -365,5 +388,5 @@ def main():
     return failure_count
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())
diff --git a/scripts/nvbench_histogram.py b/scripts/nvbench_histogram.py
index 5c37d29..4eb6155 100755
--- a/scripts/nvbench_histogram.py
+++ b/scripts/nvbench_histogram.py
@@ -1,19 +1,19 @@
 #!/usr/bin/env python
 
-import numpy as np
-import pandas as pd
-
-import matplotlib.pyplot as plt
-import seaborn as sns
 import argparse
 import os
 import sys
 
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
 from nvbench_json import reader
 
+
 def parse_files():
     help_text = "%(prog)s [nvbench.out.json | dir/] ..."
-    parser = argparse.ArgumentParser(prog='nvbench_histogram', usage=help_text)
+    parser = argparse.ArgumentParser(prog="nvbench_histogram", usage=help_text)
 
     args, files_or_dirs = parser.parse_known_args()
 
@@ -41,14 +41,14 @@ def parse_files():
 def extract_filename(summary):
     summary_data = summary["data"]
     value_data = next(filter(lambda v: v["name"] == "filename", summary_data))
-    assert(value_data["type"] == "string")
+    assert value_data["type"] == "string"
     return value_data["value"]
 
 
 def extract_size(summary):
     summary_data = summary["data"]
     value_data = next(filter(lambda v: v["name"] == "size", summary_data))
-    assert(value_data["type"] == "int64")
+    assert value_data["type"] == "int64"
     return int(value_data["value"])
 
 
@@ -57,9 +57,10 @@ def parse_samples_meta(filename, state):
     if not summaries:
         return None, None
 
-    summary = next(filter(lambda s: s["tag"] == "nv/json/bin:nv/cold/sample_times",
-                          summaries),
-                   None)
+    summary = next(
+        filter(lambda s: s["tag"] == "nv/json/bin:nv/cold/sample_times", summaries),
+        None,
+    )
     if not summary:
         return None, None
 
@@ -81,7 +82,7 @@ def parse_samples(filename, state):
     with open(samples_filename, "rb") as f:
         samples = np.fromfile(f, "<f4")
 
-    assert (sample_count == len(samples))
+    assert sample_count == len(samples)
     return samples
 
 
@@ -118,5 +119,5 @@ def main():
     plt.show()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())
diff --git a/scripts/nvbench_json/__init__.py b/scripts/nvbench_json/__init__.py
index 09990f1..4edcf0f 100644
--- a/scripts/nvbench_json/__init__.py
+++ b/scripts/nvbench_json/__init__.py
@@ -1,2 +1,3 @@
-from . import reader
-from . import version
+from . import reader, version
+
+__all__ = ["reader", "version"]
diff --git a/scripts/nvbench_json/version.py b/scripts/nvbench_json/version.py
index be1c03f..810108a 100644
--- a/scripts/nvbench_json/version.py
+++ b/scripts/nvbench_json/version.py
@@ -1,8 +1,8 @@
 file_version = (1, 0, 0)
 
-file_version_string = "{}.{}.{}".format(file_version[0],
-                                        file_version[1],
-                                        file_version[2])
+file_version_string = "{}.{}.{}".format(
+    file_version[0], file_version[1], file_version[2]
+)
 
 
 def check_file_version(filename, root_node):
@@ -19,8 +19,14 @@ def check_file_version(filename, root_node):
     # for now just warn on mismatch.
     if version_node["string"] != file_version_string:
         print("WARNING:")
-        print("  {} was written using a different NVBench JSON file version."
-              .format(filename))
+        print(
+            "  {} was written using a different NVBench JSON file version.".format(
+                filename
+            )
+        )
         print("  It may not read correctly.")
-        print("  (file version: {} reader version: {})"
-              .format(version_node["string"], file_version_string))
+        print(
+            "  (file version: {} reader version: {})".format(
+                version_node["string"], file_version_string
+            )
+        )
diff --git a/scripts/nvbench_walltime.py b/scripts/nvbench_walltime.py
index 18ca243..ad78e8c 100644
--- a/scripts/nvbench_walltime.py
+++ b/scripts/nvbench_walltime.py
@@ -5,9 +5,8 @@ import math
 import os
 import sys
 
-from nvbench_json import reader
-
 import tabulate
+from nvbench_json import reader
 
 
 # Parse version string into tuple, "x.y.z" -> (x, y, z)
@@ -39,7 +38,8 @@ def format_walltime(seconds_in):
         "{:0>2d}:".format(h) if h > 1e-9 else "",
         "{:0>2d}:".format(m) if (h > 1e-9 or m > 1e-9) else "",
         "{:0>2d}.".format(s) if (h > 1e-9 or m > 1e-9) else "{:d}.".format(s),
-        "{:0>3d}".format(ms))
+        "{:0>3d}".format(ms),
+    )
 
 
 def format_percentage(percentage):
@@ -58,7 +58,7 @@ measure_column_names = {"cold": "Isolated", "batch": "Batch", "cupti": "CUPTI"}
 def init_measures():
     out = {}
     for name in measure_names:
-        out[name] = 0.
+        out[name] = 0.0
     return out
 
 
@@ -67,17 +67,17 @@ def get_measures(state):
     times = {}
     for name in measure_names:
         measure_walltime_tag = "nv/{}/walltime".format(name)
-        summary = next(filter(lambda s: s["tag"] == measure_walltime_tag,
-                              summaries),
-                       None)
+        summary = next(
+            filter(lambda s: s["tag"] == measure_walltime_tag, summaries), None
+        )
         if not summary:
             continue
 
         walltime_data = next(filter(lambda d: d["name"] == "value", summary["data"]))
-        assert(walltime_data["type"] == "float64")
+        assert walltime_data["type"] == "float64"
         walltime = walltime_data["value"]
         walltime = float(walltime)
-        times[name] = walltime if walltime else 0.
+        times[name] = walltime if walltime else 0.0
     return times
 
 
@@ -87,7 +87,7 @@ def merge_measures(target, src):
 
 
 def sum_measures(measures):
-    total_time = 0.
+    total_time = 0.0
     for time in measures.values():
         total_time += time
     return total_time
@@ -194,20 +194,21 @@ def print_overview_section(data):
 
     # colalign and github format require tabulate 0.8.3
     if tabulate_version >= (0, 8, 3):
-        print(tabulate.tabulate(rows,
-                                headers=headers,
-                                colalign=colalign,
-                                tablefmt="github"))
+        print(
+            tabulate.tabulate(
+                rows, headers=headers, colalign=colalign, tablefmt="github"
+            )
+        )
     else:
-        print(tabulate.tabulate(rows,
-                                headers=headers,
-                                tablefmt="markdown"))
+        print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
 
     print()
 
 
 # append_data_row_lambda args: (row_list, name, items[name])
-def print_measures_table(headers, colalign, items, total_measures, append_item_row_lambda):
+def print_measures_table(
+    headers, colalign, items, total_measures, append_item_row_lambda
+):
     total_time = sum_measures(total_measures)
     active_measures = get_active_measure_names(total_measures)
     num_user_columns = len(headers)
@@ -248,14 +249,13 @@ def print_measures_table(headers, colalign, items, total_measures, append_item_r
 
     # colalign and github format require tabulate 0.8.3
     if tabulate_version >= (0, 8, 3):
-        print(tabulate.tabulate(rows,
-                                headers=headers,
-                                colalign=colalign,
-                                tablefmt="github"))
+        print(
+            tabulate.tabulate(
+                rows, headers=headers, colalign=colalign, tablefmt="github"
+            )
+        )
     else:
-        print(tabulate.tabulate(rows,
-                                headers=headers,
-                                tablefmt="markdown"))
+        print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
 
 
 def print_files_section(data):
@@ -319,7 +319,7 @@ def print_bench_section(bench_name, bench):
 
 def main():
     help_text = "%(prog)s [nvbench.out.json | dir/]..."
-    parser = argparse.ArgumentParser(prog='nvbench_walltime', usage=help_text)
+    parser = argparse.ArgumentParser(prog="nvbench_walltime", usage=help_text)
 
     args, files_or_dirs = parser.parse_known_args()
 
@@ -353,5 +353,5 @@ def main():
     print_files_section(data)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(main())
diff --git a/testing/axes_metadata.cu b/testing/axes_metadata.cu
index cf4d93a..9e54660 100644
--- a/testing/axes_metadata.cu
+++ b/testing/axes_metadata.cu
@@ -17,22 +17,19 @@
  */
 
 #include <nvbench/axes_metadata.cuh>
-
 #include <nvbench/type_list.cuh>
 #include <nvbench/type_strings.cuh>
 #include <nvbench/types.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 
 #include <algorithm>
 #include <string_view>
 
-using int_list = nvbench::type_list<nvbench::int8_t,
-                                    nvbench::int16_t,
-                                    nvbench::int32_t,
-                                    nvbench::int64_t>;
+#include "test_asserts.cuh"
+
+using int_list =
+  nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
 
 using float_list = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
 
@@ -110,7 +107,6 @@ void test_default_type_axes_names()
     ASSERT(axes.get_type_axis(4).get_name() == "T4");
     ASSERT(axes.get_type_axis(4).get_axis_index() == 4);
   }
-
 }
 
 void test_type_axes()
@@ -138,8 +134,7 @@ void test_type_axes()
       fmt::format_to(std::back_inserter(buffer),
                      " - {}{}\n",
                      input_string,
-                     description.empty() ? ""
-                                         : fmt::format(" ({})", description));
+                     description.empty() ? "" : fmt::format(" ({})", description));
     }
   }
 
@@ -157,9 +152,8 @@ Axis: Other
 )expected";
 
   const std::string test = fmt::to_string(buffer);
-  const auto diff =
-    std::mismatch(ref.cbegin(), ref.cend(), test.cbegin(), test.cend());
-  const auto idx = static_cast<std::size_t>(diff.second - test.cbegin());
+  const auto diff        = std::mismatch(ref.cbegin(), ref.cend(), test.cbegin(), test.cend());
+  const auto idx         = static_cast<std::size_t>(diff.second - test.cbegin());
   ASSERT_MSG(test == ref,
              "Differs at character {}.\n"
              "Expected:\n\"{}\"\n\n"
@@ -189,9 +183,7 @@ void test_float64_axes()
 void test_int64_axes()
 {
   nvbench::axes_metadata axes;
-  axes.add_int64_axis("I64 Axis",
-                      {10, 11, 12, 13, 14},
-                      nvbench::int64_axis_flags::none);
+  axes.add_int64_axis("I64 Axis", {10, 11, 12, 13, 14}, nvbench::int64_axis_flags::none);
   ASSERT(axes.get_axes().size() == 1);
   const auto &axis = axes.get_int64_axis("I64 Axis");
   ASSERT(axis.get_size() == 5);
@@ -205,9 +197,7 @@ void test_int64_axes()
 void test_int64_power_of_two_axes()
 {
   nvbench::axes_metadata axes;
-  axes.add_int64_axis("I64 POT Axis",
-                      {1, 2, 3, 4, 5},
-                      nvbench::int64_axis_flags::power_of_two);
+  axes.add_int64_axis("I64 POT Axis", {1, 2, 3, 4, 5}, nvbench::int64_axis_flags::power_of_two);
   ASSERT(axes.get_axes().size() == 1);
   const auto &axis = axes.get_int64_axis("I64 POT Axis");
   ASSERT(axis.get_size() == 5);
diff --git a/testing/benchmark.cu b/testing/benchmark.cu
index 9581b12..dfaf468 100644
--- a/testing/benchmark.cu
+++ b/testing/benchmark.cu
@@ -17,7 +17,6 @@
  */
 
 #include <nvbench/benchmark.cuh>
-
 #include <nvbench/callable.cuh>
 #include <nvbench/named_values.cuh>
 #include <nvbench/state.cuh>
@@ -25,8 +24,6 @@
 #include <nvbench/type_strings.cuh>
 #include <nvbench/types.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 
 #include <algorithm>
@@ -34,6 +31,8 @@
 #include <variant>
 #include <vector>
 
+#include "test_asserts.cuh"
+
 template <typename T>
 std::vector<T> sort(std::vector<T> &&vec)
 {
@@ -61,34 +60,26 @@ void no_op_generator(nvbench::state &state)
 NVBENCH_DEFINE_CALLABLE(no_op_generator, no_op_callable);
 
 template <typename Integer, typename Float, typename Other>
-void template_no_op_generator(nvbench::state &state,
-                              nvbench::type_list<Integer, Float, Other>)
+void template_no_op_generator(nvbench::state &state, nvbench::type_list<Integer, Float, Other>)
 {
-  ASSERT(nvbench::type_strings<Integer>::input_string() ==
-         state.get_string("Integer"));
-  ASSERT(nvbench::type_strings<Float>::input_string() ==
-         state.get_string("Float"));
-  ASSERT(nvbench::type_strings<Other>::input_string() ==
-         state.get_string("Other"));
+  ASSERT(nvbench::type_strings<Integer>::input_string() == state.get_string("Integer"));
+  ASSERT(nvbench::type_strings<Float>::input_string() == state.get_string("Float"));
+  ASSERT(nvbench::type_strings<Other>::input_string() == state.get_string("Other"));
 
   // Enum params using non-templated version:
   no_op_generator(state);
 }
-NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator,
-                                 template_no_op_callable);
+NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator, template_no_op_callable);
 
-using int_list = nvbench::type_list<nvbench::int8_t,
-                                    nvbench::int16_t,
-                                    nvbench::int32_t,
-                                    nvbench::int64_t>;
+using int_list =
+  nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
 
 using float_list = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
 
 using misc_list = nvbench::type_list<bool, void>;
 
 using lots_of_types_bench =
-  nvbench::benchmark<template_no_op_callable,
-                     nvbench::type_list<int_list, float_list, misc_list>>;
+  nvbench::benchmark<template_no_op_callable, nvbench::type_list<int_list, float_list, misc_list>>;
 
 using no_types_bench = nvbench::benchmark<no_op_callable>;
 
@@ -110,8 +101,7 @@ void test_type_axes()
       fmt::format_to(std::back_inserter(buffer),
                      " - {}{}\n",
                      input_string,
-                     description.empty() ? ""
-                                         : fmt::format(" ({})", description));
+                     description.empty() ? "" : fmt::format(" ({})", description));
     }
   }
 
@@ -300,9 +290,7 @@ void test_get_config_count()
 
   auto const num_devices = bench.get_devices().size();
 
-  ASSERT_MSG(bench.get_config_count() == 72 * num_devices,
-             "Got {}",
-             bench.get_config_count());
+  ASSERT_MSG(bench.get_config_count() == 72 * num_devices, "Got {}", bench.get_config_count());
 }
 
 int main()
diff --git a/testing/cpu_timer.cu b/testing/cpu_timer.cu
index 4b58a95..99e7517 100644
--- a/testing/cpu_timer.cu
+++ b/testing/cpu_timer.cu
@@ -18,11 +18,11 @@
 
 #include <nvbench/cpu_timer.cuh>
 
-#include "test_asserts.cuh"
-
 #include <chrono>
 #include <thread>
 
+#include "test_asserts.cuh"
+
 void test_basic()
 {
   using namespace std::literals::chrono_literals;
diff --git a/testing/create.cu b/testing/create.cu
index 6ed7fff..7ca8e26 100644
--- a/testing/create.cu
+++ b/testing/create.cu
@@ -16,17 +16,14 @@
  *  limitations under the License.
  */
 
-#include <nvbench/create.cuh>
-
 #include <nvbench/benchmark.cuh>
 #include <nvbench/callable.cuh>
+#include <nvbench/create.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/type_list.cuh>
 #include <nvbench/type_strings.cuh>
 #include <nvbench/types.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 
 #include <algorithm>
@@ -34,6 +31,8 @@
 #include <variant>
 #include <vector>
 
+#include "test_asserts.cuh"
+
 template <typename T>
 std::vector<T> sort(std::vector<T> &&vec)
 {
@@ -72,15 +71,11 @@ using misc_types  = nvbench::type_list<bool, void>;
 using type_axes   = nvbench::type_list<float_types, int_types, misc_types>;
 
 template <typename FloatT, typename IntT, typename MiscT>
-void template_no_op_generator(nvbench::state &state,
-                              nvbench::type_list<FloatT, IntT, MiscT>)
+void template_no_op_generator(nvbench::state &state, nvbench::type_list<FloatT, IntT, MiscT>)
 {
-  ASSERT(nvbench::type_strings<FloatT>::input_string() ==
-         state.get_string("FloatT"));
-  ASSERT(nvbench::type_strings<IntT>::input_string() ==
-         state.get_string("IntT"));
-  ASSERT(nvbench::type_strings<IntT>::input_string() ==
-         state.get_string("IntT"));
+  ASSERT(nvbench::type_strings<FloatT>::input_string() == state.get_string("FloatT"));
+  ASSERT(nvbench::type_strings<IntT>::input_string() == state.get_string("IntT"));
+  ASSERT(nvbench::type_strings<MiscT>::input_string() == state.get_string("MiscT"));
 
   // Enum params using non-templated version:
   no_op_generator(state);
@@ -116,8 +111,7 @@ std::string run_and_get_state_string(nvbench::benchmark_base &bench,
 
 void validate_default_name()
 {
-  auto bench =
-    nvbench::benchmark_manager::get().get_benchmark("no_op_generator").clone();
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("no_op_generator").clone();
 
   const std::string ref = "Params:\n";
 
@@ -127,8 +121,7 @@ void validate_default_name()
 
 void validate_custom_name()
 {
-  auto bench =
-    nvbench::benchmark_manager::get().get_benchmark("Custom Name").clone();
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("Custom Name").clone();
 
   const std::string ref = "Params:\n";
 
@@ -138,8 +131,7 @@ void validate_custom_name()
 
 void validate_no_types()
 {
-  auto bench =
-    nvbench::benchmark_manager::get().get_benchmark("No Types").clone();
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("No Types").clone();
 
   const std::string ref = R"expected(Params: Float: 11 Int: 1 String: One
 Params: Float: 11 Int: 2 String: One
@@ -176,8 +168,7 @@ Params: Float: 13 Int: 3 String: Three
 
 void validate_only_types()
 {
-  auto bench =
-    nvbench::benchmark_manager::get().get_benchmark("Oops, All Types!").clone();
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("Oops, All Types!").clone();
 
   const std::string ref = R"expected(Params: FloatT: F32 IntT: I32 MiscT: bool
 Params: FloatT: F32 IntT: I32 MiscT: void
@@ -195,8 +186,7 @@ Params: FloatT: F64 IntT: I64 MiscT: void
 
 void validate_all_axes()
 {
-  auto bench =
-    nvbench::benchmark_manager::get().get_benchmark("All The Axes").clone();
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("All The Axes").clone();
 
   const std::string ref =
     R"expected(Params: Float: 11 FloatT: F32 Int: 1 IntT: I32 MiscT: bool String: One
diff --git a/testing/criterion_manager.cu b/testing/criterion_manager.cu
index da0ddb0..b466c30 100644
--- a/testing/criterion_manager.cu
+++ b/testing/criterion_manager.cu
@@ -42,27 +42,34 @@ protected:
 
 void test_no_duplicates_are_allowed()
 {
-  nvbench::criterion_manager& manager = nvbench::criterion_manager::get();
-  bool exception_triggered = false;
+  nvbench::criterion_manager &manager = nvbench::criterion_manager::get();
+  bool exception_triggered            = false;
 
-  try {
-    [[maybe_unused]] nvbench::stopping_criterion_base& _ = manager.get_criterion("custom");
-  } catch(...) {
+  try
+  {
+    [[maybe_unused]] nvbench::stopping_criterion_base &_ = manager.get_criterion("custom");
+  }
+  catch (...)
+  {
     exception_triggered = true;
   }
   ASSERT(exception_triggered);
 
   std::unique_ptr<custom_criterion> custom_ptr = std::make_unique<custom_criterion>();
-  custom_criterion* custom_raw = custom_ptr.get();
+  custom_criterion *custom_raw                 = custom_ptr.get();
   ASSERT(&manager.add(std::move(custom_ptr)) == custom_raw);
 
-  nvbench::stopping_criterion_base& custom = nvbench::criterion_manager::get().get_criterion("custom");
+  nvbench::stopping_criterion_base &custom =
+    nvbench::criterion_manager::get().get_criterion("custom");
   ASSERT(custom_raw == &custom);
 
   exception_triggered = false;
-  try {
+  try
+  {
     manager.add(std::make_unique<custom_criterion>());
-  } catch(...) {
+  }
+  catch (...)
+  {
     exception_triggered = true;
   }
   ASSERT(exception_triggered);
diff --git a/testing/criterion_params.cu b/testing/criterion_params.cu
index 4eceefa..92e2099 100644
--- a/testing/criterion_params.cu
+++ b/testing/criterion_params.cu
@@ -60,4 +60,3 @@ int main()
   test_compat_overwrite();
   test_overwrite();
 }
-
diff --git a/testing/cuda_timer.cu b/testing/cuda_timer.cu
index d8e4a42..d610db5 100644
--- a/testing/cuda_timer.cu
+++ b/testing/cuda_timer.cu
@@ -16,19 +16,16 @@
  *  limitations under the License.
  */
 
-#include <nvbench/cuda_timer.cuh>
-
 #include <nvbench/cuda_stream.cuh>
+#include <nvbench/cuda_timer.cuh>
 #include <nvbench/test_kernels.cuh>
 #include <nvbench/types.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 
-void test_basic(cudaStream_t time_stream,
-                cudaStream_t exec_stream,
-                bool expected)
+#include "test_asserts.cuh"
+
+void test_basic(cudaStream_t time_stream, cudaStream_t exec_stream, bool expected)
 {
   nvbench::cuda_timer timer;
 
diff --git a/testing/custom_main_custom_args.cu b/testing/custom_main_custom_args.cu
index f7e331e..eb43a02 100644
--- a/testing/custom_main_custom_args.cu
+++ b/testing/custom_main_custom_args.cu
@@ -16,8 +16,8 @@
  *  limitations under the License.
  */
 
+#include <nvbench/cuda_call.cuh>
 #include <nvbench/nvbench.cuh>
-#include "nvbench/cuda_call.cuh"
 
 /******************************************************************************
  * Install custom parser.
@@ -35,7 +35,7 @@
 // User code to handle a specific argument:
 void handle_my_custom_arg();
 
-// NVBench hook for modiifying the command line arguments before parsing:
+// NVBench hook for modifying the command line arguments before parsing:
 void custom_arg_handler(std::vector<std::string> &args)
 {
   // Handle and remove "--my-custom-arg"
diff --git a/testing/custom_main_global_state_raii.cu b/testing/custom_main_global_state_raii.cu
index e3584ab..8529178 100644
--- a/testing/custom_main_global_state_raii.cu
+++ b/testing/custom_main_global_state_raii.cu
@@ -19,8 +19,8 @@
 #include <nvbench/nvbench.cuh>
 
 #include <algorithm>
-#include <cstdlib>
 #include <cstdio>
+#include <cstdlib>
 
 /******************************************************************************
  * Test having global state that is initialized and finalized via RAII.
diff --git a/testing/device/noisy_bench.cu b/testing/device/noisy_bench.cu
index 8cca3a1..62aaab0 100644
--- a/testing/device/noisy_bench.cu
+++ b/testing/device/noisy_bench.cu
@@ -29,12 +29,10 @@
 void noisy_bench(nvbench::state &state)
 {
   // time, convert ms -> s
-  const auto mean = static_cast<nvbench::float32_t>(state.get_float64("Mean")) /
-                    1000.f;
+  const auto mean = static_cast<nvbench::float32_t>(state.get_float64("Mean")) / 1000.f;
   // rel stdev
-  const auto noise_pct =
-    static_cast<nvbench::float32_t>(state.get_float64("Noise"));
-  const auto noise = noise_pct / 100.f;
+  const auto noise_pct = static_cast<nvbench::float32_t>(state.get_float64("Noise"));
+  const auto noise     = noise_pct / 100.f;
   // abs stdev
   const auto stdev = noise * mean;
 
@@ -53,8 +51,7 @@ void noisy_bench(nvbench::state &state)
     try
     {
       return static_cast<nvbench::float32_t>(
-        state.get_summary("nv/cold/time/gpu/stdev/relative")
-          .get_float64("value"));
+        state.get_summary("nv/cold/time/gpu/stdev/relative").get_float64("value"));
     }
     catch (std::invalid_argument &)
     {
diff --git a/testing/entropy_criterion.cu b/testing/entropy_criterion.cu
index df489c9..795e58f 100644
--- a/testing/entropy_criterion.cu
+++ b/testing/entropy_criterion.cu
@@ -20,11 +20,11 @@
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/types.cuh>
 
-#include "test_asserts.cuh"
-
-#include <vector>
-#include <random>
 #include <numeric>
+#include <random>
+#include <vector>
+
+#include "test_asserts.cuh"
 
 void test_const()
 {
@@ -32,7 +32,7 @@ void test_const()
   nvbench::detail::entropy_criterion criterion;
 
   criterion.initialize(params);
-  for (int i = 0; i < 6; i++) 
+  for (int i = 0; i < 6; i++)
   { // nvbench wants at least 5 to compute the standard deviation
     criterion.add_measurement(42.0);
   }
@@ -48,7 +48,7 @@ void produce_entropy_arch(nvbench::detail::entropy_criterion &criterion)
    *   2.5, 2.4, 2.2, 2.1, 2.0, 1.9 <-+
    *   1.8, 1.7, 1.6, 1.6, 1.5, 1.4   |
    *   1.4, 1.3, 1.3, 1.3, 1.2, 1.2   |
-   *   1.1, 1.1, 1.1, 1.0, 1.0, 1.0   +-- entropy only decreases after 5-th sample, 
+   *   1.1, 1.1, 1.1, 1.0, 1.0, 1.0   +-- entropy only decreases after 5-th sample,
    *   1.0, 0.9, 0.9, 0.9, 0.9, 0.9   |   so the slope should be negative
    *   0.8, 0.8, 0.8, 0.8, 0.8, 0.8   |
    *   0.7, 0.7, 0.7, 0.7, 0.7, 0.7 <-+
diff --git a/testing/enum_type_list.cu b/testing/enum_type_list.cu
index 88535ba..50e3c68 100644
--- a/testing/enum_type_list.cu
+++ b/testing/enum_type_list.cu
@@ -18,12 +18,12 @@
 
 #include <nvbench/enum_type_list.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 
 #include <type_traits>
 
+#include "test_asserts.cuh"
+
 // If using gcc version < 7, disable some tests to WAR a compiler bug. See NVIDIA/nvbench#39.
 #if defined(__GNUC__) && __GNUC__ == 7
 #define USING_GCC_7
@@ -102,8 +102,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
 void test_int()
 {
   ASSERT((std::is_same_v<nvbench::enum_type_list<>, nvbench::type_list<>>));
-  ASSERT((std::is_same_v<nvbench::enum_type_list<0>,
-                         nvbench::type_list<nvbench::enum_type<0>>>));
+  ASSERT((std::is_same_v<nvbench::enum_type_list<0>, nvbench::type_list<nvbench::enum_type<0>>>));
   ASSERT((std::is_same_v<nvbench::enum_type_list<0, 1, 2, 3, 4>,
                          nvbench::type_list<nvbench::enum_type<0>,
                                             nvbench::enum_type<1>,
@@ -115,42 +114,35 @@ void test_int()
 void test_scoped_enum()
 {
 #ifndef USING_GCC_7
-  ASSERT((
-    std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1>,
-                   nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>>>));
+  ASSERT((std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1>,
+                         nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>>>));
 #endif
-  ASSERT((
-    std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1,
-                                           scoped_enum::val_2,
-                                           scoped_enum::val_3>,
-                   nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>,
-                                      nvbench::enum_type<scoped_enum::val_2>,
-                                      nvbench::enum_type<scoped_enum::val_3>>>));
+  ASSERT((std::is_same_v<
+          nvbench::enum_type_list<scoped_enum::val_1, scoped_enum::val_2, scoped_enum::val_3>,
+          nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>,
+                             nvbench::enum_type<scoped_enum::val_2>,
+                             nvbench::enum_type<scoped_enum::val_3>>>));
 }
 
 void test_unscoped_enum()
 {
 #ifndef USING_GCC_7
-  ASSERT(
-    (std::is_same_v<nvbench::enum_type_list<unscoped_val_1>,
-                    nvbench::type_list<nvbench::enum_type<unscoped_val_1>>>));
-  ASSERT(
-    (std::is_same_v<
-      nvbench::enum_type_list<unscoped_val_1, unscoped_val_2, unscoped_val_3>,
-      nvbench::type_list<nvbench::enum_type<unscoped_val_1>,
-                         nvbench::enum_type<unscoped_val_2>,
-                         nvbench::enum_type<unscoped_val_3>>>));
+  ASSERT((std::is_same_v<nvbench::enum_type_list<unscoped_val_1>,
+                         nvbench::type_list<nvbench::enum_type<unscoped_val_1>>>));
+  ASSERT((std::is_same_v<nvbench::enum_type_list<unscoped_val_1, unscoped_val_2, unscoped_val_3>,
+                         nvbench::type_list<nvbench::enum_type<unscoped_val_1>,
+                                            nvbench::enum_type<unscoped_val_2>,
+                                            nvbench::enum_type<unscoped_val_3>>>));
 #endif
 }
 
 void test_scoped_enum_type_strings()
 {
-  using values = nvbench::enum_type_list<scoped_enum::val_1,
-                                         scoped_enum::val_2,
-                                         scoped_enum::val_3>;
-  using val_1  = nvbench::tl::get<0, values>;
-  using val_2  = nvbench::tl::get<1, values>;
-  using val_3  = nvbench::tl::get<2, values>;
+  using values =
+    nvbench::enum_type_list<scoped_enum::val_1, scoped_enum::val_2, scoped_enum::val_3>;
+  using val_1 = nvbench::tl::get<0, values>;
+  using val_2 = nvbench::tl::get<1, values>;
+  using val_3 = nvbench::tl::get<2, values>;
   ASSERT((nvbench::type_strings<val_1>::input_string() == "1"));
   ASSERT((nvbench::type_strings<val_1>::description() == "scoped_enum::val_1"));
   ASSERT((nvbench::type_strings<val_2>::input_string() == "2"));
diff --git a/testing/float64_axis.cu b/testing/float64_axis.cu
index 8d1ea02..5509014 100644
--- a/testing/float64_axis.cu
+++ b/testing/float64_axis.cu
@@ -34,8 +34,7 @@ void test_empty()
 
   const auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
-    dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
+  const auto *clone = dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "Empty");
@@ -62,8 +61,7 @@ void test_basic()
 
   const auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
-    dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
+  const auto *clone = dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "Basic");
diff --git a/testing/int64_axis.cu b/testing/int64_axis.cu
index 84d7dd1..4684a0b 100644
--- a/testing/int64_axis.cu
+++ b/testing/int64_axis.cu
@@ -18,10 +18,10 @@
 
 #include <nvbench/int64_axis.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 
+#include "test_asserts.cuh"
+
 void test_empty()
 {
   nvbench::int64_axis axis("Empty");
@@ -36,8 +36,7 @@ void test_empty()
 
   const auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
-    dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
+  const auto *clone = dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "Empty");
@@ -66,8 +65,7 @@ void test_basic()
 
   const auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
-    dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
+  const auto *clone = dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "BasicAxis");
@@ -87,8 +85,7 @@ void test_basic()
 void test_power_of_two()
 {
   nvbench::int64_axis axis{"POTAxis"};
-  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4},
-                  nvbench::int64_axis_flags::power_of_two);
+  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4}, nvbench::int64_axis_flags::power_of_two);
   const std::vector<nvbench::int64_t> ref_inputs{0, 1, 2, 3, 7, 6, 5, 4};
   const std::vector<nvbench::int64_t> ref_values{1, 2, 4, 8, 128, 64, 32, 16};
 
@@ -102,14 +99,12 @@ void test_power_of_two()
   for (size_t i = 0; i < 8; ++i)
   {
     ASSERT(axis.get_input_string(i) == fmt::to_string(ref_inputs[i]));
-    ASSERT(axis.get_description(i) ==
-           fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
+    ASSERT(axis.get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
   }
 
   const auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
-    dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
+  const auto *clone = dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "POTAxis");
@@ -122,8 +117,7 @@ void test_power_of_two()
   for (size_t i = 0; i < 8; ++i)
   {
     ASSERT(clone->get_input_string(i) == fmt::to_string(ref_inputs[i]));
-    ASSERT(clone->get_description(i) ==
-           fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
+    ASSERT(clone->get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
   }
 }
 
@@ -250,8 +244,7 @@ void test_update_none_to_pow2()
 void test_update_pow2_to_none()
 {
   nvbench::int64_axis axis{"TestAxis"};
-  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4},
-                  nvbench::int64_axis_flags::power_of_two);
+  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4}, nvbench::int64_axis_flags::power_of_two);
   const std::vector<nvbench::int64_t> ref_inputs{0, 1, 2, 3, 7, 6, 5, 4};
   const std::vector<nvbench::int64_t> ref_values{1, 2, 4, 8, 128, 64, 32, 16};
 
@@ -304,8 +297,7 @@ void test_update_pow2_to_none()
   for (size_t i = 0; i < 8; ++i)
   {
     ASSERT(axis.get_input_string(i) == fmt::to_string(ref_inputs[i]));
-    ASSERT(axis.get_description(i) ==
-           fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
+    ASSERT(axis.get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
   }
 }
 
@@ -313,8 +305,7 @@ void test_update_pow2_to_pow2()
 {
 
   nvbench::int64_axis axis{"TestAxis"};
-  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4},
-                  nvbench::int64_axis_flags::power_of_two);
+  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4}, nvbench::int64_axis_flags::power_of_two);
   const std::vector<nvbench::int64_t> ref_inputs{0, 1, 2, 3, 7, 6, 5, 4};
   const std::vector<nvbench::int64_t> ref_values{1, 2, 4, 8, 128, 64, 32, 16};
 
@@ -369,8 +360,7 @@ void test_update_pow2_to_pow2()
   for (size_t i = 0; i < 8; ++i)
   {
     ASSERT(axis.get_input_string(i) == fmt::to_string(ref_inputs[i]));
-    ASSERT(axis.get_description(i) ==
-           fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
+    ASSERT(axis.get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
   }
 }
 
diff --git a/testing/named_values.cu b/testing/named_values.cu
index 8407235..8a9846f 100644
--- a/testing/named_values.cu
+++ b/testing/named_values.cu
@@ -18,10 +18,10 @@
 
 #include <nvbench/named_values.cuh>
 
-#include "test_asserts.cuh"
-
 #include <algorithm>
 
+#include "test_asserts.cuh"
+
 void test_empty()
 {
   nvbench::named_values vals;
diff --git a/testing/option_parser.cu b/testing/option_parser.cu
index 167e833..4b01a9a 100644
--- a/testing/option_parser.cu
+++ b/testing/option_parser.cu
@@ -16,15 +16,14 @@
  *  limitations under the License.
  */
 
-#include <nvbench/option_parser.cuh>
-
 #include <nvbench/create.cuh>
+#include <nvbench/option_parser.cuh>
 #include <nvbench/type_list.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 
+#include "test_asserts.cuh"
+
 //==============================================================================
 // Declare a couple benchmarks for testing:
 void DummyBench(nvbench::state &state) { state.skip("Skipping for testing."); }
@@ -50,8 +49,7 @@ NVBENCH_BENCH_TYPES(TestBench, NVBENCH_TYPE_AXES(Ts, Us))
 namespace
 {
 
-[[nodiscard]] std::string
-states_to_string(const std::vector<nvbench::state> &states)
+[[nodiscard]] std::string states_to_string(const std::vector<nvbench::state> &states)
 {
   fmt::memory_buffer buffer;
   std::string table_format = "| {:^5} | {:^10} | {:^4} | {:^4} | {:^4} "
@@ -88,7 +86,7 @@ states_to_string(const std::vector<nvbench::state> &states)
 
 // Expects the parser to have a single TestBench benchmark. Runs the benchmark
 // and returns the resulting states.
-[[nodiscard]] const auto& parser_to_states(nvbench::option_parser &parser)
+[[nodiscard]] const auto &parser_to_states(nvbench::option_parser &parser)
 {
   const auto &benches = parser.get_benchmarks();
   ASSERT(benches.size() == 1);
@@ -267,8 +265,7 @@ void test_int64_axis_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 2 : 1 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 2 : 1 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -308,8 +305,7 @@ void test_int64_axis_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 , 7 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 , 7 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -323,8 +319,7 @@ void test_int64_axis_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 7 : 5 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 7 : 5 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -369,8 +364,7 @@ void test_int64_axis_pow2_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -384,8 +378,7 @@ void test_int64_axis_pow2_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 : 7 : 1 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 : 7 : 1 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -425,8 +418,7 @@ void test_int64_axis_pow2_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 , 7 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 , 7 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -440,8 +432,7 @@ void test_int64_axis_pow2_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 : 7 : 5 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 : 7 : 5 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -486,8 +477,7 @@ void test_int64_axis_none_to_pow2_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -501,8 +491,7 @@ void test_int64_axis_none_to_pow2_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 : 7 : 1 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 : 7 : 1 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -542,8 +531,7 @@ void test_int64_axis_none_to_pow2_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 , 7 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 , 7 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -557,8 +545,7 @@ void test_int64_axis_none_to_pow2_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 : 7 : 5 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 : 7 : 5 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -617,8 +604,7 @@ void test_int64_axis_pow2_to_none_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 2 : 1 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 2 : 1 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -658,8 +644,7 @@ void test_int64_axis_pow2_to_none_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 , 7 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 , 7 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -673,8 +658,7 @@ void test_int64_axis_pow2_to_none_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 7 : 5 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 7 : 5 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -719,8 +703,7 @@ void test_float64_axis_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -734,10 +717,7 @@ void test_float64_axis_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse({"--benchmark",
-                  "TestBench",
-                  "--axis",
-                  " Floats [ ] = [ 3.5 : 3.6 : 1 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 : 3.6 : 1 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -777,8 +757,7 @@ void test_float64_axis_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 , 4.1 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 , 4.1 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -792,18 +771,14 @@ void test_float64_axis_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse({"--benchmark",
-                  "TestBench",
-                  "--axis",
-                  " Floats [ ] = [ 3.5 : 4.2 : 0.6 ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 : 4.2 : 0.6 ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", "Floats=[3.5:4.2:0.6]"});
+    parser.parse({"--benchmark", "TestBench", "--axis", "Floats=[3.5:4.2:0.6]"});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -827,8 +802,7 @@ void test_string_axis_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Strings [ ] = fo br "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Strings [ ] = fo br "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -842,8 +816,7 @@ void test_string_axis_single()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -883,8 +856,7 @@ void test_string_axis_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br , baz ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br , baz ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -951,8 +923,7 @@ void test_type_axis_multi()
 
   {
     nvbench::option_parser parser;
-    parser.parse(
-      {"--benchmark", "TestBench", "--axis", " T [ ] = [ U8, void ] "});
+    parser.parse({"--benchmark", "TestBench", "--axis", " T [ ] = [ U8, void ] "});
     const auto test = parser_to_state_string(parser);
     ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
   }
@@ -1177,9 +1148,8 @@ void test_axis_before_benchmark()
 void test_min_samples()
 {
   nvbench::option_parser parser;
-  parser.parse(
-    {"--benchmark", "DummyBench", "--min-samples", "12345"});
-  const auto& states = parser_to_states(parser);
+  parser.parse({"--benchmark", "DummyBench", "--min-samples", "12345"});
+  const auto &states = parser_to_states(parser);
 
   ASSERT(states.size() == 1);
   ASSERT(states[0].get_min_samples() == 12345);
@@ -1188,9 +1158,8 @@ void test_min_samples()
 void test_min_time()
 {
   nvbench::option_parser parser;
-  parser.parse(
-    {"--benchmark", "DummyBench", "--min-time", "12345e2"});
-  const auto& states = parser_to_states(parser);
+  parser.parse({"--benchmark", "DummyBench", "--min-time", "12345e2"});
+  const auto &states = parser_to_states(parser);
 
   ASSERT(states.size() == 1);
   ASSERT(std::abs(states[0].get_min_time() - 12345e2) < 1.);
@@ -1199,9 +1168,8 @@ void test_min_time()
 void test_max_noise()
 {
   nvbench::option_parser parser;
-  parser.parse(
-    {"--benchmark", "DummyBench", "--max-noise", "50.3"});
-  const auto& states = parser_to_states(parser);
+  parser.parse({"--benchmark", "DummyBench", "--max-noise", "50.3"});
+  const auto &states = parser_to_states(parser);
 
   ASSERT(states.size() == 1);
   ASSERT(std::abs(states[0].get_max_noise() - 0.503) < 1.e-4);
@@ -1210,9 +1178,8 @@ void test_max_noise()
 void test_skip_time()
 {
   nvbench::option_parser parser;
-  parser.parse(
-    {"--benchmark", "DummyBench", "--skip-time", "12345e2"});
-  const auto& states = parser_to_states(parser);
+  parser.parse({"--benchmark", "DummyBench", "--skip-time", "12345e2"});
+  const auto &states = parser_to_states(parser);
 
   ASSERT(states.size() == 1);
   ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.);
@@ -1221,9 +1188,8 @@ void test_skip_time()
 void test_timeout()
 {
   nvbench::option_parser parser;
-  parser.parse(
-    {"--benchmark", "DummyBench", "--timeout", "12345e2"});
-  const auto& states = parser_to_states(parser);
+  parser.parse({"--benchmark", "DummyBench", "--timeout", "12345e2"});
+  const auto &states = parser_to_states(parser);
 
   ASSERT(states.size() == 1);
   ASSERT(std::abs(states[0].get_timeout() - 12345e2) < 1.);
@@ -1232,12 +1198,15 @@ void test_timeout()
 void test_stopping_criterion()
 {
   nvbench::option_parser parser;
-  parser.parse(
-    {"--benchmark", "DummyBench", 
-     "--stopping-criterion", "entropy",
-     "--max-angle", "0.42",
-     "--min-r2", "0.6"});
-  const auto& states = parser_to_states(parser);
+  parser.parse({"--benchmark",
+                "DummyBench",
+                "--stopping-criterion",
+                "entropy",
+                "--max-angle",
+                "0.42",
+                "--min-r2",
+                "0.6"});
+  const auto &states = parser_to_states(parser);
 
   ASSERT(states.size() == 1);
   ASSERT(states[0].get_stopping_criterion() == "entropy");
diff --git a/testing/range.cu b/testing/range.cu
index 53226cd..95c8765 100644
--- a/testing/range.cu
+++ b/testing/range.cu
@@ -22,12 +22,9 @@
 
 void test_basic()
 {
-  ASSERT((nvbench::range(0, 6) ==
-          std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
-  ASSERT((nvbench::range(0, 6, 1) ==
-          std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
-  ASSERT(
-    (nvbench::range(0, 6, 2) == std::vector<nvbench::int64_t>{0, 2, 4, 6}));
+  ASSERT((nvbench::range(0, 6) == std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
+  ASSERT((nvbench::range(0, 6, 1) == std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
+  ASSERT((nvbench::range(0, 6, 2) == std::vector<nvbench::int64_t>{0, 2, 4, 6}));
   ASSERT((nvbench::range(0, 6, 3) == std::vector<nvbench::int64_t>{0, 3, 6}));
   ASSERT((nvbench::range(0, 6, 4) == std::vector<nvbench::int64_t>{0, 4}));
   ASSERT((nvbench::range(0, 6, 5) == std::vector<nvbench::int64_t>{0, 5}));
@@ -37,26 +34,19 @@ void test_basic()
 void test_result_type()
 {
   // All ints should turn into int64 by default:
-  ASSERT((std::is_same_v<decltype(nvbench::range(0ll, 1ll)),
-                         std::vector<nvbench::int64_t>>));
-  ASSERT((std::is_same_v<decltype(nvbench::range(0, 1)),
-                         std::vector<nvbench::int64_t>>));
-  ASSERT((std::is_same_v<decltype(nvbench::range(0u, 1u)),
-                         std::vector<nvbench::int64_t>>));
+  ASSERT((std::is_same_v<decltype(nvbench::range(0ll, 1ll)), std::vector<nvbench::int64_t>>));
+  ASSERT((std::is_same_v<decltype(nvbench::range(0, 1)), std::vector<nvbench::int64_t>>));
+  ASSERT((std::is_same_v<decltype(nvbench::range(0u, 1u)), std::vector<nvbench::int64_t>>));
 
   // All floats should turn into float64 by default:
-  ASSERT((std::is_same_v<decltype(nvbench::range(0., 1.)),
-                         std::vector<nvbench::float64_t>>));
-  ASSERT((std::is_same_v<decltype(nvbench::range(0.f, 1.f)),
-                         std::vector<nvbench::float64_t>>));
+  ASSERT((std::is_same_v<decltype(nvbench::range(0., 1.)), std::vector<nvbench::float64_t>>));
+  ASSERT((std::is_same_v<decltype(nvbench::range(0.f, 1.f)), std::vector<nvbench::float64_t>>));
 
   // Other types may be explicitly specified:
-  ASSERT((std::is_same_v<decltype(nvbench::range<nvbench::float32_t,
-                                                 nvbench::float32_t>(0.f, 1.f)),
+  ASSERT((std::is_same_v<decltype(nvbench::range<nvbench::float32_t, nvbench::float32_t>(0.f, 1.f)),
                          std::vector<nvbench::float32_t>>));
-  ASSERT((std::is_same_v<
-          decltype(nvbench::range<nvbench::int32_t, nvbench::int32_t>(0, 1)),
-          std::vector<nvbench::int32_t>>));
+  ASSERT((std::is_same_v<decltype(nvbench::range<nvbench::int32_t, nvbench::int32_t>(0, 1)),
+                         std::vector<nvbench::int32_t>>));
 }
 
 void test_fp_tolerance()
@@ -68,10 +58,8 @@ void test_fp_tolerance()
   const nvbench::float32_t stride = 1e-4f;
   for (std::size_t size = 1; size < 1024; ++size)
   {
-    const nvbench::float32_t end =
-      start + stride * static_cast<nvbench::float32_t>(size - 1);
-    ASSERT_MSG(nvbench::range(start, end, stride).size() == size,
-               "size={}", size);
+    const nvbench::float32_t end = start + stride * static_cast<nvbench::float32_t>(size - 1);
+    ASSERT_MSG(nvbench::range(start, end, stride).size() == size, "size={}", size);
   }
 }
 
diff --git a/testing/reset_error.cu b/testing/reset_error.cu
index 8fece93..9c20e30 100644
--- a/testing/reset_error.cu
+++ b/testing/reset_error.cu
@@ -2,18 +2,17 @@
 
 #include "test_asserts.cuh"
 
-
 namespace
 {
-    __global__ void multiply5(const int32_t* __restrict__ a, int32_t* __restrict__ b)
-    {
-      const auto id = blockIdx.x * blockDim.x + threadIdx.x;
-      b[id] = 5 * a[id];
-    }
+__global__ void multiply5(const int32_t *__restrict__ a, int32_t *__restrict__ b)
+{
+  const auto id = blockIdx.x * blockDim.x + threadIdx.x;
+  b[id]         = 5 * a[id];
 }
+} // namespace
 
 int main()
-{ 
+{
   multiply5<<<256, 256>>>(nullptr, nullptr);
 
   try
diff --git a/testing/ring_buffer.cu b/testing/ring_buffer.cu
index 5af5343..35199a6 100644
--- a/testing/ring_buffer.cu
+++ b/testing/ring_buffer.cu
@@ -18,14 +18,13 @@
 
 #include <nvbench/detail/ring_buffer.cuh>
 
-#include "test_asserts.cuh"
-
 #include <algorithm>
 #include <vector>
 
+#include "test_asserts.cuh"
+
 template <typename T>
-bool equal(const nvbench::detail::ring_buffer<T> &buffer,
-           const std::vector<T> &reference)
+bool equal(const nvbench::detail::ring_buffer<T> &buffer, const std::vector<T> &reference)
 {
   return std::equal(buffer.begin(), buffer.end(), reference.begin());
 }
diff --git a/testing/runner.cu b/testing/runner.cu
index 6335d27..4d3dfc9 100644
--- a/testing/runner.cu
+++ b/testing/runner.cu
@@ -16,23 +16,22 @@
  *  limitations under the License.
  */
 
-#include <nvbench/runner.cuh>
-
 #include <nvbench/benchmark.cuh>
 #include <nvbench/callable.cuh>
+#include <nvbench/runner.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/type_list.cuh>
 #include <nvbench/type_strings.cuh>
 #include <nvbench/types.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 
 #include <algorithm>
 #include <variant>
 #include <vector>
 
+#include "test_asserts.cuh"
+
 template <typename T>
 std::vector<T> sort(std::vector<T> &&vec)
 {
@@ -65,21 +64,16 @@ using misc_types  = nvbench::type_list<bool, void>;
 using type_axes   = nvbench::type_list<float_types, int_types, misc_types>;
 
 template <typename FloatT, typename IntT, typename MiscT>
-void template_no_op_generator(nvbench::state &state,
-                              nvbench::type_list<FloatT, IntT, MiscT>)
+void template_no_op_generator(nvbench::state &state, nvbench::type_list<FloatT, IntT, MiscT>)
 {
-  ASSERT(nvbench::type_strings<FloatT>::input_string() ==
-         state.get_string("FloatT"));
-  ASSERT(nvbench::type_strings<IntT>::input_string() ==
-         state.get_string("IntT"));
-  ASSERT(nvbench::type_strings<IntT>::input_string() ==
-         state.get_string("IntT"));
+  ASSERT(nvbench::type_strings<FloatT>::input_string() == state.get_string("FloatT"));
+  ASSERT(nvbench::type_strings<IntT>::input_string() == state.get_string("IntT"));
+  ASSERT(nvbench::type_strings<IntT>::input_string() == state.get_string("IntT"));
 
   // Enum params using non-templated version:
   no_op_generator(state);
 }
-NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator,
-                                 template_no_op_callable);
+NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator, template_no_op_callable);
 
 void test_empty()
 {
diff --git a/testing/state.cu b/testing/state.cu
index a2300d1..de7848b 100644
--- a/testing/state.cu
+++ b/testing/state.cu
@@ -16,10 +16,9 @@
  *  limitations under the License.
  */
 
-#include <nvbench/state.cuh>
-
 #include <nvbench/benchmark.cuh>
 #include <nvbench/callable.cuh>
+#include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 #include <nvbench/types.cuh>
 
@@ -43,8 +42,7 @@ struct state_tester : public nvbench::state
   void set_param(std::string name, T &&value)
   {
     this->state::m_axis_values.set_value(std::move(name),
-                                         nvbench::named_values::value_type{
-                                           std::forward<T>(value)});
+                                         nvbench::named_values::value_type{std::forward<T>(value)});
   }
 };
 } // namespace nvbench::detail
diff --git a/testing/state_generator.cu b/testing/state_generator.cu
index f75be02..9042fef 100644
--- a/testing/state_generator.cu
+++ b/testing/state_generator.cu
@@ -16,17 +16,16 @@
  *  limitations under the License.
  */
 
-#include <nvbench/detail/state_generator.cuh>
-
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/axis_base.cuh>
 #include <nvbench/benchmark.cuh>
 #include <nvbench/callable.cuh>
-
-#include "test_asserts.cuh"
+#include <nvbench/detail/state_generator.cuh>
 
 #include <fmt/format.h>
 
+#include "test_asserts.cuh"
+
 // Mock up a benchmark for testing:
 void dummy_generator(nvbench::state &) {}
 NVBENCH_DEFINE_CALLABLE(dummy_generator, dummy_callable);
@@ -37,7 +36,7 @@ using ints      = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
 using misc      = nvbench::type_list<void, bool>;
 using type_axes = nvbench::type_list<floats, ints, misc>;
 template <typename F, typename I, typename M>
-void template_generator(nvbench::state &, nvbench::type_list<F, I, M>){};
+void template_generator(nvbench::state &, nvbench::type_list<F, I, M>) {};
 NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_generator, template_callable);
 using template_bench = nvbench::benchmark<template_callable, type_axes>;
 
@@ -151,20 +150,16 @@ void test_create()
   bench.set_devices(std::vector<int>{});
   bench.add_float64_axis("Radians", {3.14, 6.28});
   bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
-  bench.add_int64_axis("NumInputs",
-                       {10, 15, 20},
-                       nvbench::int64_axis_flags::power_of_two);
+  bench.add_int64_axis("NumInputs", {10, 15, 20}, nvbench::int64_axis_flags::power_of_two);
   bench.add_string_axis("Strategy", {"Recursive", "Iterative"});
 
-  const std::vector<nvbench::state> states =
-    nvbench::detail::state_generator::create(bench);
+  const std::vector<nvbench::state> states = nvbench::detail::state_generator::create(bench);
 
   // 2 (Radians) * 3 (VecSize) * 3 (NumInputs) * 2 (Strategy) = 36
   ASSERT(states.size() == 36);
 
   fmt::memory_buffer buffer;
-  const std::string table_format =
-    "| {:^5} | {:^10} | {:^7} | {:^7} | {:^9} | {:^9} |\n";
+  const std::string table_format = "| {:^5} | {:^10} | {:^7} | {:^7} | {:^9} | {:^9} |\n";
 
   fmt::format_to(std::back_inserter(buffer), "\n");
   fmt::format_to(std::back_inserter(buffer),
@@ -241,13 +236,10 @@ void test_create_with_types()
   bench.set_type_axes_names({"Floats", "Ints", "Misc"});
   bench.add_float64_axis("Radians", {3.14, 6.28});
   bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
-  bench.add_int64_axis("NumInputs",
-                       {10, 15, 20},
-                       nvbench::int64_axis_flags::power_of_two);
+  bench.add_int64_axis("NumInputs", {10, 15, 20}, nvbench::int64_axis_flags::power_of_two);
   bench.add_string_axis("Strategy", {"Recursive", "Iterative"});
 
-  const std::vector<nvbench::state> states =
-    nvbench::detail::state_generator::create(bench);
+  const std::vector<nvbench::state> states = nvbench::detail::state_generator::create(bench);
 
   // - 2 (Floats) * 2 (Ints) * 2 (Misc) = 8 total type_configs
   // - 2 (Radians) * 3 (VecSize) * 3 (NumInputs) * 2 (Strategy) = 36 non_type
@@ -591,17 +583,14 @@ void test_create_with_masked_types()
   bench.set_type_axes_names({"Floats", "Ints", "Misc"});
   bench.add_float64_axis("Radians", {3.14, 6.28});
   bench.add_int64_axis("VecSize", {2, 3, 4}, nvbench::int64_axis_flags::none);
-  bench.add_int64_axis("NumInputs",
-                       {10, 15, 20},
-                       nvbench::int64_axis_flags::power_of_two);
+  bench.add_int64_axis("NumInputs", {10, 15, 20}, nvbench::int64_axis_flags::power_of_two);
   bench.add_string_axis("Strategy", {"Recursive", "Iterative"});
 
   // Mask out some types:
   bench.get_axes().get_type_axis("Floats").set_active_inputs({"F32"});
   bench.get_axes().get_type_axis("Ints").set_active_inputs({"I64"});
 
-  const std::vector<nvbench::state> states =
-    nvbench::detail::state_generator::create(bench);
+  const std::vector<nvbench::state> states = nvbench::detail::state_generator::create(bench);
 
   fmt::memory_buffer buffer;
   std::string table_format = "| {:^5} | {:^10} | {:^6} | {:^4} | {:^4} | {:^7} "
@@ -728,8 +717,7 @@ void test_devices()
   bench.add_string_axis("S", {"foo", "bar"});
   bench.add_int64_axis("I", {2, 4});
 
-  const std::vector<nvbench::state> states =
-    nvbench::detail::state_generator::create(bench);
+  const std::vector<nvbench::state> states = nvbench::detail::state_generator::create(bench);
 
   // 3 devices * 4 axis configs = 12 total states
   ASSERT(states.size() == 12);
@@ -791,8 +779,7 @@ void test_termination_criteria()
   bench.set_skip_time(skip_time);
   bench.set_timeout(timeout);
 
-  const std::vector<nvbench::state> states =
-    nvbench::detail::state_generator::create(bench);
+  const std::vector<nvbench::state> states = nvbench::detail::state_generator::create(bench);
 
   ASSERT(states.size() == 1);
   ASSERT(min_samples == states[0].get_min_samples());
diff --git a/testing/statistics.cu b/testing/statistics.cu
index a67a044..db4a40d 100644
--- a/testing/statistics.cu
+++ b/testing/statistics.cu
@@ -19,18 +19,18 @@
 #include <nvbench/detail/statistics.cuh>
 #include <nvbench/types.cuh>
 
-#include "test_asserts.cuh"
-
 #include <algorithm>
 #include <vector>
 
+#include "test_asserts.cuh"
+
 namespace statistics = nvbench::detail::statistics;
 
 void test_mean()
 {
   {
     std::vector<nvbench::float64_t> data{1.0, 2.0, 3.0, 4.0, 5.0};
-    const nvbench::float64_t actual = statistics::compute_mean(std::begin(data), std::end(data));
+    const nvbench::float64_t actual   = statistics::compute_mean(std::begin(data), std::end(data));
     const nvbench::float64_t expected = 3.0;
     ASSERT(std::abs(actual - expected) < 0.001);
   }
@@ -46,7 +46,8 @@ void test_std()
 {
   std::vector<nvbench::float64_t> data{1.0, 2.0, 3.0, 4.0, 5.0};
   const nvbench::float64_t mean = 3.0;
-  const nvbench::float64_t actual = statistics::standard_deviation(std::begin(data), std::end(data), mean);
+  const nvbench::float64_t actual =
+    statistics::standard_deviation(std::begin(data), std::end(data), mean);
   const nvbench::float64_t expected = 1.581;
   ASSERT(std::abs(actual - expected) < 0.001);
 }
@@ -78,7 +79,8 @@ void test_r2()
   {
     std::vector<nvbench::float64_t> ys{1.0, 2.0, 3.0, 4.0, 5.0};
     auto [slope, intercept] = statistics::compute_linear_regression(std::begin(ys), std::end(ys));
-    const nvbench::float64_t actual = statistics::compute_r2(std::begin(ys), std::end(ys), slope, intercept);
+    const nvbench::float64_t actual =
+      statistics::compute_r2(std::begin(ys), std::end(ys), slope, intercept);
     const nvbench::float64_t expected = 1.0;
     ASSERT(std::abs(actual - expected) < 0.001);
   }
@@ -95,7 +97,8 @@ void test_r2()
 
     auto [slope, intercept] = statistics::compute_linear_regression(std::begin(ys), std::end(ys));
     const nvbench::float64_t expected = 0.675;
-    const nvbench::float64_t actual = statistics::compute_r2(std::begin(ys), std::end(ys), slope, intercept);
+    const nvbench::float64_t actual =
+      statistics::compute_r2(std::begin(ys), std::end(ys), slope, intercept);
     ASSERT(std::abs(actual - expected) < 0.001);
   }
 }
@@ -103,17 +106,17 @@ void test_r2()
 void test_slope_conversion()
 {
   {
-    const nvbench::float64_t actual = statistics::slope2deg(0.0);
+    const nvbench::float64_t actual   = statistics::slope2deg(0.0);
     const nvbench::float64_t expected = 0.0;
     ASSERT(std::abs(actual - expected) < 0.001);
   }
   {
-    const nvbench::float64_t actual = statistics::slope2deg(1.0);
+    const nvbench::float64_t actual   = statistics::slope2deg(1.0);
     const nvbench::float64_t expected = 45.0;
     ASSERT(std::abs(actual - expected) < 0.001);
   }
   {
-    const nvbench::float64_t actual = statistics::slope2deg(5.0);
+    const nvbench::float64_t actual   = statistics::slope2deg(5.0);
     const nvbench::float64_t expected = 78.69;
     ASSERT(std::abs(actual - expected) < 0.001);
   }
diff --git a/testing/stdrel_criterion.cu b/testing/stdrel_criterion.cu
index b8bfc4e..fbe1dab 100644
--- a/testing/stdrel_criterion.cu
+++ b/testing/stdrel_criterion.cu
@@ -20,13 +20,13 @@
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/types.cuh>
 
-#include "test_asserts.cuh"
-
 #include <algorithm>
 #include <numeric>
 #include <random>
 #include <vector>
 
+#include "test_asserts.cuh"
+
 void test_const()
 {
   nvbench::criterion_params params;
@@ -46,14 +46,14 @@ std::vector<double> generate(double mean, double rel_std_dev, int size)
   std::mt19937 gen(seed++);
   std::vector<nvbench::float64_t> v(static_cast<std::size_t>(size));
   std::normal_distribution<nvbench::float64_t> dist(mean, mean * rel_std_dev);
-  std::generate(v.begin(), v.end(), [&]{ return dist(gen); });
+  std::generate(v.begin(), v.end(), [&] { return dist(gen); });
   return v;
 }
 
 void test_stdrel()
 {
-  const nvbench::int64_t size = 10;
-  const nvbench::float64_t mean = 42.0;
+  const nvbench::int64_t size        = 10;
+  const nvbench::float64_t mean      = 42.0;
   const nvbench::float64_t max_noise = 0.1;
 
   nvbench::criterion_params params;
@@ -62,7 +62,7 @@ void test_stdrel()
   nvbench::detail::stdrel_criterion criterion;
   criterion.initialize(params);
 
-  for (nvbench::float64_t measurement: generate(mean, max_noise / 2, size))
+  for (nvbench::float64_t measurement : generate(mean, max_noise / 2, size))
   {
     criterion.add_measurement(measurement);
   }
@@ -71,7 +71,7 @@ void test_stdrel()
   params.set_float64("max-noise", max_noise);
   criterion.initialize(params);
 
-  for (nvbench::float64_t measurement: generate(mean, max_noise * 2, size))
+  for (nvbench::float64_t measurement : generate(mean, max_noise * 2, size))
   {
     criterion.add_measurement(measurement);
   }
diff --git a/testing/string_axis.cu b/testing/string_axis.cu
index 1fb3d6f..80bf9c8 100644
--- a/testing/string_axis.cu
+++ b/testing/string_axis.cu
@@ -32,8 +32,7 @@ void test_empty()
 
   const auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
-    dynamic_cast<const nvbench::string_axis *>(clone_base.get());
+  const auto *clone = dynamic_cast<const nvbench::string_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "Empty");
@@ -61,8 +60,7 @@ void test_basic()
 
   const auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
-    dynamic_cast<const nvbench::string_axis *>(clone_base.get());
+  const auto *clone = dynamic_cast<const nvbench::string_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "Basic");
diff --git a/testing/test_asserts.cuh b/testing/test_asserts.cuh
index 58be9a1..704cd19 100644
--- a/testing/test_asserts.cuh
+++ b/testing/test_asserts.cuh
@@ -23,55 +23,54 @@
 #include <cstdio>
 #include <stdexcept>
 
-#define ASSERT(cond)                                                            \
-  do                                                                            \
-  {                                                                             \
-    if (cond)                                                                   \
-    {}                                                                          \
-    else                                                                        \
-    {                                                                           \
-      fmt::print("{}:{}: Assertion failed ({}).\n", __FILE__, __LINE__, #cond); \
-      std::fflush(stdout);                                                      \
-      throw std::runtime_error("Unit test failure.");                           \
-    }                                                                           \
+#define ASSERT(cond)                                                                               \
+  do                                                                                               \
+  {                                                                                                \
+    if (cond)                                                                                      \
+    {                                                                                              \
+    }                                                                                              \
+    else                                                                                           \
+    {                                                                                              \
+      fmt::print("{}:{}: Assertion failed ({}).\n", __FILE__, __LINE__, #cond);                    \
+      std::fflush(stdout);                                                                         \
+      throw std::runtime_error("Unit test failure.");                                              \
+    }                                                                                              \
   } while (false)
 
-#define ASSERT_MSG(cond, fmtstr, ...)                                          \
-  do                                                                           \
-  {                                                                            \
-    if (cond)                                                                  \
-    {}                                                                         \
-    else                                                                       \
-    {                                                                          \
-      fmt::print("{}:{}: Test assertion failed ({}) {}\n",                     \
-                 __FILE__,                                                     \
-                 __LINE__,                                                     \
-                 #cond,                                                        \
-                 fmt::format(fmtstr, __VA_ARGS__));                            \
-      std::fflush(stdout);                                                     \
-      throw std::runtime_error("Unit test failure.");                          \
-    }                                                                          \
+#define ASSERT_MSG(cond, fmtstr, ...)                                                              \
+  do                                                                                               \
+  {                                                                                                \
+    if (cond)                                                                                      \
+    {                                                                                              \
+    }                                                                                              \
+    else                                                                                           \
+    {                                                                                              \
+      fmt::print("{}:{}: Test assertion failed ({}) {}\n",                                         \
+                 __FILE__,                                                                         \
+                 __LINE__,                                                                         \
+                 #cond,                                                                            \
+                 fmt::format(fmtstr, __VA_ARGS__));                                                \
+      std::fflush(stdout);                                                                         \
+      throw std::runtime_error("Unit test failure.");                                              \
+    }                                                                                              \
   } while (false)
 
-#define ASSERT_THROWS_ANY(expr)                                                \
-  do                                                                           \
-  {                                                                            \
-    bool threw = false;                                                        \
-    try                                                                        \
-    {                                                                          \
-      expr;                                                                    \
-    }                                                                          \
-    catch (...)                                                                \
-    {                                                                          \
-      threw = true;                                                            \
-    }                                                                          \
-    if (!threw)                                                                \
-    {                                                                          \
-      fmt::print("{}:{}: Expression expected exception: '{}'.",                \
-                 __FILE__,                                                     \
-                 __LINE__,                                                     \
-                 #expr);                                                       \
-      std::fflush(stdout);                                                     \
-      throw std::runtime_error("Unit test failure.");                          \
-    }                                                                          \
+#define ASSERT_THROWS_ANY(expr)                                                                    \
+  do                                                                                               \
+  {                                                                                                \
+    bool threw = false;                                                                            \
+    try                                                                                            \
+    {                                                                                              \
+      expr;                                                                                        \
+    }                                                                                              \
+    catch (...)                                                                                    \
+    {                                                                                              \
+      threw = true;                                                                                \
+    }                                                                                              \
+    if (!threw)                                                                                    \
+    {                                                                                              \
+      fmt::print("{}:{}: Expression expected exception: '{}'.", __FILE__, __LINE__, #expr);        \
+      std::fflush(stdout);                                                                         \
+      throw std::runtime_error("Unit test failure.");                                              \
+    }                                                                                              \
   } while (false)
diff --git a/testing/type_axis.cu b/testing/type_axis.cu
index 3cfff13..17c5133 100644
--- a/testing/type_axis.cu
+++ b/testing/type_axis.cu
@@ -17,13 +17,12 @@
  */
 
 #include <nvbench/type_axis.cuh>
-
 #include <nvbench/types.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 
+#include "test_asserts.cuh"
+
 void test_empty()
 {
   nvbench::type_axis axis("Basic", 0);
@@ -39,8 +38,7 @@ void test_empty()
 
   const auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
-    dynamic_cast<const nvbench::type_axis *>(clone_base.get());
+  const auto *clone = dynamic_cast<const nvbench::type_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "Basic");
@@ -63,8 +61,7 @@ void test_single()
 
   auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  auto *clone =
-    dynamic_cast<nvbench::type_axis *>(clone_base.get());
+  auto *clone = dynamic_cast<nvbench::type_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "Single");
@@ -102,8 +99,7 @@ void test_single()
 void test_several()
 {
   nvbench::type_axis axis("Several", 0);
-  axis.set_inputs<
-    nvbench::type_list<nvbench::int32_t, nvbench::float64_t, bool>>();
+  axis.set_inputs<nvbench::type_list<nvbench::int32_t, nvbench::float64_t, bool>>();
 
   ASSERT(axis.get_name() == "Several");
   ASSERT(axis.get_size() == 3);
@@ -122,8 +118,7 @@ void test_several()
 
   auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  auto *clone =
-    dynamic_cast<nvbench::type_axis *>(clone_base.get());
+  auto *clone = dynamic_cast<nvbench::type_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_name() == "Several");
@@ -177,9 +172,8 @@ void test_several()
 void test_get_type_index()
 {
   nvbench::type_axis axis("GetIndexTest", 0);
-  axis.set_inputs<
-    nvbench::
-      type_list<nvbench::int8_t, nvbench::uint16_t, nvbench::float32_t, bool>>();
+  axis
+    .set_inputs<nvbench::type_list<nvbench::int8_t, nvbench::uint16_t, nvbench::float32_t, bool>>();
 
   ASSERT(axis.get_type_index("I8") == 0);
   ASSERT(axis.get_type_index("U16") == 1);
@@ -188,8 +182,7 @@ void test_get_type_index()
 
   const auto clone_base = axis.clone();
   ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
-    dynamic_cast<const nvbench::type_axis *>(clone_base.get());
+  const auto *clone = dynamic_cast<const nvbench::type_axis *>(clone_base.get());
   ASSERT(clone != nullptr);
 
   ASSERT(clone->get_type_index("I8") == 0);
diff --git a/testing/type_list.cu b/testing/type_list.cu
index 2e64e6d..99b1ee2 100644
--- a/testing/type_list.cu
+++ b/testing/type_list.cu
@@ -17,11 +17,8 @@
  */
 
 #include <nvbench/type_list.cuh>
-
 #include <nvbench/type_strings.cuh>
 
-#include "test_asserts.cuh"
-
 #include <fmt/format.h>
 #include <fmt/ranges.h>
 
@@ -30,6 +27,8 @@
 #include <type_traits>
 #include <vector>
 
+#include "test_asserts.cuh"
+
 // Unique, numbered types for testing type_list functionality.
 using T0 = std::integral_constant<std::size_t, 0>;
 using T1 = std::integral_constant<std::size_t, 1>;
@@ -80,14 +79,13 @@ struct test_concat
 
   struct empty_tests
   {
-    static_assert(
-      std::is_same_v<nvbench::tl::concat<TLEmpty, TLEmpty>, TLEmpty>);
+    static_assert(std::is_same_v<nvbench::tl::concat<TLEmpty, TLEmpty>, TLEmpty>);
     static_assert(std::is_same_v<nvbench::tl::concat<TLEmpty, TL012>, TL012>);
     static_assert(std::is_same_v<nvbench::tl::concat<TL012, TLEmpty>, TL012>);
   };
 
-  static_assert(std::is_same_v<nvbench::tl::concat<TL012, TL765>,
-                               nvbench::type_list<T0, T1, T2, T7, T6, T5>>);
+  static_assert(
+    std::is_same_v<nvbench::tl::concat<TL012, TL765>, nvbench::type_list<T0, T1, T2, T7, T6, T5>>);
 };
 
 struct test_prepend_each
@@ -97,8 +95,7 @@ struct test_prepend_each
   using T23 = nvbench::type_list<T2, T3>;
   using TLs = nvbench::type_list<T01, T23>;
 
-  using Expected = nvbench::type_list<nvbench::type_list<T, T0, T1>,
-                                      nvbench::type_list<T, T2, T3>>;
+  using Expected = nvbench::type_list<nvbench::type_list<T, T0, T1>, nvbench::type_list<T, T2, T3>>;
   static_assert(std::is_same_v<nvbench::tl::prepend_each<T, TLs>, Expected>);
 };
 
@@ -110,16 +107,12 @@ struct test_empty_cartesian_product
 
 struct test_single_cartesian_product
 {
-  using prod_1 =
-    nvbench::tl::cartesian_product<nvbench::type_list<nvbench::type_list<T0>>>;
-  static_assert(
-    std::is_same_v<prod_1, nvbench::type_list<nvbench::type_list<T0>>>);
+  using prod_1 = nvbench::tl::cartesian_product<nvbench::type_list<nvbench::type_list<T0>>>;
+  static_assert(std::is_same_v<prod_1, nvbench::type_list<nvbench::type_list<T0>>>);
 
-  using prod_2 = nvbench::tl::cartesian_product<
-    nvbench::type_list<nvbench::type_list<T0, T1>>>;
-  static_assert(std::is_same_v<prod_2,
-                               nvbench::type_list<nvbench::type_list<T0>,
-                                                  nvbench::type_list<T1>>>);
+  using prod_2 = nvbench::tl::cartesian_product<nvbench::type_list<nvbench::type_list<T0, T1>>>;
+  static_assert(
+    std::is_same_v<prod_2, nvbench::type_list<nvbench::type_list<T0>, nvbench::type_list<T1>>>);
 };
 
 struct test_cartesian_product