Formatting updates.

This commit is contained in:
Allison Piper
2025-04-14 17:26:12 +00:00
parent de36f1a248
commit 3440855dbd
107 changed files with 808 additions and 967 deletions

View File

@@ -41,16 +41,28 @@ CompactNamespaces: false
ContinuationIndentWidth: 2
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<cub'
Priority: 1
- Regex: '^<thrust'
Priority: 2
- Regex: '^<cuda'
Priority: 3
- Regex: '^<nvbench'
Priority: 1
- Regex: '^<cub'
Priority: 2
- Regex: '^<thrust'
Priority: 3
- Regex: '^<cuda/'
Priority: 4
- Regex: '^<[a-z]*>$'
- Regex: '^<cuda'
Priority: 5
- Regex: '^<nvml'
Priority: 6
- Regex: '^<cupti'
Priority: 7
- Regex: '^<nvperf'
Priority: 8
- Regex: '^<nlohmann'
Priority: 9
- Regex: '^<fmt'
Priority: 10
- Regex: '^<[a-z_]*>$'
Priority: 11
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 2

View File

@@ -34,7 +34,7 @@ CCCL uses [Development Containers](https://containers.dev/) to provide consisten
- Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. Type "Remote-Containers: Reopen in Container" and select it.
![Shows "Reopen in Container" in command pallete.](./img/open_in_container_manual.png)
![Shows "Reopen in Container" in command palette.](./img/open_in_container_manual.png)
4. Select an environment with the desired CTK and host compiler from the list:
@@ -136,7 +136,7 @@ For more information, see the `.devcontainer/make_devcontainers.sh --help` messa
2. Install WSL 2 by running:
```bash
wsl --install
wsl --install
```
This should probably install the Ubuntu distro as the default.
@@ -182,14 +182,14 @@ then run `sudo systemctl restart docker.service`.
10. Open the CCCL cloned repo in VS Code ( `Ctrl + Shift + P `, select `File: Open Folder...` and select the path where your CCCL clone is located).
11. If prompted, choose `Reopen in Container`.
- If you are not prompted just type `Ctrl + Shift + P` and `Dev Containers: Open Folder in Container ...`.
12. Verify that Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal. For a proper configuration it is important for the steps in [Install prerequisites and VS Code extensions](#prereqs) to be followed in a precise order.
From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration:
13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message.
13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message.
> Failed opening a web browser at https://github.com/login/device
exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH

View File

@@ -304,4 +304,3 @@ main() {
}
main "$@"

View File

@@ -14,4 +14,3 @@
#
# Only add commits that are pure formatting changes (e.g. clang-format version changes, etc).
8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100

View File

@@ -65,6 +65,7 @@ long_ext_count = 10
class Target:
"""Represents a single line read for a .ninja_log file."""
def __init__(self, start, end):
"""Creates a target object by passing in the start/end times in seconds
as a float."""
@@ -94,9 +95,9 @@ class Target:
"""
# Allow for modest floating-point errors
epsilon = 0.000002
if (self.weighted_duration > self.Duration() + epsilon):
print('%s > %s?' % (self.weighted_duration, self.Duration()))
assert (self.weighted_duration <= self.Duration() + epsilon)
if self.weighted_duration > self.Duration() + epsilon:
print("%s > %s?" % (self.weighted_duration, self.Duration()))
assert self.weighted_duration <= self.Duration() + epsilon
return self.weighted_duration
def DescribeTargets(self):
@@ -104,10 +105,10 @@ class Target:
# Some build steps generate dozens of outputs - handle them sanely.
# The max_length was chosen so that it can fit most of the long
# single-target names, while minimizing word wrapping.
result = ', '.join(self.targets)
result = ", ".join(self.targets)
max_length = 65
if len(result) > max_length:
result = result[:max_length] + '...'
result = result[:max_length] + "..."
return result
@@ -121,12 +122,11 @@ def ReadTargets(log, show_all):
# targets.
if not header:
return []
assert header == '# ninja log v5\n', \
'unrecognized ninja log version %r' % header
assert header == "# ninja log v5\n", "unrecognized ninja log version %r" % header
targets_dict = {}
last_end_seen = 0.0
for line in log:
parts = line.strip().split('\t')
parts = line.strip().split("\t")
if len(parts) != 5:
# If ninja.exe is rudely halted then the .ninja_log file may be
# corrupt. Silently continue.
@@ -165,17 +165,17 @@ def ReadTargets(log, show_all):
def GetExtension(target, extra_patterns):
"""Return the file extension that best represents a target.
For targets that generate multiple outputs it is important to return a
consistent 'canonical' extension. Ultimately the goal is to group build steps
by type."""
For targets that generate multiple outputs it is important to return a
consistent 'canonical' extension. Ultimately the goal is to group build steps
by type."""
for output in target.targets:
if extra_patterns:
for fn_pattern in extra_patterns.split(';'):
if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
for fn_pattern in extra_patterns.split(";"):
if fnmatch.fnmatch(output, "*" + fn_pattern + "*"):
return fn_pattern
# Not a true extension, but a good grouping.
if output.endswith('type_mappings'):
extension = 'type_mappings'
if output.endswith("type_mappings"):
extension = "type_mappings"
break
# Capture two extensions if present. For example: file.javac.jar should
@@ -185,26 +185,26 @@ def GetExtension(target, extra_patterns):
extension = ext2 + ext1 # Preserve the order in the file name.
if len(extension) == 0:
extension = '(no extension found)'
extension = "(no extension found)"
if ext1 in ['.pdb', '.dll', '.exe']:
extension = 'PEFile (linking)'
if ext1 in [".pdb", ".dll", ".exe"]:
extension = "PEFile (linking)"
# Make sure that .dll and .exe are grouped together and that the
# .dll.lib files don't cause these to be listed as libraries
break
if ext1 in ['.so', '.TOC']:
extension = '.so (linking)'
if ext1 in [".so", ".TOC"]:
extension = ".so (linking)"
# Attempt to identify linking, avoid identifying as '.TOC'
break
# Make sure .obj files don't get categorized as mojo files
if ext1 in ['.obj', '.o']:
if ext1 in [".obj", ".o"]:
break
# Jars are the canonical output of java targets.
if ext1 == '.jar':
if ext1 == ".jar":
break
# Normalize all mojo related outputs to 'mojo'.
if output.count('.mojom') > 0:
extension = 'mojo'
if output.count(".mojom") > 0:
extension = "mojo"
break
return extension
@@ -229,8 +229,8 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
if target.end > latest:
latest = target.end
total_cpu_time += target.Duration()
task_start_stop_times.append((target.start, 'start', target))
task_start_stop_times.append((target.end, 'stop', target))
task_start_stop_times.append((target.start, "start", target))
task_start_stop_times.append((target.end, "stop", target))
length = latest - earliest
weighted_total = 0.0
@@ -256,10 +256,10 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
if num_running > 0:
# Update the total weighted time up to this moment.
last_weighted_time += (time - last_time) / float(num_running)
if action_name == 'start':
if action_name == "start":
# Record the total weighted task time when this task starts.
running_tasks[target] = last_weighted_time
if action_name == 'stop':
if action_name == "stop":
# Record the change in the total weighted task time while this task
# ran.
weighted_duration = last_weighted_time - running_tasks[target]
@@ -267,24 +267,27 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
weighted_total += weighted_duration
del running_tasks[target]
last_time = time
assert (len(running_tasks) == 0)
assert len(running_tasks) == 0
# Warn if the sum of weighted times is off by more than half a second.
if abs(length - weighted_total) > 500:
print('Warning: Possible corrupt ninja log, results may be '
'untrustworthy. Length = %.3f, weighted total = %.3f' %
(length, weighted_total))
print(
"Warning: Possible corrupt ninja log, results may be "
"untrustworthy. Length = %.3f, weighted total = %.3f"
% (length, weighted_total)
)
# Print the slowest build steps:
print(' Longest build steps:')
print(" Longest build steps:")
if elapsed_time_sorting:
entries.sort(key=lambda x: x.Duration())
else:
entries.sort(key=lambda x: x.WeightedDuration())
for target in entries[-long_count:]:
print(' %8.1f weighted s to build %s (%.1f s elapsed time)' %
(target.WeightedDuration(), target.DescribeTargets(),
target.Duration()))
print(
" %8.1f weighted s to build %s (%.1f s elapsed time)"
% (target.WeightedDuration(), target.DescribeTargets(), target.Duration())
)
# Sum up the time by file extension/type of the output file
count_by_ext = {}
@@ -293,51 +296,56 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
# Scan through all of the targets to build up per-extension statistics.
for target in entries:
extension = GetExtension(target, extra_step_types)
time_by_ext[extension] = time_by_ext.get(extension,
0) + target.Duration()
weighted_time_by_ext[extension] = weighted_time_by_ext.get(
extension, 0) + target.WeightedDuration()
time_by_ext[extension] = time_by_ext.get(extension, 0) + target.Duration()
weighted_time_by_ext[extension] = (
weighted_time_by_ext.get(extension, 0) + target.WeightedDuration()
)
count_by_ext[extension] = count_by_ext.get(extension, 0) + 1
print(' Time by build-step type:')
print(" Time by build-step type:")
# Copy to a list with extension name and total time swapped, to (time, ext)
if elapsed_time_sorting:
weighted_time_by_ext_sorted = sorted(
(y, x) for (x, y) in time_by_ext.items())
weighted_time_by_ext_sorted = sorted((y, x) for (x, y) in time_by_ext.items())
else:
weighted_time_by_ext_sorted = sorted(
(y, x) for (x, y) in weighted_time_by_ext.items())
(y, x) for (x, y) in weighted_time_by_ext.items()
)
# Print the slowest build target types:
for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]:
print(
' %8.1f s weighted time to generate %d %s files '
'(%1.1f s elapsed time sum)' %
(time, count_by_ext[extension], extension, time_by_ext[extension]))
" %8.1f s weighted time to generate %d %s files "
"(%1.1f s elapsed time sum)"
% (time, count_by_ext[extension], extension, time_by_ext[extension])
)
print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
'parallelism)' %
(length, total_cpu_time, total_cpu_time * 1.0 / length))
print(' %d build steps completed, average of %1.2f/s' %
(len(entries), len(entries) / (length)))
print(
" %.1f s weighted time (%.1f s elapsed time sum, %1.1fx "
"parallelism)" % (length, total_cpu_time, total_cpu_time * 1.0 / length)
)
print(
" %d build steps completed, average of %1.2f/s"
% (len(entries), len(entries) / (length))
)
def main():
log_file = '.ninja_log'
metrics_file = 'siso_metrics.json'
log_file = ".ninja_log"
metrics_file = "siso_metrics.json"
parser = argparse.ArgumentParser()
parser.add_argument('-C', dest='build_directory', help='Build directory.')
parser.add_argument("-C", dest="build_directory", help="Build directory.")
parser.add_argument(
'-s',
'--step-types',
help='semicolon separated fnmatch patterns for build-step grouping')
"-s",
"--step-types",
help="semicolon separated fnmatch patterns for build-step grouping",
)
parser.add_argument(
'-e',
'--elapsed_time_sorting',
"-e",
"--elapsed_time_sorting",
default=False,
action='store_true',
help='Sort output by elapsed time instead of weighted time')
parser.add_argument('--log-file',
help="specific ninja log file to analyze.")
action="store_true",
help="Sort output by elapsed time instead of weighted time",
)
parser.add_argument("--log-file", help="specific ninja log file to analyze.")
args, _extra_args = parser.parse_known_args()
if args.build_directory:
log_file = os.path.join(args.build_directory, log_file)
@@ -348,34 +356,35 @@ def main():
# Offer a convenient way to add extra step types automatically,
# including when this script is run by autoninja. get() returns None if
# the variable isn't set.
args.step_types = os.environ.get('chromium_step_types')
args.step_types = os.environ.get("chromium_step_types")
if args.step_types:
# Make room for the extra build types.
global long_ext_count
long_ext_count += len(args.step_types.split(';'))
long_ext_count += len(args.step_types.split(";"))
if os.path.exists(metrics_file):
# Automatically handle summarizing siso builds.
cmd = ['siso.bat' if 'win32' in sys.platform else 'siso']
cmd.extend(['metrics', 'summary'])
cmd = ["siso.bat" if "win32" in sys.platform else "siso"]
cmd.extend(["metrics", "summary"])
if args.build_directory:
cmd.extend(['-C', args.build_directory])
cmd.extend(["-C", args.build_directory])
if args.step_types:
cmd.extend(['--step_types', args.step_types])
cmd.extend(["--step_types", args.step_types])
if args.elapsed_time_sorting:
cmd.append('--elapsed_time_sorting')
cmd.append("--elapsed_time_sorting")
subprocess.run(cmd)
else:
try:
with open(log_file, 'r') as log:
with open(log_file, "r") as log:
entries = ReadTargets(log, False)
if entries:
SummarizeEntries(entries, args.step_types,
args.elapsed_time_sorting)
SummarizeEntries(
entries, args.step_types, args.elapsed_time_sorting
)
except IOError:
print('Log file %r not found, no build summary created.' % log_file)
print("Log file %r not found, no build summary created." % log_file)
return errno.ENOENT
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@@ -37,5 +37,5 @@ if [ $requests_diff -eq 0 ]; then
else
hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}')
echo "sccache hit rate: $hit_rate%" >&2
echo "$hit_rate"
echo "$hit_rate"
fi

View File

@@ -1,4 +1,4 @@
# Detect the langauge standards supported by the current compilers.
# Detect the language standards supported by the current compilers.
#
# Usage: detect_supported_cxx_standards(<var_prefix> <lang> <standards>)
#

View File

@@ -14,7 +14,7 @@
# limitations under the License.
# Passes all args directly to execute_process while setting up the following
# results variables and propogating them to the caller's scope:
# results variables and propagating them to the caller's scope:
#
# - nvbench_process_exit_code
# - nvbench_process_stdout

View File

@@ -9,9 +9,9 @@
// a potential macro collision and halts.
//
// Hacky way to build a string, but it works on all tested platforms.
#define NVBench_MACRO_CHECK(MACRO, HEADER) \
NVBench_MACRO_CHECK_IMPL(Identifier MACRO should not be used from NVBench \
headers due to conflicts with HEADER macros.)
#define NVBench_MACRO_CHECK(MACRO, HEADER) \
NVBench_MACRO_CHECK_IMPL( \
Identifier MACRO should not be used from NVBench headers due to conflicts with HEADER macros.)
// Use raw platform checks instead of the NVBench_HOST_COMPILER macros since we
// don't want to #include any headers other than the one being tested.
@@ -34,8 +34,8 @@
// library implementations unconditionally `#undef` these macros, which then
// causes random failures later.
// Leaving these commented out as a warning: Here be dragons.
//#define min(...) NVBench_MACRO_CHECK('min', windows.h)
//#define max(...) NVBench_MACRO_CHECK('max', windows.h)
// #define min(...) NVBench_MACRO_CHECK('min', windows.h)
// #define max(...) NVBench_MACRO_CHECK('max', windows.h)
// termios.h conflicts (NVIDIA/thrust#1547)
#define B0 NVBench_MACRO_CHECK("B0", termios.h)

View File

@@ -90,7 +90,7 @@
before any `--benchmark` arguments.
* `--stopping-criterion <criterion>`
* After `--min-samples` is satisfied, use `<criterion>` to detect if enough
* After `--min-samples` is satisfied, use `<criterion>` to detect if enough
samples were collected.
* Only applies to Cold measurements.
* Default is stdrel (`--stopping-criterion stdrel`)

View File

@@ -24,37 +24,33 @@
template <int ItemsPerThread>
__global__ void kernel(std::size_t stride,
std::size_t elements,
const nvbench::int32_t * __restrict__ in,
const nvbench::int32_t *__restrict__ in,
nvbench::int32_t *__restrict__ out)
{
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t step = gridDim.x * blockDim.x;
for (std::size_t i = stride * tid;
i < stride * elements;
i += stride * step)
for (std::size_t i = stride * tid; i < stride * elements; i += stride * step)
{
for (int j = 0; j < ItemsPerThread; j++)
{
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto write_id = tid + j * elements;
out[write_id] = in[read_id];
out[write_id] = in[read_id];
}
}
}
// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
// and cache hit rates.
//
// Calling state.collect_*() enables particular metric collection if nvbench
// was built with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
template <int ItemsPerThread>
void throughput_bench(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
{
// Allocate input data:
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
thrust::device_vector<nvbench::int32_t> input(elements);
thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
@@ -72,12 +68,11 @@ void throughput_bench(nvbench::state &state,
static_cast<int>((elements + threads_in_block - 1) / threads_in_block);
state.exec([&](nvbench::launch &launch) {
kernel<ItemsPerThread>
<<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
});
}

View File

@@ -71,18 +71,16 @@ void copy_sweep_grid_shape(nvbench::state &state)
thrust::device_vector<nvbench::int32_t> in(num_values, 0);
thrust::device_vector<nvbench::int32_t> out(num_values, 0);
state.exec(
[block_size,
num_blocks,
num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
in_ptr,
out_ptr,
num_values);
});
state.exec([block_size,
num_blocks,
num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
}
NVBENCH_BENCH(copy_sweep_grid_shape)
// Every second power of two from 64->1024:
@@ -107,15 +105,12 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
thrust::device_vector<ValueType> in(num_values, 0);
thrust::device_vector<ValueType> out(num_values, 0);
state.exec(
[num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
state.exec([num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
});
}
// Define a type_list to use for the type axis:
using cts_types = nvbench::type_list<nvbench::uint8_t,
@@ -131,11 +126,10 @@ NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
// Convert 64 MiB of InputTypes to OutputTypes, represented with various
// value_types.
template <typename InputType, typename OutputType>
void copy_type_conversion_sweep(nvbench::state &state,
nvbench::type_list<InputType, OutputType>)
void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// Optional: Skip narrowing conversions.
if constexpr(sizeof(InputType) > sizeof(OutputType))
if constexpr (sizeof(InputType) > sizeof(OutputType))
{
state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
return;
@@ -154,15 +148,12 @@ void copy_type_conversion_sweep(nvbench::state &state,
thrust::device_vector<InputType> in(num_values, 0);
thrust::device_vector<OutputType> out(num_values, 0);
state.exec(
[num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
state.exec([num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
});
}
// Optional: Skip when InputType == OutputType. This approach avoids
// instantiating the benchmark at all.
@@ -178,6 +169,5 @@ using ctcs_types = nvbench::type_list<nvbench::int8_t,
nvbench::float32_t,
nvbench::int64_t,
nvbench::float64_t>;
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
.set_type_axes_names({"In", "Out"});

View File

@@ -36,10 +36,7 @@ public:
protected:
// Setup the criterion in the `do_initialize()` method:
virtual void do_initialize() override
{
m_num_samples = 0;
}
virtual void do_initialize() override { m_num_samples = 0; }
// Process new measurements in the `add_measurement()` method:
virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
@@ -52,7 +49,6 @@ protected:
{
return m_num_samples >= m_params.get_int64("max-samples");
}
};
// Register the criterion with NVBench:
@@ -71,7 +67,7 @@ void throughput_bench(nvbench::state &state)
state.add_global_memory_writes<nvbench::int32_t>(num_values);
state.exec(nvbench::exec_tag::no_batch, [&input, &output, num_values](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),

View File

@@ -17,7 +17,6 @@
*/
#include <nvbench/nvbench.cuh>
#include <nvbench/test_kernels.cuh>
// Enum to use as parameter axis:
@@ -68,12 +67,10 @@ void runtime_enum_sweep_string(nvbench::state &state)
// Create inputs, etc, configure runtime kernel parameters, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
NVBENCH_BENCH(runtime_enum_sweep_string)
.add_string_axis("MyEnum", {"A", "B", "C"});
NVBENCH_BENCH(runtime_enum_sweep_string).add_string_axis("MyEnum", {"A", "B", "C"});
//==============================================================================
// Sweep through enum values at runtime using an int64 axis.
@@ -97,9 +94,8 @@ void runtime_enum_sweep_int64(nvbench::state &state)
// Create inputs, etc, configure runtime kernel parameters, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
NVBENCH_BENCH(runtime_enum_sweep_int64)
.add_int64_axis("MyEnum",
@@ -178,12 +174,10 @@ void compile_time_enum_sweep(nvbench::state &state,
// Template parameters, static dispatch, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
using MyEnumList =
nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
using MyEnumList = nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
.set_type_axes_names({"MyEnum"});
@@ -199,16 +193,14 @@ NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
// * `-12` (struct std::integral_constant<int,-12>)
// ```
template <nvbench::int32_t IntValue>
void compile_time_int_sweep(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<IntValue>>)
void compile_time_int_sweep(nvbench::state &state, nvbench::type_list<nvbench::enum_type<IntValue>>)
{
// Use IntValue in compile time contexts.
// Template parameters, static dispatch, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
using MyInts = nvbench::enum_type_list<0, 16, 4096, -12>;
NVBENCH_BENCH_TYPES(compile_time_int_sweep, NVBENCH_TYPE_AXES(MyInts))

View File

@@ -53,9 +53,7 @@ void sequence_bench(nvbench::state &state)
// nvbench::exec_tag::sync indicates that this will implicitly sync:
state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
thrust::sequence(thrust::device.on(launch.get_stream()),
data.begin(),
data.end());
thrust::sequence(thrust::device.on(launch.get_stream()), data.begin(), data.end());
});
}
NVBENCH_BENCH(sequence_bench);

View File

@@ -23,8 +23,8 @@
// Thrust simplifies memory management, etc:
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>
// mod2_inplace performs an in-place mod2 over every element in `data`. `data`
@@ -54,7 +54,7 @@ void mod2_inplace(nvbench::state &state)
state.exec(nvbench::exec_tag::timer,
// Lambda now takes a `timer` argument:
[&input, &data, num_values](nvbench::launch &launch, auto &timer) {
(void) num_values; // clang thinks this is unused...
(void)num_values; // clang thinks this is unused...
// Reset working data:
thrust::copy(thrust::device.on(launch.get_stream()),

View File

@@ -72,14 +72,12 @@ NVBENCH_BENCH(runtime_skip)
// Two type axes are swept, but configurations where InputType == OutputType are
// skipped.
template <typename InputType, typename OutputType>
void skip_overload(nvbench::state &state,
nvbench::type_list<InputType, OutputType>)
void skip_overload(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// This is a contrived example that focuses on the skip overloads, so this is
// just a sleep kernel:
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
// Overload of skip_overload that is called when InputType == OutputType.
template <typename T>
@@ -107,9 +105,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// This is a contrived example that focuses on the skip overloads, so this is
// just a sleep kernel:
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
// Enable this overload if InputType is larger than OutputType
template <typename InputType, typename OutputType>
@@ -119,10 +116,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
state.skip("sizeof(InputType) > sizeof(OutputType).");
}
// The same type_list is used for both inputs/outputs.
using sn_types = nvbench::type_list<nvbench::int8_t,
nvbench::int16_t,
nvbench::int32_t,
nvbench::int64_t>;
using sn_types =
nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
// Setup benchmark:
NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
.set_type_axes_names({"In", "Out"});

View File

@@ -52,7 +52,7 @@ void stream_bench(nvbench::state &state)
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
state.exec([&input, &output, num_values](nvbench::launch &) {
(void) num_values; // clang thinks this is unused...
(void)num_values; // clang thinks this is unused...
copy(thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),
num_values);

View File

@@ -26,8 +26,8 @@
void summary_example(nvbench::state &state)
{
// Fetch parameters and compute duration in seconds:
const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
const auto duration = ms * 1e-3 + us * 1e-6;
// Add a new column to the summary table with the derived duration used by the benchmark.

View File

@@ -51,7 +51,7 @@ void throughput_bench(nvbench::state &state)
state.add_global_memory_writes<nvbench::int32_t>(num_values);
state.exec([&input, &output, num_values](nvbench::launch &launch) {
(void) num_values; // clang thinks this is unused...
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),

View File

@@ -1,20 +1,20 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/main.cuh>
@@ -24,7 +24,7 @@ int main(int argc, char const *const *argv)
try
{
// If no args, substitute a new argv that prints the version
std::vector<const char*> alt_argv;
std::vector<const char *> alt_argv;
if (argc == 1)
{
alt_argv.push_back("--version");
@@ -36,7 +36,7 @@ try
NVBENCH_CUDA_CALL(cudaDeviceReset());
return 0;
}
catch (std::exception & e)
catch (std::exception &e)
{
std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";
return 1;

View File

@@ -19,13 +19,13 @@
#include <nvbench/axes_metadata.cuh>
#include <nvbench/detail/throw.cuh>
#include <fmt/format.h>
#include <fmt/ranges.h>
#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <fmt/format.h>
#include <fmt/ranges.h>
namespace nvbench
{

View File

@@ -16,7 +16,7 @@
* limitations under the License.
*/
#include "axis_base.cuh"
#include <nvbench/axis_base.cuh>
namespace nvbench
{

View File

@@ -18,9 +18,8 @@
#pragma once
#include <nvbench/benchmark_base.cuh>
#include <nvbench/axes_metadata.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/runner.cuh>
#include <nvbench/type_list.cuh>

View File

@@ -45,7 +45,7 @@ struct benchmark_manager
* benchmarks should be done here to avoid creating a CUDA context before we configure the CUDA
* environment in `main`.
*/
void initialize();
void initialize();
/**
* Register a new benchmark.

View File

@@ -17,9 +17,8 @@
*/
#include <nvbench/benchmark_manager.cuh>
#include <nvbench/device_manager.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/device_manager.cuh>
#include <fmt/format.h>
@@ -37,8 +36,8 @@ benchmark_manager &benchmark_manager::get()
void benchmark_manager::initialize()
{
const auto& mgr = device_manager::get();
for (auto& bench : m_benchmarks)
const auto &mgr = device_manager::get();
for (auto &bench : m_benchmarks)
{
if (!bench->get_is_cpu_only())
{

View File

@@ -17,12 +17,10 @@
*/
#include <nvbench/blocking_kernel.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_stream.cuh>
#include <nvbench/types.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/types.cuh>
#include <cuda/std/chrono>

View File

@@ -24,7 +24,6 @@
#include <nvbench/types.cuh>
#include <memory>
#include <unordered_map>
namespace nvbench
@@ -40,14 +39,14 @@ public:
/**
* @return The singleton criterion_manager instance.
*/
static criterion_manager& get();
static criterion_manager &get();
/**
* Register a new stopping criterion.
*/
nvbench::stopping_criterion_base& add(std::unique_ptr<nvbench::stopping_criterion_base> criterion);
nvbench::stopping_criterion_base& get_criterion(const std::string& name);
const nvbench::stopping_criterion_base& get_criterion(const std::string& name) const;
nvbench::stopping_criterion_base &add(std::unique_ptr<nvbench::stopping_criterion_base> criterion);
nvbench::stopping_criterion_base &get_criterion(const std::string &name);
const nvbench::stopping_criterion_base &get_criterion(const std::string &name) const;
using params_description = std::vector<std::pair<std::string, nvbench::named_values::type>>;
params_description get_params_description() const;

View File

@@ -41,7 +41,7 @@ criterion_manager &criterion_manager::get()
return registry;
}
stopping_criterion_base& criterion_manager::get_criterion(const std::string& name)
stopping_criterion_base &criterion_manager::get_criterion(const std::string &name)
{
auto iter = m_map.find(name);
if (iter == m_map.end())
@@ -51,7 +51,8 @@ stopping_criterion_base& criterion_manager::get_criterion(const std::string& nam
return *iter->second.get();
}
const nvbench::stopping_criterion_base& criterion_manager::get_criterion(const std::string& name) const
const nvbench::stopping_criterion_base &
criterion_manager::get_criterion(const std::string &name) const
{
auto iter = m_map.find(name);
if (iter == m_map.end())
@@ -69,8 +70,7 @@ stopping_criterion_base &criterion_manager::add(std::unique_ptr<stopping_criteri
if (!success)
{
NVBENCH_THROW(std::runtime_error,
"Stopping criterion \"{}\" is already registered.", name);
NVBENCH_THROW(std::runtime_error, "Stopping criterion \"{}\" is already registered.", name);
}
return *it->second.get();

View File

@@ -16,14 +16,12 @@
* limitations under the License.
*/
#include <nvbench/csv_printer.cuh>
#include <nvbench/axes_metadata.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/csv_printer.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/summary.cuh>
#include <nvbench/internal/table_builder.cuh>
#include <nvbench/summary.cuh>
#include <fmt/format.h>
@@ -169,7 +167,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
std::size_t remaining = table.m_columns.size();
for (const auto &col : table.m_columns)
{
fmt::format_to(std::back_inserter(buffer), "{}{}", col.rows[i], (--remaining == 0) ? "" : ",");
fmt::format_to(std::back_inserter(buffer),
"{}{}",
col.rows[i],
(--remaining == 0) ? "" : ",");
}
fmt::format_to(std::back_inserter(buffer), "\n");
}

View File

@@ -19,7 +19,6 @@
#pragma once
#include <nvbench/cuda_call.cuh>
#include <nvbench/types.cuh>
#include <cuda_runtime_api.h>

View File

@@ -17,7 +17,6 @@
*/
#include <nvbench/cupti_profiler.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/device_info.cuh>
@@ -54,7 +53,9 @@ void nvpw_call(const NVPA_Status status)
{
if (status != NVPA_STATUS_SUCCESS)
{
NVBENCH_THROW(std::runtime_error, "NVPW call returned error: {}", static_cast<std::underlying_type_t<NVPA_Status>>(status));
NVBENCH_THROW(std::runtime_error,
"NVPW call returned error: {}",
static_cast<std::underlying_type_t<NVPA_Status>>(status));
}
}

View File

@@ -18,9 +18,9 @@
#pragma once
#include <nvbench/types.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/detail/ring_buffer.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/types.cuh>
#include <vector>
@@ -38,7 +38,7 @@ class entropy_criterion final : public stopping_criterion_base
nvbench::detail::ring_buffer<nvbench::float64_t> m_entropy_tracker{299};
// Used to avoid re-allocating temporary memory
std::vector<nvbench::float64_t> m_probabilities;
std::vector<nvbench::float64_t> m_probabilities;
nvbench::float64_t compute_entropy();
@@ -49,7 +49,6 @@ protected:
virtual void do_initialize() override;
virtual void do_add_measurement(nvbench::float64_t measurement) override;
virtual bool do_is_finished() override;
};
} // namespace nvbench::detail

View File

@@ -21,7 +21,6 @@
#include <cmath>
namespace nvbench::detail
{
@@ -40,7 +39,7 @@ void entropy_criterion::do_initialize()
m_freq_tracker.clear();
}
nvbench::float64_t entropy_criterion::compute_entropy()
nvbench::float64_t entropy_criterion::compute_entropy()
{
const std::size_t n = m_freq_tracker.size();
if (n == 0)
@@ -70,15 +69,15 @@ void entropy_criterion::do_add_measurement(nvbench::float64_t measurement)
m_total_cuda_time += measurement;
{
auto key = measurement;
auto key = measurement;
constexpr bool bin_keys = false;
if (bin_keys)
if (bin_keys)
{
const auto resolution_us = 0.5;
const auto resulution_s = resolution_us / 1'000'000;
const auto epsilon = resulution_s * 2;
key = std::round(key / epsilon) * epsilon;
const auto resulution_s = resolution_us / 1 '000' 000;
const auto epsilon = resulution_s * 2;
key = std::round(key / epsilon) * epsilon;
}
// This approach is about 3x faster than `std::{unordered_,}map`
@@ -120,7 +119,7 @@ bool entropy_criterion::do_is_finished()
const auto [slope, intercept] = statistics::compute_linear_regression(begin, end, mean);
if (statistics::slope2deg(slope) > m_params.get_float64("max-angle"))
if (statistics::slope2deg(slope) > m_params.get_float64("max-angle"))
{
return false;
}

View File

@@ -25,13 +25,13 @@
#include <nvbench/state.cuh>
#include <nvbench/summary.cuh>
#include <fmt/format.h>
#include <algorithm>
#include <chrono>
#include <limits>
#include <thread>
#include <fmt/format.h>
namespace nvbench::detail
{

View File

@@ -18,8 +18,6 @@
#pragma once
#include <cuda_runtime.h>
#include <nvbench/blocking_kernel.cuh>
#include <nvbench/cpu_timer.cuh>
#include <nvbench/cuda_call.cuh>
@@ -32,12 +30,13 @@
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/types.cuh>
#include <cuda_runtime.h>
#include <utility>
#include <vector>
#include "nvbench/types.cuh"
namespace nvbench
{

View File

@@ -19,13 +19,12 @@
#pragma once
#include <nvbench/cpu_timer.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/statistics.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/statistics.cuh>
#include <utility>
#include <vector>
@@ -66,7 +65,7 @@ protected:
nvbench::cpu_timer m_walltime_timer;
nvbench::criterion_params m_criterion_params;
nvbench::stopping_criterion_base& m_stopping_criterion;
nvbench::stopping_criterion_base &m_stopping_criterion;
bool m_run_once{false};

View File

@@ -24,11 +24,11 @@
#include <nvbench/state.cuh>
#include <nvbench/summary.cuh>
#include <fmt/format.h>
#include <algorithm>
#include <limits>
#include <fmt/format.h>
namespace nvbench::detail
{
@@ -36,7 +36,8 @@ measure_cpu_only_base::measure_cpu_only_base(state &exec_state)
: m_state{exec_state}
, m_launch(m_state.get_cuda_stream())
, m_criterion_params{exec_state.get_criterion_params()}
, m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
, m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(
exec_state.get_stopping_criterion())}
, m_run_once{exec_state.get_run_once()}
, m_min_samples{exec_state.get_min_samples()}
, m_skip_time{exec_state.get_skip_time()}
@@ -72,7 +73,7 @@ void measure_cpu_only_base::run_trials_prologue() { m_walltime_timer.start(); }
void measure_cpu_only_base::record_measurements()
{
// Update and record timers and counters:
const auto cur_cpu_time = m_cpu_timer.get_duration();
const auto cur_cpu_time = m_cpu_timer.get_duration();
m_min_cpu_time = std::min(m_min_cpu_time, cur_cpu_time);
m_max_cpu_time = std::max(m_max_cpu_time, cur_cpu_time);
@@ -188,8 +189,7 @@ void measure_cpu_only_base::generate_summaries()
auto &summ = m_state.add_summary("nv/cpu_only/bw/global/bytes_per_second");
summ.set_string("name", "GlobalMem BW");
summ.set_string("hint", "byte_rate");
summ.set_string("description",
"Number of bytes read/written per second.");
summ.set_string("description", "Number of bytes read/written per second.");
summ.set_float64("value", avg_used_gmem_bw);
}
} // bandwidth
@@ -210,9 +210,9 @@ void measure_cpu_only_base::generate_summaries()
if (m_max_time_exceeded)
{
const auto timeout = m_walltime_timer.get_duration();
const auto timeout = m_walltime_timer.get_duration();
const auto max_noise = m_criterion_params.get_float64("max-noise");
const auto min_time = m_criterion_params.get_float64("min-time");
const auto min_time = m_criterion_params.get_float64("min-time");
if (cpu_noise > max_noise)
{

View File

@@ -24,13 +24,12 @@
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_timer.cuh>
#include <nvbench/cupti_profiler.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
#include <nvbench/detail/l2flush.cuh>
#include <nvbench/detail/statistics.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/exec_tag.cuh>
#include <nvbench/launch.cuh>
#include <cuda_runtime.h>

View File

@@ -16,9 +16,8 @@
* limitations under the License.
*/
#include <nvbench/detail/measure_hot.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/detail/measure_hot.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/printer_base.cuh>

View File

@@ -19,12 +19,11 @@
#pragma once
#include <nvbench/config.cuh>
#include <nvbench/detail/statistics.cuh>
#include <cassert>
#include <cstddef>
#include <iterator>
#include <cassert>
#include <vector>
namespace nvbench::detail
@@ -76,14 +75,14 @@ public:
return temp;
}
ring_buffer_iterator operator+(difference_type n) const
{
return ring_buffer_iterator(m_index + n, m_capacity, m_ptr);
ring_buffer_iterator operator+(difference_type n) const
{
return ring_buffer_iterator(m_index + n, m_capacity, m_ptr);
}
ring_buffer_iterator operator-(difference_type n) const
{
return ring_buffer_iterator(m_index - n, m_capacity, m_ptr);
ring_buffer_iterator operator-(difference_type n) const
{
return ring_buffer_iterator(m_index - n, m_capacity, m_ptr);
}
difference_type operator-(const ring_buffer_iterator &other) const
@@ -121,13 +120,9 @@ private:
std::size_t m_index{0};
bool m_full{false};
std::size_t get_front_index() const
{
return m_full ? m_index : 0;
}
std::size_t get_front_index() const { return m_full ? m_index : 0; }
public:
/**
* Create a new ring buffer with the requested capacity.
*/

View File

@@ -16,15 +16,13 @@
* limitations under the License.
*/
#include <nvbench/detail/state_generator.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/detail/state_generator.cuh>
#include <nvbench/detail/transform_reduce.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/named_values.cuh>
#include <nvbench/type_axis.cuh>
#include <nvbench/detail/transform_reduce.cuh>
#include <algorithm>
#include <cassert>
#include <functional>
@@ -165,7 +163,7 @@ void state_generator::build_axis_configs()
config.set_string(axis_info.axis, axis.get_input_string(axis_info.index));
}
} // type_si
} // type_axis_config generation
} // type_axis_config generation
// non_type_axis_config generation
{
@@ -201,9 +199,9 @@ void state_generator::build_axis_configs()
axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
break;
} // switch (type)
} // for (axis_info : current_indices)
} // for non_type_sg configs
} // non_type_axis_config generation
} // for (axis_info : current_indices)
} // for non_type_sg configs
} // non_type_axis_config generation
}
void state_generator::build_states()

View File

@@ -26,12 +26,10 @@
#include <iterator>
#include <limits>
#include <numeric>
#include <cmath>
#include <type_traits>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#define M_PI 3.14159265358979323846
#endif
namespace nvbench::detail::statistics
@@ -154,7 +152,7 @@ nvbench::float64_t compute_r2(It first,
for (std::size_t i = 0; i < n; ++i, ++first)
{
const nvbench::float64_t y = *first;
const nvbench::float64_t y = *first;
const nvbench::float64_t y_pred = slope * static_cast<nvbench::float64_t>(i) + intercept;
ss_tot += (y - mean_y) * (y - mean_y);
@@ -179,19 +177,10 @@ compute_r2(It first, It last, nvbench::float64_t slope, nvbench::float64_t inter
return compute_r2(first, last, compute_mean(first, last), slope, intercept);
}
inline nvbench::float64_t rad2deg(nvbench::float64_t rad)
{
return rad * 180.0 / M_PI;
}
inline nvbench::float64_t rad2deg(nvbench::float64_t rad) { return rad * 180.0 / M_PI; }
inline nvbench::float64_t slope2rad(nvbench::float64_t slope)
{
return std::atan2(slope, 1.0);
}
inline nvbench::float64_t slope2rad(nvbench::float64_t slope) { return std::atan2(slope, 1.0); }
inline nvbench::float64_t slope2deg(nvbench::float64_t slope)
{
return rad2deg(slope2rad(slope));
}
inline nvbench::float64_t slope2deg(nvbench::float64_t slope) { return rad2deg(slope2rad(slope)); }
} // namespace nvbench::detail::statistics

View File

@@ -18,9 +18,9 @@
#pragma once
#include <nvbench/types.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/detail/ring_buffer.cuh>
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/types.cuh>
#include <vector>

View File

@@ -29,7 +29,7 @@ stdrel_criterion::stdrel_criterion()
void stdrel_criterion::do_initialize()
{
m_total_samples = 0;
m_total_samples = 0;
m_total_cuda_time = 0.0;
m_cuda_times.clear();
m_noise_tracker.clear();
@@ -46,7 +46,7 @@ void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement)
const auto cuda_stdev = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
m_cuda_times.cend(),
mean_cuda_time);
const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
if (std::isfinite(cuda_rel_stdev))
{
m_noise_tracker.push_back(cuda_rel_stdev);

View File

@@ -19,6 +19,7 @@
#pragma once
#include <fmt/format.h>
#include <stdexcept>
#define NVBENCH_THROW(exception_type, format_str, ...) \

View File

@@ -16,13 +16,13 @@
* limitations under the License.
*/
#include <cuda_runtime.h>
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_stream.cuh>
#include <nvbench/detail/timestamps_kernel.cuh>
#include <nvbench/types.cuh>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
@@ -71,12 +71,11 @@ void timestamps_kernel::record(const nvbench::cuda_stream &stream)
int num_sms = 0;
NVBENCH_CUDA_CALL(cudaGetDevice(&device_id));
NVBENCH_CUDA_CALL(
cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id));
NVBENCH_CUDA_CALL(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id));
get_timestamps_kernel<<<static_cast<unsigned int>(num_sms), 1, 0, stream.get_stream()>>>(
m_device_timestamps,
m_device_timestamps + 1);
}
} // namespace nvbench
} // namespace nvbench::detail

View File

@@ -82,7 +82,7 @@ struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Tail...>, TL,
using tail_prod = typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
using cur = typename detail::prepend_each<T, tail_prod>::type;
using next = typename detail::cartesian_product<
nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
using type = decltype(detail::concat(cur{}, next{}));
};

View File

@@ -16,11 +16,10 @@
* limitations under the License.
*/
#include <nvbench/device_info.cuh>
#include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/detail/device_scope.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/internal/nvml.cuh>
#include <cuda_runtime_api.h>

View File

@@ -18,17 +18,16 @@
#pragma once
#include <cuda_runtime_api.h>
#include <nvbench/config.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/detail/device_scope.cuh>
#include <cuda_runtime_api.h>
#include <cstdint> // CHAR_BIT
#include <stdexcept>
#include <utility>
#include <string_view>
#include <utility>
// forward declare this for internal storage
struct nvmlDevice_st;

View File

@@ -16,13 +16,12 @@
* limitations under the License.
*/
#include <nvbench/device_manager.cuh>
#include <cuda_runtime_api.h>
#include <nvbench/cuda_call.cuh>
#include <nvbench/detail/device_scope.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/device_manager.cuh>
#include <cuda_runtime_api.h>
namespace nvbench
{
@@ -45,13 +44,13 @@ device_manager::device_manager()
}
}
const nvbench::device_info &device_manager::get_device(int id)
{
if (id < 0)
const nvbench::device_info &device_manager::get_device(int id)
{
if (id < 0)
{
NVBENCH_THROW(std::runtime_error, "Negative index: {}.", id);
}
return m_devices.at(static_cast<std::size_t>(id));
return m_devices.at(static_cast<std::size_t>(id));
}
} // namespace nvbench

View File

@@ -101,10 +101,10 @@ using no_gpu_t = tag<nvbench::detail::exec_flag::no_gpu>;
using no_batch_t = tag<nvbench::detail::exec_flag::no_batch>;
using modifier_mask_t = tag<nvbench::detail::exec_flag::modifier_mask>;
using hot_t = tag<nvbench::detail::exec_flag::hot>;
using cold_t = tag<nvbench::detail::exec_flag::cold>;
using cpu_only_t = tag<nvbench::detail::exec_flag::cpu_only>;
using measure_mask_t = tag<nvbench::detail::exec_flag::measure_mask>;
using hot_t = tag<nvbench::detail::exec_flag::hot>;
using cold_t = tag<nvbench::detail::exec_flag::cold>;
using cpu_only_t = tag<nvbench::detail::exec_flag::cpu_only>;
using measure_mask_t = tag<nvbench::detail::exec_flag::measure_mask>;
constexpr inline none_t none;
constexpr inline timer_t timer;

View File

@@ -19,7 +19,6 @@
#pragma once
#include <nvbench/axis_base.cuh>
#include <nvbench/types.cuh>
#include <vector>
@@ -40,7 +39,10 @@ struct float64_axis final : public axis_base
[[nodiscard]] nvbench::float64_t get_value(std::size_t i) const { return m_values[i]; }
private:
std::unique_ptr<axis_base> do_clone() const final { return std::make_unique<float64_axis>(*this); }
std::unique_ptr<axis_base> do_clone() const final
{
return std::make_unique<float64_axis>(*this);
}
std::size_t do_get_size() const final { return m_values.size(); }
std::string do_get_input_string(std::size_t i) const final;
std::string do_get_description(std::size_t i) const final;

View File

@@ -19,7 +19,6 @@
#pragma once
#include <nvbench/axis_base.cuh>
#include <nvbench/flags.cuh>
#include <nvbench/types.cuh>

View File

@@ -16,9 +16,8 @@
* limitations under the License.
*/
#include <nvbench/int64_axis.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/int64_axis.cuh>
#include <fmt/format.h>

View File

@@ -19,7 +19,6 @@
#pragma once
#include <nvbench/detail/transform_reduce.cuh>
#include <nvbench/internal/table_builder.cuh>
#include <fmt/color.h>

View File

@@ -21,12 +21,12 @@
#include <nvbench/config.cuh>
#include <nvbench/detail/throw.cuh>
#include <fmt/format.h>
#ifdef NVBENCH_HAS_NVML
#include <nvml.h>
#endif // NVBENCH_HAS_NVML
#include <fmt/format.h>
#include <stdexcept>
namespace nvbench::nvml
@@ -38,6 +38,7 @@ struct NVMLLifetimeManager
{
NVMLLifetimeManager();
~NVMLLifetimeManager();
private:
bool m_inited{false};
};

View File

@@ -16,24 +16,22 @@
* limitations under the License.
*/
#include <nvbench/json_printer.cuh>
#include <nvbench/axes_metadata.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/config.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/device_info.cuh>
#include <nvbench/device_manager.cuh>
#include <nvbench/git_revision.cuh>
#include <nvbench/json_printer.cuh>
#include <nvbench/state.cuh>
#include <nvbench/summary.cuh>
#include <nvbench/version.cuh>
#include <nvbench/detail/throw.cuh>
#include <nlohmann/json.hpp>
#include <fmt/format.h>
#include <nlohmann/json.hpp>
#include <cstdint>
#include <fstream>
#include <iterator>
@@ -105,7 +103,7 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values)
default:
NVBENCH_THROW(std::runtime_error, "{}", "Unrecognized value type.");
} // end switch (value type)
} // end foreach value name
} // end foreach value name
}
} // end namespace
@@ -225,27 +223,26 @@ static void add_devices_section(nlohmann::ordered_json &root)
auto &devices = root["devices"];
for (const auto &dev_info : nvbench::device_manager::get().get_devices())
{
auto &device = devices.emplace_back();
device["id"] = dev_info.get_id();
device["name"] = dev_info.get_name();
device["sm_version"] = dev_info.get_sm_version();
device["ptx_version"] = dev_info.get_ptx_version();
device["sm_default_clock_rate"] = dev_info.get_sm_default_clock_rate();
device["number_of_sms"] = dev_info.get_number_of_sms();
device["max_blocks_per_sm"] = dev_info.get_max_blocks_per_sm();
device["max_threads_per_sm"] = dev_info.get_max_threads_per_sm();
device["max_threads_per_block"] = dev_info.get_max_threads_per_block();
device["registers_per_sm"] = dev_info.get_registers_per_sm();
device["registers_per_block"] = dev_info.get_registers_per_block();
device["global_memory_size"] = dev_info.get_global_memory_size();
device["global_memory_bus_peak_clock_rate"] =
dev_info.get_global_memory_bus_peak_clock_rate();
device["global_memory_bus_width"] = dev_info.get_global_memory_bus_width();
device["global_memory_bus_bandwidth"] = dev_info.get_global_memory_bus_bandwidth();
device["l2_cache_size"] = dev_info.get_l2_cache_size();
device["shared_memory_per_sm"] = dev_info.get_shared_memory_per_sm();
device["shared_memory_per_block"] = dev_info.get_shared_memory_per_block();
device["ecc_state"] = dev_info.get_ecc_state();
auto &device = devices.emplace_back();
device["id"] = dev_info.get_id();
device["name"] = dev_info.get_name();
device["sm_version"] = dev_info.get_sm_version();
device["ptx_version"] = dev_info.get_ptx_version();
device["sm_default_clock_rate"] = dev_info.get_sm_default_clock_rate();
device["number_of_sms"] = dev_info.get_number_of_sms();
device["max_blocks_per_sm"] = dev_info.get_max_blocks_per_sm();
device["max_threads_per_sm"] = dev_info.get_max_threads_per_sm();
device["max_threads_per_block"] = dev_info.get_max_threads_per_block();
device["registers_per_sm"] = dev_info.get_registers_per_sm();
device["registers_per_block"] = dev_info.get_registers_per_block();
device["global_memory_size"] = dev_info.get_global_memory_size();
device["global_memory_bus_peak_clock_rate"] = dev_info.get_global_memory_bus_peak_clock_rate();
device["global_memory_bus_width"] = dev_info.get_global_memory_bus_width();
device["global_memory_bus_bandwidth"] = dev_info.get_global_memory_bus_bandwidth();
device["l2_cache_size"] = dev_info.get_l2_cache_size();
device["shared_memory_per_sm"] = dev_info.get_shared_memory_per_sm();
device["shared_memory_per_block"] = dev_info.get_shared_memory_per_block();
device["ecc_state"] = dev_info.get_ecc_state();
}
}
@@ -298,8 +295,8 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
false;
#endif
} // "nvbench"
} // "version"
} // "meta"
} // "version"
} // "meta"
add_devices_section(root);
@@ -362,8 +359,8 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
default:
break;
} // end switch (axis type)
} // end foreach axis value
} // end foreach axis
} // end foreach axis value
} // end foreach axis
auto &states = bench["states"];
for (const auto &exec_state : bench_ptr->get_states())
@@ -431,8 +428,8 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
continue;
}
} // end foreach exec_state
} // end foreach benchmark
} // "benchmarks"
} // end foreach benchmark
} // "benchmarks"
m_ostream << root.dump(2) << "\n";
}
@@ -492,7 +489,7 @@ void json_printer::do_print_benchmark_list(const benchmark_vector &benches)
default:
break;
} // end switch (axis type)
} // end foreach axis value
} // end foreach axis value
}
} // end foreach bench

View File

@@ -19,7 +19,6 @@
#pragma once
#include <nvbench/printer_base.cuh>
#include <nvbench/types.cuh>
#include <string>

View File

@@ -16,15 +16,13 @@
* limitations under the License.
*/
#include <nvbench/markdown_printer.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/device_manager.cuh>
#include <nvbench/internal/markdown_table.cuh>
#include <nvbench/markdown_printer.cuh>
#include <nvbench/state.cuh>
#include <nvbench/summary.cuh>
#include <nvbench/internal/markdown_table.cuh>
#include <fmt/color.h>
#include <fmt/format.h>
@@ -72,8 +70,12 @@ void markdown_printer::do_print_device_info()
"* Max Shared Memory: {} KiB/SM, {} KiB/Block\n",
device.get_shared_memory_per_sm() / 1024,
device.get_shared_memory_per_block() / 1024);
fmt::format_to(std::back_inserter(buffer), "* L2 Cache Size: {} KiB\n", device.get_l2_cache_size() / 1024);
fmt::format_to(std::back_inserter(buffer), "* Maximum Active Blocks: {}/SM\n", device.get_max_blocks_per_sm());
fmt::format_to(std::back_inserter(buffer),
"* L2 Cache Size: {} KiB\n",
device.get_l2_cache_size() / 1024);
fmt::format_to(std::back_inserter(buffer),
"* Maximum Active Blocks: {}/SM\n",
device.get_max_blocks_per_sm());
fmt::format_to(std::back_inserter(buffer),
"* Maximum Active Threads: {}/SM, {}/Block\n",
device.get_max_threads_per_sm(),
@@ -82,7 +84,9 @@ void markdown_printer::do_print_device_info()
"* Available Registers: {}/SM, {}/Block\n",
device.get_registers_per_sm(),
device.get_registers_per_block());
fmt::format_to(std::back_inserter(buffer), "* ECC Enabled: {}\n", device.get_ecc_state() ? "Yes" : "No");
fmt::format_to(std::back_inserter(buffer),
"* ECC Enabled: {}\n",
device.get_ecc_state() ? "Yes" : "No");
fmt::format_to(std::back_inserter(buffer), "\n");
}
m_ostream << fmt::to_string(buffer);
@@ -191,9 +195,12 @@ void markdown_printer::do_print_benchmark_list(const printer_base::benchmark_vec
{
desc = fmt::format(" ({})", desc);
}
fmt::format_to(std::back_inserter(buffer), " * `{}`{}\n", axis_ptr->get_input_string(i), desc);
fmt::format_to(std::back_inserter(buffer),
" * `{}`{}\n",
axis_ptr->get_input_string(i),
desc);
} // end foreach value
} // end foreach axis
} // end foreach axis
fmt::format_to(std::back_inserter(buffer), "\n");
} // end foreach bench

View File

@@ -16,10 +16,9 @@
* limitations under the License.
*/
#include <nvbench/named_values.cuh>
#include <nvbench/config.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/named_values.cuh>
#include <fmt/format.h>

View File

@@ -24,8 +24,8 @@
#include <nvbench/callable.cuh>
#include <nvbench/config.cuh>
#include <nvbench/cpu_timer.cuh>
#include <nvbench/criterion_manager.cuh>
#include <nvbench/create.cuh>
#include <nvbench/criterion_manager.cuh>
#include <nvbench/cuda_call.cuh>
#include <nvbench/cuda_stream.cuh>
#include <nvbench/cuda_timer.cuh>

View File

@@ -34,6 +34,8 @@
#include <nvbench/internal/cli_help.cuh>
#include <nvbench/internal/cli_help_axis.cuh>
#include <fmt/format.h>
#include <algorithm>
#include <cassert>
#include <cstdlib>
@@ -44,12 +46,10 @@
#include <regex>
#include <stdexcept>
#include <string>
#include <string_view>
#include <tuple>
#include <vector>
#include <fmt/format.h>
#include <string_view>
namespace
{

View File

@@ -191,9 +191,9 @@ protected:
virtual void do_process_bulk_data_float64(nvbench::state &,
const std::string &,
const std::string &,
const std::vector<nvbench::float64_t> &){};
const std::vector<nvbench::float64_t> &) {};
virtual void do_print_benchmark_list(const benchmark_vector &)
virtual void do_print_benchmark_list(const benchmark_vector &)
{
throw std::runtime_error{"nvbench::do_print_benchmark_list is not supported by this printer."};
}

View File

@@ -19,7 +19,6 @@
#pragma once
#include <nvbench/benchmark_base.cuh>
#include <nvbench/detail/state_generator.cuh>
#include <stdexcept>

View File

@@ -16,10 +16,9 @@
* limitations under the License.
*/
#include <nvbench/runner.cuh>
#include <nvbench/benchmark_base.cuh>
#include <nvbench/printer_base.cuh>
#include <nvbench/runner.cuh>
#include <nvbench/state.cuh>
#include <fmt/format.h>

View File

@@ -20,13 +20,13 @@
#include <nvbench/state.cuh>
#include <nvbench/types.cuh>
#include <fmt/color.h>
#include <fmt/format.h>
#include <algorithm>
#include <stdexcept>
#include <string>
#include <fmt/color.h>
#include <fmt/format.h>
namespace nvbench
{

View File

@@ -21,19 +21,21 @@
#include <nvbench/named_values.cuh>
#include <nvbench/types.cuh>
#include <string>
#include <initializer_list>
#include <string>
#include <unordered_map>
namespace nvbench
{
namespace detail
namespace detail
{
constexpr nvbench::float64_t compat_min_time() { return 0.5; } // 0.5 seconds
constexpr nvbench::float64_t compat_max_noise() { return 0.005; } // 0.5% relative standard deviation
constexpr nvbench::float64_t compat_min_time() { return 0.5; } // 0.5 seconds
constexpr nvbench::float64_t compat_max_noise()
{
return 0.005;
} // 0.5% relative standard deviation
} // namespace detail
@@ -43,6 +45,7 @@ constexpr nvbench::float64_t compat_max_noise() { return 0.005; } // 0.5% relati
class criterion_params
{
nvbench::named_values m_named_values;
public:
criterion_params();
criterion_params(std::initializer_list<std::pair<std::string, nvbench::named_values::value_type>>);
@@ -96,7 +99,7 @@ public:
*
* This method is called once per benchmark run, before any measurements are provided.
*/
void initialize(const criterion_params &params)
void initialize(const criterion_params &params)
{
m_params.set_from(params);
this->do_initialize();
@@ -105,18 +108,12 @@ public:
/**
* Add the latest measurement to the criterion
*/
void add_measurement(nvbench::float64_t measurement)
{
this->do_add_measurement(measurement);
}
void add_measurement(nvbench::float64_t measurement) { this->do_add_measurement(measurement); }
/**
* Check if the criterion has been met for all measurements processed by `add_measurement`
*/
bool is_finished()
{
return this->do_is_finished();
}
bool is_finished() { return this->do_is_finished(); }
protected:
/**

View File

@@ -16,10 +16,8 @@
* limitations under the License.
*/
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/stopping_criterion.cuh>
namespace nvbench
{
@@ -62,7 +60,7 @@ void criterion_params::set_from(const criterion_params &other)
void criterion_params::set_int64(std::string name, nvbench::int64_t value)
{
if (m_named_values.has_value(name))
if (m_named_values.has_value(name))
{
m_named_values.remove_value(name);
}
@@ -72,7 +70,7 @@ void criterion_params::set_int64(std::string name, nvbench::int64_t value)
void criterion_params::set_float64(std::string name, nvbench::float64_t value)
{
if (m_named_values.has_value(name))
if (m_named_values.has_value(name))
{
m_named_values.remove_value(name);
}
@@ -82,7 +80,7 @@ void criterion_params::set_float64(std::string name, nvbench::float64_t value)
void criterion_params::set_string(std::string name, std::string value)
{
if (m_named_values.has_value(name))
if (m_named_values.has_value(name))
{
m_named_values.remove_value(name);
}
@@ -110,15 +108,11 @@ std::string criterion_params::get_string(const std::string &name) const
return m_named_values.get_string(name);
}
std::vector<std::string> criterion_params::get_names() const
{
return m_named_values.get_names();
}
std::vector<std::string> criterion_params::get_names() const { return m_named_values.get_names(); }
nvbench::named_values::type criterion_params::get_type(const std::string &name) const
{
return m_named_values.get_type(name);
}
} // namespace nvbench::detail
} // namespace nvbench

View File

@@ -19,7 +19,6 @@
#pragma once
#include <nvbench/axis_base.cuh>
#include <nvbench/types.cuh>
#include <vector>

View File

@@ -19,7 +19,6 @@
#pragma once
#include <nvbench/axis_base.cuh>
#include <nvbench/type_list.cuh>
#include <nvbench/type_strings.cuh>

View File

@@ -16,9 +16,8 @@
* limitations under the License.
*/
#include <nvbench/type_axis.cuh>
#include <nvbench/detail/throw.cuh>
#include <nvbench/type_axis.cuh>
#include <fmt/format.h>
#include <fmt/ranges.h>

View File

@@ -18,7 +18,7 @@
#pragma once
#include "detail/type_list_impl.cuh"
#include <nvbench/detail/type_list_impl.cuh>
#include <tuple>
#include <type_traits>

View File

@@ -27,11 +27,11 @@
#endif
#ifdef NVBENCH_CXXABI_DEMANGLE
#include <cxxabi.h>
#include <cstdlib>
#include <memory>
#include <cxxabi.h>
namespace
{
struct free_wrapper

View File

@@ -5,12 +5,11 @@ import math
import os
import sys
from colorama import Fore
import tabulate
from colorama import Fore
from nvbench_json import reader
# Parse version string into tuple, "x.y.z" -> (x, y, z)
def version_tuple(v):
return tuple(map(int, (v.split("."))))
@@ -139,15 +138,14 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
colalign.append("center")
for device_id in device_ids:
rows = []
plot_data = {'cmp': {}, 'ref': {}, 'cmp_noise': {}, 'ref_noise': {}}
plot_data = {"cmp": {}, "ref": {}, "cmp_noise": {}, "ref_noise": {}}
for cmp_state in cmp_states:
cmp_state_name = cmp_state["name"]
ref_state = next(filter(lambda st: st["name"] == cmp_state_name,
ref_states),
None)
ref_state = next(
filter(lambda st: st["name"] == cmp_state_name, ref_states), None
)
if not ref_state:
continue
@@ -158,9 +156,7 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
row = []
for axis_value in axis_values:
axis_value_name = axis_value["name"]
row.append(format_axis_value(axis_value_name,
axis_value,
axes))
row.append(format_axis_value(axis_value_name, axis_value, axes))
cmp_summaries = cmp_state["summaries"]
ref_summaries = ref_state["summaries"]
@@ -171,23 +167,37 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
def lookup_summary(summaries, tag):
return next(filter(lambda s: s["tag"] == tag, summaries), None)
cmp_time_summary = lookup_summary(cmp_summaries, "nv/cold/time/gpu/mean")
ref_time_summary = lookup_summary(ref_summaries, "nv/cold/time/gpu/mean")
cmp_noise_summary = lookup_summary(cmp_summaries, "nv/cold/time/gpu/stdev/relative")
ref_noise_summary = lookup_summary(ref_summaries, "nv/cold/time/gpu/stdev/relative")
cmp_time_summary = lookup_summary(
cmp_summaries, "nv/cold/time/gpu/mean"
)
ref_time_summary = lookup_summary(
ref_summaries, "nv/cold/time/gpu/mean"
)
cmp_noise_summary = lookup_summary(
cmp_summaries, "nv/cold/time/gpu/stdev/relative"
)
ref_noise_summary = lookup_summary(
ref_summaries, "nv/cold/time/gpu/stdev/relative"
)
# TODO: Use other timings, too. Maybe multiple rows, with a
# "Timing" column + values "CPU/GPU/Batch"?
if not all([cmp_time_summary,
ref_time_summary,
cmp_noise_summary,
ref_noise_summary]):
if not all(
[
cmp_time_summary,
ref_time_summary,
cmp_noise_summary,
ref_noise_summary,
]
):
continue
def extract_value(summary):
summary_data = summary["data"]
value_data = next(filter(lambda v: v["name"] == "value", summary_data))
assert(value_data["type"] == "float64")
value_data = next(
filter(lambda v: v["name"] == "value", summary_data)
)
assert value_data["type"] == "float64"
return value_data["value"]
cmp_time = extract_value(cmp_time_summary)
@@ -218,23 +228,27 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
if plot:
axis_name = []
axis_value = "--"
for aid in range(len(axis_values)):
for aid in range(len(axis_values)):
if axis_values[aid]["name"] != plot:
axis_name.append("{} = {}".format(axis_values[aid]["name"], axis_values[aid]["value"]))
axis_name.append(
"{} = {}".format(
axis_values[aid]["name"], axis_values[aid]["value"]
)
)
else:
axis_value = float(axis_values[aid]["value"])
axis_name = ', '.join(axis_name)
axis_value = float(axis_values[aid]["value"])
axis_name = ", ".join(axis_name)
if axis_name not in plot_data['cmp']:
plot_data['cmp'][axis_name] = {}
plot_data['ref'][axis_name] = {}
plot_data['cmp_noise'][axis_name] = {}
plot_data['ref_noise'][axis_name] = {}
if axis_name not in plot_data["cmp"]:
plot_data["cmp"][axis_name] = {}
plot_data["ref"][axis_name] = {}
plot_data["cmp_noise"][axis_name] = {}
plot_data["ref_noise"][axis_name] = {}
plot_data['cmp'][axis_name][axis_value] = cmp_time
plot_data['ref'][axis_name][axis_value] = ref_time
plot_data['cmp_noise'][axis_name][axis_value] = cmp_noise
plot_data['ref_noise'][axis_name][axis_value] = ref_noise
plot_data["cmp"][axis_name][axis_value] = cmp_time
plot_data["ref"][axis_name][axis_value] = ref_time
plot_data["cmp_noise"][axis_name][axis_value] = cmp_noise
plot_data["ref_noise"][axis_name][axis_value] = ref_noise
global config_count
global unknown_count
@@ -273,14 +287,13 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
print("## [%d] %s\n" % (device["id"], device["name"]))
# colalign and github format require tabulate 0.8.3
if tabulate_version >= (0, 8, 3):
print(tabulate.tabulate(rows,
headers=headers,
colalign=colalign,
tablefmt="github"))
print(
tabulate.tabulate(
rows, headers=headers, colalign=colalign, tablefmt="github"
)
)
else:
print(tabulate.tabulate(rows,
headers=headers,
tablefmt="markdown"))
print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
print("")
@@ -295,18 +308,17 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
x = [float(x) for x in plot_data[key][axis].keys()]
y = list(plot_data[key][axis].values())
noise = list(plot_data[key + '_noise'][axis].values())
noise = list(plot_data[key + "_noise"][axis].values())
top = [y[i] + y[i] * noise[i] for i in range(len(x))]
bottom = [y[i] - y[i] * noise[i] for i in range(len(x))]
p = plt.plot(x, y, shape, marker='o', label=label)
p = plt.plot(x, y, shape, marker="o", label=label)
plt.fill_between(x, bottom, top, color=p[0].get_color(), alpha=0.1)
for axis in plot_data['cmp'].keys():
plot_line('cmp', '-', axis)
plot_line('ref', '--', axis + ' ref')
for axis in plot_data["cmp"].keys():
plot_line("cmp", "-", axis)
plot_line("ref", "--", axis + " ref")
plt.legend()
plt.show()
@@ -314,11 +326,17 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
def main():
help_text = "%(prog)s [reference.json compare.json | reference_dir/ compare_dir/]"
parser = argparse.ArgumentParser(prog='nvbench_compare', usage=help_text)
parser.add_argument('--threshold-diff', type=float, dest='threshold', default=0.0,
help='only show benchmarks where percentage diff is >= THRESHOLD')
parser.add_argument('--plot-along', type=str, dest='plot', default=None,
help='plot results')
parser = argparse.ArgumentParser(prog="nvbench_compare", usage=help_text)
parser.add_argument(
"--threshold-diff",
type=float,
dest="threshold",
default=0.0,
help="only show benchmarks where percentage diff is >= THRESHOLD",
)
parser.add_argument(
"--plot-along", type=str, dest="plot", default=None, help="plot results"
)
args, files_or_dirs = parser.parse_known_args()
print(files_or_dirs)
@@ -336,14 +354,17 @@ def main():
continue
r = os.path.join(files_or_dirs[0], f)
c = os.path.join(files_or_dirs[1], f)
if os.path.isfile(r) and os.path.isfile(c) and \
os.path.getsize(r) > 0 and os.path.getsize(c) > 0:
if (
os.path.isfile(r)
and os.path.isfile(c)
and os.path.getsize(r) > 0
and os.path.getsize(c) > 0
):
to_compare.append((r, c))
else:
to_compare = [(files_or_dirs[0], files_or_dirs[1])]
for ref, comp in to_compare:
ref_root = reader.read_file(ref)
cmp_root = reader.read_file(comp)
@@ -355,7 +376,9 @@ def main():
print("Device sections do not match.")
sys.exit(1)
compare_benches(ref_root["benchmarks"], cmp_root["benchmarks"], args.threshold, args.plot)
compare_benches(
ref_root["benchmarks"], cmp_root["benchmarks"], args.threshold, args.plot
)
print("# Summary\n")
print("- Total Matches: %d" % config_count)
@@ -365,5 +388,5 @@ def main():
return failure_count
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@@ -1,19 +1,19 @@
#!/usr/bin/env python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import argparse
import os
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nvbench_json import reader
def parse_files():
help_text = "%(prog)s [nvbench.out.json | dir/] ..."
parser = argparse.ArgumentParser(prog='nvbench_histogram', usage=help_text)
parser = argparse.ArgumentParser(prog="nvbench_histogram", usage=help_text)
args, files_or_dirs = parser.parse_known_args()
@@ -41,14 +41,14 @@ def parse_files():
def extract_filename(summary):
summary_data = summary["data"]
value_data = next(filter(lambda v: v["name"] == "filename", summary_data))
assert(value_data["type"] == "string")
assert value_data["type"] == "string"
return value_data["value"]
def extract_size(summary):
summary_data = summary["data"]
value_data = next(filter(lambda v: v["name"] == "size", summary_data))
assert(value_data["type"] == "int64")
assert value_data["type"] == "int64"
return int(value_data["value"])
@@ -57,9 +57,10 @@ def parse_samples_meta(filename, state):
if not summaries:
return None, None
summary = next(filter(lambda s: s["tag"] == "nv/json/bin:nv/cold/sample_times",
summaries),
None)
summary = next(
filter(lambda s: s["tag"] == "nv/json/bin:nv/cold/sample_times", summaries),
None,
)
if not summary:
return None, None
@@ -81,7 +82,7 @@ def parse_samples(filename, state):
with open(samples_filename, "rb") as f:
samples = np.fromfile(f, "<f4")
assert (sample_count == len(samples))
assert sample_count == len(samples)
return samples
@@ -118,5 +119,5 @@ def main():
plt.show()
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@@ -1,2 +1,3 @@
from . import reader
from . import version
from . import reader, version
__all__ = ["reader", "version"]

View File

@@ -1,8 +1,8 @@
file_version = (1, 0, 0)
file_version_string = "{}.{}.{}".format(file_version[0],
file_version[1],
file_version[2])
file_version_string = "{}.{}.{}".format(
file_version[0], file_version[1], file_version[2]
)
def check_file_version(filename, root_node):
@@ -19,8 +19,14 @@ def check_file_version(filename, root_node):
# for now just warn on mismatch.
if version_node["string"] != file_version_string:
print("WARNING:")
print(" {} was written using a different NVBench JSON file version."
.format(filename))
print(
" {} was written using a different NVBench JSON file version.".format(
filename
)
)
print(" It may not read correctly.")
print(" (file version: {} reader version: {})"
.format(version_node["string"], file_version_string))
print(
" (file version: {} reader version: {})".format(
version_node["string"], file_version_string
)
)

View File

@@ -5,9 +5,8 @@ import math
import os
import sys
from nvbench_json import reader
import tabulate
from nvbench_json import reader
# Parse version string into tuple, "x.y.z" -> (x, y, z)
@@ -39,7 +38,8 @@ def format_walltime(seconds_in):
"{:0>2d}:".format(h) if h > 1e-9 else "",
"{:0>2d}:".format(m) if (h > 1e-9 or m > 1e-9) else "",
"{:0>2d}.".format(s) if (h > 1e-9 or m > 1e-9) else "{:d}.".format(s),
"{:0>3d}".format(ms))
"{:0>3d}".format(ms),
)
def format_percentage(percentage):
@@ -58,7 +58,7 @@ measure_column_names = {"cold": "Isolated", "batch": "Batch", "cupti": "CUPTI"}
def init_measures():
out = {}
for name in measure_names:
out[name] = 0.
out[name] = 0.0
return out
@@ -67,17 +67,17 @@ def get_measures(state):
times = {}
for name in measure_names:
measure_walltime_tag = "nv/{}/walltime".format(name)
summary = next(filter(lambda s: s["tag"] == measure_walltime_tag,
summaries),
None)
summary = next(
filter(lambda s: s["tag"] == measure_walltime_tag, summaries), None
)
if not summary:
continue
walltime_data = next(filter(lambda d: d["name"] == "value", summary["data"]))
assert(walltime_data["type"] == "float64")
assert walltime_data["type"] == "float64"
walltime = walltime_data["value"]
walltime = float(walltime)
times[name] = walltime if walltime else 0.
times[name] = walltime if walltime else 0.0
return times
@@ -87,7 +87,7 @@ def merge_measures(target, src):
def sum_measures(measures):
total_time = 0.
total_time = 0.0
for time in measures.values():
total_time += time
return total_time
@@ -194,20 +194,21 @@ def print_overview_section(data):
# colalign and github format require tabulate 0.8.3
if tabulate_version >= (0, 8, 3):
print(tabulate.tabulate(rows,
headers=headers,
colalign=colalign,
tablefmt="github"))
print(
tabulate.tabulate(
rows, headers=headers, colalign=colalign, tablefmt="github"
)
)
else:
print(tabulate.tabulate(rows,
headers=headers,
tablefmt="markdown"))
print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
print()
# append_data_row_lambda args: (row_list, name, items[name])
def print_measures_table(headers, colalign, items, total_measures, append_item_row_lambda):
def print_measures_table(
headers, colalign, items, total_measures, append_item_row_lambda
):
total_time = sum_measures(total_measures)
active_measures = get_active_measure_names(total_measures)
num_user_columns = len(headers)
@@ -248,14 +249,13 @@ def print_measures_table(headers, colalign, items, total_measures, append_item_r
# colalign and github format require tabulate 0.8.3
if tabulate_version >= (0, 8, 3):
print(tabulate.tabulate(rows,
headers=headers,
colalign=colalign,
tablefmt="github"))
print(
tabulate.tabulate(
rows, headers=headers, colalign=colalign, tablefmt="github"
)
)
else:
print(tabulate.tabulate(rows,
headers=headers,
tablefmt="markdown"))
print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
def print_files_section(data):
@@ -319,7 +319,7 @@ def print_bench_section(bench_name, bench):
def main():
help_text = "%(prog)s [nvbench.out.json | dir/]..."
parser = argparse.ArgumentParser(prog='nvbench_walltime', usage=help_text)
parser = argparse.ArgumentParser(prog="nvbench_walltime", usage=help_text)
args, files_or_dirs = parser.parse_known_args()
@@ -353,5 +353,5 @@ def main():
print_files_section(data)
if __name__ == '__main__':
if __name__ == "__main__":
sys.exit(main())

View File

@@ -17,22 +17,19 @@
*/
#include <nvbench/axes_metadata.cuh>
#include <nvbench/type_list.cuh>
#include <nvbench/type_strings.cuh>
#include <nvbench/types.cuh>
#include "test_asserts.cuh"
#include <fmt/format.h>
#include <algorithm>
#include <string_view>
using int_list = nvbench::type_list<nvbench::int8_t,
nvbench::int16_t,
nvbench::int32_t,
nvbench::int64_t>;
#include "test_asserts.cuh"
using int_list =
nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
using float_list = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
@@ -110,7 +107,6 @@ void test_default_type_axes_names()
ASSERT(axes.get_type_axis(4).get_name() == "T4");
ASSERT(axes.get_type_axis(4).get_axis_index() == 4);
}
}
void test_type_axes()
@@ -138,8 +134,7 @@ void test_type_axes()
fmt::format_to(std::back_inserter(buffer),
" - {}{}\n",
input_string,
description.empty() ? ""
: fmt::format(" ({})", description));
description.empty() ? "" : fmt::format(" ({})", description));
}
}
@@ -157,9 +152,8 @@ Axis: Other
)expected";
const std::string test = fmt::to_string(buffer);
const auto diff =
std::mismatch(ref.cbegin(), ref.cend(), test.cbegin(), test.cend());
const auto idx = static_cast<std::size_t>(diff.second - test.cbegin());
const auto diff = std::mismatch(ref.cbegin(), ref.cend(), test.cbegin(), test.cend());
const auto idx = static_cast<std::size_t>(diff.second - test.cbegin());
ASSERT_MSG(test == ref,
"Differs at character {}.\n"
"Expected:\n\"{}\"\n\n"
@@ -189,9 +183,7 @@ void test_float64_axes()
void test_int64_axes()
{
nvbench::axes_metadata axes;
axes.add_int64_axis("I64 Axis",
{10, 11, 12, 13, 14},
nvbench::int64_axis_flags::none);
axes.add_int64_axis("I64 Axis", {10, 11, 12, 13, 14}, nvbench::int64_axis_flags::none);
ASSERT(axes.get_axes().size() == 1);
const auto &axis = axes.get_int64_axis("I64 Axis");
ASSERT(axis.get_size() == 5);
@@ -205,9 +197,7 @@ void test_int64_axes()
void test_int64_power_of_two_axes()
{
nvbench::axes_metadata axes;
axes.add_int64_axis("I64 POT Axis",
{1, 2, 3, 4, 5},
nvbench::int64_axis_flags::power_of_two);
axes.add_int64_axis("I64 POT Axis", {1, 2, 3, 4, 5}, nvbench::int64_axis_flags::power_of_two);
ASSERT(axes.get_axes().size() == 1);
const auto &axis = axes.get_int64_axis("I64 POT Axis");
ASSERT(axis.get_size() == 5);

View File

@@ -17,7 +17,6 @@
*/
#include <nvbench/benchmark.cuh>
#include <nvbench/callable.cuh>
#include <nvbench/named_values.cuh>
#include <nvbench/state.cuh>
@@ -25,8 +24,6 @@
#include <nvbench/type_strings.cuh>
#include <nvbench/types.cuh>
#include "test_asserts.cuh"
#include <fmt/format.h>
#include <algorithm>
@@ -34,6 +31,8 @@
#include <variant>
#include <vector>
#include "test_asserts.cuh"
template <typename T>
std::vector<T> sort(std::vector<T> &&vec)
{
@@ -61,34 +60,26 @@ void no_op_generator(nvbench::state &state)
NVBENCH_DEFINE_CALLABLE(no_op_generator, no_op_callable);
template <typename Integer, typename Float, typename Other>
void template_no_op_generator(nvbench::state &state,
nvbench::type_list<Integer, Float, Other>)
void template_no_op_generator(nvbench::state &state, nvbench::type_list<Integer, Float, Other>)
{
ASSERT(nvbench::type_strings<Integer>::input_string() ==
state.get_string("Integer"));
ASSERT(nvbench::type_strings<Float>::input_string() ==
state.get_string("Float"));
ASSERT(nvbench::type_strings<Other>::input_string() ==
state.get_string("Other"));
ASSERT(nvbench::type_strings<Integer>::input_string() == state.get_string("Integer"));
ASSERT(nvbench::type_strings<Float>::input_string() == state.get_string("Float"));
ASSERT(nvbench::type_strings<Other>::input_string() == state.get_string("Other"));
// Enum params using non-templated version:
no_op_generator(state);
}
NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator,
template_no_op_callable);
NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator, template_no_op_callable);
using int_list = nvbench::type_list<nvbench::int8_t,
nvbench::int16_t,
nvbench::int32_t,
nvbench::int64_t>;
using int_list =
nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
using float_list = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
using misc_list = nvbench::type_list<bool, void>;
using lots_of_types_bench =
nvbench::benchmark<template_no_op_callable,
nvbench::type_list<int_list, float_list, misc_list>>;
nvbench::benchmark<template_no_op_callable, nvbench::type_list<int_list, float_list, misc_list>>;
using no_types_bench = nvbench::benchmark<no_op_callable>;
@@ -110,8 +101,7 @@ void test_type_axes()
fmt::format_to(std::back_inserter(buffer),
" - {}{}\n",
input_string,
description.empty() ? ""
: fmt::format(" ({})", description));
description.empty() ? "" : fmt::format(" ({})", description));
}
}
@@ -300,9 +290,7 @@ void test_get_config_count()
auto const num_devices = bench.get_devices().size();
ASSERT_MSG(bench.get_config_count() == 72 * num_devices,
"Got {}",
bench.get_config_count());
ASSERT_MSG(bench.get_config_count() == 72 * num_devices, "Got {}", bench.get_config_count());
}
int main()

View File

@@ -18,11 +18,11 @@
#include <nvbench/cpu_timer.cuh>
#include "test_asserts.cuh"
#include <chrono>
#include <thread>
#include "test_asserts.cuh"
void test_basic()
{
using namespace std::literals::chrono_literals;

View File

@@ -16,17 +16,14 @@
* limitations under the License.
*/
#include <nvbench/create.cuh>
#include <nvbench/benchmark.cuh>
#include <nvbench/callable.cuh>
#include <nvbench/create.cuh>
#include <nvbench/state.cuh>
#include <nvbench/type_list.cuh>
#include <nvbench/type_strings.cuh>
#include <nvbench/types.cuh>
#include "test_asserts.cuh"
#include <fmt/format.h>
#include <algorithm>
@@ -34,6 +31,8 @@
#include <variant>
#include <vector>
#include "test_asserts.cuh"
template <typename T>
std::vector<T> sort(std::vector<T> &&vec)
{
@@ -72,15 +71,11 @@ using misc_types = nvbench::type_list<bool, void>;
using type_axes = nvbench::type_list<float_types, int_types, misc_types>;
template <typename FloatT, typename IntT, typename MiscT>
void template_no_op_generator(nvbench::state &state,
nvbench::type_list<FloatT, IntT, MiscT>)
void template_no_op_generator(nvbench::state &state, nvbench::type_list<FloatT, IntT, MiscT>)
{
ASSERT(nvbench::type_strings<FloatT>::input_string() ==
state.get_string("FloatT"));
ASSERT(nvbench::type_strings<IntT>::input_string() ==
state.get_string("IntT"));
ASSERT(nvbench::type_strings<IntT>::input_string() ==
state.get_string("IntT"));
ASSERT(nvbench::type_strings<FloatT>::input_string() == state.get_string("FloatT"));
ASSERT(nvbench::type_strings<IntT>::input_string() == state.get_string("IntT"));
ASSERT(nvbench::type_strings<IntT>::input_string() == state.get_string("IntT"));
// Enum params using non-templated version:
no_op_generator(state);
@@ -116,8 +111,7 @@ std::string run_and_get_state_string(nvbench::benchmark_base &bench,
void validate_default_name()
{
auto bench =
nvbench::benchmark_manager::get().get_benchmark("no_op_generator").clone();
auto bench = nvbench::benchmark_manager::get().get_benchmark("no_op_generator").clone();
const std::string ref = "Params:\n";
@@ -127,8 +121,7 @@ void validate_default_name()
void validate_custom_name()
{
auto bench =
nvbench::benchmark_manager::get().get_benchmark("Custom Name").clone();
auto bench = nvbench::benchmark_manager::get().get_benchmark("Custom Name").clone();
const std::string ref = "Params:\n";
@@ -138,8 +131,7 @@ void validate_custom_name()
void validate_no_types()
{
auto bench =
nvbench::benchmark_manager::get().get_benchmark("No Types").clone();
auto bench = nvbench::benchmark_manager::get().get_benchmark("No Types").clone();
const std::string ref = R"expected(Params: Float: 11 Int: 1 String: One
Params: Float: 11 Int: 2 String: One
@@ -176,8 +168,7 @@ Params: Float: 13 Int: 3 String: Three
void validate_only_types()
{
auto bench =
nvbench::benchmark_manager::get().get_benchmark("Oops, All Types!").clone();
auto bench = nvbench::benchmark_manager::get().get_benchmark("Oops, All Types!").clone();
const std::string ref = R"expected(Params: FloatT: F32 IntT: I32 MiscT: bool
Params: FloatT: F32 IntT: I32 MiscT: void
@@ -195,8 +186,7 @@ Params: FloatT: F64 IntT: I64 MiscT: void
void validate_all_axes()
{
auto bench =
nvbench::benchmark_manager::get().get_benchmark("All The Axes").clone();
auto bench = nvbench::benchmark_manager::get().get_benchmark("All The Axes").clone();
const std::string ref =
R"expected(Params: Float: 11 FloatT: F32 Int: 1 IntT: I32 MiscT: bool String: One

View File

@@ -42,27 +42,34 @@ protected:
void test_no_duplicates_are_allowed()
{
nvbench::criterion_manager& manager = nvbench::criterion_manager::get();
bool exception_triggered = false;
nvbench::criterion_manager &manager = nvbench::criterion_manager::get();
bool exception_triggered = false;
try {
[[maybe_unused]] nvbench::stopping_criterion_base& _ = manager.get_criterion("custom");
} catch(...) {
try
{
[[maybe_unused]] nvbench::stopping_criterion_base &_ = manager.get_criterion("custom");
}
catch (...)
{
exception_triggered = true;
}
ASSERT(exception_triggered);
std::unique_ptr<custom_criterion> custom_ptr = std::make_unique<custom_criterion>();
custom_criterion* custom_raw = custom_ptr.get();
custom_criterion *custom_raw = custom_ptr.get();
ASSERT(&manager.add(std::move(custom_ptr)) == custom_raw);
nvbench::stopping_criterion_base& custom = nvbench::criterion_manager::get().get_criterion("custom");
nvbench::stopping_criterion_base &custom =
nvbench::criterion_manager::get().get_criterion("custom");
ASSERT(custom_raw == &custom);
exception_triggered = false;
try {
try
{
manager.add(std::make_unique<custom_criterion>());
} catch(...) {
}
catch (...)
{
exception_triggered = true;
}
ASSERT(exception_triggered);

View File

@@ -60,4 +60,3 @@ int main()
test_compat_overwrite();
test_overwrite();
}

View File

@@ -16,19 +16,16 @@
* limitations under the License.
*/
#include <nvbench/cuda_timer.cuh>
#include <nvbench/cuda_stream.cuh>
#include <nvbench/cuda_timer.cuh>
#include <nvbench/test_kernels.cuh>
#include <nvbench/types.cuh>
#include "test_asserts.cuh"
#include <fmt/format.h>
void test_basic(cudaStream_t time_stream,
cudaStream_t exec_stream,
bool expected)
#include "test_asserts.cuh"
void test_basic(cudaStream_t time_stream, cudaStream_t exec_stream, bool expected)
{
nvbench::cuda_timer timer;

View File

@@ -16,8 +16,8 @@
* limitations under the License.
*/
#include <nvbench/cuda_call.cuh>
#include <nvbench/nvbench.cuh>
#include "nvbench/cuda_call.cuh"
/******************************************************************************
* Install custom parser.
@@ -35,7 +35,7 @@
// User code to handle a specific argument:
void handle_my_custom_arg();
// NVBench hook for modiifying the command line arguments before parsing:
// NVBench hook for modifying the command line arguments before parsing:
void custom_arg_handler(std::vector<std::string> &args)
{
// Handle and remove "--my-custom-arg"

View File

@@ -19,8 +19,8 @@
#include <nvbench/nvbench.cuh>
#include <algorithm>
#include <cstdlib>
#include <cstdio>
#include <cstdlib>
/******************************************************************************
* Test having global state that is initialized and finalized via RAII.

View File

@@ -29,12 +29,10 @@
void noisy_bench(nvbench::state &state)
{
// time, convert ms -> s
const auto mean = static_cast<nvbench::float32_t>(state.get_float64("Mean")) /
1000.f;
const auto mean = static_cast<nvbench::float32_t>(state.get_float64("Mean")) / 1000.f;
// rel stdev
const auto noise_pct =
static_cast<nvbench::float32_t>(state.get_float64("Noise"));
const auto noise = noise_pct / 100.f;
const auto noise_pct = static_cast<nvbench::float32_t>(state.get_float64("Noise"));
const auto noise = noise_pct / 100.f;
// abs stdev
const auto stdev = noise * mean;
@@ -53,8 +51,7 @@ void noisy_bench(nvbench::state &state)
try
{
return static_cast<nvbench::float32_t>(
state.get_summary("nv/cold/time/gpu/stdev/relative")
.get_float64("value"));
state.get_summary("nv/cold/time/gpu/stdev/relative").get_float64("value"));
}
catch (std::invalid_argument &)
{

View File

@@ -20,11 +20,11 @@
#include <nvbench/stopping_criterion.cuh>
#include <nvbench/types.cuh>
#include "test_asserts.cuh"
#include <vector>
#include <random>
#include <numeric>
#include <random>
#include <vector>
#include "test_asserts.cuh"
void test_const()
{
@@ -32,7 +32,7 @@ void test_const()
nvbench::detail::entropy_criterion criterion;
criterion.initialize(params);
for (int i = 0; i < 6; i++)
for (int i = 0; i < 6; i++)
{ // nvbench wants at least 5 to compute the standard deviation
criterion.add_measurement(42.0);
}
@@ -48,7 +48,7 @@ void produce_entropy_arch(nvbench::detail::entropy_criterion &criterion)
* 2.5, 2.4, 2.2, 2.1, 2.0, 1.9 <-+
* 1.8, 1.7, 1.6, 1.6, 1.5, 1.4 |
* 1.4, 1.3, 1.3, 1.3, 1.2, 1.2 |
* 1.1, 1.1, 1.1, 1.0, 1.0, 1.0 +-- entropy only decreases after 5-th sample,
* 1.1, 1.1, 1.1, 1.0, 1.0, 1.0 +-- entropy only decreases after 5-th sample,
* 1.0, 0.9, 0.9, 0.9, 0.9, 0.9 | so the slope should be negative
* 0.8, 0.8, 0.8, 0.8, 0.8, 0.8 |
* 0.7, 0.7, 0.7, 0.7, 0.7, 0.7 <-+

View File

@@ -18,12 +18,12 @@
#include <nvbench/enum_type_list.cuh>
#include "test_asserts.cuh"
#include <fmt/format.h>
#include <type_traits>
#include "test_asserts.cuh"
// If using gcc version < 7, disable some tests to WAR a compiler bug. See NVIDIA/nvbench#39.
#if defined(__GNUC__) && __GNUC__ == 7
#define USING_GCC_7
@@ -102,8 +102,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
void test_int()
{
ASSERT((std::is_same_v<nvbench::enum_type_list<>, nvbench::type_list<>>));
ASSERT((std::is_same_v<nvbench::enum_type_list<0>,
nvbench::type_list<nvbench::enum_type<0>>>));
ASSERT((std::is_same_v<nvbench::enum_type_list<0>, nvbench::type_list<nvbench::enum_type<0>>>));
ASSERT((std::is_same_v<nvbench::enum_type_list<0, 1, 2, 3, 4>,
nvbench::type_list<nvbench::enum_type<0>,
nvbench::enum_type<1>,
@@ -115,42 +114,35 @@ void test_int()
void test_scoped_enum()
{
#ifndef USING_GCC_7
ASSERT((
std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1>,
nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>>>));
ASSERT((std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1>,
nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>>>));
#endif
ASSERT((
std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1,
scoped_enum::val_2,
scoped_enum::val_3>,
nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>,
nvbench::enum_type<scoped_enum::val_2>,
nvbench::enum_type<scoped_enum::val_3>>>));
ASSERT((std::is_same_v<
nvbench::enum_type_list<scoped_enum::val_1, scoped_enum::val_2, scoped_enum::val_3>,
nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>,
nvbench::enum_type<scoped_enum::val_2>,
nvbench::enum_type<scoped_enum::val_3>>>));
}
void test_unscoped_enum()
{
#ifndef USING_GCC_7
ASSERT(
(std::is_same_v<nvbench::enum_type_list<unscoped_val_1>,
nvbench::type_list<nvbench::enum_type<unscoped_val_1>>>));
ASSERT(
(std::is_same_v<
nvbench::enum_type_list<unscoped_val_1, unscoped_val_2, unscoped_val_3>,
nvbench::type_list<nvbench::enum_type<unscoped_val_1>,
nvbench::enum_type<unscoped_val_2>,
nvbench::enum_type<unscoped_val_3>>>));
ASSERT((std::is_same_v<nvbench::enum_type_list<unscoped_val_1>,
nvbench::type_list<nvbench::enum_type<unscoped_val_1>>>));
ASSERT((std::is_same_v<nvbench::enum_type_list<unscoped_val_1, unscoped_val_2, unscoped_val_3>,
nvbench::type_list<nvbench::enum_type<unscoped_val_1>,
nvbench::enum_type<unscoped_val_2>,
nvbench::enum_type<unscoped_val_3>>>));
#endif
}
void test_scoped_enum_type_strings()
{
using values = nvbench::enum_type_list<scoped_enum::val_1,
scoped_enum::val_2,
scoped_enum::val_3>;
using val_1 = nvbench::tl::get<0, values>;
using val_2 = nvbench::tl::get<1, values>;
using val_3 = nvbench::tl::get<2, values>;
using values =
nvbench::enum_type_list<scoped_enum::val_1, scoped_enum::val_2, scoped_enum::val_3>;
using val_1 = nvbench::tl::get<0, values>;
using val_2 = nvbench::tl::get<1, values>;
using val_3 = nvbench::tl::get<2, values>;
ASSERT((nvbench::type_strings<val_1>::input_string() == "1"));
ASSERT((nvbench::type_strings<val_1>::description() == "scoped_enum::val_1"));
ASSERT((nvbench::type_strings<val_2>::input_string() == "2"));

View File

@@ -34,8 +34,7 @@ void test_empty()
const auto clone_base = axis.clone();
ASSERT(clone_base.get() != nullptr);
const auto *clone =
dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
const auto *clone = dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
ASSERT(clone != nullptr);
ASSERT(clone->get_name() == "Empty");
@@ -62,8 +61,7 @@ void test_basic()
const auto clone_base = axis.clone();
ASSERT(clone_base.get() != nullptr);
const auto *clone =
dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
const auto *clone = dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
ASSERT(clone != nullptr);
ASSERT(clone->get_name() == "Basic");

View File

@@ -18,10 +18,10 @@
#include <nvbench/int64_axis.cuh>
#include "test_asserts.cuh"
#include <fmt/format.h>
#include "test_asserts.cuh"
void test_empty()
{
nvbench::int64_axis axis("Empty");
@@ -36,8 +36,7 @@ void test_empty()
const auto clone_base = axis.clone();
ASSERT(clone_base.get() != nullptr);
const auto *clone =
dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
const auto *clone = dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
ASSERT(clone != nullptr);
ASSERT(clone->get_name() == "Empty");
@@ -66,8 +65,7 @@ void test_basic()
const auto clone_base = axis.clone();
ASSERT(clone_base.get() != nullptr);
const auto *clone =
dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
const auto *clone = dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
ASSERT(clone != nullptr);
ASSERT(clone->get_name() == "BasicAxis");
@@ -87,8 +85,7 @@ void test_basic()
void test_power_of_two()
{
nvbench::int64_axis axis{"POTAxis"};
axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4},
nvbench::int64_axis_flags::power_of_two);
axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4}, nvbench::int64_axis_flags::power_of_two);
const std::vector<nvbench::int64_t> ref_inputs{0, 1, 2, 3, 7, 6, 5, 4};
const std::vector<nvbench::int64_t> ref_values{1, 2, 4, 8, 128, 64, 32, 16};
@@ -102,14 +99,12 @@ void test_power_of_two()
for (size_t i = 0; i < 8; ++i)
{
ASSERT(axis.get_input_string(i) == fmt::to_string(ref_inputs[i]));
ASSERT(axis.get_description(i) ==
fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
ASSERT(axis.get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
}
const auto clone_base = axis.clone();
ASSERT(clone_base.get() != nullptr);
const auto *clone =
dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
const auto *clone = dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
ASSERT(clone != nullptr);
ASSERT(clone->get_name() == "POTAxis");
@@ -122,8 +117,7 @@ void test_power_of_two()
for (size_t i = 0; i < 8; ++i)
{
ASSERT(clone->get_input_string(i) == fmt::to_string(ref_inputs[i]));
ASSERT(clone->get_description(i) ==
fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
ASSERT(clone->get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
}
}
@@ -250,8 +244,7 @@ void test_update_none_to_pow2()
void test_update_pow2_to_none()
{
nvbench::int64_axis axis{"TestAxis"};
axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4},
nvbench::int64_axis_flags::power_of_two);
axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4}, nvbench::int64_axis_flags::power_of_two);
const std::vector<nvbench::int64_t> ref_inputs{0, 1, 2, 3, 7, 6, 5, 4};
const std::vector<nvbench::int64_t> ref_values{1, 2, 4, 8, 128, 64, 32, 16};
@@ -304,8 +297,7 @@ void test_update_pow2_to_none()
for (size_t i = 0; i < 8; ++i)
{
ASSERT(axis.get_input_string(i) == fmt::to_string(ref_inputs[i]));
ASSERT(axis.get_description(i) ==
fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
ASSERT(axis.get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
}
}
@@ -313,8 +305,7 @@ void test_update_pow2_to_pow2()
{
nvbench::int64_axis axis{"TestAxis"};
axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4},
nvbench::int64_axis_flags::power_of_two);
axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4}, nvbench::int64_axis_flags::power_of_two);
const std::vector<nvbench::int64_t> ref_inputs{0, 1, 2, 3, 7, 6, 5, 4};
const std::vector<nvbench::int64_t> ref_values{1, 2, 4, 8, 128, 64, 32, 16};
@@ -369,8 +360,7 @@ void test_update_pow2_to_pow2()
for (size_t i = 0; i < 8; ++i)
{
ASSERT(axis.get_input_string(i) == fmt::to_string(ref_inputs[i]));
ASSERT(axis.get_description(i) ==
fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
ASSERT(axis.get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
}
}

View File

@@ -18,10 +18,10 @@
#include <nvbench/named_values.cuh>
#include "test_asserts.cuh"
#include <algorithm>
#include "test_asserts.cuh"
void test_empty()
{
nvbench::named_values vals;

View File

@@ -16,15 +16,14 @@
* limitations under the License.
*/
#include <nvbench/option_parser.cuh>
#include <nvbench/create.cuh>
#include <nvbench/option_parser.cuh>
#include <nvbench/type_list.cuh>
#include "test_asserts.cuh"
#include <fmt/format.h>
#include "test_asserts.cuh"
//==============================================================================
// Declare a couple benchmarks for testing:
void DummyBench(nvbench::state &state) { state.skip("Skipping for testing."); }
@@ -50,8 +49,7 @@ NVBENCH_BENCH_TYPES(TestBench, NVBENCH_TYPE_AXES(Ts, Us))
namespace
{
[[nodiscard]] std::string
states_to_string(const std::vector<nvbench::state> &states)
[[nodiscard]] std::string states_to_string(const std::vector<nvbench::state> &states)
{
fmt::memory_buffer buffer;
std::string table_format = "| {:^5} | {:^10} | {:^4} | {:^4} | {:^4} "
@@ -88,7 +86,7 @@ states_to_string(const std::vector<nvbench::state> &states)
// Expects the parser to have a single TestBench benchmark. Runs the benchmark
// and returns the resulting states.
[[nodiscard]] const auto& parser_to_states(nvbench::option_parser &parser)
[[nodiscard]] const auto &parser_to_states(nvbench::option_parser &parser)
{
const auto &benches = parser.get_benchmarks();
ASSERT(benches.size() == 1);
@@ -267,8 +265,7 @@ void test_int64_axis_single()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 2 : 1 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 2 : 1 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -308,8 +305,7 @@ void test_int64_axis_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 , 7 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 , 7 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -323,8 +319,7 @@ void test_int64_axis_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 7 : 5 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 7 : 5 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -369,8 +364,7 @@ void test_int64_axis_pow2_single()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -384,8 +378,7 @@ void test_int64_axis_pow2_single()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 : 7 : 1 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 : 7 : 1 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -425,8 +418,7 @@ void test_int64_axis_pow2_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 , 7 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 , 7 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -440,8 +432,7 @@ void test_int64_axis_pow2_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 : 7 : 5 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 : 7 : 5 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -486,8 +477,7 @@ void test_int64_axis_none_to_pow2_single()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -501,8 +491,7 @@ void test_int64_axis_none_to_pow2_single()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 : 7 : 1 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 : 7 : 1 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -542,8 +531,7 @@ void test_int64_axis_none_to_pow2_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 , 7 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 , 7 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -557,8 +545,7 @@ void test_int64_axis_none_to_pow2_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 : 7 : 5 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 : 7 : 5 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -617,8 +604,7 @@ void test_int64_axis_pow2_to_none_single()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 2 : 1 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 2 : 1 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -658,8 +644,7 @@ void test_int64_axis_pow2_to_none_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 , 7 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 , 7 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -673,8 +658,7 @@ void test_int64_axis_pow2_to_none_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 7 : 5 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 7 : 5 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -719,8 +703,7 @@ void test_float64_axis_single()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -734,10 +717,7 @@ void test_float64_axis_single()
{
nvbench::option_parser parser;
parser.parse({"--benchmark",
"TestBench",
"--axis",
" Floats [ ] = [ 3.5 : 3.6 : 1 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 : 3.6 : 1 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -777,8 +757,7 @@ void test_float64_axis_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 , 4.1 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 , 4.1 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -792,18 +771,14 @@ void test_float64_axis_multi()
{
nvbench::option_parser parser;
parser.parse({"--benchmark",
"TestBench",
"--axis",
" Floats [ ] = [ 3.5 : 4.2 : 0.6 ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 : 4.2 : 0.6 ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", "Floats=[3.5:4.2:0.6]"});
parser.parse({"--benchmark", "TestBench", "--axis", "Floats=[3.5:4.2:0.6]"});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -827,8 +802,7 @@ void test_string_axis_single()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Strings [ ] = fo br "});
parser.parse({"--benchmark", "TestBench", "--axis", " Strings [ ] = fo br "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -842,8 +816,7 @@ void test_string_axis_single()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -883,8 +856,7 @@ void test_string_axis_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br , baz ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br , baz ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -951,8 +923,7 @@ void test_type_axis_multi()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "TestBench", "--axis", " T [ ] = [ U8, void ] "});
parser.parse({"--benchmark", "TestBench", "--axis", " T [ ] = [ U8, void ] "});
const auto test = parser_to_state_string(parser);
ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
}
@@ -1177,9 +1148,8 @@ void test_axis_before_benchmark()
void test_min_samples()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "DummyBench", "--min-samples", "12345"});
const auto& states = parser_to_states(parser);
parser.parse({"--benchmark", "DummyBench", "--min-samples", "12345"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(states[0].get_min_samples() == 12345);
@@ -1188,9 +1158,8 @@ void test_min_samples()
void test_min_time()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "DummyBench", "--min-time", "12345e2"});
const auto& states = parser_to_states(parser);
parser.parse({"--benchmark", "DummyBench", "--min-time", "12345e2"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(std::abs(states[0].get_min_time() - 12345e2) < 1.);
@@ -1199,9 +1168,8 @@ void test_min_time()
void test_max_noise()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "DummyBench", "--max-noise", "50.3"});
const auto& states = parser_to_states(parser);
parser.parse({"--benchmark", "DummyBench", "--max-noise", "50.3"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(std::abs(states[0].get_max_noise() - 0.503) < 1.e-4);
@@ -1210,9 +1178,8 @@ void test_max_noise()
void test_skip_time()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "DummyBench", "--skip-time", "12345e2"});
const auto& states = parser_to_states(parser);
parser.parse({"--benchmark", "DummyBench", "--skip-time", "12345e2"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.);
@@ -1221,9 +1188,8 @@ void test_skip_time()
void test_timeout()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "DummyBench", "--timeout", "12345e2"});
const auto& states = parser_to_states(parser);
parser.parse({"--benchmark", "DummyBench", "--timeout", "12345e2"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(std::abs(states[0].get_timeout() - 12345e2) < 1.);
@@ -1232,12 +1198,15 @@ void test_timeout()
void test_stopping_criterion()
{
nvbench::option_parser parser;
parser.parse(
{"--benchmark", "DummyBench",
"--stopping-criterion", "entropy",
"--max-angle", "0.42",
"--min-r2", "0.6"});
const auto& states = parser_to_states(parser);
parser.parse({"--benchmark",
"DummyBench",
"--stopping-criterion",
"entropy",
"--max-angle",
"0.42",
"--min-r2",
"0.6"});
const auto &states = parser_to_states(parser);
ASSERT(states.size() == 1);
ASSERT(states[0].get_stopping_criterion() == "entropy");

View File

@@ -22,12 +22,9 @@
void test_basic()
{
ASSERT((nvbench::range(0, 6) ==
std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
ASSERT((nvbench::range(0, 6, 1) ==
std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
ASSERT(
(nvbench::range(0, 6, 2) == std::vector<nvbench::int64_t>{0, 2, 4, 6}));
ASSERT((nvbench::range(0, 6) == std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
ASSERT((nvbench::range(0, 6, 1) == std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
ASSERT((nvbench::range(0, 6, 2) == std::vector<nvbench::int64_t>{0, 2, 4, 6}));
ASSERT((nvbench::range(0, 6, 3) == std::vector<nvbench::int64_t>{0, 3, 6}));
ASSERT((nvbench::range(0, 6, 4) == std::vector<nvbench::int64_t>{0, 4}));
ASSERT((nvbench::range(0, 6, 5) == std::vector<nvbench::int64_t>{0, 5}));
@@ -37,26 +34,19 @@ void test_basic()
void test_result_type()
{
// All ints should turn into int64 by default:
ASSERT((std::is_same_v<decltype(nvbench::range(0ll, 1ll)),
std::vector<nvbench::int64_t>>));
ASSERT((std::is_same_v<decltype(nvbench::range(0, 1)),
std::vector<nvbench::int64_t>>));
ASSERT((std::is_same_v<decltype(nvbench::range(0u, 1u)),
std::vector<nvbench::int64_t>>));
ASSERT((std::is_same_v<decltype(nvbench::range(0ll, 1ll)), std::vector<nvbench::int64_t>>));
ASSERT((std::is_same_v<decltype(nvbench::range(0, 1)), std::vector<nvbench::int64_t>>));
ASSERT((std::is_same_v<decltype(nvbench::range(0u, 1u)), std::vector<nvbench::int64_t>>));
// All floats should turn into float64 by default:
ASSERT((std::is_same_v<decltype(nvbench::range(0., 1.)),
std::vector<nvbench::float64_t>>));
ASSERT((std::is_same_v<decltype(nvbench::range(0.f, 1.f)),
std::vector<nvbench::float64_t>>));
ASSERT((std::is_same_v<decltype(nvbench::range(0., 1.)), std::vector<nvbench::float64_t>>));
ASSERT((std::is_same_v<decltype(nvbench::range(0.f, 1.f)), std::vector<nvbench::float64_t>>));
// Other types may be explicitly specified:
ASSERT((std::is_same_v<decltype(nvbench::range<nvbench::float32_t,
nvbench::float32_t>(0.f, 1.f)),
ASSERT((std::is_same_v<decltype(nvbench::range<nvbench::float32_t, nvbench::float32_t>(0.f, 1.f)),
std::vector<nvbench::float32_t>>));
ASSERT((std::is_same_v<
decltype(nvbench::range<nvbench::int32_t, nvbench::int32_t>(0, 1)),
std::vector<nvbench::int32_t>>));
ASSERT((std::is_same_v<decltype(nvbench::range<nvbench::int32_t, nvbench::int32_t>(0, 1)),
std::vector<nvbench::int32_t>>));
}
void test_fp_tolerance()
@@ -68,10 +58,8 @@ void test_fp_tolerance()
const nvbench::float32_t stride = 1e-4f;
for (std::size_t size = 1; size < 1024; ++size)
{
const nvbench::float32_t end =
start + stride * static_cast<nvbench::float32_t>(size - 1);
ASSERT_MSG(nvbench::range(start, end, stride).size() == size,
"size={}", size);
const nvbench::float32_t end = start + stride * static_cast<nvbench::float32_t>(size - 1);
ASSERT_MSG(nvbench::range(start, end, stride).size() == size, "size={}", size);
}
}

View File

@@ -2,18 +2,17 @@
#include "test_asserts.cuh"
namespace
{
__global__ void multiply5(const int32_t* __restrict__ a, int32_t* __restrict__ b)
{
const auto id = blockIdx.x * blockDim.x + threadIdx.x;
b[id] = 5 * a[id];
}
__global__ void multiply5(const int32_t *__restrict__ a, int32_t *__restrict__ b)
{
const auto id = blockIdx.x * blockDim.x + threadIdx.x;
b[id] = 5 * a[id];
}
} // namespace
int main()
{
{
multiply5<<<256, 256>>>(nullptr, nullptr);
try

View File

@@ -18,14 +18,13 @@
#include <nvbench/detail/ring_buffer.cuh>
#include "test_asserts.cuh"
#include <algorithm>
#include <vector>
#include "test_asserts.cuh"
template <typename T>
bool equal(const nvbench::detail::ring_buffer<T> &buffer,
const std::vector<T> &reference)
bool equal(const nvbench::detail::ring_buffer<T> &buffer, const std::vector<T> &reference)
{
return std::equal(buffer.begin(), buffer.end(), reference.begin());
}

View File

@@ -16,23 +16,22 @@
* limitations under the License.
*/
#include <nvbench/runner.cuh>
#include <nvbench/benchmark.cuh>
#include <nvbench/callable.cuh>
#include <nvbench/runner.cuh>
#include <nvbench/state.cuh>
#include <nvbench/type_list.cuh>
#include <nvbench/type_strings.cuh>
#include <nvbench/types.cuh>
#include "test_asserts.cuh"
#include <fmt/format.h>
#include <algorithm>
#include <variant>
#include <vector>
#include "test_asserts.cuh"
template <typename T>
std::vector<T> sort(std::vector<T> &&vec)
{
@@ -65,21 +64,16 @@ using misc_types = nvbench::type_list<bool, void>;
using type_axes = nvbench::type_list<float_types, int_types, misc_types>;
template <typename FloatT, typename IntT, typename MiscT>
void template_no_op_generator(nvbench::state &state,
nvbench::type_list<FloatT, IntT, MiscT>)
void template_no_op_generator(nvbench::state &state, nvbench::type_list<FloatT, IntT, MiscT>)
{
ASSERT(nvbench::type_strings<FloatT>::input_string() ==
state.get_string("FloatT"));
ASSERT(nvbench::type_strings<IntT>::input_string() ==
state.get_string("IntT"));
ASSERT(nvbench::type_strings<IntT>::input_string() ==
state.get_string("IntT"));
ASSERT(nvbench::type_strings<FloatT>::input_string() == state.get_string("FloatT"));
ASSERT(nvbench::type_strings<IntT>::input_string() == state.get_string("IntT"));
ASSERT(nvbench::type_strings<IntT>::input_string() == state.get_string("IntT"));
// Enum params using non-templated version:
no_op_generator(state);
}
NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator,
template_no_op_callable);
NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator, template_no_op_callable);
void test_empty()
{

View File

@@ -16,10 +16,9 @@
* limitations under the License.
*/
#include <nvbench/state.cuh>
#include <nvbench/benchmark.cuh>
#include <nvbench/callable.cuh>
#include <nvbench/state.cuh>
#include <nvbench/summary.cuh>
#include <nvbench/types.cuh>
@@ -43,8 +42,7 @@ struct state_tester : public nvbench::state
void set_param(std::string name, T &&value)
{
this->state::m_axis_values.set_value(std::move(name),
nvbench::named_values::value_type{
std::forward<T>(value)});
nvbench::named_values::value_type{std::forward<T>(value)});
}
};
} // namespace nvbench::detail

Some files were not shown because too many files have changed in this diff Show More