Merge pull request #209 from alliepiper/pre-commit-ci

Add pre-commit.ci configs, format.
2026-04-27 02:08:52 +00:00 · 2025-04-14 14:05:48 -04:00
parent 47bd2838da 4c38b2d5f7
commit 2c2f40a659
109 changed files with 903 additions and 967 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -41,16 +41,28 @@ CompactNamespaces: false
 ContinuationIndentWidth: 2
 IncludeBlocks:   Regroup
 IncludeCategories:
  - Regex:           '^<cub'
    Priority:        1
  - Regex:           '^<thrust'
    Priority:        2
  - Regex:           '^<cuda'
    Priority:        3
  - Regex:           '^<nvbench'
    Priority:        1
  - Regex:           '^<cub'
    Priority:        2
  - Regex:           '^<thrust'
    Priority:        3
  - Regex:           '^<cuda/'
    Priority:        4
-  - Regex:           '^<[a-z]*>$'
+  - Regex:           '^<cuda'
    Priority:        5
  - Regex:           '^<nvml'
    Priority:        6
  - Regex:           '^<cupti'
    Priority:        7
  - Regex:           '^<nvperf'
    Priority:        8
  - Regex:           '^<nlohmann'
    Priority:        9
  - Regex:           '^<fmt'
    Priority:        10
  - Regex:           '^<[a-z_]*>$'
    Priority:        11
 IndentCaseLabels: true
 IndentPPDirectives: None
 IndentWidth: 2
--- a/.devcontainer/README.md
+++ b/.devcontainer/README.md
@@ -34,7 +34,7 @@ CCCL uses [Development Containers](https://containers.dev/) to provide consisten
   - Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. Type "Remote-Containers: Reopen in Container" and select it.
-     ![Shows "Reopen in Container" in command pallete.](./img/open_in_container_manual.png)
+     ![Shows "Reopen in Container" in command palette.](./img/open_in_container_manual.png)
 4. Select an environment with the desired CTK and host compiler from the list:
@@ -136,7 +136,7 @@ For more information, see the `.devcontainer/make_devcontainers.sh --help` messa
 2. Install WSL 2 by running:
 ```bash
-wsl --install 
+wsl --install
 ```
 This will most likely install the Ubuntu distribution by default.
@@ -182,14 +182,14 @@ then run `sudo systemctl restart docker.service`.
 10. Open the CCCL cloned repo in VS Code (`Ctrl + Shift + P`, select `File: Open Folder...` and select the path where your CCCL clone is located).
 11. If prompted, choose `Reopen in Container`.
-    
+
    - If you are not prompted just type `Ctrl + Shift + P` and `Dev Containers: Open Folder in Container ...`.
 12. Verify that Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal. For a proper configuration it is important for the steps in [Install prerequisites and VS Code extensions](#prereqs) to be followed in a precise order.
 From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration:
-13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message. 
+13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your GitHub credentials", if you answer **Yes**, the browser might not open automatically, with the following error message.
 > Failed opening a web browser at https://github.com/login/device
  exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH
--- a/.devcontainer/launch.sh
+++ b/.devcontainer/launch.sh
@@ -304,4 +304,3 @@ main() {
 }
 main "$@"
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -14,4 +14,4 @@
 #
 # Only add commits that are pure formatting changes (e.g. clang-format version changes, etc).
 8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
-
+3440855dbd405db614861885ad1577fffd882867 # Initial addition of pre-commit.ci formatting.
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,70 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.
 ci:
    autofix_commit_msg: |
      [pre-commit.ci] auto code formatting
    autofix_prs: false
    autoupdate_branch: ''
    autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
    autoupdate_schedule: quarterly
    skip: []
    submodules: false
 repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
      - id: mixed-line-ending
      - id: trailing-whitespace
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        args: ["-fallback-style=none", "-style=file", "-i"]
  # TODO/REMINDER: add the Ruff vscode extension to the devcontainers
  # Ruff, the Python auto-correcting linter/formatter written in Rust
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.8.6
    hooks:
    - id: ruff  # linter
    - id: ruff-format  # formatter
  # TOML lint & format
  - repo: https://github.com/ComPWA/taplo-pre-commit
    rev: v0.9.3
    hooks:
      # See https://github.com/NVIDIA/cccl/issues/3426
      # - id: taplo-lint
      #   exclude: "^docs/"
      - id: taplo-format
        exclude: "^docs/"
  - repo: https://github.com/codespell-project/codespell
    rev: v2.3.0
    hooks:
      - id: codespell
        additional_dependencies: [tomli]
        args: ["--toml", "pyproject.toml"]
        exclude: |
          (?x)^(
            build|
            CITATION.md
          )
 default_language_version:
  python: python3
--- a/ci/ninja_summary.py
+++ b/ci/ninja_summary.py
@@ -65,6 +65,7 @@ long_ext_count = 10
 class Target:
    """Represents a single line read from a .ninja_log file."""
    def __init__(self, start, end):
        """Creates a target object by passing in the start/end times in seconds
        as a float."""
@@ -94,9 +95,9 @@ class Target:
        """
        # Allow for modest floating-point errors
        epsilon = 0.000002
-        if (self.weighted_duration > self.Duration() + epsilon):
+        if self.weighted_duration > self.Duration() + epsilon:
-            print('%s > %s?' % (self.weighted_duration, self.Duration()))
+            print("%s > %s?" % (self.weighted_duration, self.Duration()))
-        assert (self.weighted_duration <= self.Duration() + epsilon)
+        assert self.weighted_duration <= self.Duration() + epsilon
        return self.weighted_duration
    def DescribeTargets(self):
@@ -104,10 +105,10 @@ class Target:
        # Some build steps generate dozens of outputs - handle them sanely.
        # The max_length was chosen so that it can fit most of the long
        # single-target names, while minimizing word wrapping.
-        result = ', '.join(self.targets)
+        result = ", ".join(self.targets)
        max_length = 65
        if len(result) > max_length:
-            result = result[:max_length] + '...'
+            result = result[:max_length] + "..."
        return result
@@ -121,12 +122,11 @@ def ReadTargets(log, show_all):
    # targets.
    if not header:
        return []
-    assert header == '# ninja log v5\n', \
+    assert header == "# ninja log v5\n", "unrecognized ninja log version %r" % header
           'unrecognized ninja log version %r' % header
    targets_dict = {}
    last_end_seen = 0.0
    for line in log:
-        parts = line.strip().split('\t')
+        parts = line.strip().split("\t")
        if len(parts) != 5:
            # If ninja.exe is rudely halted then the .ninja_log file may be
            # corrupt. Silently continue.
@@ -165,17 +165,17 @@ def ReadTargets(log, show_all):
 def GetExtension(target, extra_patterns):
    """Return the file extension that best represents a target.
-  For targets that generate multiple outputs it is important to return a
+    For targets that generate multiple outputs it is important to return a
-  consistent 'canonical' extension. Ultimately the goal is to group build steps
+    consistent 'canonical' extension. Ultimately the goal is to group build steps
-  by type."""
+    by type."""
    for output in target.targets:
        if extra_patterns:
-            for fn_pattern in extra_patterns.split(';'):
+            for fn_pattern in extra_patterns.split(";"):
-                if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
+                if fnmatch.fnmatch(output, "*" + fn_pattern + "*"):
                    return fn_pattern
        # Not a true extension, but a good grouping.
-        if output.endswith('type_mappings'):
+        if output.endswith("type_mappings"):
-            extension = 'type_mappings'
+            extension = "type_mappings"
            break
        # Capture two extensions if present. For example: file.javac.jar should
@@ -185,26 +185,26 @@ def GetExtension(target, extra_patterns):
        extension = ext2 + ext1  # Preserve the order in the file name.
        if len(extension) == 0:
-            extension = '(no extension found)'
+            extension = "(no extension found)"
-        if ext1 in ['.pdb', '.dll', '.exe']:
+        if ext1 in [".pdb", ".dll", ".exe"]:
-            extension = 'PEFile (linking)'
+            extension = "PEFile (linking)"
            # Make sure that .dll and .exe are grouped together and that the
            # .dll.lib files don't cause these to be listed as libraries
            break
-        if ext1 in ['.so', '.TOC']:
+        if ext1 in [".so", ".TOC"]:
-            extension = '.so (linking)'
+            extension = ".so (linking)"
            # Attempt to identify linking, avoid identifying as '.TOC'
            break
        # Make sure .obj files don't get categorized as mojo files
-        if ext1 in ['.obj', '.o']:
+        if ext1 in [".obj", ".o"]:
            break
        # Jars are the canonical output of java targets.
-        if ext1 == '.jar':
+        if ext1 == ".jar":
            break
        # Normalize all mojo related outputs to 'mojo'.
-        if output.count('.mojom') > 0:
+        if output.count(".mojom") > 0:
-            extension = 'mojo'
+            extension = "mojo"
            break
    return extension
@@ -229,8 +229,8 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
        if target.end > latest:
            latest = target.end
        total_cpu_time += target.Duration()
-        task_start_stop_times.append((target.start, 'start', target))
+        task_start_stop_times.append((target.start, "start", target))
-        task_start_stop_times.append((target.end, 'stop', target))
+        task_start_stop_times.append((target.end, "stop", target))
    length = latest - earliest
    weighted_total = 0.0
@@ -256,10 +256,10 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
        if num_running > 0:
            # Update the total weighted time up to this moment.
            last_weighted_time += (time - last_time) / float(num_running)
-        if action_name == 'start':
+        if action_name == "start":
            # Record the total weighted task time when this task starts.
            running_tasks[target] = last_weighted_time
-        if action_name == 'stop':
+        if action_name == "stop":
            # Record the change in the total weighted task time while this task
            # ran.
            weighted_duration = last_weighted_time - running_tasks[target]
@@ -267,24 +267,27 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
            weighted_total += weighted_duration
            del running_tasks[target]
        last_time = time
-    assert (len(running_tasks) == 0)
+    assert len(running_tasks) == 0
    # Warn if the sum of weighted times is off by more than half a second.
    if abs(length - weighted_total) > 500:
-        print('Warning: Possible corrupt ninja log, results may be '
+        print(
-              'untrustworthy. Length = %.3f, weighted total = %.3f' %
+            "Warning: Possible corrupt ninja log, results may be "
-              (length, weighted_total))
+            "untrustworthy. Length = %.3f, weighted total = %.3f"
            % (length, weighted_total)
        )
    # Print the slowest build steps:
-    print('    Longest build steps:')
+    print("    Longest build steps:")
    if elapsed_time_sorting:
        entries.sort(key=lambda x: x.Duration())
    else:
        entries.sort(key=lambda x: x.WeightedDuration())
    for target in entries[-long_count:]:
-        print('      %8.1f weighted s to build %s (%.1f s elapsed time)' %
+        print(
-              (target.WeightedDuration(), target.DescribeTargets(),
+            "      %8.1f weighted s to build %s (%.1f s elapsed time)"
-               target.Duration()))
+            % (target.WeightedDuration(), target.DescribeTargets(), target.Duration())
        )
    # Sum up the time by file extension/type of the output file
    count_by_ext = {}
@@ -293,51 +296,56 @@ def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
    # Scan through all of the targets to build up per-extension statistics.
    for target in entries:
        extension = GetExtension(target, extra_step_types)
-        time_by_ext[extension] = time_by_ext.get(extension,
+        time_by_ext[extension] = time_by_ext.get(extension, 0) + target.Duration()
-                                                 0) + target.Duration()
+        weighted_time_by_ext[extension] = (
-        weighted_time_by_ext[extension] = weighted_time_by_ext.get(
+            weighted_time_by_ext.get(extension, 0) + target.WeightedDuration()
-            extension, 0) + target.WeightedDuration()
+        )
        count_by_ext[extension] = count_by_ext.get(extension, 0) + 1
-    print('    Time by build-step type:')
+    print("    Time by build-step type:")
    # Copy to a list with extension name and total time swapped, to (time, ext)
    if elapsed_time_sorting:
-        weighted_time_by_ext_sorted = sorted(
+        weighted_time_by_ext_sorted = sorted((y, x) for (x, y) in time_by_ext.items())
            (y, x) for (x, y) in time_by_ext.items())
    else:
        weighted_time_by_ext_sorted = sorted(
-            (y, x) for (x, y) in weighted_time_by_ext.items())
+            (y, x) for (x, y) in weighted_time_by_ext.items()
        )
    # Print the slowest build target types:
    for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]:
        print(
-            '      %8.1f s weighted time to generate %d %s files '
+            "      %8.1f s weighted time to generate %d %s files "
-            '(%1.1f s elapsed time sum)' %
+            "(%1.1f s elapsed time sum)"
-            (time, count_by_ext[extension], extension, time_by_ext[extension]))
+            % (time, count_by_ext[extension], extension, time_by_ext[extension])
        )
-    print('    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
+    print(
-          'parallelism)' %
+        "    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx "
-          (length, total_cpu_time, total_cpu_time * 1.0 / length))
+        "parallelism)" % (length, total_cpu_time, total_cpu_time * 1.0 / length)
-    print('    %d build steps completed, average of %1.2f/s' %
+    )
-          (len(entries), len(entries) / (length)))
+    print(
        "    %d build steps completed, average of %1.2f/s"
        % (len(entries), len(entries) / (length))
    )
 def main():
-    log_file = '.ninja_log'
+    log_file = ".ninja_log"
-    metrics_file = 'siso_metrics.json'
+    metrics_file = "siso_metrics.json"
    parser = argparse.ArgumentParser()
-    parser.add_argument('-C', dest='build_directory', help='Build directory.')
+    parser.add_argument("-C", dest="build_directory", help="Build directory.")
    parser.add_argument(
-        '-s',
+        "-s",
-        '--step-types',
+        "--step-types",
-        help='semicolon separated fnmatch patterns for build-step grouping')
+        help="semicolon separated fnmatch patterns for build-step grouping",
    )
    parser.add_argument(
-        '-e',
+        "-e",
-        '--elapsed_time_sorting',
+        "--elapsed_time_sorting",
        default=False,
-        action='store_true',
+        action="store_true",
-        help='Sort output by elapsed time instead of weighted time')
+        help="Sort output by elapsed time instead of weighted time",
-    parser.add_argument('--log-file',
+    )
-                        help="specific ninja log file to analyze.")
+    parser.add_argument("--log-file", help="specific ninja log file to analyze.")
    args, _extra_args = parser.parse_known_args()
    if args.build_directory:
        log_file = os.path.join(args.build_directory, log_file)
@@ -348,34 +356,35 @@ def main():
        # Offer a convenient way to add extra step types automatically,
        # including when this script is run by autoninja. get() returns None if
        # the variable isn't set.
-        args.step_types = os.environ.get('chromium_step_types')
+        args.step_types = os.environ.get("chromium_step_types")
    if args.step_types:
        # Make room for the extra build types.
        global long_ext_count
-        long_ext_count += len(args.step_types.split(';'))
+        long_ext_count += len(args.step_types.split(";"))
    if os.path.exists(metrics_file):
        # Automatically handle summarizing siso builds.
-        cmd = ['siso.bat' if 'win32' in sys.platform else 'siso']
+        cmd = ["siso.bat" if "win32" in sys.platform else "siso"]
-        cmd.extend(['metrics', 'summary'])
+        cmd.extend(["metrics", "summary"])
        if args.build_directory:
-            cmd.extend(['-C', args.build_directory])
+            cmd.extend(["-C", args.build_directory])
        if args.step_types:
-            cmd.extend(['--step_types', args.step_types])
+            cmd.extend(["--step_types", args.step_types])
        if args.elapsed_time_sorting:
-            cmd.append('--elapsed_time_sorting')
+            cmd.append("--elapsed_time_sorting")
        subprocess.run(cmd)
    else:
        try:
-            with open(log_file, 'r') as log:
+            with open(log_file, "r") as log:
                entries = ReadTargets(log, False)
                if entries:
-                    SummarizeEntries(entries, args.step_types,
+                    SummarizeEntries(
-                                     args.elapsed_time_sorting)
+                        entries, args.step_types, args.elapsed_time_sorting
                    )
        except IOError:
-            print('Log file %r not found, no build summary created.' % log_file)
+            print("Log file %r not found, no build summary created." % log_file)
            return errno.ENOENT
-if __name__ == '__main__':
+if __name__ == "__main__":
    sys.exit(main())
--- a/ci/sccache_hit_rate.sh
+++ b/ci/sccache_hit_rate.sh
@@ -37,5 +37,5 @@ if [ $requests_diff -eq 0 ]; then
 else
    hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}')
    echo "sccache hit rate: $hit_rate%" >&2
-    echo "$hit_rate" 
+    echo "$hit_rate"
 fi
--- a/cmake/DetectSupportedStandards.cmake
+++ b/cmake/DetectSupportedStandards.cmake
@@ -1,4 +1,4 @@
-# Detect the langauge standards supported by the current compilers.
+# Detect the language standards supported by the current compilers.
 #
 # Usage: detect_supported_cxx_standards(<var_prefix> <lang> <standards>)
 #
--- a/cmake/NVBenchUtilities.cmake
+++ b/cmake/NVBenchUtilities.cmake
@@ -14,7 +14,7 @@
 # limitations under the License.
 # Passes all args directly to execute_process while setting up the following
-# results variables and propogating them to the caller's scope:
+# results variables and propagating them to the caller's scope:
 #
 # - nvbench_process_exit_code
 # - nvbench_process_stdout
--- a/cmake/header_test.in.cxx
+++ b/cmake/header_test.in.cxx
@@ -9,9 +9,9 @@
 // a potential macro collision and halts.
 //
 // Hacky way to build a string, but it works on all tested platforms.
-#define NVBench_MACRO_CHECK(MACRO, HEADER)                                      \
+#define NVBench_MACRO_CHECK(MACRO, HEADER)                                                         \
-  NVBench_MACRO_CHECK_IMPL(Identifier MACRO should not be used from NVBench      \
+  NVBench_MACRO_CHECK_IMPL(                                                                        \
-                           headers due to conflicts with HEADER macros.)
+    Identifier MACRO should not be used from NVBench headers due to conflicts with HEADER macros.)
 // Use raw platform checks instead of the NVBench_HOST_COMPILER macros since we
 // don't want to #include any headers other than the one being tested.
@@ -34,8 +34,8 @@
 // library implementations unconditionally `#undef` these macros, which then
 // causes random failures later.
 // Leaving these commented out as a warning: Here be dragons.
-//#define min(...) NVBench_MACRO_CHECK('min', windows.h)
+// #define min(...) NVBench_MACRO_CHECK('min', windows.h)
-//#define max(...) NVBench_MACRO_CHECK('max', windows.h)
+// #define max(...) NVBench_MACRO_CHECK('max', windows.h)
 // termios.h conflicts (NVIDIA/thrust#1547)
 #define B0 NVBench_MACRO_CHECK("B0", termios.h)
--- a/docs/cli_help.md
+++ b/docs/cli_help.md
@@ -90,7 +90,7 @@
    before any `--benchmark` arguments.
 * `--stopping-criterion <criterion>`
-  * After `--min-samples` is satisfied, use `<criterion>` to detect if enough 
+  * After `--min-samples` is satisfied, use `<criterion>` to detect if enough
    samples were collected.
  * Only applies to Cold measurements.
  * Default is stdrel (`--stopping-criterion stdrel`)
--- a/examples/auto_throughput.cu
+++ b/examples/auto_throughput.cu
@@ -24,37 +24,33 @@
 template <int ItemsPerThread>
 __global__ void kernel(std::size_t stride,
                       std::size_t elements,
-                       const nvbench::int32_t * __restrict__ in,
+                       const nvbench::int32_t *__restrict__ in,
                       nvbench::int32_t *__restrict__ out)
 {
-  const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+  const std::size_t tid  = threadIdx.x + blockIdx.x * blockDim.x;
  const std::size_t step = gridDim.x * blockDim.x;
-  for (std::size_t i = stride * tid;
+  for (std::size_t i = stride * tid; i < stride * elements; i += stride * step)
       i < stride * elements;
       i += stride * step)
  {
    for (int j = 0; j < ItemsPerThread; j++)
    {
-      const auto read_id = (ItemsPerThread * i + j) % elements;
+      const auto read_id  = (ItemsPerThread * i + j) % elements;
      const auto write_id = tid + j * elements;
-      out[write_id] = in[read_id];
+      out[write_id]       = in[read_id];
    }
  }
 }
 // `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
 // and cache hit rates.
 //
 // Calling state.collect_*() enables particular metric collection if nvbench
 // was built with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
 template <int ItemsPerThread>
-void throughput_bench(nvbench::state &state,
+void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
                      nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
 {
  // Allocate input data:
-  const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
+  const std::size_t stride   = static_cast<std::size_t>(state.get_int64("Stride"));
  const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> input(elements);
  thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
@@ -72,12 +68,11 @@ void throughput_bench(nvbench::state &state,
    static_cast<int>((elements + threads_in_block - 1) / threads_in_block);
  state.exec([&](nvbench::launch &launch) {
-    kernel<ItemsPerThread>
+    kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
-      <<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
+      stride,
-        stride,
+      elements,
-        elements,
+      thrust::raw_pointer_cast(input.data()),
-        thrust::raw_pointer_cast(input.data()),
+      thrust::raw_pointer_cast(output.data()));
        thrust::raw_pointer_cast(output.data()));
  });
 }
--- a/examples/axes.cu
+++ b/examples/axes.cu
@@ -71,18 +71,16 @@ void copy_sweep_grid_shape(nvbench::state &state)
  thrust::device_vector<nvbench::int32_t> in(num_values, 0);
  thrust::device_vector<nvbench::int32_t> out(num_values, 0);
-  state.exec(
+  state.exec([block_size,
-    [block_size,
+              num_blocks,
-     num_blocks,
+              num_values,
-     num_values,
+              in_ptr  = thrust::raw_pointer_cast(in.data()),
-     in_ptr  = thrust::raw_pointer_cast(in.data()),
+              out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
-     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
-      (void) num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(in_ptr,
-      nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
+                                                                             out_ptr,
-        in_ptr,
+                                                                             num_values);
-        out_ptr,
+  });
        num_values);
    });
 }
 NVBENCH_BENCH(copy_sweep_grid_shape)
  // Every second power of two from  64->1024:
@@ -107,15 +105,12 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
  thrust::device_vector<ValueType> in(num_values, 0);
  thrust::device_vector<ValueType> out(num_values, 0);
-  state.exec(
+  state.exec([num_values,
-    [num_values,
+              in_ptr  = thrust::raw_pointer_cast(in.data()),
-     in_ptr  = thrust::raw_pointer_cast(in.data()),
+              out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
-     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
-      (void) num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
-      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
+  });
                                                                 out_ptr,
                                                                 num_values);
    });
 }
 // Define a type_list to use for the type axis:
 using cts_types = nvbench::type_list<nvbench::uint8_t,
@@ -131,11 +126,10 @@ NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
 // Convert 64 MiB of InputTypes to OutputTypes, represented with various
 // value_types.
 template <typename InputType, typename OutputType>
-void copy_type_conversion_sweep(nvbench::state &state,
+void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
                                nvbench::type_list<InputType, OutputType>)
 {
  // Optional: Skip narrowing conversions.
-  if constexpr(sizeof(InputType) > sizeof(OutputType))
+  if constexpr (sizeof(InputType) > sizeof(OutputType))
  {
    state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
    return;
@@ -154,15 +148,12 @@ void copy_type_conversion_sweep(nvbench::state &state,
  thrust::device_vector<InputType> in(num_values, 0);
  thrust::device_vector<OutputType> out(num_values, 0);
-  state.exec(
+  state.exec([num_values,
-    [num_values,
+              in_ptr  = thrust::raw_pointer_cast(in.data()),
-     in_ptr  = thrust::raw_pointer_cast(in.data()),
+              out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
-     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+    (void)num_values; // clang thinks this is unused...
-      (void) num_values; // clang thinks this is unused...
+    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
-      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
+  });
                                                                 out_ptr,
                                                                 num_values);
    });
 }
 // Optional: Skip when InputType == OutputType. This approach avoids
 // instantiating the benchmark at all.
@@ -178,6 +169,5 @@ using ctcs_types = nvbench::type_list<nvbench::int8_t,
                                      nvbench::float32_t,
                                      nvbench::int64_t,
                                      nvbench::float64_t>;
-NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
+NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
                    NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
  .set_type_axes_names({"In", "Out"});
--- a/examples/custom_criterion.cu
+++ b/examples/custom_criterion.cu
@@ -36,10 +36,7 @@ public:
 protected:
  // Setup the criterion in the `do_initialize()` method:
-  virtual void do_initialize() override
+  virtual void do_initialize() override { m_num_samples = 0; }
  {
    m_num_samples = 0;
  }
  // Process new measurements in the `add_measurement()` method:
  virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
@@ -52,7 +49,6 @@ protected:
  {
    return m_num_samples >= m_params.get_int64("max-samples");
  }
 };
 // Register the criterion with NVBench:
@@ -71,7 +67,7 @@ void throughput_bench(nvbench::state &state)
  state.add_global_memory_writes<nvbench::int32_t>(num_values);
  state.exec(nvbench::exec_tag::no_batch, [&input, &output, num_values](nvbench::launch &launch) {
-    (void) num_values; // clang thinks this is unused...
+    (void)num_values; // clang thinks this is unused...
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
      thrust::raw_pointer_cast(input.data()),
      thrust::raw_pointer_cast(output.data()),
--- a/examples/enums.cu
+++ b/examples/enums.cu
@@ -17,7 +17,6 @@
 */
 #include <nvbench/nvbench.cuh>
 #include <nvbench/test_kernels.cuh>
 // Enum to use as parameter axis:
@@ -68,12 +67,10 @@ void runtime_enum_sweep_string(nvbench::state &state)
  // Create inputs, etc, configure runtime kernel parameters, etc.
  // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
+  state.exec(
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
  });
 }
-NVBENCH_BENCH(runtime_enum_sweep_string)
+NVBENCH_BENCH(runtime_enum_sweep_string).add_string_axis("MyEnum", {"A", "B", "C"});
  .add_string_axis("MyEnum", {"A", "B", "C"});
 //==============================================================================
 // Sweep through enum values at runtime using an int64 axis.
@@ -97,9 +94,8 @@ void runtime_enum_sweep_int64(nvbench::state &state)
  // Create inputs, etc, configure runtime kernel parameters, etc.
  // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
+  state.exec(
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
  });
 }
 NVBENCH_BENCH(runtime_enum_sweep_int64)
  .add_int64_axis("MyEnum",
@@ -178,12 +174,10 @@ void compile_time_enum_sweep(nvbench::state &state,
  // Template parameters, static dispatch, etc.
  // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
+  state.exec(
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
  });
 }
-using MyEnumList =
+using MyEnumList = nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
  nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
 NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
  .set_type_axes_names({"MyEnum"});
@@ -199,16 +193,14 @@ NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
 //  * `-12` (struct std::integral_constant<int,-12>)
 // ```
 template <nvbench::int32_t IntValue>
-void compile_time_int_sweep(nvbench::state &state,
+void compile_time_int_sweep(nvbench::state &state, nvbench::type_list<nvbench::enum_type<IntValue>>)
                            nvbench::type_list<nvbench::enum_type<IntValue>>)
 {
  // Use IntValue in compile time contexts.
  // Template parameters, static dispatch, etc.
  // Just a dummy kernel.
-  state.exec([](nvbench::launch &launch) {
+  state.exec(
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
  });
 }
 using MyInts = nvbench::enum_type_list<0, 16, 4096, -12>;
 NVBENCH_BENCH_TYPES(compile_time_int_sweep, NVBENCH_TYPE_AXES(MyInts))
--- a/examples/exec_tag_sync.cu
+++ b/examples/exec_tag_sync.cu
@@ -53,9 +53,7 @@ void sequence_bench(nvbench::state &state)
  // nvbench::exec_tag::sync indicates that this will implicitly sync:
  state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
-    thrust::sequence(thrust::device.on(launch.get_stream()),
+    thrust::sequence(thrust::device.on(launch.get_stream()), data.begin(), data.end());
                     data.begin(),
                     data.end());
  });
 }
 NVBENCH_BENCH(sequence_bench);
--- a/examples/exec_tag_timer.cu
+++ b/examples/exec_tag_timer.cu
@@ -23,8 +23,8 @@
 // Thrust simplifies memory management, etc:
 #include <thrust/copy.h>
 #include <thrust/execution_policy.h>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/sequence.h>
 // mod2_inplace performs an in-place mod2 over every element in `data`. `data`
@@ -54,7 +54,7 @@ void mod2_inplace(nvbench::state &state)
  state.exec(nvbench::exec_tag::timer,
             // Lambda now takes a `timer` argument:
             [&input, &data, num_values](nvbench::launch &launch, auto &timer) {
-               (void) num_values; // clang thinks this is unused...
+               (void)num_values; // clang thinks this is unused...
               // Reset working data:
               thrust::copy(thrust::device.on(launch.get_stream()),
--- a/examples/skip.cu
+++ b/examples/skip.cu
@@ -72,14 +72,12 @@ NVBENCH_BENCH(runtime_skip)
 // Two type axes are swept, but configurations where InputType == OutputType are
 // skipped.
 template <typename InputType, typename OutputType>
-void skip_overload(nvbench::state &state,
+void skip_overload(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
                   nvbench::type_list<InputType, OutputType>)
 {
  // This is a contrived example that focuses on the skip overloads, so this is
  // just a sleep kernel:
-  state.exec([](nvbench::launch &launch) {
+  state.exec(
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
  });
 }
 // Overload of skip_overload that is called when InputType == OutputType.
 template <typename T>
@@ -107,9 +105,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
 {
  // This is a contrived example that focuses on the skip overloads, so this is
  // just a sleep kernel:
-  state.exec([](nvbench::launch &launch) {
+  state.exec(
-    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
+    [](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
  });
 }
 // Enable this overload if InputType is larger than OutputType
 template <typename InputType, typename OutputType>
@@ -119,10 +116,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
  state.skip("sizeof(InputType) > sizeof(OutputType).");
 }
 // The same type_list is used for both inputs/outputs.
-using sn_types = nvbench::type_list<nvbench::int8_t,
+using sn_types =
-                                    nvbench::int16_t,
+  nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
                                    nvbench::int32_t,
                                    nvbench::int64_t>;
 // Setup benchmark:
 NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
  .set_type_axes_names({"In", "Out"});
--- a/examples/stream.cu
+++ b/examples/stream.cu
@@ -52,7 +52,7 @@ void stream_bench(nvbench::state &state)
  state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
  state.exec([&input, &output, num_values](nvbench::launch &) {
-    (void) num_values; // clang thinks this is unused...
+    (void)num_values; // clang thinks this is unused...
    copy(thrust::raw_pointer_cast(input.data()),
         thrust::raw_pointer_cast(output.data()),
         num_values);
--- a/examples/summaries.cu
+++ b/examples/summaries.cu
@@ -26,8 +26,8 @@
 void summary_example(nvbench::state &state)
 {
  // Fetch parameters and compute duration in seconds:
-  const auto ms = static_cast<nvbench::float64_t>(state.get_int64("ms"));
+  const auto ms       = static_cast<nvbench::float64_t>(state.get_int64("ms"));
-  const auto us = static_cast<nvbench::float64_t>(state.get_int64("us"));
+  const auto us       = static_cast<nvbench::float64_t>(state.get_int64("us"));
  const auto duration = ms * 1e-3 + us * 1e-6;
  // Add a new column to the summary table with the derived duration used by the benchmark.
--- a/examples/throughput.cu
+++ b/examples/throughput.cu
@@ -51,7 +51,7 @@ void throughput_bench(nvbench::state &state)
  state.add_global_memory_writes<nvbench::int32_t>(num_values);
  state.exec([&input, &output, num_values](nvbench::launch &launch) {
-    (void) num_values; // clang thinks this is unused...
+    (void)num_values; // clang thinks this is unused...
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
      thrust::raw_pointer_cast(input.data()),
      thrust::raw_pointer_cast(output.data()),
--- a/exec/nvbench-ctl.cu
+++ b/exec/nvbench-ctl.cu
@@ -1,20 +1,20 @@
 /*
-*  Copyright 2021 NVIDIA Corporation
+ *  Copyright 2021 NVIDIA Corporation
-*
+ *
-*  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
-*  (the "License"); you may not use this file except in compliance with
+ *  (the "License"); you may not use this file except in compliance with
-*  the License.
+ *  the License.
-*
+ *
-*  You may obtain a copy of the License at
+ *  You may obtain a copy of the License at
-*
+ *
-*      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
-*
+ *
-*  Unless required by applicable law or agreed to in writing, software
+ *  Unless required by applicable law or agreed to in writing, software
-*  distributed under the License is distributed on an "AS IS" BASIS,
+ *  distributed under the License is distributed on an "AS IS" BASIS,
-*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-*  See the License for the specific language governing permissions and
+ *  See the License for the specific language governing permissions and
-*  limitations under the License.
+ *  limitations under the License.
-*/
+ */
 #include <nvbench/main.cuh>
@@ -24,7 +24,7 @@ int main(int argc, char const *const *argv)
 try
 {
  // If no args, substitute a new argv that prints the version
-  std::vector<const char*> alt_argv;
+  std::vector<const char *> alt_argv;
  if (argc == 1)
  {
    alt_argv.push_back("--version");
@@ -36,7 +36,7 @@ try
  NVBENCH_CUDA_CALL(cudaDeviceReset());
  return 0;
 }
-catch (std::exception & e)
+catch (std::exception &e)
 {
  std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";
  return 1;
--- a/nvbench/axes_metadata.cxx
+++ b/nvbench/axes_metadata.cxx
@@ -19,13 +19,13 @@
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <fmt/format.h>
 #include <fmt/ranges.h>
 #include <algorithm>
 #include <cassert>
 #include <stdexcept>
 #include <fmt/format.h>
 #include <fmt/ranges.h>
 namespace nvbench
 {
--- a/nvbench/axis_base.cxx
+++ b/nvbench/axis_base.cxx
@@ -16,7 +16,7 @@
 *  limitations under the License.
 */
-#include "axis_base.cuh"
+#include <nvbench/axis_base.cuh>
 namespace nvbench
 {
--- a/nvbench/benchmark.cuh
+++ b/nvbench/benchmark.cuh
@@ -18,9 +18,8 @@
 #pragma once
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/runner.cuh>
 #include <nvbench/type_list.cuh>
--- a/nvbench/benchmark_manager.cuh
+++ b/nvbench/benchmark_manager.cuh
@@ -45,7 +45,7 @@ struct benchmark_manager
   * benchmarks should be done here to avoid creating a CUDA context before we configure the CUDA
   * environment in `main`.
   */
-   void initialize();
+  void initialize();
  /**
   * Register a new benchmark.
--- a/nvbench/benchmark_manager.cxx
+++ b/nvbench/benchmark_manager.cxx
@@ -17,9 +17,8 @@
 */
 #include <nvbench/benchmark_manager.cuh>
 #include <nvbench/device_manager.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/device_manager.cuh>
 #include <fmt/format.h>
@@ -37,8 +36,8 @@ benchmark_manager &benchmark_manager::get()
 void benchmark_manager::initialize()
 {
-  const auto& mgr = device_manager::get();
+  const auto &mgr = device_manager::get();
-  for (auto& bench : m_benchmarks)
+  for (auto &bench : m_benchmarks)
  {
    if (!bench->get_is_cpu_only())
    {
--- a/nvbench/blocking_kernel.cu
+++ b/nvbench/blocking_kernel.cu
@@ -17,12 +17,10 @@
 */
 #include <nvbench/blocking_kernel.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_stream.cuh>
 #include <nvbench/types.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/types.cuh>
 #include <cuda/std/chrono>
--- a/nvbench/criterion_manager.cuh
+++ b/nvbench/criterion_manager.cuh
@@ -24,7 +24,6 @@
 #include <nvbench/types.cuh>
 #include <memory>
 #include <unordered_map>
 namespace nvbench
@@ -40,14 +39,14 @@ public:
  /**
   * @return The singleton criterion_manager instance.
   */
-  static criterion_manager& get();
+  static criterion_manager &get();
  /**
   * Register a new stopping criterion.
   */
-  nvbench::stopping_criterion_base& add(std::unique_ptr<nvbench::stopping_criterion_base> criterion);
+  nvbench::stopping_criterion_base &add(std::unique_ptr<nvbench::stopping_criterion_base> criterion);
-  nvbench::stopping_criterion_base& get_criterion(const std::string& name);
+  nvbench::stopping_criterion_base &get_criterion(const std::string &name);
-  const nvbench::stopping_criterion_base& get_criterion(const std::string& name) const;
+  const nvbench::stopping_criterion_base &get_criterion(const std::string &name) const;
  using params_description = std::vector<std::pair<std::string, nvbench::named_values::type>>;
  params_description get_params_description() const;
--- a/nvbench/criterion_manager.cxx
+++ b/nvbench/criterion_manager.cxx
@@ -41,7 +41,7 @@ criterion_manager &criterion_manager::get()
  return registry;
 }
-stopping_criterion_base& criterion_manager::get_criterion(const std::string& name)
+stopping_criterion_base &criterion_manager::get_criterion(const std::string &name)
 {
  auto iter = m_map.find(name);
  if (iter == m_map.end())
@@ -51,7 +51,8 @@ stopping_criterion_base& criterion_manager::get_criterion(const std::string& nam
  return *iter->second.get();
 }
-const nvbench::stopping_criterion_base& criterion_manager::get_criterion(const std::string& name) const
+const nvbench::stopping_criterion_base &
 criterion_manager::get_criterion(const std::string &name) const
 {
  auto iter = m_map.find(name);
  if (iter == m_map.end())
@@ -69,8 +70,7 @@ stopping_criterion_base &criterion_manager::add(std::unique_ptr<stopping_criteri
  if (!success)
  {
-    NVBENCH_THROW(std::runtime_error,
+    NVBENCH_THROW(std::runtime_error, "Stopping criterion \"{}\" is already registered.", name);
                  "Stopping criterion \"{}\" is already registered.", name);
  }
  return *it->second.get();
--- a/nvbench/csv_printer.cu
+++ b/nvbench/csv_printer.cu
@@ -16,14 +16,12 @@
 *  limitations under the License.
 */
 #include <nvbench/csv_printer.cuh>
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/csv_printer.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/summary.cuh>
 #include <nvbench/internal/table_builder.cuh>
 #include <nvbench/summary.cuh>
 #include <fmt/format.h>
@@ -169,7 +167,10 @@ void csv_printer::do_print_benchmark_results(const benchmark_vector &benches)
      std::size_t remaining = table.m_columns.size();
      for (const auto &col : table.m_columns)
      {
-        fmt::format_to(std::back_inserter(buffer), "{}{}", col.rows[i], (--remaining == 0) ? "" : ",");
+        fmt::format_to(std::back_inserter(buffer),
                       "{}{}",
                       col.rows[i],
                       (--remaining == 0) ? "" : ",");
      }
      fmt::format_to(std::back_inserter(buffer), "\n");
    }
--- a/nvbench/cuda_timer.cuh
+++ b/nvbench/cuda_timer.cuh
@@ -19,7 +19,6 @@
 #pragma once
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/types.cuh>
 #include <cuda_runtime_api.h>
--- a/nvbench/cupti_profiler.cxx
+++ b/nvbench/cupti_profiler.cxx
@@ -17,7 +17,6 @@
 */
 #include <nvbench/cupti_profiler.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/device_info.cuh>
@@ -54,7 +53,9 @@ void nvpw_call(const NVPA_Status status)
 {
  if (status != NVPA_STATUS_SUCCESS)
  {
-    NVBENCH_THROW(std::runtime_error, "NVPW call returned error: {}", static_cast<std::underlying_type_t<NVPA_Status>>(status));
+    NVBENCH_THROW(std::runtime_error,
                  "NVPW call returned error: {}",
                  static_cast<std::underlying_type_t<NVPA_Status>>(status));
  }
 }
--- a/nvbench/detail/entropy_criterion.cuh
+++ b/nvbench/detail/entropy_criterion.cuh
@@ -18,9 +18,9 @@
 #pragma once
 #include <nvbench/types.cuh>
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/detail/ring_buffer.cuh>
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/types.cuh>
 #include <vector>
@@ -38,7 +38,7 @@ class entropy_criterion final : public stopping_criterion_base
  nvbench::detail::ring_buffer<nvbench::float64_t> m_entropy_tracker{299};
  // Used to avoid re-allocating temporary memory
-  std::vector<nvbench::float64_t> m_probabilities; 
+  std::vector<nvbench::float64_t> m_probabilities;
  nvbench::float64_t compute_entropy();
@@ -49,7 +49,6 @@ protected:
  virtual void do_initialize() override;
  virtual void do_add_measurement(nvbench::float64_t measurement) override;
  virtual bool do_is_finished() override;
 };
 } // namespace nvbench::detail
--- a/nvbench/detail/entropy_criterion.cxx
+++ b/nvbench/detail/entropy_criterion.cxx
@@ -21,7 +21,6 @@
 #include <cmath>
 namespace nvbench::detail
 {
@@ -40,7 +39,7 @@ void entropy_criterion::do_initialize()
  m_freq_tracker.clear();
 }
-nvbench::float64_t entropy_criterion::compute_entropy() 
+nvbench::float64_t entropy_criterion::compute_entropy()
 {
  const std::size_t n = m_freq_tracker.size();
  if (n == 0)
@@ -70,15 +69,15 @@ void entropy_criterion::do_add_measurement(nvbench::float64_t measurement)
  m_total_cuda_time += measurement;
  {
-    auto key = measurement;
+    auto key                = measurement;
    constexpr bool bin_keys = false;
-    if (bin_keys) 
+    if (bin_keys)
    {
      const auto resolution_us = 0.5;
-      const auto resulution_s = resolution_us / 1'000'000;
+      const auto resulution_s  = resolution_us / 1000000;
-      const auto epsilon = resulution_s * 2;
+      const auto epsilon       = resulution_s * 2;
-      key = std::round(key / epsilon) * epsilon;
+      key                      = std::round(key / epsilon) * epsilon;
    }
    // This approach is about 3x faster than `std::{unordered_,}map`
@@ -120,7 +119,7 @@ bool entropy_criterion::do_is_finished()
  const auto [slope, intercept] = statistics::compute_linear_regression(begin, end, mean);
-  if (statistics::slope2deg(slope) > m_params.get_float64("max-angle")) 
+  if (statistics::slope2deg(slope) > m_params.get_float64("max-angle"))
  {
    return false;
  }
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -25,13 +25,13 @@
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 #include <fmt/format.h>
 #include <algorithm>
 #include <chrono>
 #include <limits>
 #include <thread>
 #include <fmt/format.h>
 namespace nvbench::detail
 {
--- a/nvbench/detail/measure_cold.cuh
+++ b/nvbench/detail/measure_cold.cuh
@@ -18,8 +18,6 @@
 #pragma once
 #include <cuda_runtime.h>
 #include <nvbench/blocking_kernel.cuh>
 #include <nvbench/cpu_timer.cuh>
 #include <nvbench/cuda_call.cuh>
@@ -32,12 +30,13 @@
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/types.cuh>
 #include <cuda_runtime.h>
 #include <utility>
 #include <vector>
 #include "nvbench/types.cuh"
 namespace nvbench
 {
--- a/nvbench/detail/measure_cpu_only.cuh
+++ b/nvbench/detail/measure_cpu_only.cuh
@@ -19,13 +19,12 @@
 #pragma once
 #include <nvbench/cpu_timer.cuh>
 #include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
 #include <nvbench/detail/statistics.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
 #include <nvbench/detail/statistics.cuh>
 #include <utility>
 #include <vector>
@@ -66,7 +65,7 @@ protected:
  nvbench::cpu_timer m_walltime_timer;
  nvbench::criterion_params m_criterion_params;
-  nvbench::stopping_criterion_base& m_stopping_criterion;
+  nvbench::stopping_criterion_base &m_stopping_criterion;
  bool m_run_once{false};
--- a/nvbench/detail/measure_cpu_only.cxx
+++ b/nvbench/detail/measure_cpu_only.cxx
@@ -24,11 +24,11 @@
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 #include <fmt/format.h>
 #include <algorithm>
 #include <limits>
 #include <fmt/format.h>
 namespace nvbench::detail
 {
@@ -36,7 +36,8 @@ measure_cpu_only_base::measure_cpu_only_base(state &exec_state)
    : m_state{exec_state}
    , m_launch(m_state.get_cuda_stream())
    , m_criterion_params{exec_state.get_criterion_params()}
-    , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(exec_state.get_stopping_criterion())}
+    , m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(
        exec_state.get_stopping_criterion())}
    , m_run_once{exec_state.get_run_once()}
    , m_min_samples{exec_state.get_min_samples()}
    , m_skip_time{exec_state.get_skip_time()}
@@ -72,7 +73,7 @@ void measure_cpu_only_base::run_trials_prologue() { m_walltime_timer.start(); }
 void measure_cpu_only_base::record_measurements()
 {
  // Update and record timers and counters:
-  const auto cur_cpu_time  = m_cpu_timer.get_duration();
+  const auto cur_cpu_time = m_cpu_timer.get_duration();
  m_min_cpu_time = std::min(m_min_cpu_time, cur_cpu_time);
  m_max_cpu_time = std::max(m_max_cpu_time, cur_cpu_time);
@@ -188,8 +189,7 @@ void measure_cpu_only_base::generate_summaries()
      auto &summ = m_state.add_summary("nv/cpu_only/bw/global/bytes_per_second");
      summ.set_string("name", "GlobalMem BW");
      summ.set_string("hint", "byte_rate");
-      summ.set_string("description",
+      summ.set_string("description", "Number of bytes read/written per second.");
                      "Number of bytes read/written per second.");
      summ.set_float64("value", avg_used_gmem_bw);
    }
  } // bandwidth
@@ -210,9 +210,9 @@ void measure_cpu_only_base::generate_summaries()
    if (m_max_time_exceeded)
    {
-      const auto timeout = m_walltime_timer.get_duration();
+      const auto timeout   = m_walltime_timer.get_duration();
      const auto max_noise = m_criterion_params.get_float64("max-noise");
-      const auto min_time = m_criterion_params.get_float64("min-time");
+      const auto min_time  = m_criterion_params.get_float64("min-time");
      if (cpu_noise > max_noise)
      {
--- a/nvbench/detail/measure_cupti.cuh
+++ b/nvbench/detail/measure_cupti.cuh
@@ -24,13 +24,12 @@
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_timer.cuh>
 #include <nvbench/cupti_profiler.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
 #include <nvbench/detail/kernel_launcher_timer_wrapper.cuh>
 #include <nvbench/detail/l2flush.cuh>
 #include <nvbench/detail/statistics.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/exec_tag.cuh>
 #include <nvbench/launch.cuh>
 #include <cuda_runtime.h>
--- a/nvbench/detail/measure_hot.cu
+++ b/nvbench/detail/measure_hot.cu
@@ -16,9 +16,8 @@
 *  limitations under the License.
 */
 #include <nvbench/detail/measure_hot.cuh>
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/detail/measure_hot.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/printer_base.cuh>
--- a/nvbench/detail/ring_buffer.cuh
+++ b/nvbench/detail/ring_buffer.cuh
@@ -19,12 +19,11 @@
 #pragma once
 #include <nvbench/config.cuh>
 #include <nvbench/detail/statistics.cuh>
 #include <cassert>
 #include <cstddef>
 #include <iterator>
 #include <cassert>
 #include <vector>
 namespace nvbench::detail
@@ -76,14 +75,14 @@ public:
    return temp;
  }
-  ring_buffer_iterator operator+(difference_type n) const 
+  ring_buffer_iterator operator+(difference_type n) const
-  { 
+  {
-    return ring_buffer_iterator(m_index + n, m_capacity, m_ptr); 
+    return ring_buffer_iterator(m_index + n, m_capacity, m_ptr);
  }
-  ring_buffer_iterator operator-(difference_type n) const 
+  ring_buffer_iterator operator-(difference_type n) const
-  { 
+  {
-    return ring_buffer_iterator(m_index - n, m_capacity, m_ptr); 
+    return ring_buffer_iterator(m_index - n, m_capacity, m_ptr);
  }
  difference_type operator-(const ring_buffer_iterator &other) const
@@ -121,13 +120,9 @@ private:
  std::size_t m_index{0};
  bool m_full{false};
-  std::size_t get_front_index() const 
+  std::size_t get_front_index() const { return m_full ? m_index : 0; }
  {
    return m_full ? m_index : 0;
  }
 public:
  /**
   * Create a new ring buffer with the requested capacity.
   */
--- a/nvbench/detail/state_generator.cxx
+++ b/nvbench/detail/state_generator.cxx
@@ -16,15 +16,13 @@
 *  limitations under the License.
 */
 #include <nvbench/detail/state_generator.cuh>
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/detail/state_generator.cuh>
 #include <nvbench/detail/transform_reduce.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/named_values.cuh>
 #include <nvbench/type_axis.cuh>
 #include <nvbench/detail/transform_reduce.cuh>
 #include <algorithm>
 #include <cassert>
 #include <functional>
@@ -165,7 +163,7 @@ void state_generator::build_axis_configs()
        config.set_string(axis_info.axis, axis.get_input_string(axis_info.index));
      }
    } // type_si
-  }   // type_axis_config generation
+  } // type_axis_config generation
  // non_type_axis_config generation
  {
@@ -201,9 +199,9 @@ void state_generator::build_axis_configs()
                              axes.get_string_axis(axis_info.axis).get_value(axis_info.index));
            break;
        } // switch (type)
-      }   // for (axis_info : current_indices)
+      } // for (axis_info : current_indices)
-    }     // for non_type_sg configs
+    } // for non_type_sg configs
-  }       // non_type_axis_config generation
+  } // non_type_axis_config generation
 }
 void state_generator::build_states()
--- a/nvbench/detail/statistics.cuh
+++ b/nvbench/detail/statistics.cuh
@@ -26,12 +26,10 @@
 #include <iterator>
 #include <limits>
 #include <numeric>
 #include <cmath>
 #include <type_traits>
 #ifndef M_PI
-  #define M_PI 3.14159265358979323846
+#define M_PI 3.14159265358979323846
 #endif
 namespace nvbench::detail::statistics
@@ -154,7 +152,7 @@ nvbench::float64_t compute_r2(It first,
  for (std::size_t i = 0; i < n; ++i, ++first)
  {
-    const nvbench::float64_t y = *first;
+    const nvbench::float64_t y      = *first;
    const nvbench::float64_t y_pred = slope * static_cast<nvbench::float64_t>(i) + intercept;
    ss_tot += (y - mean_y) * (y - mean_y);
@@ -179,19 +177,10 @@ compute_r2(It first, It last, nvbench::float64_t slope, nvbench::float64_t inter
  return compute_r2(first, last, compute_mean(first, last), slope, intercept);
 }
-inline nvbench::float64_t rad2deg(nvbench::float64_t rad)
+inline nvbench::float64_t rad2deg(nvbench::float64_t rad) { return rad * 180.0 / M_PI; }
 {
  return rad * 180.0 / M_PI;
 }
-inline nvbench::float64_t slope2rad(nvbench::float64_t slope)
+inline nvbench::float64_t slope2rad(nvbench::float64_t slope) { return std::atan2(slope, 1.0); }
 {
  return std::atan2(slope, 1.0);
 }
-inline nvbench::float64_t slope2deg(nvbench::float64_t slope)
+inline nvbench::float64_t slope2deg(nvbench::float64_t slope) { return rad2deg(slope2rad(slope)); }
 {
  return rad2deg(slope2rad(slope));
 }
 } // namespace nvbench::detail::statistics
--- a/nvbench/detail/stdrel_criterion.cuh
+++ b/nvbench/detail/stdrel_criterion.cuh
@@ -18,9 +18,9 @@
 #pragma once
 #include <nvbench/types.cuh>
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/detail/ring_buffer.cuh>
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/types.cuh>
 #include <vector>
--- a/nvbench/detail/stdrel_criterion.cxx
+++ b/nvbench/detail/stdrel_criterion.cxx
@@ -29,7 +29,7 @@ stdrel_criterion::stdrel_criterion()
 void stdrel_criterion::do_initialize()
 {
-  m_total_samples = 0;
+  m_total_samples   = 0;
  m_total_cuda_time = 0.0;
  m_cuda_times.clear();
  m_noise_tracker.clear();
@@ -46,7 +46,7 @@ void stdrel_criterion::do_add_measurement(nvbench::float64_t measurement)
  const auto cuda_stdev     = nvbench::detail::statistics::standard_deviation(m_cuda_times.cbegin(),
                                                                          m_cuda_times.cend(),
                                                                          mean_cuda_time);
-  const auto cuda_rel_stdev       = cuda_stdev / mean_cuda_time;
+  const auto cuda_rel_stdev = cuda_stdev / mean_cuda_time;
  if (std::isfinite(cuda_rel_stdev))
  {
    m_noise_tracker.push_back(cuda_rel_stdev);
--- a/nvbench/detail/throw.cuh
+++ b/nvbench/detail/throw.cuh
@@ -19,6 +19,7 @@
 #pragma once
 #include <fmt/format.h>
 #include <stdexcept>
 #define NVBENCH_THROW(exception_type, format_str, ...)                                             \
--- a/nvbench/detail/timestamps_kernel.cu
+++ b/nvbench/detail/timestamps_kernel.cu
@@ -16,13 +16,13 @@
 *  limitations under the License.
 */
 #include <cuda_runtime.h>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_stream.cuh>
 #include <nvbench/detail/timestamps_kernel.cuh>
 #include <nvbench/types.cuh>
 #include <cuda_runtime.h>
 #include <cstdio>
 #include <cstdlib>
@@ -71,12 +71,11 @@ void timestamps_kernel::record(const nvbench::cuda_stream &stream)
  int num_sms   = 0;
  NVBENCH_CUDA_CALL(cudaGetDevice(&device_id));
-  NVBENCH_CUDA_CALL(
+  NVBENCH_CUDA_CALL(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id));
    cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device_id));
  get_timestamps_kernel<<<static_cast<unsigned int>(num_sms), 1, 0, stream.get_stream()>>>(
    m_device_timestamps,
    m_device_timestamps + 1);
 }
-} // namespace nvbench
+} // namespace nvbench::detail
--- a/nvbench/detail/type_list_impl.cuh
+++ b/nvbench/detail/type_list_impl.cuh
@@ -82,7 +82,7 @@ struct cartesian_product<nvbench::type_list<nvbench::type_list<T, Tail...>, TL,
  using tail_prod = typename detail::cartesian_product<nvbench::type_list<TL, TLTail...>>::type;
  using cur       = typename detail::prepend_each<T, tail_prod>::type;
  using next      = typename detail::cartesian_product<
-    nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
+         nvbench::type_list<nvbench::type_list<Tail...>, TL, TLTail...>>::type;
  using type = decltype(detail::concat(cur{}, next{}));
 };
--- a/nvbench/device_info.cu
+++ b/nvbench/device_info.cu
@@ -16,11 +16,10 @@
 *  limitations under the License.
 */
 #include <nvbench/device_info.cuh>
 #include <nvbench/config.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/detail/device_scope.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/internal/nvml.cuh>
 #include <cuda_runtime_api.h>
--- a/nvbench/device_info.cuh
+++ b/nvbench/device_info.cuh
@@ -18,17 +18,16 @@
 #pragma once
 #include <cuda_runtime_api.h>
 #include <nvbench/config.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/detail/device_scope.cuh>
 #include <cuda_runtime_api.h>
 #include <cstdint> // CHAR_BIT
 #include <stdexcept>
 #include <utility>
 #include <string_view>
 #include <utility>
 // forward declare this for internal storage
 struct nvmlDevice_st;
--- a/nvbench/device_manager.cu
+++ b/nvbench/device_manager.cu
@@ -16,13 +16,12 @@
 *  limitations under the License.
 */
 #include <nvbench/device_manager.cuh>
 #include <cuda_runtime_api.h>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/detail/device_scope.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/device_manager.cuh>
 #include <cuda_runtime_api.h>
 namespace nvbench
 {
@@ -45,13 +44,13 @@ device_manager::device_manager()
  }
 }
-const nvbench::device_info &device_manager::get_device(int id) 
+const nvbench::device_info &device_manager::get_device(int id)
-{ 
+{
-  if (id < 0) 
+  if (id < 0)
  {
    NVBENCH_THROW(std::runtime_error, "Negative index: {}.", id);
  }
-  return m_devices.at(static_cast<std::size_t>(id)); 
+  return m_devices.at(static_cast<std::size_t>(id));
 }
 } // namespace nvbench
--- a/nvbench/exec_tag.cuh
+++ b/nvbench/exec_tag.cuh
@@ -101,10 +101,10 @@ using no_gpu_t        = tag<nvbench::detail::exec_flag::no_gpu>;
 using no_batch_t      = tag<nvbench::detail::exec_flag::no_batch>;
 using modifier_mask_t = tag<nvbench::detail::exec_flag::modifier_mask>;
-using hot_t           = tag<nvbench::detail::exec_flag::hot>;
+using hot_t          = tag<nvbench::detail::exec_flag::hot>;
-using cold_t          = tag<nvbench::detail::exec_flag::cold>;
+using cold_t         = tag<nvbench::detail::exec_flag::cold>;
-using cpu_only_t      = tag<nvbench::detail::exec_flag::cpu_only>;
+using cpu_only_t     = tag<nvbench::detail::exec_flag::cpu_only>;
-using measure_mask_t  = tag<nvbench::detail::exec_flag::measure_mask>;
+using measure_mask_t = tag<nvbench::detail::exec_flag::measure_mask>;
 constexpr inline none_t none;
 constexpr inline timer_t timer;
--- a/nvbench/float64_axis.cuh
+++ b/nvbench/float64_axis.cuh
@@ -19,7 +19,6 @@
 #pragma once
 #include <nvbench/axis_base.cuh>
 #include <nvbench/types.cuh>
 #include <vector>
@@ -40,7 +39,10 @@ struct float64_axis final : public axis_base
  [[nodiscard]] nvbench::float64_t get_value(std::size_t i) const { return m_values[i]; }
 private:
-  std::unique_ptr<axis_base> do_clone() const final { return std::make_unique<float64_axis>(*this); }
+  std::unique_ptr<axis_base> do_clone() const final
  {
    return std::make_unique<float64_axis>(*this);
  }
  std::size_t do_get_size() const final { return m_values.size(); }
  std::string do_get_input_string(std::size_t i) const final;
  std::string do_get_description(std::size_t i) const final;
--- a/nvbench/int64_axis.cuh
+++ b/nvbench/int64_axis.cuh
@@ -19,7 +19,6 @@
 #pragma once
 #include <nvbench/axis_base.cuh>
 #include <nvbench/flags.cuh>
 #include <nvbench/types.cuh>
--- a/nvbench/int64_axis.cxx
+++ b/nvbench/int64_axis.cxx
@@ -16,9 +16,8 @@
 *  limitations under the License.
 */
 #include <nvbench/int64_axis.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/int64_axis.cuh>
 #include <fmt/format.h>
--- a/nvbench/internal/markdown_table.cuh
+++ b/nvbench/internal/markdown_table.cuh
@@ -19,7 +19,6 @@
 #pragma once
 #include <nvbench/detail/transform_reduce.cuh>
 #include <nvbench/internal/table_builder.cuh>
 #include <fmt/color.h>
--- a/nvbench/internal/nvml.cuh
+++ b/nvbench/internal/nvml.cuh
@@ -21,12 +21,12 @@
 #include <nvbench/config.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <fmt/format.h>
 #ifdef NVBENCH_HAS_NVML
 #include <nvml.h>
 #endif // NVBENCH_HAS_NVML
 #include <fmt/format.h>
 #include <stdexcept>
 namespace nvbench::nvml
@@ -38,6 +38,7 @@ struct NVMLLifetimeManager
 {
  NVMLLifetimeManager();
  ~NVMLLifetimeManager();
 private:
  bool m_inited{false};
 };
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -16,24 +16,22 @@
 *  limitations under the License.
 */
 #include <nvbench/json_printer.cuh>
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/config.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/device_info.cuh>
 #include <nvbench/device_manager.cuh>
 #include <nvbench/git_revision.cuh>
 #include <nvbench/json_printer.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 #include <nvbench/version.cuh>
-#include <nvbench/detail/throw.cuh>
+#include <nlohmann/json.hpp>
 #include <fmt/format.h>
 #include <nlohmann/json.hpp>
 #include <cstdint>
 #include <fstream>
 #include <iterator>
@@ -105,7 +103,7 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values)
      default:
        NVBENCH_THROW(std::runtime_error, "{}", "Unrecognized value type.");
    } // end switch (value type)
-  }   // end foreach value name
+  } // end foreach value name
 }
 } // end namespace
@@ -225,27 +223,26 @@ static void add_devices_section(nlohmann::ordered_json &root)
  auto &devices = root["devices"];
  for (const auto &dev_info : nvbench::device_manager::get().get_devices())
  {
-    auto &device                    = devices.emplace_back();
+    auto &device                                = devices.emplace_back();
-    device["id"]                    = dev_info.get_id();
+    device["id"]                                = dev_info.get_id();
-    device["name"]                  = dev_info.get_name();
+    device["name"]                              = dev_info.get_name();
-    device["sm_version"]            = dev_info.get_sm_version();
+    device["sm_version"]                        = dev_info.get_sm_version();
-    device["ptx_version"]           = dev_info.get_ptx_version();
+    device["ptx_version"]                       = dev_info.get_ptx_version();
-    device["sm_default_clock_rate"] = dev_info.get_sm_default_clock_rate();
+    device["sm_default_clock_rate"]             = dev_info.get_sm_default_clock_rate();
-    device["number_of_sms"]         = dev_info.get_number_of_sms();
+    device["number_of_sms"]                     = dev_info.get_number_of_sms();
-    device["max_blocks_per_sm"]     = dev_info.get_max_blocks_per_sm();
+    device["max_blocks_per_sm"]                 = dev_info.get_max_blocks_per_sm();
-    device["max_threads_per_sm"]    = dev_info.get_max_threads_per_sm();
+    device["max_threads_per_sm"]                = dev_info.get_max_threads_per_sm();
-    device["max_threads_per_block"] = dev_info.get_max_threads_per_block();
+    device["max_threads_per_block"]             = dev_info.get_max_threads_per_block();
-    device["registers_per_sm"]      = dev_info.get_registers_per_sm();
+    device["registers_per_sm"]                  = dev_info.get_registers_per_sm();
-    device["registers_per_block"]   = dev_info.get_registers_per_block();
+    device["registers_per_block"]               = dev_info.get_registers_per_block();
-    device["global_memory_size"]    = dev_info.get_global_memory_size();
+    device["global_memory_size"]                = dev_info.get_global_memory_size();
-    device["global_memory_bus_peak_clock_rate"] =
+    device["global_memory_bus_peak_clock_rate"] = dev_info.get_global_memory_bus_peak_clock_rate();
-      dev_info.get_global_memory_bus_peak_clock_rate();
+    device["global_memory_bus_width"]           = dev_info.get_global_memory_bus_width();
-    device["global_memory_bus_width"]     = dev_info.get_global_memory_bus_width();
+    device["global_memory_bus_bandwidth"]       = dev_info.get_global_memory_bus_bandwidth();
-    device["global_memory_bus_bandwidth"] = dev_info.get_global_memory_bus_bandwidth();
+    device["l2_cache_size"]                     = dev_info.get_l2_cache_size();
-    device["l2_cache_size"]               = dev_info.get_l2_cache_size();
+    device["shared_memory_per_sm"]              = dev_info.get_shared_memory_per_sm();
-    device["shared_memory_per_sm"]        = dev_info.get_shared_memory_per_sm();
+    device["shared_memory_per_block"]           = dev_info.get_shared_memory_per_block();
-    device["shared_memory_per_block"]     = dev_info.get_shared_memory_per_block();
+    device["ecc_state"]                         = dev_info.get_ecc_state();
    device["ecc_state"]                   = dev_info.get_ecc_state();
  }
 }
@@ -298,8 +295,8 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
          false;
 #endif
      } // "nvbench"
-    }   // "version"
+    } // "version"
-  }     // "meta"
+  } // "meta"
  add_devices_section(root);
@@ -362,8 +359,8 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
            default:
              break;
          } // end switch (axis type)
-        }   // end foreach axis value
+        } // end foreach axis value
-      }     // end foreach axis
+      } // end foreach axis
      auto &states = bench["states"];
      for (const auto &exec_state : bench_ptr->get_states())
@@ -431,8 +428,8 @@ void json_printer::do_print_benchmark_results(const benchmark_vector &benches)
          continue;
        }
      } // end foreach exec_state
-    }   // end foreach benchmark
+    } // end foreach benchmark
-  }     // "benchmarks"
+  } // "benchmarks"
  m_ostream << root.dump(2) << "\n";
 }
@@ -492,7 +489,7 @@ void json_printer::do_print_benchmark_list(const benchmark_vector &benches)
          default:
            break;
        } // end switch (axis type)
-      }   // end foreach axis value
+      } // end foreach axis value
    }
  } // end foreach bench
--- a/nvbench/json_printer.cuh
+++ b/nvbench/json_printer.cuh
@@ -19,7 +19,6 @@
 #pragma once
 #include <nvbench/printer_base.cuh>
 #include <nvbench/types.cuh>
 #include <string>
--- a/nvbench/markdown_printer.cu
+++ b/nvbench/markdown_printer.cu
@@ -16,15 +16,13 @@
 *  limitations under the License.
 */
 #include <nvbench/markdown_printer.cuh>
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/device_manager.cuh>
 #include <nvbench/internal/markdown_table.cuh>
 #include <nvbench/markdown_printer.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/summary.cuh>
 #include <nvbench/internal/markdown_table.cuh>
 #include <fmt/color.h>
 #include <fmt/format.h>
@@ -72,8 +70,12 @@ void markdown_printer::do_print_device_info()
                   "* Max Shared Memory: {} KiB/SM, {} KiB/Block\n",
                   device.get_shared_memory_per_sm() / 1024,
                   device.get_shared_memory_per_block() / 1024);
-    fmt::format_to(std::back_inserter(buffer), "* L2 Cache Size: {} KiB\n", device.get_l2_cache_size() / 1024);
+    fmt::format_to(std::back_inserter(buffer),
-    fmt::format_to(std::back_inserter(buffer), "* Maximum Active Blocks: {}/SM\n", device.get_max_blocks_per_sm());
+                   "* L2 Cache Size: {} KiB\n",
                   device.get_l2_cache_size() / 1024);
    fmt::format_to(std::back_inserter(buffer),
                   "* Maximum Active Blocks: {}/SM\n",
                   device.get_max_blocks_per_sm());
    fmt::format_to(std::back_inserter(buffer),
                   "* Maximum Active Threads: {}/SM, {}/Block\n",
                   device.get_max_threads_per_sm(),
@@ -82,7 +84,9 @@ void markdown_printer::do_print_device_info()
                   "* Available Registers: {}/SM, {}/Block\n",
                   device.get_registers_per_sm(),
                   device.get_registers_per_block());
-    fmt::format_to(std::back_inserter(buffer), "* ECC Enabled: {}\n", device.get_ecc_state() ? "Yes" : "No");
+    fmt::format_to(std::back_inserter(buffer),
                   "* ECC Enabled: {}\n",
                   device.get_ecc_state() ? "Yes" : "No");
    fmt::format_to(std::back_inserter(buffer), "\n");
  }
  m_ostream << fmt::to_string(buffer);
@@ -191,9 +195,12 @@ void markdown_printer::do_print_benchmark_list(const printer_base::benchmark_vec
        {
          desc = fmt::format(" ({})", desc);
        }
-        fmt::format_to(std::back_inserter(buffer), "  * `{}`{}\n", axis_ptr->get_input_string(i), desc);
+        fmt::format_to(std::back_inserter(buffer),
                       "  * `{}`{}\n",
                       axis_ptr->get_input_string(i),
                       desc);
      } // end foreach value
-    }   // end foreach axis
+    } // end foreach axis
    fmt::format_to(std::back_inserter(buffer), "\n");
  } // end foreach bench
--- a/nvbench/named_values.cxx
+++ b/nvbench/named_values.cxx
@@ -16,10 +16,9 @@
 *  limitations under the License.
 */
 #include <nvbench/named_values.cuh>
 #include <nvbench/config.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/named_values.cuh>
 #include <fmt/format.h>
--- a/nvbench/nvbench.cuh
+++ b/nvbench/nvbench.cuh
@@ -24,8 +24,8 @@
 #include <nvbench/callable.cuh>
 #include <nvbench/config.cuh>
 #include <nvbench/cpu_timer.cuh>
 #include <nvbench/criterion_manager.cuh>
 #include <nvbench/create.cuh>
 #include <nvbench/criterion_manager.cuh>
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/cuda_stream.cuh>
 #include <nvbench/cuda_timer.cuh>
--- a/nvbench/option_parser.cu
+++ b/nvbench/option_parser.cu
@@ -34,6 +34,8 @@
 #include <nvbench/internal/cli_help.cuh>
 #include <nvbench/internal/cli_help_axis.cuh>
 #include <fmt/format.h>
 #include <algorithm>
 #include <cassert>
 #include <cstdlib>
@@ -44,12 +46,10 @@
 #include <regex>
 #include <stdexcept>
 #include <string>
 #include <string_view>
 #include <tuple>
 #include <vector>
 #include <fmt/format.h>
 #include <string_view>
 namespace
 {
--- a/nvbench/printer_base.cuh
+++ b/nvbench/printer_base.cuh
@@ -191,9 +191,9 @@ protected:
  virtual void do_process_bulk_data_float64(nvbench::state &,
                                            const std::string &,
                                            const std::string &,
-                                            const std::vector<nvbench::float64_t> &){};
+                                            const std::vector<nvbench::float64_t> &) {};
-  virtual void do_print_benchmark_list(const benchmark_vector &) 
+  virtual void do_print_benchmark_list(const benchmark_vector &)
  {
    throw std::runtime_error{"nvbench::do_print_benchmark_list is not supported by this printer."};
  }
--- a/nvbench/runner.cuh
+++ b/nvbench/runner.cuh
@@ -19,7 +19,6 @@
 #pragma once
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/detail/state_generator.cuh>
 #include <stdexcept>
--- a/nvbench/runner.cxx
+++ b/nvbench/runner.cxx
@@ -16,10 +16,9 @@
 *  limitations under the License.
 */
 #include <nvbench/runner.cuh>
 #include <nvbench/benchmark_base.cuh>
 #include <nvbench/printer_base.cuh>
 #include <nvbench/runner.cuh>
 #include <nvbench/state.cuh>
 #include <fmt/format.h>
--- a/nvbench/state.cxx
+++ b/nvbench/state.cxx
@@ -20,13 +20,13 @@
 #include <nvbench/state.cuh>
 #include <nvbench/types.cuh>
 #include <fmt/color.h>
 #include <fmt/format.h>
 #include <algorithm>
 #include <stdexcept>
 #include <string>
 #include <fmt/color.h>
 #include <fmt/format.h>
 namespace nvbench
 {
--- a/nvbench/stopping_criterion.cuh
+++ b/nvbench/stopping_criterion.cuh
@@ -21,19 +21,21 @@
 #include <nvbench/named_values.cuh>
 #include <nvbench/types.cuh>
 #include <string>
 #include <initializer_list>
 #include <string>
 #include <unordered_map>
 namespace nvbench
 {
-namespace detail 
+namespace detail
 {
-constexpr nvbench::float64_t compat_min_time() { return 0.5; }    // 0.5 seconds
+constexpr nvbench::float64_t compat_min_time() { return 0.5; } // 0.5 seconds
-constexpr nvbench::float64_t compat_max_noise() { return 0.005; } // 0.5% relative standard deviation
+constexpr nvbench::float64_t compat_max_noise()
 {
  return 0.005;
 } // 0.5% relative standard deviation
 } // namespace detail
@@ -43,6 +45,7 @@ constexpr nvbench::float64_t compat_max_noise() { return 0.005; } // 0.5% relati
 class criterion_params
 {
  nvbench::named_values m_named_values;
 public:
  criterion_params();
  criterion_params(std::initializer_list<std::pair<std::string, nvbench::named_values::value_type>>);
@@ -96,7 +99,7 @@ public:
   *
   * This method is called once per benchmark run, before any measurements are provided.
   */
-  void initialize(const criterion_params &params) 
+  void initialize(const criterion_params &params)
  {
    m_params.set_from(params);
    this->do_initialize();
@@ -105,18 +108,12 @@ public:
  /**
   * Add the latest measurement to the criterion
   */
-  void add_measurement(nvbench::float64_t measurement)
+  void add_measurement(nvbench::float64_t measurement) { this->do_add_measurement(measurement); }
  {
    this->do_add_measurement(measurement);
  }
  /**
   * Check if the criterion has been met for all measurements processed by `add_measurement`
   */
-  bool is_finished()
+  bool is_finished() { return this->do_is_finished(); }
  {
    return this->do_is_finished();
  }
 protected:
  /**
--- a/nvbench/stopping_criterion.cxx
+++ b/nvbench/stopping_criterion.cxx
@@ -16,10 +16,8 @@
 *  limitations under the License.
 */
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/detail/throw.cuh>
-
+#include <nvbench/stopping_criterion.cuh>
 namespace nvbench
 {
@@ -62,7 +60,7 @@ void criterion_params::set_from(const criterion_params &other)
 void criterion_params::set_int64(std::string name, nvbench::int64_t value)
 {
-  if (m_named_values.has_value(name)) 
+  if (m_named_values.has_value(name))
  {
    m_named_values.remove_value(name);
  }
@@ -72,7 +70,7 @@ void criterion_params::set_int64(std::string name, nvbench::int64_t value)
 void criterion_params::set_float64(std::string name, nvbench::float64_t value)
 {
-  if (m_named_values.has_value(name)) 
+  if (m_named_values.has_value(name))
  {
    m_named_values.remove_value(name);
  }
@@ -82,7 +80,7 @@ void criterion_params::set_float64(std::string name, nvbench::float64_t value)
 void criterion_params::set_string(std::string name, std::string value)
 {
-  if (m_named_values.has_value(name)) 
+  if (m_named_values.has_value(name))
  {
    m_named_values.remove_value(name);
  }
@@ -110,15 +108,11 @@ std::string criterion_params::get_string(const std::string &name) const
  return m_named_values.get_string(name);
 }
-std::vector<std::string> criterion_params::get_names() const
+std::vector<std::string> criterion_params::get_names() const { return m_named_values.get_names(); }
 {
  return m_named_values.get_names();
 }
 nvbench::named_values::type criterion_params::get_type(const std::string &name) const
 {
  return m_named_values.get_type(name);
 }
-
+} // namespace nvbench
 } // namespace nvbench::detail
--- a/nvbench/string_axis.cuh
+++ b/nvbench/string_axis.cuh
@@ -19,7 +19,6 @@
 #pragma once
 #include <nvbench/axis_base.cuh>
 #include <nvbench/types.cuh>
 #include <vector>
--- a/nvbench/type_axis.cuh
+++ b/nvbench/type_axis.cuh
@@ -19,7 +19,6 @@
 #pragma once
 #include <nvbench/axis_base.cuh>
 #include <nvbench/type_list.cuh>
 #include <nvbench/type_strings.cuh>
--- a/nvbench/type_axis.cxx
+++ b/nvbench/type_axis.cxx
@@ -16,9 +16,8 @@
 *  limitations under the License.
 */
 #include <nvbench/type_axis.cuh>
 #include <nvbench/detail/throw.cuh>
 #include <nvbench/type_axis.cuh>
 #include <fmt/format.h>
 #include <fmt/ranges.h>
--- a/nvbench/type_list.cuh
+++ b/nvbench/type_list.cuh
@@ -18,7 +18,7 @@
 #pragma once
-#include "detail/type_list_impl.cuh"
+#include <nvbench/detail/type_list_impl.cuh>
 #include <tuple>
 #include <type_traits>
--- a/nvbench/type_strings.cxx
+++ b/nvbench/type_strings.cxx
@@ -27,11 +27,11 @@
 #endif
 #ifdef NVBENCH_CXXABI_DEMANGLE
 #include <cxxabi.h>
 #include <cstdlib>
 #include <memory>
 #include <cxxabi.h>
 namespace
 {
 struct free_wrapper
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,24 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 [tool.ruff]
 target-version = "py310"
 fix = true
 show-fixes = true
 exclude = ["docs/tools"]
 [tool.ruff.lint]
 extend-select = ["I"]
 [tool.codespell]
 # To run codespell interactively and fix errors that pre-commit reports, try
 # `codespell -i 3 -w -H`. This will run with interactive review (-i 3), writes
 # changes to disk (-w), and includes hidden files (-H).
 # Note: pre-commit passes explicit lists of files here, which this skip file
 # list doesn't override - the skip list is only to allow you to run codespell
 # interactively.
 skip = "./.git,./build,./CITATION.md"
 # ignore short words, and typename parameters like OffsetT
 ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
 ignore-words-list = "inout,imovable,optionN,aCount,quitted,Invokable,countr,unexpect,numer,euclidian,couldn,OffsetT,FromM"
 builtin = "clear"
 quiet-level = 3
--- a/scripts/nvbench_compare.py
+++ b/scripts/nvbench_compare.py
@@ -5,12 +5,11 @@ import math
 import os
 import sys
 from colorama import Fore
 import tabulate
-
+from colorama import Fore
 from nvbench_json import reader
 # Parse version string into tuple, "x.y.z" -> (x, y, z)
 def version_tuple(v):
    return tuple(map(int, (v.split("."))))
@@ -139,15 +138,14 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
        colalign.append("center")
        for device_id in device_ids:
            rows = []
-            plot_data = {'cmp': {}, 'ref': {}, 'cmp_noise': {}, 'ref_noise': {}}
+            plot_data = {"cmp": {}, "ref": {}, "cmp_noise": {}, "ref_noise": {}}
            for cmp_state in cmp_states:
                cmp_state_name = cmp_state["name"]
-                ref_state = next(filter(lambda st: st["name"] == cmp_state_name,
+                ref_state = next(
-                                        ref_states),
+                    filter(lambda st: st["name"] == cmp_state_name, ref_states), None
-                                 None)
+                )
                if not ref_state:
                    continue
@@ -158,9 +156,7 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
                row = []
                for axis_value in axis_values:
                    axis_value_name = axis_value["name"]
-                    row.append(format_axis_value(axis_value_name,
+                    row.append(format_axis_value(axis_value_name, axis_value, axes))
                                                 axis_value,
                                                 axes))
                cmp_summaries = cmp_state["summaries"]
                ref_summaries = ref_state["summaries"]
@@ -171,23 +167,37 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
                def lookup_summary(summaries, tag):
                    return next(filter(lambda s: s["tag"] == tag, summaries), None)
-                cmp_time_summary = lookup_summary(cmp_summaries, "nv/cold/time/gpu/mean")
+                cmp_time_summary = lookup_summary(
-                ref_time_summary = lookup_summary(ref_summaries, "nv/cold/time/gpu/mean")
+                    cmp_summaries, "nv/cold/time/gpu/mean"
-                cmp_noise_summary = lookup_summary(cmp_summaries, "nv/cold/time/gpu/stdev/relative")
+                )
-                ref_noise_summary = lookup_summary(ref_summaries, "nv/cold/time/gpu/stdev/relative")
+                ref_time_summary = lookup_summary(
                    ref_summaries, "nv/cold/time/gpu/mean"
                )
                cmp_noise_summary = lookup_summary(
                    cmp_summaries, "nv/cold/time/gpu/stdev/relative"
                )
                ref_noise_summary = lookup_summary(
                    ref_summaries, "nv/cold/time/gpu/stdev/relative"
                )
                # TODO: Use other timings, too. Maybe multiple rows, with a
                # "Timing" column + values "CPU/GPU/Batch"?
-                if not all([cmp_time_summary,
+                if not all(
-                            ref_time_summary,
+                    [
-                            cmp_noise_summary,
+                        cmp_time_summary,
-                            ref_noise_summary]):
+                        ref_time_summary,
                        cmp_noise_summary,
                        ref_noise_summary,
                    ]
                ):
                    continue
                def extract_value(summary):
                    summary_data = summary["data"]
-                    value_data = next(filter(lambda v: v["name"] == "value", summary_data))
+                    value_data = next(
-                    assert(value_data["type"] == "float64")
+                        filter(lambda v: v["name"] == "value", summary_data)
                    )
                    assert value_data["type"] == "float64"
                    return value_data["value"]
                cmp_time = extract_value(cmp_time_summary)
@@ -218,23 +228,27 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
                if plot:
                    axis_name = []
                    axis_value = "--"
-                    for aid in range(len(axis_values)): 
+                    for aid in range(len(axis_values)):
                        if axis_values[aid]["name"] != plot:
-                           axis_name.append("{} = {}".format(axis_values[aid]["name"], axis_values[aid]["value"]))
+                            axis_name.append(
                                "{} = {}".format(
                                    axis_values[aid]["name"], axis_values[aid]["value"]
                                )
                            )
                        else:
-                           axis_value = float(axis_values[aid]["value"])
+                            axis_value = float(axis_values[aid]["value"])
-                    axis_name = ', '.join(axis_name)
+                    axis_name = ", ".join(axis_name)
-                    if axis_name not in plot_data['cmp']:
+                    if axis_name not in plot_data["cmp"]:
-                        plot_data['cmp'][axis_name] = {}
+                        plot_data["cmp"][axis_name] = {}
-                        plot_data['ref'][axis_name] = {}
+                        plot_data["ref"][axis_name] = {}
-                        plot_data['cmp_noise'][axis_name] = {}
+                        plot_data["cmp_noise"][axis_name] = {}
-                        plot_data['ref_noise'][axis_name] = {}
+                        plot_data["ref_noise"][axis_name] = {}
-                    plot_data['cmp'][axis_name][axis_value] = cmp_time
+                    plot_data["cmp"][axis_name][axis_value] = cmp_time
-                    plot_data['ref'][axis_name][axis_value] = ref_time
+                    plot_data["ref"][axis_name][axis_value] = ref_time
-                    plot_data['cmp_noise'][axis_name][axis_value] = cmp_noise
+                    plot_data["cmp_noise"][axis_name][axis_value] = cmp_noise
-                    plot_data['ref_noise'][axis_name][axis_value] = ref_noise
+                    plot_data["ref_noise"][axis_name][axis_value] = ref_noise
                global config_count
                global unknown_count
@@ -273,14 +287,13 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
            print("## [%d] %s\n" % (device["id"], device["name"]))
            # colalign and github format require tabulate 0.8.3
            if tabulate_version >= (0, 8, 3):
-                print(tabulate.tabulate(rows,
+                print(
-                                        headers=headers,
+                    tabulate.tabulate(
-                                        colalign=colalign,
+                        rows, headers=headers, colalign=colalign, tablefmt="github"
-                                        tablefmt="github"))
+                    )
                )
            else:
-                print(tabulate.tabulate(rows,
+                print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
                                        headers=headers,
                                        tablefmt="markdown"))
            print("")
@@ -295,18 +308,17 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
                    x = [float(x) for x in plot_data[key][axis].keys()]
                    y = list(plot_data[key][axis].values())
-                    noise = list(plot_data[key + '_noise'][axis].values())
+                    noise = list(plot_data[key + "_noise"][axis].values())
                    top = [y[i] + y[i] * noise[i] for i in range(len(x))]
                    bottom = [y[i] - y[i] * noise[i] for i in range(len(x))]
-                    p = plt.plot(x, y, shape, marker='o', label=label)
+                    p = plt.plot(x, y, shape, marker="o", label=label)
                    plt.fill_between(x, bottom, top, color=p[0].get_color(), alpha=0.1)
-
+                for axis in plot_data["cmp"].keys():
-                for axis in plot_data['cmp'].keys():
+                    plot_line("cmp", "-", axis)
-                    plot_line('cmp', '-', axis)
+                    plot_line("ref", "--", axis + " ref")
                    plot_line('ref', '--', axis + ' ref')
                plt.legend()
                plt.show()
@@ -314,11 +326,17 @@ def compare_benches(ref_benches, cmp_benches, threshold, plot):
 def main():
    help_text = "%(prog)s [reference.json compare.json | reference_dir/ compare_dir/]"
-    parser = argparse.ArgumentParser(prog='nvbench_compare', usage=help_text)
+    parser = argparse.ArgumentParser(prog="nvbench_compare", usage=help_text)
-    parser.add_argument('--threshold-diff', type=float, dest='threshold', default=0.0,
+    parser.add_argument(
-                        help='only show benchmarks where percentage diff is >= THRESHOLD')
+        "--threshold-diff",
-    parser.add_argument('--plot-along', type=str, dest='plot', default=None,
+        type=float,
-                        help='plot results')
+        dest="threshold",
        default=0.0,
        help="only show benchmarks where percentage diff is >= THRESHOLD",
    )
    parser.add_argument(
        "--plot-along", type=str, dest="plot", default=None, help="plot results"
    )
    args, files_or_dirs = parser.parse_known_args()
    print(files_or_dirs)
@@ -336,14 +354,17 @@ def main():
                continue
            r = os.path.join(files_or_dirs[0], f)
            c = os.path.join(files_or_dirs[1], f)
-            if os.path.isfile(r) and os.path.isfile(c) and \
+            if (
-               os.path.getsize(r) > 0 and os.path.getsize(c) > 0:
+                os.path.isfile(r)
                and os.path.isfile(c)
                and os.path.getsize(r) > 0
                and os.path.getsize(c) > 0
            ):
                to_compare.append((r, c))
    else:
        to_compare = [(files_or_dirs[0], files_or_dirs[1])]
    for ref, comp in to_compare:
        ref_root = reader.read_file(ref)
        cmp_root = reader.read_file(comp)
@@ -355,7 +376,9 @@ def main():
            print("Device sections do not match.")
            sys.exit(1)
-        compare_benches(ref_root["benchmarks"], cmp_root["benchmarks"], args.threshold, args.plot)
+        compare_benches(
            ref_root["benchmarks"], cmp_root["benchmarks"], args.threshold, args.plot
        )
    print("# Summary\n")
    print("- Total Matches: %d" % config_count)
@@ -365,5 +388,5 @@ def main():
    return failure_count
-if __name__ == '__main__':
+if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/nvbench_histogram.py
+++ b/scripts/nvbench_histogram.py
@@ -1,19 +1,19 @@
 #!/usr/bin/env python
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 import argparse
 import os
 import sys
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
 from nvbench_json import reader
 def parse_files():
    help_text = "%(prog)s [nvbench.out.json | dir/] ..."
-    parser = argparse.ArgumentParser(prog='nvbench_histogram', usage=help_text)
+    parser = argparse.ArgumentParser(prog="nvbench_histogram", usage=help_text)
    args, files_or_dirs = parser.parse_known_args()
@@ -41,14 +41,14 @@ def parse_files():
 def extract_filename(summary):
    summary_data = summary["data"]
    value_data = next(filter(lambda v: v["name"] == "filename", summary_data))
-    assert(value_data["type"] == "string")
+    assert value_data["type"] == "string"
    return value_data["value"]
 def extract_size(summary):
    summary_data = summary["data"]
    value_data = next(filter(lambda v: v["name"] == "size", summary_data))
-    assert(value_data["type"] == "int64")
+    assert value_data["type"] == "int64"
    return int(value_data["value"])
@@ -57,9 +57,10 @@ def parse_samples_meta(filename, state):
    if not summaries:
        return None, None
-    summary = next(filter(lambda s: s["tag"] == "nv/json/bin:nv/cold/sample_times",
+    summary = next(
-                          summaries),
+        filter(lambda s: s["tag"] == "nv/json/bin:nv/cold/sample_times", summaries),
-                   None)
+        None,
    )
    if not summary:
        return None, None
@@ -81,7 +82,7 @@ def parse_samples(filename, state):
    with open(samples_filename, "rb") as f:
        samples = np.fromfile(f, "<f4")
-    assert (sample_count == len(samples))
+    assert sample_count == len(samples)
    return samples
@@ -118,5 +119,5 @@ def main():
    plt.show()
-if __name__ == '__main__':
+if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/nvbench_json/init.py
+++ b/scripts/nvbench_json/init.py
@@ -1,2 +1,3 @@
-from . import reader
+from . import reader, version
-from . import version
+
 __all__ = ["reader", "version"]
--- a/scripts/nvbench_json/version.py
+++ b/scripts/nvbench_json/version.py
@@ -1,8 +1,8 @@
 file_version = (1, 0, 0)
-file_version_string = "{}.{}.{}".format(file_version[0],
+file_version_string = "{}.{}.{}".format(
-                                        file_version[1],
+    file_version[0], file_version[1], file_version[2]
-                                        file_version[2])
+)
 def check_file_version(filename, root_node):
@@ -19,8 +19,14 @@ def check_file_version(filename, root_node):
    # for now just warn on mismatch.
    if version_node["string"] != file_version_string:
        print("WARNING:")
-        print("  {} was written using a different NVBench JSON file version."
+        print(
-              .format(filename))
+            "  {} was written using a different NVBench JSON file version.".format(
                filename
            )
        )
        print("  It may not read correctly.")
-        print("  (file version: {} reader version: {})"
+        print(
-              .format(version_node["string"], file_version_string))
+            "  (file version: {} reader version: {})".format(
                version_node["string"], file_version_string
            )
        )
--- a/scripts/nvbench_walltime.py
+++ b/scripts/nvbench_walltime.py
@@ -5,9 +5,8 @@ import math
 import os
 import sys
 from nvbench_json import reader
 import tabulate
 from nvbench_json import reader
 # Parse version string into tuple, "x.y.z" -> (x, y, z)
@@ -39,7 +38,8 @@ def format_walltime(seconds_in):
        "{:0>2d}:".format(h) if h > 1e-9 else "",
        "{:0>2d}:".format(m) if (h > 1e-9 or m > 1e-9) else "",
        "{:0>2d}.".format(s) if (h > 1e-9 or m > 1e-9) else "{:d}.".format(s),
-        "{:0>3d}".format(ms))
+        "{:0>3d}".format(ms),
    )
 def format_percentage(percentage):
@@ -58,7 +58,7 @@ measure_column_names = {"cold": "Isolated", "batch": "Batch", "cupti": "CUPTI"}
 def init_measures():
    out = {}
    for name in measure_names:
-        out[name] = 0.
+        out[name] = 0.0
    return out
@@ -67,17 +67,17 @@ def get_measures(state):
    times = {}
    for name in measure_names:
        measure_walltime_tag = "nv/{}/walltime".format(name)
-        summary = next(filter(lambda s: s["tag"] == measure_walltime_tag,
+        summary = next(
-                              summaries),
+            filter(lambda s: s["tag"] == measure_walltime_tag, summaries), None
-                       None)
+        )
        if not summary:
            continue
        walltime_data = next(filter(lambda d: d["name"] == "value", summary["data"]))
-        assert(walltime_data["type"] == "float64")
+        assert walltime_data["type"] == "float64"
        walltime = walltime_data["value"]
        walltime = float(walltime)
-        times[name] = walltime if walltime else 0.
+        times[name] = walltime if walltime else 0.0
    return times
@@ -87,7 +87,7 @@ def merge_measures(target, src):
 def sum_measures(measures):
-    total_time = 0.
+    total_time = 0.0
    for time in measures.values():
        total_time += time
    return total_time
@@ -194,20 +194,21 @@ def print_overview_section(data):
    # colalign and github format require tabulate 0.8.3
    if tabulate_version >= (0, 8, 3):
-        print(tabulate.tabulate(rows,
+        print(
-                                headers=headers,
+            tabulate.tabulate(
-                                colalign=colalign,
+                rows, headers=headers, colalign=colalign, tablefmt="github"
-                                tablefmt="github"))
+            )
        )
    else:
-        print(tabulate.tabulate(rows,
+        print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
                                headers=headers,
                                tablefmt="markdown"))
    print()
 # append_data_row_lambda args: (row_list, name, items[name])
-def print_measures_table(headers, colalign, items, total_measures, append_item_row_lambda):
+def print_measures_table(
    headers, colalign, items, total_measures, append_item_row_lambda
 ):
    total_time = sum_measures(total_measures)
    active_measures = get_active_measure_names(total_measures)
    num_user_columns = len(headers)
@@ -248,14 +249,13 @@ def print_measures_table(headers, colalign, items, total_measures, append_item_r
    # colalign and github format require tabulate 0.8.3
    if tabulate_version >= (0, 8, 3):
-        print(tabulate.tabulate(rows,
+        print(
-                                headers=headers,
+            tabulate.tabulate(
-                                colalign=colalign,
+                rows, headers=headers, colalign=colalign, tablefmt="github"
-                                tablefmt="github"))
+            )
        )
    else:
-        print(tabulate.tabulate(rows,
+        print(tabulate.tabulate(rows, headers=headers, tablefmt="markdown"))
                                headers=headers,
                                tablefmt="markdown"))
 def print_files_section(data):
@@ -319,7 +319,7 @@ def print_bench_section(bench_name, bench):
 def main():
    help_text = "%(prog)s [nvbench.out.json | dir/]..."
-    parser = argparse.ArgumentParser(prog='nvbench_walltime', usage=help_text)
+    parser = argparse.ArgumentParser(prog="nvbench_walltime", usage=help_text)
    args, files_or_dirs = parser.parse_known_args()
@@ -353,5 +353,5 @@ def main():
    print_files_section(data)
-if __name__ == '__main__':
+if __name__ == "__main__":
    sys.exit(main())
--- a/testing/axes_metadata.cu
+++ b/testing/axes_metadata.cu
@@ -17,22 +17,19 @@
 */
 #include <nvbench/axes_metadata.cuh>
 #include <nvbench/type_list.cuh>
 #include <nvbench/type_strings.cuh>
 #include <nvbench/types.cuh>
 #include "test_asserts.cuh"
 #include <fmt/format.h>
 #include <algorithm>
 #include <string_view>
-using int_list = nvbench::type_list<nvbench::int8_t,
+#include "test_asserts.cuh"
-                                    nvbench::int16_t,
+
-                                    nvbench::int32_t,
+using int_list =
-                                    nvbench::int64_t>;
+  nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
 using float_list = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
@@ -110,7 +107,6 @@ void test_default_type_axes_names()
    ASSERT(axes.get_type_axis(4).get_name() == "T4");
    ASSERT(axes.get_type_axis(4).get_axis_index() == 4);
  }
 }
 void test_type_axes()
@@ -138,8 +134,7 @@ void test_type_axes()
      fmt::format_to(std::back_inserter(buffer),
                     " - {}{}\n",
                     input_string,
-                     description.empty() ? ""
+                     description.empty() ? "" : fmt::format(" ({})", description));
                                         : fmt::format(" ({})", description));
    }
  }
@@ -157,9 +152,8 @@ Axis: Other
 )expected";
  const std::string test = fmt::to_string(buffer);
-  const auto diff =
+  const auto diff        = std::mismatch(ref.cbegin(), ref.cend(), test.cbegin(), test.cend());
-    std::mismatch(ref.cbegin(), ref.cend(), test.cbegin(), test.cend());
+  const auto idx         = static_cast<std::size_t>(diff.second - test.cbegin());
  const auto idx = static_cast<std::size_t>(diff.second - test.cbegin());
  ASSERT_MSG(test == ref,
             "Differs at character {}.\n"
             "Expected:\n\"{}\"\n\n"
@@ -189,9 +183,7 @@ void test_float64_axes()
 void test_int64_axes()
 {
  nvbench::axes_metadata axes;
-  axes.add_int64_axis("I64 Axis",
+  axes.add_int64_axis("I64 Axis", {10, 11, 12, 13, 14}, nvbench::int64_axis_flags::none);
                      {10, 11, 12, 13, 14},
                      nvbench::int64_axis_flags::none);
  ASSERT(axes.get_axes().size() == 1);
  const auto &axis = axes.get_int64_axis("I64 Axis");
  ASSERT(axis.get_size() == 5);
@@ -205,9 +197,7 @@ void test_int64_axes()
 void test_int64_power_of_two_axes()
 {
  nvbench::axes_metadata axes;
-  axes.add_int64_axis("I64 POT Axis",
+  axes.add_int64_axis("I64 POT Axis", {1, 2, 3, 4, 5}, nvbench::int64_axis_flags::power_of_two);
                      {1, 2, 3, 4, 5},
                      nvbench::int64_axis_flags::power_of_two);
  ASSERT(axes.get_axes().size() == 1);
  const auto &axis = axes.get_int64_axis("I64 POT Axis");
  ASSERT(axis.get_size() == 5);
--- a/testing/benchmark.cu
+++ b/testing/benchmark.cu
@@ -17,7 +17,6 @@
 */
 #include <nvbench/benchmark.cuh>
 #include <nvbench/callable.cuh>
 #include <nvbench/named_values.cuh>
 #include <nvbench/state.cuh>
@@ -25,8 +24,6 @@
 #include <nvbench/type_strings.cuh>
 #include <nvbench/types.cuh>
 #include "test_asserts.cuh"
 #include <fmt/format.h>
 #include <algorithm>
@@ -34,6 +31,8 @@
 #include <variant>
 #include <vector>
 #include "test_asserts.cuh"
 template <typename T>
 std::vector<T> sort(std::vector<T> &&vec)
 {
@@ -61,34 +60,26 @@ void no_op_generator(nvbench::state &state)
 NVBENCH_DEFINE_CALLABLE(no_op_generator, no_op_callable);
 template <typename Integer, typename Float, typename Other>
-void template_no_op_generator(nvbench::state &state,
+void template_no_op_generator(nvbench::state &state, nvbench::type_list<Integer, Float, Other>)
                              nvbench::type_list<Integer, Float, Other>)
 {
-  ASSERT(nvbench::type_strings<Integer>::input_string() ==
+  ASSERT(nvbench::type_strings<Integer>::input_string() == state.get_string("Integer"));
-         state.get_string("Integer"));
+  ASSERT(nvbench::type_strings<Float>::input_string() == state.get_string("Float"));
-  ASSERT(nvbench::type_strings<Float>::input_string() ==
+  ASSERT(nvbench::type_strings<Other>::input_string() == state.get_string("Other"));
         state.get_string("Float"));
  ASSERT(nvbench::type_strings<Other>::input_string() ==
         state.get_string("Other"));
  // Enum params using non-templated version:
  no_op_generator(state);
 }
-NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator,
+NVBENCH_DEFINE_CALLABLE_TEMPLATE(template_no_op_generator, template_no_op_callable);
                                 template_no_op_callable);
-using int_list = nvbench::type_list<nvbench::int8_t,
+using int_list =
-                                    nvbench::int16_t,
+  nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
                                    nvbench::int32_t,
                                    nvbench::int64_t>;
 using float_list = nvbench::type_list<nvbench::float32_t, nvbench::float64_t>;
 using misc_list = nvbench::type_list<bool, void>;
 using lots_of_types_bench =
-  nvbench::benchmark<template_no_op_callable,
+  nvbench::benchmark<template_no_op_callable, nvbench::type_list<int_list, float_list, misc_list>>;
                     nvbench::type_list<int_list, float_list, misc_list>>;
 using no_types_bench = nvbench::benchmark<no_op_callable>;
@@ -110,8 +101,7 @@ void test_type_axes()
      fmt::format_to(std::back_inserter(buffer),
                     " - {}{}\n",
                     input_string,
-                     description.empty() ? ""
+                     description.empty() ? "" : fmt::format(" ({})", description));
                                         : fmt::format(" ({})", description));
    }
  }
@@ -300,9 +290,7 @@ void test_get_config_count()
  auto const num_devices = bench.get_devices().size();
-  ASSERT_MSG(bench.get_config_count() == 72 * num_devices,
+  ASSERT_MSG(bench.get_config_count() == 72 * num_devices, "Got {}", bench.get_config_count());
             "Got {}",
             bench.get_config_count());
 }
 int main()
--- a/testing/cpu_timer.cu
+++ b/testing/cpu_timer.cu
@@ -18,11 +18,11 @@
 #include <nvbench/cpu_timer.cuh>
 #include "test_asserts.cuh"
 #include <chrono>
 #include <thread>
 #include "test_asserts.cuh"
 void test_basic()
 {
  using namespace std::literals::chrono_literals;
--- a/testing/create.cu
+++ b/testing/create.cu
@@ -16,17 +16,14 @@
 *  limitations under the License.
 */
 #include <nvbench/create.cuh>
 #include <nvbench/benchmark.cuh>
 #include <nvbench/callable.cuh>
 #include <nvbench/create.cuh>
 #include <nvbench/state.cuh>
 #include <nvbench/type_list.cuh>
 #include <nvbench/type_strings.cuh>
 #include <nvbench/types.cuh>
 #include "test_asserts.cuh"
 #include <fmt/format.h>
 #include <algorithm>
@@ -34,6 +31,8 @@
 #include <variant>
 #include <vector>
 #include "test_asserts.cuh"
 template <typename T>
 std::vector<T> sort(std::vector<T> &&vec)
 {
@@ -72,15 +71,11 @@ using misc_types  = nvbench::type_list<bool, void>;
 using type_axes   = nvbench::type_list<float_types, int_types, misc_types>;
 template <typename FloatT, typename IntT, typename MiscT>
-void template_no_op_generator(nvbench::state &state,
+void template_no_op_generator(nvbench::state &state, nvbench::type_list<FloatT, IntT, MiscT>)
                              nvbench::type_list<FloatT, IntT, MiscT>)
 {
-  ASSERT(nvbench::type_strings<FloatT>::input_string() ==
+  ASSERT(nvbench::type_strings<FloatT>::input_string() == state.get_string("FloatT"));
-         state.get_string("FloatT"));
+  ASSERT(nvbench::type_strings<IntT>::input_string() == state.get_string("IntT"));
-  ASSERT(nvbench::type_strings<IntT>::input_string() ==
+  ASSERT(nvbench::type_strings<IntT>::input_string() == state.get_string("IntT"));
         state.get_string("IntT"));
  ASSERT(nvbench::type_strings<IntT>::input_string() ==
         state.get_string("IntT"));
  // Enum params using non-templated version:
  no_op_generator(state);
@@ -116,8 +111,7 @@ std::string run_and_get_state_string(nvbench::benchmark_base &bench,
 void validate_default_name()
 {
-  auto bench =
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("no_op_generator").clone();
    nvbench::benchmark_manager::get().get_benchmark("no_op_generator").clone();
  const std::string ref = "Params:\n";
@@ -127,8 +121,7 @@ void validate_default_name()
 void validate_custom_name()
 {
-  auto bench =
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("Custom Name").clone();
    nvbench::benchmark_manager::get().get_benchmark("Custom Name").clone();
  const std::string ref = "Params:\n";
@@ -138,8 +131,7 @@ void validate_custom_name()
 void validate_no_types()
 {
-  auto bench =
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("No Types").clone();
    nvbench::benchmark_manager::get().get_benchmark("No Types").clone();
  const std::string ref = R"expected(Params: Float: 11 Int: 1 String: One
 Params: Float: 11 Int: 2 String: One
@@ -176,8 +168,7 @@ Params: Float: 13 Int: 3 String: Three
 void validate_only_types()
 {
-  auto bench =
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("Oops, All Types!").clone();
    nvbench::benchmark_manager::get().get_benchmark("Oops, All Types!").clone();
  const std::string ref = R"expected(Params: FloatT: F32 IntT: I32 MiscT: bool
 Params: FloatT: F32 IntT: I32 MiscT: void
@@ -195,8 +186,7 @@ Params: FloatT: F64 IntT: I64 MiscT: void
 void validate_all_axes()
 {
-  auto bench =
+  auto bench = nvbench::benchmark_manager::get().get_benchmark("All The Axes").clone();
    nvbench::benchmark_manager::get().get_benchmark("All The Axes").clone();
  const std::string ref =
    R"expected(Params: Float: 11 FloatT: F32 Int: 1 IntT: I32 MiscT: bool String: One
--- a/testing/criterion_manager.cu
+++ b/testing/criterion_manager.cu
@@ -42,27 +42,34 @@ protected:
 void test_no_duplicates_are_allowed()
 {
-  nvbench::criterion_manager& manager = nvbench::criterion_manager::get();
+  nvbench::criterion_manager &manager = nvbench::criterion_manager::get();
-  bool exception_triggered = false;
+  bool exception_triggered            = false;
-  try {
+  try
-    [[maybe_unused]] nvbench::stopping_criterion_base& _ = manager.get_criterion("custom");
+  {
-  } catch(...) {
+    [[maybe_unused]] nvbench::stopping_criterion_base &_ = manager.get_criterion("custom");
  }
  catch (...)
  {
    exception_triggered = true;
  }
  ASSERT(exception_triggered);
  std::unique_ptr<custom_criterion> custom_ptr = std::make_unique<custom_criterion>();
-  custom_criterion* custom_raw = custom_ptr.get();
+  custom_criterion *custom_raw                 = custom_ptr.get();
  ASSERT(&manager.add(std::move(custom_ptr)) == custom_raw);
-  nvbench::stopping_criterion_base& custom = nvbench::criterion_manager::get().get_criterion("custom");
+  nvbench::stopping_criterion_base &custom =
    nvbench::criterion_manager::get().get_criterion("custom");
  ASSERT(custom_raw == &custom);
  exception_triggered = false;
-  try {
+  try
  {
    manager.add(std::make_unique<custom_criterion>());
-  } catch(...) {
+  }
  catch (...)
  {
    exception_triggered = true;
  }
  ASSERT(exception_triggered);
--- a/testing/criterion_params.cu
+++ b/testing/criterion_params.cu
@@ -60,4 +60,3 @@ int main()
  test_compat_overwrite();
  test_overwrite();
 }
--- a/testing/cuda_timer.cu
+++ b/testing/cuda_timer.cu
@@ -16,19 +16,16 @@
 *  limitations under the License.
 */
 #include <nvbench/cuda_timer.cuh>
 #include <nvbench/cuda_stream.cuh>
 #include <nvbench/cuda_timer.cuh>
 #include <nvbench/test_kernels.cuh>
 #include <nvbench/types.cuh>
 #include "test_asserts.cuh"
 #include <fmt/format.h>
-void test_basic(cudaStream_t time_stream,
+#include "test_asserts.cuh"
-                cudaStream_t exec_stream,
+
-                bool expected)
+void test_basic(cudaStream_t time_stream, cudaStream_t exec_stream, bool expected)
 {
  nvbench::cuda_timer timer;
--- a/testing/custom_main_custom_args.cu
+++ b/testing/custom_main_custom_args.cu
@@ -16,8 +16,8 @@
 *  limitations under the License.
 */
 #include <nvbench/cuda_call.cuh>
 #include <nvbench/nvbench.cuh>
 #include "nvbench/cuda_call.cuh"
 /******************************************************************************
 * Install custom parser.
@@ -35,7 +35,7 @@
 // User code to handle a specific argument:
 void handle_my_custom_arg();
-// NVBench hook for modiifying the command line arguments before parsing:
+// NVBench hook for modifying the command line arguments before parsing:
 void custom_arg_handler(std::vector<std::string> &args)
 {
  // Handle and remove "--my-custom-arg"
--- a/testing/custom_main_global_state_raii.cu
+++ b/testing/custom_main_global_state_raii.cu
@@ -19,8 +19,8 @@
 #include <nvbench/nvbench.cuh>
 #include <algorithm>
 #include <cstdlib>
 #include <cstdio>
 #include <cstdlib>
 /******************************************************************************
 * Test having global state that is initialized and finalized via RAII.
--- a/testing/device/noisy_bench.cu
+++ b/testing/device/noisy_bench.cu
@@ -29,12 +29,10 @@
 void noisy_bench(nvbench::state &state)
 {
  // time, convert ms -> s
-  const auto mean = static_cast<nvbench::float32_t>(state.get_float64("Mean")) /
+  const auto mean = static_cast<nvbench::float32_t>(state.get_float64("Mean")) / 1000.f;
                    1000.f;
  // rel stdev
-  const auto noise_pct =
+  const auto noise_pct = static_cast<nvbench::float32_t>(state.get_float64("Noise"));
-    static_cast<nvbench::float32_t>(state.get_float64("Noise"));
+  const auto noise     = noise_pct / 100.f;
  const auto noise = noise_pct / 100.f;
  // abs stdev
  const auto stdev = noise * mean;
@@ -53,8 +51,7 @@ void noisy_bench(nvbench::state &state)
    try
    {
      return static_cast<nvbench::float32_t>(
-        state.get_summary("nv/cold/time/gpu/stdev/relative")
+        state.get_summary("nv/cold/time/gpu/stdev/relative").get_float64("value"));
          .get_float64("value"));
    }
    catch (std::invalid_argument &)
    {
--- a/testing/entropy_criterion.cu
+++ b/testing/entropy_criterion.cu
@@ -20,11 +20,11 @@
 #include <nvbench/stopping_criterion.cuh>
 #include <nvbench/types.cuh>
 #include "test_asserts.cuh"
 #include <vector>
 #include <random>
 #include <numeric>
 #include <random>
 #include <vector>
 #include "test_asserts.cuh"
 void test_const()
 {
@@ -32,7 +32,7 @@ void test_const()
  nvbench::detail::entropy_criterion criterion;
  criterion.initialize(params);
-  for (int i = 0; i < 6; i++) 
+  for (int i = 0; i < 6; i++)
  { // nvbench wants at least 5 to compute the standard deviation
    criterion.add_measurement(42.0);
  }
@@ -48,7 +48,7 @@ void produce_entropy_arch(nvbench::detail::entropy_criterion &criterion)
   *   2.5, 2.4, 2.2, 2.1, 2.0, 1.9 <-+
   *   1.8, 1.7, 1.6, 1.6, 1.5, 1.4   |
   *   1.4, 1.3, 1.3, 1.3, 1.2, 1.2   |
-   *   1.1, 1.1, 1.1, 1.0, 1.0, 1.0   +-- entropy only decreases after 5-th sample, 
+   *   1.1, 1.1, 1.1, 1.0, 1.0, 1.0   +-- entropy only decreases after 5-th sample,
   *   1.0, 0.9, 0.9, 0.9, 0.9, 0.9   |   so the slope should be negative
   *   0.8, 0.8, 0.8, 0.8, 0.8, 0.8   |
   *   0.7, 0.7, 0.7, 0.7, 0.7, 0.7 <-+
--- a/testing/enum_type_list.cu
+++ b/testing/enum_type_list.cu
@@ -18,12 +18,12 @@
 #include <nvbench/enum_type_list.cuh>
 #include "test_asserts.cuh"
 #include <fmt/format.h>
 #include <type_traits>
 #include "test_asserts.cuh"
 // If using gcc version < 7, disable some tests to WAR a compiler bug. See NVIDIA/nvbench#39.
 #if defined(__GNUC__) && __GNUC__ == 7
 #define USING_GCC_7
@@ -102,8 +102,7 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
 void test_int()
 {
  ASSERT((std::is_same_v<nvbench::enum_type_list<>, nvbench::type_list<>>));
-  ASSERT((std::is_same_v<nvbench::enum_type_list<0>,
+  ASSERT((std::is_same_v<nvbench::enum_type_list<0>, nvbench::type_list<nvbench::enum_type<0>>>));
                         nvbench::type_list<nvbench::enum_type<0>>>));
  ASSERT((std::is_same_v<nvbench::enum_type_list<0, 1, 2, 3, 4>,
                         nvbench::type_list<nvbench::enum_type<0>,
                                            nvbench::enum_type<1>,
@@ -115,42 +114,35 @@ void test_int()
 void test_scoped_enum()
 {
 #ifndef USING_GCC_7
-  ASSERT((
+  ASSERT((std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1>,
-    std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1>,
+                         nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>>>));
                   nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>>>));
 #endif
-  ASSERT((
+  ASSERT((std::is_same_v<
-    std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1,
+          nvbench::enum_type_list<scoped_enum::val_1, scoped_enum::val_2, scoped_enum::val_3>,
-                                           scoped_enum::val_2,
+          nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>,
-                                           scoped_enum::val_3>,
+                             nvbench::enum_type<scoped_enum::val_2>,
-                   nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>,
+                             nvbench::enum_type<scoped_enum::val_3>>>));
                                      nvbench::enum_type<scoped_enum::val_2>,
                                      nvbench::enum_type<scoped_enum::val_3>>>));
 }
 void test_unscoped_enum()
 {
 #ifndef USING_GCC_7
-  ASSERT(
+  ASSERT((std::is_same_v<nvbench::enum_type_list<unscoped_val_1>,
-    (std::is_same_v<nvbench::enum_type_list<unscoped_val_1>,
+                         nvbench::type_list<nvbench::enum_type<unscoped_val_1>>>));
-                    nvbench::type_list<nvbench::enum_type<unscoped_val_1>>>));
+  ASSERT((std::is_same_v<nvbench::enum_type_list<unscoped_val_1, unscoped_val_2, unscoped_val_3>,
-  ASSERT(
+                         nvbench::type_list<nvbench::enum_type<unscoped_val_1>,
-    (std::is_same_v<
+                                            nvbench::enum_type<unscoped_val_2>,
-      nvbench::enum_type_list<unscoped_val_1, unscoped_val_2, unscoped_val_3>,
+                                            nvbench::enum_type<unscoped_val_3>>>));
      nvbench::type_list<nvbench::enum_type<unscoped_val_1>,
                         nvbench::enum_type<unscoped_val_2>,
                         nvbench::enum_type<unscoped_val_3>>>));
 #endif
 }
 void test_scoped_enum_type_strings()
 {
-  using values = nvbench::enum_type_list<scoped_enum::val_1,
+  using values =
-                                         scoped_enum::val_2,
+    nvbench::enum_type_list<scoped_enum::val_1, scoped_enum::val_2, scoped_enum::val_3>;
-                                         scoped_enum::val_3>;
+  using val_1 = nvbench::tl::get<0, values>;
-  using val_1  = nvbench::tl::get<0, values>;
+  using val_2 = nvbench::tl::get<1, values>;
-  using val_2  = nvbench::tl::get<1, values>;
+  using val_3 = nvbench::tl::get<2, values>;
  using val_3  = nvbench::tl::get<2, values>;
  ASSERT((nvbench::type_strings<val_1>::input_string() == "1"));
  ASSERT((nvbench::type_strings<val_1>::description() == "scoped_enum::val_1"));
  ASSERT((nvbench::type_strings<val_2>::input_string() == "2"));
--- a/testing/float64_axis.cu
+++ b/testing/float64_axis.cu
@@ -34,8 +34,7 @@ void test_empty()
  const auto clone_base = axis.clone();
  ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
+  const auto *clone = dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
    dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
  ASSERT(clone != nullptr);
  ASSERT(clone->get_name() == "Empty");
@@ -62,8 +61,7 @@ void test_basic()
  const auto clone_base = axis.clone();
  ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
+  const auto *clone = dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
    dynamic_cast<const nvbench::float64_axis *>(clone_base.get());
  ASSERT(clone != nullptr);
  ASSERT(clone->get_name() == "Basic");
--- a/testing/int64_axis.cu
+++ b/testing/int64_axis.cu
@@ -18,10 +18,10 @@
 #include <nvbench/int64_axis.cuh>
 #include "test_asserts.cuh"
 #include <fmt/format.h>
 #include "test_asserts.cuh"
 void test_empty()
 {
  nvbench::int64_axis axis("Empty");
@@ -36,8 +36,7 @@ void test_empty()
  const auto clone_base = axis.clone();
  ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
+  const auto *clone = dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
    dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
  ASSERT(clone != nullptr);
  ASSERT(clone->get_name() == "Empty");
@@ -66,8 +65,7 @@ void test_basic()
  const auto clone_base = axis.clone();
  ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
+  const auto *clone = dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
    dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
  ASSERT(clone != nullptr);
  ASSERT(clone->get_name() == "BasicAxis");
@@ -87,8 +85,7 @@ void test_basic()
 void test_power_of_two()
 {
  nvbench::int64_axis axis{"POTAxis"};
-  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4},
+  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4}, nvbench::int64_axis_flags::power_of_two);
                  nvbench::int64_axis_flags::power_of_two);
  const std::vector<nvbench::int64_t> ref_inputs{0, 1, 2, 3, 7, 6, 5, 4};
  const std::vector<nvbench::int64_t> ref_values{1, 2, 4, 8, 128, 64, 32, 16};
@@ -102,14 +99,12 @@ void test_power_of_two()
  for (size_t i = 0; i < 8; ++i)
  {
    ASSERT(axis.get_input_string(i) == fmt::to_string(ref_inputs[i]));
-    ASSERT(axis.get_description(i) ==
+    ASSERT(axis.get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
           fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
  }
  const auto clone_base = axis.clone();
  ASSERT(clone_base.get() != nullptr);
-  const auto *clone =
+  const auto *clone = dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
    dynamic_cast<const nvbench::int64_axis *>(clone_base.get());
  ASSERT(clone != nullptr);
  ASSERT(clone->get_name() == "POTAxis");
@@ -122,8 +117,7 @@ void test_power_of_two()
  for (size_t i = 0; i < 8; ++i)
  {
    ASSERT(clone->get_input_string(i) == fmt::to_string(ref_inputs[i]));
-    ASSERT(clone->get_description(i) ==
+    ASSERT(clone->get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
           fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
  }
 }
@@ -250,8 +244,7 @@ void test_update_none_to_pow2()
 void test_update_pow2_to_none()
 {
  nvbench::int64_axis axis{"TestAxis"};
-  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4},
+  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4}, nvbench::int64_axis_flags::power_of_two);
                  nvbench::int64_axis_flags::power_of_two);
  const std::vector<nvbench::int64_t> ref_inputs{0, 1, 2, 3, 7, 6, 5, 4};
  const std::vector<nvbench::int64_t> ref_values{1, 2, 4, 8, 128, 64, 32, 16};
@@ -304,8 +297,7 @@ void test_update_pow2_to_none()
  for (size_t i = 0; i < 8; ++i)
  {
    ASSERT(axis.get_input_string(i) == fmt::to_string(ref_inputs[i]));
-    ASSERT(axis.get_description(i) ==
+    ASSERT(axis.get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
           fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
  }
 }
@@ -313,8 +305,7 @@ void test_update_pow2_to_pow2()
 {
  nvbench::int64_axis axis{"TestAxis"};
-  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4},
+  axis.set_inputs({0, 1, 2, 3, 7, 6, 5, 4}, nvbench::int64_axis_flags::power_of_two);
                  nvbench::int64_axis_flags::power_of_two);
  const std::vector<nvbench::int64_t> ref_inputs{0, 1, 2, 3, 7, 6, 5, 4};
  const std::vector<nvbench::int64_t> ref_values{1, 2, 4, 8, 128, 64, 32, 16};
@@ -369,8 +360,7 @@ void test_update_pow2_to_pow2()
  for (size_t i = 0; i < 8; ++i)
  {
    ASSERT(axis.get_input_string(i) == fmt::to_string(ref_inputs[i]));
-    ASSERT(axis.get_description(i) ==
+    ASSERT(axis.get_description(i) == fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
           fmt::format("2^{} = {}", ref_inputs[i], ref_values[i]));
  }
 }
--- a/testing/named_values.cu
+++ b/testing/named_values.cu
@@ -18,10 +18,10 @@
 #include <nvbench/named_values.cuh>
 #include "test_asserts.cuh"
 #include <algorithm>
 #include "test_asserts.cuh"
 void test_empty()
 {
  nvbench::named_values vals;
--- a/testing/option_parser.cu
+++ b/testing/option_parser.cu
@@ -16,15 +16,14 @@
 *  limitations under the License.
 */
 #include <nvbench/option_parser.cuh>
 #include <nvbench/create.cuh>
 #include <nvbench/option_parser.cuh>
 #include <nvbench/type_list.cuh>
 #include "test_asserts.cuh"
 #include <fmt/format.h>
 #include "test_asserts.cuh"
 //==============================================================================
 // Declare a couple benchmarks for testing:
 void DummyBench(nvbench::state &state) { state.skip("Skipping for testing."); }
@@ -50,8 +49,7 @@ NVBENCH_BENCH_TYPES(TestBench, NVBENCH_TYPE_AXES(Ts, Us))
 namespace
 {
-[[nodiscard]] std::string
+[[nodiscard]] std::string states_to_string(const std::vector<nvbench::state> &states)
 states_to_string(const std::vector<nvbench::state> &states)
 {
  fmt::memory_buffer buffer;
  std::string table_format = "| {:^5} | {:^10} | {:^4} | {:^4} | {:^4} "
@@ -88,7 +86,7 @@ states_to_string(const std::vector<nvbench::state> &states)
 // Expects the parser to have a single TestBench benchmark. Runs the benchmark
 // and returns the resulting states.
-[[nodiscard]] const auto& parser_to_states(nvbench::option_parser &parser)
+[[nodiscard]] const auto &parser_to_states(nvbench::option_parser &parser)
 {
  const auto &benches = parser.get_benchmarks();
  ASSERT(benches.size() == 1);
@@ -267,8 +265,7 @@ void test_int64_axis_single()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 2 : 1 ] "});
      {"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 2 : 1 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -308,8 +305,7 @@ void test_int64_axis_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 , 7 ] "});
      {"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 , 7 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -323,8 +319,7 @@ void test_int64_axis_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 7 : 5 ] "});
      {"--benchmark", "TestBench", "--axis", " Ints [ ] = [ 2 : 7 : 5 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -369,8 +364,7 @@ void test_int64_axis_pow2_single()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 ] "});
      {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -384,8 +378,7 @@ void test_int64_axis_pow2_single()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 : 7 : 1 ] "});
      {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 7 : 7 : 1 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -425,8 +418,7 @@ void test_int64_axis_pow2_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 , 7 ] "});
      {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 , 7 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -440,8 +432,7 @@ void test_int64_axis_pow2_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 : 7 : 5 ] "});
      {"--benchmark", "TestBench", "--axis", " PO2s [ pow2 ] = [ 2 : 7 : 5 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -486,8 +477,7 @@ void test_int64_axis_none_to_pow2_single()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 ] "});
      {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -501,8 +491,7 @@ void test_int64_axis_none_to_pow2_single()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 : 7 : 1 ] "});
      {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 7 : 7 : 1 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -542,8 +531,7 @@ void test_int64_axis_none_to_pow2_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 , 7 ] "});
      {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 , 7 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -557,8 +545,7 @@ void test_int64_axis_none_to_pow2_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 : 7 : 5 ] "});
      {"--benchmark", "TestBench", "--axis", " Ints [ pow2 ] = [ 2 : 7 : 5 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -617,8 +604,7 @@ void test_int64_axis_pow2_to_none_single()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 2 : 1 ] "});
      {"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 2 : 1 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -658,8 +644,7 @@ void test_int64_axis_pow2_to_none_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 , 7 ] "});
      {"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 , 7 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -673,8 +658,7 @@ void test_int64_axis_pow2_to_none_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 7 : 5 ] "});
      {"--benchmark", "TestBench", "--axis", " PO2s [ ] = [ 2 : 7 : 5 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -719,8 +703,7 @@ void test_float64_axis_single()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 ] "});
      {"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -734,10 +717,7 @@ void test_float64_axis_single()
  {
    nvbench::option_parser parser;
-    parser.parse({"--benchmark",
+    parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 : 3.6 : 1 ] "});
                  "TestBench",
                  "--axis",
                  " Floats [ ] = [ 3.5 : 3.6 : 1 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -777,8 +757,7 @@ void test_float64_axis_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 , 4.1 ] "});
      {"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 , 4.1 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -792,18 +771,14 @@ void test_float64_axis_multi()
  {
    nvbench::option_parser parser;
-    parser.parse({"--benchmark",
+    parser.parse({"--benchmark", "TestBench", "--axis", " Floats [ ] = [ 3.5 : 4.2 : 0.6 ] "});
                  "TestBench",
                  "--axis",
                  " Floats [ ] = [ 3.5 : 4.2 : 0.6 ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", "Floats=[3.5:4.2:0.6]"});
      {"--benchmark", "TestBench", "--axis", "Floats=[3.5:4.2:0.6]"});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -827,8 +802,7 @@ void test_string_axis_single()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Strings [ ] = fo br "});
      {"--benchmark", "TestBench", "--axis", " Strings [ ] = fo br "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -842,8 +816,7 @@ void test_string_axis_single()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br ] "});
      {"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -883,8 +856,7 @@ void test_string_axis_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br , baz ] "});
      {"--benchmark", "TestBench", "--axis", " Strings [ ] = [ fo br , baz ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -951,8 +923,7 @@ void test_type_axis_multi()
  {
    nvbench::option_parser parser;
-    parser.parse(
+    parser.parse({"--benchmark", "TestBench", "--axis", " T [ ] = [ U8, void ] "});
      {"--benchmark", "TestBench", "--axis", " T [ ] = [ U8, void ] "});
    const auto test = parser_to_state_string(parser);
    ASSERT_MSG(test == ref, "Expected:\n\"{}\"\n\nActual:\n\"{}\"", ref, test);
  }
@@ -1177,9 +1148,8 @@ void test_axis_before_benchmark()
 void test_min_samples()
 {
  nvbench::option_parser parser;
-  parser.parse(
+  parser.parse({"--benchmark", "DummyBench", "--min-samples", "12345"});
-    {"--benchmark", "DummyBench", "--min-samples", "12345"});
+  const auto &states = parser_to_states(parser);
  const auto& states = parser_to_states(parser);
  ASSERT(states.size() == 1);
  ASSERT(states[0].get_min_samples() == 12345);
@@ -1188,9 +1158,8 @@ void test_min_samples()
 void test_min_time()
 {
  nvbench::option_parser parser;
-  parser.parse(
+  parser.parse({"--benchmark", "DummyBench", "--min-time", "12345e2"});
-    {"--benchmark", "DummyBench", "--min-time", "12345e2"});
+  const auto &states = parser_to_states(parser);
  const auto& states = parser_to_states(parser);
  ASSERT(states.size() == 1);
  ASSERT(std::abs(states[0].get_min_time() - 12345e2) < 1.);
@@ -1199,9 +1168,8 @@ void test_min_time()
 void test_max_noise()
 {
  nvbench::option_parser parser;
-  parser.parse(
+  parser.parse({"--benchmark", "DummyBench", "--max-noise", "50.3"});
-    {"--benchmark", "DummyBench", "--max-noise", "50.3"});
+  const auto &states = parser_to_states(parser);
  const auto& states = parser_to_states(parser);
  ASSERT(states.size() == 1);
  ASSERT(std::abs(states[0].get_max_noise() - 0.503) < 1.e-4);
@@ -1210,9 +1178,8 @@ void test_max_noise()
 void test_skip_time()
 {
  nvbench::option_parser parser;
-  parser.parse(
+  parser.parse({"--benchmark", "DummyBench", "--skip-time", "12345e2"});
-    {"--benchmark", "DummyBench", "--skip-time", "12345e2"});
+  const auto &states = parser_to_states(parser);
  const auto& states = parser_to_states(parser);
  ASSERT(states.size() == 1);
  ASSERT(std::abs(states[0].get_skip_time() - 12345e2) < 1.);
@@ -1221,9 +1188,8 @@ void test_skip_time()
 void test_timeout()
 {
  nvbench::option_parser parser;
-  parser.parse(
+  parser.parse({"--benchmark", "DummyBench", "--timeout", "12345e2"});
-    {"--benchmark", "DummyBench", "--timeout", "12345e2"});
+  const auto &states = parser_to_states(parser);
  const auto& states = parser_to_states(parser);
  ASSERT(states.size() == 1);
  ASSERT(std::abs(states[0].get_timeout() - 12345e2) < 1.);
@@ -1232,12 +1198,15 @@ void test_timeout()
 void test_stopping_criterion()
 {
  nvbench::option_parser parser;
-  parser.parse(
+  parser.parse({"--benchmark",
-    {"--benchmark", "DummyBench", 
+                "DummyBench",
-     "--stopping-criterion", "entropy",
+                "--stopping-criterion",
-     "--max-angle", "0.42",
+                "entropy",
-     "--min-r2", "0.6"});
+                "--max-angle",
-  const auto& states = parser_to_states(parser);
+                "0.42",
                "--min-r2",
                "0.6"});
  const auto &states = parser_to_states(parser);
  ASSERT(states.size() == 1);
  ASSERT(states[0].get_stopping_criterion() == "entropy");
--- a/testing/range.cu
+++ b/testing/range.cu
@@ -22,12 +22,9 @@
 void test_basic()
 {
-  ASSERT((nvbench::range(0, 6) ==
+  ASSERT((nvbench::range(0, 6) == std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
-          std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
+  ASSERT((nvbench::range(0, 6, 1) == std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
-  ASSERT((nvbench::range(0, 6, 1) ==
+  ASSERT((nvbench::range(0, 6, 2) == std::vector<nvbench::int64_t>{0, 2, 4, 6}));
          std::vector<nvbench::int64_t>{0, 1, 2, 3, 4, 5, 6}));
  ASSERT(
    (nvbench::range(0, 6, 2) == std::vector<nvbench::int64_t>{0, 2, 4, 6}));
  ASSERT((nvbench::range(0, 6, 3) == std::vector<nvbench::int64_t>{0, 3, 6}));
  ASSERT((nvbench::range(0, 6, 4) == std::vector<nvbench::int64_t>{0, 4}));
  ASSERT((nvbench::range(0, 6, 5) == std::vector<nvbench::int64_t>{0, 5}));
@@ -37,26 +34,19 @@ void test_basic()
 void test_result_type()
 {
  // All ints should turn into int64 by default:
-  ASSERT((std::is_same_v<decltype(nvbench::range(0ll, 1ll)),
+  ASSERT((std::is_same_v<decltype(nvbench::range(0ll, 1ll)), std::vector<nvbench::int64_t>>));
-                         std::vector<nvbench::int64_t>>));
+  ASSERT((std::is_same_v<decltype(nvbench::range(0, 1)), std::vector<nvbench::int64_t>>));
-  ASSERT((std::is_same_v<decltype(nvbench::range(0, 1)),
+  ASSERT((std::is_same_v<decltype(nvbench::range(0u, 1u)), std::vector<nvbench::int64_t>>));
                         std::vector<nvbench::int64_t>>));
  ASSERT((std::is_same_v<decltype(nvbench::range(0u, 1u)),
                         std::vector<nvbench::int64_t>>));
  // All floats should turn into float64 by default:
-  ASSERT((std::is_same_v<decltype(nvbench::range(0., 1.)),
+  ASSERT((std::is_same_v<decltype(nvbench::range(0., 1.)), std::vector<nvbench::float64_t>>));
-                         std::vector<nvbench::float64_t>>));
+  ASSERT((std::is_same_v<decltype(nvbench::range(0.f, 1.f)), std::vector<nvbench::float64_t>>));
  ASSERT((std::is_same_v<decltype(nvbench::range(0.f, 1.f)),
                         std::vector<nvbench::float64_t>>));
  // Other types may be explicitly specified:
-  ASSERT((std::is_same_v<decltype(nvbench::range<nvbench::float32_t,
+  ASSERT((std::is_same_v<decltype(nvbench::range<nvbench::float32_t, nvbench::float32_t>(0.f, 1.f)),
                                                 nvbench::float32_t>(0.f, 1.f)),
                         std::vector<nvbench::float32_t>>));
-  ASSERT((std::is_same_v<
+  ASSERT((std::is_same_v<decltype(nvbench::range<nvbench::int32_t, nvbench::int32_t>(0, 1)),
-          decltype(nvbench::range<nvbench::int32_t, nvbench::int32_t>(0, 1)),
+                         std::vector<nvbench::int32_t>>));
          std::vector<nvbench::int32_t>>));
 }
 void test_fp_tolerance()
@@ -68,10 +58,8 @@ void test_fp_tolerance()
  const nvbench::float32_t stride = 1e-4f;
  for (std::size_t size = 1; size < 1024; ++size)
  {
-    const nvbench::float32_t end =
+    const nvbench::float32_t end = start + stride * static_cast<nvbench::float32_t>(size - 1);
-      start + stride * static_cast<nvbench::float32_t>(size - 1);
+    ASSERT_MSG(nvbench::range(start, end, stride).size() == size, "size={}", size);
    ASSERT_MSG(nvbench::range(start, end, stride).size() == size,
               "size={}", size);
  }
 }
--- a/testing/reset_error.cu
+++ b/testing/reset_error.cu
@@ -2,18 +2,17 @@
 #include "test_asserts.cuh"
 namespace
 {
-    __global__ void multiply5(const int32_t* __restrict__ a, int32_t* __restrict__ b)
+__global__ void multiply5(const int32_t *__restrict__ a, int32_t *__restrict__ b)
-    {
+{
-      const auto id = blockIdx.x * blockDim.x + threadIdx.x;
+  const auto id = blockIdx.x * blockDim.x + threadIdx.x;
-      b[id] = 5 * a[id];
+  b[id]         = 5 * a[id];
    }
 }
 } // namespace
 int main()
-{ 
+{
  multiply5<<<256, 256>>>(nullptr, nullptr);
  try
--- a/testing/ring_buffer.cu
+++ b/testing/ring_buffer.cu
@@ -18,14 +18,13 @@
 #include <nvbench/detail/ring_buffer.cuh>
 #include "test_asserts.cuh"
 #include <algorithm>
 #include <vector>
 #include "test_asserts.cuh"
 template <typename T>
-bool equal(const nvbench::detail::ring_buffer<T> &buffer,
+bool equal(const nvbench::detail::ring_buffer<T> &buffer, const std::vector<T> &reference)
           const std::vector<T> &reference)
 {
  return std::equal(buffer.begin(), buffer.end(), reference.begin());
 }
--- a/Show More
+++ b/Show More