Merge remote-tracking branch 'origin/main' into fea/axes_iteration_space

This commit is contained in:
Allison Piper
2025-05-01 10:42:35 -04:00
234 changed files with 10357 additions and 12278 deletions

View File

@@ -36,9 +36,33 @@ BreakBeforeBinaryOperators: None
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 80
ColumnLimit: 100
CompactNamespaces: false
ContinuationIndentWidth: 2
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<nvbench'
Priority: 1
- Regex: '^<cub'
Priority: 2
- Regex: '^<thrust'
Priority: 3
- Regex: '^<cuda/'
Priority: 4
- Regex: '^<cuda'
Priority: 5
- Regex: '^<nvml'
Priority: 6
- Regex: '^<cupti'
Priority: 7
- Regex: '^<nvperf'
Priority: 8
- Regex: '^<nlohmann'
Priority: 9
- Regex: '^<fmt'
Priority: 10
- Regex: '^<[a-z_]*>$'
Priority: 11
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 2
@@ -55,7 +79,7 @@ PenaltyExcessCharacter: 100
PenaltyReturnTypeOnItsOwnLine: 90
PointerAlignment: Right
ReflowComments: true
SortIncludes: true
SortIncludes: CaseInsensitive
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true

62
.clangd Normal file
View File

@@ -0,0 +1,62 @@
# https://clangd.llvm.org/config
# Apply a config conditionally to all C files
If:
PathMatch: .*\.(c|h)$
---
# Apply a config conditionally to all C++ files
If:
PathMatch: .*\.(c|h)pp
---
# Apply a config conditionally to all CUDA files
If:
PathMatch: .*\.cuh?
CompileFlags:
Add:
# Allow variadic CUDA functions
- "-Xclang=-fcuda-allow-variadic-functions"
---
# Tweak the clangd parse settings for all files
CompileFlags:
Compiler: clang++
CompilationDatabase: .
Add:
- -x
- cuda
# report all errors
- "-ferror-limit=0"
- "-ftemplate-backtrace-limit=0"
- "-std=c++17"
Remove:
# strip CUDA fatbin args
- "-Xfatbin*"
- "-Xcompiler*"
- "-Xcudafe*"
- "-rdc=*"
- "-gpu=*"
- "--diag_suppress*"
# strip CUDA arch flags
- "-gencode*"
- "--generate-code*"
# strip gcc's -fcoroutines
- -fcoroutines
# strip CUDA flags unknown to clang
- "-ccbin*"
- "--compiler-options*"
- "--expt-extended-lambda"
- "--expt-relaxed-constexpr"
- "-forward-unknown-to-host-compiler"
- "-Werror=cross-execution-space-call"
Diagnostics:
Suppress:
- "variadic_device_fn"
- "attributes_not_allowed"
# The NVHPC version of _NVCXX_EXPAND_PACK macro triggers this clang error.
# Temporarily suppressing it, but should probably fix
- "template_param_shadow"

198
.devcontainer/README.md Normal file
View File

@@ -0,0 +1,198 @@
> **Note**
> The instructions in this README are specific to Linux development environments. Instructions for Windows are coming soon!
[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)
# CCCL Dev Containers
CCCL uses [Development Containers](https://containers.dev/) to provide consistent and convenient development environments for both local development and for CI. This guide covers setup in [Visual Studio Code](#quickstart-vscode-recommended) and [Docker](#quickstart-docker-manual-approach). The guide also provides additional instructions in case you want to use WSL.
## Table of Contents
1. [Quickstart: VSCode (Recommended)](#vscode)
2. [Quickstart: Docker (Manual Approach)](#docker)
3. [Quickstart: Using WSL](#wsl)
## Quickstart: VSCode (Recommended) <a name="vscode"></a>
### Prerequisites
- [Visual Studio Code](https://code.visualstudio.com/)
- [Remote - Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
- [Docker](https://docs.docker.com/engine/install/) - This is only for completeness because it should already be implicitly installed by the Dev Containers extension
### Steps
1. Clone the Repository
```bash
git clone https://github.com/nvidia/cccl.git
```
2. Open the cloned directory in VSCode
3. Launch a Dev Container by clicking the prompt suggesting to "Reopen in Container"
![Shows "Reopen in Container" prompt when opening the cccl directory in VScode.](./img/reopen_in_container.png)
- Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. Type "Remote-Containers: Reopen in Container" and select it.
![Shows "Reopen in Container" in command palette.](./img/open_in_container_manual.png)
4. Select an environment with the desired CTK and host compiler from the list:
![Shows list of available container environments.](./img/container_list.png)
5. VSCode will initialize the selected Dev Container. This can take a few minutes the first time.
6. Once initialized, the local `cccl/` directory is mirrored into the container to ensure any changes are persistent.
7. Done! See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.
### (Optional) Authenticate with GitHub for `sccache`
After starting the container, there will be a prompt to authenticate with GitHub. This grants access to a [`sccache`](https://github.com/mozilla/sccache) server shared with CI and greatly accelerates local build times. This is currently limited to NVIDIA employees belonging to the `NVIDIA` or `rapidsai` GitHub organizations.
Without authentication to the remote server, `sccache` will still accelerate local builds by using a filesystem cache.
Follow the instructions in the prompt as below and enter the one-time code at https://github.com/login/device
![Shows authentication with GitHub to access sccache bucket.](./img/github_auth.png)
To manually trigger this authentication, execute the `devcontainer-utils-vault-s3-init` script within the container.
For more information about the sccache configuration and authentication, see the documentation at [`rapidsai/devcontainers`](https://github.com/rapidsai/devcontainers/blob/branch-23.10/USAGE.md#build-caching-with-sccache).
## Quickstart: Docker (Manual Approach) <a name="docker"></a>
### Prerequisites
- [Docker](https://docs.docker.com/desktop/install/linux-install/)
### Steps
1. Clone the repository and use the [`launch.sh`](./launch.sh) script to launch the default container environment
```bash
git clone https://github.com/nvidia/cccl.git
cd cccl
./.devcontainer/launch.sh --docker
```
This script starts an interactive shell as the `coder` user inside the container with the local `cccl/` directory mirrored into `/home/coder/cccl`.
For specific environments, use the `--cuda` and `--host` options:
```bash
./.devcontainer/launch.sh --docker --cuda 12.2 --host gcc10
```
See `./.devcontainer/launch.sh --help` for more information.
2. Done. See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.
## Available Environments
CCCL provides environments for both the oldest and newest supported CUDA versions with all compatible host compilers.
Look in the [`.devcontainer/`](.) directory to see the available configurations. The top-level [`devcontainer.json`](./devcontainer.json) serves as the default environment. All `devcontainer.json` files in the `cuda<CTK_VERSION>-<HOST-COMPILER>` sub-directories are variations on this top-level file, with different base images for the different CUDA and host compiler versions.
## VSCode Customization
By default, CCCL's Dev Containers come with certain VSCode settings and extensions configured by default, as can be seen in the [`devcontainer.json`](./devcontainer.json) file. This can be further customized by users without needing to modify the `devcontainer.json` file directly.
For extensions, the [`dev.containers.defaultExtensions` setting](https://code.visualstudio.com/docs/devcontainers/containers#_always-installed-extensions) allows listing extensions that will always be installed.
For more general customizations, VSCode allows using a dotfile repository. See the [VSCode documentation](https://code.visualstudio.com/docs/devcontainers/containers#_personalizing-with-dotfile-repositories) for more information.
## GitHub Codespaces
[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)
One of the benefits of Dev Containers is that they integrate natively with [GitHub Codespaces](https://github.com/features/codespaces). Codespaces provide a VSCode development environment right in your browser running on a machine in the cloud. This provides a truly one-click, turnkey development environment where you can develop, build, and test with no other setup required.
Click the badge above or [click here](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) to get started with CCCL's Dev Containers on Codespaces. This will start the default Dev Container environment. [Click here](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=296416761&skip_quickstart=true) to start a Codespace with a particular environment and hardware configuration as shown:
![Shows configuring a Codespace with a custom environment](../docs/images/codespaces.png)
## For Maintainers: The `make_devcontainers.sh` Script
### Overview
[`make_devcontainers.sh`](./make_devcontainers.sh) generates devcontainer configurations for the unique combinations of CUDA Toolkit (CTK) versions and host compilers in [`ci/matrix.yaml`](../ci/matrix.yaml).
### How It Works:
1. Parses the matrix from `ci/matrix.yaml`.
2. Uses the top-level [`.devcontainer/devcontainer.json`](./devcontainer.json) as a template. For each unique combination of CTK version and host compiler, it generates a corresponding `devcontainer.json` configuration, adjusting only the base Docker image to match the desired environment.
3. Places the generated configurations in the `.devcontainer` directory, organizing them into subdirectories following the naming convention `cuda<CTK_VERSION>-<COMPILER_VERSION>`.
For more information, see the `.devcontainer/make_devcontainers.sh --help` message.
**Note**: When adding or updating supported environments, modify `matrix.yaml` and then rerun this script to synchronize the `devcontainer` configurations.
## Quickstart: Using WSL <a name="wsl"></a>
> [!NOTE]
> _Make sure you have the Nvidia driver installed on your Windows host before moving further_. Type in `nvidia-smi` for verification.
### Install WSL on your Windows host
> [!WARNING]
> Disclaimer: This guide was developed for WSL 2 on Windows 11.
1. Launch a Windows terminal (_e.g. Powershell_) as an administrator.
2. Install WSL 2 by running:
```bash
wsl --install
```
This installs the Ubuntu distribution by default.
3. Restart your computer and run `wsl -l -v` on a Windows terminal to verify installation.
<h3 id="prereqs"> Install prerequisites and VS Code extensions</h3>
4. Launch your WSL/Ubuntu terminal by running `wsl` in Powershell.
5. Install the [WSL extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-wsl) on VS Code.
- `Ctrl + Shift + P` and select `WSL: Connect to WSL` (it will prompt you to install the WSL extension).
- Make sure you are connected to WSL with VS Code by checking the bottom left corner of the VS Code window (should indicate "WSL: Ubuntu" in our case).
6. Install the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) on VS Code.
- In a vanilla system you should be prompted to install `Docker` at this point, accept it. If it hangs you might have to restart VS Code after that.
7. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). **Make sure you install the WSL 2 version and not the native Linux one**. This builds on top of Docker so make sure you have Docker properly installed (run `docker --version`).
8. Open `/etc/docker/daemon.json` from within your WSL system (if the file does not exist, create it) and add the following:
```json
{
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
}
}
```
then run `sudo systemctl restart docker.service`.
---
### Build CCCL in WSL using Dev Containers
9. Still on your WSL terminal run `git clone https://github.com/NVIDIA/cccl.git`
10. Open the CCCL cloned repo in VS Code ( `Ctrl + Shift + P `, select `File: Open Folder...` and select the path where your CCCL clone is located).
11. If prompted, choose `Reopen in Container`.
- If you are not prompted just type `Ctrl + Shift + P` and `Dev Containers: Open Folder in Container ...`.
12. Verify that Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal. For a proper configuration it is important for the steps in [Install prerequisites and VS Code extensions](#prereqs) to be followed in a precise order.
From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration:
13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message.
> Failed opening a web browser at https://github.com/login/device
exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH
Please try entering the URL in your browser manually
In that case type in the address manually in your web browser https://github.com/login/device and fill in the one-time code.

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc10-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc10",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "10",
"CCCL_BUILD_INFIX": "cuda12.0-gcc10"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc10"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc11-cuda12.0-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc11",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "11",
"CCCL_BUILD_INFIX": "cuda12.0-gcc11"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc11"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc12-cuda12.0-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc12",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "12",
"CCCL_BUILD_INFIX": "cuda12.0-gcc12"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc12"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc7-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc7",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "7",
"CCCL_BUILD_INFIX": "cuda12.0-gcc7"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc7"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc8-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc8",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "8",
"CCCL_BUILD_INFIX": "cuda12.0-gcc8"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc8"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc9-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc9",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "9",
"CCCL_BUILD_INFIX": "cuda12.0-gcc9"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc9"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm14-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-llvm14",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "14",
"CCCL_BUILD_INFIX": "cuda12.0-llvm14"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-llvm14"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc10-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc10",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "10",
"CCCL_BUILD_INFIX": "cuda12.8-gcc10"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc10"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc11-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc11",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "11",
"CCCL_BUILD_INFIX": "cuda12.8-gcc11"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc11"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc12-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc12",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "12",
"CCCL_BUILD_INFIX": "cuda12.8-gcc12"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc12"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc13-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc13",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "13",
"CCCL_BUILD_INFIX": "cuda12.8-gcc13"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc13"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc14-cuda12.8-ubuntu24.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc14",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "14",
"CCCL_BUILD_INFIX": "cuda12.8-gcc14"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc14"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc7-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc7",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "7",
"CCCL_BUILD_INFIX": "cuda12.8-gcc7"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc7"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc8-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc8",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "8",
"CCCL_BUILD_INFIX": "cuda12.8-gcc8"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc8"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc9-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc9",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "9",
"CCCL_BUILD_INFIX": "cuda12.8-gcc9"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc9"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm14-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm14",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "14",
"CCCL_BUILD_INFIX": "cuda12.8-llvm14"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm14"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm15-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm15",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "15",
"CCCL_BUILD_INFIX": "cuda12.8-llvm15"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm15"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm16-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm16",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "16",
"CCCL_BUILD_INFIX": "cuda12.8-llvm16"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm16"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm17-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm17",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "17",
"CCCL_BUILD_INFIX": "cuda12.8-llvm17"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm17"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm18-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm18",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "18",
"CCCL_BUILD_INFIX": "cuda12.8-llvm18"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm18"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm19-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm19",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "19",
"CCCL_BUILD_INFIX": "cuda12.8-llvm19"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm19"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc14-cuda12.8-ubuntu24.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc14",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "14",
"CCCL_BUILD_INFIX": "cuda12.8-gcc14"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc14"
}

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Maybe change the UID/GID of the container's non-root user to match the host's UID/GID
#
# Inputs (environment):
#   REMOTE_USER - container account to remap (defaults to "coder")
#   NEW_UID / NEW_GID - desired host UID/GID (provided by the launcher)
# In every case control is finally handed to .devcontainer/nvbench-entrypoint.sh
# with the original arguments via exec.
: "${REMOTE_USER:="coder"}";
: "${OLD_UID:=}";
: "${OLD_GID:=}";
: "${NEW_UID:=}";
: "${NEW_GID:=}";
# Scrape the remote user's current UID, GID and home directory out of /etc/passwd.
eval "$(sed -n "s/${REMOTE_USER}:[^:]*:\([^:]*\):\([^:]*\):[^:]*:\([^:]*\).*/OLD_UID=\1;OLD_GID=\2;HOME_FOLDER=\3/p" /etc/passwd)";
# Find any existing account/group that already owns the requested UID/GID.
eval "$(sed -n "s/\([^:]*\):[^:]*:${NEW_UID}:.*/EXISTING_USER=\1/p" /etc/passwd)";
eval "$(sed -n "s/\([^:]*\):[^:]*:${NEW_GID}:.*/EXISTING_GROUP=\1/p" /etc/group)";
if [ -z "$OLD_UID" ]; then
  # Nothing to remap: the user does not exist in this image.
  echo "Remote user not found in /etc/passwd ($REMOTE_USER).";
  exec "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
elif [ "$OLD_UID" = "$NEW_UID" ] && [ "$OLD_GID" = "$NEW_GID" ]; then
  # Already matching; no work to do.
  echo "UIDs and GIDs are the same ($NEW_UID:$NEW_GID).";
  exec "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
elif [ "$OLD_UID" != "$NEW_UID" ] && [ -n "$EXISTING_USER" ]; then
  # Refuse to take over a UID that a different account already owns.
  echo "User with UID exists ($EXISTING_USER=$NEW_UID).";
  exec "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
else
  if [ "$OLD_GID" != "$NEW_GID" ] && [ -n "$EXISTING_GROUP" ]; then
    # Requested GID is taken by another group; keep the user's current group.
    echo "Group with GID exists ($EXISTING_GROUP=$NEW_GID).";
    NEW_GID="$OLD_GID";
  fi
  echo "Updating UID:GID from $OLD_UID:$OLD_GID to $NEW_UID:$NEW_GID.";
  # Rewrite the UID:GID fields of the user's /etc/passwd entry in place.
  sed -i -e "s/\(${REMOTE_USER}:[^:]*:\)[^:]*:[^:]*/\1${NEW_UID}:${NEW_GID}/" /etc/passwd;
  if [ "$OLD_GID" != "$NEW_GID" ]; then
    # Also retarget the group's GID in /etc/group.
    sed -i -e "s/\([^:]*:[^:]*:\)${OLD_GID}:/\1${NEW_GID}:/" /etc/group;
  fi
  # Fast parallel `chown -R`
  find "$HOME_FOLDER/" -not -user "$REMOTE_USER" -print0 \
    | xargs -0 -r -n1 -P"$(nproc --all)" chown "$NEW_UID:$NEW_GID"
  # Run the container command as $REMOTE_USER, preserving the container startup environment.
  #
  # We cannot use `su -w` because that's not supported by the `su` in Ubuntu18.04, so we reset the following
  # environment variables to the expected values, then pass through everything else from the startup environment.
  export HOME="$HOME_FOLDER";
  export XDG_CACHE_HOME="$HOME_FOLDER/.cache";
  export XDG_CONFIG_HOME="$HOME_FOLDER/.config";
  export XDG_STATE_HOME="$HOME_FOLDER/.local/state";
  export PYTHONHISTFILE="$HOME_FOLDER/.local/state/.python_history";
  exec su -p "$REMOTE_USER" -- "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
fi

Binary file not shown.

After

Width:  |  Height:  |  Size: 156 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

306
.devcontainer/launch.sh Executable file
View File

@@ -0,0 +1,306 @@
#!/usr/bin/env bash
set -euo pipefail
# Ensure the script is being executed in the nvbench/ root
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/..";
# Print CLI usage for this launcher to stdout ($0 is the invoked script path).
print_help() {
  echo "Usage: $0 [-c|--cuda <CUDA version>] [-H|--host <Host compiler>] [-d|--docker]"
  echo "Launch a development container. If no CUDA version or Host compiler are specified,"
  echo "the top-level devcontainer in .devcontainer/devcontainer.json will be used."
  echo ""
  echo "Options:"
  echo " -c, --cuda Specify the CUDA version. E.g., 12.2"
  echo " -H, --host Specify the host compiler. E.g., gcc12"
  echo " -d, --docker Launch the development environment in Docker directly without using VSCode."
  echo " --gpus gpu-request GPU devices to add to the container ('all' to pass all GPUs)."
  echo " -e, --env list Set additional container environment variables."
  echo " -v, --volume list Bind mount a volume."
  echo " -h, --help Display this help message and exit."
}
# Assign variable one scope above the caller
# Usage: local "$1" && _upvar $1 "value(s)"
# Param: $1 Variable name to assign value to
# Param: $* Value(s) to assign. If multiple values, an array is
# assigned, otherwise a single value is assigned.
# See: http://fvue.nl/wiki/Bash:_Passing_variables_by_reference
_upvar() {
  # `unset -v` removes the current scope's copy of the variable so the eval
  # below assigns in the enclosing (caller's caller) scope — the standard
  # bash "upvar" trick; see the fvue.nl reference above.
  if unset -v "$1"; then
    if (( $# == 2 )); then
      # Single value: scalar assignment.
      eval $1=\"\$2\";
    else
      # Multiple values: assign as an array.
      eval $1=\(\"\${@:2}\"\);
    fi;
  fi
}
# Parse the launcher's command-line options into the globals
# cuda_version, host_compiler, gpu_request, docker_mode, env_vars, volumes.
# The LAST argument is the NAME of an array variable; any positional
# (non-option) arguments are returned to the caller through it via _upvar.
parse_options() {
  local -;
  set -euo pipefail;

  # Read the name of the variable in which to return unparsed arguments
  local UNPARSED="${!#}";
  # Splice the unparsed arguments variable name from the arguments list
  set -- "${@:1:$#-1}";

  local OPTIONS=c:e:H:dhv
  local LONG_OPTIONS=cuda:,env:,host:,gpus:,volume:,docker,help

  # Declare and assign separately: `local VAR=$(cmd)` masks cmd's exit status
  # with local's (always 0), so getopt parse failures were previously never
  # detected by the `$?` check that followed (shellcheck SC2155/SC2181).
  local PARSED_OPTIONS
  if ! PARSED_OPTIONS=$(getopt -n "$0" -o "${OPTIONS}" --long "${LONG_OPTIONS}" -- "$@"); then
    # getopt has already printed a diagnostic naming the offending option.
    exit 1
  fi

  eval set -- "${PARSED_OPTIONS}"

  while true; do
    case "$1" in
      -c|--cuda)
        cuda_version="$2"
        shift 2
        ;;
      -e|--env)
        # Keep the flag itself ("-e"/"--env") — the pair is later forwarded
        # verbatim to `docker run`, which accepts either spelling.
        env_vars+=("$1" "$2")
        shift 2
        ;;
      -H|--host)
        host_compiler="$2"
        shift 2
        ;;
      --gpus)
        gpu_request="$2"
        shift 2
        ;;
      -d|--docker)
        docker_mode=true
        shift
        ;;
      -h|--help)
        print_help
        exit 0
        ;;
      -v|--volume)
        # Keep the flag itself ("-v"/"--volume") — forwarded to `docker run`.
        volumes+=("$1" "$2")
        shift 2
        ;;
      --)
        shift
        # Everything after `--` is returned through the caller's variable.
        _upvar "${UNPARSED}" "${@}"
        break
        ;;
      *)
        echo "Invalid option: $1"
        print_help
        exit 1
        ;;
    esac
  done
}
# shellcheck disable=SC2155
launch_docker() {
  local -;
  set -euo pipefail
  # Replace devcontainer.json "${...}" substitution variables on stdin with
  # concrete values for this checkout.
  inline_vars() {
    cat - \
      `# inline local workspace folder` \
      | sed "s@\${localWorkspaceFolder}@$(pwd)@g" \
      `# inline local workspace folder basename` \
      | sed "s@\${localWorkspaceFolderBasename}@$(basename "$(pwd)")@g" \
      `# inline container workspace folder` \
      | sed "s@\${containerWorkspaceFolder}@${WORKSPACE_FOLDER:-}@g" \
      `# inline container workspace folder basename` \
      | sed "s@\${containerWorkspaceFolderBasename}@$(basename "${WORKSPACE_FOLDER:-}")@g" \
      `# translate local envvars to shell syntax` \
      | sed -r 's/\$\{localEnv:([^\:]*):?(.*)\}/${\1:-\2}/g'
  }
  # Turn a list of quoted JSON keys into a python subscript chain,
  # e.g. '"a"' '"b"' -> ["a"]["b"]
  args_to_path() {
    local -a keys=("${@}")
    keys=("${keys[@]/#/[}")
    keys=("${keys[@]/%/]}")
    echo "$(IFS=; echo "${keys[*]}")"
  }
  # Print the scalar at the given key path of the JSON document on stdin.
  json_string() {
    python3 -c "import json,sys; print(json.load(sys.stdin)$(args_to_path "${@}"))" 2>/dev/null | inline_vars
  }
  # Print each element of the JSON array at the given key path, quoted, one per line.
  json_array() {
    python3 -c "import json,sys; [print(f'\"{x}\"') for x in json.load(sys.stdin)$(args_to_path "${@}")]" 2>/dev/null | inline_vars
  }
  # Print each key="value" pair of the JSON object at the given key path, one per line.
  json_map() {
    python3 -c "import json,sys; [print(f'{k}=\"{v}\"') for k,v in json.load(sys.stdin)$(args_to_path "${@}").items()]" 2>/dev/null | inline_vars
  }
  # Read the "devcontainer.metadata" label baked into the Docker image.
  devcontainer_metadata_json() {
    docker inspect --type image --format '{{json .Config.Labels}}' "$DOCKER_IMAGE" \
      | json_string '"devcontainer.metadata"'
  }
  ###
  # Read relevant values from devcontainer.json
  ###
  # NOTE(review): relies on the global `path` set by main() to locate the config.
  local devcontainer_json="${path}/devcontainer.json";
  # Read image
  local DOCKER_IMAGE="$(json_string '"image"' < "${devcontainer_json}")"
  # Always pull the latest copy of the image
  docker pull "$DOCKER_IMAGE"
  # Read workspaceFolder
  local WORKSPACE_FOLDER="$(json_string '"workspaceFolder"' < "${devcontainer_json}")"
  # Read remoteUser
  local REMOTE_USER="$(json_string '"remoteUser"' < "${devcontainer_json}")"
  # If remoteUser isn't in our devcontainer.json, read it from the image's "devcontainer.metadata" label
  if test -z "${REMOTE_USER:-}"; then
    REMOTE_USER="$(devcontainer_metadata_json | json_string "-1" '"remoteUser"')"
  fi
  # Read runArgs
  local -a RUN_ARGS="($(json_array '"runArgs"' < "${devcontainer_json}"))"
  # Read initializeCommand
  local -a INITIALIZE_COMMAND="($(json_array '"initializeCommand"' < "${devcontainer_json}"))"
  # Read containerEnv, rewriting each KEY=VAL pair into `--env KEY=VAL`.
  local -a ENV_VARS="($(json_map '"containerEnv"' < "${devcontainer_json}" | sed -r 's/(.*)=(.*)/--env \1=\2/'))"
  # Read mounts: both the "mounts" array and the single "workspaceMount",
  # each prefixed with `--mount` for docker run.
  local -a MOUNTS="($(
    tee < "${devcontainer_json}" \
      1>/dev/null \
      >(json_array '"mounts"') \
      >(json_string '"workspaceMount"') \
    | xargs -r -I% echo --mount '%'
  ))"
  ###
  # Update run arguments and container environment variables
  ###
  # Only pass `-it` if the shell is a tty
  if ! ${CI:-'false'} && tty >/dev/null 2>&1 && (exec </dev/tty); then
    RUN_ARGS+=("-it")
  fi
  # Ensure --rm and --init are present without duplicating them.
  for flag in rm init; do
    if [[ " ${RUN_ARGS[*]} " != *" --${flag} "* ]]; then
      RUN_ARGS+=("--${flag}")
    fi
  done
  # Prefer the user-provided --gpus argument
  if test -n "${gpu_request:-}"; then
    RUN_ARGS+=(--gpus "${gpu_request}")
  else
    # Otherwise read and infer from hostRequirements.gpu
    local GPU_REQUEST="$(json_string '"hostRequirements"' '"gpu"' < "${devcontainer_json}")"
    if test "${GPU_REQUEST:-false}" = true; then
      RUN_ARGS+=(--gpus all)
    elif test "${GPU_REQUEST:-false}" = optional && \
         command -v nvidia-container-runtime >/dev/null 2>&1; then
      RUN_ARGS+=(--gpus all)
    fi
  fi
  RUN_ARGS+=(--workdir "${WORKSPACE_FOLDER:-/home/coder/nvbench}")
  # Start as root and let docker-entrypoint.sh remap the container user's
  # UID/GID to the host user's before handing off to the real entrypoint.
  if test -n "${REMOTE_USER:-}"; then
    ENV_VARS+=(--env NEW_UID="$(id -u)")
    ENV_VARS+=(--env NEW_GID="$(id -g)")
    ENV_VARS+=(--env REMOTE_USER="$REMOTE_USER")
    RUN_ARGS+=(-u root:root)
    RUN_ARGS+=(--entrypoint "${WORKSPACE_FOLDER:-/home/coder/nvbench}/.devcontainer/docker-entrypoint.sh")
  fi
  # Forward the host's SSH agent socket into the container.
  if test -n "${SSH_AUTH_SOCK:-}"; then
    ENV_VARS+=(--env "SSH_AUTH_SOCK=/tmp/ssh-auth-sock")
    MOUNTS+=(--mount "source=${SSH_AUTH_SOCK},target=/tmp/ssh-auth-sock,type=bind")
  fi
  # Append user-provided volumes
  if test -v volumes && test ${#volumes[@]} -gt 0; then
    MOUNTS+=("${volumes[@]}")
  fi
  # Append user-provided envvars
  if test -v env_vars && test ${#env_vars[@]} -gt 0; then
    ENV_VARS+=("${env_vars[@]}")
  fi
  # Run the initialize command before starting the container
  if test "${#INITIALIZE_COMMAND[@]}" -gt 0; then
    eval "${INITIALIZE_COMMAND[*]@Q}"
  fi
  # Replace this process with the container run.
  exec docker run \
    "${RUN_ARGS[@]}" \
    "${ENV_VARS[@]}" \
    "${MOUNTS[@]}" \
    "${DOCKER_IMAGE}" \
    "$@"
}
launch_vscode() {
  local -;
  set -euo pipefail;
  # Since Visual Studio Code allows only one instance per `devcontainer.json`,
  # this code prepares a unique temporary directory structure for each launch of a devcontainer.
  # By doing so, it ensures that multiple instances of the same environment can be run
  # simultaneously. The script replicates the `devcontainer.json` from the desired CUDA
  # and compiler environment into this temporary directory, adjusting paths to ensure the
  # correct workspace is loaded. A special URL is then generated to instruct VSCode to
  # launch the development container using this temporary configuration.
  #
  # NOTE(review): reads the global `path` set by main() to locate the source
  # devcontainer.json.
  local workspace="$(basename "$(pwd)")"
  local tmpdir="$(mktemp -d)/${workspace}"
  mkdir -p "${tmpdir}"
  mkdir -p "${tmpdir}/.devcontainer"
  cp -arL "${path}/devcontainer.json" "${tmpdir}/.devcontainer"
  # Inline this checkout's absolute path in place of ${localWorkspaceFolder}.
  sed -i "s@\\${localWorkspaceFolder}@$(pwd)@g" "${tmpdir}/.devcontainer/devcontainer.json"
  local path="${tmpdir}"
  # Hex-encode the temp dir for VSCode's dev-container+<hex> remote URL scheme.
  local hash="$(echo -n "${path}" | xxd -pu - | tr -d '[:space:]')"
  local url="vscode://vscode-remote/dev-container+${hash}/home/coder/nvbench"
  # Pick whichever URL-opener this platform provides.
  local launch=""
  if type open >/dev/null 2>&1; then
    launch="open"
  elif type xdg-open >/dev/null 2>&1; then
    launch="xdg-open"
  fi
  if [ -n "${launch}" ]; then
    echo "Launching VSCode Dev Container URL: ${url}"
    code --new-window "${tmpdir}"
    exec "${launch}" "${url}" >/dev/null 2>&1
  fi
}
# Entry point: parse the CLI flags, resolve which devcontainer.json to use,
# then hand off to the Docker or the VSCode launcher.
main() {
  local -a unparsed;
  parse_options "$@" unparsed;
  set -- "${unparsed[@]}";
  # Default to the top-level devcontainer; switch to a cuda<X>-<compiler>
  # subdirectory when either selector was given on the command line.
  path=".devcontainer"
  if [[ -n ${cuda_version:-} || -n ${host_compiler:-} ]]; then
    path=".devcontainer/cuda${cuda_version}-${host_compiler}"
    if [[ ! -f "${path}/devcontainer.json" ]]; then
      echo "Unknown CUDA [${cuda_version}] compiler [${host_compiler}] combination"
      echo "Requested devcontainer ${path}/devcontainer.json does not exist"
      exit 1
    fi
  fi
  if ${docker_mode:-'false'}; then
    launch_docker "$@"
  else
    launch_vscode
  fi
}
main "$@"

View File

@@ -0,0 +1,144 @@
#!/bin/bash
# This script parses the CI matrix.yaml file and generates a devcontainer.json file for each unique combination of
# CUDA version, compiler name/version, and Ubuntu version. The devcontainer.json files are written to the
# .devcontainer directory to a subdirectory named after the CUDA version and compiler name/version.
# GitHub docs on using multiple devcontainer.json files:
# https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/introduction-to-dev-containers#devcontainerjson
set -euo pipefail
# Ensure the script is being executed in its containing directory
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
# Print CLI usage for this generator script and exit non-zero
# (also reached via -h/--help).
function usage {
  echo "Usage: $0 [--clean] [-h/--help] [-v/--verbose]"
  echo " --clean Remove stale devcontainer subdirectories"
  echo " -h, --help Display this help message"
  echo " -v, --verbose Enable verbose mode (set -x)"
  exit 1
}
# Function to update the devcontainer.json file with the provided parameters
update_devcontainer() {
  # Rewrite a devcontainer.json template for one image/compiler combination.
  # Args: $1 input file, $2 output file, $3 name, $4 cuda_version,
  #       $5 compiler_name, $6 compiler_exe, $7 compiler_version, $8 os,
  #       $9 devcontainer image version.
  local input_file="$1"
  local output_file="$2"
  local name="$3"
  local cuda_version="$4"
  local compiler_name="$5"
  local compiler_exe="$6"
  local compiler_version="$7"
  local os="$8"
  local devcontainer_version="$9"
  local IMAGE_ROOT="rapidsai/devcontainers:${devcontainer_version}-cpp-"
  local image="${IMAGE_ROOT}${compiler_name}${compiler_version}-cuda${cuda_version}-${os}"
  # NOTE(review): $compiler_exe is bound as a jq variable but never used in
  # the filter below — confirm whether a containerEnv entry is missing.
  jq --arg image "$image" --arg name "$name" \
    --arg cuda_version "$cuda_version" --arg compiler_name "$compiler_name" \
    --arg compiler_exe "$compiler_exe" --arg compiler_version "$compiler_version" --arg os "$os" \
    '.image = $image | .name = $name | .containerEnv.DEVCONTAINER_NAME = $name |
    .containerEnv.CCCL_BUILD_INFIX = $name |
    .containerEnv.CCCL_CUDA_VERSION = $cuda_version | .containerEnv.CCCL_HOST_COMPILER = $compiler_name |
    .containerEnv.CCCL_HOST_COMPILER_VERSION = $compiler_version '\
    "$input_file" > "$output_file"
}
# Compose the canonical devcontainer directory name, e.g. "cuda12.8-gcc13".
# Args: $1 CUDA version, $2 compiler name, $3 compiler version.
make_name() {
  local cuda="$1"
  local compiler="$2"
  local version="$3"
  printf 'cuda%s-%s%s\n' "${cuda}" "${compiler}" "${version}"
}
CLEAN=false
VERBOSE=false
# Parse command-line flags (any unknown flag prints usage and exits).
while [[ $# -gt 0 ]]; do
  case "$1" in
    --clean)
      CLEAN=true
      ;;
    -h|--help)
      usage
      ;;
    -v|--verbose)
      VERBOSE=true
      ;;
    *)
      usage
      ;;
  esac
  shift
done
# Relative to this script's directory (the script cd's there at startup).
MATRIX_FILE="../ci/matrix.yaml"
# Enable verbose mode if requested
if [ "$VERBOSE" = true ]; then
  set -x
  cat ${MATRIX_FILE}
fi
# Read matrix.yaml and convert it to json
matrix_json=$(yq -o json ${MATRIX_FILE})
# Exclude Windows environments
readonly matrix_json=$(echo "$matrix_json" | jq 'del(.pull_request.nvcc[] | select(.os | contains("windows")))')
# Get the devcontainer image version and define image tag root
readonly DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version')
# Get unique combinations of cuda version, compiler name/version, and Ubuntu version
readonly combinations=$(echo "$matrix_json" | jq -c '[.pull_request.nvcc[] | {cuda: .cuda, compiler_name: .compiler.name, compiler_exe: .compiler.exe, compiler_version: .compiler.version, os: .os}] | unique | .[]')
# Update the base devcontainer with the default values
# The root devcontainer.json file is used as the default container as well as a template for all
# other devcontainer.json files by replacing the `image:` field with the appropriate image name
readonly base_devcontainer_file="./devcontainer.json"
# Default environment: the gcc entry with the newest CUDA / compiler version.
readonly NEWEST_GCC_CUDA_ENTRY=$(echo "$combinations" | jq -rs '[.[] | select(.compiler_name == "gcc")] | sort_by((.cuda | tonumber), (.compiler_version | tonumber)) | .[-1]')
readonly DEFAULT_CUDA=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.cuda')
readonly DEFAULT_COMPILER_NAME=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_name')
readonly DEFAULT_COMPILER_EXE=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_exe')
readonly DEFAULT_COMPILER_VERSION=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_version')
readonly DEFAULT_OS=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.os')
readonly DEFAULT_NAME=$(make_name "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_VERSION")
# Rewrite the root devcontainer.json in place, via a temp file.
update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$DEFAULT_NAME" "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEFAULT_OS" "$DEVCONTAINER_VERSION"
mv "./temp_devcontainer.json" ${base_devcontainer_file}
# Create an array to keep track of valid subdirectory names
valid_subdirs=()
# The img folder should not be removed:
valid_subdirs+=("img")
# For each unique combination
for combination in $combinations; do
  cuda_version=$(echo "$combination" | jq -r '.cuda')
  compiler_name=$(echo "$combination" | jq -r '.compiler_name')
  compiler_exe=$(echo "$combination" | jq -r '.compiler_exe')
  compiler_version=$(echo "$combination" | jq -r '.compiler_version')
  os=$(echo "$combination" | jq -r '.os')
  name=$(make_name "$cuda_version" "$compiler_name" "$compiler_version")
  # Generate the per-combination devcontainer from the (updated) root template.
  mkdir -p "$name"
  new_devcontainer_file="$name/devcontainer.json"
  update_devcontainer "$base_devcontainer_file" "$new_devcontainer_file" "$name" "$cuda_version" "$compiler_name" "$compiler_exe" "$compiler_version" "$os" "$DEVCONTAINER_VERSION"
  echo "Created $new_devcontainer_file"
  # Add the subdirectory name to the valid_subdirs array
  valid_subdirs+=("$name")
done
# Clean up stale subdirectories and devcontainer.json files
if [ "$CLEAN" = true ]; then
  for subdir in ./*; do
    # Remove any directory whose name is not in valid_subdirs.
    if [ -d "$subdir" ] && [[ ! " ${valid_subdirs[@]} " =~ " ${subdir#./} " ]]; then
      echo "Removing stale subdirectory: $subdir"
      rm -r "$subdir"
    fi
  done
fi

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091
set -e;
devcontainer-utils-post-create-command;
devcontainer-utils-init-git;
devcontainer-utils-post-attach-command;
cd /home/coder/nvbench/
if test $# -gt 0; then
exec "$@";
else
exec /bin/bash -li;
fi

View File

@@ -0,0 +1,89 @@
#!/bin/bash
# Print a short description of this script to stdout.
function usage {
  cat <<EOF
Usage: $0

This script is intended to be run within one of CCCL's Dev Containers.
It verifies that the expected environment variables and binary versions match what is expected.
EOF
}
# check_envvars NAME...
# Print NAME=value for each required environment variable; emit a GitHub
# Actions error annotation and abort if any of them is unset or empty.
check_envvars() {
  local name value
  for name in "$@"; do
    value="${!name:-}"
    if [[ -n "$value" ]]; then
      echo "$name=$value"
    else
      echo "::error:: ${name} variable is not set."
      exit 1
    fi
  done
}
# check_host_compiler_version
# Verify that the compiler named by $CXX matches the family/version recorded in
# CCCL_HOST_COMPILER / CCCL_HOST_COMPILER_VERSION. Prints the detected compiler
# on success; emits a GitHub Actions error annotation and exits 1 on mismatch
# or when the version cannot be parsed.
check_host_compiler_version() {
local version_output=$($CXX --version)
if [[ "$CXX" == "g++" ]]; then
# g++ prints e.g. "g++ (Ubuntu ...) 12.3.0"; take the 4th whitespace-separated
# token of the first line and keep only the major component.
local actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 4 | cut -d '.' -f 1)
local expected_compiler="gcc"
elif [[ "$CXX" == "clang++" ]]; then
# Extract the major version from "clang version N...".
if [[ $version_output =~ clang\ version\ ([0-9]+) ]]; then
actual_version=${BASH_REMATCH[1]}
else
echo "::error:: Unable to determine clang version."
exit 1
fi
expected_compiler="llvm"
elif [[ "$CXX" == "icpc" ]]; then
# icpc prints e.g. "icpc (ICC) 2021.10.0 ..."; take the 3rd token.
local actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 3 )
# The icpc compiler version of oneAPI release 2023.2.0 is 2021.10.0
if [[ "$actual_version" == "2021.10.0" ]]; then
actual_version="2023.2.0"
fi
expected_compiler="oneapi"
else
echo "::error:: Unexpected CXX value ($CXX)."
exit 1
fi
# Both the compiler family and the parsed version must match expectations.
if [[ "$expected_compiler" != "${CCCL_HOST_COMPILER}" || "$actual_version" != "$CCCL_HOST_COMPILER_VERSION" ]]; then
echo "::error:: CXX ($CXX) version ($actual_version) does not match the expected compiler (${CCCL_HOST_COMPILER}) and version (${CCCL_HOST_COMPILER_VERSION})."
exit 1
else
echo "Detected host compiler: $CXX version $actual_version"
fi
}
# check_cuda_version
# Verify that the CUDA toolkit version reported by `nvcc --version` matches
# CCCL_CUDA_VERSION. Prints the detected version on success; emits a GitHub
# Actions error annotation and exits 1 on mismatch or parse failure.
check_cuda_version() {
local cuda_version_output=$(nvcc --version)
# nvcc prints e.g. "Cuda compilation tools, release 12.4, ..."; capture MAJOR.MINOR.
if [[ $cuda_version_output =~ release\ ([0-9]+\.[0-9]+) ]]; then
local actual_cuda_version=${BASH_REMATCH[1]}
else
echo "::error:: Unable to determine CUDA version from nvcc."
exit 1
fi
if [[ "$actual_cuda_version" != "$CCCL_CUDA_VERSION" ]]; then
echo "::error:: CUDA version ($actual_cuda_version) does not match the expected CUDA version ($CCCL_CUDA_VERSION)."
exit 1
else
echo "Detected CUDA version: $actual_cuda_version"
fi
}
# main [-h|--help]
# Entry point: verify the devcontainer environment by checking the required
# environment variables, the host compiler, and the CUDA toolkit version.
main() {
  case "${1:-}" in
    -h | --help)
      usage
      exit 0
      ;;
  esac
  set -euo pipefail
  check_envvars DEVCONTAINER_NAME CXX CUDAHOSTCXX CCCL_BUILD_INFIX CCCL_HOST_COMPILER CCCL_CUDA_VERSION CCCL_HOST_COMPILER_VERSION
  check_host_compiler_version
  check_cuda_version
  echo "Dev Container successfully verified!"
}
main "$@"

17
.git-blame-ignore-revs Normal file
View File

@@ -0,0 +1,17 @@
# Exclude these commits from git-blame and similar tools.
#
# To use this file, run the following command from the repo root:
#
# ```
# $ git config blame.ignoreRevsFile .git-blame-ignore-revs
# ```
#
# Include a brief comment with each commit added, for example:
#
# ```
# 8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
# ```
#
# Only add commits that are pure formatting changes (e.g. clang-format version changes, etc).
8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
3440855dbd405db614861885ad1577fffd882867 # Initial addition of pre-commit.ci formatting.

View File

@@ -0,0 +1,25 @@
name: Compute Matrix
description: "Compute the matrix for a given matrix type from the specified matrix file"
inputs:
matrix_query:
description: "The jq query used to specify the desired matrix. e.g., .pull_request.nvcc"
required: true
matrix_file:
description: 'The file containing the matrix'
required: true
outputs:
matrix:
description: 'The requested matrix'
value: ${{ steps.compute-matrix.outputs.MATRIX }}
runs:
using: "composite"
steps:
- name: Compute matrix
id: compute-matrix
run: |
MATRIX=$(./.github/actions/compute-matrix/compute-matrix.sh ${{inputs.matrix_file}} ${{inputs.matrix_query}} )
echo "matrix=$MATRIX" | tee -a $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}

View File

@@ -0,0 +1,44 @@
#!/bin/bash
set -euo pipefail
# write_output KEY VALUE
# Echo "KEY=VALUE" to stdout and append the same line to $GITHUB_OUTPUT so it
# becomes a step output (falls back to /dev/null for local runs).
write_output() {
  printf '%s=%s\n' "$1" "$2" | tee --append "${GITHUB_OUTPUT:-/dev/null}"
}
# extract_matrix FILE TYPE
# Read the CI matrix FILE (YAML), select its ".TYPE" section, and publish
# step outputs via write_output:
#   DEVCONTAINER_VERSION      - top-level devcontainer_version field
#   PER_CUDA_COMPILER_MATRIX  - nvcc entries grouped by "<cuda>-<compiler>"
#   PER_CUDA_COMPILER_KEYS    - JSON array of the group keys
extract_matrix() {
  local matrix_file="$1"
  local matrix_type="$2"
  local selected
  selected=$(yq -o=json "$matrix_file" | jq -cr ".$matrix_type")
  write_output "DEVCONTAINER_VERSION" "$(yq -o json "$matrix_file" | jq -cr '.devcontainer_version')"
  local nvcc_matrix
  nvcc_matrix="$(echo "$selected" | jq -cr '.nvcc')"
  local grouped
  grouped="$(echo "$nvcc_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
  write_output "PER_CUDA_COMPILER_MATRIX" "$grouped"
  write_output "PER_CUDA_COMPILER_KEYS" "$(echo "$grouped" | jq -r 'keys | @json')"
}
# main [-v] MATRIX_FILE MATRIX_TYPE
# Entry point: optionally enable shell tracing, validate the arguments, then
# emit the requested matrix as step outputs. Only 'pull_request' is supported.
main() {
  # Use ${1:-} so that invoking the script with no arguments prints usage
  # instead of aborting with an "unbound variable" error: the script runs
  # under `set -euo pipefail`, which makes a bare "$1" fatal when unset.
  if [ "${1:-}" == "-v" ]; then
    set -x
    shift
  fi
  if [ $# -ne 2 ] || [ "$2" != "pull_request" ]; then
    echo "Usage: $0 [-v] MATRIX_FILE MATRIX_TYPE"
    echo " -v : Enable verbose output"
    echo " MATRIX_FILE : The path to the matrix file."
    echo " MATRIX_TYPE : The desired matrix. Supported values: 'pull_request'"
    exit 1
  fi
  # Log the inputs to stderr so stdout stays clean for the matrix outputs.
  echo "Input matrix file:" >&2
  cat "$1" >&2
  echo "Matrix Type: $2" >&2
  extract_matrix "$1" "$2"
}
main "$@"

View File

@@ -0,0 +1,13 @@
name: Set up AWS credentials and environment variables for sccache
description: "Set up AWS credentials and environment variables for sccache"
runs:
using: "composite"
steps:
- name: Set environment variables
run: |
echo "SCCACHE_BUCKET=rapids-sccache-devs" >> $GITHUB_ENV
echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV
echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV
echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV
echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV
shell: bash

4
.github/copy-pr-bot.yaml vendored Normal file
View File

@@ -0,0 +1,4 @@
# Configuration file for `copy-pr-bot` GitHub App
# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
enabled: true

View File

@@ -0,0 +1,14 @@
{
"problemMatcher": [
{
"owner": "nvcc",
"pattern": [
{
"regexp": "^\\/home\\/coder\\/(.+):(\\d+):(\\d+): (\\w+): \"(.+)\"$",
"severity": 4,
"message": 5
}
]
}
]
}

View File

@@ -0,0 +1,36 @@
name: build and test
defaults:
run:
shell: bash -exo pipefail {0}
on:
workflow_call:
inputs:
cuda: {type: string, required: true}
host: {type: string, required: true}
cpu: {type: string, required: true}
test_name: {type: string, required: false}
build_script: {type: string, required: false}
test_script: {type: string, required: false}
container_image: {type: string, required: false}
run_tests: {type: boolean, required: false, default: true}
permissions:
contents: read
jobs:
build-and-test:
name: Build/Test ${{inputs.test_name}}
permissions:
id-token: write
contents: read
uses: ./.github/workflows/run-as-coder.yml
with:
cuda: ${{ inputs.cuda }}
host: ${{ inputs.host }}
name: Build/Test ${{inputs.test_name}}
runner: linux-${{inputs.cpu}}-gpu-l4-latest-1
image: ${{ inputs.container_image }}
command: |
${{ inputs.test_script }}

View File

@@ -0,0 +1,34 @@
name: Dispatch build and test
on:
workflow_call:
inputs:
project_name: {type: string, required: true}
per_cuda_compiler_matrix: {type: string, required: true}
devcontainer_version: {type: string, required: true}
permissions:
contents: read
jobs:
# Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration
# ensures that the build/test steps can overlap across different configurations. For example,
# the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11.
build_and_test_linux:
name: build and test linux
permissions:
id-token: write
contents: read
uses: ./.github/workflows/build-and-test-linux.yml
strategy:
fail-fast: false
matrix:
include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }}
with:
cuda: ${{ matrix.cuda }}
host: ${{matrix.compiler.name}}${{matrix.compiler.version}}
cpu: ${{ matrix.cpu }}
test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}} ${{matrix.extra_build_args}}
build_script: "./ci/build_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} ${{matrix.extra_build_args}}"
test_script: "./ci/test_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} ${{matrix.extra_build_args}}"
container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}

107
.github/workflows/pr.yml vendored Normal file
View File

@@ -0,0 +1,107 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is the main workflow that runs on every PR and push to main
name: pr
defaults:
run:
shell: bash -euo pipefail {0}
on:
push:
branches:
- "pull-request/[0-9]+"
# Only runs one instance of this workflow at a time for a given PR and cancels any in-progress runs when a new one starts.
concurrency:
group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
cancel-in-progress: true
permissions:
contents: read
pull-requests: read
jobs:
compute-matrix:
name: Compute matrix
runs-on: ubuntu-latest
outputs:
DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}}
PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}}
PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}}
base_sha: ${{ steps.export-pr-info.outputs.base_sha }}
pr_number: ${{ steps.export-pr-info.outputs.pr_number }}
steps:
- name: Checkout repo
uses: actions/checkout@v4
- name: Lookup PR info
id: get-pr-info
uses: nv-gha-runners/get-pr-info@main
- name: Export PR info
id: export-pr-info
run: |
echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}"
echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}"
- name: Compute matrix outputs
id: set-outputs
run: |
.github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request
nvbench:
name: NVBench CUDA${{ matrix.cuda_host_combination }}
permissions:
id-token: write
contents: read
needs: compute-matrix
uses: ./.github/workflows/dispatch-build-and-test.yml
strategy:
fail-fast: false
matrix:
cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
with:
project_name: "nvbench"
per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
verify-devcontainers:
name: Verify Dev Containers
if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }}
needs: compute-matrix
permissions:
id-token: write
contents: read
uses: ./.github/workflows/verify-devcontainers.yml
with:
base_sha: ${{ needs.compute-matrix.outputs.base_sha }}
# This job is the final job that runs after all other jobs and is used for branch protection status checks.
# See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks
# https://github.com/orgs/community/discussions/26822#discussioncomment-5122101
ci:
runs-on: ubuntu-latest
name: CI
if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
needs:
- nvbench
- verify-devcontainers
steps:
- name: Check status of all precursor jobs
if: >-
${{
contains(needs.*.result, 'failure')
|| contains(needs.*.result, 'cancelled')
}}
run: exit 1

156
.github/workflows/run-as-coder.yml vendored Normal file
View File

@@ -0,0 +1,156 @@
name: Run as coder user
defaults:
run:
shell: bash -exo pipefail {0}
on:
workflow_call:
inputs:
cuda: {type: string, required: true}
host: {type: string, required: true}
name: {type: string, required: true}
image: {type: string, required: true}
runner: {type: string, required: true}
command: {type: string, required: true}
env: { type: string, required: false, default: "" }
permissions:
contents: read
jobs:
run-as-coder:
name: ${{inputs.name}}
permissions:
id-token: write
contents: read
runs-on: ${{inputs.runner}}
container:
# This job now uses a docker-outside-of-docker (DOOD) strategy.
#
# The GitHub Actions runner application mounts the host's docker socket `/var/run/docker.sock` into the
# container. By using a container with the `docker` CLI, this container can launch docker containers
# using the host's docker daemon.
#
# This allows us to run actions that require node v20 in the `cruizba/ubuntu-dind:jammy-26.1.3` container, and
# then launch our Ubuntu18.04-based GCC 6/7 containers to build and test NVBench.
#
# The main inconvenience to this approach is that any container mounts have to match the paths of the runner host,
# not the paths as seen in the intermediate (`cruizba/ubuntu-dind`) container.
#
# Note: I am using `cruizba/ubuntu-dind:jammy-26.1.3` instead of `docker:latest`, because GitHub doesn't support
# JS actions in alpine aarch64 containers, instead failing actions with this error:
# ```
# Error: JavaScript Actions in Alpine containers are only supported on x64 Linux runners. Detected Linux Arm64
# ```
image: cruizba/ubuntu-dind:jammy-26.1.3
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
steps:
- name: Checkout repo
uses: actions/checkout@v4
with:
path: nvbench
persist-credentials: false
- name: Add NVCC problem matcher
run: |
echo "::add-matcher::nvbench/.github/problem-matchers/problem-matcher.json"
- name: Configure credentials and environment variables for sccache
uses: ./nvbench/.github/actions/configure_cccl_sccache
- name: Run command
env:
CI: true
RUNNER: "${{inputs.runner}}"
COMMAND: "${{inputs.command}}"
AWS_ACCESS_KEY_ID: "${{env.AWS_ACCESS_KEY_ID}}"
AWS_SESSION_TOKEN: "${{env.AWS_SESSION_TOKEN}}"
AWS_SECRET_ACCESS_KEY: "${{env.AWS_SECRET_ACCESS_KEY}}"
run: |
echo "[host] github.workspace: ${{github.workspace}}"
echo "[container] GITHUB_WORKSPACE: ${GITHUB_WORKSPACE:-}"
echo "[container] PWD: $(pwd)"
# Necessary because we're doing docker-outside-of-docker:
# Make a symlink in the container that matches the host's ${{github.workspace}}, so that way `$(pwd)`
# in `.devcontainer/launch.sh` constructs volume paths relative to the hosts's ${{github.workspace}}.
mkdir -p "$(dirname "${{github.workspace}}")"
ln -s "$(pwd)" "${{github.workspace}}"
cd "${{github.workspace}}"
cat <<"EOF" > ci.sh
#! /usr/bin/env bash
set -eo pipefail
echo -e "\e[1;34mRunning as '$(whoami)' user in $(pwd):\e[0m"
echo -e "\e[1;34m${{inputs.command}}\e[0m"
eval "${{inputs.command}}" || exit_code=$?
if [ ! -z "$exit_code" ]; then
echo -e "::group::️❗ \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m"
echo "::error:: To replicate this failure locally, follow the steps below:"
echo "1. Clone the repository, and navigate to the correct branch and commit:"
echo " git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA"
echo ""
echo "2. Run the failed command inside the same Docker container used by the CI:"
echo " docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}"
echo ""
echo "For additional information, see:"
echo " - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md"
echo " - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md"
exit $exit_code
fi
EOF
chmod +x ci.sh
mkdir "$RUNNER_TEMP/.aws";
cat <<EOF > "$RUNNER_TEMP/.aws/config"
[default]
bucket=rapids-sccache-devs
region=us-east-2
EOF
cat <<EOF > "$RUNNER_TEMP/.aws/credentials"
[default]
aws_access_key_id=$AWS_ACCESS_KEY_ID
aws_session_token=$AWS_SESSION_TOKEN
aws_secret_access_key=$AWS_SECRET_ACCESS_KEY
EOF
chmod 0600 "$RUNNER_TEMP/.aws/credentials"
chmod 0664 "$RUNNER_TEMP/.aws/config"
declare -a gpu_request=()
# Explicitly pass which GPU to use if on a GPU runner
if [[ "${RUNNER}" = *"-gpu-"* ]]; then
gpu_request+=(--gpus "device=${NVIDIA_VISIBLE_DEVICES}")
fi
host_path() {
sed "s@/__w@$(dirname "$(dirname "${{github.workspace}}")")@" <<< "$1"
}
# Launch this container using the host's docker daemon
${{github.event.repository.name}}/.devcontainer/launch.sh \
--docker \
--cuda ${{inputs.cuda}} \
--host ${{inputs.host}} \
"${gpu_request[@]}" \
--env "CI=$CI" \
--env "AWS_ROLE_ARN=" \
--env "COMMAND=$COMMAND" \
--env "GITHUB_ENV=$GITHUB_ENV" \
--env "GITHUB_SHA=$GITHUB_SHA" \
--env "GITHUB_PATH=$GITHUB_PATH" \
--env "GITHUB_OUTPUT=$GITHUB_OUTPUT" \
--env "GITHUB_ACTIONS=$GITHUB_ACTIONS" \
--env "GITHUB_REF_NAME=$GITHUB_REF_NAME" \
--env "GITHUB_WORKSPACE=$GITHUB_WORKSPACE" \
--env "GITHUB_REPOSITORY=$GITHUB_REPOSITORY" \
--env "GITHUB_STEP_SUMMARY=$GITHUB_STEP_SUMMARY" \
--volume "${{github.workspace}}/ci.sh:/ci.sh" \
--volume "$(host_path "$RUNNER_TEMP")/.aws:/root/.aws" \
--volume "$(dirname "$(dirname "${{github.workspace}}")"):/__w" \
-- /ci.sh

View File

@@ -0,0 +1,150 @@
name: Verify devcontainers
on:
workflow_call:
inputs:
base_sha:
type: string
description: 'For PRs, set the base SHA to conditionally run this workflow only when relevant files are modified.'
required: false
defaults:
run:
shell: bash -euo pipefail {0}
permissions:
contents: read
jobs:
get-devcontainer-list:
name: Verify devcontainer files are up-to-date
outputs:
skip: ${{ steps.inspect-changes.outputs.skip }}
devcontainers: ${{ steps.get-list.outputs.devcontainers }}
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
persist-credentials: false
- name: Setup jq and yq
run: |
sudo apt-get update
sudo apt-get install jq -y
sudo wget -O /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.34.2/yq_linux_amd64
sudo chmod +x /usr/local/bin/yq
- name: Run the script to generate devcontainer files
run: |
./.devcontainer/make_devcontainers.sh --verbose --clean
- name: Check for changes
run: |
if [[ $(git diff --stat) != '' || $(git status --porcelain | grep '^??') != '' ]]; then
git diff --minimal
git status --porcelain
echo "::error:: Dev Container files are out of date or there are untracked files. Run the .devcontainer/make_devcontainers.sh script and commit the changes."
exit 1
else
echo "::note::Dev Container files are up-to-date."
fi
- name: Inspect changes
if: ${{ inputs.base_sha != '' }}
id: inspect-changes
env:
BASE_SHA: ${{ inputs.base_sha }}
run: |
echo "Fetch history and determine merge base..."
git fetch origin --unshallow -q
git fetch origin $BASE_SHA -q
merge_base_sha=$(git merge-base $GITHUB_SHA $BASE_SHA)
echo "Head SHA: $GITHUB_SHA"
echo "PR Base SHA: $BASE_SHA"
echo "Merge Base SHA: $merge_base_sha"
echo "Checking for changes to devcontainer/matrix files..."
all_dirty_files=$(git diff --name-only "${merge_base_sha}" "${GITHUB_SHA}")
echo "::group::All dirty files"
echo "${all_dirty_files}"
echo "::endgroup::"
file_regex="^(.devcontainer|ci/matrix.yaml|.github/actions/workflow-build/build-workflow.py)"
echo "Regex: ${file_regex}"
relevant_dirty_files=$(echo "${all_dirty_files}" | grep -E "${file_regex}" || true)
echo "::group::Relevant dirty files"
echo "${relevant_dirty_files}"
echo "::endgroup::"
if [[ -z "${relevant_dirty_files}" ]]; then
echo "No relevant changes detected. Skipping devcontainer testing."
echo "skip=true" >> $GITHUB_OUTPUT
else
echo "Detected relevant changes. Continuing."
echo "skip=false" >> $GITHUB_OUTPUT
fi
- name: Get list of devcontainer.json paths and names
if: ${{ steps.inspect-changes.outputs.skip != 'true' }}
id: get-list
run: |
devcontainers=$(find .devcontainer/ -name 'devcontainer.json' | while read -r devcontainer; do
jq --arg path "$devcontainer" '{path: $path, name: .name}' "$devcontainer"
done | jq -s -c .)
echo "devcontainers=${devcontainers}" | tee --append "${GITHUB_OUTPUT}"
verify-devcontainers:
name: ${{matrix.devcontainer.name}}
needs: get-devcontainer-list
if: ${{ needs.get-devcontainer-list.outputs.skip != 'true' }}
runs-on: linux-amd64-cpu4
strategy:
fail-fast: false
matrix:
devcontainer: ${{fromJson(needs.get-devcontainer-list.outputs.devcontainers)}}
permissions:
id-token: write
contents: read
steps:
- name: Check out the code
uses: actions/checkout@v4
with:
persist-credentials: false
- name: Install dependencies
run: |
# Add PPA for nodejs, devcontainer CLI requires a newer version:
curl -fsSL https://deb.nodesource.com/setup_20.x -o /tmp/nodesource_setup.sh
sudo bash /tmp/nodesource_setup.sh
sudo apt-get update
sudo apt-get install -y nodejs
sudo npm install -g @devcontainers/cli
# We don't really need sccache configured, but we need the AWS credentials envvars to be set
# in order to avoid the devcontainer hanging waiting for GitHub authentication
- name: Get AWS credentials for sccache bucket
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
aws-region: us-east-2
role-duration-seconds: 43200 # 12 hours
- name: Set environment variables
run: |
echo "SCCACHE_BUCKET=rapids-sccache-devs" >> $GITHUB_ENV
echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV
echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV
echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV
echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV
- name: Run in devcontainer
uses: devcontainers/ci@v0.3
with:
push: never
configFile: ${{ matrix.devcontainer.path }}
env: |
SCCACHE_REGION=${{ env.SCCACHE_REGION }}
AWS_ACCESS_KEY_ID=${{ env.AWS_ACCESS_KEY_ID }}
AWS_SESSION_TOKEN=${{ env.AWS_SESSION_TOKEN }}
AWS_SECRET_ACCESS_KEY=${{ env.AWS_SECRET_ACCESS_KEY }}
runCmd: |
.devcontainer/verify_devcontainer.sh

6
.gitignore vendored
View File

@@ -1,4 +1,10 @@
build*/
.aws
.vscode
.cache
.config
.idea
cmake-build-*
*~
compile_commands.json
CMakeUserPresets.json

70
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,70 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
ci:
autofix_commit_msg: |
[pre-commit.ci] auto code formatting
autofix_prs: false
autoupdate_branch: ''
autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
autoupdate_schedule: quarterly
skip: []
submodules: false
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: end-of-file-fixer
- id: mixed-line-ending
- id: trailing-whitespace
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.6
hooks:
- id: clang-format
types_or: [file]
files: |
(?x)^(
^.*\.c$|
^.*\.cpp$|
^.*\.cu$|
^.*\.cuh$|
^.*\.cxx$|
^.*\.h$|
^.*\.hpp$|
^.*\.inl$|
^.*\.mm$
)
args: ["-fallback-style=none", "-style=file", "-i"]
# TODO/REMINDER: add the Ruff vscode extension to the devcontainers
# Ruff, the Python auto-correcting linter/formatter written in Rust
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.6
hooks:
- id: ruff # linter
- id: ruff-format # formatter
# TOML lint & format
- repo: https://github.com/ComPWA/taplo-pre-commit
rev: v0.9.3
hooks:
# See https://github.com/NVIDIA/cccl/issues/3426
# - id: taplo-lint
# exclude: "^docs/"
- id: taplo-format
exclude: "^docs/"
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
hooks:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
exclude: |
(?x)^(
build|
CITATION.md
)
default_language_version:
python: python3

View File

@@ -1,6 +1,5 @@
# 3.20.1 required for rapids-cmake
# 3.21.0 required for NVBench_ADD_DEPENDENT_DLLS_TO_* (MSVC only)
cmake_minimum_required(VERSION 3.20.1)
# 3.30.4 required for rapids-cmake
cmake_minimum_required(VERSION 3.30.4)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
@@ -22,6 +21,11 @@ project(NVBench
nvbench_init_rapids_cmake()
# Define NVBench_DETECTED_${LANG}_STANDARDS
include(cmake/DetectSupportedStandards.cmake)
detect_supported_standards(NVBench CXX 17 20)
detect_supported_standards(NVBench CUDA 17 20)
# See NVIDIA/NVBench#52
find_package(CUDAToolkit REQUIRED)
set(cupti_default ON)
@@ -29,29 +33,37 @@ if (${CUDAToolkit_VERSION} VERSION_LESS 11.3)
set(cupti_default OFF)
endif()
option(BUILD_SHARED_LIBS "Build NVBench as a shared library" ON)
option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON)
option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ${cupti_default})
option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
option(NVBench_ENABLE_HEADER_TESTING "Build NVBench header testing suite." OFF)
option(NVBench_ENABLE_DEVICE_TESTING
"Include tests that require a GPU (with locked clocks)."
OFF
)
option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)
option(NVBench_ENABLE_INSTALL_RULES "Install NVBench." ${NVBench_TOPLEVEL_PROJECT})
include(cmake/NVBenchUtilities.cmake) # Must be first
include(cmake/NVBenchClangdCompileInfo.cmake) # Must be before any targets are created
include(cmake/NVBenchConfigTarget.cmake)
include(cmake/NVBenchDependentDlls.cmake)
include(cmake/NVBenchExports.cmake)
include(cmake/NVBenchWriteConfigHeader.cmake)
include(cmake/NVBenchDependencies.cmake)
include(cmake/NVBenchInstallRules.cmake)
include(cmake/NVBenchUtilities.cmake)
message(STATUS "NVBench CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
add_subdirectory(nvbench)
if (NVBench_ENABLE_EXAMPLES OR NVBench_ENABLE_TESTING)
if (NVBench_ENABLE_EXAMPLES OR
NVBench_ENABLE_TESTING OR
NVBench_ENABLE_HEADER_TESTING)
include(CTest)
enable_testing()
endif()
@@ -65,4 +77,8 @@ if (NVBench_ENABLE_TESTING)
add_subdirectory(testing)
endif()
if (NVBench_ENABLE_HEADER_TESTING)
include(cmake/NVBenchHeaderTesting.cmake)
endif()
nvbench_generate_exports()

74
CMakePresets.json Normal file
View File

@@ -0,0 +1,74 @@
{
"version": 3,
"cmakeMinimumRequired": {
"major": 3,
"minor": 23,
"patch": 1
},
"configurePresets": [
{
"name": "base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build/$env{CCCL_BUILD_INFIX}/${presetName}",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CUDA_ARCHITECTURES": "all-major",
"NVBench_ENABLE_CUPTI": true,
"NVBench_ENABLE_DEVICE_TESTING": false,
"NVBench_ENABLE_EXAMPLES": true,
"NVBench_ENABLE_HEADER_TESTING": true,
"NVBench_ENABLE_INSTALL_RULES": true,
"NVBench_ENABLE_NVML": true,
"NVBench_ENABLE_TESTING": true,
"NVBench_ENABLE_WERROR": true
}
},
{
"name": "nvbench-dev",
"displayName": "Developer Build",
"inherits": "base",
"cacheVariables": {
"NVBench_ENABLE_DEVICE_TESTING": true
}
},
{
"name": "nvbench-ci",
"displayName": "NVBench CI",
"inherits": "base"
}
],
"buildPresets": [
{
"name": "nvbench-dev",
"configurePreset": "nvbench-dev"
},
{
"name": "nvbench-ci",
"configurePreset": "nvbench-ci"
}
],
"testPresets": [
{
"name": "base",
"hidden": true,
"output": {
"outputOnFailure": true
},
"execution": {
"noTestsAction": "error",
"stopOnFailure": false
}
},
{
"name": "nvbench-dev",
"configurePreset": "nvbench-dev",
"inherits": "base"
},
{
"name": "nvbench-ci",
"configurePreset": "nvbench-ci",
"inherits": "base"
}
]
}

View File

@@ -25,6 +25,17 @@ features:
* Batch Measurements:
* Executes the benchmark multiple times back-to-back and records total time.
* Reports the average execution time (total time / number of executions).
* [CPU-only Measurements](docs/benchmarks.md#cpu-only-benchmarks)
* Measures the host-side execution time of a non-GPU benchmark.
* Not suitable for microbenchmarking.
# Supported Compilers and Tools
- CMake >= 3.30.4
- CUDA Toolkit + nvcc: 12.0 and above
- g++: 7 -> 14
- clang++: 14 -> 19
- Headers are tested with C++17 -> C++20.
# Getting Started
@@ -34,7 +45,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++:
```cpp
void my_benchmark(nvbench::state& state) {
state.exec([](nvbench::launch& launch) {
state.exec([](nvbench::launch& launch) {
my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
});
}
@@ -57,10 +68,12 @@ This repository provides a number of [examples](examples/) that demonstrate
various NVBench features and usecases:
- [Runtime and compile-time parameter sweeps](examples/axes.cu)
- [CPU-only benchmarking](examples/cpu_only.cu)
- [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
- [Skipping benchmark configurations](examples/skip.cu)
- [Benchmarking on a specific stream](examples/stream.cu)
- [Adding / hiding columns (summaries) in markdown output](examples/summaries.cu)
- [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
- [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)
@@ -70,9 +83,9 @@ To build the examples:
```
mkdir -p build
cd build
cmake -DNVBench_ENABLE_EXAMPLES=ON -DCMAKE_CUDA_ARCHITECTURE=70 .. && make
cmake -DNVBench_ENABLE_EXAMPLES=ON -DCMAKE_CUDA_ARCHITECTURES=70 .. && make
```
Be sure to set `CMAKE_CUDA_ARCHITECTURE` based on the GPU you are running on.
Be sure to set `CMAKE_CUDA_ARCHITECTURES` based on the GPU you are running on.
Examples are built by default into `build/bin` and are prefixed with `nvbench.example`.
@@ -119,7 +132,7 @@ Pass: Batch: 0.261963ms GPU, 7.18s total GPU, 27394x
## Demo Project
To get started using NVBench with your own kernels, consider trying out
the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo).
the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo).
`nvbench_demo` provides a simple CMake project that uses NVBench to build an
example benchmark. It's a great way to experiment with the library without a lot
@@ -129,7 +142,7 @@ of investment.
Contributions are welcome!
For current issues, see the [issue board](https://github.com/NVIDIA/nvbench/issues). Issues labeled with [![](https://img.shields.io/github/labels/NVIDIA/nvbench/good%20first%20issue)](https://github.com/NVIDIA/nvbench/labels/good%20first%20issue) are good for first time contributors.
For current issues, see the [issue board](https://github.com/NVIDIA/nvbench/issues). Issues labeled with [![](https://img.shields.io/github/labels/NVIDIA/nvbench/good%20first%20issue)](https://github.com/NVIDIA/nvbench/labels/good%20first%20issue) are good for first time contributors.
## Tests
@@ -146,7 +159,7 @@ To run all tests:
```
make test
```
or
or
```
ctest
```
@@ -163,6 +176,7 @@ testing and parameter tuning of individual kernels. For in-depth analysis of
end-to-end performance of multiple applications, the NVIDIA Nsight tools are
more appropriate.
NVBench is focused on evaluating the performance of CUDA kernels and is not
optimized for CPU microbenchmarks. This may change in the future, but for now,
NVBench is focused on evaluating the performance of CUDA kernels. It also provides
CPU-only benchmarking facilities intended for non-trivial CPU workloads, but is
not optimized for CPU microbenchmarks. This may change in the future, but for now,
consider using Google Benchmark for high resolution CPU benchmarks.

View File

@@ -1,38 +0,0 @@
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
SDK_TYPE:
- cuda
SDK_VER:
- 11.5.1-devel
OS_TYPE:
- ubuntu
OS_VER:
- 20.04
CXX_TYPE:
- clang
- gcc
CXX_VER:
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
exclude:
- CXX_TYPE: clang
CXX_VER: 5
- CXX_TYPE: clang
CXX_VER: 6
- CXX_TYPE: gcc
CXX_VER: 12

View File

@@ -1,30 +0,0 @@
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
SDK_TYPE:
- cuda
SDK_VER:
- 11.5.1-devel
OS_TYPE:
- ubuntu
OS_VER:
- 20.04
CXX_TYPE:
- clang
- gcc
CXX_VER:
- 11
- 12
exclude:
- CXX_TYPE: clang
CXX_VER: 11
- CXX_TYPE: gcc
CXX_VER: 12

246
ci/build_common.sh Executable file
View File

@@ -0,0 +1,246 @@
#!/bin/bash
set -eo pipefail
# Ensure the script is being executed in its containing directory
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
# Script defaults
HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++`
CXX_STANDARD=17
CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc`
CUDA_ARCHS= # Empty, use presets by default.
GLOBAL_CMAKE_OPTIONS=()
DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks.
# Check if the correct number of arguments has been provided
# usage
# Print this script's help text (options, environment variables, examples)
# and terminate with exit status 1.
function usage {
  cat <<EOF
Usage: $0 [OPTIONS]

The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores.

Options:
 -v/--verbose: enable shell echo for debugging
 -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)
 -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)
 -std: CUDA/C++ standard (Defaults to 17)
 -arch: Target CUDA arches, e.g. "60-real;70;80-virtual" (Defaults to value in presets file)
 -cmake-options: Additional options to pass to CMake

Examples:
 $ PARALLEL_LEVEL=8 $0
 $ PARALLEL_LEVEL=8 $0 -cxx g++-9
 $ $0 -cxx clang++-8
 $ $0 -cxx g++-8 -std 20 -arch 80-real -v -cuda /usr/local/bin/nvcc
 $ $0 -cmake-options "-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors"
EOF
  exit 1
}
# Parse options
# Copy the args into a temporary array, since we will modify them and
# the parent script may still need them.
# Copy CLI args into a mutable array; tokens are consumed from the front as
# they are parsed so the parent script's "$@" stays untouched.
args=("$@")
while [ "${#args[@]}" -ne 0 ]; do
case "${args[0]}" in
# Valueless flags consume one token; flags with a value consume two.
-v | --verbose) VERBOSE=1; args=("${args[@]:1}");;
-cxx) HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");;
-std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");;
-cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");;
-arch) CUDA_ARCHS="${args[1]}"; args=("${args[@]:2}");;
-disable-benchmarks) DISABLE_CUB_BENCHMARKS=1; args=("${args[@]:1}");;
-cmake-options)
if [ -n "${args[1]}" ]; then
# Split the single quoted argument on spaces into individual CMake options.
IFS=' ' read -ra split_args <<< "${args[1]}"
GLOBAL_CMAKE_OPTIONS+=("${split_args[@]}")
args=("${args[@]:2}")
else
echo "Error: No arguments provided for -cmake-options"
usage
exit 1
fi
;;
-h | -help | --help) usage ;;
# Unknown flags are fatal: report the offender, then show usage and exit.
*) echo "Unrecognized option: ${args[0]}"; usage ;;
esac
done
# Convert to full paths:
HOST_COMPILER=$(which ${HOST_COMPILER})
CUDA_COMPILER=$(which ${CUDA_COMPILER})
# Forward an explicit -arch request to CMake; otherwise the preset's value applies.
if [[ -n "${CUDA_ARCHS}" ]]; then
GLOBAL_CMAKE_OPTIONS+=("-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}")
fi
# -v/--verbose: trace every command from here on.
if [ $VERBOSE ]; then
set -x
fi
# Begin processing unsets after option parsing
set -u
# Build parallelism defaults to the machine's core count.
readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)}
if [ -z ${CCCL_BUILD_INFIX+x} ]; then
CCCL_BUILD_INFIX=""
fi
# Presets will be configured in this directory:
BUILD_DIR="../build/${CCCL_BUILD_INFIX}"
# The most recent build will always be symlinked to cccl/build/latest
mkdir -p $BUILD_DIR
rm -f ../build/latest
ln -sf $BUILD_DIR ../build/latest
# Now that BUILD_DIR exists, use readlink to canonicalize the path:
BUILD_DIR=$(readlink -f "${BUILD_DIR}")
# Prepare environment for CMake:
export CMAKE_BUILD_PARALLEL_LEVEL="${PARALLEL_LEVEL}"
export CTEST_PARALLEL_LEVEL="1"
export CXX="${HOST_COMPILER}"
export CUDACXX="${CUDA_COMPILER}"
export CUDAHOSTCXX="${HOST_COMPILER}"
export CXX_STANDARD
# Shared logging helpers (begin_group/end_group/run_command/print_var_values).
# NOTE(review): inferred from usage below — confirm against pretty_printing.sh.
source ./pretty_printing.sh
# print_environment_details
# Emits a collapsible log group describing the build environment: key build
# variables, the current commit, and GPU info (when nvidia-smi is available).
# begin_group/end_group/print_var_values come from the sourced pretty_printing.sh.
print_environment_details() {
begin_group "⚙️ Environment Details"
echo "pwd=$(pwd)"
print_var_values \
BUILD_DIR \
CXX_STANDARD \
CXX \
CUDACXX \
CUDAHOSTCXX \
NVCC_VERSION \
CMAKE_BUILD_PARALLEL_LEVEL \
CTEST_PARALLEL_LEVEL \
CCCL_BUILD_INFIX \
GLOBAL_CMAKE_OPTIONS
echo "Current commit is:"
git log -1 || echo "Not a repository"
# GPU details are best-effort: CPU-only runners just note the absence.
if command -v nvidia-smi &> /dev/null; then
nvidia-smi
else
echo "nvidia-smi not found"
fi
end_group "⚙️ Environment Details"
}
# fail_if_no_gpu
# Abort the whole script with status 1 unless nvidia-smi can reach a GPU.
fail_if_no_gpu() {
  if nvidia-smi &> /dev/null; then
    return 0
  fi
  echo "Error: No NVIDIA GPU detected. Please ensure you have an NVIDIA GPU installed and the drivers are properly configured." >&2
  exit 1
}
# print_test_time_summary <ctest_log>
# Summarizes the slowest test steps from a ctest log file using the
# PrintCTestRunTimes.cmake helper. Silently does nothing if the log file
# does not exist. In CI (GITHUB_ACTIONS set) the full report is printed;
# locally only the top 15 lines are shown to keep output short.
# Fixes: ctest_log is now `local` (previously leaked into caller scope) and
# all expansions are quoted so paths containing spaces work.
function print_test_time_summary()
{
  local ctest_log="${1}"
  if [ -f "${ctest_log}" ]; then
    begin_group "⏱️ Longest Test Steps"
    # Only print the full output in CI:
    if [ -n "${GITHUB_ACTIONS:-}" ]; then
      cmake "-DLOGFILE=${ctest_log}" -P ../cmake/PrintCTestRunTimes.cmake
    else
      cmake "-DLOGFILE=${ctest_log}" -P ../cmake/PrintCTestRunTimes.cmake | head -n 15
    fi
    end_group "⏱️ Longest Test Steps"
  fi
}
# configure_preset <build_name> <preset> <cmake_options>
# Runs `cmake --preset=<preset>` from the repository root inside a named log
# group, appending the global options plus the caller's extra options.
# CMAKE_OPTIONS is intentionally left unquoted so it word-splits into
# separate CMake arguments.
# Returns the exit status of the cmake configure invocation.
function configure_preset()
{
local BUILD_NAME=$1
local PRESET=$2
local CMAKE_OPTIONS=$3
local GROUP_NAME="🛠️ CMake Configure ${BUILD_NAME}"
pushd .. > /dev/null
run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE "${GLOBAL_CMAKE_OPTIONS[@]}" $CMAKE_OPTIONS
status=$?
popd > /dev/null
return $status
}
# build_preset <build_name> <preset>
# Builds the given CMake preset from the repository root inside a named log
# group, bracketing the build with sccache statistics. In CI (GITHUB_ACTIONS
# set) detailed sccache stats and a ninja build-time summary are also shown.
# Returns the exit status of the `cmake --build` invocation.
# Fixes: removed unused locals (green/red); escaped the inner quotes in the
# explanation text so "weighted"/"important" are actually printed quoted
# (matching the same text in ninja_summary.py) instead of being dropped by
# the shell's quote pairing.
function build_preset() {
  local BUILD_NAME=$1
  local PRESET=$2
  local GROUP_NAME="🏗️ Build ${BUILD_NAME}"
  source "./sccache_stats.sh" "start"
  pushd .. > /dev/null
  run_command "$GROUP_NAME" cmake --build --preset=$PRESET -v
  status=$?
  popd > /dev/null
  minimal_sccache_stats=$(source "./sccache_stats.sh" "end")
  # Only print detailed stats in actions workflow
  if [ -n "${GITHUB_ACTIONS:-}" ]; then
    begin_group "💲 sccache stats"
    echo "${minimal_sccache_stats}"
    sccache -s
    end_group
    begin_group "🥷 ninja build times"
    echo "The \"weighted\" time is the elapsed time of each build step divided by the number
of tasks that were running in parallel. This makes it an excellent approximation
of how \"important\" a slow step was. A link that is entirely or mostly serialized
will have a weighted time that is the same or similar to its elapsed time. A
compile that runs in parallel with 999 other compiles will have a weighted time
that is tiny."
    ./ninja_summary.py -C ${BUILD_DIR}/${PRESET} || echo "ninja_summary.py failed"
    end_group
  else
    echo $minimal_sccache_stats
  fi
  return $status
}
# test_preset <build_name> <preset>
# Runs `ctest --preset=<preset>` from the repository root inside a named log
# group, writing the test log under ${BUILD_DIR}/log/ctest and printing a
# summary of the slowest test steps afterwards. Aborts the script early if no
# GPU is available.
# Returns the exit status of the ctest invocation.
# Fixes: log-path variables are now `local` (previously leaked into caller
# scope) and the summary call quotes its path argument so directories with
# spaces work.
function test_preset()
{
  local BUILD_NAME=$1
  local PRESET=$2
  local GROUP_NAME="🚀 Test ${BUILD_NAME}"
  fail_if_no_gpu
  local ctest_log_dir="${BUILD_DIR}/log/ctest"
  local ctest_log="${ctest_log_dir}/${PRESET}"
  mkdir -p "${ctest_log_dir}"
  pushd .. > /dev/null
  run_command "$GROUP_NAME" ctest --output-log "${ctest_log}" --preset=$PRESET
  status=$?
  popd > /dev/null
  print_test_time_summary "${ctest_log}"
  return $status
}
# configure_and_build_preset <build_name> <preset> <cmake_options>
# Convenience wrapper: configure the named preset, then build it.
function configure_and_build_preset()
{
  local build_name="$1"
  local preset="$2"
  local cmake_options="$3"

  configure_preset "$build_name" "$preset" "$cmake_options"
  build_preset "$build_name" "$preset"
}

30
ci/build_nvbench.sh Executable file
View File

@@ -0,0 +1,30 @@
#!/bin/bash
source "$(dirname "$0")/build_common.sh"
print_environment_details
PRESET="nvbench-ci"
CMAKE_OPTIONS=""
# version_lt <lhs> <rhs>
# Returns 0 (true) iff version string <lhs> is strictly less than <rhs>,
# compared with `sort -V` (version ordering, so 11.2 < 11.10). Any "v"
# characters are stripped from both arguments first (e.g. "v11.2").
# Fixes: replaced backticks with $(...), quoted the command substitution to
# avoid word splitting, and used printf instead of non-portable `echo -e`.
function version_lt() {
  local lhs="${1//v/}"
  local rhs="${2//v/}"
  # Equal versions are not "less than":
  [ "$lhs" = "$rhs" ] && return 1
  # lhs < rhs iff lhs sorts first under version ordering.
  [ "$lhs" = "$(printf '%s\n%s\n' "$lhs" "$rhs" | sort -V | head -n 1)" ]
}
# If CUDA_COMPILER is nvcc and the version < 11.3, disable CUPTI
if [[ "$CUDA_COMPILER" == *"nvcc"* ]]; then
# Extract the "X.Y" release number from `nvcc --version` output.
CUDA_VERSION=$(nvcc --version | grep release | sed -r 's/.*release ([0-9.]+).*/\1/')
if version_lt "$CUDA_VERSION" "11.3"; then
CMAKE_OPTIONS+=" -DNVBench_ENABLE_CUPTI=OFF "
fi
fi
configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
# NOTE(review): only print_test_time_summary is visible in build_common.sh;
# confirm print_time_summary is defined by one of the sourced helpers.
print_time_summary

View File

@@ -1,231 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
################################################################################
# NVBench build script for gpuCI
################################################################################
set -e
# append variable value
# Appends ${value} to ${variable}, adding a space before ${value} if
# ${variable} is not empty.
# ${!1} is indirect expansion: it reads the variable whose NAME is in $1;
# the eval writes the result back through that same name.
function append {
tmp="${!1:+${!1} }${2}"
eval "${1}=\${tmp}"
}
# log args...
# Print the given message with a ">>>>" gpuCI log prefix, padded with one
# blank line before and after.
function log() {
  local message="${*}"
  printf '\n>>>> %s\n\n' "${message}"
}
# print_with_trailing_blank_line args...
# Print the given message followed by one blank line, preserving any
# newlines inside the message itself.
function print_with_trailing_blank_line {
  local message="${*}"
  printf '%s\n\n' "${message}"
}
# echo_and_run name args...
# Echo ${args[@]}, then execute ${args[@]}
# NOTE(review): ${@:2} is expanded unquoted, so the command's arguments are
# re-split on whitespace when executed — this appears intentional here;
# confirm before adding quotes.
function echo_and_run {
echo "${1}: ${@:2}"
${@:2}
}
# echo_and_run_timed name args...
# Echo ${args[@]}, then execute ${args[@]} and report how long it took,
# including ${name} in the output of the time.
# TIMEFORMAT customizes bash's `time` keyword output; %lR is the long-format
# elapsed (real) time. The leading $'\n' puts the report on its own line.
function echo_and_run_timed {
echo "${@:2}"
TIMEFORMAT=$'\n'"${1} Time: %lR"
time ${@:2}
}
# join_delimit <delimiter> [value [value [...]]]
# Join all remaining values into a single string, separated by the
# single-character delimiter. Eg:
#   foo=(bar baz kramble)
#   joined_foo=$(join_delimit "|" "${foo[@]}")
#   echo joined_foo # "bar|baz|kramble"
function join_delimit {
  local delimiter="${1}"
  shift
  # "${*}" joins the remaining arguments with the first character of IFS.
  local IFS="${delimiter}"
  echo "${*}"
}
################################################################################
# VARIABLES - Set up bash and environmental variables.
################################################################################
# Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc.
source /etc/cccl.bashrc
# Set path.
export PATH=/usr/local/cuda/bin:${PATH}
# Set home to the job's workspace.
export HOME=${WORKSPACE}
# Switch to the build directory.
cd ${WORKSPACE}
mkdir -p build
cd build
# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate:
rm -f .ninja_log
if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then
CMAKE_BUILD_TYPE="Release"
fi
CMAKE_BUILD_FLAGS="--"
# The Docker image sets up `${CXX}` and `${CUDACXX}`.
append CMAKE_FLAGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER='${CUDACXX}'"
if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
echo "nvc++ not supported."
exit 1
else
if [[ "${CXX_TYPE}" == "icc" ]]; then
echo "icc not supported."
exit 1
fi
# We're using NVCC so we need to set the host compiler.
append CMAKE_FLAGS "-DCMAKE_CXX_COMPILER='${CXX}'"
append CMAKE_FLAGS "-DCMAKE_CUDA_HOST_COMPILER='${CXX}'"
append CMAKE_FLAGS "-G Ninja"
# Don't stop on build failures.
append CMAKE_BUILD_FLAGS "-k0"
fi
if [[ -n "${PARALLEL_LEVEL}" ]]; then
DETERMINE_PARALLELISM_FLAGS="-j ${PARALLEL_LEVEL}"
fi
WSL=0
if [[ $(grep -i microsoft /proc/version) ]]; then
echo "Windows Subsystem for Linux detected."
WSL=1
fi
export WSL
#append CMAKE_FLAGS "-DCMAKE_CUDA_ARCHITECTURES=all"
append CMAKE_FLAGS "-DNVBench_ENABLE_EXAMPLES=ON"
append CMAKE_FLAGS "-DNVBench_ENABLE_TESTING=ON"
append CMAKE_FLAGS "-DNVBench_ENABLE_CUPTI=ON"
append CMAKE_FLAGS "-DNVBench_ENABLE_WERROR=ON"
# These consume a lot of time and don't currently have
# any value as regression tests.
append CMAKE_FLAGS "-DNVBench_ENABLE_DEVICE_TESTING=OFF"
# NVML doesn't work under WSL
if [[ ${WSL} -eq 0 ]]; then
append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=ON"
else
append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=OFF"
fi
if [[ -n "${@}" ]]; then
append CMAKE_BUILD_FLAGS "${@}"
fi
append CTEST_FLAGS "--output-on-failure"
# Export variables so they'll show up in the logs when we report the environment.
export CMAKE_FLAGS
export CMAKE_BUILD_FLAGS
export CTEST_FLAGS
################################################################################
# ENVIRONMENT - Configure and print out information about the environment.
################################################################################
log "Determine system topology..."
# Set `${PARALLEL_LEVEL}` if it is unset; otherwise, this just reports the
# system topology.
source ${WORKSPACE}/ci/common/determine_build_parallelism.bash ${DETERMINE_PARALLELISM_FLAGS}
log "Get environment..."
env | sort
log "Check versions..."
# We use sed and echo below to ensure there is always one and only trailing
# line following the output from each tool.
${CXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
echo
${CUDACXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
echo
cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
echo
if [[ "${BUILD_TYPE}" == "gpu" ]]; then
nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/'
fi
################################################################################
# BUILD
################################################################################
log "Configure..."
echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS}
configure_status=$?
log "Build..."
# ${PARALLEL_LEVEL} needs to be passed after we run
# determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}.
set +e # Don't stop on build failures.
echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL}
build_status=$?
set -e
################################################################################
# TEST - Run examples and tests.
################################################################################
log "Test..."
(
# Make sure test_status captures ctest, not tee:
# https://stackoverflow.com/a/999259/11130318
set -o pipefail
echo_and_run_timed "Test" ctest ${CTEST_FLAGS} -j ${PARALLEL_LEVEL} | tee ctest_log
)
test_status=$?
################################################################################
# SUMMARY - Print status of each step and exit with failure if needed.
################################################################################
log "Summary:"
echo "- Configure Error Code: ${configure_status}"
echo "- Build Error Code: ${build_status}"
echo "- Test Error Code: ${test_status}"
if [[ "${configure_status}" != "0" ]] || \
[[ "${build_status}" != "0" ]] || \
[[ "${test_status}" != "0" ]]; then
exit 1
fi

View File

@@ -1,119 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
function usage {
echo "Usage: ${0} [flags...]"
echo
echo "Examine the system topology to determine a reasonable amount of build"
echo "parallelism."
echo
echo "Exported variables:"
echo " \${LOGICAL_CPUS} : Logical processors (e.g. threads)."
echo " \${PHYSICAL_CPUS} : Physical processors (e.g. cores)."
echo " \${TOTAL_MEM} : Total system memory [GB]."
echo " \${MAX_THREADS_PER_CORE} : Maximum threads per core allowed."
echo " \${MIN_MEMORY_PER_THREAD} : Minimum memory [GB] per thread allowed."
echo " \${CPU_BOUND_THREADS} : # of build threads constrained by processors."
echo " \${MEM_BOUND_THREADS} : # of build threads constrained by memory [GB]."
echo " \${PARALLEL_LEVEL} : Determined # of build threads."
echo " \${MEM_PER_THREAD} : Memory [GB] per build thread."
echo
echo "-h, -help, --help"
echo " Print this message."
echo
echo "-q, --quiet"
echo " Print nothing and only export variables."
echo
echo "-j <threads>, --jobs <threads>"
echo " Explicitly set the number of build threads to use."
echo
echo "--max-threads-per-core <threads>"
echo " Specify the maximum threads per core allowed (default: ${MAX_THREADS_PER_CORE} [threads/core])."
echo
echo "--min-memory-per-thread <gigabytes>"
echo " Specify the minimum memory per thread allowed (default: ${MIN_MEMORY_PER_THREAD} [GBs/thread])."
exit -3
}
QUIET=0
export MAX_THREADS_PER_CORE=2
export MIN_MEMORY_PER_THREAD=1 # [GB]
while test ${#} != 0
do
case "${1}" in
-h) ;&
-help) ;&
--help) usage ;;
-q) ;&
--quiet) QUIET=1 ;;
-j) ;&
--jobs)
shift # The next argument is the number of threads.
PARALLEL_LEVEL="${1}"
;;
--max-threads-per-core)
shift # The next argument is the number of threads per core.
MAX_THREADS_PER_CORE="${1}"
;;
--min-memory-per-thread)
shift # The next argument is the amount of memory per thread.
MIN_MEMORY_PER_THREAD="${1}"
;;
esac
shift
done
# https://stackoverflow.com/a/23378780
if [ $(uname) == "Darwin" ]; then
export LOGICAL_CPUS=$(sysctl -n hw.logicalcpu_max)
export PHYSICAL_CPUS=$(sysctl -n hw.physicalcpu_max)
else
export LOGICAL_CPUS=$(lscpu -p | egrep -v '^#' | wc -l)
export PHYSICAL_CPUS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
fi
export TOTAL_MEM=$(awk "BEGIN { printf \"%0.4g\", $(grep MemTotal /proc/meminfo | awk '{ print $2 }') / (1024 * 1024) }")
export CPU_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${PHYSICAL_CPUS} * ${MAX_THREADS_PER_CORE}) }")
export MEM_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${TOTAL_MEM} / ${MIN_MEMORY_PER_THREAD}) }")
if [[ -z "${PARALLEL_LEVEL}" ]]; then
# Pick the smaller of the two as the default.
if [[ "${MEM_BOUND_THREADS}" -lt "${CPU_BOUND_THREADS}" ]]; then
export PARALLEL_LEVEL=${MEM_BOUND_THREADS}
else
export PARALLEL_LEVEL=${CPU_BOUND_THREADS}
fi
else
EXPLICIT_PARALLEL_LEVEL=1
fi
# This can be a floating point number.
export MEM_PER_THREAD=$(awk "BEGIN { printf \"%.04g\", ${TOTAL_MEM} / ${PARALLEL_LEVEL} }")
if [[ "${QUIET}" == 0 ]]; then
echo "Logical CPUs: ${LOGICAL_CPUS} [threads]"
echo "Physical CPUs: ${PHYSICAL_CPUS} [cores]"
echo "Total Mem: ${TOTAL_MEM} [GBs]"
echo "Max Threads Per Core: ${MAX_THREADS_PER_CORE} [threads/core]"
echo "Min Memory Per Threads: ${MIN_MEMORY_PER_THREAD} [GBs/thread]"
echo "CPU Bound Threads: ${CPU_BOUND_THREADS} [threads]"
echo "Mem Bound Threads: ${MEM_BOUND_THREADS} [threads]"
echo -n "Parallel Level: ${PARALLEL_LEVEL} [threads]"
if [[ -n "${EXPLICIT_PARALLEL_LEVEL}" ]]; then
echo " (explicitly set)"
else
echo
fi
echo "Mem Per Thread: ${MEM_PER_THREAD} [GBs/thread]"
fi

View File

@@ -1,14 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
################################################################################
# NVBench build script for gpuCI (CPU-only)
################################################################################
export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
source ${WORKSPACE}/ci/common/build.bash

View File

@@ -1,14 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
################################################################################
# NVBench build script for gpuCI (heterogeneous)
################################################################################
export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
source ${WORKSPACE}/ci/common/build.bash

View File

@@ -1,215 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
################################################################################
# NVBench local containerized build script
################################################################################
function usage {
echo "Usage: ${0} [flags...] [cmake-targets...]"
echo
echo "Build and test your local repository using a gpuCI Docker image."
echo "If CMake targets are specified, only those targets are built and tested."
echo "Otherwise, everything is built and tested."
echo
echo "-h, -help, --help"
echo " Print this message."
echo
echo "-r <path>, --repository <path>"
echo " Path to the repository (default: ${REPOSITORY_PATH})."
echo
echo "-i <image>, --image <image>"
echo " Docker image to use (default: ${IMAGE})"
echo
echo "-l, --local-image"
echo " Use the local version of the image instead of pulling from Docker hub."
echo
echo "-s, --shell-only"
echo " Skip building and testing and launch an interactive shell instead."
echo
echo "-d, --disable-gpus"
echo " Don't start the container with the NVIDIA runtime and GPUs attached."
echo
echo "-c, --clean"
echo " If the build directory already exists, delete it."
echo
echo "-j <threads>, --jobs <threads>"
echo " Number of threads to use when building (default: inferred)."
echo
echo "-b <type>, --cmake-build-type <plan>"
echo " CMake build type to use, either Release, RelWithDebInfo, or Debug"
echo " (default: ${CMAKE_BUILD_TYPE})."
echo
exit -3
}
SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
REPOSITORY_PATH=$(realpath ${SCRIPT_PATH}/../..)
################################################################################
# FLAGS - Process command line flags.
################################################################################
IMAGE="gpuci/cccl:cuda11.5.1-devel-ubuntu20.04-gcc9"
LOCAL_IMAGE=0
SHELL_ONLY=0
BUILD_TYPE="gpu"
CLEAN=0
PARALLEL_LEVEL=""
CMAKE_BUILD_TYPE="Release"
TARGETS=""
while test ${#} != 0
do
case "${1}" in
-h) ;&
-help) ;&
--help) usage ;;
-r) ;&
--repository)
shift # The next argument is the path.
REPOSITORY_PATH="${1}"
;;
-i) ;&
--image)
shift # The next argument is the image.
IMAGE="${1}"
;;
-l) ;&
--local-image) LOCAL_IMAGE=1 ;;
-s) ;&
--shell-only) SHELL_ONLY=1 ;;
-d) ;&
--disable-gpus) BUILD_TYPE="cpu" ;;
-c) ;&
--clean) CLEAN=1 ;;
-j) ;&
--jobs)
shift # The next argument is the number of threads.
PARALLEL_LEVEL="${1}"
;;
-b) ;&
--cmake-build-type)
shift # The next argument is the build type.
CMAKE_BUILD_TYPE="${1}"
;;
*)
TARGETS="${TARGETS:+${TARGETS} }${1}"
;;
esac
shift
done
################################################################################
# PATHS - Setup paths for the container.
################################################################################
# ${REPOSITORY_PATH} is the local filesystem path to the Git repository being
# built and tested. It can be set with the --repository flag.
#
# ${BUILD_PATH} is the local filesystem path that will be used for the build. It
# is named after the image name, allowing multiple image builds to coexist on
# the local filesystem.
#
# ${REPOSITORY_PATH_IN_CONTAINER} is the location of ${REPOSITORY_PATH} inside
# the container.
#
# ${BUILD_PATH_IN_CONTAINER} is the location of ${BUILD_PATH} inside the
# container.
BUILD_PATH=${REPOSITORY_PATH}/build_$(echo "$(basename "${IMAGE}")" | sed -e 's/:/_/g' | sed -e 's/-/_/g')
if [[ "${CLEAN}" != 0 ]]; then
rm -rf ${BUILD_PATH}
fi
mkdir -p ${BUILD_PATH}
BASE_PATH_IN_CONTAINER="/cccl"
REPOSITORY_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")"
BUILD_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")/build"
################################################################################
# ENVIRONMENT - Setup the thunk build script that will be run by the container.
################################################################################
# We have to run `ldconfig` to rebuild `ld.so.cache` to work around this
# failure on Debian: https://github.com/NVIDIA/nvidia-docker/issues/1399
COMMAND="sudo ldconfig; sudo ldconfig"
if [[ "${SHELL_ONLY}" != 0 ]]; then
COMMAND="${COMMAND}; bash"
else
COMMAND="${COMMAND}; ${REPOSITORY_PATH_IN_CONTAINER}/ci/common/build.bash ${TARGETS} || bash"
fi
################################################################################
# GPU - Setup GPUs.
################################################################################
# Note: We always start docker with --gpus, even for cpu builds. Otherwise
# libcuda.so.1 is not present and no NVBench tests are able to run.
# Limit GPUs available to the container based on ${CUDA_VISIBLE_DEVICES}.
if [[ -z "${CUDA_VISIBLE_DEVICES}" ]]; then
VISIBLE_DEVICES="all"
else
VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}"
fi
DOCKER_MAJOR_VER=$(docker -v | sed 's/[^[0-9]*\([0-9]*\).*/\1/')
GPU_OPTS="--gpus device=${VISIBLE_DEVICES}"
if [[ "${DOCKER_MAJOR_VER}" -lt 19 ]]
then
GPU_OPTS="--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES='${VISIBLE_DEVICES}'"
fi
################################################################################
# LAUNCH - Pull and launch the container.
################################################################################
#NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia)
NVIDIA_DOCKER_INSTALLED=1 # Broken on WSL
if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then
echo "NVIDIA Docker not found, please install it: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce"
exit -4
fi
if [[ "${LOCAL_IMAGE}" == 0 ]]; then
docker pull "${IMAGE}"
fi
docker run --rm -it ${GPU_OPTS} \
--cap-add=SYS_PTRACE \
--user "$(id -u)":"$(id -g)" \
-v "${REPOSITORY_PATH}":"${REPOSITORY_PATH_IN_CONTAINER}" \
-v "${BUILD_PATH}":"${BUILD_PATH_IN_CONTAINER}" \
-v /etc/passwd:/etc/passwd:ro \
-v /etc/group:/etc/group:ro \
-v /etc/subuid:/etc/subuid:ro \
-v /etc/subgid:/etc/subgid:ro \
-v /etc/shadow:/etc/shadow:ro \
-v /etc/gshadow:/etc/gshadow:ro \
-e "WORKSPACE=${REPOSITORY_PATH_IN_CONTAINER}" \
-e "BUILD_TYPE=${BUILD_TYPE}" \
-e "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" \
-e "COVERAGE_PLAN=${COVERAGE_PLAN}" \
-e "PARALLEL_LEVEL=${PARALLEL_LEVEL}" \
-w "${BUILD_PATH_IN_CONTAINER}" \
"${IMAGE}" bash -c "${COMMAND}"

61
ci/matrix.yaml Normal file
View File

@@ -0,0 +1,61 @@
cuda_prev_min: &cuda_prev_min '11.1' # Unsupported: No cupti support, issues compiling newer fmt.
cuda_prev_max: &cuda_prev_max '11.8'
cuda_curr_min: &cuda_curr_min '12.0'
cuda_curr_max: &cuda_curr_max '12.8'
# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
devcontainer_version: '25.06'
# gcc compiler configurations
gcc7: &gcc7 { name: 'gcc', version: '7', exe: 'g++' }
gcc8: &gcc8 { name: 'gcc', version: '8', exe: 'g++' }
gcc9: &gcc9 { name: 'gcc', version: '9', exe: 'g++' }
gcc10: &gcc10 { name: 'gcc', version: '10', exe: 'g++' }
gcc11: &gcc11 { name: 'gcc', version: '11', exe: 'g++' }
gcc12: &gcc12 { name: 'gcc', version: '12', exe: 'g++' }
gcc13: &gcc13 { name: 'gcc', version: '13', exe: 'g++' }
gcc14: &gcc14 { name: 'gcc', version: '14', exe: 'g++' }
# LLVM Compiler configurations
llvm14: &llvm14 { name: 'llvm', version: '14', exe: 'clang++' }
llvm15: &llvm15 { name: 'llvm', version: '15', exe: 'clang++' }
llvm16: &llvm16 { name: 'llvm', version: '16', exe: 'clang++' }
llvm17: &llvm17 { name: 'llvm', version: '17', exe: 'clang++' }
llvm18: &llvm18 { name: 'llvm', version: '18', exe: 'clang++' }
llvm19: &llvm19 { name: 'llvm', version: '19', exe: 'clang++' }
# Each environment below will generate a unique build/test job
# See the "compute-matrix" job in the workflow for how this is parsed and used
# cuda: The CUDA Toolkit version
# os: The operating system used
# cpu: The CPU architecture
# compiler: The compiler to use
# name: The compiler name
# version: The compiler version
# exe: The unversioned compiler binary name
# Configurations that will run for every PR
pull_request:
nvcc:
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7 }
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8 }
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9 }
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10 }
- {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11 }
- {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12 }
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13 }
- {cuda: *cuda_curr_max, os: 'ubuntu24.04', cpu: 'amd64', compiler: *gcc14 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm18 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm19 }

390
ci/ninja_summary.py Executable file
View File

@@ -0,0 +1,390 @@
#!/usr/bin/env python3
# Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
r"""Summarize the last ninja build, invoked with ninja's -C syntax.
This script is designed to be automatically run after each ninja build in
order to summarize the build's performance. Making build performance information
more visible should make it easier to notice anomalies and opportunities. To use
this script on Windows just set NINJA_SUMMARIZE_BUILD=1 and run autoninja.bat.
On Linux you can get autoninja to invoke this script using this syntax:
$ NINJA_SUMMARIZE_BUILD=1 autoninja -C out/Default/ chrome
You can also call this script directly using ninja's syntax to specify the
output directory of interest:
> python3 post_build_ninja_summary.py -C out/Default
Typical output looks like this:
>ninja -C out\debug_component base
ninja.exe -C out\debug_component base -j 960 -l 48 -d keeprsp
ninja: Entering directory `out\debug_component'
[1 processes, 1/1 @ 0.3/s : 3.092s ] Regenerating ninja files
Longest build steps:
0.1 weighted s to build obj/base/base/trace_log.obj (6.7 s elapsed time)
0.2 weighted s to build nasm.exe, nasm.exe.pdb (0.2 s elapsed time)
0.3 weighted s to build obj/base/base/win_util.obj (12.4 s elapsed time)
1.2 weighted s to build base.dll, base.dll.lib (1.2 s elapsed time)
Time by build-step type:
0.0 s weighted time to generate 6 .lib files (0.3 s elapsed time sum)
0.1 s weighted time to generate 25 .stamp files (1.2 s elapsed time sum)
0.2 s weighted time to generate 20 .o files (2.8 s elapsed time sum)
1.7 s weighted time to generate 4 PEFile (linking) files (2.0 s elapsed
time sum)
23.9 s weighted time to generate 770 .obj files (974.8 s elapsed time sum)
26.1 s weighted time (982.9 s elapsed time sum, 37.7x parallelism)
839 build steps completed, average of 32.17/s
If no gn clean has been done then results will be for the last non-NULL
invocation of ninja. Ideas for future statistics, and implementations are
appreciated.
The "weighted" time is the elapsed time of each build step divided by the number
of tasks that were running in parallel. This makes it an excellent approximation
of how "important" a slow step was. A link that is entirely or mostly serialized
will have a weighted time that is the same or similar to its elapsed time. A
compile that runs in parallel with 999 other compiles will have a weighted time
that is tiny."""
import argparse
import errno
import fnmatch
import os
import subprocess
import sys
# The number of long build times to report:
long_count = 10
# The number of long times by extension to report
long_ext_count = 10
class Target:
    """Represents a single line read for a .ninja_log file."""

    def __init__(self, start, end):
        """Creates a target object by passing in the start/end times in
        seconds as a float."""
        self.start = start
        self.end = end
        # Output targets of this build step; appended to by the owner.
        self.targets = []
        self.weighted_duration = 0.0

    def Duration(self):
        """Returns the task duration in seconds as a float."""
        return self.end - self.start

    def SetWeightedDuration(self, weighted_duration):
        """Sets the duration, in seconds, passed in as a float."""
        self.weighted_duration = weighted_duration

    def WeightedDuration(self):
        """Returns the task's weighted duration in seconds as a float.

        The weighted duration is the elapsed time of the task divided by how
        many other tasks were running at the same time, approximating this
        task's impact on the total build time. It should always be the same
        or shorter than the raw duration.
        """
        # Tolerate modest floating-point drift when comparing to Duration().
        slack = 0.000002
        bound = self.Duration() + slack
        if self.weighted_duration > bound:
            print("%s > %s?" % (self.weighted_duration, self.Duration()))
        assert self.weighted_duration <= bound
        return self.weighted_duration

    def DescribeTargets(self):
        """Returns a printable string that summarizes the targets."""
        # Some build steps generate dozens of outputs; truncate so the
        # summary fits most long single-target names without word wrapping.
        joined = ", ".join(self.targets)
        limit = 65
        return joined if len(joined) <= limit else joined[:limit] + "..."
# Copied with some modifications from ninjatracing
def ReadTargets(log, show_all):
    """Parse an open .ninja_log file into a list of Target objects.

    When |show_all| is False, only records belonging to the most recent
    build in the log are kept; earlier builds' data is discarded as soon
    as a new build is detected.
    """
    header = log.readline()
    # An empty .ninja_log is fine: there is simply nothing to report.
    if not header:
        return []
    assert header == "# ninja log v5\n", "unrecognized ninja log version %r" % header
    targets_by_hash = {}
    prev_end = 0.0
    for line in log:
        fields = line.strip().split("\t")
        if len(fields) != 5:
            # If ninja.exe is rudely halted the .ninja_log may be corrupt;
            # silently skip malformed records.
            continue
        start, end, _, name, cmdhash = fields  # Ignore restat.
        # The log stores integral milliseconds; convert to float seconds.
        start = int(start) / 1000.0
        end = int(end) / 1000.0
        if not show_all and end < prev_end:
            # An earlier end time means this record starts a new (possibly
            # incremental) build; drop the previous build's data so the new
            # one is displayed independently. End times are compared rather
            # than start times because records are written as commands
            # *complete*, so end times are monotonic within a build while
            # start times are not.
            targets_by_hash = {}
        target = None
        if cmdhash in targets_by_hash:
            target = targets_by_hash[cmdhash]
            if not show_all and (target.start != start or target.end != end):
                # Short incremental builds may never trip the end-time check
                # above. A build step repeated across two builds shows up
                # with the same hash but changed start/stop times, which
                # also identifies a new build: reset the dictionary.
                targets_by_hash = {}
                target = None
        if not target:
            target = Target(start, end)
            targets_by_hash[cmdhash] = target
        prev_end = end
        target.targets.append(name)
    return list(targets_by_hash.values())
def GetExtension(target, extra_patterns):
    """Return the file extension that best represents a target.

    For targets that generate multiple outputs it is important to return a
    consistent 'canonical' extension. Ultimately the goal is to group build
    steps by type.

    Args:
      target: a Target whose |targets| list of output names is scanned.
      extra_patterns: optional semicolon-separated fnmatch patterns; the
        first pattern that matches an output is returned verbatim as the
        grouping key.
    """
    # Default for targets with no outputs at all (previously this path
    # raised UnboundLocalError) and for outputs with no extension.
    extension = "(no extension found)"
    for output in target.targets:
        if extra_patterns:
            for fn_pattern in extra_patterns.split(";"):
                if fnmatch.fnmatch(output, "*" + fn_pattern + "*"):
                    return fn_pattern
        # Not a true extension, but a good grouping.
        if output.endswith("type_mappings"):
            extension = "type_mappings"
            break
        # Capture two extensions if present. For example: file.javac.jar should
        # be distinguished from file.interface.jar.
        root, ext1 = os.path.splitext(output)
        _, ext2 = os.path.splitext(root)
        extension = ext2 + ext1  # Preserve the order in the file name.
        if len(extension) == 0:
            extension = "(no extension found)"
        if ext1 in [".pdb", ".dll", ".exe"]:
            # Group .dll and .exe together and keep the .dll.lib files from
            # being listed as libraries.
            extension = "PEFile (linking)"
            break
        if ext1 in [".so", ".TOC"]:
            # Attempt to identify linking, avoid identifying as '.TOC'.
            extension = ".so (linking)"
            break
        # Make sure .obj files don't get categorized as mojo files.
        if ext1 in [".obj", ".o"]:
            break
        # Jars are the canonical output of java targets.
        if ext1 == ".jar":
            break
        # Normalize all mojo related outputs to 'mojo'.
        if output.count(".mojom") > 0:
            extension = "mojo"
            break
    return extension
def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
    """Print a summary of the passed in list of Target objects.

    Reports the slowest individual build steps, per-extension time totals,
    and overall parallelism statistics.

    Args:
      entries: list of Target objects; sorted in place by this function.
      extra_step_types: optional semicolon-separated fnmatch patterns that
        add extra grouping buckets (see GetExtension).
      elapsed_time_sorting: when True, sort report rows by elapsed time
        instead of weighted time.
    """
    # An empty list has no events to scan; bail out instead of crashing on
    # task_start_stop_times[0] below.
    if not entries:
        return
    # Create a list that is in order by time stamp and has entries for the
    # beginning and ending of each build step (one time stamp may have multiple
    # entries due to multiple steps starting/stopping at exactly the same time).
    # Iterate through this list, keeping track of which tasks are running at all
    # times. At each time step calculate a running total for weighted time so
    # that when each task ends its own weighted time can easily be calculated.
    task_start_stop_times = []
    earliest = -1
    latest = 0
    total_cpu_time = 0
    for target in entries:
        if earliest < 0 or target.start < earliest:
            earliest = target.start
        if target.end > latest:
            latest = target.end
        total_cpu_time += target.Duration()
        task_start_stop_times.append((target.start, "start", target))
        task_start_stop_times.append((target.end, "stop", target))
    length = latest - earliest
    weighted_total = 0.0
    # Sort by the time/type records and ignore |target|
    task_start_stop_times.sort(key=lambda times: times[:2])
    # Now we have all task start/stop times sorted by when they happen. If a
    # task starts and stops on the same time stamp then the start will come
    # first because of the alphabet, which is important for making this work
    # correctly.
    # Track the tasks which are currently running.
    running_tasks = {}
    # Record the time we have processed up to so we know how to calculate time
    # deltas.
    last_time = task_start_stop_times[0][0]
    # Track the accumulated weighted time so that it can efficiently be added
    # to individual tasks.
    last_weighted_time = 0.0
    # Scan all start/stop events.
    for event in task_start_stop_times:
        time, action_name, target = event
        # Accumulate weighted time up to now.
        num_running = len(running_tasks)
        if num_running > 0:
            # Update the total weighted time up to this moment.
            last_weighted_time += (time - last_time) / float(num_running)
        if action_name == "start":
            # Record the total weighted task time when this task starts.
            running_tasks[target] = last_weighted_time
        if action_name == "stop":
            # Record the change in the total weighted task time while this task
            # ran.
            weighted_duration = last_weighted_time - running_tasks[target]
            target.SetWeightedDuration(weighted_duration)
            weighted_total += weighted_duration
            del running_tasks[target]
        last_time = time
    assert len(running_tasks) == 0
    # Warn if the sum of weighted times is off by more than half a second.
    # All times here are in seconds (converted from milliseconds in
    # ReadTargets), so the threshold is 0.5 — the previous value of 500
    # could essentially never trigger.
    if abs(length - weighted_total) > 0.5:
        print(
            "Warning: Possible corrupt ninja log, results may be "
            "untrustworthy. Length = %.3f, weighted total = %.3f"
            % (length, weighted_total)
        )
    # Print the slowest build steps:
    print(" Longest build steps:")
    if elapsed_time_sorting:
        entries.sort(key=lambda x: x.Duration())
    else:
        entries.sort(key=lambda x: x.WeightedDuration())
    for target in entries[-long_count:]:
        print(
            " %8.1f weighted s to build %s (%.1f s elapsed time)"
            % (target.WeightedDuration(), target.DescribeTargets(), target.Duration())
        )
    # Sum up the time by file extension/type of the output file
    count_by_ext = {}
    time_by_ext = {}
    weighted_time_by_ext = {}
    # Scan through all of the targets to build up per-extension statistics.
    for target in entries:
        extension = GetExtension(target, extra_step_types)
        time_by_ext[extension] = time_by_ext.get(extension, 0) + target.Duration()
        weighted_time_by_ext[extension] = (
            weighted_time_by_ext.get(extension, 0) + target.WeightedDuration()
        )
        count_by_ext[extension] = count_by_ext.get(extension, 0) + 1
    print(" Time by build-step type:")
    # Copy to a list with extension name and total time swapped, to (time, ext)
    if elapsed_time_sorting:
        weighted_time_by_ext_sorted = sorted((y, x) for (x, y) in time_by_ext.items())
    else:
        weighted_time_by_ext_sorted = sorted(
            (y, x) for (x, y) in weighted_time_by_ext.items()
        )
    # Print the slowest build target types:
    for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]:
        print(
            " %8.1f s weighted time to generate %d %s files "
            "(%1.1f s elapsed time sum)"
            % (time, count_by_ext[extension], extension, time_by_ext[extension])
        )
    print(
        " %.1f s weighted time (%.1f s elapsed time sum, %1.1fx "
        "parallelism)" % (length, total_cpu_time, total_cpu_time * 1.0 / length)
    )
    print(
        " %d build steps completed, average of %1.2f/s"
        % (len(entries), len(entries) / (length))
    )
def main():
    """Entry point: summarize the most recent build recorded in .ninja_log.

    Dispatches to `siso metrics summary` when a siso_metrics.json is found.
    Returns None on success, or errno.ENOENT when the log file is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-C", dest="build_directory", help="Build directory.")
    parser.add_argument(
        "-s",
        "--step-types",
        help="semicolon separated fnmatch patterns for build-step grouping",
    )
    parser.add_argument(
        "-e",
        "--elapsed_time_sorting",
        default=False,
        action="store_true",
        help="Sort output by elapsed time instead of weighted time",
    )
    parser.add_argument("--log-file", help="specific ninja log file to analyze.")
    args, _extra_args = parser.parse_known_args()

    # Resolve the input files relative to the build directory, if given.
    log_file = ".ninja_log"
    metrics_file = "siso_metrics.json"
    if args.build_directory:
        log_file = os.path.join(args.build_directory, log_file)
        metrics_file = os.path.join(args.build_directory, metrics_file)
    if args.log_file:
        log_file = args.log_file

    if not args.step_types:
        # Offer a convenient way to add extra step types automatically,
        # including when this script is run by autoninja. get() returns None
        # if the variable isn't set.
        args.step_types = os.environ.get("chromium_step_types")
    if args.step_types:
        # Make room for the extra build types in the per-extension report.
        global long_ext_count
        long_ext_count += len(args.step_types.split(";"))

    if os.path.exists(metrics_file):
        # Automatically handle summarizing siso builds.
        cmd = ["siso.bat" if "win32" in sys.platform else "siso", "metrics", "summary"]
        if args.build_directory:
            cmd += ["-C", args.build_directory]
        if args.step_types:
            cmd += ["--step_types", args.step_types]
        if args.elapsed_time_sorting:
            cmd.append("--elapsed_time_sorting")
        subprocess.run(cmd)
        return None

    try:
        with open(log_file, "r") as log:
            entries = ReadTargets(log, False)
            if entries:
                SummarizeEntries(entries, args.step_types, args.elapsed_time_sorting)
    except IOError:
        print("Log file %r not found, no build summary created." % log_file)
        return errno.ENOENT
# Allow use both as a script and as an importable module; main()'s return
# value (None or an errno) becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

105
ci/pretty_printing.sh Normal file
View File

@@ -0,0 +1,105 @@
# Print "ARG=${ARG}" for all args.
function print_var_values() {
  # Walk every argument, treating each as the *name* of a variable.
  for var_name in "$@"; do
    # An empty name is a usage error; flag it loudly.
    if [ -z "$var_name" ]; then
      echo "Usage: print_var_values <variable_name1> <variable_name2> ..."
      return 1
    fi
    # Indirect expansion: look up the named variable's value, falling back
    # to a placeholder when it is unset.
    printf '%s=%s\n' "$var_name" "${!var_name:-(undefined)}"
  done
}
# begin_group: Start a named section of log output, possibly with color.
# Usage: begin_group "Group Name" [Color]
# Group Name: A string specifying the name of the group.
# Color (optional): ANSI color code to set text color. Default is blue (1;34).
function begin_group() {
  local name="${1:-}"
  # See options for colors here: https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124
  local blue="34"
  local color="${2:-$blue}"
  if [ -z "${GITHUB_ACTIONS:-}" ]; then
    # Plain console: print a colored banner line.
    echo -e "\e[${color}m================== ${name} ======================\e[0m"
  else
    # GitHub Actions: open a collapsible log group.
    echo -e "::group::\e[${color}m${name}\e[0m"
  fi
}
# end_group: End a named section of log output and print status based on exit status.
# Usage: end_group "Group Name" [Exit Status]
# Group Name: A string specifying the name of the group.
# Exit Status (optional): The exit status of the command run within the group. Default is 0.
function end_group() {
  local name="${1:-}"
  local build_status="${2:-0}"
  # Optional third argument: elapsed seconds, shown in the banner if given.
  local duration="${3:-}"
  local red="31"
  local blue="34"
  if [ -n "${GITHUB_ACTIONS:-}" ]; then
    echo "::endgroup::"
    # On failure, emit an annotation pointing back at the collapsed log.
    if [ "$build_status" -ne 0 ]; then
      echo -e "::error::\e[${red}m ${name} - Failed (⬆️ click above for full log ⬆️)\e[0m"
    fi
  elif [ "$build_status" -ne 0 ]; then
    echo -e "\e[${red}m================== End ${name} - Failed${duration:+ - Duration: ${duration}s} ==================\e[0m"
  else
    echo -e "\e[${blue}m================== End ${name} - Success${duration:+ - Duration: ${duration}s} ==================\n\e[0m"
  fi
}
declare -A command_durations
# Runs a command within a named group, handles the exit status, and prints appropriate messages based on the result.
# Usage: run_command "Group Name" command [arguments...]
function run_command() {
  # First argument is the group/banner name; the rest is the command line.
  local group_name="${1:-}"
  shift
  local command=("$@")
  local status
  begin_group "$group_name"
  # Temporarily disable errexit so a failing command doesn't abort the
  # script before we can report its status and duration.
  set +e
  local start_time=$(date +%s)
  "${command[@]}"
  # Capture the exit status immediately, before any other command runs.
  status=$?
  local end_time=$(date +%s)
  set -e
  local duration=$((end_time - start_time))
  end_group "$group_name" $status $duration
  # Record the duration so print_time_summary can report it later.
  command_durations["$group_name"]=$duration
  # Propagate the wrapped command's exit status to the caller.
  return $status
}
# Print the character count of the given string (via awk's length()).
function string_width() {
  echo "$1" | awk '{ print length($0) }'
}
# Print a table of all recorded group durations, aligned on the longest
# group name, then reset the timing data so a later run starts fresh.
function print_time_summary() {
  local max_length=0
  local group
  # Find the longest group name for formatting. Use the shared
  # string_width helper instead of duplicating its awk logic inline.
  for group in "${!command_durations[@]}"; do
    local group_length=$(string_width "$group")
    if [ "$group_length" -gt "$max_length" ]; then
      max_length=$group_length
    fi
  done
  echo "Time Summary:"
  for group in "${!command_durations[@]}"; do
    printf "%-${max_length}s : %s seconds\n" "$group" "${command_durations[$group]}"
  done
  # Clear the array of timing info
  declare -gA command_durations=()
}

41
ci/sccache_hit_rate.sh Executable file
View File

@@ -0,0 +1,41 @@
#!/bin/bash
# Compute the sccache hit rate between two saved `sccache --show-stats`
# snapshots. Diagnostics go to stderr; the numeric hit rate (when
# applicable) is the only value written to stdout so callers can capture it.
set -euo pipefail

# Ensure two arguments are provided
if [ $# -ne 2 ]; then
  echo "Usage: $0 <before-file> <after-file>" >&2
  exit 1
fi

# Print the contents of the before file.
# (Quote the paths: unquoted $1/$2 would undergo word splitting and
# globbing for paths containing spaces or wildcard characters.)
echo "=== Contents of $1 ===" >&2
cat "$1" >&2
echo "=== End of $1 ===" >&2

# Print the contents of the after file
echo "=== Contents of $2 ===" >&2
cat "$2" >&2
echo "=== End of $2 ===" >&2

# Extract compile requests and cache hits from the before and after files
requests_before=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$1")
hits_before=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$1")
requests_after=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$2")
hits_after=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$2")

# Calculate the differences to find out how many new requests and hits
requests_diff=$((requests_after - requests_before))
hits_diff=$((hits_after - hits_before))

echo "New Compile Requests: $requests_diff" >&2
echo "New Hits: $hits_diff" >&2

# Calculate and print the hit rate
if [ "$requests_diff" -eq 0 ]; then
  echo "No new compile requests, hit rate is not applicable"
else
  hit_rate=$(awk -v hits="$hits_diff" -v requests="$requests_diff" 'BEGIN {printf "%.2f", hits/requests * 100}')
  echo "sccache hit rate: $hit_rate%" >&2
  echo "$hit_rate"
fi

52
ci/sccache_stats.sh Executable file
View File

@@ -0,0 +1,52 @@
#!/bin/bash
# This script prints the sccache hit rate between two calls to sccache --show-stats.
# It should be sourced in your script before and after the operations you want to profile,
# with the 'start' or 'end' argument respectively.

mode=$1

if [[ "$mode" != "start" && "$mode" != "end" ]]; then
  echo "Invalid mode: $mode"
  echo "Usage: $0 {start|end}"
  exit 1
fi

# Check if sccache is available
if ! command -v sccache &> /dev/null; then
  echo "Notice: sccache is not available. Skipping..."
  exit 0
fi

case $mode in
  start)
    # Snapshot the stats with a single invocation so the hit and miss
    # counters come from the same report (two invocations could observe
    # different states if compilations finish in between).
    stats=$(sccache --show-stats)
    export SCCACHE_START_HITS=$(echo "$stats" | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
    export SCCACHE_START_MISSES=$(echo "$stats" | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
    ;;
  end)
    if [[ -z ${SCCACHE_START_HITS+x} || -z ${SCCACHE_START_MISSES+x} ]]; then
      echo "Error: start stats not collected. Did you call this script with 'start' before your operations?"
      exit 1
    fi

    # Single snapshot, for the same reason as in the 'start' branch.
    stats=$(sccache --show-stats)
    final_hits=$(echo "$stats" | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
    final_misses=$(echo "$stats" | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
    hits=$((final_hits - SCCACHE_START_HITS))
    misses=$((final_misses - SCCACHE_START_MISSES))
    total=$((hits + misses))

    prefix=""
    # Quote the expansion: the unquoted form leaves `[` with a malformed
    # expression when GITHUB_ACTIONS is set but empty.
    if [ "${GITHUB_ACTIONS:-false}" = "true" ]; then
      prefix="::notice::"
    fi

    if (( total > 0 )); then
      hit_rate=$(awk -v hits="$hits" -v total="$total" 'BEGIN { printf "%.2f", (hits / total) * 100 }')
      echo "${prefix}sccache hits: $hits | misses: $misses | hit rate: $hit_rate%"
    else
      echo "${prefix}sccache stats: N/A No new compilation requests"
    fi

    unset SCCACHE_START_HITS
    unset SCCACHE_START_MISSES
    ;;
esac

18
ci/test_nvbench.sh Executable file
View File

@@ -0,0 +1,18 @@
#!/bin/bash
# CI driver: build NVBench, then run its test suite via the "nvbench-ci"
# CTest preset. Helper functions (print_environment_details, test_preset,
# print_time_summary) come from build_common.sh.

source "$(dirname "$0")/build_common.sh"

# Run NVBench tests with high parallelism. If any need to be
# serialized, define the `RUN_SERIAL` CMake property on the
# test.
# NOTE(review): assumes PARALLEL_LEVEL is set by build_common.sh or the
# CI environment — confirm.
export CTEST_PARALLEL_LEVEL=${PARALLEL_LEVEL}

print_environment_details

# Forward all script arguments to the build step.
./build_nvbench.sh "$@"

PRESET="nvbench-ci"

test_preset "NVBench" ${PRESET}

print_time_summary

View File

@@ -0,0 +1,65 @@
# Detect the language standards supported by the current compilers.
#
# Usage: detect_supported_cxx_standards(<var_prefix> <lang> <standards>)
#
# - var_prefix: Used to name result variables,
# e.g. ${var_prefix}_${lang}_XX_SUPPORTED will be TRUE or FALSE. Defined for
# each XX in ${standards}.
# - lang: The language to test: C, CXX, or CUDA.
# - standards: List of any standard versions.
#
# Example: detect_supported_standards(PROJ CXX 11 14 17)
# - Sets the following variables in the parent scope to TRUE or FALSE:
# - PROJ_CXX_11_SUPPORTED
# - PROJ_CXX_14_SUPPORTED
# - PROJ_CXX_17_SUPPORTED
# - Sets `PROJ_DETECTED_CXX_STANDARDS` to a list of supported standards (e.g. "11;14;17").
function(detect_supported_standards prefix lang)
  # Compile-feature names look like "cxx_std_17"; build the "<lang>_std" stem.
  string(TOLOWER "${lang}_std" feature_prefix)
  set(all_stds)
  foreach(standard IN LISTS ARGN)
    set(var_name "${prefix}_${lang}_${standard}_SUPPORTED")
    # CMake advertises the dialects the compiler accepts via
    # CMAKE_<LANG>_COMPILE_FEATURES.
    if ("${feature_prefix}_${standard}" IN_LIST CMAKE_${lang}_COMPILE_FEATURES)
      set(${var_name} TRUE)
    else()
      set(${var_name} FALSE)
    endif()

    # Special cases:
    if (standard EQUAL 17 AND
        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7) OR
         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)))
      # gcc < 7 and clang < 8 don't fully support C++17.
      # They accept the flag and have partial support, but nvcc will refuse
      # to enable it and falls back to the default dialect for the current
      # CXX compiler version. This breaks our CI.
      # CMake's COMPILE_FEATURES var reports that these compilers support C++17,
      # but we can't rely on it, so manually disable the dialect in these cases.
      set(${var_name} FALSE)
    endif()

    if (standard EQUAL 20 AND
        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10) OR
         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10) OR
         (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 1930)))
      # Similar to the above, but for C++20.
      set(${var_name} FALSE)
    endif()

    if (${var_name})
      list(APPEND all_stds ${standard})
    endif()

    message(STATUS "Testing ${lang}${standard} Support: ${${var_name}}")
    # Propagate the per-standard TRUE/FALSE flag to the caller.
    set(${var_name} ${${var_name}} PARENT_SCOPE)
  endforeach()

  # Propagate the aggregate list of supported standards (e.g. "11;14;17").
  set(${prefix}_DETECTED_${lang}_STANDARDS "${all_stds}" PARENT_SCOPE)
endfunction()

View File

@@ -22,47 +22,15 @@ function(nvbench_add_cupti_dep dep_name)
add_library(nvbench::${dep_name_lower} SHARED IMPORTED)
if (WIN32)
# Attempt to locate the dll in the expected location. This is necessary
# because the CUPTI dll has a versioned suffix, so we can't directly search
# for it with find_file.
file(GLOB dep_dll_path "${nvbench_cupti_root}/lib64/${dep_name_lower}*dll")
cmake_path(GET dep_dll_path FILENAME dep_dll_filename)
find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED
DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit."
HINTS "${nvbench_cupti_root}/lib64"
)
mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)
# If the dll was not found in the expected location, use a default filename as a user hint.
if (NOT dep_dll_filename)
set(dep_dll_filename ${dep_name_lower}.dll)
endif()
# Use find_file to create a cache variable and mark the file as REQUIRED.
find_file(NVBench_${dep_name_upper}_DLL ${dep_dll_filename} REQUIRED
DOC "The full path to ${dep_name_lower}.dll from the CUDA Toolkit."
HINTS "${nvbench_cupti_root}/lib64/"
)
mark_as_advanced(NVBench_${dep_name_upper}_DLL)
# The .libs don't have suffixes, so we can just directly search for them.
find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower}.lib REQUIRED
DOC "The full path to ${dep_name_lower}.lib from the CUDA Toolkit."
HINTS "${nvbench_cupti_root}/lib64/"
)
mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)
set_target_properties(nvbench::${dep_name_lower} PROPERTIES
IMPORTED_LOCATION "${NVBench_${dep_name_upper}_DLL}"
IMPORTED_IMPLIB "${NVBench_${dep_name_upper}_LIBRARY}"
)
else()
find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED
DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit."
HINTS "${nvbench_cupti_root}/lib64"
)
mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)
set_target_properties(nvbench::${dep_name_lower} PROPERTIES
IMPORTED_LOCATION "${NVBench_${dep_name_upper}_LIBRARY}"
)
endif()
set_target_properties(nvbench::${dep_name_lower} PROPERTIES
IMPORTED_LOCATION "${NVBench_${dep_name_upper}_LIBRARY}"
)
endfunction()
nvbench_add_cupti_dep(nvperf_target)

View File

@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Tell cmake to generate a json file of compile commands for clangd:
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Symlink the compile command output to the source dir, where clangd will find it.
set(compile_commands_file "${CMAKE_BINARY_DIR}/compile_commands.json")
set(compile_commands_link "${CMAKE_SOURCE_DIR}/compile_commands.json")

message(STATUS "Creating symlink from ${compile_commands_link} to ${compile_commands_file}...")
# Remove any stale link, then touch the target so the symlink never dangles
# before the first configure finishes writing compile_commands.json.
# NOTE(review): presumably these are run through a non-fatal wrapper because
# symlink creation can fail (e.g. on Windows without privileges) — confirm.
nvbench_execute_non_fatal_process(COMMAND
  "${CMAKE_COMMAND}" -E rm -f "${compile_commands_link}")
nvbench_execute_non_fatal_process(COMMAND
  "${CMAKE_COMMAND}" -E touch "${compile_commands_file}")
nvbench_execute_non_fatal_process(COMMAND
  "${CMAKE_COMMAND}" -E create_symlink "${compile_commands_file}" "${compile_commands_link}")

View File

@@ -29,46 +29,37 @@ function(nvbench_add_cxx_flag target_name type flag)
target_compile_options(${target_name} ${type}
$<$<COMPILE_LANGUAGE:CXX>:${flag}>
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcompiler=${flag}>
# FIXME nvc++ case
)
endif()
endfunction()
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "/W4")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wall")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wextra")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wconversion")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Woverloaded-virtual")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wcast-qual")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wpointer-arith")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wunused-local-typedef")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wunused-parameter")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wvla")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wgnu")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wno-gnu-line-marker") # WAR 3916341
if (NVBench_ENABLE_WERROR)
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "/WX")
endif()
# Suppress overly-pedantic/unavoidable warnings brought in with /W4:
# C4505: unreferenced local function has been removed
# The CUDA `host_runtime.h` header emits this for
# `__cudaUnregisterBinaryUtil`.
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "/wd4505")
else()
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wall")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wextra")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wconversion")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Woverloaded-virtual")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wcast-qual")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wpointer-arith")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wunused-local-typedef")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wunused-parameter")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wvla")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wgnu")
if (NVBench_ENABLE_WERROR)
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Werror")
endif()
if (NVBench_ENABLE_WERROR)
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Werror")
endif()
# GCC-specific flags
if (CMAKE_CXX_COMPILER_ID STREQUAL GNU)
# Experimental filesystem library
if (CMAKE_CXX_COMPILER_ID STREQUAL GNU OR CMAKE_CXX_COMPILER_ID STREQUAL Clang)
target_link_libraries(nvbench.build_interface INTERFACE stdc++fs)
endif()
# CUDA-specific flags
if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
# fmtlib uses llvm's _BitInt internally, which is not available when compiling through nvcc:
target_compile_definitions(nvbench.build_interface INTERFACE "FMT_USE_BITINT=0")
endif()
target_compile_options(nvbench.build_interface INTERFACE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcudafe=--display_error_number>
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Wno-deprecated-gpu-targets>
@@ -85,6 +76,5 @@ function(nvbench_config_target target_name)
ARCHIVE_OUTPUT_DIRECTORY "${NVBench_LIBRARY_OUTPUT_DIR}"
LIBRARY_OUTPUT_DIRECTORY "${NVBench_LIBRARY_OUTPUT_DIR}"
RUNTIME_OUTPUT_DIRECTORY "${NVBench_EXECUTABLE_OUTPUT_DIR}"
WINDOWS_EXPORT_ALL_SYMBOLS ON # oooo pretty hammer...
)
endfunction()

View File

@@ -1,52 +1,61 @@
################################################################################
# fmtlib/fmt
rapids_cpm_find(fmt 7.1.3
set(export_set_details)
set(install_fmt OFF)
if(NOT BUILD_SHARED_LIBS AND NVBench_ENABLE_INSTALL_RULES)
set(export_set_details BUILD_EXPORT_SET nvbench-targets
INSTALL_EXPORT_SET nvbench-targets)
set(install_fmt ON)
endif()
rapids_cpm_find(fmt 11.1.4 ${export_set_details}
GLOBAL_TARGETS fmt::fmt fmt::fmt-header-only
CPM_ARGS
GITHUB_REPOSITORY fmtlib/fmt
GIT_TAG 7.1.3
GIT_SHALLOW TRUE
GIT_REPOSITORY "https://github.com/fmtlib/fmt.git"
GIT_TAG "11.1.4"
OPTIONS
# Force static to keep fmt internal.
"BUILD_SHARED_LIBS OFF"
# Suppress warnings from fmt headers by marking them as system.
"FMT_SYSTEM_HEADERS ON"
# Disable install rules since we're linking statically.
"FMT_INSTALL ${install_fmt}"
"CMAKE_POSITION_INDEPENDENT_CODE ON"
)
if(NOT fmt_ADDED)
set(fmt_is_external TRUE)
endif()
################################################################################
# nlohmann/json
#
# Following recipe from
# http://github.com/cpm-cmake/CPM.cmake/blob/master/examples/json/CMakeLists.txt
# Download the zips because the repo takes an excessively long time to clone.
rapids_cpm_find(nlohmann_json 3.9.1
# Release:
rapids_cpm_find(nlohmann_json 3.11.3
CPM_ARGS
URL https://github.com/nlohmann/json/releases/download/v3.9.1/include.zip
URL_HASH SHA256=6bea5877b1541d353bd77bdfbdb2696333ae5ed8f9e8cc22df657192218cad91
PATCH_COMMAND
# Work around compiler bug in nvcc 11.0, see NVIDIA/NVBench#18
${CMAKE_COMMAND} -E copy
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/nlohmann_json.hpp"
"./include/nlohmann/json.hpp"
# Development version:
# I'm waiting for https://github.com/nlohmann/json/issues/2676 to be fixed,
# leave this in to simplify testing patches as they come out. Update the
# `nvbench_json` target too when switching branches.
# CPM_ARGS
# VERSION develop
# URL https://github.com/nlohmann/json/archive/refs/heads/develop.zip
# OPTIONS JSON_MultipleHeaders ON
URL https://github.com/nlohmann/json/releases/download/v3.11.3/include.zip
URL_HASH SHA256=a22461d13119ac5c78f205d3df1db13403e58ce1bb1794edc9313677313f4a9d
PATCH_COMMAND
${CMAKE_COMMAND}
-D "CUDA_VERSION=${CMAKE_CUDA_COMPILER_VERSION}"
-D "CXX_VERSION=${CMAKE_CXX_COMPILER_VERSION}"
-D "CXX_ID=${CMAKE_CXX_COMPILER_ID}"
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/json_unordered_map_ice.cmake"
)
# nlohmann_json release headers
add_library(nvbench_json INTERFACE IMPORTED)
target_include_directories(nvbench_json SYSTEM INTERFACE
"${nlohmann_json_SOURCE_DIR}/include"
)
# nlohmann_json development branch:
#add_library(nvbench_json INTERFACE)
#target_link_libraries(nvbench_json INTERFACE nlohmann_json)
if (TARGET nlohmann_json::nlohmann_json)
# If we have a target, just use it. Cannot be an ALIAS library because
# nlohmann_json::nlohmann_json itself might be one.
target_link_libraries(nvbench_json INTERFACE nlohmann_json::nlohmann_json)
else()
# Otherwise we only downloaded the headers.
target_include_directories(nvbench_json SYSTEM INTERFACE
"${nlohmann_json_SOURCE_DIR}/include"
)
endif()
################################################################################
# CUDAToolkit

View File

@@ -1,38 +0,0 @@
# By default, add dependent DLLs to the build dir on MSVC. This avoids
# a variety of runtime issues when using NVML, etc.
# This behavior can be disabled using the following options:
if (WIN32)
option(NVBench_ADD_DEPENDENT_DLLS_TO_BUILD
"Copy dependent dlls to NVBench library build location (MSVC only)."
ON
)
else()
# These are forced off for non-MSVC builds, as $<TARGET_RUNTIME_DLLS:...>
# will always be empty on non-dll platforms.
set(NVBench_ADD_DEPENDENT_DLLS_TO_BUILD OFF)
endif()
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD)
message(STATUS
"CMake 3.21.0 is required when NVBench_ADD_DEPENDENT_DLLS_TO_BUILD "
"is enabled."
)
cmake_minimum_required(VERSION 3.21.0)
endif()
function(nvbench_setup_dep_dlls target_name)
# The custom command below fails when there aren't any runtime DLLs to copy,
# so only enable it when a relevant dependency is enabled:
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD AND
(NVBench_ENABLE_NVML OR
NVBench_ENABLE_CUPTI))
add_custom_command(TARGET ${target_name}
POST_BUILD
COMMAND
"${CMAKE_COMMAND}" -E copy
"$<TARGET_RUNTIME_DLLS:${target_name}>"
"$<TARGET_FILE_DIR:${target_name}>"
COMMAND_EXPAND_LISTS
)
endif()
endfunction()

View File

@@ -1,37 +1,51 @@
macro(nvbench_generate_exports)
set(nvbench_build_export_code_block "")
set(nvbench_install_export_code_block "")
if(NVBench_ENABLE_INSTALL_RULES)
set(nvbench_build_export_code_block "")
set(nvbench_install_export_code_block "")
if (NVBench_ENABLE_NVML)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake\")\n"
if (NVBench_ENABLE_NVML)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake\")\n"
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake\")\n"
)
endif()
if (NVBench_ENABLE_CUPTI)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake\")\n"
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake\")\n"
)
endif()
if (TARGET nvbench_json)
set(nvbench_json_code_block
[=[
add_library(nvbench_json INTERFACE IMPORTED)
if (TARGET nlohmann_json::nlohmann_json)
target_link_libraries(nvbench_json INTERFACE nlohmann_json::nlohmann_json)
endif()
]=])
string(APPEND nvbench_build_export_code_block ${nvbench_json_code_block})
string(APPEND nvbench_install_export_code_block ${nvbench_json_code_block})
endif()
rapids_export(BUILD NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main ctl internal_build_interface
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_build_export_code_block
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake\")\n"
rapids_export(INSTALL NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main ctl internal_build_interface
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_install_export_code_block
)
endif()
if (NVBench_ENABLE_CUPTI)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake\")\n"
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake\")\n"
)
endif()
rapids_export(BUILD NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main ctl internal_build_interface
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_build_export_code_block
)
rapids_export(INSTALL NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main ctl internal_build_interface
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_install_export_code_block
)
endmacro()

View File

@@ -0,0 +1,40 @@
# For every public header, build a translation unit containing `#include <header>`
# with some various checks.
set(excluded_headers_regexes
# Should never be used externally.
"^detail"
"^internal"
)
# Meta target for all configs' header builds:
add_custom_target(nvbench.headers.all)
add_dependencies(nvbench.all nvbench.headers.all)
file(GLOB_RECURSE header_files
RELATIVE "${NVBench_SOURCE_DIR}/nvbench/"
CONFIGURE_DEPENDS
"${NVBench_SOURCE_DIR}/nvbench/*.cuh"
)
foreach (exclusion IN LISTS excluded_headers_regexes)
list(FILTER header_files EXCLUDE REGEX "${exclusion}")
endforeach()
# Defines an OBJECT library `target_name` that compiles one generated .cu
# translation unit per entry in `header_files`; each unit simply includes its
# header, verifying that every public header is self-contained when built at
# the requested CUDA standard (`cuda_std`, e.g. 17 -> cuda_std_17).
# The new target is attached to the nvbench.headers.all meta target.
function(nvbench_add_header_target target_name cuda_std)
  set(generated_srcs)
  foreach(header IN LISTS header_files)
    # `header_str` is consumed by configure_file while expanding the template:
    set(header_str "nvbench/${header}")
    set(generated_src "headers/${target_name}/${header}.cu")
    configure_file("${NVBench_SOURCE_DIR}/cmake/header_test.in.cxx" "${generated_src}")
    list(APPEND generated_srcs "${generated_src}")
  endforeach()
  add_library(${target_name} OBJECT ${generated_srcs})
  target_link_libraries(${target_name} PUBLIC nvbench::nvbench)
  set_target_properties(${target_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std})
  add_dependencies(nvbench.headers.all ${target_name})
endfunction()
foreach (std IN LISTS NVBench_DETECTED_CUDA_STANDARDS)
nvbench_add_header_target(nvbench.headers.cpp${std} ${std})
endforeach()

View File

@@ -1,61 +1,69 @@
include(GNUInstallDirs)
rapids_cmake_install_lib_dir(NVBench_INSTALL_LIB_DIR)
# in-source public headers:
install(DIRECTORY "${NVBench_SOURCE_DIR}/nvbench"
TYPE INCLUDE
FILES_MATCHING
PATTERN "*.cuh"
PATTERN "internal" EXCLUDE
)
if(NVBench_ENABLE_INSTALL_RULES)
# generated headers from build dir:
install(
FILES
"${NVBench_BINARY_DIR}/nvbench/config.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench"
)
install(
FILES
"${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench/detail"
)
include(GNUInstallDirs)
rapids_cmake_install_lib_dir(NVBench_INSTALL_LIB_DIR)
#
# Install CMake files needed by consumers to locate dependencies:
#
# in-source public headers:
install(DIRECTORY "${NVBench_SOURCE_DIR}/nvbench"
TYPE INCLUDE
FILES_MATCHING
PATTERN "*.cuh"
PATTERN "internal" EXCLUDE
)
# Borrowing this logic from rapids_cmake's export logic to make sure these end
# up in the same location as nvbench-config.cmake:
rapids_cmake_install_lib_dir(config_install_location)
set(config_install_location "${config_install_location}/cmake/nvbench")
if (NVBench_ENABLE_NVML)
# generated headers from build dir:
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake"
DESTINATION "${config_install_location}"
"${NVBench_BINARY_DIR}/nvbench/config.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench"
)
endif()
if (NVBench_ENABLE_CUPTI)
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake"
DESTINATION "${config_install_location}"
"${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench/detail"
)
#
# Install CMake files needed by consumers to locate dependencies:
#
# Borrowing this logic from rapids_cmake's export logic to make sure these end
# up in the same location as nvbench-config.cmake:
rapids_cmake_install_lib_dir(config_install_location)
set(config_install_location "${config_install_location}/cmake/nvbench")
if (NVBench_ENABLE_NVML)
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake"
DESTINATION "${config_install_location}"
)
endif()
if (NVBench_ENABLE_CUPTI)
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake"
DESTINATION "${config_install_location}"
)
endif()
endif()
# Call with a list of library targets to generate install rules:
function(nvbench_install_libraries)
install(TARGETS ${ARGN}
DESTINATION "${NVBench_INSTALL_LIB_DIR}"
EXPORT nvbench-targets
)
if(NVBench_ENABLE_INSTALL_RULES)
install(TARGETS ${ARGN}
DESTINATION "${NVBench_INSTALL_LIB_DIR}"
EXPORT nvbench-targets
)
endif()
endfunction()
# Call with a list of executables to generate install rules:
function(nvbench_install_executables)
install(TARGETS ${ARGN} EXPORT nvbench-targets)
if(NVBench_ENABLE_INSTALL_RULES)
install(TARGETS ${ARGN} EXPORT nvbench-targets)
endif()
endfunction()

View File

@@ -1,37 +1,7 @@
# Since this file is installed, we need to make sure that the CUDAToolkit has
# been found by consumers:
if (NOT TARGET CUDA::toolkit)
find_package(CUDAToolkit REQUIRED)
endif()
if (WIN32)
# The CUDA:: targets currently don't provide dll locations through the
# `IMPORTED_LOCATION` property, nor are they marked as `SHARED` libraries
# (they're currently `UNKNOWN`). This prevents the `nvbench_setup_dep_dlls`
# CMake function from copying the dlls to the build / install directories.
# This is discussed in https://gitlab.kitware.com/cmake/cmake/-/issues/22845
# and the other CMake issues it links to.
#
# We create a nvbench-specific target that configures the nvml interface as
# described here:
# https://gitlab.kitware.com/cmake/cmake/-/issues/22845#note_1077538
#
# Use find_file instead of find_library, which would search for a .lib file.
# This is also nice because find_file searches recursively (find_library
# does not) and some versions of CTK nest nvml.dll several directories deep
# under C:\Windows\System32.
find_file(NVBench_NVML_DLL nvml.dll REQUIRED
DOC "The full path to nvml.dll. Usually somewhere under C:/Windows/System32."
PATHS "C:/Windows/System32"
)
mark_as_advanced(NVBench_NVML_DLL)
add_library(nvbench::nvml SHARED IMPORTED)
target_link_libraries(nvbench::nvml INTERFACE CUDA::toolkit)
set_target_properties(nvbench::nvml PROPERTIES
IMPORTED_LOCATION "${NVBench_NVML_DLL}"
IMPORTED_IMPLIB "${CUDA_nvml_LIBRARY}"
)
else()
# Linux is much easier...
add_library(nvbench::nvml ALIAS CUDA::nvml)
endif()
# Since this file is installed, we need to make sure that the CUDAToolkit has
# been found by consumers:
if (NOT TARGET CUDA::toolkit)
find_package(CUDAToolkit REQUIRED)
endif()
add_library(nvbench::nvml ALIAS CUDA::nvml)

View File

@@ -1,10 +1,12 @@
# Called before project(...)
macro(nvbench_load_rapids_cmake)
file(DOWNLOAD
https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake
"${CMAKE_BINARY_DIR}/RAPIDS.cmake"
)
include("${CMAKE_BINARY_DIR}/RAPIDS.cmake")
if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/NVBENCH_RAPIDS.cmake")
file(DOWNLOAD
https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-25.04/RAPIDS.cmake
"${CMAKE_CURRENT_BINARY_DIR}/NVBENCH_RAPIDS.cmake"
)
endif()
include("${CMAKE_CURRENT_BINARY_DIR}/NVBENCH_RAPIDS.cmake")
include(rapids-cmake)
include(rapids-cpm)
@@ -18,10 +20,9 @@ endmacro()
# Called after project(...)
macro(nvbench_init_rapids_cmake)
rapids_cmake_build_type(Release)
rapids_cmake_write_version_file("${NVBench_BINARY_DIR}/nvbench/detail/version.cuh")
rapids_cmake_write_git_revision_file(
nvbench_git_revision
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
rapids_cmake_write_version_file(
"${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
PREFIX "NVBENCH"
)
rapids_cpm_init()
endmacro()

View File

@@ -1,3 +1,48 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Passes all args directly to execute_process while setting up the following
# results variables and propagating them to the caller's scope:
#
# - nvbench_process_exit_code
# - nvbench_process_stdout
# - nvbench_process_stderr
#
# If the command is not successful (e.g. the last command does not return zero),
# a non-fatal warning is printed.
# Wrapper around execute_process() that never aborts configuration. All
# arguments are forwarded verbatim; exit code, stdout, and stderr are captured
# and propagated to the caller's scope (see the usage comment above).
function(nvbench_execute_non_fatal_process)
  # Forward all caller arguments directly; only the result-capture variables
  # are appended here.
  execute_process(${ARGN}
    RESULT_VARIABLE nvbench_process_exit_code
    OUTPUT_VARIABLE nvbench_process_stdout
    ERROR_VARIABLE nvbench_process_stderr
  )
  # A failing command is reported as a non-fatal warning; callers inspect
  # nvbench_process_exit_code to decide how to proceed.
  if (NOT nvbench_process_exit_code EQUAL 0)
    message(WARNING
      "execute_process failed with non-zero exit code: ${nvbench_process_exit_code}\n"
      "${ARGN}\n"
      "stdout:\n${nvbench_process_stdout}\n"
      "stderr:\n${nvbench_process_stderr}\n"
    )
  endif()
  # function() introduces a new scope, so explicitly export the results:
  set(nvbench_process_exit_code "${nvbench_process_exit_code}" PARENT_SCOPE)
  set(nvbench_process_stdout "${nvbench_process_stdout}" PARENT_SCOPE)
  set(nvbench_process_stderr "${nvbench_process_stderr}" PARENT_SCOPE)
endfunction()
# Writes CMAKE_CUDA_ARCHITECTURES to out_var, but using escaped semicolons
# as delimiters
function(nvbench_escaped_cuda_arches out_var)

View File

@@ -0,0 +1,127 @@
## This CMake script parses the output of ctest and prints a formatted list
## of individual test runtimes, sorted longest first.
##
## ctest > ctest_log
## cmake -DLOGFILE=ctest_log \
## -DMINSEC=10 \
## -P PrintCTestRunTimes.cmake
##
################################################################################
cmake_minimum_required(VERSION 3.15)
# Left-pads the value held by `string_var` with "0" characters until its
# length is at least `width`, writing the result back into the caller's scope.
# Values already at or beyond `width` characters are returned unchanged.
function(pad_string_with_zeros string_var width)
  set(padded "${${string_var}}")
  string(LENGTH "${padded}" current_length)
  math(EXPR deficit "${width} - ${current_length}")
  # A non-positive deficit means no padding is required.
  if (deficit GREATER 0)
    string(REPEAT "0" ${deficit} zero_fill)
    string(PREPEND padded "${zero_fill}")
  endif()
  set(${string_var} "${padded}" PARENT_SCOPE)
endfunction()
################################################################################
# LOGFILE must be supplied on the command line via -DLOGFILE=<path>.
if (NOT LOGFILE)
  message(FATAL_ERROR "Missing -DLOGFILE=<ctest output> argument.")
endif()

# Tests faster than MINSEC seconds are tallied but not listed individually.
if (NOT DEFINED MINSEC)
  set(MINSEC 10)
endif()

# Number of tests that ran faster than MINSEC:
set(num_below_thresh 0)

# Check if logfile exists
if (NOT EXISTS "${LOGFILE}")
  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
endif()

# Regex matching one ctest status line; capture groups:
# 1 = test id, 2 = test name, 3 = result, 4 = whole seconds of runtime.
string(JOIN "" regex
  "[0-9]+/[0-9]+[ ]+Test[ ]+#"
  "([0-9]+)" # Test ID
  ":[ ]+"
  "([^ ]+)" # Test Name
  "[ ]*\\.+[ ]*\\**[ ]*"
  "([^ ]+)" # Result
  "[ ]+"
  "([0-9]+)" # Seconds
  "\\.[0-9]+[ ]+sec"
)

message(DEBUG "LOGFILE: ${LOGFILE}")
message(DEBUG "MINSEC: ${MINSEC}")
message(DEBUG "regex: ${regex}")

# Read the logfile and generate a map / keylist
set(keys)
file(STRINGS "${LOGFILE}" lines)
foreach(line ${lines})
  # Parse each build time
  string(REGEX MATCH "${regex}" _DUMMY "${line}")
  if (CMAKE_MATCH_COUNT EQUAL 4)
    set(test_id "${CMAKE_MATCH_1}")
    set(test_name "${CMAKE_MATCH_2}")
    set(test_result "${CMAKE_MATCH_3}")
    set(tmp "${CMAKE_MATCH_4}") # floor(runtime_seconds)

    # Count, but do not list, tests below the reporting threshold:
    if (tmp LESS MINSEC)
      math(EXPR num_below_thresh "${num_below_thresh} + 1")
      continue()
    endif()

    # Compute human readable time
    math(EXPR days "${tmp} / (60 * 60 * 24)")
    math(EXPR tmp "${tmp} - (${days} * 60 * 60 * 24)")
    math(EXPR hours "${tmp} / (60 * 60)")
    math(EXPR tmp "${tmp} - (${hours} * 60 * 60)")
    math(EXPR minutes "${tmp} / (60)")
    math(EXPR tmp "${tmp} - (${minutes} * 60)")
    math(EXPR seconds "${tmp}")

    # Format time components
    pad_string_with_zeros(days 3)
    pad_string_with_zeros(hours 2)
    pad_string_with_zeros(minutes 2)
    pad_string_with_zeros(seconds 2)

    # Construct table entry
    # Later values in the file for the same command overwrite earlier entries
    string(MAKE_C_IDENTIFIER "${test_id}" key)
    string(JOIN " | " ENTRY_${key}
      "${days}d ${hours}h ${minutes}m ${seconds}s"
      "${test_result}"
      "${test_id}: ${test_name}"
    )

    # Record the key:
    list(APPEND keys "${key}")
  endif()
endforeach()

list(REMOVE_DUPLICATES keys)

# Build the entry list:
set(entries)
foreach(key ${keys})
  list(APPEND entries "${ENTRY_${key}}")
endforeach()

# Not fatal (unlike PrintNinjaBuildTimes.cmake): a log with no tests above
# the threshold is still a valid result.
if (NOT entries)
  message(STATUS "LOGFILE contained no test times ('${LOGFILE}').")
endif()

# Sort in descending order:
list(SORT entries ORDER DESCENDING)

# Dump table:
foreach(entry ${entries})
  message(STATUS ${entry})
endforeach()

if (num_below_thresh GREATER 0)
  message(STATUS "${num_below_thresh} additional tests took < ${MINSEC}s each.")
endif()

View File

@@ -0,0 +1,101 @@
## This CMake script parses a .ninja_log file (LOGFILE) and prints a list of
## build/link times, sorted longest first.
##
## cmake -DLOGFILE=<.ninja_log file> \
## -P PrintNinjaBuildTimes.cmake
##
## If LOGFILE is omitted, the current directory's .ninja_log file is used.
################################################################################
cmake_minimum_required(VERSION 3.15)
# Left-pads the value held by `string_var` with "0" characters until its
# length is at least `width`, writing the result back into the caller's scope.
# Values already at or beyond `width` characters are returned unchanged.
function(pad_string_with_zeros string_var width)
  set(padded "${${string_var}}")
  string(LENGTH "${padded}" current_length)
  math(EXPR deficit "${width} - ${current_length}")
  # A non-positive deficit means no padding is required.
  if (deficit GREATER 0)
    string(REPEAT "0" ${deficit} zero_fill)
    string(PREPEND padded "${zero_fill}")
  endif()
  set(${string_var} "${padded}" PARENT_SCOPE)
endfunction()
################################################################################
# Default to the .ninja_log in the current directory when not specified.
if (NOT LOGFILE)
  set(LOGFILE ".ninja_log")
endif()

# Check if logfile exists
if (NOT EXISTS "${LOGFILE}")
  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
endif()

# Read the logfile and generate a map / keylist
set(keys)
file(STRINGS "${LOGFILE}" lines)
foreach(line ${lines})
  # Parse each build time. Matched .ninja_log lines are tab-separated:
  # start_ms, end_ms, a numeric field (ignored), the output path, a hex hash.
  string(REGEX MATCH
    "^([0-9]+)\t([0-9]+)\t[0-9]+\t([^\t]+)+\t[0-9a-fA-F]+$" _DUMMY "${line}")
  if (CMAKE_MATCH_COUNT EQUAL 3)
    set(start_ms ${CMAKE_MATCH_1})
    set(end_ms ${CMAKE_MATCH_2})
    set(command "${CMAKE_MATCH_3}")
    math(EXPR runtime_ms "${end_ms} - ${start_ms}")

    # Compute human readable time
    math(EXPR days "${runtime_ms} / (1000 * 60 * 60 * 24)")
    math(EXPR runtime_ms "${runtime_ms} - (${days} * 1000 * 60 * 60 * 24)")
    math(EXPR hours "${runtime_ms} / (1000 * 60 * 60)")
    math(EXPR runtime_ms "${runtime_ms} - (${hours} * 1000 * 60 * 60)")
    math(EXPR minutes "${runtime_ms} / (1000 * 60)")
    math(EXPR runtime_ms "${runtime_ms} - (${minutes} * 1000 * 60)")
    math(EXPR seconds "${runtime_ms} / 1000")
    math(EXPR milliseconds "${runtime_ms} - (${seconds} * 1000)")

    # Format time components
    pad_string_with_zeros(days 3)
    pad_string_with_zeros(hours 2)
    pad_string_with_zeros(minutes 2)
    pad_string_with_zeros(seconds 2)
    pad_string_with_zeros(milliseconds 3)

    # Construct table entry
    # Later values in the file for the same command overwrite earlier entries
    string(MAKE_C_IDENTIFIER "${command}" key)
    set(ENTRY_${key}
      "${days}d ${hours}h ${minutes}m ${seconds}s ${milliseconds}ms | ${command}"
    )

    # Record the key:
    list(APPEND keys "${key}")
  endif()
endforeach()

list(REMOVE_DUPLICATES keys)

# Build the entry list:
set(entries)
foreach(key ${keys})
  list(APPEND entries "${ENTRY_${key}}")
endforeach()

if (NOT entries)
  message(FATAL_ERROR "LOGFILE contained no build entries ('${LOGFILE}').")
endif()
# Sort in descending order so the longest build steps are printed first.
# A single SORT with ORDER DESCENDING (CMake >= 3.13; this script already
# requires 3.15) replaces the previous SORT + REVERSE pair and matches the
# sibling script PrintCTestRunTimes.cmake.
list(SORT entries ORDER DESCENDING)

# Dump table:
message(STATUS "-----------------------+----------------------------")
message(STATUS "Time | Command ")
message(STATUS "-----------------------+----------------------------")
foreach(entry ${entries})
  message(STATUS ${entry})
endforeach()

45
cmake/header_test.in.cxx Normal file
View File

@@ -0,0 +1,45 @@
// This source file checks that:
// 1) Header <${header_str}> compiles without error.
// 2) Common macro collisions with platform/system headers are avoided.
// NOTE: This is a configure_file() template -- CMake substitutes
// ${header_str} with the path of the header under test before compiling.

// Turn off failures for certain configurations:
#ifndef NVBench_IGNORE_MACRO_CHECKS

// Define NVBench_MACRO_CHECK(macro, header), which emits a diagnostic indicating
// a potential macro collision and halts.
//
// Hacky way to build a string, but it works on all tested platforms.
#define NVBench_MACRO_CHECK(MACRO, HEADER) \
  NVBench_MACRO_CHECK_IMPL( \
    Identifier MACRO should not be used from NVBench headers due to conflicts with HEADER macros.)

// Use raw platform checks instead of the NVBench_HOST_COMPILER macros since we
// don't want to #include any headers other than the one being tested.
//
// This is only implemented for GCC/Clang.
#if defined(__clang__) || defined(__GNUC__)

// GCC/clang are easy:
#define NVBench_MACRO_CHECK_IMPL(msg) NVBench_MACRO_CHECK_IMPL0(GCC error #msg)
#define NVBench_MACRO_CHECK_IMPL0(expr) _Pragma(#expr)

#endif // defined(__clang__) || defined(__GNUC__)

// complex.h conflicts
#define I NVBench_MACRO_CHECK('I', complex.h)

// windows.h conflicts
#define small NVBench_MACRO_CHECK('small', windows.h)

// We can't enable these checks without breaking some builds -- some standard
// library implementations unconditionally `#undef` these macros, which then
// causes random failures later.
// Leaving these commented out as a warning: Here be dragons.
// #define min(...) NVBench_MACRO_CHECK('min', windows.h)
// #define max(...) NVBench_MACRO_CHECK('max', windows.h)

// termios.h conflicts (NVIDIA/thrust#1547)
#define B0 NVBench_MACRO_CHECK("B0", termios.h)

#endif // NVBench_IGNORE_MACRO_CHECKS

#include <${header_str}>

View File

@@ -0,0 +1,22 @@
# NVCC 11.1 and GCC 9 need a patch to build, otherwise:
#
# nlohmann/ordered_map.hpp(29): error #3316:
# Internal Compiler Error (codegen): "internal error during structure layout!"
#
# Usage:
# ${CMAKE_COMMAND}
# -D "CUDA_VERSION=${CMAKE_CUDA_COMPILER_VERSION}"
# -D "CXX_VERSION=${CMAKE_CXX_COMPILER_VERSION}"
# -D "CXX_ID=${CMAKE_CXX_COMPILER_ID}"
# -P "json_unordered_map_ice.cmake"

# Only patch the affected toolchains: CUDA <= 11.8 with GNU >= 9.0.
if(CUDA_VERSION VERSION_GREATER 11.8 OR NOT CXX_ID STREQUAL "GNU" OR CXX_VERSION VERSION_LESS 9.0)
  return()
endif()

# Read the file and replace the string "JSON_NO_UNIQUE_ADDRESS" with
# "/* JSON_NO_UNIQUE_ADDRESS */".
# NOTE(review): the path below is relative to the working directory --
# presumably this script runs from the nlohmann_json source root; confirm
# against the invoking patch step.
file(READ "include/nlohmann/ordered_map.hpp" NLOHMANN_ORDERED_MAP_HPP)
string(REPLACE "JSON_NO_UNIQUE_ADDRESS" "/* [NVBench Patch] JSON_NO_UNIQUE_ADDRESS */"
  NLOHMANN_ORDERED_MAP_HPP "${NLOHMANN_ORDERED_MAP_HPP}")
file(WRITE "include/nlohmann/ordered_map.hpp" "${NLOHMANN_ORDERED_MAP_HPP}")

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++:
```cpp
void my_benchmark(nvbench::state& state) {
state.exec([](nvbench::launch& launch) {
state.exec([](nvbench::launch& launch) {
my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
});
}
@@ -97,7 +97,7 @@ void benchmark(nvbench::state& state)
const auto num_inputs = state.get_int64("NumInputs");
thrust::device_vector<int> data = generate_input(num_inputs);
state.exec([&data](nvbench::launch& launch) {
state.exec([&data](nvbench::launch& launch) {
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
});
}
@@ -134,7 +134,7 @@ void benchmark(nvbench::state& state)
const auto quality = state.get_float64("Quality");
state.exec([&quality](nvbench::launch& launch)
{
{
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(quality);
});
}
@@ -153,7 +153,7 @@ void benchmark(nvbench::state& state)
thrust::device_vector<int> data = generate_input(rng_dist);
state.exec([&data](nvbench::launch& launch)
{
{
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
});
}
@@ -182,13 +182,13 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T>)
thrust::device_vector<T> data = generate_input<T>();
state.exec([&data](nvbench::launch& launch)
{
{
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
});
}
using my_types = nvbench::type_list<int, float, double>;
NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types))
.set_type_axis_names({"ValueType"});
.set_type_axes_names({"ValueType"});
```
The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in
@@ -293,7 +293,6 @@ In general::
More examples can found in [examples/throughput.cu](../examples/throughput.cu).
# Skip Uninteresting / Invalid Benchmarks
Sometimes particular combinations of parameters aren't useful or interesting —
@@ -321,7 +320,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T, U>)
// Skip benchmarks at compile time -- for example, always skip when T == U
// (Note that the `type_list` argument defines the same type twice).
template <typename SameType>
void my_benchmark(nvbench::state& state,
void my_benchmark(nvbench::state& state,
nvbench::type_list<SameType, SameType>)
{
state.skip("T must not be the same type as U.");
@@ -347,6 +346,15 @@ true:
synchronize internally.
- `nvbench::exec_tag::timer` requests a timer object that can be used to
restrict the timed region.
- `nvbench::exec_tag::no_batch` disables batch measurements. This both disables
them during execution to reduce runtime, and prevents their compilation to
reduce compile-time and binary size.
- `nvbench::exec_tag::gpu` is an optional hint that prevents non-GPU benchmarking
code from being compiled for a particular benchmark. A runtime error is emitted
if the benchmark is defined with `set_is_cpu_only(true)`.
- `nvbench::exec_tag::no_gpu` is an optional hint that prevents GPU benchmarking
code from being compiled for a particular benchmark. A runtime error is emitted
if the benchmark does not also define `set_is_cpu_only(true)`.
Multiple execution tags may be combined using `operator|`, e.g.
@@ -397,7 +405,7 @@ Note that using manual timer mode disables batch measurements.
void timer_example(nvbench::state& state)
{
// Pass the `timer` exec tag to request a timer:
state.exec(nvbench::exec_tag::timer,
state.exec(nvbench::exec_tag::timer,
// Lambda now accepts a timer:
[](nvbench::launch& launch, auto& timer)
{
@@ -418,6 +426,79 @@ NVBENCH_BENCH(timer_example);
See [examples/exec_tag_timer.cu](../examples/exec_tag_timer.cu) for a complete
example.
## Compilation hints: `nvbench::exec_tag::no_batch`, `gpu`, and `no_gpu`
These execution tags are optional hints that disable the compilation of various
code paths when they are not needed. They apply only to a single benchmark.
- `nvbench::exec_tag::no_batch` prevents the execution and instantiation of the batch measurement backend.
- `nvbench::exec_tag::gpu` prevents the instantiation of CPU-only benchmarking backends.
- Requires that the benchmark does not define `set_is_cpu_only(true)`.
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
- Host-side CPU measurements of GPU kernel execution time are still provided.
- `nvbench::exec_tag::no_gpu` prevents the instantiation of GPU benchmarking backends.
- Requires that the benchmark defines `set_is_cpu_only(true)`.
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
- See also [CPU-only Benchmarks](#cpu-only-benchmarks).
# CPU-only Benchmarks
NVBench provides CPU-only benchmarking facilities that are intended for measuring
significant CPU workloads. We do not recommend using these features for high-resolution
CPU benchmarking -- other libraries (such as Google Benchmark) are more appropriate for
such applications. Examples are provided in [examples/cpu_only.cu](../examples/cpu_only.cu).
Note that NVBench still requires a CUDA compiler and runtime even if a project only contains
CPU-only benchmarks.
The `is_cpu_only` property of the benchmark toggles between GPU and CPU-only measurements:
```cpp
void my_cpu_benchmark(nvbench::state &state)
{
state.exec([](nvbench::launch &) { /* workload */ });
}
NVBENCH_BENCH(my_cpu_benchmark)
.set_is_cpu_only(true); // Mark as CPU-only.
```
The optional `nvbench::exec_tag::no_gpu` hint may be used to reduce the compilation time and
binary size of CPU-only benchmarks. An error is emitted at runtime if this tag is used while
`is_cpu_only` is false.
```cpp
void my_cpu_benchmark(nvbench::state &state)
{
state.exec(nvbench::exec_tag::no_gpu, // Prevent compilation of GPU backends
[](nvbench::launch &) { /* workload */ });
}
NVBENCH_BENCH(my_cpu_benchmark)
.set_is_cpu_only(true); // Mark as CPU-only.
```
The `nvbench::exec_tag::timer` execution tag is also supported by CPU-only benchmarks. This
is useful for benchmarks that require additional per-sample setup/teardown. See the
[`nvbench::exec_tag::timer`](#explicit-timer-mode-nvbenchexec_tagtimer) section for more
details.
```cpp
void my_cpu_benchmark(nvbench::state &state)
{
state.exec(nvbench::exec_tag::no_gpu | // Prevent compilation of GPU backends
nvbench::exec_tag::timer, // Request a timer object
[](nvbench::launch &, auto &timer)
{
// Setup here
timer.start();
// timed workload
timer.stop();
// teardown here
});
}
NVBENCH_BENCH(my_cpu_benchmark)
.set_is_cpu_only(true); // Mark as CPU-only.
```
# Beware: Combinatorial Explosion Is Lurking
Be very careful of how quickly the configuration space can grow. The following
@@ -430,7 +511,7 @@ using value_types = nvbench::type_list<nvbench::uint8_t,
nvbench::int32_t,
nvbench::float32_t,
nvbench::float64_t>;
using op_types = nvbench::type_list<thrust::plus<>,
using op_types = nvbench::type_list<thrust::plus<>,
thrust::multiplies<>,
thrust::maximum<>>;
@@ -445,7 +526,7 @@ NVBENCH_BENCH_TYPES(my_benchmark,
```
960 total configs
= 4 [T=(U8, I32, F32, F64)]
= 4 [T=(U8, I32, F32, F64)]
* 4 [U=(U8, I32, F32, F64)]
* 4 [V=(U8, I32, F32, F64)]
* 3 [Op=(plus, multiplies, max)]
@@ -453,9 +534,10 @@ NVBENCH_BENCH_TYPES(my_benchmark,
```
For large configuration spaces like this, pruning some of the less useful
combinations using the techniques described in the [Zipped/Tied Iteration of Value Axes](#zipped-iteration-of-value-axes)
or [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting--invalid-benchmarks) section can help immensely with
keeping compile / run times manageable.
combinations using the techniques described in the
[Zipped/Tied Iteration of Value Axes](#zipped-iteration-of-value-axes)
or [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting--invalid-benchmarks)
sections can help immensely with keeping compile / run times manageable.
Splitting a single large configuration space into multiple, more focused
benchmarks with reduced dimensionality will likely be worth the effort as well.

View File

@@ -83,28 +83,6 @@
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--min-samples <count>`
* Gather at least `<count>` samples per measurement.
* Default is 10 samples.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--min-time <seconds>`
* Accumulate at least `<seconds>` of execution time per measurement.
* Default is 0.5 seconds.
* If both GPU and CPU times are gathered, this applies to GPU time only.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--max-noise <value>`
* Gather samples until the error in the measurement drops below `<value>`.
* Noise is specified as the percent relative standard deviation.
* Default is 0.5% (`--max-noise 0.5`)
* Only applies to Cold measurements.
* If both GPU and CPU times are gathered, this applies to GPU noise only.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--skip-time <seconds>`
* Skip a measurement when a warmup run executes in less than `<seconds>`.
* Default is -1 seconds (disabled).
@@ -115,6 +93,42 @@
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--throttle-threshold <value>`
* Set the GPU throttle threshold as percentage of the device's default clock rate.
* Default is 75.
* Set to 0 to disable throttle detection entirely.
* Note that throttling is disabled when `nvbench::exec_tag::sync` is used.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--throttle-recovery-delay <value>`
* Set the GPU throttle recovery delay in seconds.
* Default is 0.05 seconds.
* Note that throttling is disabled when `nvbench::exec_tag::sync` is used.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--run-once`
* Only run the benchmark once, skipping any warmup runs and batched
measurements.
* Intended for use with external profiling tools.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--disable-blocking-kernel`
* Don't use the `blocking_kernel`.
* Intended for use with external profiling tools.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--profile`
* Implies `--run-once` and `--disable-blocking-kernel`.
* Intended for use with external profiling tools.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
## Stopping Criteria
* `--timeout <seconds>`
* Measurements will timeout after `<seconds>` have elapsed.
* Default is 15 seconds.
@@ -125,9 +139,55 @@
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--run-once`
* Only run the benchmark once, skipping any warmup runs and batched
measurements.
* Intended for use with external profiling tools.
* `--min-samples <count>`
* Gather at least `<count>` samples per measurement before checking any
other stopping criterion besides the timeout.
* Default is 10 samples.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--stopping-criterion <criterion>`
* After `--min-samples` is satisfied, use `<criterion>` to detect if enough
samples were collected.
* Only applies to Cold and CPU-only measurements.
* If both GPU and CPU times are gathered, GPU time is used for stopping
analysis.
* Stopping criteria provided by NVBench are:
* "stdrel": (default) Converges to a minimal relative standard deviation,
stdev / mean
* "entropy": Converges based on the cumulative entropy of all samples.
* Each stopping criterion may provide additional parameters to customize
behavior, as detailed below:
### "stdrel" Stopping Criterion Parameters
* `--min-time <seconds>`
* Accumulate at least `<seconds>` of execution time per measurement.
* Only applies to `stdrel` stopping criterion.
* Default is 0.5 seconds.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--max-noise <value>`
* Gather samples until the error in the measurement drops below `<value>`.
* Noise is specified as the percent relative standard deviation (stdev/mean).
* Default is 0.5% (`--max-noise 0.5`).
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
### "entropy" Stopping Criterion Parameters
* `--max-angle <value>`
* Maximum slope angle of the linear regression fitted to the cumulative entropy.
* Smaller values give more accurate results.
* Default is 0.048.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--min-r2 <value>`
* Minimum coefficient of determination for linear regression of cumulative
entropy.
* Larger values give more accurate results.
* Default is 0.36.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.

View File

@@ -1,12 +1,15 @@
set(example_srcs
auto_throughput.cu
axes.cu
custom_criterion.cu
cpu_only.cu
enums.cu
exec_tag_sync.cu
exec_tag_timer.cu
skip.cu
stream.cu
summaries.cu
throughput.cu
auto_throughput.cu
custom_iteration_spaces.cu
)
@@ -14,39 +17,39 @@ set(example_srcs
add_custom_target(nvbench.example.all)
add_dependencies(nvbench.all nvbench.example.all)
foreach(example_src IN LISTS example_srcs)
get_filename_component(example_name "${example_src}" NAME_WLE)
string(PREPEND example_name "nvbench.example.")
add_executable(${example_name} "${example_src}")
nvbench_config_target(${example_name})
target_include_directories(${example_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}")
target_link_libraries(${example_name} PRIVATE nvbench::main)
set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_17)
add_test(NAME ${example_name}
COMMAND "$<TARGET_FILE:${example_name}>" --timeout 0.1 --min-time 1e-5
)
function (nvbench_add_examples_target target_prefix cuda_std)
add_custom_target(${target_prefix}.all)
add_dependencies(nvbench.example.all ${target_prefix}.all)
add_dependencies(nvbench.example.all ${example_name})
endforeach()
foreach(example_src IN LISTS example_srcs)
get_filename_component(example_name "${example_src}" NAME_WLE)
string(PREPEND example_name "${target_prefix}.")
add_executable(${example_name} "${example_src}")
nvbench_config_target(${example_name})
target_include_directories(${example_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}")
target_link_libraries(${example_name} PRIVATE nvbench::main)
set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std})
# Silence some warnings from old thrust headers:
set(thrust_examples
auto_throughput
axes
exec_tag_sync
exec_tag_timer
skip
throughput
)
foreach (example IN LISTS thrust_examples)
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
# C4324: structure was padded due to alignment specifier
nvbench_add_cxx_flag(nvbench.example.${example} PRIVATE "/wd4324")
# warning C4201: nonstandard extension used: nameless struct/union:
# Fixed in Thrust 1.12.0 (CTK 11.4, NV HPC 21.3)
if (${CUDAToolkit_VERSION} VERSION_LESS 11.4)
nvbench_add_cxx_flag(nvbench.example.${example} PRIVATE "/wd4201")
set(example_args --timeout 0.1)
# The custom_criterion example doesn't support the --min-time argument:
if (NOT "${example_src}" STREQUAL "custom_criterion.cu")
list(APPEND example_args --min-time 1e-5)
endif()
endif()
add_test(NAME ${example_name}
COMMAND "$<TARGET_FILE:${example_name}>" ${example_args})
# These should not deadlock. If they do, it may be that the CUDA context was created before
# setting CUDA_MODULE_LOAD=EAGER in main, see NVIDIA/nvbench#136.
set_tests_properties(${example_name} PROPERTIES
FAIL_REGULAR_EXPRESSION "Possible Deadlock Detected"
)
add_dependencies(${target_prefix}.all ${example_name})
endforeach()
endfunction()
foreach (std IN LISTS NVBench_DETECTED_CUDA_STANDARDS)
nvbench_add_examples_target(nvbench.example.cpp${std} ${std})
endforeach()

View File

@@ -24,37 +24,33 @@
template <int ItemsPerThread>
__global__ void kernel(std::size_t stride,
std::size_t elements,
const nvbench::int32_t * __restrict__ in,
const nvbench::int32_t *__restrict__ in,
nvbench::int32_t *__restrict__ out)
{
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t step = gridDim.x * blockDim.x;
for (std::size_t i = stride * tid;
i < stride * elements;
i += stride * step)
for (std::size_t i = stride * tid; i < stride * elements; i += stride * step)
{
for (int j = 0; j < ItemsPerThread; j++)
{
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto write_id = tid + j * elements;
out[write_id] = in[read_id];
out[write_id] = in[read_id];
}
}
}
// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
// and cache hit rates.
//
// Calling state.collect_*() enables particular metric collection if nvbench
// was build with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
template <int ItemsPerThread>
void throughput_bench(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
{
// Allocate input data:
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
thrust::device_vector<nvbench::int32_t> input(elements);
thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
@@ -72,12 +68,11 @@ void throughput_bench(nvbench::state &state,
static_cast<int>((elements + threads_in_block - 1) / threads_in_block);
state.exec([&](nvbench::launch &launch) {
kernel<ItemsPerThread>
<<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
});
}

View File

@@ -56,8 +56,8 @@ NVBENCH_BENCH(single_float64_axis)
void copy_sweep_grid_shape(nvbench::state &state)
{
// Get current parameters:
const int block_size = static_cast<int>(state.get_int64("BlockSize"));
const int num_blocks = static_cast<int>(state.get_int64("NumBlocks"));
const auto block_size = static_cast<unsigned int>(state.get_int64("BlockSize"));
const auto num_blocks = static_cast<unsigned int>(state.get_int64("NumBlocks"));
// Number of int32s in 256 MiB:
const std::size_t num_values = 256 * 1024 * 1024 / sizeof(nvbench::int32_t);
@@ -71,17 +71,16 @@ void copy_sweep_grid_shape(nvbench::state &state)
thrust::device_vector<nvbench::int32_t> in(num_values, 0);
thrust::device_vector<nvbench::int32_t> out(num_values, 0);
state.exec(
[block_size,
num_blocks,
num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
in_ptr,
out_ptr,
num_values);
});
state.exec([block_size,
num_blocks,
num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
}
NVBENCH_BENCH(copy_sweep_grid_shape)
// Every second power of two from 64->1024:
@@ -106,14 +105,12 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
thrust::device_vector<ValueType> in(num_values, 0);
thrust::device_vector<ValueType> out(num_values, 0);
state.exec(
[num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
state.exec([num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
});
}
// Define a type_list to use for the type axis:
using cts_types = nvbench::type_list<nvbench::uint8_t,
@@ -129,11 +126,10 @@ NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
// Convert 64 MiB of InputTypes to OutputTypes, represented with various
// value_types.
template <typename InputType, typename OutputType>
void copy_type_conversion_sweep(nvbench::state &state,
nvbench::type_list<InputType, OutputType>)
void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// Optional: Skip narrowing conversions.
if (sizeof(InputType) > sizeof(OutputType))
if constexpr (sizeof(InputType) > sizeof(OutputType))
{
state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
return;
@@ -152,14 +148,12 @@ void copy_type_conversion_sweep(nvbench::state &state,
thrust::device_vector<InputType> in(num_values, 0);
thrust::device_vector<OutputType> out(num_values, 0);
state.exec(
[num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
state.exec([num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
});
}
// Optional: Skip when InputType == OutputType. This approach avoids
// instantiating the benchmark at all.
@@ -175,6 +169,5 @@ using ctcs_types = nvbench::type_list<nvbench::int8_t,
nvbench::float32_t,
nvbench::int64_t,
nvbench::float64_t>;
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
.set_type_axes_names({"In", "Out"});

83
examples/cpu_only.cu Normal file
View File

@@ -0,0 +1,83 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
#include <chrono>
#include <thread>
// Block execution of the current CPU thread for `seconds` seconds.
void sleep_host(double seconds)
{
std::this_thread::sleep_for(
std::chrono::milliseconds(static_cast<nvbench::int64_t>(seconds * 1000)));
}
//=============================================================================
// Minimal CPU-only benchmark: each iteration just sleeps on the host for the
// duration selected by the "Duration" axis.
void simple(nvbench::state &state)
{
  const double duration = state.get_float64("Duration");
  auto do_sleep = [duration](nvbench::launch &) { sleep_host(duration); };
  state.exec(do_sleep);
}
NVBENCH_BENCH(simple)
// 100 -> 500 ms in 100 ms increments.
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
// Mark as CPU-only.
.set_is_cpu_only(true);
//=============================================================================
// CPU-only benchmark that restricts timing to an explicit region using the
// `nvbench::exec_tag::timer` execution tag.
void simple_timer(nvbench::state &state)
{
  const double duration = state.get_float64("Duration");
  auto timed_run = [duration](nvbench::launch &, auto &timer) {
    // Per-run setup goes here, outside of the measured region...
    timer.start();
    // Only this region is measured:
    sleep_host(duration);
    timer.stop();
    // Per-run cleanup goes here, also unmeasured...
  };
  state.exec(nvbench::exec_tag::timer, timed_run);
}
NVBENCH_BENCH(simple_timer)
// 100 -> 500 ms in 100 ms increments.
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
// Mark as CPU-only.
.set_is_cpu_only(true);
//=============================================================================
// CPU-only benchmark using the optional `nvbench::exec_tag::no_gpu` hint,
// which keeps GPU measurement code from being instantiated at all. Note that
// `set_is_cpu_only(true)` is still required even with this hint.
void simple_no_gpu(nvbench::state &state)
{
  const double duration = state.get_float64("Duration");
  auto do_sleep = [duration](nvbench::launch &) { sleep_host(duration); };
  state.exec(nvbench::exec_tag::no_gpu, do_sleep);
}
NVBENCH_BENCH(simple_no_gpu)
// 100 -> 500 ms in 100 ms increments.
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
// Mark as CPU-only.
.set_is_cpu_only(true);

View File

@@ -0,0 +1,77 @@
/*
* Copyright 2023 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// Inherit from the stopping_criterion_base class:
class fixed_criterion final : public nvbench::stopping_criterion_base
{
nvbench::int64_t m_num_samples{};
public:
fixed_criterion()
: nvbench::stopping_criterion_base{"fixed", {{"max-samples", nvbench::int64_t{42}}}}
{}
protected:
// Setup the criterion in the `do_initialize()` method:
virtual void do_initialize() override { m_num_samples = 0; }
// Process new measurements in the `add_measurement()` method:
virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
{
m_num_samples++;
}
// Check if the stopping criterion is met in the `is_finished()` method:
virtual bool do_is_finished() override
{
return m_num_samples >= m_params.get_int64("max-samples");
}
};
// Register the criterion with NVBench:
NVBENCH_REGISTER_CRITERION(fixed_criterion);
// Copies 64 MiB of int32_t data and reports element/byte throughput, stopping
// via the custom "fixed" criterion registered above.
void throughput_bench(nvbench::state &state)
{
  // 64 MiB worth of int32 values for both input and output:
  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> input(num_values);
  thrust::device_vector<nvbench::int32_t> output(num_values);

  // Throughput info for the summary tables:
  state.add_element_count(num_values, "NumElements");
  state.add_global_memory_reads<nvbench::int32_t>(num_values, "DataSize");
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  // The vectors outlive state.exec, so the raw pointers stay valid:
  auto *in_ptr  = thrust::raw_pointer_cast(input.data());
  auto *out_ptr = thrust::raw_pointer_cast(output.data());
  state.exec(nvbench::exec_tag::no_batch, [in_ptr, out_ptr, num_values](nvbench::launch &launch) {
    (void)num_values; // clang thinks this is unused...
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
  });
}
NVBENCH_BENCH(throughput_bench).set_stopping_criterion("fixed");

View File

@@ -89,7 +89,7 @@ NVBENCH_BENCH(copy_sweep_grid_shape)
//
struct under_diag final : nvbench::user_axis_space
{
under_diag(std::vector<std::size_t> input_indices)
explicit under_diag(std::vector<std::size_t> input_indices)
: nvbench::user_axis_space(std::move(input_indices))
{}
@@ -162,7 +162,7 @@ NVBENCH_BENCH(copy_sweep_grid_shape)
struct gauss final : nvbench::user_axis_space
{
gauss(std::vector<std::size_t> input_indices)
explicit gauss(std::vector<std::size_t> input_indices)
: nvbench::user_axis_space(std::move(input_indices))
{}

View File

@@ -17,7 +17,6 @@
*/
#include <nvbench/nvbench.cuh>
#include <nvbench/test_kernels.cuh>
// Enum to use as parameter axis:
@@ -68,12 +67,10 @@ void runtime_enum_sweep_string(nvbench::state &state)
// Create inputs, etc, configure runtime kernel parameters, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
NVBENCH_BENCH(runtime_enum_sweep_string)
.add_string_axis("MyEnum", {"A", "B", "C"});
NVBENCH_BENCH(runtime_enum_sweep_string).add_string_axis("MyEnum", {"A", "B", "C"});
//==============================================================================
// Sweep through enum values at runtime using an int64 axis.
@@ -91,15 +88,14 @@ NVBENCH_BENCH(runtime_enum_sweep_string)
// ```
void runtime_enum_sweep_int64(nvbench::state &state)
{
const auto enum_value = static_cast<MyEnum>(state.get_int64("MyEnum"));
[[maybe_unused]] const auto enum_value = static_cast<MyEnum>(state.get_int64("MyEnum"));
// Do stuff with enum_value.
// Create inputs, etc, configure runtime kernel parameters, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
NVBENCH_BENCH(runtime_enum_sweep_int64)
.add_int64_axis("MyEnum",
@@ -178,12 +174,10 @@ void compile_time_enum_sweep(nvbench::state &state,
// Template parameters, static dispatch, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
using MyEnumList =
nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
using MyEnumList = nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
.set_type_axes_names({"MyEnum"});
@@ -199,16 +193,14 @@ NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
// * `-12` (struct std::integral_constant<int,-12>)
// ```
template <nvbench::int32_t IntValue>
void compile_time_int_sweep(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<IntValue>>)
void compile_time_int_sweep(nvbench::state &state, nvbench::type_list<nvbench::enum_type<IntValue>>)
{
// Use IntValue in compile time contexts.
// Template parameters, static dispatch, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
using MyInts = nvbench::enum_type_list<0, 16, 4096, -12>;
NVBENCH_BENCH_TYPES(compile_time_int_sweep, NVBENCH_TYPE_AXES(MyInts))

View File

@@ -27,6 +27,9 @@
// Used to initialize input data:
#include <thrust/sequence.h>
// Used to run the benchmark on a CUDA stream
#include <thrust/execution_policy.h>
// `sequence_bench` measures the execution time of `thrust::sequence`. Since
// algorithms in `thrust::` implicitly sync the CUDA device, the
// `nvbench::exec_tag::sync` must be passed to `state.exec(...)`.
@@ -50,9 +53,7 @@ void sequence_bench(nvbench::state &state)
// nvbench::exec_tag::sync indicates that this will implicitly sync:
state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
thrust::sequence(thrust::device.on(launch.get_stream()),
data.begin(),
data.end());
thrust::sequence(thrust::device.on(launch.get_stream()), data.begin(), data.end());
});
}
NVBENCH_BENCH(sequence_bench);

View File

@@ -24,6 +24,7 @@
// Thrust simplifies memory management, etc:
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>
// mod2_inplace performs an in-place mod2 over every element in `data`. `data`
@@ -53,6 +54,8 @@ void mod2_inplace(nvbench::state &state)
state.exec(nvbench::exec_tag::timer,
// Lambda now takes a `timer` argument:
[&input, &data, num_values](nvbench::launch &launch, auto &timer) {
(void)num_values; // clang thinks this is unused...
// Reset working data:
thrust::copy(thrust::device.on(launch.get_stream()),
input.cbegin(),

View File

@@ -72,14 +72,12 @@ NVBENCH_BENCH(runtime_skip)
// Two type axes are swept, but configurations where InputType == OutputType are
// skipped.
template <typename InputType, typename OutputType>
void skip_overload(nvbench::state &state,
nvbench::type_list<InputType, OutputType>)
void skip_overload(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// This is a contrived example that focuses on the skip overloads, so this is
// just a sleep kernel:
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
// Overload of skip_overload that is called when InputType == OutputType.
template <typename T>
@@ -107,9 +105,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// This is a contrived example that focuses on the skip overloads, so this is
// just a sleep kernel:
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
// Enable this overload if InputType is larger than OutputType
template <typename InputType, typename OutputType>
@@ -119,10 +116,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
state.skip("sizeof(InputType) > sizeof(OutputType).");
}
// The same type_list is used for both inputs/outputs.
using sn_types = nvbench::type_list<nvbench::int8_t,
nvbench::int16_t,
nvbench::int32_t,
nvbench::int64_t>;
using sn_types =
nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
// Setup benchmark:
NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
.set_type_axes_names({"In", "Out"});

View File

@@ -52,6 +52,7 @@ void stream_bench(nvbench::state &state)
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
state.exec([&input, &output, num_values](nvbench::launch &) {
(void)num_values; // clang thinks this is unused...
copy(thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),
num_values);

73
examples/summaries.cu Normal file
View File

@@ -0,0 +1,73 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// #define PRINT_DEFAULT_SUMMARY_TAGS
void summary_example(nvbench::state &state)
{
  // Derive the sleep duration (in seconds) from the two integer axes:
  const auto ms       = static_cast<nvbench::float64_t>(state.get_int64("ms"));
  const auto us       = static_cast<nvbench::float64_t>(state.get_int64("us"));
  const auto duration = ms * 1e-3 + us * 1e-6;

  // Add a custom column to the summary table reporting the derived duration.
  // See the documentation in nvbench/summary.cuh for more details.
  {
    nvbench::summary &summary = state.add_summary("duration");
    summary.set_string("name", "Duration (s)");
    summary.set_string("description", "The duration of the kernel execution.");
    summary.set_string("hint", "duration");
    summary.set_float64("value", duration);
  }

  // Run the measurements:
  state.exec(nvbench::exec_tag::no_batch, [duration](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration);
  });

#ifdef PRINT_DEFAULT_SUMMARY_TAGS
  // The default summary tags can be found by inspecting the state after
  // calling state.exec, or by looking at the json output (--json <filename>).
  for (const auto &summary : state.get_summaries())
  {
    std::cout << summary.get_tag() << std::endl;
  }
#endif

  // Default summary columns are shown/hidden in the markdown output tables via
  // the "hide" key: removing the key shows a column, setting it hides one.
  // Show min/max GPU times plus SM clock frequency and throttling info:
  const char *shown[] = {"nv/cold/time/gpu/min",
                         "nv/cold/time/gpu/max",
                         "nv/cold/sm_clock_rate/mean",
                         "nv/cold/sm_clock_rate/scaling/percent"};
  for (const char *tag : shown)
  {
    state.get_summary(tag).remove_value("hide");
  }
  // Hide the mean GPU time and all CPU times:
  const char *hidden[] = {"nv/cold/time/gpu/mean",
                          "nv/cold/time/cpu/mean",
                          "nv/cold/time/cpu/min",
                          "nv/cold/time/cpu/max",
                          "nv/cold/time/cpu/stdev/relative"};
  for (const char *tag : hidden)
  {
    state.get_summary(tag).set_string("hide", "");
  }
}
NVBENCH_BENCH(summary_example)
.add_int64_axis("ms", nvbench::range(10, 50, 20))
.add_int64_axis("us", nvbench::range(100, 500, 200));

View File

@@ -51,6 +51,7 @@ void throughput_bench(nvbench::state &state)
state.add_global_memory_writes<nvbench::int32_t>(num_values);
state.exec([&input, &output, num_values](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),

View File

@@ -6,7 +6,6 @@ set_target_properties(nvbench.ctl PROPERTIES
EXPORT_NAME ctl
)
add_dependencies(nvbench.all nvbench.ctl)
nvbench_setup_dep_dlls(nvbench.ctl)
nvbench_install_executables(nvbench.ctl)
if (NVBench_ENABLE_TESTING)

View File

@@ -1,20 +1,20 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/main.cuh>
@@ -24,7 +24,7 @@ int main(int argc, char const *const *argv)
try
{
// If no args, substitute a new argv that prints the version
std::vector<const char*> alt_argv;
std::vector<const char *> alt_argv;
if (argc == 1)
{
alt_argv.push_back("--version");
@@ -36,7 +36,7 @@ try
NVBENCH_CUDA_CALL(cudaDeviceReset());
return 0;
}
catch (std::exception & e)
catch (std::exception &e)
{
std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";
return 1;

View File

@@ -5,6 +5,7 @@ set(srcs
benchmark_base.cxx
benchmark_manager.cxx
blocking_kernel.cu
criterion_manager.cxx
csv_printer.cu
cuda_call.cu
device_info.cu
@@ -19,25 +20,29 @@ set(srcs
printer_multiplex.cxx
runner.cxx
state.cxx
stopping_criterion.cxx
string_axis.cxx
type_axis.cxx
type_strings.cxx
user_axis_space.cxx
zip_axis_space.cxx
detail/entropy_criterion.cxx
detail/measure_cold.cu
detail/measure_cpu_only.cxx
detail/measure_hot.cu
detail/state_generator.cxx
detail/stdrel_criterion.cxx
detail/gpu_frequency.cxx
detail/timestamps_kernel.cu
internal/nvml.cxx
)
if (NVBench_ENABLE_CUPTI)
list(APPEND srcs detail/measure_cupti.cu cupti_profiler.cxx)
endif()
if (NVBench_ENABLE_NVML)
list(APPEND srcs internal/nvml.cxx)
endif()
# CUDA 11.0 can't compile json_printer without crashing
# So for that version fall back to C++ with degraded
# output ( no PTX version info )
@@ -69,7 +74,7 @@ nvbench_write_config_header(config.cuh.in
)
# nvbench (nvbench::nvbench)
add_library(nvbench SHARED ${srcs})
add_library(nvbench ${srcs})
nvbench_config_target(nvbench)
target_include_directories(nvbench PUBLIC
"$<BUILD_INTERFACE:${NVBench_SOURCE_DIR}>"
@@ -82,8 +87,29 @@ target_link_libraries(nvbench
PRIVATE
fmt::fmt
nvbench_json
nvbench_git_revision
)
# ##################################################################################################
# * conda environment -----------------------------------------------------------------------------
rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH)
if(TARGET conda_env)
# When we are inside a conda env the linker will be set to
# `ld.bfd` which will try to resolve all undefined symbols at link time.
#
# Since we could be using a shared library version of fmt we need
# it on the final link line of consumers
target_link_libraries(nvbench PRIVATE $<BUILD_INTERFACE:conda_env>)
# When we are inside a conda env the linker will be set to
# `ld.bfd` which will try to resolve all undefined symbols at link time.
#
# Since we could be using a shared library version of fmt we need
# it on the final link line of consumers
if(fmt_is_external)
target_link_libraries(nvbench PUBLIC fmt::fmt)
endif()
endif()
target_compile_features(nvbench PUBLIC cuda_std_17 PRIVATE cxx_std_17)
add_dependencies(nvbench.all nvbench)
@@ -98,7 +124,6 @@ add_dependencies(nvbench.all nvbench.main)
add_library(nvbench::nvbench ALIAS nvbench)
add_library(nvbench::main ALIAS nvbench.main)
nvbench_setup_dep_dlls(nvbench)
nvbench_install_libraries(nvbench nvbench.main nvbench.build_interface)
# nvcc emits several unavoidable warnings while compiling nlohmann_json:
@@ -111,3 +136,19 @@ if (json_is_cu)
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcudafe=--diag_suppress=940>
)
endif()
# The call to `rapids_cmake_write_git_revision_file` must be in the same
# CMakeLists.txt as the consumer ( nvbench ) for CMake to get the dependency
# graph correct.
rapids_cmake_write_git_revision_file(
nvbench_git_revision
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
PREFIX "NVBENCH"
)
target_link_libraries(nvbench PRIVATE nvbench_git_revision)
if(NOT BUILD_SHARED_LIBS)
# Need to ensure that for static builds we export the nvbench_git_revision
# target
nvbench_install_libraries(nvbench_git_revision)
endif()

Some files were not shown because too many files have changed in this diff Show More