diff --git a/.clangd b/.clangd
index 1649e16..0e4c84b 100644
--- a/.clangd
+++ b/.clangd
@@ -32,7 +32,6 @@ CompileFlags:
     # report all errors
     - "-ferror-limit=0"
     - "-ftemplate-backtrace-limit=0"
-    - "-stdlib=libc++"
     - "-std=c++17"
   Remove:
     # strip CUDA fatbin args
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
new file mode 100644
index 0000000..e84b5f3
--- /dev/null
+++ b/.devcontainer/README.md
@@ -0,0 +1,198 @@
+> **Note**
+> The instructions in this README are specific to Linux development environments. Instructions for Windows are coming soon!
+
+[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)
+
+# CCCL Dev Containers
+
+CCCL uses [Development Containers](https://containers.dev/) to provide consistent and convenient development environments for both local development and for CI. This guide covers setup in [Visual Studio Code](#quickstart-vscode-recommended) and [Docker](#quickstart-docker-manual-approach). The guide also provides additional instructions in case you want use WSL.
+
+## Table of Contents
+1. [Quickstart: VSCode (Recommended)](#vscode)
+2. [Quickstart: Docker (Manual Approach)](#docker)
+3. [Quickstart: Using WSL](#wsl)
+
+## Quickstart: VSCode (Recommended) <a name="vscode"></a>
+
+### Prerequisites
+- [Visual Studio Code](https://code.visualstudio.com/)
+- [Remote - Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
+- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
+- [Docker](https://docs.docker.com/engine/install/) - This is only for completeness because it should already be implicitly installed by the Dev Containers extension
+
+### Steps
+
+1. Clone the Repository
+    ```bash
+    git clone https://github.com/nvidia/cccl.git
+    ```
+2. Open the cloned directory in VSCode
+
+3. Launch a Dev Container by clicking the prompt suggesting to "Reopen in Container"
+
+   ![Shows "Reopen in Container" prompt when opening the cccl directory in VScode.](./img/reopen_in_container.png)
+
+   - Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. Type "Remote-Containers: Reopen in Container" and select it.
+
+     ![Shows "Reopen in Container" in command pallete.](./img/open_in_container_manual.png)
+
+4. Select an environment with the desired CTK and host compiler from the list:
+
+   ![Shows list of available container environments.](./img/container_list.png)
+
+5. VSCode will initialize the selected Dev Container. This can take a few minutes the first time.
+
+6. Once initialized, the local `cccl/` directory is mirrored into the container to ensure any changes are persistent.
+
+7. Done! See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.
+
+### (Optional) Authenticate with GitHub for `sccache`
+
+After starting the container, there will be a prompt to authenticate with GitHub. This grants access to a [`sccache`](https://github.com/mozilla/sccache) server shared with CI and greatly accelerates local build times. This is currently limited to NVIDIA employees belonging to the `NVIDIA` or `rapidsai` GitHub organizations.
+
+Without authentication to the remote server, `sccache` will still accelerate local builds by using a filesystem cache.
+
+Follow the instructions in the prompt as below and enter the one-time code at https://github.com/login/device
+
+  ![Shows authentication with GitHub to access sccache bucket.](./img/github_auth.png)
+
+To manually trigger this authentication, execute the `devcontainer-utils-vault-s3-init` script within the container.
+
+For more information about the sccache configuration and authentication, see the documentation at [`rapidsai/devcontainers`](https://github.com/rapidsai/devcontainers/blob/branch-23.10/USAGE.md#build-caching-with-sccache).
+
+## Quickstart: Docker (Manual Approach) <a name="docker"></a>
+
+### Prerequisites
+- [Docker](https://docs.docker.com/desktop/install/linux-install/)
+
+### Steps
+1. Clone the repository and use the [`launch.sh`](./launch.sh) script to launch the default container environment
+    ```bash
+    git clone https://github.com/nvidia/cccl.git
+    cd cccl
+    ./.devcontainer/launch.sh --docker
+    ```
+    This script starts an interactive shell as the `coder` user inside the container with the local `cccl/` directory mirrored into `/home/coder/cccl`.
+
+    For specific environments, use the `--cuda` and `--host` options:
+    ```bassh
+    ./.devcontainer/launch.sh --docker --cuda 12.2 --host gcc10
+    ```
+    See `./.devcontainer/launch.sh --help` for more information.
+
+2. Done. See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.
+
+## Available Environments
+
+CCCL provides environments for both the oldest and newest supported CUDA versions with all compatible host compilers.
+
+Look in the [`.devcontainer/`](.) directory to see the available configurations. The top-level [`devcontainer.json`](./devcontainer.json) serves as the default environment. All `devcontainer.json` files in the `cuda<CTK_VERSION>-<HOST-COMPILER>` sub-directories are variations on this top-level file, with different base images for the different CUDA and host compiler versions.
+
+## VSCode Customization
+
+By default, CCCL's Dev Containers come with certain VSCode settings and extensions configured by default, as can be seen in the [`devcontainer.json`](./devcontainer.json) file. This can be further customized by users without needing to modify the `devcontainer.json` file directly.
+
+For extensions, the [`dev.containers.defaultExtensions` setting](https://code.visualstudio.com/docs/devcontainers/containers#_always-installed-extensions) allows listing extensions that will always be installed.
+
+For more general customizations, VSCode allows using a dotfile repository. See the [VSCode documentation](https://code.visualstudio.com/docs/devcontainers/containers#_personalizing-with-dotfile-repositories) for more information.
+
+## GitHub Codespaces
+
+[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)
+
+One of the benefits of Dev Containers is that they integrate natively with [GitHub Codespaces](https://github.com/features/codespaces). Codespaces provide a VSCode development environment right in your browser running on a machine in the cloud. This provides a truly one-click, turnkey development environment where you can develop, build, and test with no other setup required.
+
+Click the badge above or [click here](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) to get started with CCCL's Dev Containers on Codespaces. This will start the default Dev Container environment. [Click here](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=296416761&skip_quickstart=true) to start a Codespace with a particular environment and hardware configuration as shown:
+
+   ![Shows configuring a Codespace with a custom environment](../docs/images/codespaces.png)
+
+## For Maintainers: The `make_devcontainers.sh` Script
+
+### Overview
+
+[`make_devcontainers.sh`](./make_devcontainers.sh) generates devcontainer configurations for the unique combinations of CUDA Toolkit (CTK) versions and host compilers in [`ci/matrix.yaml`](../ci/matrix.yaml).
+
+### How It Works:
+
+1. Parses the matrix from `ci/matrix.yaml`.
+2. Use the top-level [`.devcontainer/devcontainer.json`](./devcontainer.json) as a template. For each unique combination of CTK version and host compiler, generate a corresponding `devcontainer.json` configuration, adjusting only the base Docker image to match the desired environment.
+3. Place the generated configurations in the `.devcontainer` directory, organizing them into subdirectories following the naming convention `cuda<CTK_VERSION>-<COMPILER_VERSION>`.
+
+For more information, see the `.devcontainer/make_devcontainers.sh --help` message.
+
+**Note**: When adding or updating supported environments, modify `matrix.yaml` and then rerun this script to synchronize the `devcontainer` configurations.
+
+## Quickstart: Using WSL <a name="wsl"></a>
+
+> [!NOTE]
+> _Make sure you have the Nvidia driver installed on your Windows host before moving further_. Type in `nvidia-smi` for verification.
+
+### Install WSL on your Windows host
+
+> [!WARNING]
+> Disclaimer: This guide was developed for WSL 2 on Windows 11.
+
+1. Launch a Windows terminal (_e.g. Powershell_) as an administrator.
+
+2. Install WSL 2 by running:
+```bash
+wsl --install 
+```
+This should probably install Ubuntu distro as a default.
+
+3. Restart your computer and run `wsl -l -v` on a Windows terminal to verify installation.
+
+<h3 id="prereqs"> Install prerequisites and VS Code extensions</h3>
+
+4. Launch your WSL/Ubuntu terminal by running `wsl` in Powershell.
+
+5. Install the [WSL extension](ms-vscode-remote.remote-wsl) on VS Code.
+
+    - `Ctrl + Shift + P` and select `WSL: Connect to WSL` (it will prompt you to install the WSL extension).
+
+    - Make sure you are connected to WSL with VS Code by checking the bottom left corner of the VS Code window (should indicate "WSL: Ubuntu" in our case).
+
+6. Install the [Dev Containers extension](ms-vscode-remote.remote-containers) on VS Code.
+
+    - In a vanilla system you should be prompted to install `Docker` at this point, accept it. If it hangs you might have to restart VS Code after that.
+
+7. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). **Make sure you install the WSL 2 version and not the native Linux one**. This builds on top of Docker so make sure you have Docker properly installed (run `docker --version`).
+
+8. Open `/etc/docker/daemon.json` from within your WSL system (if the file does not exist, create it) and add the following:
+
+```json
+{
+    "runtimes": {
+        "nvidia": {
+            "path": "nvidia-container-runtime",
+            "runtimeArgs": []
+        }
+    }
+}
+```
+
+then run `sudo systemctl restart docker.service`.
+
+---
+### Build CCCL in WSL using Dev Containers
+
+9. Still on your WSL terminal run `git clone https://github.com/NVIDIA/cccl.git`
+
+
+10. Open the CCCL cloned repo in VS Code ( `Ctrl + Shift + P `, select `File: Open Folder...` and select the path where your CCCL clone is located).
+
+11. If prompted, choose `Reopen in Container`.
+    
+    - If you are not prompted just type `Ctrl + Shift + P` and `Dev Containers: Open Folder in Container ...`.
+
+12. Verify that Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal. For a proper configuration it is important for the steps in [Install prerequisites and VS Code extensions](#prereqs) to be followed in a precise order.
+
+From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration:
+
+13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message. 
+
+> Failed opening a web browser at https://github.com/login/device
+  exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH
+  Please try entering the URL in your browser manually
+
+In that case type in the address manually in your web browser https://github.com/login/device and fill in the one-time code.
diff --git a/.devcontainer/cuda11.1-gcc6/devcontainer.json b/.devcontainer/cuda11.1-gcc6/devcontainer.json
new file mode 100644
index 0000000..9faa25a
--- /dev/null
+++ b/.devcontainer/cuda11.1-gcc6/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc6-cuda11.1-ubuntu18.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.1-gcc6",
+    "CCCL_CUDA_VERSION": "11.1",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "6",
+    "CCCL_BUILD_INFIX": "cuda11.1-gcc6"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.1-gcc6"
+}
diff --git a/.devcontainer/cuda11.1-gcc7/devcontainer.json b/.devcontainer/cuda11.1-gcc7/devcontainer.json
new file mode 100644
index 0000000..3e9e2f7
--- /dev/null
+++ b/.devcontainer/cuda11.1-gcc7/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc7-cuda11.1-ubuntu18.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.1-gcc7",
+    "CCCL_CUDA_VERSION": "11.1",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "7",
+    "CCCL_BUILD_INFIX": "cuda11.1-gcc7"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.1-gcc7"
+}
diff --git a/.devcontainer/cuda11.1-gcc8/devcontainer.json b/.devcontainer/cuda11.1-gcc8/devcontainer.json
new file mode 100644
index 0000000..3862680
--- /dev/null
+++ b/.devcontainer/cuda11.1-gcc8/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc8-cuda11.1-ubuntu18.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.1-gcc8",
+    "CCCL_CUDA_VERSION": "11.1",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "8",
+    "CCCL_BUILD_INFIX": "cuda11.1-gcc8"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.1-gcc8"
+}
diff --git a/.devcontainer/cuda11.1-gcc9/devcontainer.json b/.devcontainer/cuda11.1-gcc9/devcontainer.json
new file mode 100644
index 0000000..54f7b88
--- /dev/null
+++ b/.devcontainer/cuda11.1-gcc9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc9-cuda11.1-ubuntu18.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.1-gcc9",
+    "CCCL_CUDA_VERSION": "11.1",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda11.1-gcc9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.1-gcc9"
+}
diff --git a/.devcontainer/cuda11.1-llvm9/devcontainer.json b/.devcontainer/cuda11.1-llvm9/devcontainer.json
new file mode 100644
index 0000000..d875c9b
--- /dev/null
+++ b/.devcontainer/cuda11.1-llvm9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm9-cuda11.1-ubuntu18.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.1-llvm9",
+    "CCCL_CUDA_VERSION": "11.1",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda11.1-llvm9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.1-llvm9"
+}
diff --git a/.devcontainer/cuda11.8-gcc11/devcontainer.json b/.devcontainer/cuda11.8-gcc11/devcontainer.json
new file mode 100644
index 0000000..389d6c7
--- /dev/null
+++ b/.devcontainer/cuda11.8-gcc11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc11-cuda11.8-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda11.8-gcc11",
+    "CCCL_CUDA_VERSION": "11.8",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda11.8-gcc11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda11.8-gcc11"
+}
diff --git a/.devcontainer/cuda12.4-gcc10/devcontainer.json b/.devcontainer/cuda12.4-gcc10/devcontainer.json
new file mode 100644
index 0000000..9968a84
--- /dev/null
+++ b/.devcontainer/cuda12.4-gcc10/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc10-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-gcc10",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "10",
+    "CCCL_BUILD_INFIX": "cuda12.4-gcc10"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-gcc10"
+}
diff --git a/.devcontainer/cuda12.4-gcc11/devcontainer.json b/.devcontainer/cuda12.4-gcc11/devcontainer.json
new file mode 100644
index 0000000..d2c2662
--- /dev/null
+++ b/.devcontainer/cuda12.4-gcc11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc11-cuda12.4-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-gcc11",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda12.4-gcc11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-gcc11"
+}
diff --git a/.devcontainer/cuda12.4-gcc12/devcontainer.json b/.devcontainer/cuda12.4-gcc12/devcontainer.json
new file mode 100644
index 0000000..fa6e0c5
--- /dev/null
+++ b/.devcontainer/cuda12.4-gcc12/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc12-cuda12.4-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-gcc12",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.4-gcc12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-gcc12"
+}
diff --git a/.devcontainer/cuda12.4-gcc7/devcontainer.json b/.devcontainer/cuda12.4-gcc7/devcontainer.json
new file mode 100644
index 0000000..af6fdb1
--- /dev/null
+++ b/.devcontainer/cuda12.4-gcc7/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc7-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-gcc7",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "7",
+    "CCCL_BUILD_INFIX": "cuda12.4-gcc7"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-gcc7"
+}
diff --git a/.devcontainer/cuda12.4-gcc8/devcontainer.json b/.devcontainer/cuda12.4-gcc8/devcontainer.json
new file mode 100644
index 0000000..46670d4
--- /dev/null
+++ b/.devcontainer/cuda12.4-gcc8/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc8-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-gcc8",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "8",
+    "CCCL_BUILD_INFIX": "cuda12.4-gcc8"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-gcc8"
+}
diff --git a/.devcontainer/cuda12.4-gcc9/devcontainer.json b/.devcontainer/cuda12.4-gcc9/devcontainer.json
new file mode 100644
index 0000000..4005e72
--- /dev/null
+++ b/.devcontainer/cuda12.4-gcc9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc9-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-gcc9",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda12.4-gcc9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-gcc9"
+}
diff --git a/.devcontainer/cuda12.4-llvm10/devcontainer.json b/.devcontainer/cuda12.4-llvm10/devcontainer.json
new file mode 100644
index 0000000..6ee5788
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm10/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm10-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm10",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "10",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm10"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm10"
+}
diff --git a/.devcontainer/cuda12.4-llvm11/devcontainer.json b/.devcontainer/cuda12.4-llvm11/devcontainer.json
new file mode 100644
index 0000000..66bd26c
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm11-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm11",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm11"
+}
diff --git a/.devcontainer/cuda12.4-llvm12/devcontainer.json b/.devcontainer/cuda12.4-llvm12/devcontainer.json
new file mode 100644
index 0000000..8889f14
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm12/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm12-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm12",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm12"
+}
diff --git a/.devcontainer/cuda12.4-llvm13/devcontainer.json b/.devcontainer/cuda12.4-llvm13/devcontainer.json
new file mode 100644
index 0000000..76faea9
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm13/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm13-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm13",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "13",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm13"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm13"
+}
diff --git a/.devcontainer/cuda12.4-llvm14/devcontainer.json b/.devcontainer/cuda12.4-llvm14/devcontainer.json
new file mode 100644
index 0000000..58b00cc
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm14/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm14-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm14",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm14"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm14"
+}
diff --git a/.devcontainer/cuda12.4-llvm15/devcontainer.json b/.devcontainer/cuda12.4-llvm15/devcontainer.json
new file mode 100644
index 0000000..9c92653
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm15/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm15-cuda12.4-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm15",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "15",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm15"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm15"
+}
diff --git a/.devcontainer/cuda12.4-llvm16/devcontainer.json b/.devcontainer/cuda12.4-llvm16/devcontainer.json
new file mode 100644
index 0000000..9f6fcad
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm16/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm16-cuda12.4-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm16",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "16",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm16"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm16"
+}
diff --git a/.devcontainer/cuda12.4-llvm9/devcontainer.json b/.devcontainer/cuda12.4-llvm9/devcontainer.json
new file mode 100644
index 0000000..d9910e2
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm9-cuda12.4-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm9",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm9"
+}
diff --git a/.devcontainer/cuda12.4-oneapi2023.2.0/devcontainer.json b/.devcontainer/cuda12.4-oneapi2023.2.0/devcontainer.json
new file mode 100644
index 0000000..04d71c2
--- /dev/null
+++ b/.devcontainer/cuda12.4-oneapi2023.2.0/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-oneapi2023.2.0-cuda12.4-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-oneapi2023.2.0",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "oneapi",
+    "CCCL_HOST_COMPILER_VERSION": "2023.2.0",
+    "CCCL_BUILD_INFIX": "cuda12.4-oneapi2023.2.0"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-oneapi2023.2.0"
+}
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..fa6e0c5
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc12-cuda12.4-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-gcc12",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.4-gcc12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-gcc12"
+}
diff --git a/.devcontainer/img/container_list.png b/.devcontainer/img/container_list.png
new file mode 100644
index 0000000..09c4510
Binary files /dev/null and b/.devcontainer/img/container_list.png differ
diff --git a/.devcontainer/img/github_auth.png b/.devcontainer/img/github_auth.png
new file mode 100644
index 0000000..3f52b3a
Binary files /dev/null and b/.devcontainer/img/github_auth.png differ
diff --git a/.devcontainer/img/open_in_container_manual.png b/.devcontainer/img/open_in_container_manual.png
new file mode 100644
index 0000000..e09435b
Binary files /dev/null and b/.devcontainer/img/open_in_container_manual.png differ
diff --git a/.devcontainer/img/reopen_in_container.png b/.devcontainer/img/reopen_in_container.png
new file mode 100644
index 0000000..0e1d82d
Binary files /dev/null and b/.devcontainer/img/reopen_in_container.png differ
diff --git a/.devcontainer/launch.sh b/.devcontainer/launch.sh
new file mode 100755
index 0000000..0299e0c
--- /dev/null
+++ b/.devcontainer/launch.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Ensure the script is being executed in the cccl/ root
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/..";
+
+print_help() {
+    echo "Usage: $0 [-c|--cuda <CUDA version>] [-H|--host <Host compiler>] [-d|--docker]"
+    echo "Launch a development container. If no CUDA version or Host compiler are specified,"
+    echo "the top-level devcontainer in .devcontainer/devcontainer.json will be used."
+    echo ""
+    echo "Options:"
+    echo "  -c, --cuda       Specify the CUDA version. E.g., 12.2"
+    echo "  -H, --host       Specify the host compiler. E.g., gcc12"
+    echo "  -d, --docker     Launch the development environment in Docker directly without using VSCode."
+    echo "  -h, --help       Display this help message and exit."
+}
+
+parse_options() {
+    local OPTIONS=c:H:dh
+    local LONG_OPTIONS=cuda:,host:,docker,help
+    local PARSED_OPTIONS=$(getopt -n "$0" -o "${OPTIONS}" --long "${LONG_OPTIONS}" -- "$@")
+
+    if [[ $? -ne 0 ]]; then
+        exit 1
+    fi
+
+    eval set -- "${PARSED_OPTIONS}"
+
+    while true; do
+        case "$1" in
+            -c|--cuda)
+                cuda_version="$2"
+                shift 2
+                ;;
+            -H|--host)
+                host_compiler="$2"
+                shift 2
+                ;;
+            -d|--docker)
+                docker_mode=true
+                shift
+                ;;
+            -h|--help)
+                print_help
+                exit 0
+                ;;
+            --)
+                shift
+                break
+                ;;
+            *)
+                echo "Invalid option: $1"
+                print_help
+                exit 1
+                ;;
+        esac
+    done
+}
+
+launch_docker() {
+    DOCKER_IMAGE=$(grep "image" "${path}/devcontainer.json" | sed 's/.*: "\(.*\)",/\1/')
+    echo "Found image: ${DOCKER_IMAGE}"
+    docker pull ${DOCKER_IMAGE}
+    docker run   \
+        -it --rm \
+        --user coder \
+        --workdir /home/coder/cccl \
+        --mount type=bind,src="$(pwd)",dst='/home/coder/cccl' \
+        ${DOCKER_IMAGE} \
+        /bin/bash
+}
+
+launch_vscode() {
+    # Since Visual Studio Code allows only one instance per `devcontainer.json`,
+    # this code prepares a unique temporary directory structure for each launch of a devcontainer.
+    # By doing so, it ensures that multiple instances of the same environment can be run
+    # simultaneously. The script replicates the `devcontainer.json` from the desired CUDA
+    # and compiler environment into this temporary directory, adjusting paths to ensure the
+    # correct workspace is loaded. A special URL is then generated to instruct VSCode to
+    # launch the development container using this temporary configuration.
+    local workspace="$(basename "$(pwd)")"
+    local tmpdir="$(mktemp -d)/${workspace}"
+    mkdir -p "${tmpdir}"
+    mkdir -p "${tmpdir}/.devcontainer"
+    cp -arL "${path}/devcontainer.json" "${tmpdir}/.devcontainer"
+    sed -i 's@\\${localWorkspaceFolder}@$(pwd)@g' "${tmpdir}/.devcontainer/devcontainer.json"
+    local path="${tmpdir}"
+    local hash="$(echo -n "${path}" | xxd -pu - | tr -d '[:space:]')"
+    local url="vscode://vscode-remote/dev-container+${hash}/home/coder/cccl"
+
+    local launch=""
+    if type open >/dev/null 2>&1; then
+        launch="open"
+    elif type xdg-open >/dev/null 2>&1; then
+        launch="xdg-open"
+    fi
+
+    if [ -n "${launch}" ]; then
+        echo "Launching VSCode Dev Container URL: ${url}"
+        code --new-window "${tmpdir}"
+        exec "${launch}" "${url}" >/dev/null 2>&1
+    fi
+}
+
+main() {
+    parse_options "$@"
+
+    # If no CTK/Host compiler are provided, just use the default environment
+    if [[ -z ${cuda_version:-} ]] && [[ -z ${host_compiler:-} ]]; then
+        path=".devcontainer"
+    else
+        path=".devcontainer/cuda${cuda_version}-${host_compiler}"
+        if [[ ! -f "${path}/devcontainer.json" ]]; then
+            echo "Unknown CUDA [${cuda_version}] compiler [${host_compiler}] combination"
+            echo "Requested devcontainer ${path}/devcontainer.json does not exist"
+            exit 1
+        fi
+    fi
+
+    if ${docker_mode:-'false'}; then
+        launch_docker
+    else
+        launch_vscode
+    fi
+}
+
+main "$@"
+
diff --git a/.devcontainer/make_devcontainers.sh b/.devcontainer/make_devcontainers.sh
new file mode 100755
index 0000000..64b92c0
--- /dev/null
+++ b/.devcontainer/make_devcontainers.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+
+# This script parses the CI matrix.yaml file and generates a devcontainer.json file for each unique combination of
+# CUDA version, compiler name/version, and Ubuntu version. The devcontainer.json files are written to the
+# .devcontainer directory to a subdirectory named after the CUDA version and compiler name/version.
+# GitHub docs on using multiple devcontainer.json files:
+# https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/introduction-to-dev-containers#devcontainerjson
+
+set -euo pipefail
+
+# Ensure the script is being executed in its containing directory
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
+
+
+function usage {
+    echo "Usage: $0 [--clean] [-h/--help] [-v/--verbose]"
+    echo "  --clean   Remove stale devcontainer subdirectories"
+    echo "  -h, --help   Display this help message"
+    echo "  -v, --verbose  Enable verbose mode (set -x)"
+    exit 1
+}
+
+# Function to update the devcontainer.json file with the provided parameters
+update_devcontainer() {
+    local input_file="$1"
+    local output_file="$2"
+    local name="$3"
+    local cuda_version="$4"
+    local compiler_name="$5"
+    local compiler_exe="$6"
+    local compiler_version="$7"
+    local os="$8"
+    local devcontainer_version="$9"
+
+    local IMAGE_ROOT="rapidsai/devcontainers:${devcontainer_version}-cpp-"
+    local image="${IMAGE_ROOT}${compiler_name}${compiler_version}-cuda${cuda_version}-${os}"
+
+    jq --arg image "$image" --arg name "$name" \
+       --arg cuda_version "$cuda_version" --arg compiler_name "$compiler_name" \
+       --arg compiler_exe "$compiler_exe" --arg compiler_version "$compiler_version" --arg os "$os" \
+       '.image = $image | .name = $name | .containerEnv.DEVCONTAINER_NAME = $name |
+        .containerEnv.CCCL_BUILD_INFIX = $name |
+        .containerEnv.CCCL_CUDA_VERSION = $cuda_version | .containerEnv.CCCL_HOST_COMPILER = $compiler_name |
+        .containerEnv.CCCL_HOST_COMPILER_VERSION = $compiler_version '\
+       "$input_file" > "$output_file"
+}
+
+make_name() {
+    local cuda_version="$1"
+    local compiler_name="$2"
+    local compiler_version="$3"
+
+    echo "cuda$cuda_version-$compiler_name$compiler_version"
+}
+
+CLEAN=false
+VERBOSE=false
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --clean)
+            CLEAN=true
+            ;;
+        -h|--help)
+            usage
+            ;;
+        -v|--verbose)
+            VERBOSE=true
+            ;;
+        *)
+            usage
+            ;;
+    esac
+    shift
+done
+
+MATRIX_FILE="../ci/matrix.yaml"
+
+# Enable verbose mode if requested
+if [ "$VERBOSE" = true ]; then
+    set -x
+    cat ${MATRIX_FILE}
+fi
+
+# Read matrix.yaml and convert it to json
+matrix_json=$(yq -o json ${MATRIX_FILE})
+
+# Exclude Windows environments
+readonly matrix_json=$(echo "$matrix_json" | jq 'del(.pull_request.nvcc[] | select(.os | contains("windows")))')
+
+# Get the devcontainer image version and define image tag root
+readonly DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version')
+
+# Get unique combinations of cuda version, compiler name/version, and Ubuntu version
+readonly combinations=$(echo "$matrix_json" | jq -c '[.pull_request.nvcc[] | {cuda: .cuda, compiler_name: .compiler.name, compiler_exe: .compiler.exe, compiler_version: .compiler.version, os: .os}] | unique | .[]')
+
+# Update the base devcontainer with the default values
+# The root devcontainer.json file is used as the default container as well as a template for all
+# other devcontainer.json files by replacing the `image:` field with the appropriate image name
+readonly base_devcontainer_file="./devcontainer.json"
+readonly NEWEST_GCC_CUDA_ENTRY=$(echo "$combinations" | jq -rs '[.[] | select(.compiler_name == "gcc")] | sort_by((.cuda | tonumber), (.compiler_version | tonumber)) | .[-1]')
+readonly DEFAULT_CUDA=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.cuda')
+readonly DEFAULT_COMPILER_NAME=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_name')
+readonly DEFAULT_COMPILER_EXE=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_exe')
+readonly DEFAULT_COMPILER_VERSION=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_version')
+readonly DEFAULT_OS=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.os')
+readonly DEFAULT_NAME=$(make_name "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_VERSION")
+
+update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$DEFAULT_NAME" "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEFAULT_OS" "$DEVCONTAINER_VERSION"
+mv "./temp_devcontainer.json" ${base_devcontainer_file}
+
+# Create an array to keep track of valid subdirectory names
+valid_subdirs=()
+
+# For each unique combination
+for combination in $combinations; do
+    cuda_version=$(echo "$combination" | jq -r '.cuda')
+    compiler_name=$(echo "$combination" | jq -r '.compiler_name')
+    compiler_exe=$(echo "$combination" | jq -r '.compiler_exe')
+    compiler_version=$(echo "$combination" | jq -r '.compiler_version')
+    os=$(echo "$combination" | jq -r '.os')
+
+    name=$(make_name "$cuda_version" "$compiler_name" "$compiler_version")
+    mkdir -p "$name"
+    new_devcontainer_file="$name/devcontainer.json"
+
+    update_devcontainer "$base_devcontainer_file" "$new_devcontainer_file" "$name" "$cuda_version" "$compiler_name" "$compiler_exe" "$compiler_version" "$os" "$DEVCONTAINER_VERSION"
+    echo "Created $new_devcontainer_file"
+
+    # Add the subdirectory name to the valid_subdirs array
+    valid_subdirs+=("$name")
+done
+
+# Clean up stale subdirectories and devcontainer.json files
+if [ "$CLEAN" = true ]; then
+    for subdir in ./*; do
+        if [ -d "$subdir" ] && [[ ! " ${valid_subdirs[@]} " =~ " ${subdir#./} " ]]; then
+            echo "Removing stale subdirectory: $subdir"
+            rm -r "$subdir"
+        fi
+    done
+fi
diff --git a/.devcontainer/verify_devcontainer.sh b/.devcontainer/verify_devcontainer.sh
new file mode 100755
index 0000000..b5934ea
--- /dev/null
+++ b/.devcontainer/verify_devcontainer.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+function usage {
+    echo "Usage: $0"
+    echo
+    echo "This script is intended to be run within one of CCCL's Dev Containers."
+    echo "It verifies that the expected environment variables and binary versions match what is expected."
+}
+
+check_envvars() {
+    for var_name in "$@"; do
+        if [[ -z "${!var_name:-}" ]]; then
+            echo "::error:: ${var_name} variable is not set."
+            exit 1
+        else
+            echo "$var_name=${!var_name}"
+        fi
+    done
+}
+
+check_host_compiler_version() {
+    local version_output=$($CXX --version)
+
+    if [[ "$CXX" == "g++" ]]; then
+        local actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 4 | cut -d '.' -f 1)
+        local expected_compiler="gcc"
+    elif [[ "$CXX" == "clang++" ]]; then
+        if [[ $version_output =~ clang\ version\ ([0-9]+) ]]; then
+            actual_version=${BASH_REMATCH[1]}
+        else
+            echo "::error:: Unable to determine clang version."
+            exit 1
+        fi
+        expected_compiler="llvm"
+    elif [[ "$CXX" == "icpc" ]]; then
+        local actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 3 )
+        # The icpc compiler version of oneAPI release 2023.2.0 is 2021.10.0
+        if [[ "$actual_version" == "2021.10.0" ]]; then
+            actual_version="2023.2.0"
+        fi
+        expected_compiler="oneapi"
+    else
+        echo "::error:: Unexpected CXX value ($CXX)."
+        exit 1
+    fi
+
+    if [[ "$expected_compiler" != "${CCCL_HOST_COMPILER}" || "$actual_version" != "$CCCL_HOST_COMPILER_VERSION" ]]; then
+        echo "::error:: CXX ($CXX) version ($actual_version) does not match the expected compiler (${CCCL_HOST_COMPILER}) and version (${CCCL_HOST_COMPILER_VERSION})."
+        exit 1
+    else
+        echo "Detected host compiler: $CXX version $actual_version"
+    fi
+}
+
+check_cuda_version() {
+    local cuda_version_output=$(nvcc --version)
+    if [[ $cuda_version_output =~ release\ ([0-9]+\.[0-9]+) ]]; then
+        local actual_cuda_version=${BASH_REMATCH[1]}
+    else
+        echo "::error:: Unable to determine CUDA version from nvcc."
+        exit 1
+    fi
+
+    if [[ "$actual_cuda_version" != "$CCCL_CUDA_VERSION" ]]; then
+        echo "::error:: CUDA version ($actual_cuda_version) does not match the expected CUDA version ($CCCL_CUDA_VERSION)."
+        exit 1
+    else
+        echo "Detected CUDA version: $actual_cuda_version"
+    fi
+}
+
+main() {
+    if [[ "$1" == "-h" || "$1" == "--help" ]]; then
+        usage
+        exit 0
+    fi
+
+    set -euo pipefail
+
+    check_envvars DEVCONTAINER_NAME CXX CUDAHOSTCXX CCCL_BUILD_INFIX CCCL_HOST_COMPILER CCCL_CUDA_VERSION CCCL_HOST_COMPILER_VERSION
+
+    check_host_compiler_version
+
+    check_cuda_version
+
+    echo "Dev Container successfully verified!"
+}
+
+main "$@"
diff --git a/.github/actions/compute-matrix/action.yml b/.github/actions/compute-matrix/action.yml
new file mode 100644
index 0000000..b8155e7
--- /dev/null
+++ b/.github/actions/compute-matrix/action.yml
@@ -0,0 +1,25 @@
+
+name: Compute Matrix
+description: "Compute the matrix for a given matrix type from the specified matrix file"
+
+inputs:
+  matrix_query:
+    description: "The jq query used to specify the desired matrix. e.g., .pull_request.nvcc"
+    required: true
+  matrix_file:
+    description: 'The file containing the matrix'
+    required: true
+outputs:
+  matrix:
+    description: 'The requested matrix'
+    value: ${{ steps.compute-matrix.outputs.MATRIX }}
+
+runs:
+  using: "composite"
+  steps:
+    - name: Compute matrix
+      id: compute-matrix
+      run: |
+        MATRIX=$(./.github/actions/compute-matrix/compute-matrix.sh ${{inputs.matrix_file}}  ${{inputs.matrix_query}} )
+        echo "matrix=$MATRIX" | tee -a $GITHUB_OUTPUT
+      shell: bash -euxo pipefail {0}
diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh
new file mode 100755
index 0000000..8a6d635
--- /dev/null
+++ b/.github/actions/compute-matrix/compute-matrix.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+set -euo pipefail
+
+write_output() {
+  local key="$1"
+  local value="$2"
+  echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}"
+}
+
+explode_std_versions() {
+  jq -cr 'map(. as $o | {std: $o.std[]} + del($o.std))'
+}
+
+explode_libs() {
+  jq -cr 'map(. as $o | {lib: $o.lib[]} + del($o.lib))'
+}
+
+extract_matrix() {
+  local file="$1"
+  local type="$2"
+  local matrix=$(yq -o=json "$file" | jq -cr ".$type")
+  write_output "DEVCONTAINER_VERSION" "$(yq -o json "$file" | jq -cr '.devcontainer_version')"
+
+  local nvcc_full_matrix="$(echo "$matrix" | jq -cr '.nvcc' | explode_std_versions )"
+  local per_cuda_compiler_matrix="$(echo "$nvcc_full_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
+  write_output "PER_CUDA_COMPILER_MATRIX"  "$per_cuda_compiler_matrix"
+  write_output "PER_CUDA_COMPILER_KEYS" "$(echo "$per_cuda_compiler_matrix" | jq -r 'keys | @json')"
+}
+
+main() {
+  if [ "$1" == "-v" ]; then
+    set -x
+    shift
+  fi
+
+  if [ $# -ne 2 ] || [ "$2" != "pull_request" ]; then
+    echo "Usage: $0 [-v] MATRIX_FILE MATRIX_TYPE"
+    echo "  -v            : Enable verbose output"
+    echo "  MATRIX_FILE   : The path to the matrix file."
+    echo "  MATRIX_TYPE   : The desired matrix. Supported values: 'pull_request'"
+    exit 1
+  fi
+
+  echo "Input matrix file:" >&2
+  cat "$1" >&2
+  echo "Matrix Type: $2" >&2
+
+  extract_matrix "$1" "$2"
+}
+
+main "$@"
diff --git a/.github/actions/configure_cccl_sccache/action.yml b/.github/actions/configure_cccl_sccache/action.yml
new file mode 100644
index 0000000..1b42fc9
--- /dev/null
+++ b/.github/actions/configure_cccl_sccache/action.yml
@@ -0,0 +1,19 @@
+name: Set up AWS credentials and environment variables for sccache
+description: "Set up AWS credentials and environment variables for sccache"
+runs:
+  using: "composite"
+  steps:
+    - name: Get AWS credentials for sccache bucket
+      uses: aws-actions/configure-aws-credentials@v2
+      with:
+        role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
+        aws-region: us-east-2 
+        role-duration-seconds: 43200 # 12 hours)
+    - name: Set environment variables
+      run: |
+        echo "SCCACHE_BUCKET=rapids-sccache-devs" >> $GITHUB_ENV
+        echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV
+        echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV
+        echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV
+        echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV
+      shell: bash
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
new file mode 100644
index 0000000..895ba83
--- /dev/null
+++ b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/problem-matchers/problem-matcher.json b/.github/problem-matchers/problem-matcher.json
new file mode 100644
index 0000000..f196a5c
--- /dev/null
+++ b/.github/problem-matchers/problem-matcher.json
@@ -0,0 +1,14 @@
+{
+  "problemMatcher": [
+    {
+      "owner": "nvcc",
+      "pattern": [
+        {
+          "regexp": "^\\/home\\/coder\\/(.+):(\\d+):(\\d+): (\\w+): \"(.+)\"$",
+          "severity": 4,
+          "message": 5
+        }
+      ]
+    }
+  ]
+}
diff --git a/.github/workflows/build-and-test-linux.yml b/.github/workflows/build-and-test-linux.yml
new file mode 100644
index 0000000..6c5ba40
--- /dev/null
+++ b/.github/workflows/build-and-test-linux.yml
@@ -0,0 +1,47 @@
+name: build and test
+
+defaults:
+  run:
+    shell: bash -exo pipefail {0}
+
+on:
+  workflow_call:
+    inputs:
+      cpu: {type: string, required: true}
+      test_name: {type: string, required: false}
+      build_script: {type: string, required: false}
+      test_script: {type: string, required: false}
+      container_image: {type: string, required: false}
+      run_tests: {type: boolean, required: false, default: true}
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    name: Build ${{inputs.test_name}}
+    permissions:
+      id-token: write
+      contents: read
+    uses: ./.github/workflows/run-as-coder.yml
+    with:
+      name: Build ${{inputs.test_name}}
+      runner: linux-${{inputs.cpu}}-cpu16
+      image:  ${{ inputs.container_image }}
+      command: |
+        ${{ inputs.build_script }}
+
+  test:
+    needs: build
+    permissions:
+      id-token: write
+      contents: read
+    if:  ${{ !cancelled() && ( needs.build.result == 'success' || needs.build.result == 'skipped' ) && inputs.run_tests}}
+    name: Test ${{inputs.test_name}}
+    uses: ./.github/workflows/run-as-coder.yml
+    with:
+      name: Test ${{inputs.test_name}}
+      runner: linux-${{inputs.cpu}}-gpu-v100-latest-1
+      image: ${{inputs.container_image}}
+      command: |
+        ${{ inputs.test_script }}
diff --git a/.github/workflows/build-and-test-windows.yml b/.github/workflows/build-and-test-windows.yml
new file mode 100644
index 0000000..55b3100
--- /dev/null
+++ b/.github/workflows/build-and-test-windows.yml
@@ -0,0 +1,49 @@
+name: Build Windows
+
+on:
+  workflow_call:
+    inputs:
+      test_name: {type: string, required: false}
+      build_script: {type: string, required: false}
+      container_image: {type: string, required: false}
+
+jobs:
+  prepare:
+    name: Build ${{inputs.test_name}}
+    runs-on: windows-amd64-cpu16
+    permissions:
+      id-token: write
+      contents: read
+    env:
+      SCCACHE_BUCKET: rapids-sccache-devs
+      SCCACHE_REGION: us-east-2
+      SCCACHE_IDLE_TIMEOUT: 0
+      SCCACHE_S3_USE_SSL: true
+      SCCACHE_S3_NO_CREDENTIALS: false
+    steps:
+      - name: Get AWS credentials for sccache bucket
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
+          aws-region: us-east-2
+          role-duration-seconds: 43200 # 12 hours
+      - name: Fetch ${{ inputs.container_image }}
+        shell: powershell
+        run: docker pull ${{ inputs.container_image }}
+      - name: Run the tests
+        shell: powershell
+        run: >-
+            docker run ${{ inputs.container_image }} powershell -c "[System.Environment]::SetEnvironmentVariable('AWS_ACCESS_KEY_ID','${{env.AWS_ACCESS_KEY_ID}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('AWS_SECRET_ACCESS_KEY','${{env.AWS_SECRET_ACCESS_KEY}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('AWS_SESSION_TOKEN','${{env.AWS_SESSION_TOKEN }}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_BUCKET','${{env.SCCACHE_BUCKET}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_REGION','${{env.SCCACHE_REGION}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_IDLE_TIMEOUT','${{env.SCCACHE_IDLE_TIMEOUT}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_USE_SSL','${{env.SCCACHE_S3_USE_SSL}}')
+                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_NO_CREDENTIALS','${{env.SCCACHE_S3_NO_CREDENTIALS}}')
+                                                                    git clone https://github.com/NVIDIA/cccl.git;
+                                                                    cd cccl;
+                                                                    git fetch --all;
+                                                                    git checkout ${{github.ref_name}};
+                                                                    ${{inputs.build_script}};"
+
diff --git a/.github/workflows/dispatch-build-and-test.yml b/.github/workflows/dispatch-build-and-test.yml
new file mode 100644
index 0000000..7b5ed4e
--- /dev/null
+++ b/.github/workflows/dispatch-build-and-test.yml
@@ -0,0 +1,51 @@
+name: Dispatch build and test
+
+on:
+  workflow_call:
+    inputs:
+      project_name: {type: string, required: true}
+      per_cuda_compiler_matrix: {type: string, required: true}
+      devcontainer_version: {type: string, required: true}
+      is_windows: {type: boolean, required: true}
+
+permissions:
+  contents: read
+
+jobs:
+  # Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration
+  # ensures that the build/test steps can overlap across different configurations. For example,
+  # the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11.
+  build_and_test_linux:
+    name: build and test linux
+    permissions:
+      id-token: write
+      contents: read
+    if: ${{ !inputs.is_windows }}
+    uses: ./.github/workflows/build-and-test-linux.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }}
+    with:
+      cpu: ${{ matrix.cpu }}
+      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}}/C++${{matrix.std}} ${{matrix.extra_build_args}}
+      build_script: './ci/build_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} -std ${{matrix.std}} "${{matrix.extra_build_args}}"'
+      test_script:  './ci/test_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} -std ${{matrix.std}} "${{matrix.extra_build_args}}"'
+      container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
+      run_tests: ${{ contains(matrix.jobs, 'test') && !contains(github.event.head_commit.message, 'skip-tests') && matrix.os != 'windows-2022' }}
+
+  build_and_test_windows:
+    name: build and test windows
+    permissions:
+      id-token: write
+      contents: read
+    if: ${{ inputs.is_windows }}
+    uses: ./.github/workflows/build-and-test-windows.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }}
+    with:
+      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}}/C++${{matrix.std}}
+      build_script: "./ci/windows/build_${{ inputs.project_name }}.ps1 -std ${{matrix.std}}"
+      container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cuda${{matrix.cuda}}-${{matrix.compiler.name}}${{matrix.compiler.version}}-${{matrix.os}}
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
new file mode 100644
index 0000000..6ea22ab
--- /dev/null
+++ b/.github/workflows/pr.yml
@@ -0,0 +1,95 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is the main workflow that runs on every PR and push to main
+name: pr
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+on:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+
+# Only runs one instance of this workflow at a time for a given PR and cancels any in-progress runs when a new one starts.
+concurrency:
+  group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  pull-requests: read
+
+jobs:
+  compute-matrix:
+    name: Compute matrix
+    runs-on: ubuntu-latest
+    outputs:
+      DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}}
+      PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}}
+      PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+      - name: Compute matrix outputs
+        id: set-outputs
+        run: |
+          .github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request
+
+  nvbench:
+    name: NVBench CUDA${{ matrix.cuda_host_combination }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: compute-matrix
+    uses: ./.github/workflows/dispatch-build-and-test.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
+    with:
+      project_name: "nvbench"
+      per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
+      devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
+      is_windows: ${{ contains(matrix.cuda_host_combination, 'cl') }}
+
+  verify-devcontainers:
+    name: Verify Dev Containers
+    permissions:
+      id-token: write
+      contents: read
+    uses: ./.github/workflows/verify-devcontainers.yml
+
+  # This job is the final job that runs after all other jobs and is used for branch protection status checks.
+  # See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks
+  # https://github.com/orgs/community/discussions/26822#discussioncomment-5122101
+  ci:
+    runs-on: ubuntu-latest
+    name: CI
+    if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
+    needs:
+      - nvbench
+      - verify-devcontainers
+    steps:
+      - name: Check status of all precursor jobs
+        if: >-
+          ${{
+               contains(needs.*.result, 'failure')
+            || contains(needs.*.result, 'cancelled')
+            || contains(needs.*.result, 'skipped')
+          }}
+        run: exit 1
diff --git a/.github/workflows/run-as-coder.yml b/.github/workflows/run-as-coder.yml
new file mode 100644
index 0000000..29399b6
--- /dev/null
+++ b/.github/workflows/run-as-coder.yml
@@ -0,0 +1,67 @@
+name: Run as coder user
+
+defaults:
+  run:
+    shell: bash -exo pipefail {0}
+
+on:
+  workflow_call:
+    inputs:
+      name: {type: string, required: true}
+      image: {type: string, required: true}
+      runner: {type: string, required: true}
+      command: {type: string, required: true}
+      env: { type: string, required: false, default: "" }
+
+permissions:
+  contents: read
+
+jobs:
+  run-as-coder:
+    name: ${{inputs.name}}
+    permissions:
+      id-token: write
+      contents: read
+    runs-on: ${{inputs.runner}}
+    container:
+      options: -u root
+      image: ${{inputs.image}}
+      env:
+        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+        with:
+          path: nvbench
+          persist-credentials: false
+      - name: Move files to coder user home directory
+        run: |
+          cp -R nvbench /home/coder/nvbench
+          chown -R coder:coder /home/coder/
+      - name: Add NVCC problem matcher
+        run: |
+          echo "::add-matcher::nvbench/.github/problem-matchers/problem-matcher.json"
+      - name: Configure credentials and environment variables for sccache
+        uses: ./nvbench/.github/actions/configure_cccl_sccache
+      - name: Run command
+        shell: su coder {0}
+        run: |
+            set -eo pipefail
+            cd ~/nvbench
+            echo -e "\e[1;34mRunning as 'coder' user in $(pwd):\e[0m"
+            echo -e "\e[1;34m${{inputs.command}}\e[0m"
+            eval "${{inputs.command}}" || exit_code=$?
+            if [ ! -z "$exit_code" ]; then
+              echo -e "::group::️❗ \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m"
+              echo "::error:: To replicate this failure locally, follow the steps below:"
+              echo "1. Clone the repository, and navigate to the correct branch and commit:"
+              echo "   git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA"
+              echo ""
+              echo "2. Run the failed command inside the same Docker container used by the CI:"
+              echo "   docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}"
+              echo ""
+              echo "For additional information, see:"
+              echo "   - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md"
+              echo "   - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md"
+              exit $exit_code
+            fi
diff --git a/.github/workflows/verify-devcontainers.yml b/.github/workflows/verify-devcontainers.yml
new file mode 100644
index 0000000..6fea8ae
--- /dev/null
+++ b/.github/workflows/verify-devcontainers.yml
@@ -0,0 +1,94 @@
+name: Verify devcontainers
+
+on:
+  workflow_call:
+
+defaults:
+  run:
+    shell: bash -euo pipefail {0}
+
+permissions:
+  contents: read
+
+jobs:
+  verify-make-devcontainers:
+    name: Verify devcontainer files are up-to-date
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v3
+    - name: Setup jq and yq
+      run: |
+        sudo apt-get update
+        sudo apt-get install jq -y
+        sudo wget -O /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.34.2/yq_linux_amd64
+        sudo chmod +x /usr/local/bin/yq
+    - name: Run the script to generate devcontainer files
+      run: |
+        ./.devcontainer/make_devcontainers.sh --verbose
+    - name: Check for changes
+      run: |
+        if [[ $(git diff --stat) != '' || $(git status --porcelain | grep '^??') != '' ]]; then
+          git diff --minimal
+          git status --porcelain
+          echo "::error:: Dev Container files are out of date or there are untracked files. Run the .devcontainer/make_devcontainers.sh script and commit the changes."
+          exit 1
+        else
+          echo "::note::Dev Container files are up-to-date."
+        fi
+
+  get-devcontainer-list:
+    needs: verify-make-devcontainers
+    name: Get list of devcontainer.json files
+    runs-on: ubuntu-latest
+    outputs:
+      devcontainers: ${{ steps.get-list.outputs.devcontainers }}
+    steps:
+    - name: Check out the code
+      uses: actions/checkout@v3
+    - name: Get list of devcontainer.json paths and names
+      id: get-list
+      run: |
+        devcontainers=$(find .devcontainer/ -name 'devcontainer.json' | while read -r devcontainer; do
+          jq --arg path "$devcontainer" '{path: $path, name: .name}' "$devcontainer"
+          done | jq -s -c .)
+        echo "devcontainers=${devcontainers}" | tee --append "${GITHUB_OUTPUT}"
+
+  verify-devcontainers:
+    needs: get-devcontainer-list
+    name: ${{matrix.devcontainer.name}}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        devcontainer: ${{fromJson(needs.get-devcontainer-list.outputs.devcontainers)}}
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+    - name: Check out the code
+      uses: actions/checkout@v3
+      # devcontainer/ci doesn't supported nested devcontainer.json files, so we need to copy the devcontainer.json
+      # file to the top level .devcontainer/ directory
+    - name: Copy devcontainer.json to .devcontainer/
+      run: |
+        src="${{ matrix.devcontainer.path }}"
+        dst=".devcontainer/devcontainer.json"
+        if [[ "$src" != "$dst" ]]; then
+          cp "$src" "$dst"
+        fi
+      # We don't really need sccache configured, but we need the AWS credentials envvars to be set
+      # in order to avoid the devcontainer hanging waiting for GitHub authentication
+    - name: Configure credentials and environment variables for sccache
+      uses: ./.github/actions/configure_cccl_sccache
+    - name: Run in devcontainer
+      uses: devcontainers/ci@v0.3
+      with:
+        push: never
+        env: |
+          SCCACHE_REGION=${{ env.SCCACHE_REGION }}
+          AWS_ACCESS_KEY_ID=${{ env.AWS_ACCESS_KEY_ID }}
+          AWS_SESSION_TOKEN=${{ env.AWS_SESSION_TOKEN }}
+          AWS_SECRET_ACCESS_KEY=${{ env.AWS_SECRET_ACCESS_KEY }}
+        runCmd: |
+          .devcontainer/verify_devcontainer.sh
diff --git a/.gitignore b/.gitignore
index 20d94d8..d41aa02 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,8 @@
 build*/
+.aws
+.vscode
 .cache
+.config
 .idea
 cmake-build-*
 *~
diff --git a/CMakePresets.json b/CMakePresets.json
new file mode 100644
index 0000000..9dc92cd
--- /dev/null
+++ b/CMakePresets.json
@@ -0,0 +1,76 @@
+{
+  "version": 3,
+  "cmakeMinimumRequired": {
+    "major": 3,
+    "minor": 23,
+    "patch": 1
+  },
+  "configurePresets": [
+    {
+      "name": "base",
+      "hidden": true,
+      "generator": "Ninja",
+      "binaryDir": "${sourceDir}/build/$env{CCCL_BUILD_INFIX}/${presetName}",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_CUDA_ARCHITECTURES": "60;70;80",
+        "NVBench_ENABLE_CUPTI": true,
+        "NVBench_ENABLE_DEVICE_TESTING": false,
+        "NVBench_ENABLE_EXAMPLES": true,
+        "NVBench_ENABLE_INSTALL_RULES": true,
+        "NVBench_ENABLE_NVML": true,
+        "NVBench_ENABLE_TESTING": true,
+        "NVBench_ENABLE_WERROR": true
+      }
+    },
+    {
+      "name": "all-dev",
+      "inherits": "base",
+      "cacheVariables": {
+        "NVBench_ENABLE_DEVICE_TESTING": true
+      }
+    },
+    {
+      "name": "nvbench-cpp17",
+      "displayName": "nvbench_c++17",
+      "inherits": "base",
+      "cacheVariables": {
+        "CMAKE_CXX_STANDARD": "17",
+        "CMAKE_CUDA_STANDARD": "17"
+      }
+    }
+  ],
+  "buildPresets": [
+    {
+      "name": "all-dev",
+      "configurePreset": "all-dev"
+    },
+    {
+      "name": "nvbench-cpp17",
+      "configurePreset": "nvbench-cpp17"
+    }
+  ],
+  "testPresets": [
+    {
+      "name": "base",
+      "hidden": true,
+      "output": {
+        "outputOnFailure": true
+      },
+      "execution": {
+        "noTestsAction": "error",
+        "stopOnFailure": false
+      }
+    },
+    {
+      "name": "all-dev",
+      "configurePreset": "all-dev",
+      "inherits": "base"
+    },
+    {
+      "name": "nvbench-cpp17",
+      "configurePreset": "nvbench-cpp17",
+      "inherits": "base"
+    }
+  ]
+}
diff --git a/ci/axis/cpu.yml b/ci/axis/cpu.yml
deleted file mode 100644
index 7230b66..0000000
--- a/ci/axis/cpu.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-SDK_TYPE:
-  - cuda
-
-SDK_VER:
-  - 11.5.1-devel
-
-OS_TYPE:
-  - ubuntu
-
-OS_VER:
-  - 20.04
-
-CXX_TYPE:
-  - clang
-  - gcc
-
-CXX_VER:
-  - 5
-  - 6
-  - 7
-  - 8
-  - 9
-  - 10
-  - 11
-  - 12
-
-exclude:
-  - CXX_TYPE: clang
-    CXX_VER: 5
-  - CXX_TYPE: clang
-    CXX_VER: 6
-  - CXX_TYPE: gcc
-    CXX_VER: 12
diff --git a/ci/axis/gpu.yml b/ci/axis/gpu.yml
deleted file mode 100644
index 1531079..0000000
--- a/ci/axis/gpu.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-SDK_TYPE:
-  - cuda
-
-SDK_VER:
-  - 11.5.1-devel
-
-OS_TYPE:
-  - ubuntu
-
-OS_VER:
-  - 20.04
-
-CXX_TYPE:
-  - clang
-  - gcc
-
-CXX_VER:
-  - 11
-  - 12
-
-exclude:
-  - CXX_TYPE: clang
-    CXX_VER: 11
-  - CXX_TYPE: gcc
-    CXX_VER: 12
diff --git a/ci/build_common.sh b/ci/build_common.sh
new file mode 100755
index 0000000..ee95b00
--- /dev/null
+++ b/ci/build_common.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# Ensure the script is being executed in its containing directory
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
+
+# Script defaults
+HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++`
+CXX_STANDARD=17
+CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc`
+CUDA_ARCHS= # Empty, use presets by default.
+GLOBAL_CMAKE_OPTIONS=()
+DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks.
+
+# Check if the correct number of arguments has been provided
+function usage {
+    echo "Usage: $0 [OPTIONS]"
+    echo
+    echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores."
+    echo
+    echo "Options:"
+    echo "  -v/--verbose: enable shell echo for debugging"
+    echo "  -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)"
+    echo "  -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)"
+    echo "  -std: CUDA/C++ standard (Defaults to 17)"
+    echo "  -arch: Target CUDA arches, e.g. \"60-real;70;80-virtual\" (Defaults to value in presets file)"
+    echo "  -cmake-options: Additional options to pass to CMake"
+    echo
+    echo "Examples:"
+    echo "  $ PARALLEL_LEVEL=8 $0"
+    echo "  $ PARALLEL_LEVEL=8 $0 -cxx g++-9"
+    echo "  $ $0 -cxx clang++-8"
+    echo "  $ $0 -cxx g++-8 -std 20 -arch 80-real -v -cuda /usr/local/bin/nvcc"
+    echo "  $ $0 -cmake-options \"-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors\""
+    exit 1
+}
+
+# Parse options
+
+# Copy the args into a temporary array, since we will modify them and
+# the parent script may still need them.
+args=("$@")
+while [ "${#args[@]}" -ne 0 ]; do
+    case "${args[0]}" in
+    -v | --verbose) VERBOSE=1; args=("${args[@]:1}");;
+    -cxx)  HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");;
+    -std)  CXX_STANDARD="${args[1]}";  args=("${args[@]:2}");;
+    -cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");;
+    -arch) CUDA_ARCHS="${args[1]}";    args=("${args[@]:2}");;
+    -disable-benchmarks) DISABLE_CUB_BENCHMARKS=1; args=("${args[@]:1}");;
+    -cmake-options)
+        if [ -n "${args[1]}" ]; then
+            IFS=' ' read -ra split_args <<< "${args[1]}"
+            GLOBAL_CMAKE_OPTIONS+=("${split_args[@]}")
+            args=("${args[@]:2}")
+        else
+            echo "Error: No arguments provided for -cmake-options"
+            usage
+            exit 1
+        fi
+        ;;
+    -h | -help | --help) usage ;;
+    *) echo "Unrecognized option: ${args[0]}"; usage ;;
+    esac
+done
+
+# Convert to full paths:
+HOST_COMPILER=$(which ${HOST_COMPILER})
+CUDA_COMPILER=$(which ${CUDA_COMPILER})
+
+if [[ -n "${CUDA_ARCHS}" ]]; then
+    GLOBAL_CMAKE_OPTIONS+=("-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}")
+fi
+
+if [ $VERBOSE ]; then
+    set -x
+fi
+
+# Begin processing unsets after option parsing
+set -u
+
+readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)}
+
+if [ -z ${CCCL_BUILD_INFIX+x} ]; then
+    CCCL_BUILD_INFIX=""
+fi
+
+# Presets will be configured in this directory:
+BUILD_DIR="../build/${CCCL_BUILD_INFIX}"
+
+# The most recent build will always be symlinked to cccl/build/latest
+mkdir -p $BUILD_DIR
+rm -f ../build/latest
+ln -sf $BUILD_DIR ../build/latest
+
+# Now that BUILD_DIR exists, use readlink to canonicalize the path:
+BUILD_DIR=$(readlink -f "${BUILD_DIR}")
+
+# Prepare environment for CMake:
+export CMAKE_BUILD_PARALLEL_LEVEL="${PARALLEL_LEVEL}"
+export CTEST_PARALLEL_LEVEL="1"
+export CXX="${HOST_COMPILER}"
+export CUDACXX="${CUDA_COMPILER}"
+export CUDAHOSTCXX="${HOST_COMPILER}"
+export CXX_STANDARD
+
+source ./pretty_printing.sh
+
+print_environment_details() {
+  begin_group "⚙️ Environment Details"
+
+  echo "pwd=$(pwd)"
+
+  print_var_values \
+      BUILD_DIR \
+      CXX_STANDARD \
+      CXX \
+      CUDACXX \
+      CUDAHOSTCXX \
+      NVCC_VERSION \
+      CMAKE_BUILD_PARALLEL_LEVEL \
+      CTEST_PARALLEL_LEVEL \
+      CCCL_BUILD_INFIX \
+      GLOBAL_CMAKE_OPTIONS
+
+  echo "Current commit is:"
+  git log -1 || echo "Not a repository"
+
+  if command -v nvidia-smi &> /dev/null; then
+    nvidia-smi
+  else
+    echo "nvidia-smi not found"
+  fi
+
+  end_group "⚙️ Environment Details"
+}
+
+fail_if_no_gpu() {
+    if ! nvidia-smi &> /dev/null; then
+        echo "Error: No NVIDIA GPU detected. Please ensure you have an NVIDIA GPU installed and the drivers are properly configured." >&2
+        exit 1
+    fi
+}
+
+function print_test_time_summary()
+{
+    ctest_log=${1}
+
+    if [ -f ${ctest_log} ]; then
+        begin_group "⏱️ Longest Test Steps"
+        # Only print the full output in CI:
+        if [ -n "${GITHUB_ACTIONS:-}" ]; then
+            cmake -DLOGFILE=${ctest_log} -P ../cmake/PrintCTestRunTimes.cmake
+        else
+            cmake -DLOGFILE=${ctest_log} -P ../cmake/PrintCTestRunTimes.cmake | head -n 15
+        fi
+        end_group "⏱️ Longest Test Steps"
+    fi
+}
+
+function configure_preset()
+{
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local CMAKE_OPTIONS=$3
+    local GROUP_NAME="🛠️  CMake Configure ${BUILD_NAME}"
+
+    pushd .. > /dev/null
+    run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE "${GLOBAL_CMAKE_OPTIONS[@]}" $CMAKE_OPTIONS
+    status=$?
+    popd > /dev/null
+    return $status
+}
+
+function build_preset() {
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local green="1;32"
+    local red="1;31"
+    local GROUP_NAME="🏗️  Build ${BUILD_NAME}"
+
+    source "./sccache_stats.sh" "start"
+
+    pushd .. > /dev/null
+    run_command "$GROUP_NAME" cmake --build --preset=$PRESET -v
+    status=$?
+    popd > /dev/null
+
+    minimal_sccache_stats=$(source "./sccache_stats.sh" "end")
+
+    # Only print detailed stats in actions workflow
+    if [ -n "${GITHUB_ACTIONS:-}" ]; then
+        begin_group "💲 sccache stats"
+        echo "${minimal_sccache_stats}"
+        sccache -s
+        end_group
+
+        begin_group "🥷 ninja build times"
+        echo "The "weighted" time is the elapsed time of each build step divided by the number
+              of tasks that were running in parallel. This makes it an excellent approximation
+              of how "important" a slow step was. A link that is entirely or mostly serialized
+              will have a weighted time that is the same or similar to its elapsed time. A
+              compile that runs in parallel with 999 other compiles will have a weighted time
+              that is tiny."
+        ./ninja_summary.py -C ${BUILD_DIR}/${PRESET}
+        end_group
+    else
+      echo $minimal_sccache_stats
+    fi
+
+    return $status
+}
+
+function test_preset()
+{
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local GROUP_NAME="🚀  Test ${BUILD_NAME}"
+
+    fail_if_no_gpu
+
+
+    ctest_log_dir="${BUILD_DIR}/log/ctest"
+    ctest_log="${ctest_log_dir}/${PRESET}"
+    mkdir -p "${ctest_log_dir}"
+
+    pushd .. > /dev/null
+    run_command "$GROUP_NAME" ctest --output-log "${ctest_log}" --preset=$PRESET
+    status=$?
+    popd > /dev/null
+
+    print_test_time_summary ${ctest_log}
+
+    return $status
+}
+
+function configure_and_build_preset()
+{
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local CMAKE_OPTIONS=$3
+
+    configure_preset "$BUILD_NAME" "$PRESET" "$CMAKE_OPTIONS"
+    build_preset "$BUILD_NAME" "$PRESET"
+}
diff --git a/ci/build_nvbench.sh b/ci/build_nvbench.sh
new file mode 100755
index 0000000..ecd0628
--- /dev/null
+++ b/ci/build_nvbench.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+source "$(dirname "$0")/build_common.sh"
+
+print_environment_details
+
+PRESET="nvbench-cpp$CXX_STANDARD"
+
+CMAKE_OPTIONS=""
+
+configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
+
+print_time_summary
diff --git a/ci/common/build.bash b/ci/common/build.bash
deleted file mode 100755
index 61b3654..0000000
--- a/ci/common/build.bash
+++ /dev/null
@@ -1,231 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench build script for gpuCI
-################################################################################
-
-set -e
-
-# append variable value
-# Appends ${value} to ${variable}, adding a space before ${value} if
-# ${variable} is not empty.
-function append {
-  tmp="${!1:+${!1} }${2}"
-  eval "${1}=\${tmp}"
-}
-
-# log args...
-# Prints out ${args[*]} with a gpuCI log prefix and a newline before and after.
-function log() {
-  printf "\n>>>> %s\n\n" "${*}"
-}
-
-# print_with_trailing_blank_line args...
-# Prints ${args[*]} with one blank line following, preserving newlines within
-# ${args[*]} but stripping any preceding ${args[*]}.
-function print_with_trailing_blank_line {
-  printf "%s\n\n" "${*}"
-}
-
-# echo_and_run name args...
-# Echo ${args[@]}, then execute ${args[@]}
-function echo_and_run {
-  echo "${1}: ${@:2}"
-  ${@:2}
-}
-
-# echo_and_run_timed name args...
-# Echo ${args[@]}, then execute ${args[@]} and report how long it took,
-# including ${name} in the output of the time.
-function echo_and_run_timed {
-  echo "${@:2}"
-  TIMEFORMAT=$'\n'"${1} Time: %lR"
-  time ${@:2}
-}
-
-# join_delimit <delimiter> [value [value [...]]]
-# Combine all values into a single string, separating each by a single character
-# delimiter. Eg:
-# foo=(bar baz kramble)
-# joined_foo=$(join_delimit "|" "${foo[@]}")
-# echo joined_foo # "bar|baz|kramble"
-function join_delimit {
-  local IFS="${1}"
-  shift
-  echo "${*}"
-}
-
-################################################################################
-# VARIABLES - Set up bash and environmental variables.
-################################################################################
-
-# Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc.
-source /etc/cccl.bashrc
-
-# Set path.
-export PATH=/usr/local/cuda/bin:${PATH}
-
-# Set home to the job's workspace.
-export HOME=${WORKSPACE}
-
-# Switch to the build directory.
-cd ${WORKSPACE}
-mkdir -p build
-cd build
-
-# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate:
-rm -f .ninja_log
-
-if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then
-  CMAKE_BUILD_TYPE="Release"
-fi
-
-CMAKE_BUILD_FLAGS="--"
-
-# The Docker image sets up `${CXX}` and `${CUDACXX}`.
-append CMAKE_FLAGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
-append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER='${CUDACXX}'"
-
-if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
-  echo "nvc++ not supported."
-  exit 1
-else
-  if [[ "${CXX_TYPE}" == "icc" ]]; then
-    echo "icc not supported."
-    exit 1
-  fi
-  # We're using NVCC so we need to set the host compiler.
-  append CMAKE_FLAGS "-DCMAKE_CXX_COMPILER='${CXX}'"
-  append CMAKE_FLAGS "-DCMAKE_CUDA_HOST_COMPILER='${CXX}'"
-  append CMAKE_FLAGS "-G Ninja"
-  # Don't stop on build failures.
-  append CMAKE_BUILD_FLAGS "-k0"
-fi
-
-if [[ -n "${PARALLEL_LEVEL}" ]]; then
-  DETERMINE_PARALLELISM_FLAGS="-j ${PARALLEL_LEVEL}"
-fi
-
-WSL=0
-if [[ $(grep -i microsoft /proc/version) ]]; then
-  echo "Windows Subsystem for Linux detected."
-  WSL=1
-fi
-export WSL
-
-#append CMAKE_FLAGS "-DCMAKE_CUDA_ARCHITECTURES=all"
-
-append CMAKE_FLAGS "-DNVBench_ENABLE_EXAMPLES=ON"
-append CMAKE_FLAGS "-DNVBench_ENABLE_TESTING=ON"
-append CMAKE_FLAGS "-DNVBench_ENABLE_CUPTI=ON"
-append CMAKE_FLAGS "-DNVBench_ENABLE_WERROR=ON"
-
-# These consume a lot of time and don't currently have
-# any value as regression tests.
-append CMAKE_FLAGS "-DNVBench_ENABLE_DEVICE_TESTING=OFF"
-
-# NVML doesn't work under WSL
-if [[ ${WSL} -eq 0 ]]; then
-  append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=ON"
-else
-  append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=OFF"
-fi
-
-if [[ -n "${@}" ]]; then
-  append CMAKE_BUILD_FLAGS "${@}"
-fi
-
-append CTEST_FLAGS "--output-on-failure"
-
-# Export variables so they'll show up in the logs when we report the environment.
-export CMAKE_FLAGS
-export CMAKE_BUILD_FLAGS
-export CTEST_FLAGS
-
-################################################################################
-# ENVIRONMENT - Configure and print out information about the environment.
-################################################################################
-
-log "Determine system topology..."
-
-# Set `${PARALLEL_LEVEL}` if it is unset; otherwise, this just reports the
-# system topology.
-source ${WORKSPACE}/ci/common/determine_build_parallelism.bash ${DETERMINE_PARALLELISM_FLAGS}
-
-log "Get environment..."
-
-env | sort
-
-log "Check versions..."
-
-# We use sed and echo below to ensure there is always one and only trailing
-# line following the output from each tool.
-
-${CXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
-
-echo
-
-${CUDACXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
-
-echo
-
-cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
-
-echo
-
-if [[ "${BUILD_TYPE}" == "gpu" ]]; then
-  nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/'
-fi
-
-################################################################################
-# BUILD
-################################################################################
-
-log "Configure..."
-
-echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS}
-configure_status=$?
-
-log "Build..."
-
-# ${PARALLEL_LEVEL} needs to be passed after we run
-# determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}.
-set +e # Don't stop on build failures.
-echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL}
-build_status=$?
-set -e
-
-################################################################################
-# TEST - Run examples and tests.
-################################################################################
-
-log "Test..."
-
-(
-  # Make sure test_status captures ctest, not tee:
-  # https://stackoverflow.com/a/999259/11130318
-  set -o pipefail
-  echo_and_run_timed "Test" ctest ${CTEST_FLAGS} -j ${PARALLEL_LEVEL} | tee ctest_log
-)
-
-test_status=$?
-
-################################################################################
-# SUMMARY - Print status of each step and exit with failure if needed.
-################################################################################
-
-log "Summary:"
-echo "- Configure Error Code: ${configure_status}"
-echo "- Build Error Code: ${build_status}"
-echo "- Test Error Code: ${test_status}"
-
-if [[ "${configure_status}" != "0" ]] || \
-   [[ "${build_status}" != "0" ]] || \
-   [[ "${test_status}" != "0" ]]; then
-     exit 1
-fi
diff --git a/ci/common/determine_build_parallelism.bash b/ci/common/determine_build_parallelism.bash
deleted file mode 100755
index 1a1cf4c..0000000
--- a/ci/common/determine_build_parallelism.bash
+++ /dev/null
@@ -1,119 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-function usage {
-  echo "Usage: ${0} [flags...]"
-  echo
-  echo "Examine the system topology to determine a reasonable amount of build"
-  echo "parallelism."
-  echo
-  echo "Exported variables:"
-  echo "  \${LOGICAL_CPUS}          : Logical processors (e.g. threads)."
-  echo "  \${PHYSICAL_CPUS}         : Physical processors (e.g. cores)."
-  echo "  \${TOTAL_MEM}             : Total system memory [GB]."
-  echo "  \${MAX_THREADS_PER_CORE}  : Maximum threads per core allowed."
-  echo "  \${MIN_MEMORY_PER_THREAD} : Minimum memory [GB] per thread allowed."
-  echo "  \${CPU_BOUND_THREADS}     : # of build threads constrained by processors."
-  echo "  \${MEM_BOUND_THREADS}     : # of build threads constrained by memory [GB]."
-  echo "  \${PARALLEL_LEVEL}        : Determined # of build threads."
-  echo "  \${MEM_PER_THREAD}        : Memory [GB] per build thread."
-  echo
-  echo "-h, -help, --help"
-  echo "  Print this message."
-  echo
-  echo "-q, --quiet"
-  echo "  Print nothing and only export variables."
-  echo
-  echo "-j <threads>, --jobs <threads>"
-  echo "  Explicitly set the number of build threads to use."
-  echo
-  echo "--max-threads-per-core <threads>"
-  echo "  Specify the maximum threads per core allowed (default: ${MAX_THREADS_PER_CORE} [threads/core])."
-  echo
-  echo "--min-memory-per-thread <gigabytes>"
-  echo "  Specify the minimum memory per thread allowed (default: ${MIN_MEMORY_PER_THREAD} [GBs/thread])."
-
-  exit -3
-}
-
-QUIET=0
-
-export MAX_THREADS_PER_CORE=2
-export MIN_MEMORY_PER_THREAD=1 # [GB]
-
-while test ${#} != 0
-do
-  case "${1}" in
-  -h) ;&
-  -help) ;&
-  --help) usage ;;
-  -q) ;&
-  --quiet) QUIET=1 ;;
-  -j) ;&
-  --jobs)
-    shift # The next argument is the number of threads.
-    PARALLEL_LEVEL="${1}"
-    ;;
-  --max-threads-per-core)
-    shift # The next argument is the number of threads per core.
-    MAX_THREADS_PER_CORE="${1}"
-    ;;
-  --min-memory-per-thread)
-    shift # The next argument is the amount of memory per thread.
-    MIN_MEMORY_PER_THREAD="${1}"
-    ;;
-  esac
-  shift
-done
-
-# https://stackoverflow.com/a/23378780
-if [ $(uname) == "Darwin" ]; then
-  export LOGICAL_CPUS=$(sysctl -n hw.logicalcpu_max)
-  export PHYSICAL_CPUS=$(sysctl -n hw.physicalcpu_max)
-else
-  export LOGICAL_CPUS=$(lscpu -p | egrep -v '^#' | wc -l)
-  export PHYSICAL_CPUS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
-fi
-
-export TOTAL_MEM=$(awk "BEGIN { printf \"%0.4g\", $(grep MemTotal /proc/meminfo | awk '{ print $2 }') / (1024 * 1024) }")
-
-export CPU_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${PHYSICAL_CPUS} * ${MAX_THREADS_PER_CORE}) }")
-export MEM_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${TOTAL_MEM} / ${MIN_MEMORY_PER_THREAD}) }")
-
-if [[ -z "${PARALLEL_LEVEL}" ]]; then
-  # Pick the smaller of the two as the default.
-  if [[ "${MEM_BOUND_THREADS}" -lt "${CPU_BOUND_THREADS}" ]]; then
-    export PARALLEL_LEVEL=${MEM_BOUND_THREADS}
-  else
-    export PARALLEL_LEVEL=${CPU_BOUND_THREADS}
-  fi
-else
-  EXPLICIT_PARALLEL_LEVEL=1
-fi
-
-# This can be a floating point number.
-export MEM_PER_THREAD=$(awk "BEGIN { printf \"%.04g\", ${TOTAL_MEM} / ${PARALLEL_LEVEL} }")
-
-if [[ "${QUIET}" == 0 ]]; then
-  echo    "Logical CPUs:           ${LOGICAL_CPUS} [threads]"
-  echo    "Physical CPUs:          ${PHYSICAL_CPUS} [cores]"
-  echo    "Total Mem:              ${TOTAL_MEM} [GBs]"
-  echo    "Max Threads Per Core:   ${MAX_THREADS_PER_CORE} [threads/core]"
-  echo    "Min Memory Per Threads: ${MIN_MEMORY_PER_THREAD} [GBs/thread]"
-  echo    "CPU Bound Threads:      ${CPU_BOUND_THREADS} [threads]"
-  echo    "Mem Bound Threads:      ${MEM_BOUND_THREADS} [threads]"
-
-  echo -n "Parallel Level:         ${PARALLEL_LEVEL} [threads]"
-  if [[ -n "${EXPLICIT_PARALLEL_LEVEL}" ]]; then
-    echo " (explicitly set)"
-  else
-    echo
-  fi
-
-  echo    "Mem Per Thread:         ${MEM_PER_THREAD} [GBs/thread]"
-fi
-
diff --git a/ci/cpu/build.bash b/ci/cpu/build.bash
deleted file mode 100755
index edf1ba3..0000000
--- a/ci/cpu/build.bash
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench build script for gpuCI (CPU-only)
-################################################################################
-
-export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
-
-source ${WORKSPACE}/ci/common/build.bash
diff --git a/ci/gpu/build.bash b/ci/gpu/build.bash
deleted file mode 100755
index 9f6fc01..0000000
--- a/ci/gpu/build.bash
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench build script for gpuCI (heterogeneous)
-################################################################################
-
-export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
-
-source ${WORKSPACE}/ci/common/build.bash
diff --git a/ci/local/build.bash b/ci/local/build.bash
deleted file mode 100755
index 60d22de..0000000
--- a/ci/local/build.bash
+++ /dev/null
@@ -1,215 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench local containerized build script
-################################################################################
-
-function usage {
-  echo "Usage: ${0} [flags...] [cmake-targets...]"
-  echo
-  echo "Build and test your local repository using a gpuCI Docker image."
-  echo "If CMake targets are specified, only those targets are built and tested."
-  echo "Otherwise, everything is built and tested."
-  echo
-  echo "-h, -help, --help"
-  echo "  Print this message."
-  echo
-  echo "-r <path>, --repository <path>"
-  echo "  Path to the repository (default: ${REPOSITORY_PATH})."
-  echo
-  echo "-i <image>, --image <image>"
-  echo "  Docker image to use (default: ${IMAGE})"
-  echo
-  echo "-l, --local-image"
-  echo "  Use the local version of the image instead of pulling from Docker hub."
-  echo
-  echo "-s, --shell-only"
-  echo "  Skip building and testing and launch an interactive shell instead."
-  echo
-  echo "-d, --disable-gpus"
-  echo "  Don't start the container with the NVIDIA runtime and GPUs attached."
-  echo
-  echo "-c, --clean"
-  echo "  If the build directory already exists, delete it."
-  echo
-  echo "-j <threads>, --jobs <threads>"
-  echo "  Number of threads to use when building (default: inferred)."
-  echo
-  echo "-b <type>, --cmake-build-type <plan>"
-  echo "  CMake build type to use, either Release, RelWithDebInfo, or Debug"
-  echo "  (default: ${CMAKE_BUILD_TYPE})."
-  echo
-
-  exit -3
-}
-
-SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
-
-REPOSITORY_PATH=$(realpath ${SCRIPT_PATH}/../..)
-
-################################################################################
-# FLAGS - Process command line flags.
-################################################################################
-
-IMAGE="gpuci/cccl:cuda11.5.1-devel-ubuntu20.04-gcc9"
-
-LOCAL_IMAGE=0
-
-SHELL_ONLY=0
-
-BUILD_TYPE="gpu"
-
-CLEAN=0
-
-PARALLEL_LEVEL=""
-
-CMAKE_BUILD_TYPE="Release"
-
-TARGETS=""
-
-while test ${#} != 0
-do
-  case "${1}" in
-  -h) ;&
-  -help) ;&
-  --help) usage ;;
-  -r) ;&
-  --repository)
-    shift # The next argument is the path.
-    REPOSITORY_PATH="${1}"
-    ;;
-  -i) ;&
-  --image)
-    shift # The next argument is the image.
-    IMAGE="${1}"
-    ;;
-  -l) ;&
-  --local-image) LOCAL_IMAGE=1 ;;
-  -s) ;&
-  --shell-only) SHELL_ONLY=1 ;;
-  -d) ;&
-  --disable-gpus) BUILD_TYPE="cpu" ;;
-  -c) ;&
-  --clean) CLEAN=1 ;;
-  -j) ;&
-  --jobs)
-    shift # The next argument is the number of threads.
-    PARALLEL_LEVEL="${1}"
-    ;;
-  -b) ;&
-  --cmake-build-type)
-    shift # The next argument is the build type.
-    CMAKE_BUILD_TYPE="${1}"
-    ;;
-  *)
-    TARGETS="${TARGETS:+${TARGETS} }${1}"
-    ;;
-  esac
-  shift
-done
-
-################################################################################
-# PATHS - Setup paths for the container.
-################################################################################
-
-# ${REPOSITORY_PATH} is the local filesystem path to the Git repository being
-# built and tested. It can be set with the --repository flag.
-#
-# ${BUILD_PATH} is the local filesystem path that will be used for the build. It
-# is named after the image name, allowing multiple image builds to coexist on
-# the local filesystem.
-#
-# ${REPOSITORY_PATH_IN_CONTAINER} is the location of ${REPOSITORY_PATH} inside
-# the container.
-#
-# ${BUILD_PATH_IN_CONTAINER} is the location of ${BUILD_PATH} inside the
-# container.
-
-BUILD_PATH=${REPOSITORY_PATH}/build_$(echo "$(basename "${IMAGE}")" | sed -e 's/:/_/g' | sed -e 's/-/_/g')
-
-if [[ "${CLEAN}" != 0 ]]; then
-  rm -rf ${BUILD_PATH}
-fi
-
-mkdir -p ${BUILD_PATH}
-
-BASE_PATH_IN_CONTAINER="/cccl"
-
-REPOSITORY_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")"
-
-BUILD_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")/build"
-
-################################################################################
-# ENVIRONMENT - Setup the thunk build script that will be run by the container.
-################################################################################
-
-# We have to run `ldconfig` to rebuild `ld.so.cache` to work around this
-# failure on Debian: https://github.com/NVIDIA/nvidia-docker/issues/1399
-
-COMMAND="sudo ldconfig; sudo ldconfig"
-if [[ "${SHELL_ONLY}" != 0 ]]; then
-  COMMAND="${COMMAND}; bash"
-else
-  COMMAND="${COMMAND}; ${REPOSITORY_PATH_IN_CONTAINER}/ci/common/build.bash ${TARGETS} || bash"
-fi
-
-################################################################################
-# GPU - Setup GPUs.
-################################################################################
-
-# Note: We always start docker with --gpus, even for cpu builds. Otherwise
-# libcuda.so.1 is not present and no NVBench tests are able to run.
-
-# Limit GPUs available to the container based on ${CUDA_VISIBLE_DEVICES}.
-if [[ -z "${CUDA_VISIBLE_DEVICES}" ]]; then
-  VISIBLE_DEVICES="all"
-else
-  VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}"
-fi
-
-DOCKER_MAJOR_VER=$(docker -v | sed 's/[^[0-9]*\([0-9]*\).*/\1/')
-GPU_OPTS="--gpus device=${VISIBLE_DEVICES}"
-if [[ "${DOCKER_MAJOR_VER}" -lt 19 ]]
-then
-  GPU_OPTS="--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES='${VISIBLE_DEVICES}'"
-fi
-
-################################################################################
-# LAUNCH - Pull and launch the container.
-################################################################################
-
-#NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia)
-NVIDIA_DOCKER_INSTALLED=1 # Broken on WSL
-if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then
-  echo "NVIDIA Docker not found, please install it: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce"
-  exit -4
-fi
-
-if [[ "${LOCAL_IMAGE}" == 0 ]]; then
-  docker pull "${IMAGE}"
-fi
-
-docker run --rm -it ${GPU_OPTS} \
-  --cap-add=SYS_PTRACE \
-  --user "$(id -u)":"$(id -g)" \
-  -v "${REPOSITORY_PATH}":"${REPOSITORY_PATH_IN_CONTAINER}" \
-  -v "${BUILD_PATH}":"${BUILD_PATH_IN_CONTAINER}" \
-  -v /etc/passwd:/etc/passwd:ro \
-  -v /etc/group:/etc/group:ro \
-  -v /etc/subuid:/etc/subuid:ro \
-  -v /etc/subgid:/etc/subgid:ro \
-  -v /etc/shadow:/etc/shadow:ro \
-  -v /etc/gshadow:/etc/gshadow:ro \
-  -e "WORKSPACE=${REPOSITORY_PATH_IN_CONTAINER}" \
-  -e "BUILD_TYPE=${BUILD_TYPE}" \
-  -e "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" \
-  -e "COVERAGE_PLAN=${COVERAGE_PLAN}" \
-  -e "PARALLEL_LEVEL=${PARALLEL_LEVEL}" \
-  -w "${BUILD_PATH_IN_CONTAINER}" \
-  "${IMAGE}" bash -c "${COMMAND}"
-
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
new file mode 100644
index 0000000..e6054fd
--- /dev/null
+++ b/ci/matrix.yaml
@@ -0,0 +1,85 @@
+
+cuda_prev_min: &cuda_prev_min '11.1'
+cuda_prev_max:  &cuda_prev_max  '11.8'
+cuda_curr: &cuda_curr '12.4'
+
+# The GPUs to test on
+gpus:
+  - 'a100'
+  - 'v100'
+
+# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
+devcontainer_version: '24.06'
+
+# gcc compiler configurations
+gcc6: &gcc6 { name: 'gcc', version: '6', exe: 'g++' }
+gcc7: &gcc7 { name: 'gcc', version: '7', exe: 'g++' }
+gcc8: &gcc8 { name: 'gcc', version: '8', exe: 'g++' }
+gcc9: &gcc9 { name: 'gcc', version: '9', exe: 'g++' }
+gcc10: &gcc10 { name: 'gcc', version: '10', exe: 'g++' }
+gcc11: &gcc11 { name: 'gcc', version: '11', exe: 'g++' }
+gcc12: &gcc12 { name: 'gcc', version: '12', exe: 'g++' }
+gcc-oldest: &gcc-oldest { name: 'gcc', version: '6', exe: 'g++' }
+gcc-newest: &gcc-newest { name: 'gcc', version: '12', exe: 'g++' }
+
+# LLVM Compiler configurations
+llvm9: &llvm9 { name: 'llvm', version: '9', exe: 'clang++' }
+llvm10: &llvm10 { name: 'llvm', version: '10', exe: 'clang++' }
+llvm11: &llvm11 { name: 'llvm', version: '11', exe: 'clang++' }
+llvm12: &llvm12 { name: 'llvm', version: '12', exe: 'clang++' }
+llvm13: &llvm13 { name: 'llvm', version: '13', exe: 'clang++' }
+llvm14: &llvm14 { name: 'llvm', version: '14', exe: 'clang++' }
+llvm15: &llvm15 { name: 'llvm', version: '15', exe: 'clang++' }
+llvm16: &llvm16 { name: 'llvm', version: '16', exe: 'clang++' }
+llvm-oldest: &llvm-oldest { name: 'llvm', version: '9', exe: 'clang++' }
+llvm-newest: &llvm-newest { name: 'llvm', version: '16', exe: 'clang++' }
+
+# MSVC configs
+msvc2017: &msvc2017 { name: 'cl', version: '14.16', exe: 'cl++' }
+msvc2019: &msvc2019 { name: 'cl', version: '14.29', exe: 'cl++' }
+msvc2022: &msvc2022 { name: 'cl', version: '14.39', exe: 'cl++' }
+
+# oneAPI configs
+oneapi: &oneapi { name: 'oneapi', version: '2023.2.0', exe: 'icpc' }
+
+# Each environment below will generate a unique build/test job
+# See the "compute-matrix" job in the workflow for how this is parsed and used
+# cuda: The CUDA Toolkit version
+# os: The operating system used
+# cpu: The CPU architecture
+# compiler: The compiler to use
+#   name: The compiler name
+#   version: The compiler version
+#   exe: The unverionsed compiler binary name
+# std: The C++ standards to build for
+#    This field is unique as it will generate an independent build/test job for each value
+
+# Configurations that will run for every PR
+pull_request:
+  nvcc:
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_min, os: 'windows2022', cpu: 'amd64', compiler: *msvc2017, std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17],     jobs: ['build'], extra_build_args: '-cmake-options -DCMAKE_CUDA_ARCHITECTURES=90'}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17],     jobs: ['build'], extra_build_args: '-cmake-options -DCMAKE_CUDA_ARCHITECTURES=90a'}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17],     jobs: ['build', 'test']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'arm64', compiler: *gcc12,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10,   std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16,   std: [17], jobs: ['build', 'test']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'arm64', compiler: *llvm16,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *oneapi,   std: [17],     jobs: ['build']}
diff --git a/ci/pretty_printing.sh b/ci/pretty_printing.sh
new file mode 100644
index 0000000..5bea1af
--- /dev/null
+++ b/ci/pretty_printing.sh
@@ -0,0 +1,105 @@
+# Print "ARG=${ARG}" for all args.
+function print_var_values() {
+    # Iterate through the arguments
+    for var_name in "$@"; do
+        if [ -z "$var_name" ]; then
+            echo "Usage: print_var_values <variable_name1> <variable_name2> ..."
+            return 1
+        fi
+
+        # Dereference the variable and print the result
+        echo "$var_name=${!var_name:-(undefined)}"
+    done
+}
+
+# begin_group: Start a named section of log output, possibly with color.
+# Usage: begin_group "Group Name" [Color]
+#   Group Name: A string specifying the name of the group.
+#   Color (optional): ANSI color code to set text color. Default is blue (1;34).
+function begin_group() {
+    # See options for colors here: https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124
+    local blue="34"
+    local name="${1:-}"
+    local color="${2:-$blue}"
+
+    if [ -n "${GITHUB_ACTIONS:-}" ]; then
+        echo -e "::group::\e[${color}m${name}\e[0m"
+    else
+        echo -e "\e[${color}m================== ${name} ======================\e[0m"
+    fi
+}
+
+# end_group: End a named section of log output and print status based on exit status.
+# Usage: end_group "Group Name" [Exit Status]
+#   Group Name: A string specifying the name of the group.
+#   Exit Status (optional): The exit status of the command run within the group. Default is 0.
+function end_group() {
+    local name="${1:-}"
+    local build_status="${2:-0}"
+    local duration="${3:-}"
+    local red="31"
+    local blue="34"
+
+    if [ -n "${GITHUB_ACTIONS:-}" ]; then
+        echo "::endgroup::"
+
+        if [ "$build_status" -ne 0 ]; then
+            echo -e "::error::\e[${red}m ${name} - Failed (⬆️ click above for full log ⬆️)\e[0m"
+        fi
+    else
+        if [ "$build_status" -ne 0 ]; then
+            echo -e "\e[${red}m================== End ${name} - Failed${duration:+ - Duration: ${duration}s} ==================\e[0m"
+        else
+            echo -e "\e[${blue}m================== End ${name} - Success${duration:+ - Duration: ${duration}s} ==================\n\e[0m"
+        fi
+    fi
+}
+
+declare -A command_durations
+
+# Runs a command within a named group, handles the exit status, and prints appropriate messages based on the result.
+# Usage: run_command "Group Name" command [arguments...]
+function run_command() {
+    local group_name="${1:-}"
+    shift
+    local command=("$@")
+    local status
+
+    begin_group "$group_name"
+    set +e
+    local start_time=$(date +%s)
+    "${command[@]}"
+    status=$?
+    local end_time=$(date +%s)
+    set -e
+    local duration=$((end_time - start_time))
+    end_group "$group_name" $status $duration
+    command_durations["$group_name"]=$duration
+    return $status
+}
+
+function string_width() {
+    local str="$1"
+    echo "$str" | awk '{print length}'
+}
+
+function print_time_summary() {
+    local max_length=0
+    local group
+
+    # Find the longest group name for formatting
+    for group in "${!command_durations[@]}"; do
+        local group_length=$(echo "$group" | awk '{print length}')
+        if [ "$group_length" -gt "$max_length" ]; then
+            max_length=$group_length
+        fi
+    done
+
+    echo "Time Summary:"
+    for group in "${!command_durations[@]}"; do
+        printf "%-${max_length}s : %s seconds\n" "$group" "${command_durations[$group]}"
+    done
+
+    # Clear the array of timing info
+    declare -gA command_durations=()
+}
diff --git a/ci/sccache_hit_rate.sh b/ci/sccache_hit_rate.sh
new file mode 100755
index 0000000..de8ae46
--- /dev/null
+++ b/ci/sccache_hit_rate.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Ensure two arguments are provided
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <before-file> <after-file>" >&2
+  exit 1
+fi
+
+# Print the contents of the before file
+echo "=== Contents of $1 ===" >&2
+cat $1 >&2
+echo "=== End of $1 ===" >&2
+
+# Print the contents of the after file
+echo "=== Contents of $2 ==="  >&2
+cat $2 >&2
+echo "=== End of $2 ===" >&2
+
+# Extract compile requests and cache hits from the before and after files
+requests_before=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$1")
+hits_before=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$1")
+requests_after=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$2")
+hits_after=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$2")
+
+# Calculate the differences to find out how many new requests and hits
+requests_diff=$((requests_after - requests_before))
+hits_diff=$((hits_after - hits_before))
+
+echo "New Compile Requests: $requests_diff" >&2
+echo "New Hits: $hits_diff" >&2
+
+# Calculate and print the hit rate
+if [ $requests_diff -eq 0 ]; then
+    echo "No new compile requests, hit rate is not applicable"
+else
+    hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}')
+    echo "sccache hit rate: $hit_rate%" >&2
+    echo "$hit_rate" 
+fi
diff --git a/ci/sccache_stats.sh b/ci/sccache_stats.sh
new file mode 100755
index 0000000..3a3ebc4
--- /dev/null
+++ b/ci/sccache_stats.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# This script prints the sccache hit rate between two calls to sccache --show-stats.
+# It should be sourced in your script before and after the operations you want to profile,
+# with the 'start' or 'end' argument respectively.
+
+mode=$1
+
+if [[ "$mode" != "start" && "$mode" != "end" ]]; then
+    echo "Invalid mode: $mode"
+    echo "Usage: $0 {start|end}"
+    exit 1
+fi
+
+# Check if sccache is available
+if ! command -v sccache &> /dev/null; then
+    echo "Notice: sccache is not available. Skipping..."
+    exit 0
+fi
+
+case $mode in
+  start)
+    export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
+    export SCCACHE_START_MISSES=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
+    ;;
+  end)
+    if [[ -z ${SCCACHE_START_HITS+x} || -z ${SCCACHE_START_MISSES+x} ]]; then
+        echo "Error: start stats not collected. Did you call this script with 'start' before your operations?"
+        exit 1
+    fi
+
+    final_hits=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
+    final_misses=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
+    hits=$((final_hits - SCCACHE_START_HITS))
+    misses=$((final_misses - SCCACHE_START_MISSES))
+    total=$((hits + misses))
+
+    prefix=""
+    if [ ${GITHUB_ACTIONS:-false} = "true" ]; then
+      prefix="::notice::"
+    fi
+
+    if (( total > 0 )); then
+      hit_rate=$(awk -v hits="$hits" -v total="$total" 'BEGIN { printf "%.2f", (hits / total) * 100 }')
+      echo ${prefix}"sccache hits: $hits | misses: $misses | hit rate: $hit_rate%"
+    else
+      echo ${prefix}"sccache stats: N/A No new compilation requests"
+    fi
+    unset SCCACHE_START_HITS
+    unset SCCACHE_START_MISSES
+    ;;
+esac
diff --git a/ci/test_nvbench.sh b/ci/test_nvbench.sh
new file mode 100755
index 0000000..f89c6fe
--- /dev/null
+++ b/ci/test_nvbench.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+source "$(dirname "$0")/build_common.sh"
+
+# Run NVBench tests with high parallelism. If any need to be
+# serialized, define the `RUN_SERIAL` CMake property on the
+# test.
+export CTEST_PARALLEL_LEVEL=${PARALLEL_LEVEL}
+
+print_environment_details
+
+./build_nvbench.sh "$@"
+
+PRESET="nvbench-cpp$CXX_STANDARD"
+
+test_preset "NVBench" ${PRESET}
+
+print_time_summary
diff --git a/ci/windows/build_common.psm1 b/ci/windows/build_common.psm1
new file mode 100644
index 0000000..9d8e9cf
--- /dev/null
+++ b/ci/windows/build_common.psm1
@@ -0,0 +1,205 @@
+
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("std")]
+    [ValidateNotNullOrEmpty()]
+    [ValidateSet(17)]
+    [int]$CXX_STANDARD = 17
+)
+
+# We need the full path to cl because otherwise cmake will replace CMAKE_CXX_COMPILER with the full path
+# and keep CMAKE_CUDA_HOST_COMPILER at "cl" which breaks our cmake script
+$script:HOST_COMPILER  = (Get-Command "cl").source -replace '\\','/'
+$script:PARALLEL_LEVEL = (Get-WmiObject -class Win32_processor).NumberOfLogicalProcessors
+
+# Extract the CL version for export to build scripts:
+$script:CL_VERSION_STRING = & cl.exe /?
+if ($script:CL_VERSION_STRING -match "Version (\d+\.\d+)\.\d+") {
+    $CL_VERSION = [version]$matches[1]
+    Write-Host "Detected cl.exe version: $CL_VERSION"
+}
+
+if (-not $env:CCCL_BUILD_INFIX) {
+    $env:CCCL_BUILD_INFIX = ""
+}
+
+# Presets will be configured in this directory:
+$BUILD_DIR = "../build/$env:CCCL_BUILD_INFIX"
+
+If(!(test-path -PathType container "../build")) {
+    New-Item -ItemType Directory -Path "../build"
+}
+
+# The most recent build will always be symlinked to cccl/build/latest
+New-Item -ItemType Directory -Path "$BUILD_DIR" -Force
+
+# Prepare environment for CMake:
+$env:CMAKE_BUILD_PARALLEL_LEVEL = $PARALLEL_LEVEL
+$env:CTEST_PARALLEL_LEVEL = 1
+$env:CUDAHOSTCXX = $HOST_COMPILER.FullName
+$env:CXX = $HOST_COMPILER.FullName
+
+Write-Host "========================================"
+Write-Host "Begin build"
+Write-Host "pwd=$pwd"
+Write-Host "BUILD_DIR=$BUILD_DIR"
+Write-Host "CXX_STANDARD=$CXX_STANDARD"
+Write-Host "CXX=$env:CXX"
+Write-Host "CUDACXX=$env:CUDACXX"
+Write-Host "CUDAHOSTCXX=$env:CUDAHOSTCXX"
+Write-Host "NVCC_VERSION=$NVCC_VERSION"
+Write-Host "CMAKE_BUILD_PARALLEL_LEVEL=$env:CMAKE_BUILD_PARALLEL_LEVEL"
+Write-Host "CTEST_PARALLEL_LEVEL=$env:CTEST_PARALLEL_LEVEL"
+Write-Host "CCCL_BUILD_INFIX=$env:CCCL_BUILD_INFIX"
+Write-Host "Current commit is:"
+Write-Host "$(git log -1)"
+Write-Host "========================================"
+
+function configure_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET,
+        [Parameter(Mandatory = $true)]
+        [AllowEmptyString()]
+        [string]$CMAKE_OPTIONS
+    )
+
+    $step = "$BUILD_NAME (configure)"
+
+    # CMake must be invoked in the same directory as the presets file:
+    pushd ".."
+
+    cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE
+    $test_result = $LastExitCode
+
+    If ($test_result -ne 0) {
+        throw "$step Failed"
+    }
+
+    popd
+    Write-Host "$step complete."
+}
+
+function build_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET
+    )
+
+    $step = "$BUILD_NAME (build)"
+
+    # CMake must be invoked in the same directory as the presets file:
+    pushd ".."
+
+    sccache_stats('Start')
+
+    cmake --build --preset $PRESET -v
+    $test_result = $LastExitCode
+
+    sccache_stats('Stop')
+
+    echo "$step complete"
+
+    If ($test_result -ne 0) {
+         throw "$step Failed"
+    }
+
+    popd
+}
+
+function test_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET
+    )
+
+    $step = "$BUILD_NAME (test)"
+
+    # CTest must be invoked in the same directory as the presets file:
+    pushd ".."
+
+    sccache_stats('Start')
+
+    ctest --preset $PRESET
+    $test_result = $LastExitCode
+
+    sccache_stats('Stop')
+
+    echo "$step complete"
+
+    If ($test_result -ne 0) {
+         throw "$step Failed"
+    }
+
+    popd
+}
+
+function configure_and_build_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET,
+        [Parameter(Mandatory = $true)]
+        [AllowEmptyString()]
+        [string]$CMAKE_OPTIONS
+    )
+
+    configure_preset "$BUILD_NAME" "$PRESET" "$CMAKE_OPTIONS"
+    build_preset "$BUILD_NAME" "$PRESET"
+}
+
+function sccache_stats {
+    Param (
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [ValidateSet('Start','Stop')]
+        [string]$MODE
+    )
+
+    $sccache_stats = sccache -s
+    If($MODE -eq 'Start') {
+        [int]$script:sccache_compile_requests = ($sccache_stats[0] -replace '[^\d]+')
+        [int]$script:sccache_cache_hits_cpp   = ($sccache_stats[2] -replace '[^\d]+')
+        [int]$script:sccache_cache_hits_cuda  = ($sccache_stats[3] -replace '[^\d]+')
+        [int]$script:sccache_cache_miss_cpp   = ($sccache_stats[5] -replace '[^\d]+')
+        [int]$script:sccache_cache_miss_cuda  = ($sccache_stats[6] -replace '[^\d]+')
+    } else {
+        [int]$final_sccache_compile_requests = ($sccache_stats[0] -replace '[^\d]+')
+        [int]$final_sccache_cache_hits_cpp   = ($sccache_stats[2] -replace '[^\d]+')
+        [int]$final_sccache_cache_hits_cuda  = ($sccache_stats[3] -replace '[^\d]+')
+        [int]$final_sccache_cache_miss_cpp   = ($sccache_stats[5] -replace '[^\d]+')
+        [int]$final_sccache_cache_miss_cuda  = ($sccache_stats[6] -replace '[^\d]+')
+
+        [int]$total_requests  = $final_sccache_compile_requests - $script:sccache_compile_requests
+        [int]$total_hits_cpp  = $final_sccache_cache_hits_cpp   - $script:sccache_cache_hits_cpp
+        [int]$total_hits_cuda = $final_sccache_cache_hits_cuda  - $script:sccache_cache_hits_cuda
+        [int]$total_miss_cpp  = $final_sccache_cache_miss_cpp   - $script:sccache_cache_miss_cpp
+        [int]$total_miss_cuda = $final_sccache_cache_miss_cuda  - $script:sccache_cache_miss_cuda
+        If ( $total_requests -gt 0 ) {
+            [int]$hit_rate_cpp  = $total_hits_cpp  / $total_requests * 100;
+            [int]$hit_rate_cuda = $total_hits_cuda / $total_requests * 100;
+            echo "sccache hits cpp:  $total_hits_cpp  `t| misses: $total_miss_cpp  `t| hit rate: $hit_rate_cpp%"
+            echo "sccache hits cuda: $total_hits_cuda `t| misses: $total_miss_cuda `t| hit rate: $hit_rate_cuda%"
+        } else {
+            echo "sccache stats: N/A No new compilation requests"
+        }
+    }
+}
+
+Export-ModuleMember -Function configure_preset, build_preset, test_preset, configure_and_build_preset, sccache_stats
+Export-ModuleMember -Variable BUILD_DIR, CL_VERSION
diff --git a/ci/windows/build_nvbench.ps1 b/ci/windows/build_nvbench.ps1
new file mode 100644
index 0000000..31b4f7c
--- /dev/null
+++ b/ci/windows/build_nvbench.ps1
@@ -0,0 +1,26 @@
+
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("std")]
+    [ValidateNotNullOrEmpty()]
+    [ValidateSet(17)]
+    [int]$CXX_STANDARD = 17
+)
+
+$CURRENT_PATH = Split-Path $pwd -leaf
+If($CURRENT_PATH -ne "ci") {
+    Write-Host "Moving to ci folder"
+    pushd "$PSScriptRoot/.."
+}
+
+Remove-Module -Name build_common
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
+
+$PRESET = "nvbench-cpp$CXX_STANDARD"
+$CMAKE_OPTIONS = ""
+
+configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
+
+If($CURRENT_PATH -ne "ci") {
+    popd
+}
diff --git a/cmake/PrintCTestRunTimes.cmake b/cmake/PrintCTestRunTimes.cmake
new file mode 100644
index 0000000..f4ac7d9
--- /dev/null
+++ b/cmake/PrintCTestRunTimes.cmake
@@ -0,0 +1,127 @@
+## This CMake script parses the output of ctest and prints a formatted list
+## of individual test runtimes, sorted longest first.
+##
+## ctest > ctest_log
+## cmake -DLOGFILE=ctest_log \
+##       -DMINSEC=10 \
+##       -P PrintCTestRunTimes.cmake
+##
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+# Prepend the string with "0" until the string length equals the specified width
+function(pad_string_with_zeros string_var width)
+  set(local_string "${${string_var}}")
+  string(LENGTH "${local_string}" size)
+  while(size LESS width)
+    string(PREPEND local_string "0")
+    string(LENGTH "${local_string}" size)
+  endwhile()
+  set(${string_var} "${local_string}" PARENT_SCOPE)
+endfunction()
+
+################################################################################
+
+if (NOT LOGFILE)
+  message(FATAL_ERROR "Missing -DLOGFILE=<ctest output> argument.")
+endif()
+
+if (NOT DEFINED MINSEC)
+  set(MINSEC 10)
+endif()
+
+set(num_below_thresh 0)
+
+# Check if logfile exists
+if (NOT EXISTS "${LOGFILE}")
+  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
+endif()
+
+string(JOIN "" regex
+  "[0-9]+/[0-9]+[ ]+Test[ ]+#"
+  "([0-9]+)"                        # Test ID
+  ":[ ]+"
+  "([^ ]+)"                         # Test Name
+  "[ ]*\\.+[ ]*\\**[ ]*"
+  "([^ ]+)"                         # Result
+  "[ ]+"
+  "([0-9]+)"                        # Seconds
+  "\\.[0-9]+[ ]+sec"
+)
+
+message(DEBUG "LOGFILE: ${LOGFILE}")
+message(DEBUG "MINSEC: ${MINSEC}")
+message(DEBUG "regex: ${regex}")
+
+# Read the logfile and generate a map / keylist
+set(keys)
+file(STRINGS "${LOGFILE}" lines)
+foreach(line ${lines})
+
+  # Parse each build time
+  string(REGEX MATCH "${regex}" _DUMMY "${line}")
+
+  if (CMAKE_MATCH_COUNT EQUAL 4)
+    set(test_id      "${CMAKE_MATCH_1}")
+    set(test_name    "${CMAKE_MATCH_2}")
+    set(test_result  "${CMAKE_MATCH_3}")
+    set(tmp          "${CMAKE_MATCH_4}") # floor(runtime_seconds)
+
+    if (tmp LESS MINSEC)
+      math(EXPR num_below_thresh "${num_below_thresh} + 1")
+      continue()
+    endif()
+
+    # Compute human readable time
+    math(EXPR days         "${tmp} / (60 * 60 * 24)")
+    math(EXPR tmp          "${tmp} - (${days} * 60 * 60 * 24)")
+    math(EXPR hours        "${tmp} / (60 * 60)")
+    math(EXPR tmp          "${tmp} - (${hours} * 60 * 60)")
+    math(EXPR minutes      "${tmp} / (60)")
+    math(EXPR tmp          "${tmp} - (${minutes} * 60)")
+    math(EXPR seconds      "${tmp}")
+
+    # Format time components
+    pad_string_with_zeros(days 3)
+    pad_string_with_zeros(hours 2)
+    pad_string_with_zeros(minutes 2)
+    pad_string_with_zeros(seconds 2)
+
+    # Construct table entry
+    # Later values in the file for the same command overwrite earlier entries
+    string(MAKE_C_IDENTIFIER "${test_id}" key)
+    string(JOIN " | " ENTRY_${key}
+      "${days}d ${hours}h ${minutes}m ${seconds}s"
+      "${test_result}"
+      "${test_id}: ${test_name}"
+    )
+
+    # Record the key:
+    list(APPEND keys "${key}")
+  endif()
+endforeach()
+
+list(REMOVE_DUPLICATES keys)
+
+# Build the entry list:
+set(entries)
+foreach(key ${keys})
+  list(APPEND entries "${ENTRY_${key}}")
+endforeach()
+
+if (NOT entries)
+  message(STATUS "LOGFILE contained no test times ('${LOGFILE}').")
+endif()
+
+# Sort in descending order:
+list(SORT entries ORDER DESCENDING)
+
+# Dump table:
+foreach(entry ${entries})
+  message(STATUS ${entry})
+endforeach()
+
+if (num_below_thresh GREATER 0)
+  message(STATUS "${num_below_thresh} additional tests took < ${MINSEC}s each.")
+endif()
diff --git a/cmake/PrintNinjaBuildTimes.cmake b/cmake/PrintNinjaBuildTimes.cmake
new file mode 100644
index 0000000..65d243d
--- /dev/null
+++ b/cmake/PrintNinjaBuildTimes.cmake
@@ -0,0 +1,101 @@
+## This CMake script parses a .ninja_log file (LOGFILE) and prints a list of
+## build/link times, sorted longest first.
+##
+## cmake -DLOGFILE=<.ninja_log file> \
+##       -P PrintNinjaBuildTimes.cmake
+##
+## If LOGFILE is omitted, the current directory's .ninja_log file is used.
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+# Prepend the string with "0" until the string length equals the specified width
+function(pad_string_with_zeros string_var width)
+  set(local_string "${${string_var}}")
+  string(LENGTH "${local_string}" size)
+  while(size LESS width)
+    string(PREPEND local_string "0")
+    string(LENGTH "${local_string}" size)
+  endwhile()
+  set(${string_var} "${local_string}" PARENT_SCOPE)
+endfunction()
+
+################################################################################
+
+if (NOT LOGFILE)
+  set(LOGFILE ".ninja_log")
+endif()
+
+# Check if logfile exists
+if (NOT EXISTS "${LOGFILE}")
+  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
+endif()
+
+# Read the logfile and generate a map / keylist
+set(keys)
+file(STRINGS "${LOGFILE}" lines)
+foreach(line ${lines})
+
+  # Parse each build time
+  string(REGEX MATCH
+    "^([0-9]+)\t([0-9]+)\t[0-9]+\t([^\t]+)+\t[0-9a-fA-F]+$" _DUMMY "${line}")
+
+  if (CMAKE_MATCH_COUNT EQUAL 3)
+    set(start_ms ${CMAKE_MATCH_1})
+    set(end_ms ${CMAKE_MATCH_2})
+    set(command "${CMAKE_MATCH_3}")
+    math(EXPR runtime_ms "${end_ms} - ${start_ms}")
+
+    # Compute human readable time
+    math(EXPR days         "${runtime_ms} / (1000 * 60 * 60 * 24)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${days} * 1000 * 60 * 60 * 24)")
+    math(EXPR hours        "${runtime_ms} / (1000 * 60 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${hours} * 1000 * 60 * 60)")
+    math(EXPR minutes      "${runtime_ms} / (1000 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${minutes} * 1000 * 60)")
+    math(EXPR seconds      "${runtime_ms} / 1000")
+    math(EXPR milliseconds "${runtime_ms} - (${seconds} * 1000)")
+
+    # Format time components
+    pad_string_with_zeros(days 3)
+    pad_string_with_zeros(hours 2)
+    pad_string_with_zeros(minutes 2)
+    pad_string_with_zeros(seconds 2)
+    pad_string_with_zeros(milliseconds 3)
+
+    # Construct table entry
+    # Later values in the file for the same command overwrite earlier entries
+    string(MAKE_C_IDENTIFIER "${command}" key)
+    set(ENTRY_${key}
+      "${days}d ${hours}h ${minutes}m ${seconds}s ${milliseconds}ms | ${command}"
+    )
+
+    # Record the key:
+    list(APPEND keys "${key}")
+  endif()
+endforeach()
+
+list(REMOVE_DUPLICATES keys)
+
+# Build the entry list:
+set(entries)
+foreach(key ${keys})
+  list(APPEND entries "${ENTRY_${key}}")
+endforeach()
+
+if (NOT entries)
+  message(FATAL_ERROR "LOGFILE contained no build entries ('${LOGFILE}').")
+endif()
+
+# Sort in descending order:
+list(SORT entries)
+list(REVERSE entries)
+
+# Dump table:
+message(STATUS "-----------------------+----------------------------")
+message(STATUS "Time                   | Command                    ")
+message(STATUS "-----------------------+----------------------------")
+
+foreach(entry ${entries})
+  message(STATUS ${entry})
+endforeach()