Squashed commit of the following:

commit 4b309e6ad8 Author: Allison Piper <alliepiper16@gmail.com> Date: Sat Apr 6 13:19:14 2024 +0000 Minor cleanups commit 476ed2ceae Author: Allison Piper <alliepiper16@gmail.com> Date: Sat Apr 6 12:53:37 2024 +0000 WAR compiler ICE in nlohmann json. Only seeing this on GCC 9 + CTK 11.1. Seems to be having trouble with the `[[no_unique_address]]` optimization. commit a9bf1d3e42 Author: Allison Piper <alliepiper16@gmail.com> Date: Sat Apr 6 00:24:47 2024 +0000 Bump nlohmann json. commit 80980fe373 Author: Allison Piper <alliepiper16@gmail.com> Date: Sat Apr 6 00:22:07 2024 +0000 Fix llvm filesystem support commit f6099e6311 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 23:18:44 2024 +0000 Drop MSVC 2017 testing. commit 5ae50a8ef5 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 23:02:32 2024 +0000 Add more missing headers. commit b2a9ae04d9 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 22:37:56 2024 +0000 Remove old CUDA+MSVC builds and make windows build-only. commit 5b18c26a28 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 22:37:07 2024 +0000 Fix header for std::min/max. Why do I always think it's utility instead of algorithm.... commit 6a409efa2d Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 22:18:18 2024 +0000 Temporarily disable CUPTI on all windows builds. commit f432f88866 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 21:42:52 2024 +0000 Fix warnings on MSVC. commit 829787649b Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 21:03:16 2024 +0000 More flailing about in powershell. commit 21742e6bea Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 20:36:08 2024 +0000 Cleanup filesystem header handling. commit de3d202635 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 20:09:00 2024 +0000 Windows CI debugging. 
commit a4151667ff Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 19:45:40 2024 +0000 Quotation mark madness commit dd04f3befe Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 19:27:27 2024 +0000 Temporarily disable NVML on windows CI until new containers are ready. commit f3952848c4 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 19:25:22 2024 +0000 WAR issues on gcc-7. commit 198986875e Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 19:25:04 2024 +0000 More matrix/devcontainer updates. commit b9712f8696 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 18:30:35 2024 +0000 Fix windows build scripts. commit 943f268280 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 18:18:33 2024 +0000 Fix warnings with clang host compiler. commit 7063e1d60a Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 18:14:28 2024 +0000 More devcontainer hijinks. commit 06532fde81 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:51:25 2024 +0000 More matrix updates. commit 78a265ea55 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:34:00 2024 +0000 Support CLI CMake options for windows ci scripts. commit 670895c867 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:31:59 2024 +0000 Add missing devcontainers. commit b121823e74 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:22:54 2024 +0000 Build for `all-major` architectures in presets. We can get away with this because we require CMake 3.23.1. This was added in 3.23. commit fccfd44685 Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 17:22:08 2024 +0000 Update matrix file. commit e7d43ba90e Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 16:23:48 2024 +0000 Consolidate build/test jobs. commit c4044056ec Author: Allison Piper <alliepiper16@gmail.com> Date: Fri Apr 5 16:04:11 2024 +0000 Add missing build script.
2026-03-14 20:27:24 +00:00 · 2024-04-06 13:56:10 +00:00
parent 04b70059b8
commit e8c8877d36
39 changed files with 1031 additions and 8928 deletions
--- a/.devcontainer/cuda12.4-oneapi2023.2.0/devcontainer.json
+++ b/.devcontainer/cuda12.4-oneapi2023.2.0/devcontainer.json
@@ -1,6 +1,6 @@
 {
  "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:24.06-cpp-oneapi2023.2.0-cuda12.4-ubuntu22.04",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc10-cuda12.0-ubuntu20.04",
  "hostRequirements": {
    "gpu": "optional"
  },
@@ -14,11 +14,11 @@
    "SCCACHE_BUCKET": "rapids-sccache-devs",
    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.4-oneapi2023.2.0",
-    "CCCL_CUDA_VERSION": "12.4",
-    "CCCL_HOST_COMPILER": "oneapi",
-    "CCCL_HOST_COMPILER_VERSION": "2023.2.0",
-    "CCCL_BUILD_INFIX": "cuda12.4-oneapi2023.2.0"
+    "DEVCONTAINER_NAME": "cuda12.0-gcc10",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "10",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc10"
  },
  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
@@ -42,5 +42,5 @@
      }
    }
  },
-  "name": "cuda12.4-oneapi2023.2.0"
+  "name": "cuda12.0-gcc10"
 }
--- a/.devcontainer/cuda12.0-gcc11/devcontainer.json
+++ b/.devcontainer/cuda12.0-gcc11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc11-cuda12.0-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc11",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc11"
+}
--- a/.devcontainer/cuda12.0-gcc12/devcontainer.json
+++ b/.devcontainer/cuda12.0-gcc12/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc12-cuda12.0-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc12",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc12"
+}
--- a/.devcontainer/cuda12.0-gcc7/devcontainer.json
+++ b/.devcontainer/cuda12.0-gcc7/devcontainer.json
@@ -1,6 +1,6 @@
 {
  "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:24.06-cpp-gcc6-cuda11.1-ubuntu18.04",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc7-cuda12.0-ubuntu20.04",
  "hostRequirements": {
    "gpu": "optional"
  },
@@ -14,11 +14,11 @@
    "SCCACHE_BUCKET": "rapids-sccache-devs",
    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda11.1-gcc6",
-    "CCCL_CUDA_VERSION": "11.1",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc7",
+    "CCCL_CUDA_VERSION": "12.0",
    "CCCL_HOST_COMPILER": "gcc",
-    "CCCL_HOST_COMPILER_VERSION": "6",
-    "CCCL_BUILD_INFIX": "cuda11.1-gcc6"
+    "CCCL_HOST_COMPILER_VERSION": "7",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc7"
  },
  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
@@ -42,5 +42,5 @@
      }
    }
  },
-  "name": "cuda11.1-gcc6"
+  "name": "cuda12.0-gcc7"
 }
--- a/.devcontainer/cuda12.0-gcc8/devcontainer.json
+++ b/.devcontainer/cuda12.0-gcc8/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc8-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc8",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "8",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc8"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc8"
+}
--- a/.devcontainer/cuda12.0-gcc9/devcontainer.json
+++ b/.devcontainer/cuda12.0-gcc9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-gcc9-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-gcc9",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "gcc",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda12.0-gcc9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-gcc9"
+}
--- a/.devcontainer/cuda12.0-llvm10/devcontainer.json
+++ b/.devcontainer/cuda12.0-llvm10/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm10-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm10",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "10",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm10"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm10"
+}
--- a/.devcontainer/cuda12.0-llvm11/devcontainer.json
+++ b/.devcontainer/cuda12.0-llvm11/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm11-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm11",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "11",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm11"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm11"
+}
--- a/.devcontainer/cuda12.0-llvm12/devcontainer.json
+++ b/.devcontainer/cuda12.0-llvm12/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm12-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm12",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "12",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm12"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm12"
+}
--- a/.devcontainer/cuda12.0-llvm13/devcontainer.json
+++ b/.devcontainer/cuda12.0-llvm13/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm13-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm13",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "13",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm13"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm13"
+}
--- a/.devcontainer/cuda12.0-llvm14/devcontainer.json
+++ b/.devcontainer/cuda12.0-llvm14/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm14-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm14",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm14"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm14"
+}
--- a/.devcontainer/cuda12.0-llvm9/devcontainer.json
+++ b/.devcontainer/cuda12.0-llvm9/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm9-cuda12.0-ubuntu20.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.0-llvm9",
+    "CCCL_CUDA_VERSION": "12.0",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "9",
+    "CCCL_BUILD_INFIX": "cuda12.0-llvm9"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.0-llvm9"
+}
--- a/.github/workflows/build-and-test-linux.yml
+++ b/.github/workflows/build-and-test-linux.yml
@@ -18,30 +18,15 @@ permissions:
  contents: read

 jobs:
-  build:
-    name: Build ${{inputs.test_name}}
+  build-and-test:
+    name: Build/Test ${{inputs.test_name}}
    permissions:
      id-token: write
      contents: read
    uses: ./.github/workflows/run-as-coder.yml
    with:
-      name: Build ${{inputs.test_name}}
-      runner: linux-${{inputs.cpu}}-cpu16
+      name: Build/Test ${{inputs.test_name}}
+      runner: linux-${{inputs.cpu}}-gpu-v100-latest-1
      image:  ${{ inputs.container_image }}
-      command: |
-        ${{ inputs.build_script }}
-
-  test:
-    needs: build
-    permissions:
-      id-token: write
-      contents: read
-    if:  ${{ !cancelled() && ( needs.build.result == 'success' || needs.build.result == 'skipped' ) && inputs.run_tests}}
-    name: Test ${{inputs.test_name}}
-    uses: ./.github/workflows/run-as-coder.yml
-    with:
-      name: Test ${{inputs.test_name}}
-      runner: linux-${{inputs.cpu}}-gpu-v100-latest-1
-      image: ${{inputs.container_image}}
      command: |
        ${{ inputs.test_script }}
--- a/.github/workflows/build-and-test-windows.yml
+++ b/.github/workflows/build-and-test-windows.yml
@@ -5,11 +5,12 @@ on:
    inputs:
      test_name: {type: string, required: false}
      build_script: {type: string, required: false}
+      test_script: {type: string, required: false}
      container_image: {type: string, required: false}

 jobs:
  prepare:
-    name: Build ${{inputs.test_name}}
+    name: Build Only ${{inputs.test_name}}
    runs-on: windows-amd64-cpu16
    permissions:
      id-token: write
@@ -41,9 +42,8 @@ jobs:
                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_IDLE_TIMEOUT','${{env.SCCACHE_IDLE_TIMEOUT}}')
                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_USE_SSL','${{env.SCCACHE_S3_USE_SSL}}')
                                                                    [System.Environment]::SetEnvironmentVariable('SCCACHE_S3_NO_CREDENTIALS','${{env.SCCACHE_S3_NO_CREDENTIALS}}')
-                                                                    git clone https://github.com/NVIDIA/cccl.git;
-                                                                    cd cccl;
+                                                                    git clone https://github.com/NVIDIA/nvbench.git;
+                                                                    cd nvbench;
                                                                    git fetch --all;
                                                                    git checkout ${{github.ref_name}};
                                                                    ${{inputs.build_script}};"
-
--- a/.github/workflows/dispatch-build-and-test.yml
+++ b/.github/workflows/dispatch-build-and-test.yml
@@ -29,10 +29,9 @@ jobs:
    with:
      cpu: ${{ matrix.cpu }}
      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}}/C++${{matrix.std}} ${{matrix.extra_build_args}}
-      build_script: './ci/build_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} -std ${{matrix.std}} "${{matrix.extra_build_args}}"'
-      test_script:  './ci/test_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} -std ${{matrix.std}} "${{matrix.extra_build_args}}"'
+      build_script: "./ci/build_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} -std ${{matrix.std}} ${{matrix.extra_build_args}}"
+      test_script:  "./ci/test_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} -std ${{matrix.std}} ${{matrix.extra_build_args}}"
      container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
-      run_tests: ${{ contains(matrix.jobs, 'test') && !contains(github.event.head_commit.message, 'skip-tests') && matrix.os != 'windows-2022' }}

  build_and_test_windows:
    name: build and test windows
@@ -47,5 +46,6 @@ jobs:
        include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }}
    with:
      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}}/C++${{matrix.std}}
-      build_script: "./ci/windows/build_${{ inputs.project_name }}.ps1 -std ${{matrix.std}}"
+      build_script: "./ci/windows/build_${{ inputs.project_name }}.ps1 -std ${{matrix.std}} ${{matrix.extra_build_args}}"
+      test_script:  "./ci/windows/test_${{ inputs.project_name }}.ps1 -std ${{matrix.std}} ${{matrix.extra_build_args}}"
      container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cuda${{matrix.cuda}}-${{matrix.compiler.name}}${{matrix.compiler.version}}-${{matrix.os}}
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -13,7 +13,7 @@
      "binaryDir": "${sourceDir}/build/$env{CCCL_BUILD_INFIX}/${presetName}",
      "cacheVariables": {
        "CMAKE_BUILD_TYPE": "Release",
-        "CMAKE_CUDA_ARCHITECTURES": "60;70;80",
+        "CMAKE_CUDA_ARCHITECTURES": "all-major",
        "NVBench_ENABLE_CUPTI": true,
        "NVBench_ENABLE_DEVICE_TESTING": false,
        "NVBench_ENABLE_EXAMPLES": true,
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -1,26 +1,19 @@

-cuda_prev_min: &cuda_prev_min '11.1'
-cuda_prev_max:  &cuda_prev_max  '11.8'
-cuda_curr: &cuda_curr '12.4'
-
-# The GPUs to test on
-gpus:
-  - 'a100'
-  - 'v100'
+cuda_prev_min: &cuda_prev_min '11.1' # Does not support the CUPTI APIs we use (added in 11.3)
+cuda_prev_max: &cuda_prev_max '11.8'
+cuda_curr_min: &cuda_curr_min '12.0'
+cuda_curr_max: &cuda_curr_max '12.4'

 # The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
 devcontainer_version: '24.06'

 # gcc compiler configurations
-gcc6: &gcc6 { name: 'gcc', version: '6', exe: 'g++' }
 gcc7: &gcc7 { name: 'gcc', version: '7', exe: 'g++' }
 gcc8: &gcc8 { name: 'gcc', version: '8', exe: 'g++' }
 gcc9: &gcc9 { name: 'gcc', version: '9', exe: 'g++' }
 gcc10: &gcc10 { name: 'gcc', version: '10', exe: 'g++' }
 gcc11: &gcc11 { name: 'gcc', version: '11', exe: 'g++' }
 gcc12: &gcc12 { name: 'gcc', version: '12', exe: 'g++' }
-gcc-oldest: &gcc-oldest { name: 'gcc', version: '6', exe: 'g++' }
-gcc-newest: &gcc-newest { name: 'gcc', version: '12', exe: 'g++' }

 # LLVM Compiler configurations
 llvm9: &llvm9 { name: 'llvm', version: '9', exe: 'clang++' }
@@ -31,17 +24,11 @@ llvm13: &llvm13 { name: 'llvm', version: '13', exe: 'clang++' }
 llvm14: &llvm14 { name: 'llvm', version: '14', exe: 'clang++' }
 llvm15: &llvm15 { name: 'llvm', version: '15', exe: 'clang++' }
 llvm16: &llvm16 { name: 'llvm', version: '16', exe: 'clang++' }
-llvm-oldest: &llvm-oldest { name: 'llvm', version: '9', exe: 'clang++' }
-llvm-newest: &llvm-newest { name: 'llvm', version: '16', exe: 'clang++' }

 # MSVC configs
-msvc2017: &msvc2017 { name: 'cl', version: '14.16', exe: 'cl++' }
 msvc2019: &msvc2019 { name: 'cl', version: '14.29', exe: 'cl++' }
 msvc2022: &msvc2022 { name: 'cl', version: '14.39', exe: 'cl++' }

-# oneAPI configs
-oneapi: &oneapi { name: 'oneapi', version: '2023.2.0', exe: 'icpc' }
-
 # Each environment below will generate a unique build/test job
 # See the "compute-matrix" job in the workflow for how this is parsed and used
 # cuda: The CUDA Toolkit version
@@ -57,29 +44,36 @@ oneapi: &oneapi { name: 'oneapi', version: '2023.2.0', exe: 'icpc' }
 # Configurations that will run for every PR
 pull_request:
  nvcc:
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7,     std: [17],     jobs: ['build']}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8,     std: [17],     jobs: ['build']}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9,     std: [17],     jobs: ['build']}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9,    std: [17],     jobs: ['build']}
-    - {cuda: *cuda_prev_min, os: 'windows2022', cpu: 'amd64', compiler: *msvc2017, std: [17],     jobs: ['build']}
-    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17],     jobs: ['build'], extra_build_args: '-cmake-options -DCMAKE_CUDA_ARCHITECTURES=90'}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7,     std: [17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8,     std: [17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9,     std: [17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10,    std: [17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17],     jobs: ['build'], extra_build_args: '-cmake-options -DCMAKE_CUDA_ARCHITECTURES=90a'}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17],     jobs: ['build', 'test']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'arm64', compiler: *gcc12,    std: [17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9,    std: [17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10,   std: [17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11,   std: [17], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12,   std: [17], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13,   std: [17], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14,   std: [17], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15,   std: [17], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16,   std: [17], jobs: ['build', 'test']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'arm64', compiler: *llvm16,   std: [17], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, std: [17],     jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, std: [17], jobs: ['build']}
-    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *oneapi,   std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7,     std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8,     std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9,     std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9,    std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
+    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7,     std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8,     std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9,     std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10,    std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9,    std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10,   std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11,   std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12,   std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13,   std: [17]}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14,   std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7,     std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8,     std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9,     std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10,    std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9,    std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10,   std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11,   std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12,   std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13,   std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14,   std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15,   std: [17]}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16,   std: [17]}
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
--- a/ci/ninja_summary.py
+++ b/ci/ninja_summary.py
@@ -0,0 +1,381 @@
+#!/usr/bin/env python3
+# Copyright (c) 2018 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+r"""Summarize the last ninja build, invoked with ninja's -C syntax.
+
+This script is designed to be automatically run after each ninja build in
+order to summarize the build's performance. Making build performance information
+more visible should make it easier to notice anomalies and opportunities. To use
+this script on Windows just set NINJA_SUMMARIZE_BUILD=1 and run autoninja.bat.
+
+On Linux you can get autoninja to invoke this script using this syntax:
+
+$ NINJA_SUMMARIZE_BUILD=1 autoninja -C out/Default/ chrome
+
+You can also call this script directly using ninja's syntax to specify the
+output directory of interest:
+
+> python3 post_build_ninja_summary.py -C out/Default
+
+Typical output looks like this:
+
+>ninja -C out\debug_component base
+ninja.exe -C out\debug_component base -j 960 -l 48  -d keeprsp
+ninja: Entering directory `out\debug_component'
+[1 processes, 1/1 @ 0.3/s : 3.092s ] Regenerating ninja files
+Longest build steps:
+       0.1 weighted s to build obj/base/base/trace_log.obj (6.7 s elapsed time)
+       0.2 weighted s to build nasm.exe, nasm.exe.pdb (0.2 s elapsed time)
+       0.3 weighted s to build obj/base/base/win_util.obj (12.4 s elapsed time)
+       1.2 weighted s to build base.dll, base.dll.lib (1.2 s elapsed time)
+Time by build-step type:
+       0.0 s weighted time to generate 6 .lib files (0.3 s elapsed time sum)
+       0.1 s weighted time to generate 25 .stamp files (1.2 s elapsed time sum)
+       0.2 s weighted time to generate 20 .o files (2.8 s elapsed time sum)
+       1.7 s weighted time to generate 4 PEFile (linking) files (2.0 s elapsed
+time sum)
+      23.9 s weighted time to generate 770 .obj files (974.8 s elapsed time sum)
+26.1 s weighted time (982.9 s elapsed time sum, 37.7x parallelism)
+839 build steps completed, average of 32.17/s
+
+If no gn clean has been done then results will be for the last non-NULL
+invocation of ninja. Ideas for future statistics, and implementations are
+appreciated.
+
+The "weighted" time is the elapsed time of each build step divided by the number
+of tasks that were running in parallel. This makes it an excellent approximation
+of how "important" a slow step was. A link that is entirely or mostly serialized
+will have a weighted time that is the same or similar to its elapsed time. A
+compile that runs in parallel with 999 other compiles will have a weighted time
+that is tiny."""
+
+import argparse
+import errno
+import fnmatch
+import os
+import subprocess
+import sys
+
+# The number of long build times to report:
+long_count = 10
+# The number of long times by extension to report
+long_ext_count = 10
+
+
+class Target:
+    """Timing data for one build step recorded in a .ninja_log file."""
+    def __init__(self, start, end):
+        """Stores the |start| and |end| times, given in seconds as floats,
+        with no output names yet and a weighted duration of zero."""
+        self.start = start
+        self.end = end
+        # Output names of this step; filled in by whoever owns the object.
+        self.targets = []
+        self.weighted_duration = 0.0
+
+    def Duration(self):
+        """Returns the elapsed time of the step in seconds as a float."""
+        return self.end - self.start
+
+    def SetWeightedDuration(self, weighted_duration):
+        """Stores the weighted duration, in seconds, passed in as a float."""
+        self.weighted_duration = weighted_duration
+
+    def WeightedDuration(self):
+        """Returns the step's weighted duration in seconds as a float.
+
+        The weighted duration is the elapsed time of the step divided by the
+        number of steps that were running at the same time, so it approximates
+        the impact of this step on the total build time. Serialized or
+        serializing steps typically end up with much longer weighted
+        durations. The value is asserted to be no longer than Duration(),
+        within a small tolerance; a violation is printed before asserting.
+        """
+        # Upper bound, with room for modest floating-point errors.
+        limit = self.Duration() + 0.000002
+        if self.weighted_duration > limit:
+            print('%s > %s?' % (self.weighted_duration, self.Duration()))
+        assert self.weighted_duration <= limit
+        return self.weighted_duration
+
+    def DescribeTargets(self):
+        """Returns the output names joined by ', ', truncated for display."""
+        # Some build steps generate dozens of outputs, so the text is capped.
+        # 65 characters fits most of the long single-target names while
+        # minimizing word wrapping; the cut-off part is shown as '...'.
+        max_length = 65
+        joined = ', '.join(self.targets)
+        if len(joined) <= max_length:
+            return joined
+        return joined[:max_length] + '...'
+
+
+# Copied with some modifications from ninjatracing
+def ReadTargets(log, show_all):
+    """Reads build steps from the already-open .ninja_log file object |log|.
+
+    Returns an unsorted list of Target objects, one per command hash. Unless
+    |show_all| is true, records judged to be from earlier builds are dropped."""
+    header = log.readline()
+    # An empty ninja_log is handled by silently returning an empty list.
+    if not header:
+        return []
+    assert header == '# ninja log v5\n', \
+           'unrecognized ninja log version %r' % header
+    targets_dict = {}
+    last_end_seen = 0.0
+    for line in log:
+        parts = line.strip().split('\t')
+        if len(parts) != 5:
+            # If ninja.exe is rudely halted then the .ninja_log file may be
+            # corrupt. Silently continue.
+            continue
+        start, end, _, name, cmdhash = parts  # Ignore restat.
+        # Convert from integral milliseconds to float seconds.
+        start = int(start) / 1000.0
+        end = int(end) / 1000.0
+        if not show_all and end < last_end_seen:
+            # An earlier time stamp means that this step is the first in a new
+            # build, possibly an incremental build. Throw away the previous
+            # data so that this new build will be displayed independently.
+            # This has to be done by comparing end times because records are
+            # written to the .ninja_log file when commands complete, so end
+            # times are guaranteed to be in order, but start times are not.
+            targets_dict = {}
+        target = None
+        if cmdhash in targets_dict:
+            target = targets_dict[cmdhash]
+            if not show_all and (target.start != start or target.end != end):
+                # If several builds in a row just run one or two build steps
+                # then the end times may not go backwards so the last build may
+                # not be detected as such. However in many cases there will be a
+                # build step repeated in the two builds and the changed
+                # start/stop points for that command, identified by the hash,
+                # can be used to detect and reset the target dictionary.
+                targets_dict = {}
+                target = None
+        if not target:
+            targets_dict[cmdhash] = target = Target(start, end)
+        last_end_seen = end
+        target.targets.append(name)
+    return list(targets_dict.values())
+
+
+def GetExtension(target, extra_patterns):
+    """Return the file extension that best represents a target.
+
+    For targets that generate multiple outputs it is important to return a
+    consistent 'canonical' extension. Ultimately the goal is to group build
+    steps by type. |extra_patterns| is an optional ';'-separated string of
+    fnmatch patterns; a pattern matching an examined output is returned as-is.
+    A target without any outputs yields '(no extension found)'."""
+    patterns = extra_patterns.split(';') if extra_patterns else []
+    # Fallback so that a target with no outputs cannot leave this unbound.
+    extension = '(no extension found)'
+    for output in target.targets:
+        for fn_pattern in patterns:
+            if fnmatch.fnmatch(output, '*' + fn_pattern + '*'):
+                return fn_pattern
+        # Not a true extension, but a good grouping.
+        if output.endswith('type_mappings'):
+            extension = 'type_mappings'
+            break
+
+        # Capture two extensions if present. For example: file.javac.jar should
+        # be distinguished from file.interface.jar.
+        root, ext1 = os.path.splitext(output)
+        _, ext2 = os.path.splitext(root)
+        # Preserve the order in the file name.
+        extension = (ext2 + ext1) or '(no extension found)'
+
+        if ext1 in ('.pdb', '.dll', '.exe'):
+            # Make sure that .dll and .exe are grouped together and that the
+            # .dll.lib files don't cause these to be listed as libraries.
+            extension = 'PEFile (linking)'
+            break
+        if ext1 in ('.so', '.TOC'):
+            # Attempt to identify linking, avoid identifying as '.TOC'.
+            extension = '.so (linking)'
+            break
+        # Make sure .obj files don't get categorized as mojo files. Jars are
+        # the canonical output of java targets.
+        if ext1 in ('.obj', '.o', '.jar'):
+            break
+        # Normalize all mojo related outputs to 'mojo'.
+        if '.mojom' in output:
+            extension = 'mojo'
+            break
+    return extension
+
+
+def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
+    """Print a summary of the passed in list of Target objects.
+
+    |entries| must be non-empty and is sorted in place. |extra_step_types| is
+    an optional ';'-separated fnmatch pattern string handed to GetExtension.
+    |elapsed_time_sorting| ranks by elapsed rather than weighted time."""
+
+    # Create a list that is in order by time stamp and has entries for the
+    # beginning and ending of each build step (one time stamp may have multiple
+    # entries due to multiple steps starting/stopping at exactly the same time).
+    # Iterate through this list, keeping track of which tasks are running at all
+    # times. At each time step calculate a running total for weighted time so
+    # that when each task ends its own weighted time can easily be calculated.
+    task_start_stop_times = []
+
+    earliest = -1
+    latest = 0
+    total_cpu_time = 0
+    for target in entries:
+        if earliest < 0 or target.start < earliest:
+            earliest = target.start
+        if target.end > latest:
+            latest = target.end
+        total_cpu_time += target.Duration()
+        task_start_stop_times.append((target.start, 'start', target))
+        task_start_stop_times.append((target.end, 'stop', target))
+    length = latest - earliest
+    weighted_total = 0.0
+
+    # Sort by the time/type records and ignore |target|
+    task_start_stop_times.sort(key=lambda times: times[:2])
+    # All start/stop events are now in chronological order. If a task starts
+    # and stops on the same time stamp then the start sorts first because of
+    # the alphabet, which is important for making this work correctly.
+    # Track the tasks which are currently running.
+    running_tasks = {}
+    # The time processed so far, needed to calculate time deltas.
+    last_time = task_start_stop_times[0][0]
+    # Accumulated weighted time, so it can cheaply be added to each task.
+    last_weighted_time = 0.0
+    # Scan all start/stop events.
+    for event in task_start_stop_times:
+        time, action_name, target = event
+        # Accumulate weighted time up to now, shared by the running tasks.
+        num_running = len(running_tasks)
+        if num_running > 0:
+            last_weighted_time += (time - last_time) / float(num_running)
+        if action_name == 'start':
+            # Record the total weighted task time when this task starts.
+            running_tasks[target] = last_weighted_time
+        if action_name == 'stop':
+            # The weighted time accumulated while this task was running.
+            weighted_duration = last_weighted_time - running_tasks[target]
+            target.SetWeightedDuration(weighted_duration)
+            weighted_total += weighted_duration
+            del running_tasks[target]
+        last_time = time
+    assert (len(running_tasks) == 0)
+
+    # Warn if the weighted total is off by more than 0.5 s (all times are in s).
+    if abs(length - weighted_total) > 0.5:
+        print('Warning: Possible corrupt ninja log, results may be '
+              'untrustworthy. Length = %.3f, weighted total = %.3f' %
+              (length, weighted_total))
+
+    # Print the slowest build steps:
+    print('    Longest build steps:')
+    if elapsed_time_sorting:
+        entries.sort(key=lambda x: x.Duration())
+    else:
+        entries.sort(key=lambda x: x.WeightedDuration())
+    for target in entries[-long_count:]:
+        print('      %8.1f weighted s to build %s (%.1f s elapsed time)' %
+              (target.WeightedDuration(), target.DescribeTargets(),
+               target.Duration()))
+
+    # Sum up the time by file extension/type of the output file
+    count_by_ext = {}
+    time_by_ext = {}
+    weighted_time_by_ext = {}
+    # Scan through all of the targets to build up per-extension statistics.
+    for target in entries:
+        extension = GetExtension(target, extra_step_types)
+        time_by_ext[extension] = time_by_ext.get(extension,
+                                                 0) + target.Duration()
+        weighted_time_by_ext[extension] = weighted_time_by_ext.get(
+            extension, 0) + target.WeightedDuration()
+        count_by_ext[extension] = count_by_ext.get(extension, 0) + 1
+
+    print('    Time by build-step type:')
+    # Copy to a list with extension name and total time swapped, to (time, ext)
+    if elapsed_time_sorting:
+        weighted_time_by_ext_sorted = sorted(
+            (y, x) for (x, y) in time_by_ext.items())
+    else:
+        weighted_time_by_ext_sorted = sorted(
+            (y, x) for (x, y) in weighted_time_by_ext.items())
+    # Print the slowest build target types:
+    for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]:
+        print(
+            '      %8.1f s weighted time to generate %d %s files '
+            '(%1.1f s elapsed time sum)' %
+            (time, count_by_ext[extension], extension, time_by_ext[extension]))
+
+    # Guard the rates: a build whose steps share one time stamp has length 0.
+    print('    %.1f s weighted time (%.1f s elapsed time sum, %1.1fx '
+          'parallelism)' % (length, total_cpu_time,
+                            total_cpu_time / length if length else 0.0))
+    print('    %d build steps completed, average of %1.2f/s' %
+          (len(entries), len(entries) / length if length else 0.0))
+
+
+def main():
+    """Parse the command line and print a summary of the last build.
+
+    Runs 'siso metrics summary' when a siso_metrics.json file exists and
+    otherwise summarizes the ninja log. Returns errno.ENOENT when an IOError
+    occurs while handling the log, and None in every other case."""
+    global long_ext_count
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-C', dest='build_directory', help='Build directory.')
+    parser.add_argument(
+        '-s', '--step-types',
+        help='semicolon separated fnmatch patterns for build-step grouping')
+    parser.add_argument(
+        '-e', '--elapsed_time_sorting', default=False, action='store_true',
+        help='Sort output by elapsed time instead of weighted time')
+    parser.add_argument('--log-file',
+                        help="specific ninja log file to analyze.")
+    args, _ = parser.parse_known_args()
+
+    # Both files are looked up in the build directory when one is given; an
+    # explicit --log-file always wins for the ninja log.
+    build_dir = args.build_directory or ''
+    metrics_path = os.path.join(build_dir, 'siso_metrics.json')
+    log_path = args.log_file or os.path.join(build_dir, '.ninja_log')
+
+    # Offer a convenient way to add extra step types automatically, including
+    # when run by autoninja. get() returns None if the variable isn't set.
+    step_types = args.step_types or os.environ.get('chromium_step_types')
+    if step_types:
+        # Make room for the extra build types.
+        long_ext_count += len(step_types.split(';'))
+
+    if os.path.exists(metrics_path):
+        # Automatically handle summarizing siso builds.
+        siso = 'siso.bat' if 'win32' in sys.platform else 'siso'
+        cmd = [siso, 'metrics', 'summary']
+        if args.build_directory:
+            cmd += ['-C', args.build_directory]
+        if step_types:
+            cmd += ['--step_types', step_types]
+        if args.elapsed_time_sorting:
+            cmd.append('--elapsed_time_sorting')
+        subprocess.run(cmd)
+        return None
+
+    try:
+        with open(log_path, 'r') as log:
+            entries = ReadTargets(log, False)
+            if entries:
+                SummarizeEntries(entries, step_types,
+                                 args.elapsed_time_sorting)
+    except IOError:
+        print('Log file %r not found, no build summary created.' % log_path)
+        return errno.ENOENT
+
+
+if __name__ == '__main__':
+    sys.exit(main())
--- a/ci/windows/build_common.psm1
+++ b/ci/windows/build_common.psm1
@@ -73,7 +73,9 @@ function configure_preset {
    # CMake must be invoked in the same directory as the presets file:
    pushd ".."

-    cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE
+    $cmake_command = "cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE"
+    echo "$cmake_command"
+    Invoke-Expression $cmake_command
    $test_result = $LastExitCode

    If ($test_result -ne 0) {
--- a/ci/windows/build_nvbench.ps1
+++ b/ci/windows/build_nvbench.ps1
@@ -4,7 +4,11 @@ Param(
    [Alias("std")]
    [ValidateNotNullOrEmpty()]
    [ValidateSet(17)]
-    [int]$CXX_STANDARD = 17
+    [int]$CXX_STANDARD = 17,
+    [Parameter(Mandatory = $false)]
+    [Alias("cmake-options")]
+    [ValidateNotNullOrEmpty()]
+    [string]$ARG_CMAKE_OPTIONS = ""
 )

 $CURRENT_PATH = Split-Path $pwd -leaf
@@ -19,6 +23,11 @@ Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
 $PRESET = "nvbench-cpp$CXX_STANDARD"
 $CMAKE_OPTIONS = ""

+# Append any arguments passed in on the command line
+If($ARG_CMAKE_OPTIONS -ne "") {
+    $CMAKE_OPTIONS += "$ARG_CMAKE_OPTIONS"
+}
+
 configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"

 If($CURRENT_PATH -ne "ci") {
--- a/ci/windows/test_nvbench.ps1
+++ b/ci/windows/test_nvbench.ps1
@@ -0,0 +1,36 @@
+# Runs configure_and_build_preset and test_preset for the NVBench preset.
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("std")]
+    [ValidateNotNullOrEmpty()]
+    [ValidateSet(17)]
+    [int]$CXX_STANDARD = 17,
+    [Parameter(Mandatory = $false)]
+    [Alias("cmake-options")]
+    [ValidateNotNullOrEmpty()]
+    [string]$ARG_CMAKE_OPTIONS = ""
+)
+
+$CURRENT_PATH = Split-Path $pwd -leaf
+If($CURRENT_PATH -ne "ci") {
+    Write-Host "Moving to ci folder"
+    pushd "$PSScriptRoot/.."
+}
+
+Remove-Module -Name build_common
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
+
+$PRESET = "nvbench-cpp$CXX_STANDARD"
+$CMAKE_OPTIONS = ""
+
+# Append any arguments passed in on the command line
+If($ARG_CMAKE_OPTIONS -ne "") {
+    $CMAKE_OPTIONS += "$ARG_CMAKE_OPTIONS"
+}
+
+configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
+test_preset "NVBench" "$PRESET"
+
+If($CURRENT_PATH -ne "ci") {
+    popd
+}
--- a/cmake/NVBenchConfigTarget.cmake
+++ b/cmake/NVBenchConfigTarget.cmake
@@ -29,7 +29,6 @@ function(nvbench_add_cxx_flag target_name type flag)
    target_compile_options(${target_name} ${type}
      $<$<COMPILE_LANGUAGE:CXX>:${flag}>
      $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcompiler=${flag}>
-      # FIXME nvc++ case
    )
  endif()
 endfunction()
@@ -64,8 +63,8 @@ else()
  endif()
 endif()

-# GCC-specific flags
-if (CMAKE_CXX_COMPILER_ID STREQUAL GNU)
+# Experimental filesystem library
+if (CMAKE_CXX_COMPILER_ID STREQUAL GNU OR CMAKE_CXX_COMPILER_ID STREQUAL Clang)
  target_link_libraries(nvbench.build_interface INTERFACE stdc++fs)
 endif()

--- a/cmake/NVBenchDependencies.cmake
+++ b/cmake/NVBenchDependencies.cmake
@@ -24,24 +24,16 @@ endif()
 # Following recipe from
 # http://github.com/cpm-cmake/CPM.cmake/blob/master/examples/json/CMakeLists.txt
 # Download the zips because the repo takes an excessively long time to clone.
-rapids_cpm_find(nlohmann_json 3.9.1
-  # Release:
+rapids_cpm_find(nlohmann_json 3.11.3
  CPM_ARGS
-    URL https://github.com/nlohmann/json/releases/download/v3.9.1/include.zip
-    URL_HASH SHA256=6bea5877b1541d353bd77bdfbdb2696333ae5ed8f9e8cc22df657192218cad91
-    PATCH_COMMAND
-      # Work around compiler bug in nvcc 11.0, see NVIDIA/NVBench#18
-      ${CMAKE_COMMAND} -E copy
-        "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/nlohmann_json.hpp"
-        "./include/nlohmann/json.hpp"
-
-  # Development version:
-  # I'm waiting for https://github.com/nlohmann/json/issues/2676 to be fixed,
-  # leave this in to simplify testing patches as they come out.
-  #  CPM_ARGS
-  #    VERSION develop
-  #    URL https://github.com/nlohmann/json/archive/refs/heads/develop.zip
-  #    OPTIONS JSON_MultipleHeaders ON
+    URL https://github.com/nlohmann/json/releases/download/v3.11.3/include.zip
+    URL_HASH SHA256=a22461d13119ac5c78f205d3df1db13403e58ce1bb1794edc9313677313f4a9d
+  PATCH_COMMAND
+    ${CMAKE_COMMAND}
+      -D "CUDA_VERSION=${CMAKE_CUDA_COMPILER_VERSION}"
+      -D "CXX_VERSION=${CMAKE_CXX_COMPILER_VERSION}"
+      -D "CXX_ID=${CMAKE_CXX_COMPILER_ID}"
+      -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/json_unordered_map_ice.cmake"
 )

 add_library(nvbench_json INTERFACE IMPORTED)
--- a/cmake/patches/json_unordered_map_ice.cmake
+++ b/cmake/patches/json_unordered_map_ice.cmake
@@ -0,0 +1,22 @@
+# Patch nlohmann_json so it builds with GCC >= 9 as the host compiler for
+# CUDA <= 11.8 (seen with NVCC 11.1 + GCC 9), which otherwise fails with:
+#
+# nlohmann/ordered_map.hpp(29): error #3316:
+# Internal Compiler Error (codegen): "internal error during structure layout!"
+#
+# Usage: ${CMAKE_COMMAND} -D "CUDA_VERSION=${CMAKE_CUDA_COMPILER_VERSION}"
+#   -D "CXX_VERSION=${CMAKE_CXX_COMPILER_VERSION}"
+#   -D "CXX_ID=${CMAKE_CXX_COMPILER_ID}" -P "json_unordered_map_ice.cmake"
+
+if(CUDA_VERSION VERSION_GREATER 11.8 OR NOT CXX_ID STREQUAL "GNU" OR CXX_VERSION VERSION_LESS 9.0)
+  return()
+endif()
+
+# Comment out JSON_NO_UNIQUE_ADDRESS; skip if the marker is already present,
+# since re-patching would nest the comments and break the header.
+file(READ "include/nlohmann/ordered_map.hpp" NLOHMANN_ORDERED_MAP_HPP)
+if(NOT NLOHMANN_ORDERED_MAP_HPP MATCHES "\\[NVBench Patch\\]")
+  string(REPLACE "JSON_NO_UNIQUE_ADDRESS" "/* [NVBench Patch] JSON_NO_UNIQUE_ADDRESS */"
+    NLOHMANN_ORDERED_MAP_HPP "${NLOHMANN_ORDERED_MAP_HPP}")
+  file(WRITE "include/nlohmann/ordered_map.hpp" "${NLOHMANN_ORDERED_MAP_HPP}")
+endif()
--- a/cmake/patches/nlohmann_json.hpp
+++ b/cmake/patches/nlohmann_json.hpp
--- a/examples/axes.cu
+++ b/examples/axes.cu
@@ -56,8 +56,8 @@ NVBENCH_BENCH(single_float64_axis)
 void copy_sweep_grid_shape(nvbench::state &state)
 {
  // Get current parameters:
-  const int block_size = static_cast<int>(state.get_int64("BlockSize"));
-  const int num_blocks = static_cast<int>(state.get_int64("NumBlocks"));
+  const auto block_size = static_cast<unsigned int>(state.get_int64("BlockSize"));
+  const auto num_blocks = static_cast<unsigned int>(state.get_int64("NumBlocks"));

  // Number of int32s in 256 MiB:
  const std::size_t num_values = 256 * 1024 * 1024 / sizeof(nvbench::int32_t);
@@ -77,6 +77,7 @@ void copy_sweep_grid_shape(nvbench::state &state)
     num_values,
     in_ptr  = thrust::raw_pointer_cast(in.data()),
     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      (void) num_values; // clang thinks this is unused...
      nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
        in_ptr,
        out_ptr,
@@ -110,6 +111,7 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
    [num_values,
     in_ptr  = thrust::raw_pointer_cast(in.data()),
     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      (void) num_values; // clang thinks this is unused...
      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
                                                                 out_ptr,
                                                                 num_values);
@@ -156,6 +158,7 @@ void copy_type_conversion_sweep(nvbench::state &state,
    [num_values,
     in_ptr  = thrust::raw_pointer_cast(in.data()),
     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      (void) num_values; // clang thinks this is unused...
      nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
                                                                 out_ptr,
                                                                 num_values);
--- a/examples/custom_criterion.cu
+++ b/examples/custom_criterion.cu
@@ -36,7 +36,7 @@ public:

 protected:
  // Setup the criterion in the `do_initialize()` method:
-  virtual void do_initialize() override 
+  virtual void do_initialize() override
  {
    m_num_samples = 0;
  }
@@ -71,6 +71,7 @@ void throughput_bench(nvbench::state &state)
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  state.exec(nvbench::exec_tag::no_batch, [&input, &output, num_values](nvbench::launch &launch) {
+    (void) num_values; // clang thinks this is unused...
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
      thrust::raw_pointer_cast(input.data()),
      thrust::raw_pointer_cast(output.data()),
--- a/examples/exec_tag_timer.cu
+++ b/examples/exec_tag_timer.cu
@@ -54,6 +54,8 @@ void mod2_inplace(nvbench::state &state)
  state.exec(nvbench::exec_tag::timer,
             // Lambda now takes a `timer` argument:
             [&input, &data, num_values](nvbench::launch &launch, auto &timer) {
+               (void) num_values; // clang thinks this is unused...
+
               // Reset working data:
               thrust::copy(thrust::device.on(launch.get_stream()),
                            input.cbegin(),
--- a/examples/stream.cu
+++ b/examples/stream.cu
@@ -52,6 +52,7 @@ void stream_bench(nvbench::state &state)
  state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));

  state.exec([&input, &output, num_values](nvbench::launch &) {
+    (void) num_values; // clang thinks this is unused...
    copy(thrust::raw_pointer_cast(input.data()),
         thrust::raw_pointer_cast(output.data()),
         num_values);
--- a/examples/throughput.cu
+++ b/examples/throughput.cu
@@ -51,6 +51,7 @@ void throughput_bench(nvbench::state &state)
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  state.exec([&input, &output, num_values](nvbench::launch &launch) {
+    (void) num_values; // clang thinks this is unused...
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
      thrust::raw_pointer_cast(input.data()),
      thrust::raw_pointer_cast(output.data()),
--- a/nvbench/criterion_manager.cxx
+++ b/nvbench/criterion_manager.cxx
@@ -19,6 +19,13 @@
 #include <nvbench/criterion_manager.cuh>
 #include <nvbench/detail/throw.cuh>

+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
 namespace nvbench
 {

@@ -60,7 +67,7 @@ stopping_criterion_base &criterion_manager::add(std::unique_ptr<stopping_criteri

  auto [it, success] = m_map.emplace(name, std::move(criterion));

-  if (!success) 
+  if (!success)
  {
    NVBENCH_THROW(std::runtime_error,
                  "Stopping criterion \"{}\" is already registered.", name);
--- a/nvbench/detail/measure_cold.cu
+++ b/nvbench/detail/measure_cold.cu
@@ -43,8 +43,8 @@ measure_cold_base::measure_cold_base(state &exec_state)
 {
  if (m_min_samples > 0)
  {
-    m_cuda_times.reserve(m_min_samples);
-    m_cpu_times.reserve(m_min_samples);
+    m_cuda_times.reserve(static_cast<std::size_t>(m_min_samples));
+    m_cpu_times.reserve(static_cast<std::size_t>(m_min_samples));
  }
 }

--- a/nvbench/detail/measure_hot.cuh
+++ b/nvbench/detail/measure_hot.cuh
@@ -27,7 +27,7 @@

 #include <cuda_runtime.h>

-#include <utility>
+#include <algorithm>

 namespace nvbench
 {
--- a/nvbench/json_printer.cu
+++ b/nvbench/json_printer.cu
@@ -43,10 +43,14 @@
 #include <utility>
 #include <vector>

-#if defined __GNUC__ && !defined __clang__
-#include <experimental/filesystem>
-#else
+#if __has_include(<filesystem>)
 #include <filesystem>
+namespace fs = std::filesystem;
+#elif __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#else
+static_assert(false, "No <filesystem> or <experimental/filesystem> found.");
 #endif

 #if NVBENCH_CPP_DIALECT >= 2020
@@ -140,12 +144,6 @@ void json_printer::do_process_bulk_data_float64(state &state,

  if (hint == "sample_times")
  {
-#if defined __GNUC__ && !defined __clang__
-    namespace fs = std::experimental::filesystem;
-#else
-    namespace fs = std::filesystem;
-#endif
-
    nvbench::cpu_timer timer;
    timer.start();

--- a/testing/axes_metadata.cu
+++ b/testing/axes_metadata.cu
@@ -159,7 +159,7 @@ Axis: Other
  const std::string test = fmt::to_string(buffer);
  const auto diff =
    std::mismatch(ref.cbegin(), ref.cend(), test.cbegin(), test.cend());
-  const auto idx = diff.second - test.cbegin();
+  const auto idx = static_cast<std::size_t>(diff.second - test.cbegin());
  ASSERT_MSG(test == ref,
             "Differs at character {}.\n"
             "Expected:\n\"{}\"\n\n"
--- a/testing/criterion_manager.cu
+++ b/testing/criterion_manager.cu
@@ -46,7 +46,7 @@ void test_no_duplicates_are_allowed()
  bool exception_triggered = false;

  try {
-    nvbench::stopping_criterion_base& custom = manager.get_criterion("custom");
+    [[maybe_unused]] nvbench::stopping_criterion_base& _ = manager.get_criterion("custom");
  } catch(...) {
    exception_triggered = true;
  }
@@ -73,4 +73,3 @@ int main()
  test_standard_criteria_exist();
  test_no_duplicates_are_allowed();
 }
-
--- a/testing/enum_type_list.cu
+++ b/testing/enum_type_list.cu
@@ -24,6 +24,11 @@

 #include <type_traits>

+// If using gcc version 7, disable some tests to WAR a compiler bug. See NVIDIA/nvbench#39.
+#if defined(__GNUC__) && __GNUC__ == 7
+#define USING_GCC_7
+#endif
+
 enum class scoped_enum
 {
  val_1,
@@ -109,9 +114,11 @@ void test_int()

 void test_scoped_enum()
 {
+#ifndef USING_GCC_7
  ASSERT((
    std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1>,
                   nvbench::type_list<nvbench::enum_type<scoped_enum::val_1>>>));
+#endif
  ASSERT((
    std::is_same_v<nvbench::enum_type_list<scoped_enum::val_1,
                                           scoped_enum::val_2,
@@ -123,6 +130,7 @@ void test_scoped_enum()

 void test_unscoped_enum()
 {
+#ifndef USING_GCC_7
  ASSERT(
    (std::is_same_v<nvbench::enum_type_list<unscoped_val_1>,
                    nvbench::type_list<nvbench::enum_type<unscoped_val_1>>>));
@@ -132,6 +140,7 @@ void test_unscoped_enum()
      nvbench::type_list<nvbench::enum_type<unscoped_val_1>,
                         nvbench::enum_type<unscoped_val_2>,
                         nvbench::enum_type<unscoped_val_3>>>));
+#endif
 }

 void test_scoped_enum_type_strings()
--- a/testing/statistics.cu
+++ b/testing/statistics.cu
@@ -21,6 +21,7 @@

 #include "test_asserts.cuh"

+#include <algorithm>
 #include <vector>

 namespace statistics = nvbench::detail::statistics;
--- a/testing/stdrel_criterion.cu
+++ b/testing/stdrel_criterion.cu
@@ -32,7 +32,7 @@ void test_const()
  nvbench::detail::stdrel_criterion criterion;

  criterion.initialize(params);
-  for (int i = 0; i < 5; i++) 
+  for (int i = 0; i < 5; i++)
  { // nvbench wants at least 5 to compute the standard deviation
    criterion.add_measurement(42.0);
  }
@@ -43,7 +43,7 @@ std::vector<double> generate(double mean, double rel_std_dev, int size)
 {
  std::random_device rd;
  std::mt19937 gen(rd());
-  std::vector<nvbench::float64_t> v(size);
+  std::vector<nvbench::float64_t> v(static_cast<std::size_t>(size));
  std::normal_distribution<nvbench::float64_t> dist(mean, mean * rel_std_dev);
  std::generate(v.begin(), v.end(), [&]{ return dist(gen); });
  return v;
@@ -61,7 +61,7 @@ void test_stdrel()
  nvbench::detail::stdrel_criterion criterion;
  criterion.initialize(params);

-  for (nvbench::float64_t measurement: generate(mean, max_noise / 2, size)) 
+  for (nvbench::float64_t measurement: generate(mean, max_noise / 2, size))
  {
    criterion.add_measurement(measurement);
  }
@@ -70,7 +70,7 @@ void test_stdrel()
  params.set_float64("max-noise", max_noise);
  criterion.initialize(params);

-  for (nvbench::float64_t measurement: generate(mean, max_noise * 2, size)) 
+  for (nvbench::float64_t measurement: generate(mean, max_noise * 2, size))
  {
    criterion.add_measurement(measurement);
  }