Add initial CI, presets, and devcontainers.

2026-04-29 11:11:15 +00:00 · 2024-04-04 21:07:27 +00:00
parent 42d99a5753
commit eb5940c64f
62 changed files with 3185 additions and 662 deletions
--- a/ci/axis/cpu.yml
+++ b/ci/axis/cpu.yml
@@ -1,38 +0,0 @@
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-SDK_TYPE:
-  - cuda
-
-SDK_VER:
-  - 11.5.1-devel
-
-OS_TYPE:
-  - ubuntu
-
-OS_VER:
-  - 20.04
-
-CXX_TYPE:
-  - clang
-  - gcc
-
-CXX_VER:
-  - 5
-  - 6
-  - 7
-  - 8
-  - 9
-  - 10
-  - 11
-  - 12
-
-exclude:
-  - CXX_TYPE: clang
-    CXX_VER: 5
-  - CXX_TYPE: clang
-    CXX_VER: 6
-  - CXX_TYPE: gcc
-    CXX_VER: 12
--- a/ci/axis/gpu.yml
+++ b/ci/axis/gpu.yml
@@ -1,30 +0,0 @@
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-SDK_TYPE:
-  - cuda
-
-SDK_VER:
-  - 11.5.1-devel
-
-OS_TYPE:
-  - ubuntu
-
-OS_VER:
-  - 20.04
-
-CXX_TYPE:
-  - clang
-  - gcc
-
-CXX_VER:
-  - 11
-  - 12
-
-exclude:
-  - CXX_TYPE: clang
-    CXX_VER: 11
-  - CXX_TYPE: gcc
-    CXX_VER: 12
--- a/ci/build_common.sh
+++ b/ci/build_common.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+
+set -eo pipefail
+
+# Ensure the script is being executed in its containing directory
+cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
+
+# Script defaults
+HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++`
+CXX_STANDARD=17
+CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc`
+CUDA_ARCHS= # Empty, use presets by default.
+GLOBAL_CMAKE_OPTIONS=()
+DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks.
+
+# usage: Print the command-line help text for this script and exit with status 1.
+# Invoked for -h/-help/--help, for unrecognized options, and when -cmake-options
+# is given without a value.
+function usage {
+    echo "Usage: $0 [OPTIONS]"
+    echo
+    echo "The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores."
+    echo
+    echo "Options:"
+    echo "  -v/--verbose: enable shell echo for debugging"
+    echo "  -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)"
+    echo "  -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)"
+    echo "  -std: CUDA/C++ standard (Defaults to 17)"
+    echo "  -arch: Target CUDA arches, e.g. \"60-real;70;80-virtual\" (Defaults to value in presets file)"
+    echo "  -cmake-options: Additional options to pass to CMake"
+    echo
+    echo "Examples:"
+    echo "  $ PARALLEL_LEVEL=8 $0"
+    echo "  $ PARALLEL_LEVEL=8 $0 -cxx g++-9"
+    echo "  $ $0 -cxx clang++-8"
+    echo "  $ $0 -cxx g++-8 -std 20 -arch 80-real -v -cuda /usr/local/bin/nvcc"
+    echo "  $ $0 -cmake-options \"-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors\""
+    exit 1 # Non-zero even when help was requested explicitly.
+}
+
+# Parse options
+
+# Copy the args into a temporary array, since we will modify them and
+# the parent script may still need them.
+args=("$@")
+while [ "${#args[@]}" -ne 0 ]; do
+    case "${args[0]}" in
+    -v | --verbose) VERBOSE=1; args=("${args[@]:1}");;
+    -cxx)  HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");;
+    -std)  CXX_STANDARD="${args[1]}";  args=("${args[@]:2}");;
+    -cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");;
+    -arch) CUDA_ARCHS="${args[1]}";    args=("${args[@]:2}");;
+    -disable-benchmarks) DISABLE_CUB_BENCHMARKS=1; args=("${args[@]:1}");;
+    -cmake-options)
+        if [ -n "${args[1]}" ]; then
+            IFS=' ' read -ra split_args <<< "${args[1]}"
+            GLOBAL_CMAKE_OPTIONS+=("${split_args[@]}")
+            args=("${args[@]:2}")
+        else
+            echo "Error: No arguments provided for -cmake-options"
+            usage
+            exit 1
+        fi
+        ;;
+    -h | -help | --help) usage ;;
+    *) echo "Unrecognized option: ${args[0]}"; usage ;;
+    esac
+done
+
+# Convert to full paths (quoted so a compiler path containing spaces is not word-split):
+HOST_COMPILER=$(which "${HOST_COMPILER}")
+CUDA_COMPILER=$(which "${CUDA_COMPILER}")
+
+if [[ -n "${CUDA_ARCHS}" ]]; then
+    GLOBAL_CMAKE_OPTIONS+=("-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}")
+fi
+
+if [ $VERBOSE ]; then
+    set -x
+fi
+
+# Begin processing unsets after option parsing
+set -u
+
+readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)}
+
+if [ -z ${CCCL_BUILD_INFIX+x} ]; then
+    CCCL_BUILD_INFIX=""
+fi
+
+# Presets will be configured in this directory:
+BUILD_DIR="../build/${CCCL_BUILD_INFIX}"
+
+# The most recent build will always be symlinked to cccl/build/latest
+mkdir -p "$BUILD_DIR"
+rm -f ../build/latest
+ln -sf "$BUILD_DIR" ../build/latest
+
+# Now that BUILD_DIR exists, use readlink to canonicalize the path:
+BUILD_DIR=$(readlink -f "${BUILD_DIR}")
+
+# Prepare environment for CMake:
+export CMAKE_BUILD_PARALLEL_LEVEL="${PARALLEL_LEVEL}"
+export CTEST_PARALLEL_LEVEL="1"
+export CXX="${HOST_COMPILER}"
+export CUDACXX="${CUDA_COMPILER}"
+export CUDAHOSTCXX="${HOST_COMPILER}"
+export CXX_STANDARD
+
+source ./pretty_printing.sh
+
+# print_environment_details: Log the working directory, the build-related
+# variables, the current git commit, and `nvidia-smi` output (when available)
+# inside a single "Environment Details" log group.
+print_environment_details() {
+  begin_group "⚙️ Environment Details"
+
+  echo "pwd=$(pwd)"
+
+  # NOTE(review): NVCC_VERSION is not assigned anywhere in the visible part of this
+  # script, so it prints as "(undefined)" unless the environment provides it.
+  # NOTE(review): GLOBAL_CMAKE_OPTIONS is an array; print_var_values dereferences it
+  # by name, which yields only its first element — confirm this is acceptable.
+  print_var_values \
+      BUILD_DIR \
+      CXX_STANDARD \
+      CXX \
+      CUDACXX \
+      CUDAHOSTCXX \
+      NVCC_VERSION \
+      CMAKE_BUILD_PARALLEL_LEVEL \
+      CTEST_PARALLEL_LEVEL \
+      CCCL_BUILD_INFIX \
+      GLOBAL_CMAKE_OPTIONS
+
+  echo "Current commit is:"
+  # Fall back to a message instead of failing when git cannot show a commit.
+  git log -1 || echo "Not a repository"
+
+  if command -v nvidia-smi &> /dev/null; then
+    nvidia-smi
+  else
+    echo "nvidia-smi not found"
+  fi
+
+  end_group "⚙️ Environment Details"
+}
+
+# fail_if_no_gpu: Exit the script with status 1, after printing an error to
+# stderr, if running `nvidia-smi` fails (including when it is not installed).
+fail_if_no_gpu() {
+    if ! nvidia-smi &> /dev/null; then
+        echo "Error: No NVIDIA GPU detected. Please ensure you have an NVIDIA GPU installed and the drivers are properly configured." >&2
+        exit 1
+    fi
+}
+
+# print_test_time_summary: Print the longest-running CTest steps from a ctest log.
+# Usage: print_test_time_summary <ctest_log>
+#   ctest_log: Path to the log file to summarize. Nothing is printed if the
+#              file does not exist.
+# Note: assigns the (non-local) variable ctest_log.
+function print_test_time_summary()
+{
+    ctest_log=${1}
+
+    # The path is quoted everywhere below so that a log location containing
+    # spaces is treated as a single argument.
+    if [ -f "${ctest_log}" ]; then
+        begin_group "⏱️ Longest Test Steps"
+        # Only print the full output in CI:
+        if [ -n "${GITHUB_ACTIONS:-}" ]; then
+            cmake "-DLOGFILE=${ctest_log}" -P ../cmake/PrintCTestRunTimes.cmake
+        else
+            cmake "-DLOGFILE=${ctest_log}" -P ../cmake/PrintCTestRunTimes.cmake | head -n 15
+        fi
+        end_group "⏱️ Longest Test Steps"
+    fi
+}
+
+# configure_preset: Run the CMake configure step for a preset from the parent directory.
+# Usage: configure_preset <build_name> <preset> <cmake_options>
+#   build_name: Human-readable name used in the log group title.
+#   preset: Name of the CMake configure preset.
+#   cmake_options: Space-separated string of extra CMake arguments (may be empty).
+# Returns the status captured after run_command (stored in the non-local `status`).
+function configure_preset()
+{
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local CMAKE_OPTIONS=$3
+    local GROUP_NAME="🛠️  CMake Configure ${BUILD_NAME}"
+
+    pushd .. > /dev/null
+    # $CMAKE_OPTIONS is left unquoted so it word-splits into separate arguments
+    # and an empty string expands to nothing. run_command is defined outside this file.
+    run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE "${GLOBAL_CMAKE_OPTIONS[@]}" $CMAKE_OPTIONS
+    status=$?
+    popd > /dev/null
+    return $status
+}
+
+# build_preset: Build a CMake preset and report sccache statistics (plus ninja
+# build times when running under GitHub Actions).
+# Usage: build_preset <build_name> <preset>
+#   build_name: Human-readable name used in the log group title.
+#   preset: Name of the CMake build preset.
+# Returns the status captured after run_command (stored in the non-local `status`).
+function build_preset() {
+    local BUILD_NAME=$1
+    local PRESET=$2
+    # NOTE(review): green/red are not referenced in this function; kept in case the
+    # sourced sccache_stats.sh reads them — confirm before removing.
+    local green="1;32"
+    local red="1;31"
+    local GROUP_NAME="🏗️  Build ${BUILD_NAME}"
+
+    source "./sccache_stats.sh" "start"
+
+    pushd .. > /dev/null
+    run_command "$GROUP_NAME" cmake --build "--preset=$PRESET" -v
+    status=$?
+    popd > /dev/null
+
+    minimal_sccache_stats=$(source "./sccache_stats.sh" "end")
+
+    # Only print detailed stats in actions workflow
+    if [ -n "${GITHUB_ACTIONS:-}" ]; then
+        begin_group "💲 sccache stats"
+        echo "${minimal_sccache_stats}"
+        sccache -s
+        end_group
+
+        begin_group "🥷 ninja build times"
+        # Inner quotes are escaped so they are printed instead of terminating the string.
+        echo "The \"weighted\" time is the elapsed time of each build step divided by the number
+              of tasks that were running in parallel. This makes it an excellent approximation
+              of how \"important\" a slow step was. A link that is entirely or mostly serialized
+              will have a weighted time that is the same or similar to its elapsed time. A
+              compile that runs in parallel with 999 other compiles will have a weighted time
+              that is tiny."
+        ./ninja_summary.py -C "${BUILD_DIR}/${PRESET}"
+        end_group
+    else
+      echo "${minimal_sccache_stats}"
+    fi
+
+    return $status
+}
+
+# test_preset: Run the CTest preset from the parent directory, writing its log
+# to ${BUILD_DIR}/log/ctest/<preset>, then print a summary of the slowest tests.
+# Usage: test_preset <build_name> <preset>
+#   build_name: Human-readable name used in the log group title.
+#   preset: Name of the CTest preset.
+# Exits via fail_if_no_gpu when nvidia-smi cannot be run.
+# Returns the status captured after run_command (stored in the non-local `status`).
+function test_preset()
+{
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local GROUP_NAME="🚀  Test ${BUILD_NAME}"
+
+    fail_if_no_gpu
+
+
+    ctest_log_dir="${BUILD_DIR}/log/ctest"
+    ctest_log="${ctest_log_dir}/${PRESET}"
+    mkdir -p "${ctest_log_dir}"
+
+    pushd .. > /dev/null
+    run_command "$GROUP_NAME" ctest --output-log "${ctest_log}" "--preset=$PRESET"
+    status=$?
+    popd > /dev/null
+
+    # Quoted so a log path containing spaces reaches the helper as one argument.
+    print_test_time_summary "${ctest_log}"
+
+    return $status
+}
+
+# configure_and_build_preset: Run configure_preset followed by build_preset.
+# Usage: configure_and_build_preset <build_name> <preset> <cmake_options>
+#   build_name: Human-readable name used in the log group titles.
+#   preset: Preset name passed to both the configure and build steps.
+#   cmake_options: Extra CMake arguments forwarded to configure_preset only.
+# The function's return status is that of build_preset, its last command.
+function configure_and_build_preset()
+{
+    local BUILD_NAME=$1
+    local PRESET=$2
+    local CMAKE_OPTIONS=$3
+
+    configure_preset "$BUILD_NAME" "$PRESET" "$CMAKE_OPTIONS"
+    build_preset "$BUILD_NAME" "$PRESET"
+}
--- a/ci/build_nvbench.sh
+++ b/ci/build_nvbench.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# Configure and build NVBench using the "nvbench-cpp<std>" CMake preset.
+# Command-line options are parsed by build_common.sh when it is sourced.
+
+source "$(dirname "$0")/build_common.sh"
+
+print_environment_details
+
+PRESET="nvbench-cpp$CXX_STANDARD"
+
+# No preset-specific CMake options for this build.
+CMAKE_OPTIONS=""
+
+configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
+
+# NOTE(review): print_time_summary is not defined in build_common.sh; presumably it
+# comes from pretty_printing.sh (sourced by build_common.sh) — confirm.
+print_time_summary
--- a/ci/common/build.bash
+++ b/ci/common/build.bash
@@ -1,231 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench build script for gpuCI
-################################################################################
-
-set -e
-
-# append variable value
-# Appends ${value} to ${variable}, adding a space before ${value} if
-# ${variable} is not empty.
-function append {
-  tmp="${!1:+${!1} }${2}"
-  eval "${1}=\${tmp}"
-}
-
-# log args...
-# Prints out ${args[*]} with a gpuCI log prefix and a newline before and after.
-function log() {
-  printf "\n>>>> %s\n\n" "${*}"
-}
-
-# print_with_trailing_blank_line args...
-# Prints ${args[*]} with one blank line following, preserving newlines within
-# ${args[*]} but stripping any preceding ${args[*]}.
-function print_with_trailing_blank_line {
-  printf "%s\n\n" "${*}"
-}
-
-# echo_and_run name args...
-# Echo ${args[@]}, then execute ${args[@]}
-function echo_and_run {
-  echo "${1}: ${@:2}"
-  ${@:2}
-}
-
-# echo_and_run_timed name args...
-# Echo ${args[@]}, then execute ${args[@]} and report how long it took,
-# including ${name} in the output of the time.
-function echo_and_run_timed {
-  echo "${@:2}"
-  TIMEFORMAT=$'\n'"${1} Time: %lR"
-  time ${@:2}
-}
-
-# join_delimit <delimiter> [value [value [...]]]
-# Combine all values into a single string, separating each by a single character
-# delimiter. Eg:
-# foo=(bar baz kramble)
-# joined_foo=$(join_delimit "|" "${foo[@]}")
-# echo joined_foo # "bar|baz|kramble"
-function join_delimit {
-  local IFS="${1}"
-  shift
-  echo "${*}"
-}
-
-################################################################################
-# VARIABLES - Set up bash and environmental variables.
-################################################################################
-
-# Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc.
-source /etc/cccl.bashrc
-
-# Set path.
-export PATH=/usr/local/cuda/bin:${PATH}
-
-# Set home to the job's workspace.
-export HOME=${WORKSPACE}
-
-# Switch to the build directory.
-cd ${WORKSPACE}
-mkdir -p build
-cd build
-
-# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate:
-rm -f .ninja_log
-
-if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then
-  CMAKE_BUILD_TYPE="Release"
-fi
-
-CMAKE_BUILD_FLAGS="--"
-
-# The Docker image sets up `${CXX}` and `${CUDACXX}`.
-append CMAKE_FLAGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
-append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER='${CUDACXX}'"
-
-if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
-  echo "nvc++ not supported."
-  exit 1
-else
-  if [[ "${CXX_TYPE}" == "icc" ]]; then
-    echo "icc not supported."
-    exit 1
-  fi
-  # We're using NVCC so we need to set the host compiler.
-  append CMAKE_FLAGS "-DCMAKE_CXX_COMPILER='${CXX}'"
-  append CMAKE_FLAGS "-DCMAKE_CUDA_HOST_COMPILER='${CXX}'"
-  append CMAKE_FLAGS "-G Ninja"
-  # Don't stop on build failures.
-  append CMAKE_BUILD_FLAGS "-k0"
-fi
-
-if [[ -n "${PARALLEL_LEVEL}" ]]; then
-  DETERMINE_PARALLELISM_FLAGS="-j ${PARALLEL_LEVEL}"
-fi
-
-WSL=0
-if [[ $(grep -i microsoft /proc/version) ]]; then
-  echo "Windows Subsystem for Linux detected."
-  WSL=1
-fi
-export WSL
-
-#append CMAKE_FLAGS "-DCMAKE_CUDA_ARCHITECTURES=all"
-
-append CMAKE_FLAGS "-DNVBench_ENABLE_EXAMPLES=ON"
-append CMAKE_FLAGS "-DNVBench_ENABLE_TESTING=ON"
-append CMAKE_FLAGS "-DNVBench_ENABLE_CUPTI=ON"
-append CMAKE_FLAGS "-DNVBench_ENABLE_WERROR=ON"
-
-# These consume a lot of time and don't currently have
-# any value as regression tests.
-append CMAKE_FLAGS "-DNVBench_ENABLE_DEVICE_TESTING=OFF"
-
-# NVML doesn't work under WSL
-if [[ ${WSL} -eq 0 ]]; then
-  append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=ON"
-else
-  append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=OFF"
-fi
-
-if [[ -n "${@}" ]]; then
-  append CMAKE_BUILD_FLAGS "${@}"
-fi
-
-append CTEST_FLAGS "--output-on-failure"
-
-# Export variables so they'll show up in the logs when we report the environment.
-export CMAKE_FLAGS
-export CMAKE_BUILD_FLAGS
-export CTEST_FLAGS
-
-################################################################################
-# ENVIRONMENT - Configure and print out information about the environment.
-################################################################################
-
-log "Determine system topology..."
-
-# Set `${PARALLEL_LEVEL}` if it is unset; otherwise, this just reports the
-# system topology.
-source ${WORKSPACE}/ci/common/determine_build_parallelism.bash ${DETERMINE_PARALLELISM_FLAGS}
-
-log "Get environment..."
-
-env | sort
-
-log "Check versions..."
-
-# We use sed and echo below to ensure there is always one and only trailing
-# line following the output from each tool.
-
-${CXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
-
-echo
-
-${CUDACXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
-
-echo
-
-cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
-
-echo
-
-if [[ "${BUILD_TYPE}" == "gpu" ]]; then
-  nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/'
-fi
-
-################################################################################
-# BUILD
-################################################################################
-
-log "Configure..."
-
-echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS}
-configure_status=$?
-
-log "Build..."
-
-# ${PARALLEL_LEVEL} needs to be passed after we run
-# determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}.
-set +e # Don't stop on build failures.
-echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL}
-build_status=$?
-set -e
-
-################################################################################
-# TEST - Run examples and tests.
-################################################################################
-
-log "Test..."
-
-(
-  # Make sure test_status captures ctest, not tee:
-  # https://stackoverflow.com/a/999259/11130318
-  set -o pipefail
-  echo_and_run_timed "Test" ctest ${CTEST_FLAGS} -j ${PARALLEL_LEVEL} | tee ctest_log
-)
-
-test_status=$?
-
-################################################################################
-# SUMMARY - Print status of each step and exit with failure if needed.
-################################################################################
-
-log "Summary:"
-echo "- Configure Error Code: ${configure_status}"
-echo "- Build Error Code: ${build_status}"
-echo "- Test Error Code: ${test_status}"
-
-if [[ "${configure_status}" != "0" ]] || \
-   [[ "${build_status}" != "0" ]] || \
-   [[ "${test_status}" != "0" ]]; then
-     exit 1
-fi
--- a/ci/common/determine_build_parallelism.bash
+++ b/ci/common/determine_build_parallelism.bash
@@ -1,119 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-function usage {
-  echo "Usage: ${0} [flags...]"
-  echo
-  echo "Examine the system topology to determine a reasonable amount of build"
-  echo "parallelism."
-  echo
-  echo "Exported variables:"
-  echo "  \${LOGICAL_CPUS}          : Logical processors (e.g. threads)."
-  echo "  \${PHYSICAL_CPUS}         : Physical processors (e.g. cores)."
-  echo "  \${TOTAL_MEM}             : Total system memory [GB]."
-  echo "  \${MAX_THREADS_PER_CORE}  : Maximum threads per core allowed."
-  echo "  \${MIN_MEMORY_PER_THREAD} : Minimum memory [GB] per thread allowed."
-  echo "  \${CPU_BOUND_THREADS}     : # of build threads constrained by processors."
-  echo "  \${MEM_BOUND_THREADS}     : # of build threads constrained by memory [GB]."
-  echo "  \${PARALLEL_LEVEL}        : Determined # of build threads."
-  echo "  \${MEM_PER_THREAD}        : Memory [GB] per build thread."
-  echo
-  echo "-h, -help, --help"
-  echo "  Print this message."
-  echo
-  echo "-q, --quiet"
-  echo "  Print nothing and only export variables."
-  echo
-  echo "-j <threads>, --jobs <threads>"
-  echo "  Explicitly set the number of build threads to use."
-  echo
-  echo "--max-threads-per-core <threads>"
-  echo "  Specify the maximum threads per core allowed (default: ${MAX_THREADS_PER_CORE} [threads/core])."
-  echo
-  echo "--min-memory-per-thread <gigabytes>"
-  echo "  Specify the minimum memory per thread allowed (default: ${MIN_MEMORY_PER_THREAD} [GBs/thread])."
-
-  exit -3
-}
-
-QUIET=0
-
-export MAX_THREADS_PER_CORE=2
-export MIN_MEMORY_PER_THREAD=1 # [GB]
-
-while test ${#} != 0
-do
-  case "${1}" in
-  -h) ;&
-  -help) ;&
-  --help) usage ;;
-  -q) ;&
-  --quiet) QUIET=1 ;;
-  -j) ;&
-  --jobs)
-    shift # The next argument is the number of threads.
-    PARALLEL_LEVEL="${1}"
-    ;;
-  --max-threads-per-core)
-    shift # The next argument is the number of threads per core.
-    MAX_THREADS_PER_CORE="${1}"
-    ;;
-  --min-memory-per-thread)
-    shift # The next argument is the amount of memory per thread.
-    MIN_MEMORY_PER_THREAD="${1}"
-    ;;
-  esac
-  shift
-done
-
-# https://stackoverflow.com/a/23378780
-if [ $(uname) == "Darwin" ]; then
-  export LOGICAL_CPUS=$(sysctl -n hw.logicalcpu_max)
-  export PHYSICAL_CPUS=$(sysctl -n hw.physicalcpu_max)
-else
-  export LOGICAL_CPUS=$(lscpu -p | egrep -v '^#' | wc -l)
-  export PHYSICAL_CPUS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
-fi
-
-export TOTAL_MEM=$(awk "BEGIN { printf \"%0.4g\", $(grep MemTotal /proc/meminfo | awk '{ print $2 }') / (1024 * 1024) }")
-
-export CPU_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${PHYSICAL_CPUS} * ${MAX_THREADS_PER_CORE}) }")
-export MEM_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${TOTAL_MEM} / ${MIN_MEMORY_PER_THREAD}) }")
-
-if [[ -z "${PARALLEL_LEVEL}" ]]; then
-  # Pick the smaller of the two as the default.
-  if [[ "${MEM_BOUND_THREADS}" -lt "${CPU_BOUND_THREADS}" ]]; then
-    export PARALLEL_LEVEL=${MEM_BOUND_THREADS}
-  else
-    export PARALLEL_LEVEL=${CPU_BOUND_THREADS}
-  fi
-else
-  EXPLICIT_PARALLEL_LEVEL=1
-fi
-
-# This can be a floating point number.
-export MEM_PER_THREAD=$(awk "BEGIN { printf \"%.04g\", ${TOTAL_MEM} / ${PARALLEL_LEVEL} }")
-
-if [[ "${QUIET}" == 0 ]]; then
-  echo    "Logical CPUs:           ${LOGICAL_CPUS} [threads]"
-  echo    "Physical CPUs:          ${PHYSICAL_CPUS} [cores]"
-  echo    "Total Mem:              ${TOTAL_MEM} [GBs]"
-  echo    "Max Threads Per Core:   ${MAX_THREADS_PER_CORE} [threads/core]"
-  echo    "Min Memory Per Threads: ${MIN_MEMORY_PER_THREAD} [GBs/thread]"
-  echo    "CPU Bound Threads:      ${CPU_BOUND_THREADS} [threads]"
-  echo    "Mem Bound Threads:      ${MEM_BOUND_THREADS} [threads]"
-
-  echo -n "Parallel Level:         ${PARALLEL_LEVEL} [threads]"
-  if [[ -n "${EXPLICIT_PARALLEL_LEVEL}" ]]; then
-    echo " (explicitly set)"
-  else
-    echo
-  fi
-
-  echo    "Mem Per Thread:         ${MEM_PER_THREAD} [GBs/thread]"
-fi
-
--- a/ci/cpu/build.bash
+++ b/ci/cpu/build.bash
@@ -1,14 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench build script for gpuCI (CPU-only)
-################################################################################
-
-export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
-
-source ${WORKSPACE}/ci/common/build.bash
--- a/ci/gpu/build.bash
+++ b/ci/gpu/build.bash
@@ -1,14 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench build script for gpuCI (heterogeneous)
-################################################################################
-
-export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
-
-source ${WORKSPACE}/ci/common/build.bash
--- a/ci/local/build.bash
+++ b/ci/local/build.bash
@@ -1,215 +0,0 @@
-#! /usr/bin/env bash
-
-# Copyright (c) 2018-2020 NVIDIA Corporation
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# Released under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-
-################################################################################
-# NVBench local containerized build script
-################################################################################
-
-function usage {
-  echo "Usage: ${0} [flags...] [cmake-targets...]"
-  echo
-  echo "Build and test your local repository using a gpuCI Docker image."
-  echo "If CMake targets are specified, only those targets are built and tested."
-  echo "Otherwise, everything is built and tested."
-  echo
-  echo "-h, -help, --help"
-  echo "  Print this message."
-  echo
-  echo "-r <path>, --repository <path>"
-  echo "  Path to the repository (default: ${REPOSITORY_PATH})."
-  echo
-  echo "-i <image>, --image <image>"
-  echo "  Docker image to use (default: ${IMAGE})"
-  echo
-  echo "-l, --local-image"
-  echo "  Use the local version of the image instead of pulling from Docker hub."
-  echo
-  echo "-s, --shell-only"
-  echo "  Skip building and testing and launch an interactive shell instead."
-  echo
-  echo "-d, --disable-gpus"
-  echo "  Don't start the container with the NVIDIA runtime and GPUs attached."
-  echo
-  echo "-c, --clean"
-  echo "  If the build directory already exists, delete it."
-  echo
-  echo "-j <threads>, --jobs <threads>"
-  echo "  Number of threads to use when building (default: inferred)."
-  echo
-  echo "-b <type>, --cmake-build-type <plan>"
-  echo "  CMake build type to use, either Release, RelWithDebInfo, or Debug"
-  echo "  (default: ${CMAKE_BUILD_TYPE})."
-  echo
-
-  exit -3
-}
-
-SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
-
-REPOSITORY_PATH=$(realpath ${SCRIPT_PATH}/../..)
-
-################################################################################
-# FLAGS - Process command line flags.
-################################################################################
-
-IMAGE="gpuci/cccl:cuda11.5.1-devel-ubuntu20.04-gcc9"
-
-LOCAL_IMAGE=0
-
-SHELL_ONLY=0
-
-BUILD_TYPE="gpu"
-
-CLEAN=0
-
-PARALLEL_LEVEL=""
-
-CMAKE_BUILD_TYPE="Release"
-
-TARGETS=""
-
-while test ${#} != 0
-do
-  case "${1}" in
-  -h) ;&
-  -help) ;&
-  --help) usage ;;
-  -r) ;&
-  --repository)
-    shift # The next argument is the path.
-    REPOSITORY_PATH="${1}"
-    ;;
-  -i) ;&
-  --image)
-    shift # The next argument is the image.
-    IMAGE="${1}"
-    ;;
-  -l) ;&
-  --local-image) LOCAL_IMAGE=1 ;;
-  -s) ;&
-  --shell-only) SHELL_ONLY=1 ;;
-  -d) ;&
-  --disable-gpus) BUILD_TYPE="cpu" ;;
-  -c) ;&
-  --clean) CLEAN=1 ;;
-  -j) ;&
-  --jobs)
-    shift # The next argument is the number of threads.
-    PARALLEL_LEVEL="${1}"
-    ;;
-  -b) ;&
-  --cmake-build-type)
-    shift # The next argument is the build type.
-    CMAKE_BUILD_TYPE="${1}"
-    ;;
-  *)
-    TARGETS="${TARGETS:+${TARGETS} }${1}"
-    ;;
-  esac
-  shift
-done
-
-################################################################################
-# PATHS - Setup paths for the container.
-################################################################################
-
-# ${REPOSITORY_PATH} is the local filesystem path to the Git repository being
-# built and tested. It can be set with the --repository flag.
-#
-# ${BUILD_PATH} is the local filesystem path that will be used for the build. It
-# is named after the image name, allowing multiple image builds to coexist on
-# the local filesystem.
-#
-# ${REPOSITORY_PATH_IN_CONTAINER} is the location of ${REPOSITORY_PATH} inside
-# the container.
-#
-# ${BUILD_PATH_IN_CONTAINER} is the location of ${BUILD_PATH} inside the
-# container.
-
-BUILD_PATH=${REPOSITORY_PATH}/build_$(echo "$(basename "${IMAGE}")" | sed -e 's/:/_/g' | sed -e 's/-/_/g')
-
-if [[ "${CLEAN}" != 0 ]]; then
-  rm -rf ${BUILD_PATH}
-fi
-
-mkdir -p ${BUILD_PATH}
-
-BASE_PATH_IN_CONTAINER="/cccl"
-
-REPOSITORY_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")"
-
-BUILD_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")/build"
-
-################################################################################
-# ENVIRONMENT - Setup the thunk build script that will be run by the container.
-################################################################################
-
-# We have to run `ldconfig` to rebuild `ld.so.cache` to work around this
-# failure on Debian: https://github.com/NVIDIA/nvidia-docker/issues/1399
-
-COMMAND="sudo ldconfig; sudo ldconfig"
-if [[ "${SHELL_ONLY}" != 0 ]]; then
-  COMMAND="${COMMAND}; bash"
-else
-  COMMAND="${COMMAND}; ${REPOSITORY_PATH_IN_CONTAINER}/ci/common/build.bash ${TARGETS} || bash"
-fi
-
-################################################################################
-# GPU - Setup GPUs.
-################################################################################
-
-# Note: We always start docker with --gpus, even for cpu builds. Otherwise
-# libcuda.so.1 is not present and no NVBench tests are able to run.
-
-# Limit GPUs available to the container based on ${CUDA_VISIBLE_DEVICES}.
-if [[ -z "${CUDA_VISIBLE_DEVICES}" ]]; then
-  VISIBLE_DEVICES="all"
-else
-  VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}"
-fi
-
-DOCKER_MAJOR_VER=$(docker -v | sed 's/[^[0-9]*\([0-9]*\).*/\1/')
-GPU_OPTS="--gpus device=${VISIBLE_DEVICES}"
-if [[ "${DOCKER_MAJOR_VER}" -lt 19 ]]
-then
-  GPU_OPTS="--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES='${VISIBLE_DEVICES}'"
-fi
-
-################################################################################
-# LAUNCH - Pull and launch the container.
-################################################################################
-
-#NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia)
-NVIDIA_DOCKER_INSTALLED=1 # Broken on WSL
-if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then
-  echo "NVIDIA Docker not found, please install it: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce"
-  exit -4
-fi
-
-if [[ "${LOCAL_IMAGE}" == 0 ]]; then
-  docker pull "${IMAGE}"
-fi
-
-docker run --rm -it ${GPU_OPTS} \
-  --cap-add=SYS_PTRACE \
-  --user "$(id -u)":"$(id -g)" \
-  -v "${REPOSITORY_PATH}":"${REPOSITORY_PATH_IN_CONTAINER}" \
-  -v "${BUILD_PATH}":"${BUILD_PATH_IN_CONTAINER}" \
-  -v /etc/passwd:/etc/passwd:ro \
-  -v /etc/group:/etc/group:ro \
-  -v /etc/subuid:/etc/subuid:ro \
-  -v /etc/subgid:/etc/subgid:ro \
-  -v /etc/shadow:/etc/shadow:ro \
-  -v /etc/gshadow:/etc/gshadow:ro \
-  -e "WORKSPACE=${REPOSITORY_PATH_IN_CONTAINER}" \
-  -e "BUILD_TYPE=${BUILD_TYPE}" \
-  -e "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" \
-  -e "COVERAGE_PLAN=${COVERAGE_PLAN}" \
-  -e "PARALLEL_LEVEL=${PARALLEL_LEVEL}" \
-  -w "${BUILD_PATH_IN_CONTAINER}" \
-  "${IMAGE}" bash -c "${COMMAND}"
-
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -0,0 +1,85 @@
+
+cuda_prev_min: &cuda_prev_min '11.1'
+cuda_prev_max:  &cuda_prev_max  '11.8'
+cuda_curr: &cuda_curr '12.4'
+
+# The GPUs to test on
+gpus:
+  - 'a100'
+  - 'v100'
+
+# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
+devcontainer_version: '24.06'
+
+# gcc compiler configurations
+gcc6: &gcc6 { name: 'gcc', version: '6', exe: 'g++' }
+gcc7: &gcc7 { name: 'gcc', version: '7', exe: 'g++' }
+gcc8: &gcc8 { name: 'gcc', version: '8', exe: 'g++' }
+gcc9: &gcc9 { name: 'gcc', version: '9', exe: 'g++' }
+gcc10: &gcc10 { name: 'gcc', version: '10', exe: 'g++' }
+gcc11: &gcc11 { name: 'gcc', version: '11', exe: 'g++' }
+gcc12: &gcc12 { name: 'gcc', version: '12', exe: 'g++' }
+gcc-oldest: &gcc-oldest { name: 'gcc', version: '6', exe: 'g++' }
+gcc-newest: &gcc-newest { name: 'gcc', version: '12', exe: 'g++' }
+
+# LLVM Compiler configurations
+llvm9: &llvm9 { name: 'llvm', version: '9', exe: 'clang++' }
+llvm10: &llvm10 { name: 'llvm', version: '10', exe: 'clang++' }
+llvm11: &llvm11 { name: 'llvm', version: '11', exe: 'clang++' }
+llvm12: &llvm12 { name: 'llvm', version: '12', exe: 'clang++' }
+llvm13: &llvm13 { name: 'llvm', version: '13', exe: 'clang++' }
+llvm14: &llvm14 { name: 'llvm', version: '14', exe: 'clang++' }
+llvm15: &llvm15 { name: 'llvm', version: '15', exe: 'clang++' }
+llvm16: &llvm16 { name: 'llvm', version: '16', exe: 'clang++' }
+llvm-oldest: &llvm-oldest { name: 'llvm', version: '9', exe: 'clang++' }
+llvm-newest: &llvm-newest { name: 'llvm', version: '16', exe: 'clang++' }
+
+# MSVC configs
+msvc2017: &msvc2017 { name: 'cl', version: '14.16', exe: 'cl++' }
+msvc2019: &msvc2019 { name: 'cl', version: '14.29', exe: 'cl++' }
+msvc2022: &msvc2022 { name: 'cl', version: '14.39', exe: 'cl++' }
+
+# oneAPI configs
+oneapi: &oneapi { name: 'oneapi', version: '2023.2.0', exe: 'icpc' }
+
+# Each environment below will generate a unique build/test job
+# See the "compute-matrix" job in the workflow for how this is parsed and used
+# cuda: The CUDA Toolkit version
+# os: The operating system used
+# cpu: The CPU architecture
+# compiler: The compiler to use
+#   name: The compiler name
+#   version: The compiler version
+#   exe: The unversioned compiler binary name
+# std: The C++ standards to build for
+#    This field is unique as it will generate an independent build/test job for each value
+
+# Configurations that will run for every PR
+pull_request:
+  nvcc:
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_min, os: 'windows2022', cpu: 'amd64', compiler: *msvc2017, std: [17],     jobs: ['build']}
+    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17],     jobs: ['build'], extra_build_args: '-cmake-options -DCMAKE_CUDA_ARCHITECTURES=90'}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9,     std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17],     jobs: ['build'], extra_build_args: '-cmake-options -DCMAKE_CUDA_ARCHITECTURES=90a'}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17],     jobs: ['build', 'test']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'arm64', compiler: *gcc12,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9,    std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10,   std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16,   std: [17], jobs: ['build', 'test']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'arm64', compiler: *llvm16,   std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, std: [17],     jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, std: [17], jobs: ['build']}
+    - {cuda: *cuda_curr,     os: 'ubuntu22.04', cpu: 'amd64', compiler: *oneapi,   std: [17],     jobs: ['build']}
--- a/ci/pretty_printing.sh
+++ b/ci/pretty_printing.sh
@@ -0,0 +1,105 @@
+# print_var_values: Print "NAME=value" for each variable name passed as an argument.
+# Unset/empty variables print as "(undefined)"; an empty argument prints usage, returns 1.
+function print_var_values() {
+    local var_name  # keep the loop variable out of the caller's scope
+    for var_name in "$@"; do
+        if [ -z "$var_name" ]; then
+            echo "Usage: print_var_values <variable_name1> <variable_name2> ..."
+            return 1
+        fi
+        # ${!var_name} is indirect expansion: the value of the variable named by $var_name.
+        echo "$var_name=${!var_name:-(undefined)}"
+    done
+}
+
+# begin_group: Open a named section of log output, colored with an ANSI color code.
+# Usage: begin_group "Group Name" [Color]
+#   Group Name: Title shown for the section.
+#   Color (optional): ANSI color code for the title. Default is blue (34).
+# Prints a "::group::" marker line when GITHUB_ACTIONS is non-empty,
+# otherwise a plain "=====" banner line.
+function begin_group() {
+    # Color code reference: https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124
+    local title="${1:-}"
+    local sgr="${2:-34}"
+    local banner="\e[${sgr}m================== ${title} ======================\e[0m"
+    if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+        banner="::group::\e[${sgr}m${title}\e[0m"
+    fi
+    echo -e "${banner}"
+}
+
+# end_group: Close a named section of log output and report its outcome.
+# Usage: end_group "Group Name" [Exit Status] [Duration]
+#   Group Name: A string specifying the name of the group.
+#   Exit Status (optional): Exit status of the command run in the group. Default is 0.
+#   Duration (optional): Elapsed seconds; only shown in the banner printed outside GitHub Actions.
+function end_group() {
+    local title="${1:-}"
+    local rc="${2:-0}"
+    local elapsed="${3:-}"
+    local red="31"
+    local blue="34"
+    local timing="${elapsed:+ - Duration: ${elapsed}s}"
+    if [ -z "${GITHUB_ACTIONS:-}" ]; then
+        if [ "$rc" -ne 0 ]; then
+            echo -e "\e[${red}m================== End ${title} - Failed${timing} ==================\e[0m"
+        else
+            echo -e "\e[${blue}m================== End ${title} - Success${timing} ==================\n\e[0m"
+        fi
+        return
+    fi
+    echo "::endgroup::"
+    if [ "$rc" -ne 0 ]; then
+        echo -e "::error::\e[${red}m ${title} - Failed (⬆️ click above for full log ⬆️)\e[0m"
+    fi
+}
+
+declare -A command_durations  # group name -> elapsed seconds, filled in by run_command
+
+# run_command: Run a command inside a named log group, time it, and return its exit status.
+# Usage: run_command "Group Name" command [arguments...]
+# errexit is suspended while the command runs, then restored only if the caller had it enabled.
+function run_command() {
+    local group_name="${1:-}"
+    shift
+    local status=0 start_time end_time duration restore_errexit=
+    [[ $- == *e* ]] && restore_errexit=1
+    begin_group "$group_name"
+    set +e
+    start_time=$(date +%s)
+    "$@"
+    status=$?
+    end_time=$(date +%s)
+    [[ -n "$restore_errexit" ]] && set -e  # never force errexit on a caller that runs without it
+    duration=$((end_time - start_time))
+    end_group "$group_name" "$status" "$duration"
+    command_durations["$group_name"]=$duration
+    return "$status"
+}
+
+# string_width: Print the length of "$1" as counted by awk (one number per input line).
+function string_width() {
+    printf '%s\n' "${1:-}" | awk '{print length}'  # printf, not echo: "-n"/"-e" must be measured as text
+}
+
+# print_time_summary: Print "<group> : <N> seconds" for every entry in command_durations,
+# with names padded to the longest group name, then reset command_durations.
+function print_time_summary() {
+    local name width widest=0
+
+    # First pass: width of the longest group name, used to pad the name column.
+    for name in "${!command_durations[@]}"; do
+        width=$(string_width "$name")
+        if [ "$width" -gt "$widest" ]; then
+            widest=$width
+        fi
+    done
+
+    echo "Time Summary:"
+    for name in "${!command_durations[@]}"; do
+        printf "%-${widest}s : %s seconds\n" "$name" "${command_durations[$name]}"
+    done
+
+    declare -gA command_durations=()  # start the next summary from an empty table
+}
--- a/ci/sccache_hit_rate.sh
+++ b/ci/sccache_hit_rate.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Ensure two arguments are provided
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 <before-file> <after-file>" >&2
+  exit 1
+fi
+
+# Print the contents of the before file
+echo "=== Contents of $1 ===" >&2
+cat $1 >&2
+echo "=== End of $1 ===" >&2
+
+# Print the contents of the after file
+echo "=== Contents of $2 ==="  >&2
+cat $2 >&2
+echo "=== End of $2 ===" >&2
+
+# Extract compile requests and cache hits from the before and after files
+requests_before=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$1")
+hits_before=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$1")
+requests_after=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$2")
+hits_after=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$2")
+
+# Calculate the differences to find out how many new requests and hits
+requests_diff=$((requests_after - requests_before))
+hits_diff=$((hits_after - hits_before))
+
+echo "New Compile Requests: $requests_diff" >&2
+echo "New Hits: $hits_diff" >&2
+
+# Calculate and print the hit rate
+if [ $requests_diff -eq 0 ]; then
+    echo "No new compile requests, hit rate is not applicable"
+else
+    hit_rate=$(awk -v hits=$hits_diff -v requests=$requests_diff 'BEGIN {printf "%.2f", hits/requests * 100}')
+    echo "sccache hit rate: $hit_rate%" >&2
+    echo "$hit_rate" 
+fi
--- a/ci/sccache_stats.sh
+++ b/ci/sccache_stats.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# This script prints the sccache hit rate between two calls to sccache --show-stats.
+# It should be sourced in your script before and after the operations you want to profile,
+# with the 'start' or 'end' argument respectively.
+
+mode=$1
+
+if [[ "$mode" != "start" && "$mode" != "end" ]]; then
+    echo "Invalid mode: $mode"
+    echo "Usage: $0 {start|end}"
+    exit 1
+fi
+
+# Check if sccache is available
+if ! command -v sccache &> /dev/null; then
+    echo "Notice: sccache is not available. Skipping..."
+    exit 0
+fi
+
+case $mode in
+  start)
+    export SCCACHE_START_HITS=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
+    export SCCACHE_START_MISSES=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
+    ;;
+  end)
+    if [[ -z ${SCCACHE_START_HITS+x} || -z ${SCCACHE_START_MISSES+x} ]]; then
+        echo "Error: start stats not collected. Did you call this script with 'start' before your operations?"
+        exit 1
+    fi
+
+    final_hits=$(sccache --show-stats | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
+    final_misses=$(sccache --show-stats | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
+    hits=$((final_hits - SCCACHE_START_HITS))
+    misses=$((final_misses - SCCACHE_START_MISSES))
+    total=$((hits + misses))
+
+    prefix=""
+    if [ ${GITHUB_ACTIONS:-false} = "true" ]; then
+      prefix="::notice::"
+    fi
+
+    if (( total > 0 )); then
+      hit_rate=$(awk -v hits="$hits" -v total="$total" 'BEGIN { printf "%.2f", (hits / total) * 100 }')
+      echo ${prefix}"sccache hits: $hits | misses: $misses | hit rate: $hit_rate%"
+    else
+      echo ${prefix}"sccache stats: N/A No new compilation requests"
+    fi
+    unset SCCACHE_START_HITS
+    unset SCCACHE_START_MISSES
+    ;;
+esac
--- a/ci/test_nvbench.sh
+++ b/ci/test_nvbench.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+source "$(dirname "$0")/build_common.sh"
+
+# Run NVBench tests with high parallelism. If any need to be
+# serialized, define the `RUN_SERIAL` CMake property on the
+# test.
+export CTEST_PARALLEL_LEVEL=${PARALLEL_LEVEL}
+
+print_environment_details
+
+./build_nvbench.sh "$@"
+
+PRESET="nvbench-cpp$CXX_STANDARD"
+
+test_preset "NVBench" ${PRESET}
+
+print_time_summary
--- a/ci/windows/build_common.psm1
+++ b/ci/windows/build_common.psm1
@@ -0,0 +1,205 @@
+
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("std")]
+    [ValidateNotNullOrEmpty()]
+    [ValidateSet(17)]
+    [int]$CXX_STANDARD = 17
+)
+# Module initialization: the statements below run when the module is imported.
+# We need the full path to cl because otherwise cmake will replace CMAKE_CXX_COMPILER with the full path
+# and keep CMAKE_CUDA_HOST_COMPILER at "cl" which breaks our cmake script
+$script:HOST_COMPILER  = (Get-Command "cl").source -replace '\\','/'
+$script:PARALLEL_LEVEL = (Get-WmiObject -class Win32_processor).NumberOfLogicalProcessors
+
+# Extract the CL version for export to build scripts:
+$script:CL_VERSION_STRING = & cl.exe /?
+if ($script:CL_VERSION_STRING -match "Version (\d+\.\d+)\.\d+") {
+    $CL_VERSION = [version]$matches[1]
+    Write-Host "Detected cl.exe version: $CL_VERSION"
+}
+
+if (-not $env:CCCL_BUILD_INFIX) {
+    $env:CCCL_BUILD_INFIX = ""
+}
+
+# Presets will be configured in this directory:
+$BUILD_DIR = "../build/$env:CCCL_BUILD_INFIX"
+
+If(!(test-path -PathType container "../build")) {
+    New-Item -ItemType Directory -Path "../build"
+}
+
+# Make sure the (possibly infixed) build directory exists; -Force makes this a no-op if it already does.
+New-Item -ItemType Directory -Path "$BUILD_DIR" -Force
+
+# Prepare environment for CMake. $HOST_COMPILER is already a plain path string, so it is assigned as-is:
+$env:CMAKE_BUILD_PARALLEL_LEVEL = $PARALLEL_LEVEL
+$env:CTEST_PARALLEL_LEVEL = 1
+$env:CUDAHOSTCXX = $HOST_COMPILER
+$env:CXX = $HOST_COMPILER
+
+Write-Host "========================================"
+Write-Host "Begin build"
+Write-Host "pwd=$pwd"
+Write-Host "BUILD_DIR=$BUILD_DIR"
+Write-Host "CXX_STANDARD=$CXX_STANDARD"
+Write-Host "CXX=$env:CXX"
+Write-Host "CUDACXX=$env:CUDACXX"
+Write-Host "CUDAHOSTCXX=$env:CUDAHOSTCXX"
+Write-Host "NVCC_VERSION=$NVCC_VERSION"  # NOTE(review): $NVCC_VERSION is not assigned in this module; confirm where it is set
+Write-Host "CMAKE_BUILD_PARALLEL_LEVEL=$env:CMAKE_BUILD_PARALLEL_LEVEL"
+Write-Host "CTEST_PARALLEL_LEVEL=$env:CTEST_PARALLEL_LEVEL"
+Write-Host "CCCL_BUILD_INFIX=$env:CCCL_BUILD_INFIX"
+Write-Host "Current commit is:"
+Write-Host "$(git log -1)"
+Write-Host "========================================"
+
+# configure_preset: Run `cmake --preset` from the parent directory; throws "<BUILD_NAME> (configure) Failed" on a non-zero exit code.
+function configure_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET,
+        [Parameter(Mandatory = $true)]
+        [AllowEmptyString()]
+        [string]$CMAKE_OPTIONS
+    )
+    $step = "$BUILD_NAME (configure)"
+    # CMake must be invoked in the same directory as the presets file:
+    pushd ".."
+    try {
+        cmake --preset $PRESET $CMAKE_OPTIONS --log-level VERBOSE
+        If ($LastExitCode -ne 0) {
+            throw "$step Failed"
+        }
+    }
+    finally {
+        # Always restore the caller's location, including when the throw above fires.
+        popd
+    }
+    Write-Host "$step complete."
+}
+
+# build_preset: Run `cmake --build --preset` from the parent directory, bracketed by sccache_stats; throws "<BUILD_NAME> (build) Failed" on a non-zero exit code.
+function build_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET
+    )
+    $step = "$BUILD_NAME (build)"
+    # CMake must be invoked in the same directory as the presets file:
+    pushd ".."
+    try {
+        sccache_stats('Start')
+
+        cmake --build --preset $PRESET -v
+        # Capture the exit code now: sccache_stats runs sccache, which overwrites $LastExitCode.
+        $test_result = $LastExitCode
+
+        sccache_stats('Stop')
+        echo "$step complete"
+        If ($test_result -ne 0) {
+            throw "$step Failed"
+        }
+    }
+    finally {
+        popd  # restore the caller's location even when the step throws
+    }
+}
+
+# test_preset: Run `ctest --preset` from the parent directory, bracketed by sccache_stats; throws "<BUILD_NAME> (test) Failed" on a non-zero exit code.
+function test_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET
+    )
+    $step = "$BUILD_NAME (test)"
+    # CTest must be invoked in the same directory as the presets file:
+    pushd ".."
+    try {
+        sccache_stats('Start')
+
+        ctest --preset $PRESET
+        # Capture the exit code now: sccache_stats runs sccache, which overwrites $LastExitCode.
+        $test_result = $LastExitCode
+
+        sccache_stats('Stop')
+        echo "$step complete"
+        If ($test_result -ne 0) {
+            throw "$step Failed"
+        }
+    }
+    finally {
+        popd  # restore the caller's location even when the step throws
+    }
+}
+
+function configure_and_build_preset {
+    Param(
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$BUILD_NAME,
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [string]$PRESET,
+        [Parameter(Mandatory = $true)]
+        [AllowEmptyString()]
+        [string]$CMAKE_OPTIONS
+    )
+    # Configure, then build; configure_preset throws on failure, which skips the build step.
+    configure_preset -BUILD_NAME $BUILD_NAME -PRESET $PRESET -CMAKE_OPTIONS $CMAKE_OPTIONS
+    build_preset -BUILD_NAME $BUILD_NAME -PRESET $PRESET
+}
+
+# sccache_stats: 'Start' snapshots counters from `sccache -s`; 'Stop' prints cpp/cuda hits, misses and hits as a percentage of new compile requests.
+function sccache_stats {
+    Param (
+        [Parameter(Mandatory = $true)]
+        [ValidateNotNullOrEmpty()]
+        [ValidateSet('Start','Stop')]
+        [string]$MODE
+    )
+    # Counters are picked by line position in the `sccache -s` output, keeping only the digits of each line.
+    $report = sccache -s
+    [int]$now_requests  = ($report[0] -replace '[^\d]+')
+    [int]$now_hits_cpp  = ($report[2] -replace '[^\d]+')
+    [int]$now_hits_cuda = ($report[3] -replace '[^\d]+')
+    [int]$now_miss_cpp  = ($report[5] -replace '[^\d]+')
+    [int]$now_miss_cuda = ($report[6] -replace '[^\d]+')
+    If ($MODE -eq 'Start') {
+        [int]$script:sccache_compile_requests = $now_requests
+        [int]$script:sccache_cache_hits_cpp   = $now_hits_cpp
+        [int]$script:sccache_cache_hits_cuda  = $now_hits_cuda
+        [int]$script:sccache_cache_miss_cpp   = $now_miss_cpp
+        [int]$script:sccache_cache_miss_cuda  = $now_miss_cuda
+        return
+    }
+    [int]$new_requests  = $now_requests  - $script:sccache_compile_requests
+    [int]$new_hits_cpp  = $now_hits_cpp  - $script:sccache_cache_hits_cpp
+    [int]$new_hits_cuda = $now_hits_cuda - $script:sccache_cache_hits_cuda
+    [int]$new_miss_cpp  = $now_miss_cpp  - $script:sccache_cache_miss_cpp
+    [int]$new_miss_cuda = $now_miss_cuda - $script:sccache_cache_miss_cuda
+    If ($new_requests -le 0) {
+        echo "sccache stats: N/A No new compilation requests"
+        return
+    }
+    [int]$hit_rate_cpp  = $new_hits_cpp  / $new_requests * 100
+    [int]$hit_rate_cuda = $new_hits_cuda / $new_requests * 100
+    echo "sccache hits cpp:  $new_hits_cpp  `t| misses: $new_miss_cpp  `t| hit rate: $hit_rate_cpp%"
+    echo "sccache hits cuda: $new_hits_cuda `t| misses: $new_miss_cuda `t| hit rate: $hit_rate_cuda%"
+}
+
+Export-ModuleMember -Function configure_preset, build_preset, test_preset, configure_and_build_preset, sccache_stats # functions made available to importing scripts
+Export-ModuleMember -Variable BUILD_DIR, CL_VERSION # CL_VERSION is only assigned when the cl.exe version string was matched during initialization
--- a/ci/windows/build_nvbench.ps1
+++ b/ci/windows/build_nvbench.ps1
@@ -0,0 +1,26 @@
+
+Param(
+    [Parameter(Mandatory = $true)]
+    [Alias("std")]
+    [ValidateNotNullOrEmpty()]
+    [ValidateSet(17)]
+    [int]$CXX_STANDARD = 17
+)
+
+$CURRENT_PATH = Split-Path $pwd -leaf
+If($CURRENT_PATH -ne "ci") {
+    Write-Host "Moving to ci folder"
+    pushd "$PSScriptRoot/.."
+}
+try {
+    # Drop any previously imported copy so the import below re-runs the module; silent when none is loaded.
+    Remove-Module -Name build_common -ErrorAction SilentlyContinue
+    Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
+
+    $PRESET = "nvbench-cpp$CXX_STANDARD"
+    $CMAKE_OPTIONS = ""
+    configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
+}
+finally {
+    If($CURRENT_PATH -ne "ci") { popd }  # undo the pushd above even when the build throws
+}