Merge remote-tracking branch 'origin/main' into fea/axes_iteration_space

This commit is contained in:
Allison Piper
2025-05-01 10:42:35 -04:00
234 changed files with 10357 additions and 12278 deletions

View File

@@ -36,9 +36,33 @@ BreakBeforeBinaryOperators: None
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 80
ColumnLimit: 100
CompactNamespaces: false
ContinuationIndentWidth: 2
IncludeBlocks: Regroup
IncludeCategories:
- Regex: '^<nvbench'
Priority: 1
- Regex: '^<cub'
Priority: 2
- Regex: '^<thrust'
Priority: 3
- Regex: '^<cuda/'
Priority: 4
- Regex: '^<cuda'
Priority: 5
- Regex: '^<nvml'
Priority: 6
- Regex: '^<cupti'
Priority: 7
- Regex: '^<nvperf'
Priority: 8
- Regex: '^<nlohmann'
Priority: 9
- Regex: '^<fmt'
Priority: 10
- Regex: '^<[a-z_]*>$'
Priority: 11
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 2
@@ -55,7 +79,7 @@ PenaltyExcessCharacter: 100
PenaltyReturnTypeOnItsOwnLine: 90
PointerAlignment: Right
ReflowComments: true
SortIncludes: true
SortIncludes: CaseInsensitive
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true

62
.clangd Normal file
View File

@@ -0,0 +1,62 @@
# https://clangd.llvm.org/config
# Apply a config conditionally to all C files
If:
PathMatch: .*\.(c|h)$
---
# Apply a config conditionally to all C++ files
If:
PathMatch: .*\.(c|h)pp
---
# Apply a config conditionally to all CUDA files
If:
PathMatch: .*\.cuh?
CompileFlags:
Add:
# Allow variadic CUDA functions
- "-Xclang=-fcuda-allow-variadic-functions"
---
# Tweak the clangd parse settings for all files
CompileFlags:
Compiler: clang++
CompilationDatabase: .
Add:
- -x
- cuda
# report all errors
- "-ferror-limit=0"
- "-ftemplate-backtrace-limit=0"
- "-std=c++17"
Remove:
# strip CUDA fatbin args
- "-Xfatbin*"
- "-Xcompiler*"
- "-Xcudafe*"
- "-rdc=*"
- "-gpu=*"
- "--diag_suppress*"
# strip CUDA arch flags
- "-gencode*"
- "--generate-code*"
# strip gcc's -fcoroutines
- -fcoroutines
# strip CUDA flags unknown to clang
- "-ccbin*"
- "--compiler-options*"
- "--expt-extended-lambda"
- "--expt-relaxed-constexpr"
- "-forward-unknown-to-host-compiler"
- "-Werror=cross-execution-space-call"
Diagnostics:
Suppress:
- "variadic_device_fn"
- "attributes_not_allowed"
# The NVHPC version of _NVCXX_EXPAND_PACK macro triggers this clang error.
# Temporarily suppressing it, but should probably fix
- "template_param_shadow"

198
.devcontainer/README.md Normal file
View File

@@ -0,0 +1,198 @@
> **Note**
> The instructions in this README are specific to Linux development environments. Instructions for Windows are coming soon!
[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)
# CCCL Dev Containers
CCCL uses [Development Containers](https://containers.dev/) to provide consistent and convenient development environments for both local development and for CI. This guide covers setup in [Visual Studio Code](#quickstart-vscode-recommended) and [Docker](#quickstart-docker-manual-approach). The guide also provides additional instructions in case you want to use WSL.
## Table of Contents
1. [Quickstart: VSCode (Recommended)](#vscode)
2. [Quickstart: Docker (Manual Approach)](#docker)
3. [Quickstart: Using WSL](#wsl)
## Quickstart: VSCode (Recommended) <a name="vscode"></a>
### Prerequisites
- [Visual Studio Code](https://code.visualstudio.com/)
- [Remote - Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)
- [Docker](https://docs.docker.com/engine/install/) - This is only for completeness because it should already be implicitly installed by the Dev Containers extension
### Steps
1. Clone the Repository
```bash
git clone https://github.com/nvidia/cccl.git
```
2. Open the cloned directory in VSCode
3. Launch a Dev Container by clicking the prompt suggesting to "Reopen in Container"
![Shows "Reopen in Container" prompt when opening the cccl directory in VScode.](./img/reopen_in_container.png)
- Alternatively, use the Command Palette to start a Dev Container. Press `Ctrl+Shift+P` to open the Command Palette. Type "Remote-Containers: Reopen in Container" and select it.
![Shows "Reopen in Container" in command palette.](./img/open_in_container_manual.png)
4. Select an environment with the desired CTK and host compiler from the list:
![Shows list of available container environments.](./img/container_list.png)
5. VSCode will initialize the selected Dev Container. This can take a few minutes the first time.
6. Once initialized, the local `cccl/` directory is mirrored into the container to ensure any changes are persistent.
7. Done! See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.
### (Optional) Authenticate with GitHub for `sccache`
After starting the container, there will be a prompt to authenticate with GitHub. This grants access to a [`sccache`](https://github.com/mozilla/sccache) server shared with CI and greatly accelerates local build times. This is currently limited to NVIDIA employees belonging to the `NVIDIA` or `rapidsai` GitHub organizations.
Without authentication to the remote server, `sccache` will still accelerate local builds by using a filesystem cache.
Follow the instructions in the prompt as below and enter the one-time code at https://github.com/login/device
![Shows authentication with GitHub to access sccache bucket.](./img/github_auth.png)
To manually trigger this authentication, execute the `devcontainer-utils-vault-s3-init` script within the container.
For more information about the sccache configuration and authentication, see the documentation at [`rapidsai/devcontainers`](https://github.com/rapidsai/devcontainers/blob/branch-23.10/USAGE.md#build-caching-with-sccache).
## Quickstart: Docker (Manual Approach) <a name="docker"></a>
### Prerequisites
- [Docker](https://docs.docker.com/desktop/install/linux-install/)
### Steps
1. Clone the repository and use the [`launch.sh`](./launch.sh) script to launch the default container environment
```bash
git clone https://github.com/nvidia/cccl.git
cd cccl
./.devcontainer/launch.sh --docker
```
This script starts an interactive shell as the `coder` user inside the container with the local `cccl/` directory mirrored into `/home/coder/cccl`.
For specific environments, use the `--cuda` and `--host` options:
```bash
./.devcontainer/launch.sh --docker --cuda 12.2 --host gcc10
```
See `./.devcontainer/launch.sh --help` for more information.
2. Done. See the [contributing guide](../CONTRIBUTING.md#building-and-testing) for instructions on how to build and run tests.
## Available Environments
CCCL provides environments for both the oldest and newest supported CUDA versions with all compatible host compilers.
Look in the [`.devcontainer/`](.) directory to see the available configurations. The top-level [`devcontainer.json`](./devcontainer.json) serves as the default environment. All `devcontainer.json` files in the `cuda<CTK_VERSION>-<HOST-COMPILER>` sub-directories are variations on this top-level file, with different base images for the different CUDA and host compiler versions.
## VSCode Customization
By default, CCCL's Dev Containers come with certain VSCode settings and extensions configured by default, as can be seen in the [`devcontainer.json`](./devcontainer.json) file. This can be further customized by users without needing to modify the `devcontainer.json` file directly.
For extensions, the [`dev.containers.defaultExtensions` setting](https://code.visualstudio.com/docs/devcontainers/containers#_always-installed-extensions) allows listing extensions that will always be installed.
For more general customizations, VSCode allows using a dotfile repository. See the [VSCode documentation](https://code.visualstudio.com/docs/devcontainers/containers#_personalizing-with-dotfile-repositories) for more information.
## GitHub Codespaces
[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json)
One of the benefits of Dev Containers is that they integrate natively with [GitHub Codespaces](https://github.com/features/codespaces). Codespaces provide a VSCode development environment right in your browser running on a machine in the cloud. This provides a truly one-click, turnkey development environment where you can develop, build, and test with no other setup required.
Click the badge above or [click here](https://codespaces.new/NVIDIA/cccl?quickstart=1&devcontainer_path=.devcontainer%2Fdevcontainer.json) to get started with CCCL's Dev Containers on Codespaces. This will start the default Dev Container environment. [Click here](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=296416761&skip_quickstart=true) to start a Codespace with a particular environment and hardware configuration as shown:
![Shows configuring a Codespace with a custom environment](../docs/images/codespaces.png)
## For Maintainers: The `make_devcontainers.sh` Script
### Overview
[`make_devcontainers.sh`](./make_devcontainers.sh) generates devcontainer configurations for the unique combinations of CUDA Toolkit (CTK) versions and host compilers in [`ci/matrix.yaml`](../ci/matrix.yaml).
### How It Works:
1. Parses the matrix from `ci/matrix.yaml`.
2. Uses the top-level [`.devcontainer/devcontainer.json`](./devcontainer.json) as a template. For each unique combination of CTK version and host compiler, it generates a corresponding `devcontainer.json` configuration, adjusting only the base Docker image to match the desired environment.
3. Places the generated configurations in the `.devcontainer` directory, organizing them into subdirectories following the naming convention `cuda<CTK_VERSION>-<COMPILER_VERSION>`.
For more information, see the `.devcontainer/make_devcontainers.sh --help` message.
**Note**: When adding or updating supported environments, modify `matrix.yaml` and then rerun this script to synchronize the `devcontainer` configurations.
## Quickstart: Using WSL <a name="wsl"></a>
> [!NOTE]
> _Make sure you have the Nvidia driver installed on your Windows host before moving further_. Type in `nvidia-smi` for verification.
### Install WSL on your Windows host
> [!WARNING]
> Disclaimer: This guide was developed for WSL 2 on Windows 11.
1. Launch a Windows terminal (_e.g. Powershell_) as an administrator.
2. Install WSL 2 by running:
```bash
wsl --install
```
This installs the Ubuntu distribution by default.
3. Restart your computer and run `wsl -l -v` on a Windows terminal to verify installation.
<h3 id="prereqs"> Install prerequisites and VS Code extensions</h3>
4. Launch your WSL/Ubuntu terminal by running `wsl` in Powershell.
5. Install the [WSL extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-wsl) on VS Code.
- `Ctrl + Shift + P` and select `WSL: Connect to WSL` (it will prompt you to install the WSL extension).
- Make sure you are connected to WSL with VS Code by checking the bottom left corner of the VS Code window (should indicate "WSL: Ubuntu" in our case).
6. Install the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) on VS Code.
- In a vanilla system you should be prompted to install `Docker` at this point, accept it. If it hangs you might have to restart VS Code after that.
7. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). **Make sure you install the WSL 2 version and not the native Linux one**. This builds on top of Docker so make sure you have Docker properly installed (run `docker --version`).
8. Open `/etc/docker/daemon.json` from within your WSL system (if the file does not exist, create it) and add the following:
```json
{
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
}
}
```
then run `sudo systemctl restart docker.service`.
---
### Build CCCL in WSL using Dev Containers
9. Still on your WSL terminal run `git clone https://github.com/NVIDIA/cccl.git`
10. Open the CCCL cloned repo in VS Code ( `Ctrl + Shift + P `, select `File: Open Folder...` and select the path where your CCCL clone is located).
11. If prompted, choose `Reopen in Container`.
- If you are not prompted just type `Ctrl + Shift + P` and `Dev Containers: Open Folder in Container ...`.
12. Verify that Dev Container was configured properly by running `nvidia-smi` in your Dev Container terminal. For a proper configuration it is important for the steps in [Install prerequisites and VS Code extensions](#prereqs) to be followed in a precise order.
From that point on, the guide aligns with our [existing Dev Containers native Linux guide](https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md) with just one minor potential alteration:
13. If WSL was launched without the X-server enabled, when asked to "authenticate Git with your Github credentials", if you answer **Yes**, the browser might not open automatically, with the following error message.
> Failed opening a web browser at https://github.com/login/device
exec: "xdg-open,x-www-browser,www-browser,wslview": executable file not found in $PATH
Please try entering the URL in your browser manually
In that case type in the address manually in your web browser https://github.com/login/device and fill in the one-time code.

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc10-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc10",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "10",
"CCCL_BUILD_INFIX": "cuda12.0-gcc10"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc10"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc11-cuda12.0-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc11",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "11",
"CCCL_BUILD_INFIX": "cuda12.0-gcc11"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc11"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc12-cuda12.0-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc12",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "12",
"CCCL_BUILD_INFIX": "cuda12.0-gcc12"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc12"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc7-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc7",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "7",
"CCCL_BUILD_INFIX": "cuda12.0-gcc7"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc7"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc8-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc8",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "8",
"CCCL_BUILD_INFIX": "cuda12.0-gcc8"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc8"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc9-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-gcc9",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "9",
"CCCL_BUILD_INFIX": "cuda12.0-gcc9"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-gcc9"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm14-cuda12.0-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.0-llvm14",
"CCCL_CUDA_VERSION": "12.0",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "14",
"CCCL_BUILD_INFIX": "cuda12.0-llvm14"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.0-llvm14"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc10-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc10",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "10",
"CCCL_BUILD_INFIX": "cuda12.8-gcc10"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc10"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc11-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc11",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "11",
"CCCL_BUILD_INFIX": "cuda12.8-gcc11"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc11"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc12-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc12",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "12",
"CCCL_BUILD_INFIX": "cuda12.8-gcc12"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc12"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc13-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc13",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "13",
"CCCL_BUILD_INFIX": "cuda12.8-gcc13"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc13"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc14-cuda12.8-ubuntu24.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc14",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "14",
"CCCL_BUILD_INFIX": "cuda12.8-gcc14"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc14"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc7-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc7",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "7",
"CCCL_BUILD_INFIX": "cuda12.8-gcc7"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc7"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc8-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc8",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "8",
"CCCL_BUILD_INFIX": "cuda12.8-gcc8"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc8"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc9-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc9",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "9",
"CCCL_BUILD_INFIX": "cuda12.8-gcc9"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc9"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm14-cuda12.8-ubuntu20.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm14",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "14",
"CCCL_BUILD_INFIX": "cuda12.8-llvm14"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm14"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm15-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm15",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "15",
"CCCL_BUILD_INFIX": "cuda12.8-llvm15"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm15"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm16-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm16",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "16",
"CCCL_BUILD_INFIX": "cuda12.8-llvm16"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm16"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm17-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm17",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "17",
"CCCL_BUILD_INFIX": "cuda12.8-llvm17"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm17"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm18-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm18",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "18",
"CCCL_BUILD_INFIX": "cuda12.8-llvm18"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm18"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-llvm19-cuda12.8-ubuntu22.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-llvm19",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "llvm",
"CCCL_HOST_COMPILER_VERSION": "19",
"CCCL_BUILD_INFIX": "cuda12.8-llvm19"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-llvm19"
}

View File

@@ -0,0 +1,46 @@
{
"shutdownAction": "stopContainer",
"image": "rapidsai/devcontainers:25.06-cpp-gcc14-cuda12.8-ubuntu24.04",
"hostRequirements": {
"gpu": "optional"
},
"initializeCommand": [
"/bin/bash",
"-c",
"mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
],
"containerEnv": {
"SCCACHE_REGION": "us-east-2",
"SCCACHE_BUCKET": "rapids-sccache-devs",
"AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
"HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
"DEVCONTAINER_NAME": "cuda12.8-gcc14",
"CCCL_CUDA_VERSION": "12.8",
"CCCL_HOST_COMPILER": "gcc",
"CCCL_HOST_COMPILER_VERSION": "14",
"CCCL_BUILD_INFIX": "cuda12.8-gcc14"
},
"workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"xaver.clang-format"
],
"settings": {
"editor.defaultFormatter": "xaver.clang-format",
"clang-format.executable": "/usr/local/bin/clang-format",
"clangd.arguments": [
"--compile-commands-dir=${workspaceFolder}"
]
}
}
},
"name": "cuda12.8-gcc14"
}

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Maybe change the UID/GID of the container's non-root user to match the host's UID/GID
#
# Inputs (environment):
#   REMOTE_USER - container account to remap (defaults to "coder")
#   NEW_UID / NEW_GID - desired host UID/GID (provided by the launcher)
# In every case control is finally handed to .devcontainer/nvbench-entrypoint.sh
# with the original arguments via exec.
: "${REMOTE_USER:="coder"}";
: "${OLD_UID:=}";
: "${OLD_GID:=}";
: "${NEW_UID:=}";
: "${NEW_GID:=}";
# Scrape the remote user's current UID, GID and home directory out of /etc/passwd.
eval "$(sed -n "s/${REMOTE_USER}:[^:]*:\([^:]*\):\([^:]*\):[^:]*:\([^:]*\).*/OLD_UID=\1;OLD_GID=\2;HOME_FOLDER=\3/p" /etc/passwd)";
# Find any existing account/group that already owns the requested UID/GID.
eval "$(sed -n "s/\([^:]*\):[^:]*:${NEW_UID}:.*/EXISTING_USER=\1/p" /etc/passwd)";
eval "$(sed -n "s/\([^:]*\):[^:]*:${NEW_GID}:.*/EXISTING_GROUP=\1/p" /etc/group)";
if [ -z "$OLD_UID" ]; then
  # Nothing to remap: the user does not exist in this image.
  echo "Remote user not found in /etc/passwd ($REMOTE_USER).";
  exec "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
elif [ "$OLD_UID" = "$NEW_UID" ] && [ "$OLD_GID" = "$NEW_GID" ]; then
  # Already matching; no work to do.
  echo "UIDs and GIDs are the same ($NEW_UID:$NEW_GID).";
  exec "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
elif [ "$OLD_UID" != "$NEW_UID" ] && [ -n "$EXISTING_USER" ]; then
  # Refuse to take over a UID that a different account already owns.
  echo "User with UID exists ($EXISTING_USER=$NEW_UID).";
  exec "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
else
  if [ "$OLD_GID" != "$NEW_GID" ] && [ -n "$EXISTING_GROUP" ]; then
    # Requested GID is taken by another group; keep the user's current group.
    echo "Group with GID exists ($EXISTING_GROUP=$NEW_GID).";
    NEW_GID="$OLD_GID";
  fi
  echo "Updating UID:GID from $OLD_UID:$OLD_GID to $NEW_UID:$NEW_GID.";
  # Rewrite the UID:GID fields of the user's /etc/passwd entry in place.
  sed -i -e "s/\(${REMOTE_USER}:[^:]*:\)[^:]*:[^:]*/\1${NEW_UID}:${NEW_GID}/" /etc/passwd;
  if [ "$OLD_GID" != "$NEW_GID" ]; then
    # Also retarget the group's GID in /etc/group.
    sed -i -e "s/\([^:]*:[^:]*:\)${OLD_GID}:/\1${NEW_GID}:/" /etc/group;
  fi
  # Fast parallel `chown -R`
  find "$HOME_FOLDER/" -not -user "$REMOTE_USER" -print0 \
    | xargs -0 -r -n1 -P"$(nproc --all)" chown "$NEW_UID:$NEW_GID"
  # Run the container command as $REMOTE_USER, preserving the container startup environment.
  #
  # We cannot use `su -w` because that's not supported by the `su` in Ubuntu18.04, so we reset the following
  # environment variables to the expected values, then pass through everything else from the startup environment.
  export HOME="$HOME_FOLDER";
  export XDG_CACHE_HOME="$HOME_FOLDER/.cache";
  export XDG_CONFIG_HOME="$HOME_FOLDER/.config";
  export XDG_STATE_HOME="$HOME_FOLDER/.local/state";
  export PYTHONHISTFILE="$HOME_FOLDER/.local/state/.python_history";
  exec su -p "$REMOTE_USER" -- "$(pwd)/.devcontainer/nvbench-entrypoint.sh" "$@";
fi

Binary file not shown.

After

Width:  |  Height:  |  Size: 156 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

306
.devcontainer/launch.sh Executable file
View File

@@ -0,0 +1,306 @@
#!/usr/bin/env bash
set -euo pipefail
# Ensure the script is being executed in the nvbench/ root
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/..";
# Print CLI usage for this launcher to stdout ($0 is the invoked script path).
print_help() {
  echo "Usage: $0 [-c|--cuda <CUDA version>] [-H|--host <Host compiler>] [-d|--docker]"
  echo "Launch a development container. If no CUDA version or Host compiler are specified,"
  echo "the top-level devcontainer in .devcontainer/devcontainer.json will be used."
  echo ""
  echo "Options:"
  echo " -c, --cuda Specify the CUDA version. E.g., 12.2"
  echo " -H, --host Specify the host compiler. E.g., gcc12"
  echo " -d, --docker Launch the development environment in Docker directly without using VSCode."
  echo " --gpus gpu-request GPU devices to add to the container ('all' to pass all GPUs)."
  echo " -e, --env list Set additional container environment variables."
  echo " -v, --volume list Bind mount a volume."
  echo " -h, --help Display this help message and exit."
}
# Assign variable one scope above the caller
# Usage: local "$1" && _upvar $1 "value(s)"
# Param: $1 Variable name to assign value to
# Param: $* Value(s) to assign. If multiple values, an array is
# assigned, otherwise a single value is assigned.
# See: http://fvue.nl/wiki/Bash:_Passing_variables_by_reference
_upvar() {
  # `unset -v` removes the current scope's copy of the variable so the eval
  # below assigns in the enclosing (caller's caller) scope — the standard
  # bash "upvar" trick; see the fvue.nl reference above.
  if unset -v "$1"; then
    if (( $# == 2 )); then
      # Single value: scalar assignment.
      eval $1=\"\$2\";
    else
      # Multiple values: assign as an array.
      eval $1=\(\"\${@:2}\"\);
    fi;
  fi
}
# Parse the launcher's command-line options into the globals
# cuda_version, host_compiler, gpu_request, docker_mode, env_vars, volumes.
# The LAST argument is the NAME of an array variable; any positional
# (non-option) arguments are returned to the caller through it via _upvar.
parse_options() {
  local -;
  set -euo pipefail;

  # Read the name of the variable in which to return unparsed arguments
  local UNPARSED="${!#}";
  # Splice the unparsed arguments variable name from the arguments list
  set -- "${@:1:$#-1}";

  local OPTIONS=c:e:H:dhv
  local LONG_OPTIONS=cuda:,env:,host:,gpus:,volume:,docker,help

  # Declare and assign separately: `local VAR=$(cmd)` masks cmd's exit status
  # with local's (always 0), so getopt parse failures were previously never
  # detected by the `$?` check that followed (shellcheck SC2155/SC2181).
  local PARSED_OPTIONS
  if ! PARSED_OPTIONS=$(getopt -n "$0" -o "${OPTIONS}" --long "${LONG_OPTIONS}" -- "$@"); then
    # getopt has already printed a diagnostic naming the offending option.
    exit 1
  fi

  eval set -- "${PARSED_OPTIONS}"

  while true; do
    case "$1" in
      -c|--cuda)
        cuda_version="$2"
        shift 2
        ;;
      -e|--env)
        # Keep the flag itself ("-e"/"--env") — the pair is later forwarded
        # verbatim to `docker run`, which accepts either spelling.
        env_vars+=("$1" "$2")
        shift 2
        ;;
      -H|--host)
        host_compiler="$2"
        shift 2
        ;;
      --gpus)
        gpu_request="$2"
        shift 2
        ;;
      -d|--docker)
        docker_mode=true
        shift
        ;;
      -h|--help)
        print_help
        exit 0
        ;;
      -v|--volume)
        # Keep the flag itself ("-v"/"--volume") — forwarded to `docker run`.
        volumes+=("$1" "$2")
        shift 2
        ;;
      --)
        shift
        # Everything after `--` is returned through the caller's variable.
        _upvar "${UNPARSED}" "${@}"
        break
        ;;
      *)
        echo "Invalid option: $1"
        print_help
        exit 1
        ;;
    esac
  done
}
# shellcheck disable=SC2155
launch_docker() {
  local -;
  set -euo pipefail
  # Replace devcontainer.json "${...}" substitution variables on stdin with
  # concrete values for this checkout.
  inline_vars() {
    cat - \
      `# inline local workspace folder` \
      | sed "s@\${localWorkspaceFolder}@$(pwd)@g" \
      `# inline local workspace folder basename` \
      | sed "s@\${localWorkspaceFolderBasename}@$(basename "$(pwd)")@g" \
      `# inline container workspace folder` \
      | sed "s@\${containerWorkspaceFolder}@${WORKSPACE_FOLDER:-}@g" \
      `# inline container workspace folder basename` \
      | sed "s@\${containerWorkspaceFolderBasename}@$(basename "${WORKSPACE_FOLDER:-}")@g" \
      `# translate local envvars to shell syntax` \
      | sed -r 's/\$\{localEnv:([^\:]*):?(.*)\}/${\1:-\2}/g'
  }
  # Turn a list of quoted JSON keys into a python subscript chain,
  # e.g. '"a"' '"b"' -> ["a"]["b"]
  args_to_path() {
    local -a keys=("${@}")
    keys=("${keys[@]/#/[}")
    keys=("${keys[@]/%/]}")
    echo "$(IFS=; echo "${keys[*]}")"
  }
  # Print the scalar at the given key path of the JSON document on stdin.
  json_string() {
    python3 -c "import json,sys; print(json.load(sys.stdin)$(args_to_path "${@}"))" 2>/dev/null | inline_vars
  }
  # Print each element of the JSON array at the given key path, quoted, one per line.
  json_array() {
    python3 -c "import json,sys; [print(f'\"{x}\"') for x in json.load(sys.stdin)$(args_to_path "${@}")]" 2>/dev/null | inline_vars
  }
  # Print each key="value" pair of the JSON object at the given key path, one per line.
  json_map() {
    python3 -c "import json,sys; [print(f'{k}=\"{v}\"') for k,v in json.load(sys.stdin)$(args_to_path "${@}").items()]" 2>/dev/null | inline_vars
  }
  # Read the "devcontainer.metadata" label baked into the Docker image.
  devcontainer_metadata_json() {
    docker inspect --type image --format '{{json .Config.Labels}}' "$DOCKER_IMAGE" \
      | json_string '"devcontainer.metadata"'
  }
  ###
  # Read relevant values from devcontainer.json
  ###
  # NOTE(review): relies on the global `path` set by main() to locate the config.
  local devcontainer_json="${path}/devcontainer.json";
  # Read image
  local DOCKER_IMAGE="$(json_string '"image"' < "${devcontainer_json}")"
  # Always pull the latest copy of the image
  docker pull "$DOCKER_IMAGE"
  # Read workspaceFolder
  local WORKSPACE_FOLDER="$(json_string '"workspaceFolder"' < "${devcontainer_json}")"
  # Read remoteUser
  local REMOTE_USER="$(json_string '"remoteUser"' < "${devcontainer_json}")"
  # If remoteUser isn't in our devcontainer.json, read it from the image's "devcontainer.metadata" label
  if test -z "${REMOTE_USER:-}"; then
    REMOTE_USER="$(devcontainer_metadata_json | json_string "-1" '"remoteUser"')"
  fi
  # Read runArgs
  local -a RUN_ARGS="($(json_array '"runArgs"' < "${devcontainer_json}"))"
  # Read initializeCommand
  local -a INITIALIZE_COMMAND="($(json_array '"initializeCommand"' < "${devcontainer_json}"))"
  # Read containerEnv, rewriting each KEY=VAL pair into `--env KEY=VAL`.
  local -a ENV_VARS="($(json_map '"containerEnv"' < "${devcontainer_json}" | sed -r 's/(.*)=(.*)/--env \1=\2/'))"
  # Read mounts: both the "mounts" array and the single "workspaceMount",
  # each prefixed with `--mount` for docker run.
  local -a MOUNTS="($(
    tee < "${devcontainer_json}" \
      1>/dev/null \
      >(json_array '"mounts"') \
      >(json_string '"workspaceMount"') \
    | xargs -r -I% echo --mount '%'
  ))"
  ###
  # Update run arguments and container environment variables
  ###
  # Only pass `-it` if the shell is a tty
  if ! ${CI:-'false'} && tty >/dev/null 2>&1 && (exec </dev/tty); then
    RUN_ARGS+=("-it")
  fi
  # Ensure --rm and --init are present without duplicating them.
  for flag in rm init; do
    if [[ " ${RUN_ARGS[*]} " != *" --${flag} "* ]]; then
      RUN_ARGS+=("--${flag}")
    fi
  done
  # Prefer the user-provided --gpus argument
  if test -n "${gpu_request:-}"; then
    RUN_ARGS+=(--gpus "${gpu_request}")
  else
    # Otherwise read and infer from hostRequirements.gpu
    local GPU_REQUEST="$(json_string '"hostRequirements"' '"gpu"' < "${devcontainer_json}")"
    if test "${GPU_REQUEST:-false}" = true; then
      RUN_ARGS+=(--gpus all)
    elif test "${GPU_REQUEST:-false}" = optional && \
         command -v nvidia-container-runtime >/dev/null 2>&1; then
      RUN_ARGS+=(--gpus all)
    fi
  fi
  RUN_ARGS+=(--workdir "${WORKSPACE_FOLDER:-/home/coder/nvbench}")
  # Start as root and let docker-entrypoint.sh remap the container user's
  # UID/GID to the host user's before handing off to the real entrypoint.
  if test -n "${REMOTE_USER:-}"; then
    ENV_VARS+=(--env NEW_UID="$(id -u)")
    ENV_VARS+=(--env NEW_GID="$(id -g)")
    ENV_VARS+=(--env REMOTE_USER="$REMOTE_USER")
    RUN_ARGS+=(-u root:root)
    RUN_ARGS+=(--entrypoint "${WORKSPACE_FOLDER:-/home/coder/nvbench}/.devcontainer/docker-entrypoint.sh")
  fi
  # Forward the host's SSH agent socket into the container.
  if test -n "${SSH_AUTH_SOCK:-}"; then
    ENV_VARS+=(--env "SSH_AUTH_SOCK=/tmp/ssh-auth-sock")
    MOUNTS+=(--mount "source=${SSH_AUTH_SOCK},target=/tmp/ssh-auth-sock,type=bind")
  fi
  # Append user-provided volumes
  if test -v volumes && test ${#volumes[@]} -gt 0; then
    MOUNTS+=("${volumes[@]}")
  fi
  # Append user-provided envvars
  if test -v env_vars && test ${#env_vars[@]} -gt 0; then
    ENV_VARS+=("${env_vars[@]}")
  fi
  # Run the initialize command before starting the container
  if test "${#INITIALIZE_COMMAND[@]}" -gt 0; then
    eval "${INITIALIZE_COMMAND[*]@Q}"
  fi
  # Replace this process with the container run.
  exec docker run \
    "${RUN_ARGS[@]}" \
    "${ENV_VARS[@]}" \
    "${MOUNTS[@]}" \
    "${DOCKER_IMAGE}" \
    "$@"
}
launch_vscode() {
  local -;
  set -euo pipefail;
  # Since Visual Studio Code allows only one instance per `devcontainer.json`,
  # this code prepares a unique temporary directory structure for each launch of a devcontainer.
  # By doing so, it ensures that multiple instances of the same environment can be run
  # simultaneously. The script replicates the `devcontainer.json` from the desired CUDA
  # and compiler environment into this temporary directory, adjusting paths to ensure the
  # correct workspace is loaded. A special URL is then generated to instruct VSCode to
  # launch the development container using this temporary configuration.
  #
  # NOTE(review): reads the global `path` set by main() to locate the source
  # devcontainer.json.
  local workspace="$(basename "$(pwd)")"
  local tmpdir="$(mktemp -d)/${workspace}"
  mkdir -p "${tmpdir}"
  mkdir -p "${tmpdir}/.devcontainer"
  cp -arL "${path}/devcontainer.json" "${tmpdir}/.devcontainer"
  # Inline this checkout's absolute path in place of ${localWorkspaceFolder}.
  sed -i "s@\\${localWorkspaceFolder}@$(pwd)@g" "${tmpdir}/.devcontainer/devcontainer.json"
  local path="${tmpdir}"
  # Hex-encode the temp dir for VSCode's dev-container+<hex> remote URL scheme.
  local hash="$(echo -n "${path}" | xxd -pu - | tr -d '[:space:]')"
  local url="vscode://vscode-remote/dev-container+${hash}/home/coder/nvbench"
  # Pick whichever URL-opener this platform provides.
  local launch=""
  if type open >/dev/null 2>&1; then
    launch="open"
  elif type xdg-open >/dev/null 2>&1; then
    launch="xdg-open"
  fi
  if [ -n "${launch}" ]; then
    echo "Launching VSCode Dev Container URL: ${url}"
    code --new-window "${tmpdir}"
    exec "${launch}" "${url}" >/dev/null 2>&1
  fi
}
# Entry point: parse the CLI flags, resolve which devcontainer.json to use,
# then hand off to the Docker or the VSCode launcher.
main() {
  local -a unparsed;
  parse_options "$@" unparsed;
  set -- "${unparsed[@]}";
  # Default to the top-level devcontainer; switch to a cuda<X>-<compiler>
  # subdirectory when either selector was given on the command line.
  path=".devcontainer"
  if [[ -n ${cuda_version:-} || -n ${host_compiler:-} ]]; then
    path=".devcontainer/cuda${cuda_version}-${host_compiler}"
    if [[ ! -f "${path}/devcontainer.json" ]]; then
      echo "Unknown CUDA [${cuda_version}] compiler [${host_compiler}] combination"
      echo "Requested devcontainer ${path}/devcontainer.json does not exist"
      exit 1
    fi
  fi
  if ${docker_mode:-'false'}; then
    launch_docker "$@"
  else
    launch_vscode
  fi
}
main "$@"

View File

@@ -0,0 +1,144 @@
#!/bin/bash
# This script parses the CI matrix.yaml file and generates a devcontainer.json file for each unique combination of
# CUDA version, compiler name/version, and Ubuntu version. The devcontainer.json files are written to the
# .devcontainer directory to a subdirectory named after the CUDA version and compiler name/version.
# GitHub docs on using multiple devcontainer.json files:
# https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/introduction-to-dev-containers#devcontainerjson
set -euo pipefail
# Ensure the script is being executed in its containing directory
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
# Print CLI usage for this generator script and exit non-zero
# (also reached via -h/--help).
function usage {
  echo "Usage: $0 [--clean] [-h/--help] [-v/--verbose]"
  echo " --clean Remove stale devcontainer subdirectories"
  echo " -h, --help Display this help message"
  echo " -v, --verbose Enable verbose mode (set -x)"
  exit 1
}
# Function to update the devcontainer.json file with the provided parameters
update_devcontainer() {
  # Rewrite a devcontainer.json template for one image/compiler combination.
  # Args: $1 input file, $2 output file, $3 name, $4 cuda_version,
  #       $5 compiler_name, $6 compiler_exe, $7 compiler_version, $8 os,
  #       $9 devcontainer image version.
  local input_file="$1"
  local output_file="$2"
  local name="$3"
  local cuda_version="$4"
  local compiler_name="$5"
  local compiler_exe="$6"
  local compiler_version="$7"
  local os="$8"
  local devcontainer_version="$9"
  local IMAGE_ROOT="rapidsai/devcontainers:${devcontainer_version}-cpp-"
  local image="${IMAGE_ROOT}${compiler_name}${compiler_version}-cuda${cuda_version}-${os}"
  # NOTE(review): $compiler_exe is bound as a jq variable but never used in
  # the filter below — confirm whether a containerEnv entry is missing.
  jq --arg image "$image" --arg name "$name" \
    --arg cuda_version "$cuda_version" --arg compiler_name "$compiler_name" \
    --arg compiler_exe "$compiler_exe" --arg compiler_version "$compiler_version" --arg os "$os" \
    '.image = $image | .name = $name | .containerEnv.DEVCONTAINER_NAME = $name |
    .containerEnv.CCCL_BUILD_INFIX = $name |
    .containerEnv.CCCL_CUDA_VERSION = $cuda_version | .containerEnv.CCCL_HOST_COMPILER = $compiler_name |
    .containerEnv.CCCL_HOST_COMPILER_VERSION = $compiler_version '\
    "$input_file" > "$output_file"
}
# Compose the canonical devcontainer directory name, e.g. "cuda12.8-gcc13".
# Args: $1 CUDA version, $2 compiler name, $3 compiler version.
make_name() {
  local cuda="$1"
  local compiler="$2"
  local version="$3"
  printf 'cuda%s-%s%s\n' "${cuda}" "${compiler}" "${version}"
}
CLEAN=false
VERBOSE=false
# Parse command-line flags (any unknown flag prints usage and exits).
while [[ $# -gt 0 ]]; do
  case "$1" in
    --clean)
      CLEAN=true
      ;;
    -h|--help)
      usage
      ;;
    -v|--verbose)
      VERBOSE=true
      ;;
    *)
      usage
      ;;
  esac
  shift
done
# Relative to this script's directory (the script cd's there at startup).
MATRIX_FILE="../ci/matrix.yaml"
# Enable verbose mode if requested
if [ "$VERBOSE" = true ]; then
  set -x
  cat ${MATRIX_FILE}
fi
# Read matrix.yaml and convert it to json
matrix_json=$(yq -o json ${MATRIX_FILE})
# Exclude Windows environments
readonly matrix_json=$(echo "$matrix_json" | jq 'del(.pull_request.nvcc[] | select(.os | contains("windows")))')
# Get the devcontainer image version and define image tag root
readonly DEVCONTAINER_VERSION=$(echo "$matrix_json" | jq -r '.devcontainer_version')
# Get unique combinations of cuda version, compiler name/version, and Ubuntu version
readonly combinations=$(echo "$matrix_json" | jq -c '[.pull_request.nvcc[] | {cuda: .cuda, compiler_name: .compiler.name, compiler_exe: .compiler.exe, compiler_version: .compiler.version, os: .os}] | unique | .[]')
# Update the base devcontainer with the default values
# The root devcontainer.json file is used as the default container as well as a template for all
# other devcontainer.json files by replacing the `image:` field with the appropriate image name
readonly base_devcontainer_file="./devcontainer.json"
# Default environment: the gcc entry with the newest CUDA / compiler version.
readonly NEWEST_GCC_CUDA_ENTRY=$(echo "$combinations" | jq -rs '[.[] | select(.compiler_name == "gcc")] | sort_by((.cuda | tonumber), (.compiler_version | tonumber)) | .[-1]')
readonly DEFAULT_CUDA=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.cuda')
readonly DEFAULT_COMPILER_NAME=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_name')
readonly DEFAULT_COMPILER_EXE=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_exe')
readonly DEFAULT_COMPILER_VERSION=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.compiler_version')
readonly DEFAULT_OS=$(echo "$NEWEST_GCC_CUDA_ENTRY" | jq -r '.os')
readonly DEFAULT_NAME=$(make_name "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_VERSION")
# Rewrite the root devcontainer.json in place, via a temp file.
update_devcontainer ${base_devcontainer_file} "./temp_devcontainer.json" "$DEFAULT_NAME" "$DEFAULT_CUDA" "$DEFAULT_COMPILER_NAME" "$DEFAULT_COMPILER_EXE" "$DEFAULT_COMPILER_VERSION" "$DEFAULT_OS" "$DEVCONTAINER_VERSION"
mv "./temp_devcontainer.json" ${base_devcontainer_file}
# Create an array to keep track of valid subdirectory names
valid_subdirs=()
# The img folder should not be removed:
valid_subdirs+=("img")
# For each unique combination
for combination in $combinations; do
  cuda_version=$(echo "$combination" | jq -r '.cuda')
  compiler_name=$(echo "$combination" | jq -r '.compiler_name')
  compiler_exe=$(echo "$combination" | jq -r '.compiler_exe')
  compiler_version=$(echo "$combination" | jq -r '.compiler_version')
  os=$(echo "$combination" | jq -r '.os')
  name=$(make_name "$cuda_version" "$compiler_name" "$compiler_version")
  # Generate the per-combination devcontainer from the (updated) root template.
  mkdir -p "$name"
  new_devcontainer_file="$name/devcontainer.json"
  update_devcontainer "$base_devcontainer_file" "$new_devcontainer_file" "$name" "$cuda_version" "$compiler_name" "$compiler_exe" "$compiler_version" "$os" "$DEVCONTAINER_VERSION"
  echo "Created $new_devcontainer_file"
  # Add the subdirectory name to the valid_subdirs array
  valid_subdirs+=("$name")
done
# Clean up stale subdirectories and devcontainer.json files
if [ "$CLEAN" = true ]; then
  for subdir in ./*; do
    # Remove any directory whose name is not in valid_subdirs.
    if [ -d "$subdir" ] && [[ ! " ${valid_subdirs[@]} " =~ " ${subdir#./} " ]]; then
      echo "Removing stale subdirectory: $subdir"
      rm -r "$subdir"
    fi
  done
fi

View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# shellcheck disable=SC1091
set -e;
devcontainer-utils-post-create-command;
devcontainer-utils-init-git;
devcontainer-utils-post-attach-command;
cd /home/coder/nvbench/
if test $# -gt 0; then
exec "$@";
else
exec /bin/bash -li;
fi

View File

@@ -0,0 +1,89 @@
#!/bin/bash
# Print a short description of this script to stdout.
function usage {
  cat <<EOF
Usage: $0

This script is intended to be run within one of CCCL's Dev Containers.
It verifies that the expected environment variables and binary versions match what is expected.
EOF
}
# check_envvars NAME...
# Print NAME=value for each required environment variable; emit a GitHub
# Actions error annotation and abort if any of them is unset or empty.
check_envvars() {
  local name value
  for name in "$@"; do
    value="${!name:-}"
    if [[ -n "$value" ]]; then
      echo "$name=$value"
    else
      echo "::error:: ${name} variable is not set."
      exit 1
    fi
  done
}
# check_host_compiler_version
# Verify that the compiler named by $CXX matches the family/version recorded in
# CCCL_HOST_COMPILER / CCCL_HOST_COMPILER_VERSION. Prints the detected compiler
# on success; emits a GitHub Actions error annotation and exits 1 on mismatch
# or when the version cannot be parsed.
check_host_compiler_version() {
local version_output=$($CXX --version)
if [[ "$CXX" == "g++" ]]; then
# g++ prints e.g. "g++ (Ubuntu ...) 12.3.0"; take the 4th whitespace-separated
# token of the first line and keep only the major component.
local actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 4 | cut -d '.' -f 1)
local expected_compiler="gcc"
elif [[ "$CXX" == "clang++" ]]; then
# Extract the major version from "clang version N...".
if [[ $version_output =~ clang\ version\ ([0-9]+) ]]; then
actual_version=${BASH_REMATCH[1]}
else
echo "::error:: Unable to determine clang version."
exit 1
fi
expected_compiler="llvm"
elif [[ "$CXX" == "icpc" ]]; then
# icpc prints e.g. "icpc (ICC) 2021.10.0 ..."; take the 3rd token.
local actual_version=$(echo "$version_output" | head -n 1 | cut -d ' ' -f 3 )
# The icpc compiler version of oneAPI release 2023.2.0 is 2021.10.0
if [[ "$actual_version" == "2021.10.0" ]]; then
actual_version="2023.2.0"
fi
expected_compiler="oneapi"
else
echo "::error:: Unexpected CXX value ($CXX)."
exit 1
fi
# Both the compiler family and the parsed version must match expectations.
if [[ "$expected_compiler" != "${CCCL_HOST_COMPILER}" || "$actual_version" != "$CCCL_HOST_COMPILER_VERSION" ]]; then
echo "::error:: CXX ($CXX) version ($actual_version) does not match the expected compiler (${CCCL_HOST_COMPILER}) and version (${CCCL_HOST_COMPILER_VERSION})."
exit 1
else
echo "Detected host compiler: $CXX version $actual_version"
fi
}
# check_cuda_version
# Verify that the CUDA toolkit version reported by `nvcc --version` matches
# CCCL_CUDA_VERSION. Prints the detected version on success; emits a GitHub
# Actions error annotation and exits 1 on mismatch or parse failure.
check_cuda_version() {
local cuda_version_output=$(nvcc --version)
# nvcc prints e.g. "Cuda compilation tools, release 12.4, ..."; capture MAJOR.MINOR.
if [[ $cuda_version_output =~ release\ ([0-9]+\.[0-9]+) ]]; then
local actual_cuda_version=${BASH_REMATCH[1]}
else
echo "::error:: Unable to determine CUDA version from nvcc."
exit 1
fi
if [[ "$actual_cuda_version" != "$CCCL_CUDA_VERSION" ]]; then
echo "::error:: CUDA version ($actual_cuda_version) does not match the expected CUDA version ($CCCL_CUDA_VERSION)."
exit 1
else
echo "Detected CUDA version: $actual_cuda_version"
fi
}
# main [-h|--help]
# Entry point: verify the devcontainer environment by checking the required
# environment variables, the host compiler, and the CUDA toolkit version.
main() {
  case "${1:-}" in
    -h | --help)
      usage
      exit 0
      ;;
  esac
  set -euo pipefail
  check_envvars DEVCONTAINER_NAME CXX CUDAHOSTCXX CCCL_BUILD_INFIX CCCL_HOST_COMPILER CCCL_CUDA_VERSION CCCL_HOST_COMPILER_VERSION
  check_host_compiler_version
  check_cuda_version
  echo "Dev Container successfully verified!"
}
main "$@"

17
.git-blame-ignore-revs Normal file
View File

@@ -0,0 +1,17 @@
# Exclude these commits from git-blame and similar tools.
#
# To use this file, run the following command from the repo root:
#
# ```
# $ git config blame.ignoreRevsFile .git-blame-ignore-revs
# ```
#
# Include a brief comment with each commit added, for example:
#
# ```
# 8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
# ```
#
# Only add commits that are pure formatting changes (e.g. clang-format version changes, etc).
8f1152d4a22287a35be2dde596e3cf86ace8054a # Increase column limit to 100
3440855dbd405db614861885ad1577fffd882867 # Initial addition of pre-commit.ci formatting.

View File

@@ -0,0 +1,25 @@
name: Compute Matrix
description: "Compute the matrix for a given matrix type from the specified matrix file"
inputs:
matrix_query:
description: "The jq query used to specify the desired matrix. e.g., .pull_request.nvcc"
required: true
matrix_file:
description: 'The file containing the matrix'
required: true
outputs:
matrix:
description: 'The requested matrix'
value: ${{ steps.compute-matrix.outputs.MATRIX }}
runs:
using: "composite"
steps:
- name: Compute matrix
id: compute-matrix
run: |
MATRIX=$(./.github/actions/compute-matrix/compute-matrix.sh ${{inputs.matrix_file}} ${{inputs.matrix_query}} )
echo "matrix=$MATRIX" | tee -a $GITHUB_OUTPUT
shell: bash -euxo pipefail {0}

View File

@@ -0,0 +1,44 @@
#!/bin/bash
set -euo pipefail
# write_output KEY VALUE
# Echo "KEY=VALUE" to stdout and append the same line to $GITHUB_OUTPUT so it
# becomes a step output (falls back to /dev/null for local runs).
write_output() {
  printf '%s=%s\n' "$1" "$2" | tee --append "${GITHUB_OUTPUT:-/dev/null}"
}
# extract_matrix FILE TYPE
# Read the CI matrix FILE (YAML), select its ".TYPE" section, and publish
# step outputs via write_output:
#   DEVCONTAINER_VERSION      - top-level devcontainer_version field
#   PER_CUDA_COMPILER_MATRIX  - nvcc entries grouped by "<cuda>-<compiler>"
#   PER_CUDA_COMPILER_KEYS    - JSON array of the group keys
extract_matrix() {
  local matrix_file="$1"
  local matrix_type="$2"
  local selected
  selected=$(yq -o=json "$matrix_file" | jq -cr ".$matrix_type")
  write_output "DEVCONTAINER_VERSION" "$(yq -o json "$matrix_file" | jq -cr '.devcontainer_version')"
  local nvcc_matrix
  nvcc_matrix="$(echo "$selected" | jq -cr '.nvcc')"
  local grouped
  grouped="$(echo "$nvcc_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
  write_output "PER_CUDA_COMPILER_MATRIX" "$grouped"
  write_output "PER_CUDA_COMPILER_KEYS" "$(echo "$grouped" | jq -r 'keys | @json')"
}
# main [-v] MATRIX_FILE MATRIX_TYPE
# Entry point: optionally enable shell tracing, validate the arguments, then
# emit the requested matrix as step outputs. Only 'pull_request' is supported.
main() {
  # Use ${1:-} so that invoking the script with no arguments prints usage
  # instead of aborting with an "unbound variable" error: the script runs
  # under `set -euo pipefail`, which makes a bare "$1" fatal when unset.
  if [ "${1:-}" == "-v" ]; then
    set -x
    shift
  fi
  if [ $# -ne 2 ] || [ "$2" != "pull_request" ]; then
    echo "Usage: $0 [-v] MATRIX_FILE MATRIX_TYPE"
    echo " -v : Enable verbose output"
    echo " MATRIX_FILE : The path to the matrix file."
    echo " MATRIX_TYPE : The desired matrix. Supported values: 'pull_request'"
    exit 1
  fi
  # Log the inputs to stderr so stdout stays clean for the matrix outputs.
  echo "Input matrix file:" >&2
  cat "$1" >&2
  echo "Matrix Type: $2" >&2
  extract_matrix "$1" "$2"
}
main "$@"

View File

@@ -0,0 +1,13 @@
name: Set up AWS credentials and environment variables for sccache
description: "Set up AWS credentials and environment variables for sccache"
runs:
using: "composite"
steps:
- name: Set environment variables
run: |
echo "SCCACHE_BUCKET=rapids-sccache-devs" >> $GITHUB_ENV
echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV
echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV
echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV
echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV
shell: bash

4
.github/copy-pr-bot.yaml vendored Normal file
View File

@@ -0,0 +1,4 @@
# Configuration file for `copy-pr-bot` GitHub App
# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
enabled: true

View File

@@ -0,0 +1,14 @@
{
"problemMatcher": [
{
"owner": "nvcc",
"pattern": [
{
"regexp": "^\\/home\\/coder\\/(.+):(\\d+):(\\d+): (\\w+): \"(.+)\"$",
"severity": 4,
"message": 5
}
]
}
]
}

View File

@@ -0,0 +1,36 @@
name: build and test
defaults:
run:
shell: bash -exo pipefail {0}
on:
workflow_call:
inputs:
cuda: {type: string, required: true}
host: {type: string, required: true}
cpu: {type: string, required: true}
test_name: {type: string, required: false}
build_script: {type: string, required: false}
test_script: {type: string, required: false}
container_image: {type: string, required: false}
run_tests: {type: boolean, required: false, default: true}
permissions:
contents: read
jobs:
build-and-test:
name: Build/Test ${{inputs.test_name}}
permissions:
id-token: write
contents: read
uses: ./.github/workflows/run-as-coder.yml
with:
cuda: ${{ inputs.cuda }}
host: ${{ inputs.host }}
name: Build/Test ${{inputs.test_name}}
runner: linux-${{inputs.cpu}}-gpu-l4-latest-1
image: ${{ inputs.container_image }}
command: |
${{ inputs.test_script }}

View File

@@ -0,0 +1,34 @@
name: Dispatch build and test
on:
workflow_call:
inputs:
project_name: {type: string, required: true}
per_cuda_compiler_matrix: {type: string, required: true}
devcontainer_version: {type: string, required: true}
permissions:
contents: read
jobs:
# Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration
# ensures that the build/test steps can overlap across different configurations. For example,
# the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11.
build_and_test_linux:
name: build and test linux
permissions:
id-token: write
contents: read
uses: ./.github/workflows/build-and-test-linux.yml
strategy:
fail-fast: false
matrix:
include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }}
with:
cuda: ${{ matrix.cuda }}
host: ${{matrix.compiler.name}}${{matrix.compiler.version}}
cpu: ${{ matrix.cpu }}
test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}} ${{matrix.extra_build_args}}
build_script: "./ci/build_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} ${{matrix.extra_build_args}}"
test_script: "./ci/test_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} ${{matrix.extra_build_args}}"
container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}

107
.github/workflows/pr.yml vendored Normal file
View File

@@ -0,0 +1,107 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is the main workflow that runs on every PR and push to main
name: pr
defaults:
run:
shell: bash -euo pipefail {0}
on:
push:
branches:
- "pull-request/[0-9]+"
# Only runs one instance of this workflow at a time for a given PR and cancels any in-progress runs when a new one starts.
concurrency:
group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
cancel-in-progress: true
permissions:
contents: read
pull-requests: read
jobs:
compute-matrix:
name: Compute matrix
runs-on: ubuntu-latest
outputs:
DEVCONTAINER_VERSION: ${{steps.set-outputs.outputs.DEVCONTAINER_VERSION}}
PER_CUDA_COMPILER_MATRIX: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_MATRIX}}
PER_CUDA_COMPILER_KEYS: ${{steps.set-outputs.outputs.PER_CUDA_COMPILER_KEYS}}
base_sha: ${{ steps.export-pr-info.outputs.base_sha }}
pr_number: ${{ steps.export-pr-info.outputs.pr_number }}
steps:
- name: Checkout repo
uses: actions/checkout@v4
- name: Lookup PR info
id: get-pr-info
uses: nv-gha-runners/get-pr-info@main
- name: Export PR info
id: export-pr-info
run: |
echo "base_sha=${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}" | tee -a "${GITHUB_OUTPUT}"
echo "pr_number=${{ fromJSON(steps.get-pr-info.outputs.pr-info).number }}" | tee -a "${GITHUB_OUTPUT}"
- name: Compute matrix outputs
id: set-outputs
run: |
.github/actions/compute-matrix/compute-matrix.sh ci/matrix.yaml pull_request
nvbench:
name: NVBench CUDA${{ matrix.cuda_host_combination }}
permissions:
id-token: write
contents: read
needs: compute-matrix
uses: ./.github/workflows/dispatch-build-and-test.yml
strategy:
fail-fast: false
matrix:
cuda_host_combination: ${{ fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_KEYS) }}
with:
project_name: "nvbench"
per_cuda_compiler_matrix: ${{ toJSON(fromJSON(needs.compute-matrix.outputs.PER_CUDA_COMPILER_MATRIX)[ matrix.cuda_host_combination ]) }}
devcontainer_version: ${{ needs.compute-matrix.outputs.DEVCONTAINER_VERSION }}
verify-devcontainers:
name: Verify Dev Containers
if: ${{ !contains(github.event.head_commit.message, '[skip-vdc]') }}
needs: compute-matrix
permissions:
id-token: write
contents: read
uses: ./.github/workflows/verify-devcontainers.yml
with:
base_sha: ${{ needs.compute-matrix.outputs.base_sha }}
# This job is the final job that runs after all other jobs and is used for branch protection status checks.
# See: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks
# https://github.com/orgs/community/discussions/26822#discussioncomment-5122101
ci:
runs-on: ubuntu-latest
name: CI
if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
needs:
- nvbench
- verify-devcontainers
steps:
- name: Check status of all precursor jobs
if: >-
${{
contains(needs.*.result, 'failure')
|| contains(needs.*.result, 'cancelled')
}}
run: exit 1

156
.github/workflows/run-as-coder.yml vendored Normal file
View File

@@ -0,0 +1,156 @@
name: Run as coder user
defaults:
run:
shell: bash -exo pipefail {0}
on:
workflow_call:
inputs:
cuda: {type: string, required: true}
host: {type: string, required: true}
name: {type: string, required: true}
image: {type: string, required: true}
runner: {type: string, required: true}
command: {type: string, required: true}
env: { type: string, required: false, default: "" }
permissions:
contents: read
jobs:
run-as-coder:
name: ${{inputs.name}}
permissions:
id-token: write
contents: read
runs-on: ${{inputs.runner}}
container:
# This job now uses a docker-outside-of-docker (DOOD) strategy.
#
# The GitHub Actions runner application mounts the host's docker socket `/var/run/docker.sock` into the
# container. By using a container with the `docker` CLI, this container can launch docker containers
# using the host's docker daemon.
#
# This allows us to run actions that require node v20 in the `cruizba/ubuntu-dind:jammy-26.1.3` container, and
# then launch our Ubuntu18.04-based GCC 6/7 containers to build and test NVBench.
#
# The main inconvenience to this approach is that any container mounts have to match the paths of the runner host,
# not the paths as seen in the intermediate (`cruizba/ubuntu-dind`) container.
#
# Note: I am using `cruizba/ubuntu-dind:jammy-26.1.3` instead of `docker:latest`, because GitHub doesn't support
# JS actions in alpine aarch64 containers, instead failing actions with this error:
# ```
# Error: JavaScript Actions in Alpine containers are only supported on x64 Linux runners. Detected Linux Arm64
# ```
image: cruizba/ubuntu-dind:jammy-26.1.3
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
steps:
- name: Checkout repo
uses: actions/checkout@v4
with:
path: nvbench
persist-credentials: false
- name: Add NVCC problem matcher
run: |
echo "::add-matcher::nvbench/.github/problem-matchers/problem-matcher.json"
- name: Configure credentials and environment variables for sccache
uses: ./nvbench/.github/actions/configure_cccl_sccache
- name: Run command
env:
CI: true
RUNNER: "${{inputs.runner}}"
COMMAND: "${{inputs.command}}"
AWS_ACCESS_KEY_ID: "${{env.AWS_ACCESS_KEY_ID}}"
AWS_SESSION_TOKEN: "${{env.AWS_SESSION_TOKEN}}"
AWS_SECRET_ACCESS_KEY: "${{env.AWS_SECRET_ACCESS_KEY}}"
run: |
echo "[host] github.workspace: ${{github.workspace}}"
echo "[container] GITHUB_WORKSPACE: ${GITHUB_WORKSPACE:-}"
echo "[container] PWD: $(pwd)"
# Necessary because we're doing docker-outside-of-docker:
# Make a symlink in the container that matches the host's ${{github.workspace}}, so that way `$(pwd)`
# in `.devcontainer/launch.sh` constructs volume paths relative to the hosts's ${{github.workspace}}.
mkdir -p "$(dirname "${{github.workspace}}")"
ln -s "$(pwd)" "${{github.workspace}}"
cd "${{github.workspace}}"
cat <<"EOF" > ci.sh
#! /usr/bin/env bash
set -eo pipefail
echo -e "\e[1;34mRunning as '$(whoami)' user in $(pwd):\e[0m"
echo -e "\e[1;34m${{inputs.command}}\e[0m"
eval "${{inputs.command}}" || exit_code=$?
if [ ! -z "$exit_code" ]; then
echo -e "::group::️❗ \e[1;31mInstructions to Reproduce CI Failure Locally\e[0m"
echo "::error:: To replicate this failure locally, follow the steps below:"
echo "1. Clone the repository, and navigate to the correct branch and commit:"
echo " git clone --branch $GITHUB_REF_NAME --single-branch https://github.com/$GITHUB_REPOSITORY.git && cd $(echo $GITHUB_REPOSITORY | cut -d'/' -f2) && git checkout $GITHUB_SHA"
echo ""
echo "2. Run the failed command inside the same Docker container used by the CI:"
echo " docker run --rm -it --gpus all --pull=always --volume \$PWD:/repo --workdir /repo ${{ inputs.image }} ${{inputs.command}}"
echo ""
echo "For additional information, see:"
echo " - DevContainer Documentation: https://github.com/NVIDIA/cccl/blob/main/.devcontainer/README.md"
echo " - Continuous Integration (CI) Overview: https://github.com/NVIDIA/cccl/blob/main/ci-overview.md"
exit $exit_code
fi
EOF
chmod +x ci.sh
mkdir "$RUNNER_TEMP/.aws";
cat <<EOF > "$RUNNER_TEMP/.aws/config"
[default]
bucket=rapids-sccache-devs
region=us-east-2
EOF
cat <<EOF > "$RUNNER_TEMP/.aws/credentials"
[default]
aws_access_key_id=$AWS_ACCESS_KEY_ID
aws_session_token=$AWS_SESSION_TOKEN
aws_secret_access_key=$AWS_SECRET_ACCESS_KEY
EOF
chmod 0600 "$RUNNER_TEMP/.aws/credentials"
chmod 0664 "$RUNNER_TEMP/.aws/config"
declare -a gpu_request=()
# Explicitly pass which GPU to use if on a GPU runner
if [[ "${RUNNER}" = *"-gpu-"* ]]; then
gpu_request+=(--gpus "device=${NVIDIA_VISIBLE_DEVICES}")
fi
host_path() {
sed "s@/__w@$(dirname "$(dirname "${{github.workspace}}")")@" <<< "$1"
}
# Launch this container using the host's docker daemon
${{github.event.repository.name}}/.devcontainer/launch.sh \
--docker \
--cuda ${{inputs.cuda}} \
--host ${{inputs.host}} \
"${gpu_request[@]}" \
--env "CI=$CI" \
--env "AWS_ROLE_ARN=" \
--env "COMMAND=$COMMAND" \
--env "GITHUB_ENV=$GITHUB_ENV" \
--env "GITHUB_SHA=$GITHUB_SHA" \
--env "GITHUB_PATH=$GITHUB_PATH" \
--env "GITHUB_OUTPUT=$GITHUB_OUTPUT" \
--env "GITHUB_ACTIONS=$GITHUB_ACTIONS" \
--env "GITHUB_REF_NAME=$GITHUB_REF_NAME" \
--env "GITHUB_WORKSPACE=$GITHUB_WORKSPACE" \
--env "GITHUB_REPOSITORY=$GITHUB_REPOSITORY" \
--env "GITHUB_STEP_SUMMARY=$GITHUB_STEP_SUMMARY" \
--volume "${{github.workspace}}/ci.sh:/ci.sh" \
--volume "$(host_path "$RUNNER_TEMP")/.aws:/root/.aws" \
--volume "$(dirname "$(dirname "${{github.workspace}}")"):/__w" \
-- /ci.sh

View File

@@ -0,0 +1,150 @@
name: Verify devcontainers
on:
workflow_call:
inputs:
base_sha:
type: string
description: 'For PRs, set the base SHA to conditionally run this workflow only when relevant files are modified.'
required: false
defaults:
run:
shell: bash -euo pipefail {0}
permissions:
contents: read
jobs:
get-devcontainer-list:
name: Verify devcontainer files are up-to-date
outputs:
skip: ${{ steps.inspect-changes.outputs.skip }}
devcontainers: ${{ steps.get-list.outputs.devcontainers }}
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
persist-credentials: false
- name: Setup jq and yq
run: |
sudo apt-get update
sudo apt-get install jq -y
sudo wget -O /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.34.2/yq_linux_amd64
sudo chmod +x /usr/local/bin/yq
- name: Run the script to generate devcontainer files
run: |
./.devcontainer/make_devcontainers.sh --verbose --clean
- name: Check for changes
run: |
if [[ $(git diff --stat) != '' || $(git status --porcelain | grep '^??') != '' ]]; then
git diff --minimal
git status --porcelain
echo "::error:: Dev Container files are out of date or there are untracked files. Run the .devcontainer/make_devcontainers.sh script and commit the changes."
exit 1
else
echo "::note::Dev Container files are up-to-date."
fi
- name: Inspect changes
if: ${{ inputs.base_sha != '' }}
id: inspect-changes
env:
BASE_SHA: ${{ inputs.base_sha }}
run: |
echo "Fetch history and determine merge base..."
git fetch origin --unshallow -q
git fetch origin $BASE_SHA -q
merge_base_sha=$(git merge-base $GITHUB_SHA $BASE_SHA)
echo "Head SHA: $GITHUB_SHA"
echo "PR Base SHA: $BASE_SHA"
echo "Merge Base SHA: $merge_base_sha"
echo "Checking for changes to devcontainer/matrix files..."
all_dirty_files=$(git diff --name-only "${merge_base_sha}" "${GITHUB_SHA}")
echo "::group::All dirty files"
echo "${all_dirty_files}"
echo "::endgroup::"
file_regex="^(.devcontainer|ci/matrix.yaml|.github/actions/workflow-build/build-workflow.py)"
echo "Regex: ${file_regex}"
relevant_dirty_files=$(echo "${all_dirty_files}" | grep -E "${file_regex}" || true)
echo "::group::Relevant dirty files"
echo "${relevant_dirty_files}"
echo "::endgroup::"
if [[ -z "${relevant_dirty_files}" ]]; then
echo "No relevant changes detected. Skipping devcontainer testing."
echo "skip=true" >> $GITHUB_OUTPUT
else
echo "Detected relevant changes. Continuing."
echo "skip=false" >> $GITHUB_OUTPUT
fi
- name: Get list of devcontainer.json paths and names
if: ${{ steps.inspect-changes.outputs.skip != 'true' }}
id: get-list
run: |
devcontainers=$(find .devcontainer/ -name 'devcontainer.json' | while read -r devcontainer; do
jq --arg path "$devcontainer" '{path: $path, name: .name}' "$devcontainer"
done | jq -s -c .)
echo "devcontainers=${devcontainers}" | tee --append "${GITHUB_OUTPUT}"
verify-devcontainers:
name: ${{matrix.devcontainer.name}}
needs: get-devcontainer-list
if: ${{ needs.get-devcontainer-list.outputs.skip != 'true' }}
runs-on: linux-amd64-cpu4
strategy:
fail-fast: false
matrix:
devcontainer: ${{fromJson(needs.get-devcontainer-list.outputs.devcontainers)}}
permissions:
id-token: write
contents: read
steps:
- name: Check out the code
uses: actions/checkout@v4
with:
persist-credentials: false
- name: Install dependencies
run: |
# Add PPA for nodejs, devcontainer CLI requires a newer version:
curl -fsSL https://deb.nodesource.com/setup_20.x -o /tmp/nodesource_setup.sh
sudo bash /tmp/nodesource_setup.sh
sudo apt-get update
sudo apt-get install -y nodejs
sudo npm install -g @devcontainers/cli
# We don't really need sccache configured, but we need the AWS credentials envvars to be set
# in order to avoid the devcontainer hanging waiting for GitHub authentication
- name: Get AWS credentials for sccache bucket
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
aws-region: us-east-2
role-duration-seconds: 43200 # 12 hours
- name: Set environment variables
run: |
echo "SCCACHE_BUCKET=rapids-sccache-devs" >> $GITHUB_ENV
echo "SCCACHE_REGION=us-east-2" >> $GITHUB_ENV
echo "SCCACHE_IDLE_TIMEOUT=32768" >> $GITHUB_ENV
echo "SCCACHE_S3_USE_SSL=true" >> $GITHUB_ENV
echo "SCCACHE_S3_NO_CREDENTIALS=false" >> $GITHUB_ENV
- name: Run in devcontainer
uses: devcontainers/ci@v0.3
with:
push: never
configFile: ${{ matrix.devcontainer.path }}
env: |
SCCACHE_REGION=${{ env.SCCACHE_REGION }}
AWS_ACCESS_KEY_ID=${{ env.AWS_ACCESS_KEY_ID }}
AWS_SESSION_TOKEN=${{ env.AWS_SESSION_TOKEN }}
AWS_SECRET_ACCESS_KEY=${{ env.AWS_SECRET_ACCESS_KEY }}
runCmd: |
.devcontainer/verify_devcontainer.sh

6
.gitignore vendored
View File

@@ -1,4 +1,10 @@
build*/
.aws
.vscode
.cache
.config
.idea
cmake-build-*
*~
compile_commands.json
CMakeUserPresets.json

70
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,70 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
ci:
autofix_commit_msg: |
[pre-commit.ci] auto code formatting
autofix_prs: false
autoupdate_branch: ''
autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
autoupdate_schedule: quarterly
skip: []
submodules: false
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: end-of-file-fixer
- id: mixed-line-ending
- id: trailing-whitespace
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.6
hooks:
- id: clang-format
types_or: [file]
files: |
(?x)^(
^.*\.c$|
^.*\.cpp$|
^.*\.cu$|
^.*\.cuh$|
^.*\.cxx$|
^.*\.h$|
^.*\.hpp$|
^.*\.inl$|
^.*\.mm$
)
args: ["-fallback-style=none", "-style=file", "-i"]
# TODO/REMINDER: add the Ruff vscode extension to the devcontainers
# Ruff, the Python auto-correcting linter/formatter written in Rust
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.6
hooks:
- id: ruff # linter
- id: ruff-format # formatter
# TOML lint & format
- repo: https://github.com/ComPWA/taplo-pre-commit
rev: v0.9.3
hooks:
# See https://github.com/NVIDIA/cccl/issues/3426
# - id: taplo-lint
# exclude: "^docs/"
- id: taplo-format
exclude: "^docs/"
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
hooks:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
exclude: |
(?x)^(
build|
CITATION.md
)
default_language_version:
python: python3

View File

@@ -1,6 +1,5 @@
# 3.20.1 required for rapids-cmake
# 3.21.0 required for NVBench_ADD_DEPENDENT_DLLS_TO_* (MSVC only)
cmake_minimum_required(VERSION 3.20.1)
# 3.30.4 required for rapids-cmake
cmake_minimum_required(VERSION 3.30.4)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
@@ -22,6 +21,11 @@ project(NVBench
nvbench_init_rapids_cmake()
# Define NVBench_DETECTED_${LANG}_STANDARDS
include(cmake/DetectSupportedStandards.cmake)
detect_supported_standards(NVBench CXX 17 20)
detect_supported_standards(NVBench CUDA 17 20)
# See NVIDIA/NVBench#52
find_package(CUDAToolkit REQUIRED)
set(cupti_default ON)
@@ -29,29 +33,37 @@ if (${CUDAToolkit_VERSION} VERSION_LESS 11.3)
set(cupti_default OFF)
endif()
option(BUILD_SHARED_LIBS "Build NVBench as a shared library" ON)
option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON)
option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ${cupti_default})
option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
option(NVBench_ENABLE_HEADER_TESTING "Build NVBench header testing suite." OFF)
option(NVBench_ENABLE_DEVICE_TESTING
"Include tests that require a GPU (with locked clocks)."
OFF
)
option(NVBench_ENABLE_EXAMPLES "Build NVBench examples." OFF)
option(NVBench_ENABLE_INSTALL_RULES "Install NVBench." ${NVBench_TOPLEVEL_PROJECT})
include(cmake/NVBenchUtilities.cmake) # Must be first
include(cmake/NVBenchClangdCompileInfo.cmake) # Must be before any targets are created
include(cmake/NVBenchConfigTarget.cmake)
include(cmake/NVBenchDependentDlls.cmake)
include(cmake/NVBenchExports.cmake)
include(cmake/NVBenchWriteConfigHeader.cmake)
include(cmake/NVBenchDependencies.cmake)
include(cmake/NVBenchInstallRules.cmake)
include(cmake/NVBenchUtilities.cmake)
message(STATUS "NVBench CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
add_subdirectory(nvbench)
if (NVBench_ENABLE_EXAMPLES OR NVBench_ENABLE_TESTING)
if (NVBench_ENABLE_EXAMPLES OR
NVBench_ENABLE_TESTING OR
NVBench_ENABLE_HEADER_TESTING)
include(CTest)
enable_testing()
endif()
@@ -65,4 +77,8 @@ if (NVBench_ENABLE_TESTING)
add_subdirectory(testing)
endif()
if (NVBench_ENABLE_HEADER_TESTING)
include(cmake/NVBenchHeaderTesting.cmake)
endif()
nvbench_generate_exports()

74
CMakePresets.json Normal file
View File

@@ -0,0 +1,74 @@
{
"version": 3,
"cmakeMinimumRequired": {
"major": 3,
"minor": 23,
"patch": 1
},
"configurePresets": [
{
"name": "base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build/$env{CCCL_BUILD_INFIX}/${presetName}",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_CUDA_ARCHITECTURES": "all-major",
"NVBench_ENABLE_CUPTI": true,
"NVBench_ENABLE_DEVICE_TESTING": false,
"NVBench_ENABLE_EXAMPLES": true,
"NVBench_ENABLE_HEADER_TESTING": true,
"NVBench_ENABLE_INSTALL_RULES": true,
"NVBench_ENABLE_NVML": true,
"NVBench_ENABLE_TESTING": true,
"NVBench_ENABLE_WERROR": true
}
},
{
"name": "nvbench-dev",
"displayName": "Developer Build",
"inherits": "base",
"cacheVariables": {
"NVBench_ENABLE_DEVICE_TESTING": true
}
},
{
"name": "nvbench-ci",
"displayName": "NVBench CI",
"inherits": "base"
}
],
"buildPresets": [
{
"name": "nvbench-dev",
"configurePreset": "nvbench-dev"
},
{
"name": "nvbench-ci",
"configurePreset": "nvbench-ci"
}
],
"testPresets": [
{
"name": "base",
"hidden": true,
"output": {
"outputOnFailure": true
},
"execution": {
"noTestsAction": "error",
"stopOnFailure": false
}
},
{
"name": "nvbench-dev",
"configurePreset": "nvbench-dev",
"inherits": "base"
},
{
"name": "nvbench-ci",
"configurePreset": "nvbench-ci",
"inherits": "base"
}
]
}

View File

@@ -25,6 +25,17 @@ features:
* Batch Measurements:
* Executes the benchmark multiple times back-to-back and records total time.
* Reports the average execution time (total time / number of executions).
* [CPU-only Measurements](docs/benchmarks.md#cpu-only-benchmarks)
* Measures the host-side execution time of a non-GPU benchmark.
* Not suitable for microbenchmarking.
# Supported Compilers and Tools
- CMake >= 3.30.4
- CUDA Toolkit + nvcc: 12.0 and above
- g++: 7 -> 14
- clang++: 14 -> 19
- Headers are tested with C++17 -> C++20.
# Getting Started
@@ -34,7 +45,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++:
```cpp
void my_benchmark(nvbench::state& state) {
state.exec([](nvbench::launch& launch) {
state.exec([](nvbench::launch& launch) {
my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
});
}
@@ -57,10 +68,12 @@ This repository provides a number of [examples](examples/) that demonstrate
various NVBench features and usecases:
- [Runtime and compile-time parameter sweeps](examples/axes.cu)
- [CPU-only benchmarking](examples/cpu_only.cu)
- [Enums and compile-time-constant-integral parameter axes](examples/enums.cu)
- [Reporting item/sec and byte/sec throughput statistics](examples/throughput.cu)
- [Skipping benchmark configurations](examples/skip.cu)
- [Benchmarking on a specific stream](examples/stream.cu)
- [Adding / hiding columns (summaries) in markdown output](examples/summaries.cu)
- [Benchmarks that sync CUDA devices: `nvbench::exec_tag::sync`](examples/exec_tag_sync.cu)
- [Manual timing: `nvbench::exec_tag::timer`](examples/exec_tag_timer.cu)
@@ -70,9 +83,9 @@ To build the examples:
```
mkdir -p build
cd build
cmake -DNVBench_ENABLE_EXAMPLES=ON -DCMAKE_CUDA_ARCHITECTURE=70 .. && make
cmake -DNVBench_ENABLE_EXAMPLES=ON -DCMAKE_CUDA_ARCHITECTURES=70 .. && make
```
Be sure to set `CMAKE_CUDA_ARCHITECTURE` based on the GPU you are running on.
Be sure to set `CMAKE_CUDA_ARCHITECTURES` based on the GPU you are running on.
Examples are built by default into `build/bin` and are prefixed with `nvbench.example`.
@@ -119,7 +132,7 @@ Pass: Batch: 0.261963ms GPU, 7.18s total GPU, 27394x
## Demo Project
To get started using NVBench with your own kernels, consider trying out
the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo).
the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo).
`nvbench_demo` provides a simple CMake project that uses NVBench to build an
example benchmark. It's a great way to experiment with the library without a lot
@@ -129,7 +142,7 @@ of investment.
Contributions are welcome!
For current issues, see the [issue board](https://github.com/NVIDIA/nvbench/issues). Issues labeled with [![](https://img.shields.io/github/labels/NVIDIA/nvbench/good%20first%20issue)](https://github.com/NVIDIA/nvbench/labels/good%20first%20issue) are good for first time contributors.
For current issues, see the [issue board](https://github.com/NVIDIA/nvbench/issues). Issues labeled with [![](https://img.shields.io/github/labels/NVIDIA/nvbench/good%20first%20issue)](https://github.com/NVIDIA/nvbench/labels/good%20first%20issue) are good for first time contributors.
## Tests
@@ -146,7 +159,7 @@ To run all tests:
```
make test
```
or
or
```
ctest
```
@@ -163,6 +176,7 @@ testing and parameter tuning of individual kernels. For in-depth analysis of
end-to-end performance of multiple applications, the NVIDIA Nsight tools are
more appropriate.
NVBench is focused on evaluating the performance of CUDA kernels and is not
optimized for CPU microbenchmarks. This may change in the future, but for now,
NVBench is focused on evaluating the performance of CUDA kernels. It also provides
CPU-only benchmarking facilities intended for non-trivial CPU workloads, but is
not optimized for CPU microbenchmarks. This may change in the future, but for now,
consider using Google Benchmark for high resolution CPU benchmarks.

View File

@@ -1,38 +0,0 @@
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
SDK_TYPE:
- cuda
SDK_VER:
- 11.5.1-devel
OS_TYPE:
- ubuntu
OS_VER:
- 20.04
CXX_TYPE:
- clang
- gcc
CXX_VER:
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
exclude:
- CXX_TYPE: clang
CXX_VER: 5
- CXX_TYPE: clang
CXX_VER: 6
- CXX_TYPE: gcc
CXX_VER: 12

View File

@@ -1,30 +0,0 @@
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
SDK_TYPE:
- cuda
SDK_VER:
- 11.5.1-devel
OS_TYPE:
- ubuntu
OS_VER:
- 20.04
CXX_TYPE:
- clang
- gcc
CXX_VER:
- 11
- 12
exclude:
- CXX_TYPE: clang
CXX_VER: 11
- CXX_TYPE: gcc
CXX_VER: 12

246
ci/build_common.sh Executable file
View File

@@ -0,0 +1,246 @@
#!/bin/bash
set -eo pipefail
# Ensure the script is being executed in its containing directory
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
# Script defaults
HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++`
CXX_STANDARD=17
CUDA_COMPILER=${CUDACXX:-nvcc} # $CUDACXX if set, otherwise `nvcc`
CUDA_ARCHS= # Empty, use presets by default.
GLOBAL_CMAKE_OPTIONS=()
DISABLE_CUB_BENCHMARKS= # Enable to force-disable building CUB benchmarks.
# Check if the correct number of arguments has been provided
# usage
# Print this script's help text (options, environment variables, examples)
# and terminate with exit status 1.
function usage {
  cat <<EOF
Usage: $0 [OPTIONS]

The PARALLEL_LEVEL environment variable controls the amount of build parallelism. Default is the number of cores.

Options:
 -v/--verbose: enable shell echo for debugging
 -cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)
 -cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)
 -std: CUDA/C++ standard (Defaults to 17)
 -arch: Target CUDA arches, e.g. "60-real;70;80-virtual" (Defaults to value in presets file)
 -cmake-options: Additional options to pass to CMake

Examples:
 $ PARALLEL_LEVEL=8 $0
 $ PARALLEL_LEVEL=8 $0 -cxx g++-9
 $ $0 -cxx clang++-8
 $ $0 -cxx g++-8 -std 20 -arch 80-real -v -cuda /usr/local/bin/nvcc
 $ $0 -cmake-options "-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-Wfatal-errors"
EOF
  exit 1
}
# Parse options
# Copy the args into a temporary array, since we will modify them and
# the parent script may still need them.
# Copy CLI args into a mutable array; tokens are consumed from the front as
# they are parsed so the parent script's "$@" stays untouched.
args=("$@")
while [ "${#args[@]}" -ne 0 ]; do
case "${args[0]}" in
# Valueless flags consume one token; flags with a value consume two.
-v | --verbose) VERBOSE=1; args=("${args[@]:1}");;
-cxx) HOST_COMPILER="${args[1]}"; args=("${args[@]:2}");;
-std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");;
-cuda) CUDA_COMPILER="${args[1]}"; args=("${args[@]:2}");;
-arch) CUDA_ARCHS="${args[1]}"; args=("${args[@]:2}");;
-disable-benchmarks) DISABLE_CUB_BENCHMARKS=1; args=("${args[@]:1}");;
-cmake-options)
if [ -n "${args[1]}" ]; then
# Split the single quoted argument on spaces into individual CMake options.
IFS=' ' read -ra split_args <<< "${args[1]}"
GLOBAL_CMAKE_OPTIONS+=("${split_args[@]}")
args=("${args[@]:2}")
else
echo "Error: No arguments provided for -cmake-options"
usage
exit 1
fi
;;
-h | -help | --help) usage ;;
# Unknown flags are fatal: report the offender, then show usage and exit.
*) echo "Unrecognized option: ${args[0]}"; usage ;;
esac
done
# Convert to full paths:
HOST_COMPILER=$(which ${HOST_COMPILER})
CUDA_COMPILER=$(which ${CUDA_COMPILER})
# Forward an explicit -arch request to CMake; otherwise the preset's value applies.
if [[ -n "${CUDA_ARCHS}" ]]; then
GLOBAL_CMAKE_OPTIONS+=("-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCHS}")
fi
# -v/--verbose: trace every command from here on.
if [ $VERBOSE ]; then
set -x
fi
# Begin processing unsets after option parsing
set -u
# Build parallelism defaults to the machine's core count.
readonly PARALLEL_LEVEL=${PARALLEL_LEVEL:=$(nproc)}
if [ -z ${CCCL_BUILD_INFIX+x} ]; then
CCCL_BUILD_INFIX=""
fi
# Presets will be configured in this directory:
BUILD_DIR="../build/${CCCL_BUILD_INFIX}"
# The most recent build will always be symlinked to cccl/build/latest
mkdir -p $BUILD_DIR
rm -f ../build/latest
ln -sf $BUILD_DIR ../build/latest
# Now that BUILD_DIR exists, use readlink to canonicalize the path:
BUILD_DIR=$(readlink -f "${BUILD_DIR}")
# Prepare environment for CMake:
export CMAKE_BUILD_PARALLEL_LEVEL="${PARALLEL_LEVEL}"
export CTEST_PARALLEL_LEVEL="1"
export CXX="${HOST_COMPILER}"
export CUDACXX="${CUDA_COMPILER}"
export CUDAHOSTCXX="${HOST_COMPILER}"
export CXX_STANDARD
# Shared logging helpers (begin_group/end_group/run_command/print_var_values).
# NOTE(review): inferred from usage below — confirm against pretty_printing.sh.
source ./pretty_printing.sh
# print_environment_details
# Emits a collapsible log group describing the build environment: key build
# variables, the current commit, and GPU info (when nvidia-smi is available).
# begin_group/end_group/print_var_values come from the sourced pretty_printing.sh.
print_environment_details() {
begin_group "⚙️ Environment Details"
echo "pwd=$(pwd)"
print_var_values \
BUILD_DIR \
CXX_STANDARD \
CXX \
CUDACXX \
CUDAHOSTCXX \
NVCC_VERSION \
CMAKE_BUILD_PARALLEL_LEVEL \
CTEST_PARALLEL_LEVEL \
CCCL_BUILD_INFIX \
GLOBAL_CMAKE_OPTIONS
echo "Current commit is:"
git log -1 || echo "Not a repository"
# GPU details are best-effort: CPU-only runners just note the absence.
if command -v nvidia-smi &> /dev/null; then
nvidia-smi
else
echo "nvidia-smi not found"
fi
end_group "⚙️ Environment Details"
}
# fail_if_no_gpu
# Abort the whole script with status 1 unless nvidia-smi can reach a GPU.
fail_if_no_gpu() {
  if nvidia-smi &> /dev/null; then
    return 0
  fi
  echo "Error: No NVIDIA GPU detected. Please ensure you have an NVIDIA GPU installed and the drivers are properly configured." >&2
  exit 1
}
# print_test_time_summary <ctest_log>
# Summarizes the slowest test steps from a ctest log file using the
# PrintCTestRunTimes.cmake helper. Silently does nothing if the log file
# does not exist. In CI (GITHUB_ACTIONS set) the full report is printed;
# locally only the top 15 lines are shown to keep output short.
# Fixes: ctest_log is now `local` (previously leaked into caller scope) and
# all expansions are quoted so paths containing spaces work.
function print_test_time_summary()
{
  local ctest_log="${1}"
  if [ -f "${ctest_log}" ]; then
    begin_group "⏱️ Longest Test Steps"
    # Only print the full output in CI:
    if [ -n "${GITHUB_ACTIONS:-}" ]; then
      cmake "-DLOGFILE=${ctest_log}" -P ../cmake/PrintCTestRunTimes.cmake
    else
      cmake "-DLOGFILE=${ctest_log}" -P ../cmake/PrintCTestRunTimes.cmake | head -n 15
    fi
    end_group "⏱️ Longest Test Steps"
  fi
}
# configure_preset <build_name> <preset> <cmake_options>
# Runs `cmake --preset=<preset>` from the repository root inside a named log
# group, appending the global options plus the caller's extra options.
# CMAKE_OPTIONS is intentionally left unquoted so it word-splits into
# separate CMake arguments.
# Returns the exit status of the cmake configure invocation.
function configure_preset()
{
local BUILD_NAME=$1
local PRESET=$2
local CMAKE_OPTIONS=$3
local GROUP_NAME="🛠️ CMake Configure ${BUILD_NAME}"
pushd .. > /dev/null
run_command "$GROUP_NAME" cmake --preset=$PRESET --log-level=VERBOSE "${GLOBAL_CMAKE_OPTIONS[@]}" $CMAKE_OPTIONS
status=$?
popd > /dev/null
return $status
}
# build_preset <build_name> <preset>
# Builds the given CMake preset from the repository root inside a named log
# group, bracketing the build with sccache statistics. In CI (GITHUB_ACTIONS
# set) detailed sccache stats and a ninja build-time summary are also shown.
# Returns the exit status of the `cmake --build` invocation.
# Fixes: removed unused locals (green/red); escaped the inner quotes in the
# explanation text so "weighted"/"important" are actually printed quoted
# (matching the same text in ninja_summary.py) instead of being dropped by
# the shell's quote pairing.
function build_preset() {
  local BUILD_NAME=$1
  local PRESET=$2
  local GROUP_NAME="🏗️ Build ${BUILD_NAME}"
  source "./sccache_stats.sh" "start"
  pushd .. > /dev/null
  run_command "$GROUP_NAME" cmake --build --preset=$PRESET -v
  status=$?
  popd > /dev/null
  minimal_sccache_stats=$(source "./sccache_stats.sh" "end")
  # Only print detailed stats in actions workflow
  if [ -n "${GITHUB_ACTIONS:-}" ]; then
    begin_group "💲 sccache stats"
    echo "${minimal_sccache_stats}"
    sccache -s
    end_group
    begin_group "🥷 ninja build times"
    echo "The \"weighted\" time is the elapsed time of each build step divided by the number
of tasks that were running in parallel. This makes it an excellent approximation
of how \"important\" a slow step was. A link that is entirely or mostly serialized
will have a weighted time that is the same or similar to its elapsed time. A
compile that runs in parallel with 999 other compiles will have a weighted time
that is tiny."
    ./ninja_summary.py -C ${BUILD_DIR}/${PRESET} || echo "ninja_summary.py failed"
    end_group
  else
    echo $minimal_sccache_stats
  fi
  return $status
}
# test_preset <build_name> <preset>
# Runs `ctest --preset=<preset>` from the repository root inside a named log
# group, writing the test log under ${BUILD_DIR}/log/ctest and printing a
# summary of the slowest test steps afterwards. Aborts the script early if no
# GPU is available.
# Returns the exit status of the ctest invocation.
# Fixes: log-path variables are now `local` (previously leaked into caller
# scope) and the summary call quotes its path argument so directories with
# spaces work.
function test_preset()
{
  local BUILD_NAME=$1
  local PRESET=$2
  local GROUP_NAME="🚀 Test ${BUILD_NAME}"
  fail_if_no_gpu
  local ctest_log_dir="${BUILD_DIR}/log/ctest"
  local ctest_log="${ctest_log_dir}/${PRESET}"
  mkdir -p "${ctest_log_dir}"
  pushd .. > /dev/null
  run_command "$GROUP_NAME" ctest --output-log "${ctest_log}" --preset=$PRESET
  status=$?
  popd > /dev/null
  print_test_time_summary "${ctest_log}"
  return $status
}
# configure_and_build_preset <build_name> <preset> <cmake_options>
# Convenience wrapper: configure the named preset, then build it.
function configure_and_build_preset()
{
  local build_name="$1"
  local preset="$2"
  local cmake_options="$3"

  configure_preset "$build_name" "$preset" "$cmake_options"
  build_preset "$build_name" "$preset"
}

30
ci/build_nvbench.sh Executable file
View File

@@ -0,0 +1,30 @@
#!/bin/bash
source "$(dirname "$0")/build_common.sh"
print_environment_details
PRESET="nvbench-ci"
CMAKE_OPTIONS=""
# version_lt <lhs> <rhs>
# Returns 0 (true) iff version string <lhs> is strictly less than <rhs>,
# compared with `sort -V` (version ordering, so 11.2 < 11.10). Any "v"
# characters are stripped from both arguments first (e.g. "v11.2").
# Fixes: replaced backticks with $(...), quoted the command substitution to
# avoid word splitting, and used printf instead of non-portable `echo -e`.
function version_lt() {
  local lhs="${1//v/}"
  local rhs="${2//v/}"
  # Equal versions are not "less than":
  [ "$lhs" = "$rhs" ] && return 1
  # lhs < rhs iff lhs sorts first under version ordering.
  [ "$lhs" = "$(printf '%s\n%s\n' "$lhs" "$rhs" | sort -V | head -n 1)" ]
}
# If CUDA_COMPILER is nvcc and the version < 11.3, disable CUPTI
if [[ "$CUDA_COMPILER" == *"nvcc"* ]]; then
# Extract the "X.Y" release number from `nvcc --version` output.
CUDA_VERSION=$(nvcc --version | grep release | sed -r 's/.*release ([0-9.]+).*/\1/')
if version_lt "$CUDA_VERSION" "11.3"; then
CMAKE_OPTIONS+=" -DNVBench_ENABLE_CUPTI=OFF "
fi
fi
configure_and_build_preset "NVBench" "$PRESET" "$CMAKE_OPTIONS"
# NOTE(review): only print_test_time_summary is visible in build_common.sh;
# confirm print_time_summary is defined by one of the sourced helpers.
print_time_summary

View File

@@ -1,231 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
################################################################################
# NVBench build script for gpuCI
################################################################################
set -e
# append variable value
# Appends ${value} to ${variable}, adding a space before ${value} if
# ${variable} is not empty.
# ${!1} is indirect expansion: it reads the variable whose NAME is in $1;
# the eval writes the result back through that same name.
function append {
tmp="${!1:+${!1} }${2}"
eval "${1}=\${tmp}"
}
# log args...
# Print the given message with a ">>>>" gpuCI log prefix, padded with one
# blank line before and after.
function log() {
  local message="${*}"
  printf '\n>>>> %s\n\n' "${message}"
}
# print_with_trailing_blank_line args...
# Print the given message followed by one blank line, preserving any
# newlines inside the message itself.
function print_with_trailing_blank_line {
  local message="${*}"
  printf '%s\n\n' "${message}"
}
# echo_and_run name args...
# Echo ${args[@]}, then execute ${args[@]}
# NOTE(review): ${@:2} is expanded unquoted, so the command's arguments are
# re-split on whitespace when executed — this appears intentional here;
# confirm before adding quotes.
function echo_and_run {
echo "${1}: ${@:2}"
${@:2}
}
# echo_and_run_timed name args...
# Echo ${args[@]}, then execute ${args[@]} and report how long it took,
# including ${name} in the output of the time.
# TIMEFORMAT customizes bash's `time` keyword output; %lR is the long-format
# elapsed (real) time. The leading $'\n' puts the report on its own line.
function echo_and_run_timed {
echo "${@:2}"
TIMEFORMAT=$'\n'"${1} Time: %lR"
time ${@:2}
}
# join_delimit <delimiter> [value [value [...]]]
# Join all remaining values into a single string, separated by the
# single-character delimiter. Eg:
#   foo=(bar baz kramble)
#   joined_foo=$(join_delimit "|" "${foo[@]}")
#   echo joined_foo # "bar|baz|kramble"
function join_delimit {
  local delimiter="${1}"
  shift
  # "${*}" joins the remaining arguments with the first character of IFS.
  local IFS="${delimiter}"
  echo "${*}"
}
################################################################################
# VARIABLES - Set up bash and environmental variables.
################################################################################
# Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc.
source /etc/cccl.bashrc
# Set path.
export PATH=/usr/local/cuda/bin:${PATH}
# Set home to the job's workspace.
export HOME=${WORKSPACE}
# Switch to the build directory.
cd ${WORKSPACE}
mkdir -p build
cd build
# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate:
rm -f .ninja_log
if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then
CMAKE_BUILD_TYPE="Release"
fi
CMAKE_BUILD_FLAGS="--"
# The Docker image sets up `${CXX}` and `${CUDACXX}`.
append CMAKE_FLAGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER='${CUDACXX}'"
if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
echo "nvc++ not supported."
exit 1
else
if [[ "${CXX_TYPE}" == "icc" ]]; then
echo "icc not supported."
exit 1
fi
# We're using NVCC so we need to set the host compiler.
append CMAKE_FLAGS "-DCMAKE_CXX_COMPILER='${CXX}'"
append CMAKE_FLAGS "-DCMAKE_CUDA_HOST_COMPILER='${CXX}'"
append CMAKE_FLAGS "-G Ninja"
# Don't stop on build failures.
append CMAKE_BUILD_FLAGS "-k0"
fi
if [[ -n "${PARALLEL_LEVEL}" ]]; then
DETERMINE_PARALLELISM_FLAGS="-j ${PARALLEL_LEVEL}"
fi
WSL=0
if [[ $(grep -i microsoft /proc/version) ]]; then
echo "Windows Subsystem for Linux detected."
WSL=1
fi
export WSL
#append CMAKE_FLAGS "-DCMAKE_CUDA_ARCHITECTURES=all"
append CMAKE_FLAGS "-DNVBench_ENABLE_EXAMPLES=ON"
append CMAKE_FLAGS "-DNVBench_ENABLE_TESTING=ON"
append CMAKE_FLAGS "-DNVBench_ENABLE_CUPTI=ON"
append CMAKE_FLAGS "-DNVBench_ENABLE_WERROR=ON"
# These consume a lot of time and don't currently have
# any value as regression tests.
append CMAKE_FLAGS "-DNVBench_ENABLE_DEVICE_TESTING=OFF"
# NVML doesn't work under WSL
if [[ ${WSL} -eq 0 ]]; then
append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=ON"
else
append CMAKE_FLAGS "-DNVBench_ENABLE_NVML=OFF"
fi
if [[ -n "${@}" ]]; then
append CMAKE_BUILD_FLAGS "${@}"
fi
append CTEST_FLAGS "--output-on-failure"
# Export variables so they'll show up in the logs when we report the environment.
export CMAKE_FLAGS
export CMAKE_BUILD_FLAGS
export CTEST_FLAGS
################################################################################
# ENVIRONMENT - Configure and print out information about the environment.
################################################################################
log "Determine system topology..."
# Set `${PARALLEL_LEVEL}` if it is unset; otherwise, this just reports the
# system topology.
source ${WORKSPACE}/ci/common/determine_build_parallelism.bash ${DETERMINE_PARALLELISM_FLAGS}
log "Get environment..."
env | sort
log "Check versions..."
# We use sed and echo below to ensure there is always one and only trailing
# line following the output from each tool.
${CXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
echo
${CUDACXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
echo
cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
echo
if [[ "${BUILD_TYPE}" == "gpu" ]]; then
nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/'
fi
################################################################################
# BUILD
################################################################################
log "Configure..."
echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS}
configure_status=$?
log "Build..."
# ${PARALLEL_LEVEL} needs to be passed after we run
# determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}.
set +e # Don't stop on build failures.
echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL}
build_status=$?
set -e
################################################################################
# TEST - Run examples and tests.
################################################################################
log "Test..."
(
# Make sure test_status captures ctest, not tee:
# https://stackoverflow.com/a/999259/11130318
set -o pipefail
echo_and_run_timed "Test" ctest ${CTEST_FLAGS} -j ${PARALLEL_LEVEL} | tee ctest_log
)
test_status=$?
################################################################################
# SUMMARY - Print status of each step and exit with failure if needed.
################################################################################
log "Summary:"
echo "- Configure Error Code: ${configure_status}"
echo "- Build Error Code: ${build_status}"
echo "- Test Error Code: ${test_status}"
if [[ "${configure_status}" != "0" ]] || \
[[ "${build_status}" != "0" ]] || \
[[ "${test_status}" != "0" ]]; then
exit 1
fi

View File

@@ -1,119 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
function usage {
echo "Usage: ${0} [flags...]"
echo
echo "Examine the system topology to determine a reasonable amount of build"
echo "parallelism."
echo
echo "Exported variables:"
echo " \${LOGICAL_CPUS} : Logical processors (e.g. threads)."
echo " \${PHYSICAL_CPUS} : Physical processors (e.g. cores)."
echo " \${TOTAL_MEM} : Total system memory [GB]."
echo " \${MAX_THREADS_PER_CORE} : Maximum threads per core allowed."
echo " \${MIN_MEMORY_PER_THREAD} : Minimum memory [GB] per thread allowed."
echo " \${CPU_BOUND_THREADS} : # of build threads constrained by processors."
echo " \${MEM_BOUND_THREADS} : # of build threads constrained by memory [GB]."
echo " \${PARALLEL_LEVEL} : Determined # of build threads."
echo " \${MEM_PER_THREAD} : Memory [GB] per build thread."
echo
echo "-h, -help, --help"
echo " Print this message."
echo
echo "-q, --quiet"
echo " Print nothing and only export variables."
echo
echo "-j <threads>, --jobs <threads>"
echo " Explicitly set the number of build threads to use."
echo
echo "--max-threads-per-core <threads>"
echo " Specify the maximum threads per core allowed (default: ${MAX_THREADS_PER_CORE} [threads/core])."
echo
echo "--min-memory-per-thread <gigabytes>"
echo " Specify the minimum memory per thread allowed (default: ${MIN_MEMORY_PER_THREAD} [GBs/thread])."
exit -3
}
QUIET=0
export MAX_THREADS_PER_CORE=2
export MIN_MEMORY_PER_THREAD=1 # [GB]
while test ${#} != 0
do
case "${1}" in
-h) ;&
-help) ;&
--help) usage ;;
-q) ;&
--quiet) QUIET=1 ;;
-j) ;&
--jobs)
shift # The next argument is the number of threads.
PARALLEL_LEVEL="${1}"
;;
--max-threads-per-core)
shift # The next argument is the number of threads per core.
MAX_THREADS_PER_CORE="${1}"
;;
--min-memory-per-thread)
shift # The next argument is the amount of memory per thread.
MIN_MEMORY_PER_THREAD="${1}"
;;
esac
shift
done
# https://stackoverflow.com/a/23378780
if [ $(uname) == "Darwin" ]; then
export LOGICAL_CPUS=$(sysctl -n hw.logicalcpu_max)
export PHYSICAL_CPUS=$(sysctl -n hw.physicalcpu_max)
else
export LOGICAL_CPUS=$(lscpu -p | egrep -v '^#' | wc -l)
export PHYSICAL_CPUS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
fi
export TOTAL_MEM=$(awk "BEGIN { printf \"%0.4g\", $(grep MemTotal /proc/meminfo | awk '{ print $2 }') / (1024 * 1024) }")
export CPU_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${PHYSICAL_CPUS} * ${MAX_THREADS_PER_CORE}) }")
export MEM_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${TOTAL_MEM} / ${MIN_MEMORY_PER_THREAD}) }")
if [[ -z "${PARALLEL_LEVEL}" ]]; then
# Pick the smaller of the two as the default.
if [[ "${MEM_BOUND_THREADS}" -lt "${CPU_BOUND_THREADS}" ]]; then
export PARALLEL_LEVEL=${MEM_BOUND_THREADS}
else
export PARALLEL_LEVEL=${CPU_BOUND_THREADS}
fi
else
EXPLICIT_PARALLEL_LEVEL=1
fi
# This can be a floating point number.
export MEM_PER_THREAD=$(awk "BEGIN { printf \"%.04g\", ${TOTAL_MEM} / ${PARALLEL_LEVEL} }")
if [[ "${QUIET}" == 0 ]]; then
echo "Logical CPUs: ${LOGICAL_CPUS} [threads]"
echo "Physical CPUs: ${PHYSICAL_CPUS} [cores]"
echo "Total Mem: ${TOTAL_MEM} [GBs]"
echo "Max Threads Per Core: ${MAX_THREADS_PER_CORE} [threads/core]"
echo "Min Memory Per Threads: ${MIN_MEMORY_PER_THREAD} [GBs/thread]"
echo "CPU Bound Threads: ${CPU_BOUND_THREADS} [threads]"
echo "Mem Bound Threads: ${MEM_BOUND_THREADS} [threads]"
echo -n "Parallel Level: ${PARALLEL_LEVEL} [threads]"
if [[ -n "${EXPLICIT_PARALLEL_LEVEL}" ]]; then
echo " (explicitly set)"
else
echo
fi
echo "Mem Per Thread: ${MEM_PER_THREAD} [GBs/thread]"
fi

View File

@@ -1,14 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
################################################################################
# NVBench build script for gpuCI (CPU-only)
################################################################################
export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
source ${WORKSPACE}/ci/common/build.bash

View File

@@ -1,14 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
################################################################################
# NVBench build script for gpuCI (heterogeneous)
################################################################################
export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
source ${WORKSPACE}/ci/common/build.bash

View File

@@ -1,215 +0,0 @@
#! /usr/bin/env bash
# Copyright (c) 2018-2020 NVIDIA Corporation
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Released under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
################################################################################
# NVBench local containerized build script
################################################################################
function usage {
echo "Usage: ${0} [flags...] [cmake-targets...]"
echo
echo "Build and test your local repository using a gpuCI Docker image."
echo "If CMake targets are specified, only those targets are built and tested."
echo "Otherwise, everything is built and tested."
echo
echo "-h, -help, --help"
echo " Print this message."
echo
echo "-r <path>, --repository <path>"
echo " Path to the repository (default: ${REPOSITORY_PATH})."
echo
echo "-i <image>, --image <image>"
echo " Docker image to use (default: ${IMAGE})"
echo
echo "-l, --local-image"
echo " Use the local version of the image instead of pulling from Docker hub."
echo
echo "-s, --shell-only"
echo " Skip building and testing and launch an interactive shell instead."
echo
echo "-d, --disable-gpus"
echo " Don't start the container with the NVIDIA runtime and GPUs attached."
echo
echo "-c, --clean"
echo " If the build directory already exists, delete it."
echo
echo "-j <threads>, --jobs <threads>"
echo " Number of threads to use when building (default: inferred)."
echo
echo "-b <type>, --cmake-build-type <plan>"
echo " CMake build type to use, either Release, RelWithDebInfo, or Debug"
echo " (default: ${CMAKE_BUILD_TYPE})."
echo
exit -3
}
SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
REPOSITORY_PATH=$(realpath ${SCRIPT_PATH}/../..)
################################################################################
# FLAGS - Process command line flags.
################################################################################
IMAGE="gpuci/cccl:cuda11.5.1-devel-ubuntu20.04-gcc9"
LOCAL_IMAGE=0
SHELL_ONLY=0
BUILD_TYPE="gpu"
CLEAN=0
PARALLEL_LEVEL=""
CMAKE_BUILD_TYPE="Release"
TARGETS=""
while test ${#} != 0
do
case "${1}" in
-h) ;&
-help) ;&
--help) usage ;;
-r) ;&
--repository)
shift # The next argument is the path.
REPOSITORY_PATH="${1}"
;;
-i) ;&
--image)
shift # The next argument is the image.
IMAGE="${1}"
;;
-l) ;&
--local-image) LOCAL_IMAGE=1 ;;
-s) ;&
--shell-only) SHELL_ONLY=1 ;;
-d) ;&
--disable-gpus) BUILD_TYPE="cpu" ;;
-c) ;&
--clean) CLEAN=1 ;;
-j) ;&
--jobs)
shift # The next argument is the number of threads.
PARALLEL_LEVEL="${1}"
;;
-b) ;&
--cmake-build-type)
shift # The next argument is the build type.
CMAKE_BUILD_TYPE="${1}"
;;
*)
TARGETS="${TARGETS:+${TARGETS} }${1}"
;;
esac
shift
done
################################################################################
# PATHS - Setup paths for the container.
################################################################################
# ${REPOSITORY_PATH} is the local filesystem path to the Git repository being
# built and tested. It can be set with the --repository flag.
#
# ${BUILD_PATH} is the local filesystem path that will be used for the build. It
# is named after the image name, allowing multiple image builds to coexist on
# the local filesystem.
#
# ${REPOSITORY_PATH_IN_CONTAINER} is the location of ${REPOSITORY_PATH} inside
# the container.
#
# ${BUILD_PATH_IN_CONTAINER} is the location of ${BUILD_PATH} inside the
# container.
BUILD_PATH=${REPOSITORY_PATH}/build_$(echo "$(basename "${IMAGE}")" | sed -e 's/:/_/g' | sed -e 's/-/_/g')
if [[ "${CLEAN}" != 0 ]]; then
rm -rf ${BUILD_PATH}
fi
mkdir -p ${BUILD_PATH}
BASE_PATH_IN_CONTAINER="/cccl"
REPOSITORY_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")"
BUILD_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")/build"
################################################################################
# ENVIRONMENT - Setup the thunk build script that will be run by the container.
################################################################################
# We have to run `ldconfig` to rebuild `ld.so.cache` to work around this
# failure on Debian: https://github.com/NVIDIA/nvidia-docker/issues/1399
COMMAND="sudo ldconfig; sudo ldconfig"
if [[ "${SHELL_ONLY}" != 0 ]]; then
COMMAND="${COMMAND}; bash"
else
COMMAND="${COMMAND}; ${REPOSITORY_PATH_IN_CONTAINER}/ci/common/build.bash ${TARGETS} || bash"
fi
################################################################################
# GPU - Setup GPUs.
################################################################################
# Note: We always start docker with --gpus, even for cpu builds. Otherwise
# libcuda.so.1 is not present and no NVBench tests are able to run.
# Limit GPUs available to the container based on ${CUDA_VISIBLE_DEVICES}.
if [[ -z "${CUDA_VISIBLE_DEVICES}" ]]; then
VISIBLE_DEVICES="all"
else
VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}"
fi
DOCKER_MAJOR_VER=$(docker -v | sed 's/[^[0-9]*\([0-9]*\).*/\1/')
GPU_OPTS="--gpus device=${VISIBLE_DEVICES}"
if [[ "${DOCKER_MAJOR_VER}" -lt 19 ]]
then
GPU_OPTS="--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES='${VISIBLE_DEVICES}'"
fi
################################################################################
# LAUNCH - Pull and launch the container.
################################################################################
#NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia)
NVIDIA_DOCKER_INSTALLED=1 # Broken on WSL
if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then
echo "NVIDIA Docker not found, please install it: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce"
exit -4
fi
if [[ "${LOCAL_IMAGE}" == 0 ]]; then
docker pull "${IMAGE}"
fi
docker run --rm -it ${GPU_OPTS} \
--cap-add=SYS_PTRACE \
--user "$(id -u)":"$(id -g)" \
-v "${REPOSITORY_PATH}":"${REPOSITORY_PATH_IN_CONTAINER}" \
-v "${BUILD_PATH}":"${BUILD_PATH_IN_CONTAINER}" \
-v /etc/passwd:/etc/passwd:ro \
-v /etc/group:/etc/group:ro \
-v /etc/subuid:/etc/subuid:ro \
-v /etc/subgid:/etc/subgid:ro \
-v /etc/shadow:/etc/shadow:ro \
-v /etc/gshadow:/etc/gshadow:ro \
-e "WORKSPACE=${REPOSITORY_PATH_IN_CONTAINER}" \
-e "BUILD_TYPE=${BUILD_TYPE}" \
-e "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" \
-e "COVERAGE_PLAN=${COVERAGE_PLAN}" \
-e "PARALLEL_LEVEL=${PARALLEL_LEVEL}" \
-w "${BUILD_PATH_IN_CONTAINER}" \
"${IMAGE}" bash -c "${COMMAND}"

61
ci/matrix.yaml Normal file
View File

@@ -0,0 +1,61 @@
cuda_prev_min: &cuda_prev_min '11.1' # Unsupported: No cupti support, issues compiling newer fmt.
cuda_prev_max: &cuda_prev_max '11.8'
cuda_curr_min: &cuda_curr_min '12.0'
cuda_curr_max: &cuda_curr_max '12.8'
# The version of the devcontainer images to use from https://hub.docker.com/r/rapidsai/devcontainers
devcontainer_version: '25.06'
# gcc compiler configurations
gcc7: &gcc7 { name: 'gcc', version: '7', exe: 'g++' }
gcc8: &gcc8 { name: 'gcc', version: '8', exe: 'g++' }
gcc9: &gcc9 { name: 'gcc', version: '9', exe: 'g++' }
gcc10: &gcc10 { name: 'gcc', version: '10', exe: 'g++' }
gcc11: &gcc11 { name: 'gcc', version: '11', exe: 'g++' }
gcc12: &gcc12 { name: 'gcc', version: '12', exe: 'g++' }
gcc13: &gcc13 { name: 'gcc', version: '13', exe: 'g++' }
gcc14: &gcc14 { name: 'gcc', version: '14', exe: 'g++' }
# LLVM Compiler configurations
llvm14: &llvm14 { name: 'llvm', version: '14', exe: 'clang++' }
llvm15: &llvm15 { name: 'llvm', version: '15', exe: 'clang++' }
llvm16: &llvm16 { name: 'llvm', version: '16', exe: 'clang++' }
llvm17: &llvm17 { name: 'llvm', version: '17', exe: 'clang++' }
llvm18: &llvm18 { name: 'llvm', version: '18', exe: 'clang++' }
llvm19: &llvm19 { name: 'llvm', version: '19', exe: 'clang++' }
# Each environment below will generate a unique build/test job
# See the "compute-matrix" job in the workflow for how this is parsed and used
# cuda: The CUDA Toolkit version
# os: The operating system used
# cpu: The CPU architecture
# compiler: The compiler to use
# name: The compiler name
# version: The compiler version
# exe: The unversioned compiler binary name
# Configurations that will run for every PR
pull_request:
nvcc:
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7 }
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8 }
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9 }
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10 }
- {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11 }
- {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12 }
- {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13 }
- {cuda: *cuda_curr_max, os: 'ubuntu24.04', cpu: 'amd64', compiler: *gcc14 }
- {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm18 }
- {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm19 }

390
ci/ninja_summary.py Executable file
View File

@@ -0,0 +1,390 @@
#!/usr/bin/env python3
# Copyright (c) 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
r"""Summarize the last ninja build, invoked with ninja's -C syntax.
This script is designed to be automatically run after each ninja build in
order to summarize the build's performance. Making build performance information
more visible should make it easier to notice anomalies and opportunities. To use
this script on Windows just set NINJA_SUMMARIZE_BUILD=1 and run autoninja.bat.
On Linux you can get autoninja to invoke this script using this syntax:
$ NINJA_SUMMARIZE_BUILD=1 autoninja -C out/Default/ chrome
You can also call this script directly using ninja's syntax to specify the
output directory of interest:
> python3 post_build_ninja_summary.py -C out/Default
Typical output looks like this:
>ninja -C out\debug_component base
ninja.exe -C out\debug_component base -j 960 -l 48 -d keeprsp
ninja: Entering directory `out\debug_component'
[1 processes, 1/1 @ 0.3/s : 3.092s ] Regenerating ninja files
Longest build steps:
0.1 weighted s to build obj/base/base/trace_log.obj (6.7 s elapsed time)
0.2 weighted s to build nasm.exe, nasm.exe.pdb (0.2 s elapsed time)
0.3 weighted s to build obj/base/base/win_util.obj (12.4 s elapsed time)
1.2 weighted s to build base.dll, base.dll.lib (1.2 s elapsed time)
Time by build-step type:
0.0 s weighted time to generate 6 .lib files (0.3 s elapsed time sum)
0.1 s weighted time to generate 25 .stamp files (1.2 s elapsed time sum)
0.2 s weighted time to generate 20 .o files (2.8 s elapsed time sum)
1.7 s weighted time to generate 4 PEFile (linking) files (2.0 s elapsed
time sum)
23.9 s weighted time to generate 770 .obj files (974.8 s elapsed time sum)
26.1 s weighted time (982.9 s elapsed time sum, 37.7x parallelism)
839 build steps completed, average of 32.17/s
If no gn clean has been done then results will be for the last non-NULL
invocation of ninja. Ideas for future statistics, and implementations are
appreciated.
The "weighted" time is the elapsed time of each build step divided by the number
of tasks that were running in parallel. This makes it an excellent approximation
of how "important" a slow step was. A link that is entirely or mostly serialized
will have a weighted time that is the same or similar to its elapsed time. A
compile that runs in parallel with 999 other compiles will have a weighted time
that is tiny."""
import argparse
import errno
import fnmatch
import os
import subprocess
import sys
# The number of long build times to report:
long_count = 10
# The number of long times by extension to report
long_ext_count = 10
class Target:
    """Represents a single line read for a .ninja_log file."""

    def __init__(self, start, end):
        """Creates a target object by passing in the start/end times in
        seconds as a float."""
        self.start = start
        self.end = end
        # Output targets of this build step; appended to by the owner.
        self.targets = []
        self.weighted_duration = 0.0

    def Duration(self):
        """Returns the task duration in seconds as a float."""
        return self.end - self.start

    def SetWeightedDuration(self, weighted_duration):
        """Sets the duration, in seconds, passed in as a float."""
        self.weighted_duration = weighted_duration

    def WeightedDuration(self):
        """Returns the task's weighted duration in seconds as a float.

        The weighted duration is the elapsed time of the task divided by how
        many other tasks were running at the same time, approximating this
        task's impact on the total build time. It should always be the same
        or shorter than the raw duration.
        """
        # Tolerate modest floating-point drift when comparing to Duration().
        slack = 0.000002
        bound = self.Duration() + slack
        if self.weighted_duration > bound:
            print("%s > %s?" % (self.weighted_duration, self.Duration()))
        assert self.weighted_duration <= bound
        return self.weighted_duration

    def DescribeTargets(self):
        """Returns a printable string that summarizes the targets."""
        # Some build steps generate dozens of outputs; truncate so the
        # summary fits most long single-target names without word wrapping.
        joined = ", ".join(self.targets)
        limit = 65
        return joined if len(joined) <= limit else joined[:limit] + "..."
# Copied with some modifications from ninjatracing
def ReadTargets(log, show_all):
    """Parse an open .ninja_log file into a list of Target objects.

    When |show_all| is False, only records belonging to the most recent
    build in the log are kept; earlier builds' data is discarded as soon
    as a new build is detected.
    """
    header = log.readline()
    # An empty .ninja_log is fine: there is simply nothing to report.
    if not header:
        return []
    assert header == "# ninja log v5\n", "unrecognized ninja log version %r" % header
    targets_by_hash = {}
    prev_end = 0.0
    for line in log:
        fields = line.strip().split("\t")
        if len(fields) != 5:
            # If ninja.exe is rudely halted the .ninja_log may be corrupt;
            # silently skip malformed records.
            continue
        start, end, _, name, cmdhash = fields  # Ignore restat.
        # The log stores integral milliseconds; convert to float seconds.
        start = int(start) / 1000.0
        end = int(end) / 1000.0
        if not show_all and end < prev_end:
            # An earlier end time means this record starts a new (possibly
            # incremental) build; drop the previous build's data so the new
            # one is displayed independently. End times are compared rather
            # than start times because records are written as commands
            # *complete*, so end times are monotonic within a build while
            # start times are not.
            targets_by_hash = {}
        target = None
        if cmdhash in targets_by_hash:
            target = targets_by_hash[cmdhash]
            if not show_all and (target.start != start or target.end != end):
                # Short incremental builds may never trip the end-time check
                # above. A build step repeated across two builds shows up
                # with the same hash but changed start/stop times, which
                # also identifies a new build: reset the dictionary.
                targets_by_hash = {}
                target = None
        if not target:
            target = Target(start, end)
            targets_by_hash[cmdhash] = target
        prev_end = end
        target.targets.append(name)
    return list(targets_by_hash.values())
def GetExtension(target, extra_patterns):
    """Return the file extension that best represents a target.

    For targets that generate multiple outputs it is important to return a
    consistent 'canonical' extension. Ultimately the goal is to group build
    steps by type.

    Args:
      target: a Target whose |targets| list of output names is scanned.
      extra_patterns: optional semicolon-separated fnmatch patterns; the
        first pattern that matches an output is returned verbatim as the
        grouping key.
    """
    # Default for targets with no outputs at all (previously this path
    # raised UnboundLocalError) and for outputs with no extension.
    extension = "(no extension found)"
    for output in target.targets:
        if extra_patterns:
            for fn_pattern in extra_patterns.split(";"):
                if fnmatch.fnmatch(output, "*" + fn_pattern + "*"):
                    return fn_pattern
        # Not a true extension, but a good grouping.
        if output.endswith("type_mappings"):
            extension = "type_mappings"
            break
        # Capture two extensions if present. For example: file.javac.jar should
        # be distinguished from file.interface.jar.
        root, ext1 = os.path.splitext(output)
        _, ext2 = os.path.splitext(root)
        extension = ext2 + ext1  # Preserve the order in the file name.
        if len(extension) == 0:
            extension = "(no extension found)"
        if ext1 in [".pdb", ".dll", ".exe"]:
            # Group .dll and .exe together and keep the .dll.lib files from
            # being listed as libraries.
            extension = "PEFile (linking)"
            break
        if ext1 in [".so", ".TOC"]:
            # Attempt to identify linking, avoid identifying as '.TOC'.
            extension = ".so (linking)"
            break
        # Make sure .obj files don't get categorized as mojo files.
        if ext1 in [".obj", ".o"]:
            break
        # Jars are the canonical output of java targets.
        if ext1 == ".jar":
            break
        # Normalize all mojo related outputs to 'mojo'.
        if output.count(".mojom") > 0:
            extension = "mojo"
            break
    return extension
def SummarizeEntries(entries, extra_step_types, elapsed_time_sorting):
    """Print a summary of the passed in list of Target objects.

    Reports the slowest individual build steps, per-extension time totals,
    and overall parallelism statistics.

    Args:
      entries: list of Target objects; sorted in place by this function.
      extra_step_types: optional semicolon-separated fnmatch patterns that
        add extra grouping buckets (see GetExtension).
      elapsed_time_sorting: when True, sort report rows by elapsed time
        instead of weighted time.
    """
    # An empty list has no events to scan; bail out instead of crashing on
    # task_start_stop_times[0] below.
    if not entries:
        return
    # Create a list that is in order by time stamp and has entries for the
    # beginning and ending of each build step (one time stamp may have multiple
    # entries due to multiple steps starting/stopping at exactly the same time).
    # Iterate through this list, keeping track of which tasks are running at all
    # times. At each time step calculate a running total for weighted time so
    # that when each task ends its own weighted time can easily be calculated.
    task_start_stop_times = []
    earliest = -1
    latest = 0
    total_cpu_time = 0
    for target in entries:
        if earliest < 0 or target.start < earliest:
            earliest = target.start
        if target.end > latest:
            latest = target.end
        total_cpu_time += target.Duration()
        task_start_stop_times.append((target.start, "start", target))
        task_start_stop_times.append((target.end, "stop", target))
    length = latest - earliest
    weighted_total = 0.0
    # Sort by the time/type records and ignore |target|
    task_start_stop_times.sort(key=lambda times: times[:2])
    # Now we have all task start/stop times sorted by when they happen. If a
    # task starts and stops on the same time stamp then the start will come
    # first because of the alphabet, which is important for making this work
    # correctly.
    # Track the tasks which are currently running.
    running_tasks = {}
    # Record the time we have processed up to so we know how to calculate time
    # deltas.
    last_time = task_start_stop_times[0][0]
    # Track the accumulated weighted time so that it can efficiently be added
    # to individual tasks.
    last_weighted_time = 0.0
    # Scan all start/stop events.
    for event in task_start_stop_times:
        time, action_name, target = event
        # Accumulate weighted time up to now.
        num_running = len(running_tasks)
        if num_running > 0:
            # Update the total weighted time up to this moment.
            last_weighted_time += (time - last_time) / float(num_running)
        if action_name == "start":
            # Record the total weighted task time when this task starts.
            running_tasks[target] = last_weighted_time
        if action_name == "stop":
            # Record the change in the total weighted task time while this task
            # ran.
            weighted_duration = last_weighted_time - running_tasks[target]
            target.SetWeightedDuration(weighted_duration)
            weighted_total += weighted_duration
            del running_tasks[target]
        last_time = time
    assert len(running_tasks) == 0
    # Warn if the sum of weighted times is off by more than half a second.
    # All times here are in seconds (converted from milliseconds in
    # ReadTargets), so the threshold is 0.5 — the previous value of 500
    # could essentially never trigger.
    if abs(length - weighted_total) > 0.5:
        print(
            "Warning: Possible corrupt ninja log, results may be "
            "untrustworthy. Length = %.3f, weighted total = %.3f"
            % (length, weighted_total)
        )
    # Print the slowest build steps:
    print(" Longest build steps:")
    if elapsed_time_sorting:
        entries.sort(key=lambda x: x.Duration())
    else:
        entries.sort(key=lambda x: x.WeightedDuration())
    for target in entries[-long_count:]:
        print(
            " %8.1f weighted s to build %s (%.1f s elapsed time)"
            % (target.WeightedDuration(), target.DescribeTargets(), target.Duration())
        )
    # Sum up the time by file extension/type of the output file
    count_by_ext = {}
    time_by_ext = {}
    weighted_time_by_ext = {}
    # Scan through all of the targets to build up per-extension statistics.
    for target in entries:
        extension = GetExtension(target, extra_step_types)
        time_by_ext[extension] = time_by_ext.get(extension, 0) + target.Duration()
        weighted_time_by_ext[extension] = (
            weighted_time_by_ext.get(extension, 0) + target.WeightedDuration()
        )
        count_by_ext[extension] = count_by_ext.get(extension, 0) + 1
    print(" Time by build-step type:")
    # Copy to a list with extension name and total time swapped, to (time, ext)
    if elapsed_time_sorting:
        weighted_time_by_ext_sorted = sorted((y, x) for (x, y) in time_by_ext.items())
    else:
        weighted_time_by_ext_sorted = sorted(
            (y, x) for (x, y) in weighted_time_by_ext.items()
        )
    # Print the slowest build target types:
    for time, extension in weighted_time_by_ext_sorted[-long_ext_count:]:
        print(
            " %8.1f s weighted time to generate %d %s files "
            "(%1.1f s elapsed time sum)"
            % (time, count_by_ext[extension], extension, time_by_ext[extension])
        )
    print(
        " %.1f s weighted time (%.1f s elapsed time sum, %1.1fx "
        "parallelism)" % (length, total_cpu_time, total_cpu_time * 1.0 / length)
    )
    print(
        " %d build steps completed, average of %1.2f/s"
        % (len(entries), len(entries) / (length))
    )
def main():
    """Entry point: summarize the most recent build recorded in .ninja_log.

    Dispatches to `siso metrics summary` when a siso_metrics.json is found.
    Returns None on success, or errno.ENOENT when the log file is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-C", dest="build_directory", help="Build directory.")
    parser.add_argument(
        "-s",
        "--step-types",
        help="semicolon separated fnmatch patterns for build-step grouping",
    )
    parser.add_argument(
        "-e",
        "--elapsed_time_sorting",
        default=False,
        action="store_true",
        help="Sort output by elapsed time instead of weighted time",
    )
    parser.add_argument("--log-file", help="specific ninja log file to analyze.")
    args, _extra_args = parser.parse_known_args()

    # Resolve the input files relative to the build directory, if given.
    log_file = ".ninja_log"
    metrics_file = "siso_metrics.json"
    if args.build_directory:
        log_file = os.path.join(args.build_directory, log_file)
        metrics_file = os.path.join(args.build_directory, metrics_file)
    if args.log_file:
        log_file = args.log_file

    if not args.step_types:
        # Offer a convenient way to add extra step types automatically,
        # including when this script is run by autoninja. get() returns None
        # if the variable isn't set.
        args.step_types = os.environ.get("chromium_step_types")
    if args.step_types:
        # Make room for the extra build types in the per-extension report.
        global long_ext_count
        long_ext_count += len(args.step_types.split(";"))

    if os.path.exists(metrics_file):
        # Automatically handle summarizing siso builds.
        cmd = ["siso.bat" if "win32" in sys.platform else "siso", "metrics", "summary"]
        if args.build_directory:
            cmd += ["-C", args.build_directory]
        if args.step_types:
            cmd += ["--step_types", args.step_types]
        if args.elapsed_time_sorting:
            cmd.append("--elapsed_time_sorting")
        subprocess.run(cmd)
        return None

    try:
        with open(log_file, "r") as log:
            entries = ReadTargets(log, False)
            if entries:
                SummarizeEntries(entries, args.step_types, args.elapsed_time_sorting)
    except IOError:
        print("Log file %r not found, no build summary created." % log_file)
        return errno.ENOENT
# Allow use both as a script and as an importable module; main()'s return
# value (None or an errno) becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

105
ci/pretty_printing.sh Normal file
View File

@@ -0,0 +1,105 @@
# Print "ARG=${ARG}" for all args.
function print_var_values() {
  # Walk every argument, treating each as the *name* of a variable.
  for var_name in "$@"; do
    # An empty name is a usage error; flag it loudly.
    if [ -z "$var_name" ]; then
      echo "Usage: print_var_values <variable_name1> <variable_name2> ..."
      return 1
    fi
    # Indirect expansion: look up the named variable's value, falling back
    # to a placeholder when it is unset.
    printf '%s=%s\n' "$var_name" "${!var_name:-(undefined)}"
  done
}
# begin_group: Start a named section of log output, possibly with color.
# Usage: begin_group "Group Name" [Color]
# Group Name: A string specifying the name of the group.
# Color (optional): ANSI color code to set text color. Default is blue (1;34).
function begin_group() {
  local name="${1:-}"
  # See options for colors here: https://gist.github.com/JBlond/2fea43a3049b38287e5e9cefc87b2124
  local blue="34"
  local color="${2:-$blue}"
  if [ -z "${GITHUB_ACTIONS:-}" ]; then
    # Plain console: print a colored banner line.
    echo -e "\e[${color}m================== ${name} ======================\e[0m"
  else
    # GitHub Actions: open a collapsible log group.
    echo -e "::group::\e[${color}m${name}\e[0m"
  fi
}
# end_group: End a named section of log output and print status based on exit status.
# Usage: end_group "Group Name" [Exit Status]
# Group Name: A string specifying the name of the group.
# Exit Status (optional): The exit status of the command run within the group. Default is 0.
function end_group() {
  local name="${1:-}"
  local build_status="${2:-0}"
  # Optional third argument: elapsed seconds, shown in the banner if given.
  local duration="${3:-}"
  local red="31"
  local blue="34"
  if [ -n "${GITHUB_ACTIONS:-}" ]; then
    echo "::endgroup::"
    # On failure, emit an annotation pointing back at the collapsed log.
    if [ "$build_status" -ne 0 ]; then
      echo -e "::error::\e[${red}m ${name} - Failed (⬆️ click above for full log ⬆️)\e[0m"
    fi
  elif [ "$build_status" -ne 0 ]; then
    echo -e "\e[${red}m================== End ${name} - Failed${duration:+ - Duration: ${duration}s} ==================\e[0m"
  else
    echo -e "\e[${blue}m================== End ${name} - Success${duration:+ - Duration: ${duration}s} ==================\n\e[0m"
  fi
}
declare -A command_durations
# Runs a command within a named group, handles the exit status, and prints appropriate messages based on the result.
# Usage: run_command "Group Name" command [arguments...]
function run_command() {
  # First argument is the group/banner name; the rest is the command line.
  local group_name="${1:-}"
  shift
  local command=("$@")
  local status
  begin_group "$group_name"
  # Temporarily disable errexit so a failing command doesn't abort the
  # script before we can report its status and duration.
  set +e
  local start_time=$(date +%s)
  "${command[@]}"
  # Capture the exit status immediately, before any other command runs.
  status=$?
  local end_time=$(date +%s)
  set -e
  local duration=$((end_time - start_time))
  end_group "$group_name" $status $duration
  # Record the duration so print_time_summary can report it later.
  command_durations["$group_name"]=$duration
  # Propagate the wrapped command's exit status to the caller.
  return $status
}
# Print the character count of the given string (via awk's length()).
function string_width() {
  echo "$1" | awk '{ print length($0) }'
}
# Print a table of all recorded group durations, aligned on the longest
# group name, then reset the timing data so a later run starts fresh.
function print_time_summary() {
  local max_length=0
  local group
  # Find the longest group name for formatting. Use the shared
  # string_width helper instead of duplicating its awk logic inline.
  for group in "${!command_durations[@]}"; do
    local group_length=$(string_width "$group")
    if [ "$group_length" -gt "$max_length" ]; then
      max_length=$group_length
    fi
  done
  echo "Time Summary:"
  for group in "${!command_durations[@]}"; do
    printf "%-${max_length}s : %s seconds\n" "$group" "${command_durations[$group]}"
  done
  # Clear the array of timing info
  declare -gA command_durations=()
}

41
ci/sccache_hit_rate.sh Executable file
View File

@@ -0,0 +1,41 @@
#!/bin/bash
# Compute the sccache hit rate between two saved `sccache --show-stats`
# snapshots. Diagnostics go to stderr; the numeric hit rate (when
# applicable) is the only value written to stdout so callers can capture it.
set -euo pipefail

# Ensure two arguments are provided
if [ $# -ne 2 ]; then
  echo "Usage: $0 <before-file> <after-file>" >&2
  exit 1
fi

# Print the contents of the before file.
# (Quote the paths: unquoted $1/$2 would undergo word splitting and
# globbing for paths containing spaces or wildcard characters.)
echo "=== Contents of $1 ===" >&2
cat "$1" >&2
echo "=== End of $1 ===" >&2

# Print the contents of the after file
echo "=== Contents of $2 ===" >&2
cat "$2" >&2
echo "=== End of $2 ===" >&2

# Extract compile requests and cache hits from the before and after files
requests_before=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$1")
hits_before=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$1")
requests_after=$(awk '/^[ \t]*Compile requests[ \t]+[0-9]+/ {print $3}' "$2")
hits_after=$(awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}' "$2")

# Calculate the differences to find out how many new requests and hits
requests_diff=$((requests_after - requests_before))
hits_diff=$((hits_after - hits_before))

echo "New Compile Requests: $requests_diff" >&2
echo "New Hits: $hits_diff" >&2

# Calculate and print the hit rate
if [ "$requests_diff" -eq 0 ]; then
  echo "No new compile requests, hit rate is not applicable"
else
  hit_rate=$(awk -v hits="$hits_diff" -v requests="$requests_diff" 'BEGIN {printf "%.2f", hits/requests * 100}')
  echo "sccache hit rate: $hit_rate%" >&2
  echo "$hit_rate"
fi

52
ci/sccache_stats.sh Executable file
View File

@@ -0,0 +1,52 @@
#!/bin/bash
# This script prints the sccache hit rate between two calls to sccache --show-stats.
# It should be sourced in your script before and after the operations you want to profile,
# with the 'start' or 'end' argument respectively.

mode=$1

if [[ "$mode" != "start" && "$mode" != "end" ]]; then
  echo "Invalid mode: $mode"
  echo "Usage: $0 {start|end}"
  exit 1
fi

# Check if sccache is available
if ! command -v sccache &> /dev/null; then
  echo "Notice: sccache is not available. Skipping..."
  exit 0
fi

case $mode in
  start)
    # Snapshot the stats with a single invocation so the hit and miss
    # counters come from the same report (two invocations could observe
    # different states if compilations finish in between).
    stats=$(sccache --show-stats)
    export SCCACHE_START_HITS=$(echo "$stats" | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
    export SCCACHE_START_MISSES=$(echo "$stats" | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
    ;;
  end)
    if [[ -z ${SCCACHE_START_HITS+x} || -z ${SCCACHE_START_MISSES+x} ]]; then
      echo "Error: start stats not collected. Did you call this script with 'start' before your operations?"
      exit 1
    fi

    # Single snapshot, for the same reason as in the 'start' branch.
    stats=$(sccache --show-stats)
    final_hits=$(echo "$stats" | awk '/^[ \t]*Cache hits[ \t]+[0-9]+/ {print $3}')
    final_misses=$(echo "$stats" | awk '/^[ \t]*Cache misses[ \t]+[0-9]+/ {print $3}')
    hits=$((final_hits - SCCACHE_START_HITS))
    misses=$((final_misses - SCCACHE_START_MISSES))
    total=$((hits + misses))

    prefix=""
    # Quote the expansion: the unquoted form leaves `[` with a malformed
    # expression when GITHUB_ACTIONS is set but empty.
    if [ "${GITHUB_ACTIONS:-false}" = "true" ]; then
      prefix="::notice::"
    fi

    if (( total > 0 )); then
      hit_rate=$(awk -v hits="$hits" -v total="$total" 'BEGIN { printf "%.2f", (hits / total) * 100 }')
      echo "${prefix}sccache hits: $hits | misses: $misses | hit rate: $hit_rate%"
    else
      echo "${prefix}sccache stats: N/A No new compilation requests"
    fi

    unset SCCACHE_START_HITS
    unset SCCACHE_START_MISSES
    ;;
esac

18
ci/test_nvbench.sh Executable file
View File

@@ -0,0 +1,18 @@
#!/bin/bash
# CI driver: build NVBench, then run its test suite via the "nvbench-ci"
# CTest preset. Helper functions (print_environment_details, test_preset,
# print_time_summary) come from build_common.sh.

source "$(dirname "$0")/build_common.sh"

# Run NVBench tests with high parallelism. If any need to be
# serialized, define the `RUN_SERIAL` CMake property on the
# test.
# NOTE(review): assumes PARALLEL_LEVEL is set by build_common.sh or the
# CI environment — confirm.
export CTEST_PARALLEL_LEVEL=${PARALLEL_LEVEL}

print_environment_details

# Forward all script arguments to the build step.
./build_nvbench.sh "$@"

PRESET="nvbench-ci"

test_preset "NVBench" ${PRESET}

print_time_summary

View File

@@ -0,0 +1,65 @@
# Detect the language standards supported by the current compilers.
#
# Usage: detect_supported_cxx_standards(<var_prefix> <lang> <standards>)
#
# - var_prefix: Used to name result variables,
# e.g. ${var_prefix}_${lang}_XX_SUPPORTED will be TRUE or FALSE. Defined for
# each XX in ${standards}.
# - lang: The language to test: C, CXX, or CUDA.
# - standards: List of any standard versions.
#
# Example: detect_supported_standards(PROJ CXX 11 14 17)
# - Sets the following variables in the parent scope to TRUE or FALSE:
# - PROJ_CXX_11_SUPPORTED
# - PROJ_CXX_14_SUPPORTED
# - PROJ_CXX_17_SUPPORTED
# - Sets `PROJ_DETECTED_CXX_STANDARDS` to a list of supported standards (e.g. "11;14;17").
function(detect_supported_standards prefix lang)
  # Compile-feature names look like "cxx_std_17"; build the "<lang>_std" stem.
  string(TOLOWER "${lang}_std" feature_prefix)
  set(all_stds)
  foreach(standard IN LISTS ARGN)
    set(var_name "${prefix}_${lang}_${standard}_SUPPORTED")
    # CMake advertises the dialects the compiler accepts via
    # CMAKE_<LANG>_COMPILE_FEATURES.
    if ("${feature_prefix}_${standard}" IN_LIST CMAKE_${lang}_COMPILE_FEATURES)
      set(${var_name} TRUE)
    else()
      set(${var_name} FALSE)
    endif()

    # Special cases:
    if (standard EQUAL 17 AND
        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7) OR
         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)))
      # gcc < 7 and clang < 8 don't fully support C++17.
      # They accept the flag and have partial support, but nvcc will refuse
      # to enable it and falls back to the default dialect for the current
      # CXX compiler version. This breaks our CI.
      # CMake's COMPILE_FEATURES var reports that these compilers support C++17,
      # but we can't rely on it, so manually disable the dialect in these cases.
      set(${var_name} FALSE)
    endif()

    if (standard EQUAL 20 AND
        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10) OR
         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10) OR
         (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND
          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 1930)))
      # Similar to the above, but for C++20.
      set(${var_name} FALSE)
    endif()

    if (${var_name})
      list(APPEND all_stds ${standard})
    endif()

    message(STATUS "Testing ${lang}${standard} Support: ${${var_name}}")
    # Propagate the per-standard TRUE/FALSE flag to the caller.
    set(${var_name} ${${var_name}} PARENT_SCOPE)
  endforeach()

  # Propagate the aggregate list of supported standards (e.g. "11;14;17").
  set(${prefix}_DETECTED_${lang}_STANDARDS "${all_stds}" PARENT_SCOPE)
endfunction()

View File

@@ -22,47 +22,15 @@ function(nvbench_add_cupti_dep dep_name)
add_library(nvbench::${dep_name_lower} SHARED IMPORTED)
if (WIN32)
# Attempt to locate the dll in the expected location. This is necessary
# because the CUPTI dll has a versioned suffix, so we can't directly search
# for it with find_file.
file(GLOB dep_dll_path "${nvbench_cupti_root}/lib64/${dep_name_lower}*dll")
cmake_path(GET dep_dll_path FILENAME dep_dll_filename)
find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED
DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit."
HINTS "${nvbench_cupti_root}/lib64"
)
mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)
# If the dll was not found in the expected location, use a default filename as a user hint.
if (NOT dep_dll_filename)
set(dep_dll_filename ${dep_name_lower}.dll)
endif()
# Use find_file to create a cache variable and mark the file as REQUIRED.
find_file(NVBench_${dep_name_upper}_DLL ${dep_dll_filename} REQUIRED
DOC "The full path to ${dep_name_lower}.dll from the CUDA Toolkit."
HINTS "${nvbench_cupti_root}/lib64/"
)
mark_as_advanced(NVBench_${dep_name_upper}_DLL)
# The .libs don't have suffixes, so we can just directly search for them.
find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower}.lib REQUIRED
DOC "The full path to ${dep_name_lower}.lib from the CUDA Toolkit."
HINTS "${nvbench_cupti_root}/lib64/"
)
mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)
set_target_properties(nvbench::${dep_name_lower} PROPERTIES
IMPORTED_LOCATION "${NVBench_${dep_name_upper}_DLL}"
IMPORTED_IMPLIB "${NVBench_${dep_name_upper}_LIBRARY}"
)
else()
find_library(NVBench_${dep_name_upper}_LIBRARY ${dep_name_lower} REQUIRED
DOC "The full path to lib${dep_name_lower}.so from the CUDA Toolkit."
HINTS "${nvbench_cupti_root}/lib64"
)
mark_as_advanced(NVBench_${dep_name_upper}_LIBRARY)
set_target_properties(nvbench::${dep_name_lower} PROPERTIES
IMPORTED_LOCATION "${NVBench_${dep_name_upper}_LIBRARY}"
)
endif()
set_target_properties(nvbench::${dep_name_lower} PROPERTIES
IMPORTED_LOCATION "${NVBench_${dep_name_upper}_LIBRARY}"
)
endfunction()
nvbench_add_cupti_dep(nvperf_target)

View File

@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Tell cmake to generate a json file of compile commands for clangd:
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Symlink the compile command output to the source dir, where clangd will find it.
set(compile_commands_file "${CMAKE_BINARY_DIR}/compile_commands.json")
set(compile_commands_link "${CMAKE_SOURCE_DIR}/compile_commands.json")

message(STATUS "Creating symlink from ${compile_commands_link} to ${compile_commands_file}...")
# Remove any stale link, then touch the target so the symlink never dangles
# before the first configure finishes writing compile_commands.json.
# NOTE(review): presumably these are run through a non-fatal wrapper because
# symlink creation can fail (e.g. on Windows without privileges) — confirm.
nvbench_execute_non_fatal_process(COMMAND
  "${CMAKE_COMMAND}" -E rm -f "${compile_commands_link}")
nvbench_execute_non_fatal_process(COMMAND
  "${CMAKE_COMMAND}" -E touch "${compile_commands_file}")
nvbench_execute_non_fatal_process(COMMAND
  "${CMAKE_COMMAND}" -E create_symlink "${compile_commands_file}" "${compile_commands_link}")

View File

@@ -29,46 +29,37 @@ function(nvbench_add_cxx_flag target_name type flag)
target_compile_options(${target_name} ${type}
$<$<COMPILE_LANGUAGE:CXX>:${flag}>
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcompiler=${flag}>
# FIXME nvc++ case
)
endif()
endfunction()
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "/W4")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wall")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wextra")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wconversion")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Woverloaded-virtual")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wcast-qual")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wpointer-arith")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wunused-local-typedef")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wunused-parameter")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wvla")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wgnu")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wno-gnu-line-marker") # WAR 3916341
if (NVBench_ENABLE_WERROR)
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "/WX")
endif()
# Suppress overly-pedantic/unavoidable warnings brought in with /W4:
# C4505: unreferenced local function has been removed
# The CUDA `host_runtime.h` header emits this for
# `__cudaUnregisterBinaryUtil`.
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "/wd4505")
else()
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wall")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wextra")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wconversion")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Woverloaded-virtual")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wcast-qual")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wpointer-arith")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wunused-local-typedef")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wunused-parameter")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wvla")
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Wgnu")
if (NVBench_ENABLE_WERROR)
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Werror")
endif()
if (NVBench_ENABLE_WERROR)
nvbench_add_cxx_flag(nvbench.build_interface INTERFACE "-Werror")
endif()
# GCC-specific flags
if (CMAKE_CXX_COMPILER_ID STREQUAL GNU)
# Experimental filesystem library
if (CMAKE_CXX_COMPILER_ID STREQUAL GNU OR CMAKE_CXX_COMPILER_ID STREQUAL Clang)
target_link_libraries(nvbench.build_interface INTERFACE stdc++fs)
endif()
# CUDA-specific flags
if (CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
# fmtlib uses llvm's _BitInt internally, which is not available when compiling through nvcc:
target_compile_definitions(nvbench.build_interface INTERFACE "FMT_USE_BITINT=0")
endif()
target_compile_options(nvbench.build_interface INTERFACE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcudafe=--display_error_number>
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Wno-deprecated-gpu-targets>
@@ -85,6 +76,5 @@ function(nvbench_config_target target_name)
ARCHIVE_OUTPUT_DIRECTORY "${NVBench_LIBRARY_OUTPUT_DIR}"
LIBRARY_OUTPUT_DIRECTORY "${NVBench_LIBRARY_OUTPUT_DIR}"
RUNTIME_OUTPUT_DIRECTORY "${NVBench_EXECUTABLE_OUTPUT_DIR}"
WINDOWS_EXPORT_ALL_SYMBOLS ON # oooo pretty hammer...
)
endfunction()

View File

@@ -1,52 +1,61 @@
################################################################################
# fmtlib/fmt
rapids_cpm_find(fmt 7.1.3
set(export_set_details)
set(install_fmt OFF)
if(NOT BUILD_SHARED_LIBS AND NVBench_ENABLE_INSTALL_RULES)
set(export_set_details BUILD_EXPORT_SET nvbench-targets
INSTALL_EXPORT_SET nvbench-targets)
set(install_fmt ON)
endif()
rapids_cpm_find(fmt 11.1.4 ${export_set_details}
GLOBAL_TARGETS fmt::fmt fmt::fmt-header-only
CPM_ARGS
GITHUB_REPOSITORY fmtlib/fmt
GIT_TAG 7.1.3
GIT_SHALLOW TRUE
GIT_REPOSITORY "https://github.com/fmtlib/fmt.git"
GIT_TAG "11.1.4"
OPTIONS
# Force static to keep fmt internal.
"BUILD_SHARED_LIBS OFF"
# Suppress warnings from fmt headers by marking them as system.
"FMT_SYSTEM_HEADERS ON"
# Disable install rules since we're linking statically.
"FMT_INSTALL ${install_fmt}"
"CMAKE_POSITION_INDEPENDENT_CODE ON"
)
if(NOT fmt_ADDED)
set(fmt_is_external TRUE)
endif()
################################################################################
# nlohmann/json
#
# Following recipe from
# http://github.com/cpm-cmake/CPM.cmake/blob/master/examples/json/CMakeLists.txt
# Download the zips because the repo takes an excessively long time to clone.
rapids_cpm_find(nlohmann_json 3.9.1
# Release:
rapids_cpm_find(nlohmann_json 3.11.3
CPM_ARGS
URL https://github.com/nlohmann/json/releases/download/v3.9.1/include.zip
URL_HASH SHA256=6bea5877b1541d353bd77bdfbdb2696333ae5ed8f9e8cc22df657192218cad91
PATCH_COMMAND
# Work around compiler bug in nvcc 11.0, see NVIDIA/NVBench#18
${CMAKE_COMMAND} -E copy
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/nlohmann_json.hpp"
"./include/nlohmann/json.hpp"
# Development version:
# I'm waiting for https://github.com/nlohmann/json/issues/2676 to be fixed,
# leave this in to simplify testing patches as they come out. Update the
# `nvbench_json` target too when switching branches.
# CPM_ARGS
# VERSION develop
# URL https://github.com/nlohmann/json/archive/refs/heads/develop.zip
# OPTIONS JSON_MultipleHeaders ON
URL https://github.com/nlohmann/json/releases/download/v3.11.3/include.zip
URL_HASH SHA256=a22461d13119ac5c78f205d3df1db13403e58ce1bb1794edc9313677313f4a9d
PATCH_COMMAND
${CMAKE_COMMAND}
-D "CUDA_VERSION=${CMAKE_CUDA_COMPILER_VERSION}"
-D "CXX_VERSION=${CMAKE_CXX_COMPILER_VERSION}"
-D "CXX_ID=${CMAKE_CXX_COMPILER_ID}"
-P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/json_unordered_map_ice.cmake"
)
# nlohmann_json release headers
add_library(nvbench_json INTERFACE IMPORTED)
target_include_directories(nvbench_json SYSTEM INTERFACE
"${nlohmann_json_SOURCE_DIR}/include"
)
# nlohmann_json development branch:
#add_library(nvbench_json INTERFACE)
#target_link_libraries(nvbench_json INTERFACE nlohmann_json)
if (TARGET nlohmann_json::nlohmann_json)
# If we have a target, just use it. Cannot be an ALIAS library because
# nlohmann_json::nlohmann_json itself might be one.
target_link_libraries(nvbench_json INTERFACE nlohmann_json::nlohmann_json)
else()
# Otherwise we only downloaded the headers.
target_include_directories(nvbench_json SYSTEM INTERFACE
"${nlohmann_json_SOURCE_DIR}/include"
)
endif()
################################################################################
# CUDAToolkit

View File

@@ -1,38 +0,0 @@
# By default, add dependent DLLs to the build dir on MSVC. This avoids
# a variety of runtime issues when using NVML, etc.
# This behavior can be disabled using the following options:
if (WIN32)
option(NVBench_ADD_DEPENDENT_DLLS_TO_BUILD
"Copy dependent dlls to NVBench library build location (MSVC only)."
ON
)
else()
# These are forced off for non-MSVC builds, as $<TARGET_RUNTIME_DLLS:...>
# will always be empty on non-dll platforms.
set(NVBench_ADD_DEPENDENT_DLLS_TO_BUILD OFF)
endif()
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD)
message(STATUS
"CMake 3.21.0 is required when NVBench_ADD_DEPENDENT_DLLS_TO_BUILD "
"is enabled."
)
cmake_minimum_required(VERSION 3.21.0)
endif()
function(nvbench_setup_dep_dlls target_name)
# The custom command below fails when there aren't any runtime DLLs to copy,
# so only enable it when a relevant dependency is enabled:
if (NVBench_ADD_DEPENDENT_DLLS_TO_BUILD AND
(NVBench_ENABLE_NVML OR
NVBench_ENABLE_CUPTI))
add_custom_command(TARGET ${target_name}
POST_BUILD
COMMAND
"${CMAKE_COMMAND}" -E copy
"$<TARGET_RUNTIME_DLLS:${target_name}>"
"$<TARGET_FILE_DIR:${target_name}>"
COMMAND_EXPAND_LISTS
)
endif()
endfunction()

View File

@@ -1,37 +1,51 @@
macro(nvbench_generate_exports)
set(nvbench_build_export_code_block "")
set(nvbench_install_export_code_block "")
if(NVBench_ENABLE_INSTALL_RULES)
set(nvbench_build_export_code_block "")
set(nvbench_install_export_code_block "")
if (NVBench_ENABLE_NVML)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake\")\n"
if (NVBench_ENABLE_NVML)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake\")\n"
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake\")\n"
)
endif()
if (NVBench_ENABLE_CUPTI)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake\")\n"
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake\")\n"
)
endif()
if (TARGET nvbench_json)
set(nvbench_json_code_block
[=[
add_library(nvbench_json INTERFACE IMPORTED)
if (TARGET nlohmann_json::nlohmann_json)
target_link_libraries(nvbench_json INTERFACE nlohmann_json::nlohmann_json)
endif()
]=])
string(APPEND nvbench_build_export_code_block ${nvbench_json_code_block})
string(APPEND nvbench_install_export_code_block ${nvbench_json_code_block})
endif()
rapids_export(BUILD NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main ctl internal_build_interface
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_build_export_code_block
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchNVML.cmake\")\n"
rapids_export(INSTALL NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main ctl internal_build_interface
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_install_export_code_block
)
endif()
if (NVBench_ENABLE_CUPTI)
string(APPEND nvbench_build_export_code_block
"include(\"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake\")\n"
)
string(APPEND nvbench_install_export_code_block
"include(\"\${CMAKE_CURRENT_LIST_DIR}/NVBenchCUPTI.cmake\")\n"
)
endif()
rapids_export(BUILD NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main ctl internal_build_interface
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_build_export_code_block
)
rapids_export(INSTALL NVBench
EXPORT_SET nvbench-targets
NAMESPACE "nvbench::"
GLOBAL_TARGETS nvbench main ctl internal_build_interface
LANGUAGES CUDA CXX
FINAL_CODE_BLOCK nvbench_install_export_code_block
)
endmacro()

View File

@@ -0,0 +1,40 @@
# For every public header, build a translation unit containing `#include <header>`
# with some various checks.
set(excluded_headers_regexes
# Should never be used externally.
"^detail"
"^internal"
)
# Meta target for all configs' header builds:
add_custom_target(nvbench.headers.all)
add_dependencies(nvbench.all nvbench.headers.all)
file(GLOB_RECURSE header_files
RELATIVE "${NVBench_SOURCE_DIR}/nvbench/"
CONFIGURE_DEPENDS
"${NVBench_SOURCE_DIR}/nvbench/*.cuh"
)
foreach (exclusion IN LISTS excluded_headers_regexes)
list(FILTER header_files EXCLUDE REGEX "${exclusion}")
endforeach()
# Defines an OBJECT library `target_name` that compiles one generated .cu
# translation unit per entry in `header_files`; each unit simply includes its
# header, verifying that every public header is self-contained when built at
# the requested CUDA standard (`cuda_std`, e.g. 17 -> cuda_std_17).
# The new target is attached to the nvbench.headers.all meta target.
function(nvbench_add_header_target target_name cuda_std)
  set(generated_srcs)
  foreach(header IN LISTS header_files)
    # `header_str` is consumed by configure_file while expanding the template:
    set(header_str "nvbench/${header}")
    set(generated_src "headers/${target_name}/${header}.cu")
    configure_file("${NVBench_SOURCE_DIR}/cmake/header_test.in.cxx" "${generated_src}")
    list(APPEND generated_srcs "${generated_src}")
  endforeach()
  add_library(${target_name} OBJECT ${generated_srcs})
  target_link_libraries(${target_name} PUBLIC nvbench::nvbench)
  set_target_properties(${target_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std})
  add_dependencies(nvbench.headers.all ${target_name})
endfunction()
foreach (std IN LISTS NVBench_DETECTED_CUDA_STANDARDS)
nvbench_add_header_target(nvbench.headers.cpp${std} ${std})
endforeach()

View File

@@ -1,61 +1,69 @@
include(GNUInstallDirs)
rapids_cmake_install_lib_dir(NVBench_INSTALL_LIB_DIR)
# in-source public headers:
install(DIRECTORY "${NVBench_SOURCE_DIR}/nvbench"
TYPE INCLUDE
FILES_MATCHING
PATTERN "*.cuh"
PATTERN "internal" EXCLUDE
)
if(NVBench_ENABLE_INSTALL_RULES)
# generated headers from build dir:
install(
FILES
"${NVBench_BINARY_DIR}/nvbench/config.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench"
)
install(
FILES
"${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench/detail"
)
include(GNUInstallDirs)
rapids_cmake_install_lib_dir(NVBench_INSTALL_LIB_DIR)
#
# Install CMake files needed by consumers to locate dependencies:
#
# in-source public headers:
install(DIRECTORY "${NVBench_SOURCE_DIR}/nvbench"
TYPE INCLUDE
FILES_MATCHING
PATTERN "*.cuh"
PATTERN "internal" EXCLUDE
)
# Borrowing this logic from rapids_cmake's export logic to make sure these end
# up in the same location as nvbench-config.cmake:
rapids_cmake_install_lib_dir(config_install_location)
set(config_install_location "${config_install_location}/cmake/nvbench")
if (NVBench_ENABLE_NVML)
# generated headers from build dir:
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake"
DESTINATION "${config_install_location}"
"${NVBench_BINARY_DIR}/nvbench/config.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench"
)
endif()
if (NVBench_ENABLE_CUPTI)
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake"
DESTINATION "${config_install_location}"
"${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nvbench/detail"
)
#
# Install CMake files needed by consumers to locate dependencies:
#
# Borrowing this logic from rapids_cmake's export logic to make sure these end
# up in the same location as nvbench-config.cmake:
rapids_cmake_install_lib_dir(config_install_location)
set(config_install_location "${config_install_location}/cmake/nvbench")
if (NVBench_ENABLE_NVML)
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchNVML.cmake"
DESTINATION "${config_install_location}"
)
endif()
if (NVBench_ENABLE_CUPTI)
install(
FILES
"${NVBench_SOURCE_DIR}/cmake/NVBenchCUPTI.cmake"
DESTINATION "${config_install_location}"
)
endif()
endif()
# Call with a list of library targets to generate install rules:
function(nvbench_install_libraries)
install(TARGETS ${ARGN}
DESTINATION "${NVBench_INSTALL_LIB_DIR}"
EXPORT nvbench-targets
)
if(NVBench_ENABLE_INSTALL_RULES)
install(TARGETS ${ARGN}
DESTINATION "${NVBench_INSTALL_LIB_DIR}"
EXPORT nvbench-targets
)
endif()
endfunction()
# Call with a list of executables to generate install rules:
function(nvbench_install_executables)
install(TARGETS ${ARGN} EXPORT nvbench-targets)
if(NVBench_ENABLE_INSTALL_RULES)
install(TARGETS ${ARGN} EXPORT nvbench-targets)
endif()
endfunction()

View File

@@ -1,37 +1,7 @@
# Since this file is installed, we need to make sure that the CUDAToolkit has
# been found by consumers:
if (NOT TARGET CUDA::toolkit)
find_package(CUDAToolkit REQUIRED)
endif()
if (WIN32)
# The CUDA:: targets currently don't provide dll locations through the
# `IMPORTED_LOCATION` property, nor are they marked as `SHARED` libraries
# (they're currently `UNKNOWN`). This prevents the `nvbench_setup_dep_dlls`
# CMake function from copying the dlls to the build / install directories.
# This is discussed in https://gitlab.kitware.com/cmake/cmake/-/issues/22845
# and the other CMake issues it links to.
#
# We create a nvbench-specific target that configures the nvml interface as
# described here:
# https://gitlab.kitware.com/cmake/cmake/-/issues/22845#note_1077538
#
# Use find_file instead of find_library, which would search for a .lib file.
# This is also nice because find_file searches recursively (find_library
# does not) and some versions of CTK nest nvml.dll several directories deep
# under C:\Windows\System32.
find_file(NVBench_NVML_DLL nvml.dll REQUIRED
DOC "The full path to nvml.dll. Usually somewhere under C:/Windows/System32."
PATHS "C:/Windows/System32"
)
mark_as_advanced(NVBench_NVML_DLL)
add_library(nvbench::nvml SHARED IMPORTED)
target_link_libraries(nvbench::nvml INTERFACE CUDA::toolkit)
set_target_properties(nvbench::nvml PROPERTIES
IMPORTED_LOCATION "${NVBench_NVML_DLL}"
IMPORTED_IMPLIB "${CUDA_nvml_LIBRARY}"
)
else()
# Linux is much easier...
add_library(nvbench::nvml ALIAS CUDA::nvml)
endif()
# Since this file is installed, we need to make sure that the CUDAToolkit has
# been found by consumers:
if (NOT TARGET CUDA::toolkit)
find_package(CUDAToolkit REQUIRED)
endif()
add_library(nvbench::nvml ALIAS CUDA::nvml)

View File

@@ -1,10 +1,12 @@
# Called before project(...)
macro(nvbench_load_rapids_cmake)
file(DOWNLOAD
https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake
"${CMAKE_BINARY_DIR}/RAPIDS.cmake"
)
include("${CMAKE_BINARY_DIR}/RAPIDS.cmake")
if(NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/NVBENCH_RAPIDS.cmake")
file(DOWNLOAD
https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-25.04/RAPIDS.cmake
"${CMAKE_CURRENT_BINARY_DIR}/NVBENCH_RAPIDS.cmake"
)
endif()
include("${CMAKE_CURRENT_BINARY_DIR}/NVBENCH_RAPIDS.cmake")
include(rapids-cmake)
include(rapids-cpm)
@@ -18,10 +20,9 @@ endmacro()
# Called after project(...)
macro(nvbench_init_rapids_cmake)
rapids_cmake_build_type(Release)
rapids_cmake_write_version_file("${NVBench_BINARY_DIR}/nvbench/detail/version.cuh")
rapids_cmake_write_git_revision_file(
nvbench_git_revision
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
rapids_cmake_write_version_file(
"${NVBench_BINARY_DIR}/nvbench/detail/version.cuh"
PREFIX "NVBENCH"
)
rapids_cpm_init()
endmacro()

View File

@@ -1,3 +1,48 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Passes all args directly to execute_process while setting up the following
# results variables and propagating them to the caller's scope:
#
# - nvbench_process_exit_code
# - nvbench_process_stdout
# - nvbench_process_stderr
#
# If the command is not successful (e.g. the last command does not return zero),
# a non-fatal warning is printed.
# Wrapper around execute_process() that never aborts configuration. All
# arguments are forwarded verbatim; exit code, stdout, and stderr are captured
# and propagated to the caller's scope (see the usage comment above).
function(nvbench_execute_non_fatal_process)
  # Forward all caller arguments directly; only the result-capture variables
  # are appended here.
  execute_process(${ARGN}
    RESULT_VARIABLE nvbench_process_exit_code
    OUTPUT_VARIABLE nvbench_process_stdout
    ERROR_VARIABLE nvbench_process_stderr
  )
  # A failing command is reported as a non-fatal warning; callers inspect
  # nvbench_process_exit_code to decide how to proceed.
  if (NOT nvbench_process_exit_code EQUAL 0)
    message(WARNING
      "execute_process failed with non-zero exit code: ${nvbench_process_exit_code}\n"
      "${ARGN}\n"
      "stdout:\n${nvbench_process_stdout}\n"
      "stderr:\n${nvbench_process_stderr}\n"
    )
  endif()
  # function() introduces a new scope, so explicitly export the results:
  set(nvbench_process_exit_code "${nvbench_process_exit_code}" PARENT_SCOPE)
  set(nvbench_process_stdout "${nvbench_process_stdout}" PARENT_SCOPE)
  set(nvbench_process_stderr "${nvbench_process_stderr}" PARENT_SCOPE)
endfunction()
# Writes CMAKE_CUDA_ARCHITECTURES to out_var, but using escaped semicolons
# as delimiters
function(nvbench_escaped_cuda_arches out_var)

View File

@@ -0,0 +1,127 @@
## This CMake script parses the output of ctest and prints a formatted list
## of individual test runtimes, sorted longest first.
##
## ctest > ctest_log
## cmake -DLOGFILE=ctest_log \
## -DMINSEC=10 \
## -P PrintCTestRunTimes.cmake
##
################################################################################
cmake_minimum_required(VERSION 3.15)
# Left-pads the value held by `string_var` with "0" characters until its
# length is at least `width`, writing the result back into the caller's scope.
# Values already at or beyond `width` characters are returned unchanged.
function(pad_string_with_zeros string_var width)
  set(padded "${${string_var}}")
  string(LENGTH "${padded}" current_length)
  math(EXPR deficit "${width} - ${current_length}")
  # A non-positive deficit means no padding is required.
  if (deficit GREATER 0)
    string(REPEAT "0" ${deficit} zero_fill)
    string(PREPEND padded "${zero_fill}")
  endif()
  set(${string_var} "${padded}" PARENT_SCOPE)
endfunction()
################################################################################
# LOGFILE must be supplied on the command line via -DLOGFILE=<path>.
if (NOT LOGFILE)
  message(FATAL_ERROR "Missing -DLOGFILE=<ctest output> argument.")
endif()

# Tests faster than MINSEC seconds are tallied but not listed individually.
if (NOT DEFINED MINSEC)
  set(MINSEC 10)
endif()

# Number of tests that ran faster than MINSEC:
set(num_below_thresh 0)

# Check if logfile exists
if (NOT EXISTS "${LOGFILE}")
  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
endif()

# Regex matching one ctest status line; capture groups:
# 1 = test id, 2 = test name, 3 = result, 4 = whole seconds of runtime.
string(JOIN "" regex
  "[0-9]+/[0-9]+[ ]+Test[ ]+#"
  "([0-9]+)" # Test ID
  ":[ ]+"
  "([^ ]+)" # Test Name
  "[ ]*\\.+[ ]*\\**[ ]*"
  "([^ ]+)" # Result
  "[ ]+"
  "([0-9]+)" # Seconds
  "\\.[0-9]+[ ]+sec"
)

message(DEBUG "LOGFILE: ${LOGFILE}")
message(DEBUG "MINSEC: ${MINSEC}")
message(DEBUG "regex: ${regex}")

# Read the logfile and generate a map / keylist
set(keys)
file(STRINGS "${LOGFILE}" lines)
foreach(line ${lines})
  # Parse each build time
  string(REGEX MATCH "${regex}" _DUMMY "${line}")
  if (CMAKE_MATCH_COUNT EQUAL 4)
    set(test_id "${CMAKE_MATCH_1}")
    set(test_name "${CMAKE_MATCH_2}")
    set(test_result "${CMAKE_MATCH_3}")
    set(tmp "${CMAKE_MATCH_4}") # floor(runtime_seconds)

    # Count, but do not list, tests below the reporting threshold:
    if (tmp LESS MINSEC)
      math(EXPR num_below_thresh "${num_below_thresh} + 1")
      continue()
    endif()

    # Compute human readable time
    math(EXPR days "${tmp} / (60 * 60 * 24)")
    math(EXPR tmp "${tmp} - (${days} * 60 * 60 * 24)")
    math(EXPR hours "${tmp} / (60 * 60)")
    math(EXPR tmp "${tmp} - (${hours} * 60 * 60)")
    math(EXPR minutes "${tmp} / (60)")
    math(EXPR tmp "${tmp} - (${minutes} * 60)")
    math(EXPR seconds "${tmp}")

    # Format time components
    pad_string_with_zeros(days 3)
    pad_string_with_zeros(hours 2)
    pad_string_with_zeros(minutes 2)
    pad_string_with_zeros(seconds 2)

    # Construct table entry
    # Later values in the file for the same command overwrite earlier entries
    string(MAKE_C_IDENTIFIER "${test_id}" key)
    string(JOIN " | " ENTRY_${key}
      "${days}d ${hours}h ${minutes}m ${seconds}s"
      "${test_result}"
      "${test_id}: ${test_name}"
    )

    # Record the key:
    list(APPEND keys "${key}")
  endif()
endforeach()

list(REMOVE_DUPLICATES keys)

# Build the entry list:
set(entries)
foreach(key ${keys})
  list(APPEND entries "${ENTRY_${key}}")
endforeach()

# Not fatal (unlike PrintNinjaBuildTimes.cmake): a log with no tests above
# the threshold is still a valid result.
if (NOT entries)
  message(STATUS "LOGFILE contained no test times ('${LOGFILE}').")
endif()

# Sort in descending order:
list(SORT entries ORDER DESCENDING)

# Dump table:
foreach(entry ${entries})
  message(STATUS ${entry})
endforeach()

if (num_below_thresh GREATER 0)
  message(STATUS "${num_below_thresh} additional tests took < ${MINSEC}s each.")
endif()

View File

@@ -0,0 +1,101 @@
## This CMake script parses a .ninja_log file (LOGFILE) and prints a list of
## build/link times, sorted longest first.
##
## cmake -DLOGFILE=<.ninja_log file> \
## -P PrintNinjaBuildTimes.cmake
##
## If LOGFILE is omitted, the current directory's .ninja_log file is used.
################################################################################
cmake_minimum_required(VERSION 3.15)
# Left-pads the value held by `string_var` with "0" characters until its
# length is at least `width`, writing the result back into the caller's scope.
# Values already at or beyond `width` characters are returned unchanged.
function(pad_string_with_zeros string_var width)
  set(padded "${${string_var}}")
  string(LENGTH "${padded}" current_length)
  math(EXPR deficit "${width} - ${current_length}")
  # A non-positive deficit means no padding is required.
  if (deficit GREATER 0)
    string(REPEAT "0" ${deficit} zero_fill)
    string(PREPEND padded "${zero_fill}")
  endif()
  set(${string_var} "${padded}" PARENT_SCOPE)
endfunction()
################################################################################
# Default to the .ninja_log in the current directory when not specified.
if (NOT LOGFILE)
  set(LOGFILE ".ninja_log")
endif()

# Check if logfile exists
if (NOT EXISTS "${LOGFILE}")
  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
endif()

# Read the logfile and generate a map / keylist
set(keys)
file(STRINGS "${LOGFILE}" lines)
foreach(line ${lines})
  # Parse each build time. Matched .ninja_log lines are tab-separated:
  # start_ms, end_ms, a numeric field (ignored), the output path, a hex hash.
  string(REGEX MATCH
    "^([0-9]+)\t([0-9]+)\t[0-9]+\t([^\t]+)+\t[0-9a-fA-F]+$" _DUMMY "${line}")
  if (CMAKE_MATCH_COUNT EQUAL 3)
    set(start_ms ${CMAKE_MATCH_1})
    set(end_ms ${CMAKE_MATCH_2})
    set(command "${CMAKE_MATCH_3}")
    math(EXPR runtime_ms "${end_ms} - ${start_ms}")

    # Compute human readable time
    math(EXPR days "${runtime_ms} / (1000 * 60 * 60 * 24)")
    math(EXPR runtime_ms "${runtime_ms} - (${days} * 1000 * 60 * 60 * 24)")
    math(EXPR hours "${runtime_ms} / (1000 * 60 * 60)")
    math(EXPR runtime_ms "${runtime_ms} - (${hours} * 1000 * 60 * 60)")
    math(EXPR minutes "${runtime_ms} / (1000 * 60)")
    math(EXPR runtime_ms "${runtime_ms} - (${minutes} * 1000 * 60)")
    math(EXPR seconds "${runtime_ms} / 1000")
    math(EXPR milliseconds "${runtime_ms} - (${seconds} * 1000)")

    # Format time components
    pad_string_with_zeros(days 3)
    pad_string_with_zeros(hours 2)
    pad_string_with_zeros(minutes 2)
    pad_string_with_zeros(seconds 2)
    pad_string_with_zeros(milliseconds 3)

    # Construct table entry
    # Later values in the file for the same command overwrite earlier entries
    string(MAKE_C_IDENTIFIER "${command}" key)
    set(ENTRY_${key}
      "${days}d ${hours}h ${minutes}m ${seconds}s ${milliseconds}ms | ${command}"
    )

    # Record the key:
    list(APPEND keys "${key}")
  endif()
endforeach()

list(REMOVE_DUPLICATES keys)

# Build the entry list:
set(entries)
foreach(key ${keys})
  list(APPEND entries "${ENTRY_${key}}")
endforeach()

if (NOT entries)
  message(FATAL_ERROR "LOGFILE contained no build entries ('${LOGFILE}').")
endif()
# Sort in descending order so the longest build steps are printed first.
# A single SORT with ORDER DESCENDING (CMake >= 3.13; this script already
# requires 3.15) replaces the previous SORT + REVERSE pair and matches the
# sibling script PrintCTestRunTimes.cmake.
list(SORT entries ORDER DESCENDING)

# Dump table:
message(STATUS "-----------------------+----------------------------")
message(STATUS "Time | Command ")
message(STATUS "-----------------------+----------------------------")
foreach(entry ${entries})
  message(STATUS ${entry})
endforeach()

45
cmake/header_test.in.cxx Normal file
View File

@@ -0,0 +1,45 @@
// This source file checks that:
// 1) Header <${header_str}> compiles without error.
// 2) Common macro collisions with platform/system headers are avoided.
// NOTE: This is a configure_file() template -- CMake substitutes
// ${header_str} with the path of the header under test before compiling.

// Turn off failures for certain configurations:
#ifndef NVBench_IGNORE_MACRO_CHECKS

// Define NVBench_MACRO_CHECK(macro, header), which emits a diagnostic indicating
// a potential macro collision and halts.
//
// Hacky way to build a string, but it works on all tested platforms.
#define NVBench_MACRO_CHECK(MACRO, HEADER) \
  NVBench_MACRO_CHECK_IMPL( \
    Identifier MACRO should not be used from NVBench headers due to conflicts with HEADER macros.)

// Use raw platform checks instead of the NVBench_HOST_COMPILER macros since we
// don't want to #include any headers other than the one being tested.
//
// This is only implemented for GCC/Clang.
#if defined(__clang__) || defined(__GNUC__)

// GCC/clang are easy:
#define NVBench_MACRO_CHECK_IMPL(msg) NVBench_MACRO_CHECK_IMPL0(GCC error #msg)
#define NVBench_MACRO_CHECK_IMPL0(expr) _Pragma(#expr)

#endif // defined(__clang__) || defined(__GNUC__)

// complex.h conflicts
#define I NVBench_MACRO_CHECK('I', complex.h)

// windows.h conflicts
#define small NVBench_MACRO_CHECK('small', windows.h)

// We can't enable these checks without breaking some builds -- some standard
// library implementations unconditionally `#undef` these macros, which then
// causes random failures later.
// Leaving these commented out as a warning: Here be dragons.
// #define min(...) NVBench_MACRO_CHECK('min', windows.h)
// #define max(...) NVBench_MACRO_CHECK('max', windows.h)

// termios.h conflicts (NVIDIA/thrust#1547)
#define B0 NVBench_MACRO_CHECK("B0", termios.h)

#endif // NVBench_IGNORE_MACRO_CHECKS

#include <${header_str}>

View File

@@ -0,0 +1,22 @@
# NVCC 11.1 and GCC 9 need a patch to build, otherwise:
#
# nlohmann/ordered_map.hpp(29): error #3316:
# Internal Compiler Error (codegen): "internal error during structure layout!"
#
# Usage:
# ${CMAKE_COMMAND}
# -D "CUDA_VERSION=${CMAKE_CUDA_COMPILER_VERSION}"
# -D "CXX_VERSION=${CMAKE_CXX_COMPILER_VERSION}"
# -D "CXX_ID=${CMAKE_CXX_COMPILER_ID}"
# -P "json_unordered_map_ice.cmake"

# Only patch the affected toolchains: CUDA <= 11.8 with GNU >= 9.0.
if(CUDA_VERSION VERSION_GREATER 11.8 OR NOT CXX_ID STREQUAL "GNU" OR CXX_VERSION VERSION_LESS 9.0)
  return()
endif()

# Read the file and replace the string "JSON_NO_UNIQUE_ADDRESS" with
# "/* JSON_NO_UNIQUE_ADDRESS */".
# NOTE(review): the path below is relative to the working directory --
# presumably this script runs from the nlohmann_json source root; confirm
# against the invoking patch step.
file(READ "include/nlohmann/ordered_map.hpp" NLOHMANN_ORDERED_MAP_HPP)
string(REPLACE "JSON_NO_UNIQUE_ADDRESS" "/* [NVBench Patch] JSON_NO_UNIQUE_ADDRESS */"
  NLOHMANN_ORDERED_MAP_HPP "${NLOHMANN_ORDERED_MAP_HPP}")
file(WRITE "include/nlohmann/ordered_map.hpp" "${NLOHMANN_ORDERED_MAP_HPP}")

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++:
```cpp
void my_benchmark(nvbench::state& state) {
state.exec([](nvbench::launch& launch) {
state.exec([](nvbench::launch& launch) {
my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
});
}
@@ -97,7 +97,7 @@ void benchmark(nvbench::state& state)
const auto num_inputs = state.get_int64("NumInputs");
thrust::device_vector<int> data = generate_input(num_inputs);
state.exec([&data](nvbench::launch& launch) {
state.exec([&data](nvbench::launch& launch) {
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
});
}
@@ -134,7 +134,7 @@ void benchmark(nvbench::state& state)
const auto quality = state.get_float64("Quality");
state.exec([&quality](nvbench::launch& launch)
{
{
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(quality);
});
}
@@ -153,7 +153,7 @@ void benchmark(nvbench::state& state)
thrust::device_vector<int> data = generate_input(rng_dist);
state.exec([&data](nvbench::launch& launch)
{
{
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
});
}
@@ -182,13 +182,13 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T>)
thrust::device_vector<T> data = generate_input<T>();
state.exec([&data](nvbench::launch& launch)
{
{
my_kernel<<<blocks, threads, 0, launch.get_stream()>>>(data.begin(), data.end());
});
}
using my_types = nvbench::type_list<int, float, double>;
NVBENCH_BENCH_TYPES(my_benchmark, NVBENCH_TYPE_AXES(my_types))
.set_type_axis_names({"ValueType"});
.set_type_axes_names({"ValueType"});
```
The `NVBENCH_TYPE_AXES` macro is unfortunately necessary to prevent commas in
@@ -293,7 +293,6 @@ In general::
More examples can found in [examples/throughput.cu](../examples/throughput.cu).
# Skip Uninteresting / Invalid Benchmarks
Sometimes particular combinations of parameters aren't useful or interesting —
@@ -321,7 +320,7 @@ void my_benchmark(nvbench::state& state, nvbench::type_list<T, U>)
// Skip benchmarks at compile time -- for example, always skip when T == U
// (Note that the `type_list` argument defines the same type twice).
template <typename SameType>
void my_benchmark(nvbench::state& state,
void my_benchmark(nvbench::state& state,
nvbench::type_list<SameType, SameType>)
{
state.skip("T must not be the same type as U.");
@@ -347,6 +346,15 @@ true:
synchronize internally.
- `nvbench::exec_tag::timer` requests a timer object that can be used to
restrict the timed region.
- `nvbench::exec_tag::no_batch` disables batch measurements. This both disables
them during execution to reduce runtime, and prevents their compilation to
reduce compile-time and binary size.
- `nvbench::exec_tag::gpu` is an optional hint that prevents non-GPU benchmarking
code from being compiled for a particular benchmark. A runtime error is emitted
if the benchmark is defined with `set_is_cpu_only(true)`.
- `nvbench::exec_tag::no_gpu` is an optional hint that prevents GPU benchmarking
code from being compiled for a particular benchmark. A runtime error is emitted
if the benchmark does not also define `set_is_cpu_only(true)`.
Multiple execution tags may be combined using `operator|`, e.g.
@@ -397,7 +405,7 @@ Note that using manual timer mode disables batch measurements.
void timer_example(nvbench::state& state)
{
// Pass the `timer` exec tag to request a timer:
state.exec(nvbench::exec_tag::timer,
state.exec(nvbench::exec_tag::timer,
// Lambda now accepts a timer:
[](nvbench::launch& launch, auto& timer)
{
@@ -418,6 +426,79 @@ NVBENCH_BENCH(timer_example);
See [examples/exec_tag_timer.cu](../examples/exec_tag_timer.cu) for a complete
example.
## Compilation hints: `nvbench::exec_tag::no_batch`, `gpu`, and `no_gpu`
These execution tags are optional hints that disable the compilation of various
code paths when they are not needed. They apply only to a single benchmark.
- `nvbench::exec_tag::no_batch` prevents the execution and instantiation of the batch measurement backend.
- `nvbench::exec_tag::gpu` prevents the instantiation of CPU-only benchmarking backends.
- Requires that the benchmark does not define `set_is_cpu_only(true)`.
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
- Host-side CPU measurements of GPU kernel execution time are still provided.
- `nvbench::exec_tag::no_gpu` prevents the instantiation of GPU benchmarking backends.
- Requires that the benchmark defines `set_is_cpu_only(true)`.
- Optional; this has no effect on runtime measurements, but reduces compile-time and binary size.
- See also [CPU-only Benchmarks](#cpu-only-benchmarks).
# CPU-only Benchmarks
NVBench provides CPU-only benchmarking facilities that are intended for measuring
significant CPU workloads. We do not recommend using these features for high-resolution
CPU benchmarking -- other libraries (such as Google Benchmark) are more appropriate for
such applications. Examples are provided in [examples/cpu_only.cu](../examples/cpu_only.cu).
Note that NVBench still requires a CUDA compiler and runtime even if a project only contains
CPU-only benchmarks.
The `is_cpu_only` property of the benchmark toggles between GPU and CPU-only measurements:
```cpp
void my_cpu_benchmark(nvbench::state &state)
{
state.exec([](nvbench::launch &) { /* workload */ });
}
NVBENCH_BENCH(my_cpu_benchmark)
.set_is_cpu_only(true); // Mark as CPU-only.
```
The optional `nvbench::exec_tag::no_gpu` hint may be used to reduce the compilation time and
binary size of CPU-only benchmarks. An error is emitted at runtime if this tag is used while
`is_cpu_only` is false.
```cpp
void my_cpu_benchmark(nvbench::state &state)
{
state.exec(nvbench::exec_tag::no_gpu, // Prevent compilation of GPU backends
[](nvbench::launch &) { /* workload */ });
}
NVBENCH_BENCH(my_cpu_benchmark)
.set_is_cpu_only(true); // Mark as CPU-only.
```
The `nvbench::exec_tag::timer` execution tag is also supported by CPU-only benchmarks. This
is useful for benchmarks that require additional per-sample setup/teardown. See the
[`nvbench::exec_tag::timer`](#explicit-timer-mode-nvbenchexec_tagtimer) section for more
details.
```cpp
void my_cpu_benchmark(nvbench::state &state)
{
state.exec(nvbench::exec_tag::no_gpu | // Prevent compilation of GPU backends
nvbench::exec_tag::timer, // Request a timer object
[](nvbench::launch &, auto &timer)
{
// Setup here
timer.start();
// timed workload
timer.stop();
// teardown here
});
}
NVBENCH_BENCH(my_cpu_benchmark)
.set_is_cpu_only(true); // Mark as CPU-only.
```
# Beware: Combinatorial Explosion Is Lurking
Be very careful of how quickly the configuration space can grow. The following
@@ -430,7 +511,7 @@ using value_types = nvbench::type_list<nvbench::uint8_t,
nvbench::int32_t,
nvbench::float32_t,
nvbench::float64_t>;
using op_types = nvbench::type_list<thrust::plus<>,
using op_types = nvbench::type_list<thrust::plus<>,
thrust::multiplies<>,
thrust::maximum<>>;
@@ -445,7 +526,7 @@ NVBENCH_BENCH_TYPES(my_benchmark,
```
960 total configs
= 4 [T=(U8, I32, F32, F64)]
= 4 [T=(U8, I32, F32, F64)]
* 4 [U=(U8, I32, F32, F64)]
* 4 [V=(U8, I32, F32, F64)]
* 3 [Op=(plus, multiplies, max)]
@@ -453,9 +534,10 @@ NVBENCH_BENCH_TYPES(my_benchmark,
```
For large configuration spaces like this, pruning some of the less useful
combinations using the techniques described in the [Zipped/Tied Iteration of Value Axes](#zipped-iteration-of-value-axes)
or [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting--invalid-benchmarks) section can help immensely with
keeping compile / run times manageable.
combinations using the techniques described in the
[Zipped/Tied Iteration of Value Axes](#zipped-iteration-of-value-axes)
or [Skip Uninteresting / Invalid Benchmarks](#skip-uninteresting--invalid-benchmarks)
sections can help immensely with keeping compile / run times manageable.
Splitting a single large configuration space into multiple, more focused
benchmarks with reduced dimensionality will likely be worth the effort as well.

View File

@@ -83,28 +83,6 @@
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--min-samples <count>`
* Gather at least `<count>` samples per measurement.
* Default is 10 samples.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--min-time <seconds>`
* Accumulate at least `<seconds>` of execution time per measurement.
* Default is 0.5 seconds.
* If both GPU and CPU times are gathered, this applies to GPU time only.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--max-noise <value>`
* Gather samples until the error in the measurement drops below `<value>`.
* Noise is specified as the percent relative standard deviation.
* Default is 0.5% (`--max-noise 0.5`)
* Only applies to Cold measurements.
* If both GPU and CPU times are gathered, this applies to GPU noise only.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--skip-time <seconds>`
* Skip a measurement when a warmup run executes in less than `<seconds>`.
* Default is -1 seconds (disabled).
@@ -115,6 +93,42 @@
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--throttle-threshold <value>`
* Set the GPU throttle threshold as percentage of the device's default clock rate.
* Default is 75.
* Set to 0 to disable throttle detection entirely.
* Note that throttling is disabled when `nvbench::exec_tag::sync` is used.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--throttle-recovery-delay <value>`
* Set the GPU throttle recovery delay in seconds.
* Default is 0.05 seconds.
* Note that throttling is disabled when `nvbench::exec_tag::sync` is used.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--run-once`
* Only run the benchmark once, skipping any warmup runs and batched
measurements.
* Intended for use with external profiling tools.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--disable-blocking-kernel`
* Don't use the `blocking_kernel`.
* Intended for use with external profiling tools.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--profile`
* Implies `--run-once` and `--disable-blocking-kernel`.
* Intended for use with external profiling tools.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
## Stopping Criteria
* `--timeout <seconds>`
* Measurements will timeout after `<seconds>` have elapsed.
* Default is 15 seconds.
@@ -125,9 +139,55 @@
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--run-once`
* Only run the benchmark once, skipping any warmup runs and batched
measurements.
* Intended for use with external profiling tools.
* `--min-samples <count>`
* Gather at least `<count>` samples per measurement before checking any
other stopping criterion besides the timeout.
* Default is 10 samples.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--stopping-criterion <criterion>`
* After `--min-samples` is satisfied, use `<criterion>` to detect if enough
samples were collected.
* Only applies to Cold and CPU-only measurements.
* If both GPU and CPU times are gathered, GPU time is used for stopping
analysis.
* Stopping criteria provided by NVBench are:
* "stdrel": (default) Converges to a minimal relative standard deviation,
stdev / mean
* "entropy": Converges based on the cumulative entropy of all samples.
* Each stopping criterion may provide additional parameters to customize
behavior, as detailed below:
### "stdrel" Stopping Criterion Parameters
* `--min-time <seconds>`
* Accumulate at least `<seconds>` of execution time per measurement.
* Only applies to `stdrel` stopping criterion.
* Default is 0.5 seconds.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--max-noise <value>`
* Gather samples until the error in the measurement drops below `<value>`.
* Noise is specified as the percent relative standard deviation (stdev/mean).
* Default is 0.5% (`--max-noise 0.5`).
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
### "entropy" Stopping Criterion Parameters
* `--max-angle <value>`
* Maximum slope angle of the linear regression fitted to the cumulative entropy.
* Smaller values give more accurate results.
* Default is 0.048.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.
* `--min-r2 <value>`
* Minimum coefficient of determination for linear regression of cumulative
entropy.
* Larger values give more accurate results.
* Default is 0.36.
* Applies to the most recent `--benchmark`, or all benchmarks if specified
before any `--benchmark` arguments.

View File

@@ -1,12 +1,15 @@
set(example_srcs
auto_throughput.cu
axes.cu
custom_criterion.cu
cpu_only.cu
enums.cu
exec_tag_sync.cu
exec_tag_timer.cu
skip.cu
stream.cu
summaries.cu
throughput.cu
auto_throughput.cu
custom_iteration_spaces.cu
)
@@ -14,39 +17,39 @@ set(example_srcs
add_custom_target(nvbench.example.all)
add_dependencies(nvbench.all nvbench.example.all)
foreach(example_src IN LISTS example_srcs)
get_filename_component(example_name "${example_src}" NAME_WLE)
string(PREPEND example_name "nvbench.example.")
add_executable(${example_name} "${example_src}")
nvbench_config_target(${example_name})
target_include_directories(${example_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}")
target_link_libraries(${example_name} PRIVATE nvbench::main)
set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_17)
add_test(NAME ${example_name}
COMMAND "$<TARGET_FILE:${example_name}>" --timeout 0.1 --min-time 1e-5
)
function (nvbench_add_examples_target target_prefix cuda_std)
add_custom_target(${target_prefix}.all)
add_dependencies(nvbench.example.all ${target_prefix}.all)
add_dependencies(nvbench.example.all ${example_name})
endforeach()
foreach(example_src IN LISTS example_srcs)
get_filename_component(example_name "${example_src}" NAME_WLE)
string(PREPEND example_name "${target_prefix}.")
add_executable(${example_name} "${example_src}")
nvbench_config_target(${example_name})
target_include_directories(${example_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}")
target_link_libraries(${example_name} PRIVATE nvbench::main)
set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std})
# Silence some warnings from old thrust headers:
set(thrust_examples
auto_throughput
axes
exec_tag_sync
exec_tag_timer
skip
throughput
)
foreach (example IN LISTS thrust_examples)
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
# C4324: structure was padded due to alignment specifier
nvbench_add_cxx_flag(nvbench.example.${example} PRIVATE "/wd4324")
# warning C4201: nonstandard extension used: nameless struct/union:
# Fixed in Thrust 1.12.0 (CTK 11.4, NV HPC 21.3)
if (${CUDAToolkit_VERSION} VERSION_LESS 11.4)
nvbench_add_cxx_flag(nvbench.example.${example} PRIVATE "/wd4201")
set(example_args --timeout 0.1)
# The custom_criterion example doesn't support the --min-time argument:
if (NOT "${example_src}" STREQUAL "custom_criterion.cu")
list(APPEND example_args --min-time 1e-5)
endif()
endif()
add_test(NAME ${example_name}
COMMAND "$<TARGET_FILE:${example_name}>" ${example_args})
# These should not deadlock. If they do, it may be that the CUDA context was created before
# setting CUDA_MODULE_LOAD=EAGER in main, see NVIDIA/nvbench#136.
set_tests_properties(${example_name} PROPERTIES
FAIL_REGULAR_EXPRESSION "Possible Deadlock Detected"
)
add_dependencies(${target_prefix}.all ${example_name})
endforeach()
endfunction()
foreach (std IN LISTS NVBench_DETECTED_CUDA_STANDARDS)
nvbench_add_examples_target(nvbench.example.cpp${std} ${std})
endforeach()

View File

@@ -24,37 +24,33 @@
template <int ItemsPerThread>
__global__ void kernel(std::size_t stride,
std::size_t elements,
const nvbench::int32_t * __restrict__ in,
const nvbench::int32_t *__restrict__ in,
nvbench::int32_t *__restrict__ out)
{
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
const std::size_t step = gridDim.x * blockDim.x;
for (std::size_t i = stride * tid;
i < stride * elements;
i += stride * step)
for (std::size_t i = stride * tid; i < stride * elements; i += stride * step)
{
for (int j = 0; j < ItemsPerThread; j++)
{
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto read_id = (ItemsPerThread * i + j) % elements;
const auto write_id = tid + j * elements;
out[write_id] = in[read_id];
out[write_id] = in[read_id];
}
}
}
// `throughput_bench` copies a 128 MiB buffer of int32_t, and reports throughput
// and cache hit rates.
//
// Calling state.collect_*() enables particular metric collection if nvbench
// was build with CUPTI support (CMake option: -DNVBench_ENABLE_CUPTI=ON).
template <int ItemsPerThread>
void throughput_bench(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
void throughput_bench(nvbench::state &state, nvbench::type_list<nvbench::enum_type<ItemsPerThread>>)
{
// Allocate input data:
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t stride = static_cast<std::size_t>(state.get_int64("Stride"));
const std::size_t elements = 128 * 1024 * 1024 / sizeof(nvbench::int32_t);
thrust::device_vector<nvbench::int32_t> input(elements);
thrust::device_vector<nvbench::int32_t> output(elements * ItemsPerThread);
@@ -72,12 +68,11 @@ void throughput_bench(nvbench::state &state,
static_cast<int>((elements + threads_in_block - 1) / threads_in_block);
state.exec([&](nvbench::launch &launch) {
kernel<ItemsPerThread>
<<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
kernel<ItemsPerThread><<<blocks_in_grid, threads_in_block, 0, launch.get_stream()>>>(
stride,
elements,
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()));
});
}

View File

@@ -56,8 +56,8 @@ NVBENCH_BENCH(single_float64_axis)
void copy_sweep_grid_shape(nvbench::state &state)
{
// Get current parameters:
const int block_size = static_cast<int>(state.get_int64("BlockSize"));
const int num_blocks = static_cast<int>(state.get_int64("NumBlocks"));
const auto block_size = static_cast<unsigned int>(state.get_int64("BlockSize"));
const auto num_blocks = static_cast<unsigned int>(state.get_int64("NumBlocks"));
// Number of int32s in 256 MiB:
const std::size_t num_values = 256 * 1024 * 1024 / sizeof(nvbench::int32_t);
@@ -71,17 +71,16 @@ void copy_sweep_grid_shape(nvbench::state &state)
thrust::device_vector<nvbench::int32_t> in(num_values, 0);
thrust::device_vector<nvbench::int32_t> out(num_values, 0);
state.exec(
[block_size,
num_blocks,
num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
in_ptr,
out_ptr,
num_values);
});
state.exec([block_size,
num_blocks,
num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
}
NVBENCH_BENCH(copy_sweep_grid_shape)
// Every second power of two from 64->1024:
@@ -106,14 +105,12 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
thrust::device_vector<ValueType> in(num_values, 0);
thrust::device_vector<ValueType> out(num_values, 0);
state.exec(
[num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
state.exec([num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
});
}
// Define a type_list to use for the type axis:
using cts_types = nvbench::type_list<nvbench::uint8_t,
@@ -129,11 +126,10 @@ NVBENCH_BENCH_TYPES(copy_type_sweep, NVBENCH_TYPE_AXES(cts_types));
// Convert 64 MiB of InputTypes to OutputTypes, represented with various
// value_types.
template <typename InputType, typename OutputType>
void copy_type_conversion_sweep(nvbench::state &state,
nvbench::type_list<InputType, OutputType>)
void copy_type_conversion_sweep(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// Optional: Skip narrowing conversions.
if (sizeof(InputType) > sizeof(OutputType))
if constexpr (sizeof(InputType) > sizeof(OutputType))
{
state.skip("Narrowing conversion: sizeof(InputType) > sizeof(OutputType).");
return;
@@ -152,14 +148,12 @@ void copy_type_conversion_sweep(nvbench::state &state,
thrust::device_vector<InputType> in(num_values, 0);
thrust::device_vector<OutputType> out(num_values, 0);
state.exec(
[num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr,
out_ptr,
num_values);
});
state.exec([num_values,
in_ptr = thrust::raw_pointer_cast(in.data()),
out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
});
}
// Optional: Skip when InputType == OutputType. This approach avoids
// instantiating the benchmark at all.
@@ -175,6 +169,5 @@ using ctcs_types = nvbench::type_list<nvbench::int8_t,
nvbench::float32_t,
nvbench::int64_t,
nvbench::float64_t>;
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep,
NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
NVBENCH_BENCH_TYPES(copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
.set_type_axes_names({"In", "Out"});

83
examples/cpu_only.cu Normal file
View File

@@ -0,0 +1,83 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
#include <chrono>
#include <thread>
// Block execution of the current CPU thread for `seconds` seconds.
void sleep_host(double seconds)
{
std::this_thread::sleep_for(
std::chrono::milliseconds(static_cast<nvbench::int64_t>(seconds * 1000)));
}
//=============================================================================
// Minimal CPU-only benchmark: each iteration just sleeps on the host for the
// duration selected by the "Duration" axis.
void simple(nvbench::state &state)
{
  const double duration = state.get_float64("Duration");
  auto do_sleep = [duration](nvbench::launch &) { sleep_host(duration); };
  state.exec(do_sleep);
}
NVBENCH_BENCH(simple)
// 100 -> 500 ms in 100 ms increments.
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
// Mark as CPU-only.
.set_is_cpu_only(true);
//=============================================================================
// CPU-only benchmark that restricts timing to an explicit region using the
// `nvbench::exec_tag::timer` execution tag.
void simple_timer(nvbench::state &state)
{
  const double duration = state.get_float64("Duration");
  auto timed_run = [duration](nvbench::launch &, auto &timer) {
    // Per-run setup goes here, outside of the measured region...
    timer.start();
    // Only this region is measured:
    sleep_host(duration);
    timer.stop();
    // Per-run cleanup goes here, also unmeasured...
  };
  state.exec(nvbench::exec_tag::timer, timed_run);
}
NVBENCH_BENCH(simple_timer)
// 100 -> 500 ms in 100 ms increments.
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
// Mark as CPU-only.
.set_is_cpu_only(true);
//=============================================================================
// CPU-only benchmark using the optional `nvbench::exec_tag::no_gpu` hint,
// which keeps GPU measurement code from being instantiated at all. Note that
// `set_is_cpu_only(true)` is still required even with this hint.
void simple_no_gpu(nvbench::state &state)
{
  const double duration = state.get_float64("Duration");
  auto do_sleep = [duration](nvbench::launch &) { sleep_host(duration); };
  state.exec(nvbench::exec_tag::no_gpu, do_sleep);
}
NVBENCH_BENCH(simple_no_gpu)
// 100 -> 500 ms in 100 ms increments.
.add_float64_axis("Duration", nvbench::range(.1, .5, .1))
// Mark as CPU-only.
.set_is_cpu_only(true);

View File

@@ -0,0 +1,77 @@
/*
* Copyright 2023 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// Thrust vectors simplify memory management:
#include <thrust/device_vector.h>
// Inherit from the stopping_criterion_base class:
class fixed_criterion final : public nvbench::stopping_criterion_base
{
nvbench::int64_t m_num_samples{};
public:
fixed_criterion()
: nvbench::stopping_criterion_base{"fixed", {{"max-samples", nvbench::int64_t{42}}}}
{}
protected:
// Setup the criterion in the `do_initialize()` method:
virtual void do_initialize() override { m_num_samples = 0; }
// Process new measurements in the `add_measurement()` method:
virtual void do_add_measurement(nvbench::float64_t /* measurement */) override
{
m_num_samples++;
}
// Check if the stopping criterion is met in the `is_finished()` method:
virtual bool do_is_finished() override
{
return m_num_samples >= m_params.get_int64("max-samples");
}
};
// Register the criterion with NVBench:
NVBENCH_REGISTER_CRITERION(fixed_criterion);
// Copies 64 MiB of int32_t data and reports element/byte throughput, stopping
// via the custom "fixed" criterion registered above.
void throughput_bench(nvbench::state &state)
{
  // 64 MiB worth of int32 values for both input and output:
  const std::size_t num_values = 64 * 1024 * 1024 / sizeof(nvbench::int32_t);
  thrust::device_vector<nvbench::int32_t> input(num_values);
  thrust::device_vector<nvbench::int32_t> output(num_values);

  // Throughput info for the summary tables:
  state.add_element_count(num_values, "NumElements");
  state.add_global_memory_reads<nvbench::int32_t>(num_values, "DataSize");
  state.add_global_memory_writes<nvbench::int32_t>(num_values);

  // The vectors outlive state.exec, so the raw pointers stay valid:
  auto *in_ptr  = thrust::raw_pointer_cast(input.data());
  auto *out_ptr = thrust::raw_pointer_cast(output.data());
  state.exec(nvbench::exec_tag::no_batch, [in_ptr, out_ptr, num_values](nvbench::launch &launch) {
    (void)num_values; // clang thinks this is unused...
    nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(in_ptr, out_ptr, num_values);
  });
}
NVBENCH_BENCH(throughput_bench).set_stopping_criterion("fixed");

View File

@@ -89,7 +89,7 @@ NVBENCH_BENCH(copy_sweep_grid_shape)
//
struct under_diag final : nvbench::user_axis_space
{
under_diag(std::vector<std::size_t> input_indices)
explicit under_diag(std::vector<std::size_t> input_indices)
: nvbench::user_axis_space(std::move(input_indices))
{}
@@ -162,7 +162,7 @@ NVBENCH_BENCH(copy_sweep_grid_shape)
struct gauss final : nvbench::user_axis_space
{
gauss(std::vector<std::size_t> input_indices)
explicit gauss(std::vector<std::size_t> input_indices)
: nvbench::user_axis_space(std::move(input_indices))
{}

View File

@@ -17,7 +17,6 @@
*/
#include <nvbench/nvbench.cuh>
#include <nvbench/test_kernels.cuh>
// Enum to use as parameter axis:
@@ -68,12 +67,10 @@ void runtime_enum_sweep_string(nvbench::state &state)
// Create inputs, etc, configure runtime kernel parameters, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
NVBENCH_BENCH(runtime_enum_sweep_string)
.add_string_axis("MyEnum", {"A", "B", "C"});
NVBENCH_BENCH(runtime_enum_sweep_string).add_string_axis("MyEnum", {"A", "B", "C"});
//==============================================================================
// Sweep through enum values at runtime using an int64 axis.
@@ -91,15 +88,14 @@ NVBENCH_BENCH(runtime_enum_sweep_string)
// ```
void runtime_enum_sweep_int64(nvbench::state &state)
{
const auto enum_value = static_cast<MyEnum>(state.get_int64("MyEnum"));
[[maybe_unused]] const auto enum_value = static_cast<MyEnum>(state.get_int64("MyEnum"));
// Do stuff with enum_value.
// Create inputs, etc, configure runtime kernel parameters, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
NVBENCH_BENCH(runtime_enum_sweep_int64)
.add_int64_axis("MyEnum",
@@ -178,12 +174,10 @@ void compile_time_enum_sweep(nvbench::state &state,
// Template parameters, static dispatch, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
using MyEnumList =
nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
using MyEnumList = nvbench::enum_type_list<MyEnum::ValueA, MyEnum::ValueB, MyEnum::ValueC>;
NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
.set_type_axes_names({"MyEnum"});
@@ -199,16 +193,14 @@ NVBENCH_BENCH_TYPES(compile_time_enum_sweep, NVBENCH_TYPE_AXES(MyEnumList))
// * `-12` (struct std::integral_constant<int,-12>)
// ```
template <nvbench::int32_t IntValue>
void compile_time_int_sweep(nvbench::state &state,
nvbench::type_list<nvbench::enum_type<IntValue>>)
void compile_time_int_sweep(nvbench::state &state, nvbench::type_list<nvbench::enum_type<IntValue>>)
{
// Use IntValue in compile time contexts.
// Template parameters, static dispatch, etc.
// Just a dummy kernel.
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
using MyInts = nvbench::enum_type_list<0, 16, 4096, -12>;
NVBENCH_BENCH_TYPES(compile_time_int_sweep, NVBENCH_TYPE_AXES(MyInts))

View File

@@ -27,6 +27,9 @@
// Used to initialize input data:
#include <thrust/sequence.h>
// Used to run the benchmark on a CUDA stream
#include <thrust/execution_policy.h>
// `sequence_bench` measures the execution time of `thrust::sequence`. Since
// algorithms in `thrust::` implicitly sync the CUDA device, the
// `nvbench::exec_tag::sync` must be passed to `state.exec(...)`.
@@ -50,9 +53,7 @@ void sequence_bench(nvbench::state &state)
// nvbench::exec_tag::sync indicates that this will implicitly sync:
state.exec(nvbench::exec_tag::sync, [&data](nvbench::launch &launch) {
thrust::sequence(thrust::device.on(launch.get_stream()),
data.begin(),
data.end());
thrust::sequence(thrust::device.on(launch.get_stream()), data.begin(), data.end());
});
}
NVBENCH_BENCH(sequence_bench);

View File

@@ -24,6 +24,7 @@
// Thrust simplifies memory management, etc:
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>
// mod2_inplace performs an in-place mod2 over every element in `data`. `data`
@@ -53,6 +54,8 @@ void mod2_inplace(nvbench::state &state)
state.exec(nvbench::exec_tag::timer,
// Lambda now takes a `timer` argument:
[&input, &data, num_values](nvbench::launch &launch, auto &timer) {
(void)num_values; // clang thinks this is unused...
// Reset working data:
thrust::copy(thrust::device.on(launch.get_stream()),
input.cbegin(),

View File

@@ -72,14 +72,12 @@ NVBENCH_BENCH(runtime_skip)
// Two type axes are swept, but configurations where InputType == OutputType are
// skipped.
template <typename InputType, typename OutputType>
void skip_overload(nvbench::state &state,
nvbench::type_list<InputType, OutputType>)
void skip_overload(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// This is a contrived example that focuses on the skip overloads, so this is
// just a sleep kernel:
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
// Overload of skip_overload that is called when InputType == OutputType.
template <typename T>
@@ -107,9 +105,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
{
// This is a contrived example that focuses on the skip overloads, so this is
// just a sleep kernel:
state.exec([](nvbench::launch &launch) {
nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3);
});
state.exec(
[](nvbench::launch &launch) { nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(1e-3); });
}
// Enable this overload if InputType is larger than OutputType
template <typename InputType, typename OutputType>
@@ -119,10 +116,8 @@ skip_sfinae(nvbench::state &state, nvbench::type_list<InputType, OutputType>)
state.skip("sizeof(InputType) > sizeof(OutputType).");
}
// The same type_list is used for both inputs/outputs.
using sn_types = nvbench::type_list<nvbench::int8_t,
nvbench::int16_t,
nvbench::int32_t,
nvbench::int64_t>;
using sn_types =
nvbench::type_list<nvbench::int8_t, nvbench::int16_t, nvbench::int32_t, nvbench::int64_t>;
// Setup benchmark:
NVBENCH_BENCH_TYPES(skip_sfinae, NVBENCH_TYPE_AXES(sn_types, sn_types))
.set_type_axes_names({"In", "Out"});

View File

@@ -52,6 +52,7 @@ void stream_bench(nvbench::state &state)
state.set_cuda_stream(nvbench::make_cuda_stream_view(default_stream));
state.exec([&input, &output, num_values](nvbench::launch &) {
(void)num_values; // clang thinks this is unused...
copy(thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),
num_values);

73
examples/summaries.cu Normal file
View File

@@ -0,0 +1,73 @@
/*
* Copyright 2025 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/nvbench.cuh>
// Grab some testing kernels from NVBench:
#include <nvbench/test_kernels.cuh>
// #define PRINT_DEFAULT_SUMMARY_TAGS
void summary_example(nvbench::state &state)
{
  // Derive the sleep duration (in seconds) from the two integer axes:
  const auto ms       = static_cast<nvbench::float64_t>(state.get_int64("ms"));
  const auto us       = static_cast<nvbench::float64_t>(state.get_int64("us"));
  const auto duration = ms * 1e-3 + us * 1e-6;

  // Add a custom column to the summary table reporting the derived duration.
  // See the documentation in nvbench/summary.cuh for more details.
  {
    nvbench::summary &summary = state.add_summary("duration");
    summary.set_string("name", "Duration (s)");
    summary.set_string("description", "The duration of the kernel execution.");
    summary.set_string("hint", "duration");
    summary.set_float64("value", duration);
  }

  // Run the measurements:
  state.exec(nvbench::exec_tag::no_batch, [duration](nvbench::launch &launch) {
    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration);
  });

#ifdef PRINT_DEFAULT_SUMMARY_TAGS
  // The default summary tags can be found by inspecting the state after
  // calling state.exec, or by looking at the json output (--json <filename>).
  for (const auto &summary : state.get_summaries())
  {
    std::cout << summary.get_tag() << std::endl;
  }
#endif

  // Default summary columns are shown/hidden in the markdown output tables via
  // the "hide" key: removing the key shows a column, setting it hides one.
  // Show min/max GPU times plus SM clock frequency and throttling info:
  const char *shown[] = {"nv/cold/time/gpu/min",
                         "nv/cold/time/gpu/max",
                         "nv/cold/sm_clock_rate/mean",
                         "nv/cold/sm_clock_rate/scaling/percent"};
  for (const char *tag : shown)
  {
    state.get_summary(tag).remove_value("hide");
  }
  // Hide the mean GPU time and all CPU times:
  const char *hidden[] = {"nv/cold/time/gpu/mean",
                          "nv/cold/time/cpu/mean",
                          "nv/cold/time/cpu/min",
                          "nv/cold/time/cpu/max",
                          "nv/cold/time/cpu/stdev/relative"};
  for (const char *tag : hidden)
  {
    state.get_summary(tag).set_string("hide", "");
  }
}
NVBENCH_BENCH(summary_example)
.add_int64_axis("ms", nvbench::range(10, 50, 20))
.add_int64_axis("us", nvbench::range(100, 500, 200));

View File

@@ -51,6 +51,7 @@ void throughput_bench(nvbench::state &state)
state.add_global_memory_writes<nvbench::int32_t>(num_values);
state.exec([&input, &output, num_values](nvbench::launch &launch) {
(void)num_values; // clang thinks this is unused...
nvbench::copy_kernel<<<256, 256, 0, launch.get_stream()>>>(
thrust::raw_pointer_cast(input.data()),
thrust::raw_pointer_cast(output.data()),

View File

@@ -6,7 +6,6 @@ set_target_properties(nvbench.ctl PROPERTIES
EXPORT_NAME ctl
)
add_dependencies(nvbench.all nvbench.ctl)
nvbench_setup_dep_dlls(nvbench.ctl)
nvbench_install_executables(nvbench.ctl)
if (NVBench_ENABLE_TESTING)

View File

@@ -1,20 +1,20 @@
/*
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright 2021 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 with the LLVM exception
* (the "License"); you may not use this file except in compliance with
* the License.
*
* You may obtain a copy of the License at
*
* http://llvm.org/foundation/relicensing/LICENSE.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <nvbench/main.cuh>
@@ -24,7 +24,7 @@ int main(int argc, char const *const *argv)
try
{
// If no args, substitute a new argv that prints the version
std::vector<const char*> alt_argv;
std::vector<const char *> alt_argv;
if (argc == 1)
{
alt_argv.push_back("--version");
@@ -36,7 +36,7 @@ try
NVBENCH_CUDA_CALL(cudaDeviceReset());
return 0;
}
catch (std::exception & e)
catch (std::exception &e)
{
std::cerr << "\nNVBench encountered an error:\n\n" << e.what() << "\n";
return 1;

View File

@@ -5,6 +5,7 @@ set(srcs
benchmark_base.cxx
benchmark_manager.cxx
blocking_kernel.cu
criterion_manager.cxx
csv_printer.cu
cuda_call.cu
device_info.cu
@@ -19,25 +20,29 @@ set(srcs
printer_multiplex.cxx
runner.cxx
state.cxx
stopping_criterion.cxx
string_axis.cxx
type_axis.cxx
type_strings.cxx
user_axis_space.cxx
zip_axis_space.cxx
detail/entropy_criterion.cxx
detail/measure_cold.cu
detail/measure_cpu_only.cxx
detail/measure_hot.cu
detail/state_generator.cxx
detail/stdrel_criterion.cxx
detail/gpu_frequency.cxx
detail/timestamps_kernel.cu
internal/nvml.cxx
)
if (NVBench_ENABLE_CUPTI)
list(APPEND srcs detail/measure_cupti.cu cupti_profiler.cxx)
endif()
if (NVBench_ENABLE_NVML)
list(APPEND srcs internal/nvml.cxx)
endif()
# CUDA 11.0 can't compile json_printer without crashing
# So for that version fall back to C++ with degraded
# output ( no PTX version info )
@@ -69,7 +74,7 @@ nvbench_write_config_header(config.cuh.in
)
# nvbench (nvbench::nvbench)
add_library(nvbench SHARED ${srcs})
add_library(nvbench ${srcs})
nvbench_config_target(nvbench)
target_include_directories(nvbench PUBLIC
"$<BUILD_INTERFACE:${NVBench_SOURCE_DIR}>"
@@ -82,8 +87,29 @@ target_link_libraries(nvbench
PRIVATE
fmt::fmt
nvbench_json
nvbench_git_revision
)
# ##################################################################################################
# * conda environment -----------------------------------------------------------------------------
rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH)
if(TARGET conda_env)
# When we are inside a conda env the linker will be set to
# `ld.bfd` which will try to resolve all undefined symbols at link time.
#
# Since we could be using a shared library version of fmt we need
# it on the final link line of consumers
target_link_libraries(nvbench PRIVATE $<BUILD_INTERFACE:conda_env>)
# When we are inside a conda env the linker will be set to
# `ld.bfd` which will try to resolve all undefined symbols at link time.
#
# Since we could be using a shared library version of fmt we need
# it on the final link line of consumers
if(fmt_is_external)
target_link_libraries(nvbench PUBLIC fmt::fmt)
endif()
endif()
target_compile_features(nvbench PUBLIC cuda_std_17 PRIVATE cxx_std_17)
add_dependencies(nvbench.all nvbench)
@@ -98,7 +124,6 @@ add_dependencies(nvbench.all nvbench.main)
add_library(nvbench::nvbench ALIAS nvbench)
add_library(nvbench::main ALIAS nvbench.main)
nvbench_setup_dep_dlls(nvbench)
nvbench_install_libraries(nvbench nvbench.main nvbench.build_interface)
# nvcc emits several unavoidable warnings while compiling nlohmann_json:
@@ -111,3 +136,19 @@ if (json_is_cu)
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcudafe=--diag_suppress=940>
)
endif()
# The call to `rapids_cmake_write_git_revision_file` must be in the same
# CMakeLists.txt as the consumer ( nvbench ) for CMake to get the dependency
# graph correct.
rapids_cmake_write_git_revision_file(
nvbench_git_revision
"${NVBench_BINARY_DIR}/nvbench/detail/git_revision.cuh"
PREFIX "NVBENCH"
)
target_link_libraries(nvbench PRIVATE nvbench_git_revision)
if(NOT BUILD_SHARED_LIBS)
# Need to ensure that for static builds we export the nvbench_git_revision
# target
nvbench_install_libraries(nvbench_git_revision)
endif()

Some files were not shown because too many files have changed in this diff Show More