diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index eef2ead7..6a3dbfe1 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,13 +1,61 @@
 ARG BASE_IMAGE
 FROM ${BASE_IMAGE}
-ARG USERNAME=mscclpp
+ARG USERNAME=devuser
 ARG USER_UID=1000
 ARG USER_GID=$USER_UID
+ARG SSH_PORT=22345
 
-# Create the user
-RUN groupadd --gid $USER_GID $USERNAME && \
-    useradd --uid $USER_UID --gid $USER_GID -m $USERNAME && \
-    echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME && \
-    chmod 0440 /etc/sudoers.d/$USERNAME
+# Create the user, or rename whichever group/user of the base image already
+# owns USER_GID/USER_UID, then grant it passwordless sudo.
+RUN if getent group "$USER_GID" > /dev/null; then \
+        EXISTING_GROUP=$(getent group "$USER_GID" | cut -d: -f1); \
+        if [ "$EXISTING_GROUP" != "$USERNAME" ]; then \
+            groupmod -n "$USERNAME" "$EXISTING_GROUP"; \
+        fi; \
+    else \
+        groupadd --gid "$USER_GID" "$USERNAME"; \
+    fi && \
+    if getent passwd "$USER_UID" > /dev/null; then \
+        EXISTING_USER=$(getent passwd "$USER_UID" | cut -d: -f1); \
+        if [ "$EXISTING_USER" != "$USERNAME" ]; then \
+            usermod -l "$USERNAME" -d "/home/$USERNAME" -m "$EXISTING_USER"; \
+        fi; \
+    else \
+        useradd --uid "$USER_UID" --gid "$USER_GID" -m "$USERNAME"; \
+    fi && \
+    usermod -g "$USERNAME" "$USERNAME" && \
+    echo "$USERNAME ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+
+# Development tools; the CUDA apt source lists are removed before the index is refreshed.
+RUN rm -rf /etc/apt/sources.list.d/cuda-* && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+        clang-format \
+        openssh-server \
+        gdb \
+        doxygen \
+        graphviz \
+        && \
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
+
+# Python packages for formatting, testing, and building the documentation.
+RUN python3 -m pip install --no-cache-dir \
+        black \
+        pytest \
+        breathe \
+        sphinx_rtd_theme \
+        myst_parser \
+        sphinxcontrib.mermaid
+
+# Move sshd to SSH_PORT, whether the "Port 22" directive is active or still
+# commented out, and give the user a key pair that is authorized for its own
+# account.
+RUN sed -i -E "s/^#?Port 22[[:space:]]*\$/Port ${SSH_PORT}/" /etc/ssh/sshd_config && \
+    mkdir -p "/home/$USERNAME/.ssh" && \
+    ssh-keygen -t rsa -f "/home/$USERNAME/.ssh/id_rsa" -N "" -q && \
+    cat "/home/$USERNAME/.ssh/id_rsa.pub" >> "/home/$USERNAME/.ssh/authorized_keys" && \
+    chown -R "$USERNAME:$USERNAME" "/home/$USERNAME/.ssh"
 
 USER $USERNAME
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index f295210c..51488fec 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -3,10 +3,17 @@
     "build": {
         "dockerfile": "Dockerfile",
         "args": {
-            "BASE_IMAGE": "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8"
+            "BASE_IMAGE": "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8",
+            "USERNAME": "devuser",
+            "SSH_PORT": "22345"
         }
     },
-    "remoteUser": "mscclpp",
+    "remoteUser": "devuser",
+    "containerEnv": {
+        "LC_ALL": "C",
+        "LANG": "C",
+        "LANGUAGE": "C"
+    },
     "customizations": {
         "vscode": {
             "extensions": [
@@ -15,18 +22,32 @@
                 "ms-python.vscode-pylance",
                 // C++
                 "ms-vscode.cpptools",
-                "ms-vscode.cpptools-extension-pack",
                 "ms-vscode.cmake-tools"
-            ]
+            ],
+            "settings": {
+                "terminal.integrated.defaultProfile.linux": "bash",
+                "C_Cpp.default.includePath": [
+                    "${workspaceFolder}/**",
+                    "/usr/local/cuda/include",
+                    "/usr/include"
+                ],
+                "C_Cpp.default.compilerPath": "/usr/bin/gcc",
+                "C_Cpp.default.cStandard": "c17",
+                "C_Cpp.default.cppStandard": "c++17",
+                "C_Cpp.default.intelliSenseMode": "linux-gcc-x64"
+            }
         }
     },
     "privileged": true,
     "runArgs": [
+        "--cap-add=SYS_PTRACE",
         "--net=host",
         "--ipc=host",
-        "--gpus=all",
-        "--ulimit=memlock=-1:-1"
+        "--ulimit=memlock=-1:-1",
+        "--gpus=all"
     ],
-    "workspaceFolder": "/home/mscclpp/mscclpp",
-    "workspaceMount": "source=${localWorkspaceFolder},target=/home/mscclpp/mscclpp,type=bind,consistency=cached"
+    "workspaceFolder": "/home/devuser/mscclpp",
+    "workspaceMount": "source=${localWorkspaceFolder},target=/home/devuser/mscclpp,type=bind,consistency=cached",
+    "postStartCommand": "sudo service ssh start",
+    "postCreateCommand": "bash /home/devuser/mscclpp/tools/install.sh nvidia /usr"
 }
diff --git a/.devcontainer/devcontainer_amd.json b/.devcontainer/devcontainer_amd.json
new file mode 100644
index 00000000..0647e63f
--- /dev/null
+++ b/.devcontainer/devcontainer_amd.json
@@ -0,0 +1,56 @@
+{
+    "name": "MSCCL++ Dev Container",
+    "build": {
+        "dockerfile": "Dockerfile",
+        "args": {
+            "BASE_IMAGE": "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2",
+            "USERNAME": "devuser",
+            "SSH_PORT": "22345"
+        }
+    },
+    "remoteUser": "devuser",
+    "containerEnv": {
+        "LC_ALL": "C",
+        "LANG": "C",
+        "LANGUAGE": "C"
+    },
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                // Python
+                "ms-python.python",
+                "ms-python.vscode-pylance",
+                // C++
+                "ms-vscode.cpptools",
+                "ms-vscode.cmake-tools"
+            ],
+            "settings": {
+                "terminal.integrated.defaultProfile.linux": "bash",
+                "C_Cpp.default.includePath": [
+                    "${workspaceFolder}/**",
+                    "/opt/rocm/include",
+                    "/usr/include"
+                ],
+                "C_Cpp.default.compilerPath": "/usr/bin/gcc",
+                "C_Cpp.default.cStandard": "c17",
+                "C_Cpp.default.cppStandard": "c++17",
+                "C_Cpp.default.intelliSenseMode": "linux-gcc-x64"
+            }
+        }
+    },
+    "privileged": true,
+    "runArgs": [
+        "--cap-add=SYS_PTRACE",
+        "--net=host",
+        "--ipc=host",
+        "--ulimit=memlock=-1:-1",
+        "--security-opt=seccomp=unconfined",
+        "--group-add=video",
+        "--device=/dev/kfd",
+        "--device=/dev/dri"
+    ],
+    "workspaceFolder": "/home/devuser/mscclpp",
+    "workspaceMount": "source=${localWorkspaceFolder},target=/home/devuser/mscclpp,type=bind,consistency=cached",
+    "postStartCommand": "sudo service ssh start",
+    "postCreateCommand": "bash /home/devuser/mscclpp/tools/install.sh amd /usr"
+}
diff --git a/tools/install.sh b/tools/install.sh
new file mode 100755
index 00000000..c191ae4d
--- /dev/null
+++ b/tools/install.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+
+# Configure, build, and install MSCCL++ from a throwaway build directory.
+# Run with no arguments to print the usage text.
+
+set -euo pipefail
+
+PROJECT_ROOT=$(dirname "$(realpath "$0")")/..
+INSTALL_DIR=/usr
+
+usage() {
+    echo "Usage: $0 {nvidia|amd} [install_dir]"
+    echo "  nvidia        Install for NVIDIA platforms"
+    echo "  amd           Install for AMD platforms"
+    echo "  install_dir   Directory to install to (default: /usr)"
+}
+
+# Parse arguments
+if [ $# -lt 1 ]; then
+    usage >&2
+    exit 1
+fi
+
+# Options shared by both platforms; the platform switch is appended below.
+CMAKE_ARGS=(
+    -DMSCCLPP_BYPASS_GPU_CHECK=ON
+    -DCMAKE_BUILD_TYPE=Release
+    -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF
+    -DMSCCLPP_BUILD_TESTS=OFF
+)
+case "$1" in
+    nvidia)
+        CMAKE_ARGS+=(-DMSCCLPP_USE_CUDA=ON)
+        ;;
+    amd)
+        export CXX=/opt/rocm/bin/hipcc
+        CMAKE_ARGS+=(-DMSCCLPP_USE_ROCM=ON)
+        ;;
+    *)
+        echo "Error: Unknown argument '$1'" >&2
+        usage >&2
+        exit 1
+        ;;
+esac
+if [ $# -ge 2 ]; then
+    INSTALL_DIR="$2"
+fi
+if [ ! -d "$INSTALL_DIR" ]; then
+    echo "Error: Install directory '$INSTALL_DIR' does not exist." >&2
+    exit 1
+fi
+# The build runs from another directory, so a relative prefix must be made
+# absolute while the caller's working directory is still current.
+INSTALL_DIR=$(realpath "$INSTALL_DIR")
+
+# Create the build directory only once the arguments are known to be valid,
+# and register its removal immediately so every later exit path cleans it up.
+TMP_BUILD_DIR=$(mktemp -d) || {
+    echo "Error: Failed to create temporary build directory." >&2
+    exit 1
+}
+trap 'rm -rf "$TMP_BUILD_DIR"' EXIT
+
+pushd "$TMP_BUILD_DIR" || exit 1
+
+cmake "${CMAKE_ARGS[@]}" \
+    -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" \
+    "$PROJECT_ROOT"
+
+make -j"$(nproc)"
+
+# Use 'make install' to ensure dependency checks are performed for a reliable installation.
+# Root needs no privilege escalation, so sudo is only invoked for other users.
+if [ "$(id -u)" -eq 0 ]; then
+    make install
+else
+    sudo make install
+fi
+
+popd || exit 1
+
+echo "Installation completed successfully."
diff --git a/tools/lint.sh b/tools/lint.sh
index eee150ea..97f93170 100755
--- a/tools/lint.sh
+++ b/tools/lint.sh
@@ -41,7 +41,7 @@ fi
 if $LINT_CPP; then
     echo "Linting C++ code..."
     # Find all git-tracked files with .c/.h/.cpp/.hpp/.cc/.cu/.cuh extensions
-    files=$(git ls-files --cached | grep -E '\.(c|h|cpp|hpp|cc|cu|cuh)$')
+    files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.(c|h|cpp|hpp|cc|cu|cuh)$' | sed "s|^|$PROJECT_ROOT/|")
     if [ -n "$files" ]; then
         if $DRY_RUN; then
             clang-format -style=file --dry-run $files
@@ -54,7 +54,7 @@ fi
 if $LINT_PYTHON; then
     echo "Linting Python code..."
     # Find all git-tracked files with .py extension
-    files=$(git ls-files --cached | grep -E '\.py$')
+    files=$(git -C "$PROJECT_ROOT" ls-files --cached | grep -E '\.py$' | sed "s|^|$PROJECT_ROOT/|")
     if [ -n "$files" ]; then
         if $DRY_RUN; then
             python3 -m black --check --diff $files