diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index c1458c3c..d888946b 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -37,7 +37,7 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) @@ -59,7 +59,7 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) @@ -79,7 +79,7 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) @@ -99,7 +99,7 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) @@ -121,7 +121,7 @@ jobs: strategy: matrix: cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 container: image: $(containerImage) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 575c472b..fb065141 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - name: Checkout repository diff --git a/.github/workflows/mscclpp-lang.yml b/.github/workflows/mscclpp-lang.yml index 5947b087..a9187e96 100644 --- a/.github/workflows/mscclpp-lang.yml +++ b/.github/workflows/mscclpp-lang.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - version: [ 'cuda11.8', 'cuda12.8' ] + version: [ 'cuda11.8', 'cuda12.9' ] steps: - uses: actions/checkout@v4 diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 3aa81422..7c6c927e 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -7,13 +7,38 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp RUN apt-get update && \ apt-get install -y --no-install-recommends \ htop \ - lcov \ vim \ && \ apt-get autoremove -y && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* +# Install lcov 2.2 +RUN LCOV_VERSION="2.2" && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + cpanminus \ + gcc \ + make \ + perl \ + && \ + cpanm --notest \ + Capture::Tiny \ + DateTime \ + JSON::XS \ + Memory::Process \ + TimeDate \ + && \ + cd /tmp && \ + curl -L https://github.com/linux-test-project/lcov/releases/download/v${LCOV_VERSION}/lcov-${LCOV_VERSION}.tar.gz -o lcov.tar.gz && \ + tar xzf lcov.tar.gz && \ + cd lcov-${LCOV_VERSION} && \ + make install && \ + cd / && rm -rf /tmp/lcov* && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + # Install CMake 3.26.4 RUN OS_ARCH=$(uname -m) && \ CMAKE_VERSION="3.26.4" && \ @@ -47,7 +72,8 @@ RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \ export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \ fi && \ pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r python/requirements_${target_type}.txt + pip install --no-cache-dir -r python/requirements_${target_type}.txt && \ + pip install --no-cache-dir coverage xlsxwriter # Cleanup RUN rm -rf /tmp/mscclpp diff --git a/docker/build.sh b/docker/build.sh index 63552f74..56d152bf 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -4,22 +4,21 @@ set -e declare -A baseImageTable baseImageTable=( - ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" - ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" - ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" - ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" + ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu22.04" ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04" ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" - ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04" + ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu24.04" ["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04" ["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2" ) declare -A extraLdPathTable extraLdPathTable=( - ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" - ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" - ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64" + ["cuda11.8"]="/usr/local/cuda-11.8/compat" + ["cuda12.4"]="/usr/local/cuda-12.4/compat" + ["cuda12.8"]="/usr/local/cuda-12.8/compat" + ["cuda12.9"]="/usr/local/cuda-12.9/compat" + ["cuda13.0"]="/usr/local/cuda-13.0/compat" ["rocm6.2"]="/opt/rocm/lib" ) @@ -36,7 +35,7 @@ TARGET=${1} OS_ARCH=$(uname -m) print_usage() { - echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" + echo "Usage: $0 [cuda11.8|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]" } if [[ ! -v "baseImageTable[${TARGET}]" ]]; then diff --git a/docs/quickstart.md b/docs/quickstart.md index ac1b7d6b..fd0b7571 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -42,7 +42,7 @@ We provide docker images which package all prerequisites for MSCCL++. You can se ```bash # For NVIDIA platforms -$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.8 bash +$ docker run -it --privileged --net=host --ipc=host --gpus all --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9 bash # For AMD platforms $ docker run -it --privileged --net=host --ipc=host --security-opt=seccomp=unconfined --group-add=video --name mscclpp-dev ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 bash ```