Add CUDA 13.0 Docker images (#720)

* Updated Dockerfiles and the build script to support CUDA 13.0
* Added a Python 3 venv, which is required since Python 3.12
* Updated the default MLNX-OFED version to the LTS version (24.10-3.2.5.0)
* Added `docker push` instructions for the multi-arch manifest
This commit is contained in:
Changho Hwang
2026-01-09 03:03:33 -08:00
committed by GitHub
parent eab2afb8b9
commit b8a1b0a134
5 changed files with 83 additions and 28 deletions

View File

@@ -1,4 +1,4 @@
ARG BASE_IMAGE
ARG BASE_IMAGE=base-cuda13.0-x86_64
FROM ${BASE_IMAGE}
LABEL maintainer="MSCCL++"
@@ -15,22 +15,27 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/* /tmp/*
# Install CMake 3.26.4
RUN ARCH=$(uname -m) && \
RUN OS_ARCH=$(uname -m) && \
CMAKE_VERSION="3.26.4" && \
CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-${ARCH}" && \
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${ARCH}.tar.gz" && \
CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}" && \
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}.tar.gz" && \
curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
rm -rf ${CMAKE_HOME}.tar.gz && \
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${ARCH}/bin/* /usr/bin/
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
# Create Python venv
RUN python3 -m venv /root/venv && \
echo 'source /root/venv/bin/activate' >> /root/.bashrc
ENV PATH="/root/venv/bin:${PATH}"
# Install Python dependencies
ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda12.1"
ARG TARGET="cuda13.0"
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r python/requirements_${target_type}.txt
# Cleanup
RUN rm -rf /tmp/mscclpp

View File

@@ -1,4 +1,4 @@
ARG BASE_IMAGE
ARG BASE_IMAGE=tmp-rocm6.2-x86_64
FROM ${BASE_IMAGE}
LABEL maintainer="MSCCL++"
@@ -7,8 +7,8 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
ENV DEBIAN_FRONTEND=noninteractive
ENV RCCL_VERSION=rocm-6.2.0
ARG ARCH=gfx942
ENV ARCH_TARGET=${ARCH}
ARG GPU_ARCH=gfx942
ENV ARCH_TARGET=${GPU_ARCH}
RUN cd /tmp && \
git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
cd rccl && \

View File

@@ -1,4 +1,4 @@
ARG BASE_IMAGE
ARG BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04
FROM ${BASE_IMAGE}
LABEL maintainer="MSCCL++"
@@ -24,18 +24,19 @@ RUN apt-get update && \
python3-pip \
python3-setuptools \
python3-wheel \
python3-venv \
sudo \
wget
# Install OFED
ARG OFED_VERSION=5.2-2.2.3.0
ARG OFED_VERSION=24.10-3.2.5.0
RUN cd /tmp && \
ARCH=$(uname -m) && \
OS_ARCH=$(uname -m) && \
OS_VERSION=$(lsb_release -rs) && \
OS_VERSION=ubuntu${OS_VERSION} && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${OS_ARCH}.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${OS_ARCH}.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${OS_ARCH}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
# Install OpenMPI (should be done after the OFED installation) & clean apt cache

View File

@@ -11,6 +11,7 @@ baseImageTable=(
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
)
@@ -27,13 +28,14 @@ ofedVersionTable=(
["cuda12.4"]="23.07-0.5.1.2"
["cuda12.8"]="24.10-1.1.4.0"
["cuda12.9"]="24.10-1.1.4.0"
["cuda13.0"]="24.10-3.2.5.0"
)
GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
TARGET=${1}
OS_ARCH=$(uname -m)
print_usage() {
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|rocm6.2]"
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
}
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
@@ -53,7 +55,11 @@ if [[ -z ${OFED_VERSION} ]]; then
OFED_VERSION=${DEFAULT_OFED_VERSION}
fi
docker build -t ${GHCR}-common:base-${TARGET} \
TAG_TMP="tmp-${TARGET}-${OS_ARCH}"
TAG_BASE="base-${TARGET}-${OS_ARCH}"
TAG_BASE_DEV="base-dev-${TARGET}-${OS_ARCH}"
docker build -t ${TAG_TMP} \
-f docker/base-x.dockerfile \
--build-arg BASE_IMAGE=${baseImageTable[${TARGET}]} \
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
@@ -62,20 +68,54 @@ docker build -t ${GHCR}-common:base-${TARGET} \
if [[ ${TARGET} == rocm* ]]; then
echo "Building ROCm base image..."
docker build -t ${GHCR}:base-${TARGET} \
docker build -t ${TAG_BASE} \
-f docker/base-x-rocm.dockerfile \
--build-arg BASE_IMAGE=${GHCR}-common:base-${TARGET} \
--build-arg BASE_IMAGE=${TAG_TMP} \
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
--build-arg TARGET=${TARGET} \
--build-arg ARCH="gfx942" .
docker rmi ${GHCR}-common:base-${TARGET}
--build-arg GPU_ARCH="gfx942" .
docker rmi ${TAG_TMP}
else
echo "Building CUDA base image..."
docker tag ${GHCR}-common:base-${TARGET} ${GHCR}:base-${TARGET}
docker rmi --no-prune ${GHCR}-common:base-${TARGET}
docker tag ${TAG_TMP} ${TAG_BASE}
docker rmi --no-prune ${TAG_TMP}
fi
docker build -t ${GHCR}:base-dev-${TARGET} \
docker build -t ${TAG_BASE_DEV} \
-f docker/base-dev-x.dockerfile \
--build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \
--build-arg BASE_IMAGE=${TAG_BASE} \
--build-arg TARGET=${TARGET} .
GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
GHCR_TAG_BASE_DEV=${GHCR}:base-dev-${TARGET}
GHCR_TAG_BASE_DEV_ARCH=${GHCR}:base-dev-${TARGET}-${OS_ARCH}
echo "Successfully built images:"
echo " - ${TAG_BASE}"
echo " - ${TAG_BASE_DEV}"
echo ""
echo "To push the base-dev image to ghcr.io,"
echo ""
echo "0. Login to ghcr.io:"
echo ""
echo " docker login ghcr.io"
echo ""
echo "1. Tag and push the arch-specific image:"
echo ""
echo " docker tag ${TAG_BASE_DEV} ${GHCR_TAG_BASE_DEV_ARCH} && \\"
echo " docker push ${GHCR_TAG_BASE_DEV_ARCH}"
echo ""
echo "2. Create or update the multi-arch manifest:"
echo ""
echo " If \`${GHCR_TAG_BASE_DEV}\` already exists (adding another arch):"
echo ""
echo " docker buildx imagetools create \\"
echo " --tag ${GHCR_TAG_BASE_DEV} \\"
echo " --append ${GHCR_TAG_BASE_DEV_ARCH}"
echo ""
echo " If \`${GHCR_TAG_BASE_DEV}\` does not exist yet:"
echo ""
echo " docker buildx imagetools create \\"
echo " --tag ${GHCR_TAG_BASE_DEV} \\"
echo " ${GHCR_TAG_BASE_DEV_ARCH}"
echo ""

View File

@@ -0,0 +1,9 @@
mpi4py
cupy-cuda13x
prettytable
netifaces
pytest
numpy
matplotlib
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
blake3