mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-19 22:39:11 +00:00
Add CUDA 13.0 Docker images (#720)
* Updated Dockerfiles and the build script to support CUDA 13.0
* Added a Python3 venv, which is required since Python 3.12
* Updated the default MLNX-OFED version to the LTS version
* Added docker push instructions for the multi-arch manifest
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
ARG BASE_IMAGE
|
||||
ARG BASE_IMAGE=base-cuda13.0-x86_64
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
LABEL maintainer="MSCCL++"
|
||||
@@ -15,22 +15,27 @@ RUN apt-get update && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/*
|
||||
|
||||
# Install CMake 3.26.4
|
||||
RUN ARCH=$(uname -m) && \
|
||||
RUN OS_ARCH=$(uname -m) && \
|
||||
CMAKE_VERSION="3.26.4" && \
|
||||
CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-${ARCH}" && \
|
||||
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${ARCH}.tar.gz" && \
|
||||
CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}" && \
|
||||
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}.tar.gz" && \
|
||||
curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
|
||||
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
|
||||
rm -rf ${CMAKE_HOME}.tar.gz && \
|
||||
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${ARCH}/bin/* /usr/bin/
|
||||
ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/
|
||||
|
||||
# Create Python venv
|
||||
RUN python3 -m venv /root/venv && \
|
||||
echo 'source /root/venv/bin/activate' >> /root/.bashrc
|
||||
ENV PATH="/root/venv/bin:${PATH}"
|
||||
|
||||
# Install Python dependencies
|
||||
ADD . /tmp/mscclpp
|
||||
WORKDIR /tmp/mscclpp
|
||||
ARG TARGET="cuda12.1"
|
||||
ARG TARGET="cuda13.0"
|
||||
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
|
||||
python3 -m pip install --no-cache-dir --upgrade pip && \
|
||||
python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt
|
||||
pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r python/requirements_${target_type}.txt
|
||||
|
||||
# Cleanup
|
||||
RUN rm -rf /tmp/mscclpp
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG BASE_IMAGE
|
||||
ARG BASE_IMAGE=tmp-rocm6.2-x86_64
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
LABEL maintainer="MSCCL++"
|
||||
@@ -7,8 +7,8 @@ LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
ENV RCCL_VERSION=rocm-6.2.0
|
||||
ARG ARCH=gfx942
|
||||
ENV ARCH_TARGET=${ARCH}
|
||||
ARG GPU_ARCH=gfx942
|
||||
ENV ARCH_TARGET=${GPU_ARCH}
|
||||
RUN cd /tmp && \
|
||||
git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \
|
||||
cd rccl && \
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG BASE_IMAGE
|
||||
ARG BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu24.04
|
||||
FROM ${BASE_IMAGE}
|
||||
|
||||
LABEL maintainer="MSCCL++"
|
||||
@@ -24,18 +24,19 @@ RUN apt-get update && \
|
||||
python3-pip \
|
||||
python3-setuptools \
|
||||
python3-wheel \
|
||||
python3-venv \
|
||||
sudo \
|
||||
wget
|
||||
|
||||
# Install OFED
|
||||
ARG OFED_VERSION=5.2-2.2.3.0
|
||||
ARG OFED_VERSION=24.10-3.2.5.0
|
||||
RUN cd /tmp && \
|
||||
ARCH=$(uname -m) && \
|
||||
OS_ARCH=$(uname -m) && \
|
||||
OS_VERSION=$(lsb_release -rs) && \
|
||||
OS_VERSION=ubuntu${OS_VERSION} && \
|
||||
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \
|
||||
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}.tgz && \
|
||||
MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${ARCH}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
|
||||
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${OS_ARCH}.tgz && \
|
||||
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${OS_ARCH}.tgz && \
|
||||
MLNX_OFED_LINUX-${OFED_VERSION}-${OS_VERSION}-${OS_ARCH}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
|
||||
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
|
||||
|
||||
# Install OpenMPI (should be done after the OFED installation) & clean apt cache
|
||||
|
||||
@@ -11,6 +11,7 @@ baseImageTable=(
|
||||
["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
|
||||
["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04"
|
||||
["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04"
|
||||
["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04"
|
||||
["rocm6.2"]="rocm/rocm-terminal:6.2.1"
|
||||
)
|
||||
|
||||
@@ -27,13 +28,14 @@ ofedVersionTable=(
|
||||
["cuda12.4"]="23.07-0.5.1.2"
|
||||
["cuda12.8"]="24.10-1.1.4.0"
|
||||
["cuda12.9"]="24.10-1.1.4.0"
|
||||
["cuda13.0"]="24.10-3.2.5.0"
|
||||
)
|
||||
|
||||
GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
|
||||
TARGET=${1}
|
||||
OS_ARCH=$(uname -m)
|
||||
|
||||
print_usage() {
|
||||
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|rocm6.2]"
|
||||
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|cuda12.8|cuda12.9|cuda13.0|rocm6.2]"
|
||||
}
|
||||
|
||||
if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
|
||||
@@ -53,7 +55,11 @@ if [[ -z ${OFED_VERSION} ]]; then
|
||||
OFED_VERSION=${DEFAULT_OFED_VERSION}
|
||||
fi
|
||||
|
||||
docker build -t ${GHCR}-common:base-${TARGET} \
|
||||
TAG_TMP="tmp-${TARGET}-${OS_ARCH}"
|
||||
TAG_BASE="base-${TARGET}-${OS_ARCH}"
|
||||
TAG_BASE_DEV="base-dev-${TARGET}-${OS_ARCH}"
|
||||
|
||||
docker build -t ${TAG_TMP} \
|
||||
-f docker/base-x.dockerfile \
|
||||
--build-arg BASE_IMAGE=${baseImageTable[${TARGET}]} \
|
||||
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
|
||||
@@ -62,20 +68,54 @@ docker build -t ${GHCR}-common:base-${TARGET} \
|
||||
|
||||
if [[ ${TARGET} == rocm* ]]; then
|
||||
echo "Building ROCm base image..."
|
||||
docker build -t ${GHCR}:base-${TARGET} \
|
||||
docker build -t ${TAG_BASE} \
|
||||
-f docker/base-x-rocm.dockerfile \
|
||||
--build-arg BASE_IMAGE=${GHCR}-common:base-${TARGET} \
|
||||
--build-arg BASE_IMAGE=${TAG_TMP} \
|
||||
--build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \
|
||||
--build-arg TARGET=${TARGET} \
|
||||
--build-arg ARCH="gfx942" .
|
||||
docker rmi ${GHCR}-common:base-${TARGET}
|
||||
--build-arg GPU_ARCH="gfx942" .
|
||||
docker rmi ${TAG_TMP}
|
||||
else
|
||||
echo "Building CUDA base image..."
|
||||
docker tag ${GHCR}-common:base-${TARGET} ${GHCR}:base-${TARGET}
|
||||
docker rmi --no-prune ${GHCR}-common:base-${TARGET}
|
||||
docker tag ${TAG_TMP} ${TAG_BASE}
|
||||
docker rmi --no-prune ${TAG_TMP}
|
||||
fi
|
||||
|
||||
docker build -t ${GHCR}:base-dev-${TARGET} \
|
||||
docker build -t ${TAG_BASE_DEV} \
|
||||
-f docker/base-dev-x.dockerfile \
|
||||
--build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \
|
||||
--build-arg BASE_IMAGE=${TAG_BASE} \
|
||||
--build-arg TARGET=${TARGET} .
|
||||
|
||||
GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
|
||||
GHCR_TAG_BASE_DEV=${GHCR}:base-dev-${TARGET}
|
||||
GHCR_TAG_BASE_DEV_ARCH=${GHCR}:base-dev-${TARGET}-${OS_ARCH}
|
||||
|
||||
echo "Successfully built images:"
|
||||
echo " - ${TAG_BASE}"
|
||||
echo " - ${TAG_BASE_DEV}"
|
||||
echo ""
|
||||
echo "To push the base-dev image to ghcr.io,"
|
||||
echo ""
|
||||
echo "0. Login to ghcr.io:"
|
||||
echo ""
|
||||
echo " docker login ghcr.io"
|
||||
echo ""
|
||||
echo "1. Tag and push the arch-specific image:"
|
||||
echo ""
|
||||
echo " docker tag ${TAG_BASE_DEV} ${GHCR_TAG_BASE_DEV_ARCH} && \\"
|
||||
echo " docker push ${GHCR_TAG_BASE_DEV_ARCH}"
|
||||
echo ""
|
||||
echo "2. Create or update the multi-arch manifest:"
|
||||
echo ""
|
||||
echo " If \`${GHCR_TAG_BASE_DEV}\` already exists (adding another arch):"
|
||||
echo ""
|
||||
echo " docker buildx imagetools create \\"
|
||||
echo " --tag ${GHCR_TAG_BASE_DEV} \\"
|
||||
echo " --append ${GHCR_TAG_BASE_DEV_ARCH}"
|
||||
echo ""
|
||||
echo " If \`${GHCR_TAG_BASE_DEV}\` does not exist yet:"
|
||||
echo ""
|
||||
echo " docker buildx imagetools create \\"
|
||||
echo " --tag ${GHCR_TAG_BASE_DEV} \\"
|
||||
echo " ${GHCR_TAG_BASE_DEV_ARCH}"
|
||||
echo ""
|
||||
|
||||
9
python/requirements_cuda13.txt
Normal file
9
python/requirements_cuda13.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
mpi4py
|
||||
cupy-cuda13x
|
||||
prettytable
|
||||
netifaces
|
||||
pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
blake3
|
||||
Reference in New Issue
Block a user