Files
mscclpp/docker/build.sh
Binyang Li 7f3b088744 Add multi-nodes example & update doc (#455)
Documentation update:

* [`docs/design/mscclpp-dsl.md`](diffhunk://#diff-02a69290fb3e02b8a069bf915fbf5266cfc2ac51c6e9ff8b5b19df51ed909b22L114-R114):
  Updated the link to the examples folder to reflect the correct path.

New example script:

* [`python/examples/allgather_allpairs_multinodes_packets.py`](diffhunk://#diff-ab42c16ecca0680d55b60b82a6913138c5fba4069b9c4493fbe8c72217fe54bcR1-R76):
  Added a new example script demonstrating the allgather all-pairs
  algorithm across multiple nodes using packet communication.

IR module improvements:

* [`python/mscclpp/language/ir.py`](diffhunk://#diff-b025796b03fbbd9b2ca9aee2569547efa7a56101743bc4aa05661be0b52aeec9L470-R472):
  Refined the sorting criteria for GPU instance channels and thread block
  channels to include the channel type, ensuring a more accurate order.

Debugging enhancements:

* [`src/executor/executor.cc`](diffhunk://#diff-60f7806d111e5cc12ded06358b5d5b09b8521e3858f182d8be81ac05147c535dR439-R441):
  Added a debug log to indicate the start of communication collective
  execution with details about the execution plan and collective.
* [`src/include/debug.h`](diffhunk://#diff-24e5fda55e3712277be4bb99b3c348294a77ebd3046bfe716b74bdb32cd203dfR89):
  Introduced a new debug log subsystem identifier `MSCCLPP_EXECUTOR` for
  logging executor-related information.
2025-01-31 17:52:15 -08:00

76 lines
2.2 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Builds the base and base-dev Docker images for a single target platform.
# The target name is taken from the first command-line argument and must be
# one of the keys of baseImageTable below.

set -e

# Target name -> upstream image passed to the build as BASE_IMAGE.
declare -A baseImageTable=(
  [cuda11.8]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
  [cuda12.1]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
  [cuda12.2]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
  [cuda12.3]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
  [cuda12.4]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
  [rocm6.2]="rocm/rocm-terminal:6.2.1"
)

# Target name -> value passed to the build as EXTRA_LD_PATH. Targets without
# an entry here end up passing an empty value.
declare -A extraLdPathTable=(
  [cuda12.1]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
  [cuda12.2]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
  [cuda12.3]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
  [rocm6.2]="/opt/rocm/lib"
)

# Target name -> OFED version override. Targets not listed here fall back to
# the default OFED version chosen later in the script.
declare -A ofedVersionTable=(
  [cuda12.4]="23.07-0.5.1.2"
)

# Repository prefix shared by every image tag this script produces.
GHCR="ghcr.io/microsoft/mscclpp/mscclpp"

# Requested target (first positional argument; empty when none was given).
TARGET="$1"
# Writes the one-line usage message, listing the accepted target names,
# to stdout. Takes no arguments.
print_usage() {
  printf 'Usage: %s [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|rocm6.2]\n' "$0"
}
# Reject a missing or unknown target before doing any work.
# The emptiness check comes first because an empty associative-array subscript
# is itself an error. The ${...+set} form replaces `[[ -v "array[key]" ]]`,
# which can re-evaluate the user-supplied key as shell code on older bash.
if [[ -z "${TARGET}" || -z "${baseImageTable[${TARGET}]+set}" ]]; then
  echo "Invalid target: ${TARGET}" >&2
  print_usage >&2
  exit 1
fi
echo "Target: ${TARGET}"

# Build from the parent of this script's directory so the docker/ dockerfile
# paths and the "." build context do not depend on the caller's working
# directory. Quoted so that paths containing spaces work.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
cd "${SCRIPT_DIR}/.."

# Use the per-target OFED version when one is listed, otherwise the default.
DEFAULT_OFED_VERSION="5.2-2.2.3.0"
OFED_VERSION="${ofedVersionTable[${TARGET}]:-${DEFAULT_OFED_VERSION}}"

# Step 1: common base image. EXTRA_LD_PATH is passed as an empty value for
# targets that have no entry in extraLdPathTable.
docker build -t "${GHCR}-common:base-${TARGET}" \
  -f docker/base-x.dockerfile \
  --build-arg "BASE_IMAGE=${baseImageTable[${TARGET}]}" \
  --build-arg "EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]:-}" \
  --build-arg "TARGET=${TARGET}" \
  --build-arg "OFED_VERSION=${OFED_VERSION}" .

# Step 2: platform base image. ROCm targets get an extra build layered on the
# common image; every other target simply re-tags the common image.
if [[ "${TARGET}" == rocm* ]]; then
  echo "Building ROCm base image..."
  docker build -t "${GHCR}:base-${TARGET}" \
    -f docker/base-x-rocm.dockerfile \
    --build-arg "BASE_IMAGE=${GHCR}-common:base-${TARGET}" \
    --build-arg "EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]:-}" \
    --build-arg "TARGET=${TARGET}" \
    --build-arg "ARCH=gfx942" .
else
  echo "Building CUDA base image..."
  docker tag "${GHCR}-common:base-${TARGET}" "${GHCR}:base-${TARGET}"
fi

# Step 3: development image built on top of the platform base image.
docker build -t "${GHCR}:base-dev-${TARGET}" \
  -f docker/base-dev-x.dockerfile \
  --build-arg "BASE_IMAGE=${GHCR}:base-${TARGET}" \
  --build-arg "TARGET=${TARGET}" .