mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 09:46:00 +00:00
Documentation update:
* [`docs/design/mscclpp-dsl.md`](diffhunk://#diff-02a69290fb3e02b8a069bf915fbf5266cfc2ac51c6e9ff8b5b19df51ed909b22L114-R114): Updated the link to the examples folder to reflect the correct path.

New example script:
* [`python/examples/allgather_allpairs_multinodes_packets.py`](diffhunk://#diff-ab42c16ecca0680d55b60b82a6913138c5fba4069b9c4493fbe8c72217fe54bcR1-R76): Added a new example script demonstrating the allgather all-pairs algorithm across multiple nodes using packet communication.

IR module improvements:
* [`python/mscclpp/language/ir.py`](diffhunk://#diff-b025796b03fbbd9b2ca9aee2569547efa7a56101743bc4aa05661be0b52aeec9L470-R472): Refined the sorting criteria for GPU instance channels and thread block channels to include the channel type, ensuring a more accurate order.

Debugging enhancements:
* [`src/executor/executor.cc`](diffhunk://#diff-60f7806d111e5cc12ded06358b5d5b09b8521e3858f182d8be81ac05147c535dR439-R441): Added a debug log to indicate the start of communication collective execution with details about the execution plan and collective.
* [`src/include/debug.h`](diffhunk://#diff-24e5fda55e3712277be4bb99b3c348294a77ebd3046bfe716b74bdb32cd203dfR89): Introduced a new debug log subsystem identifier `MSCCLPP_EXECUTOR` for logging executor-related information.
76 lines
2.2 KiB
Bash
Executable File
76 lines
2.2 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Builds the mscclpp base and base-dev Docker images for a single CUDA or ROCm
# target, which is given as the first command-line argument.

# Abort as soon as any command (for example one of the docker builds) fails.
set -e

# Maps every supported target name to the upstream image its build starts from.
# The keys double as the list of valid targets for argument validation.
declare -rA baseImageTable=(
  ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
  ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
  ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
  ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
  ["cuda12.4"]="nvidia/cuda:12.4.1-devel-ubuntu22.04"
  ["rocm6.2"]="rocm/rocm-terminal:6.2.1"
)

# Extra library search paths handed to the image build as EXTRA_LD_PATH.
# Targets without an entry here pass an empty value.
declare -rA extraLdPathTable=(
  ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
  ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
  ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
  ["rocm6.2"]="/opt/rocm/lib"
)

# Per-target OFED version overrides. Targets not listed here fall back to
# DEFAULT_OFED_VERSION when the build arguments are assembled.
declare -rA ofedVersionTable=(
  ["cuda12.4"]="23.07-0.5.1.2"
)

# Image name prefix shared by every tag this script produces.
readonly GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
# Requested build target (first positional argument). Defaults to empty when no
# argument is given, so the target validation can report it rather than the
# shell tripping on an unset parameter if nounset is ever enabled.
TARGET="${1:-}"

#######################################
# Prints the one-line usage message listing every accepted target.
# Arguments: none
# Outputs:   the usage line on stdout, prefixed with the script name ($0)
#######################################
print_usage() {
  printf 'Usage: %s [cuda11.8|cuda12.1|cuda12.2|cuda12.3|cuda12.4|rocm6.2]\n' "$0"
}

# Reject a missing or unknown target before any image is built.
# The emptiness check comes first because an empty associative-array subscript
# is itself an error in bash. The ${...+set} form tests for key presence
# without a second evaluation of the subscript, and does not depend on
# `[[ -v array[key] ]]` support (bash 4.3 or newer).
if [[ -z "${TARGET}" || -z "${baseImageTable[${TARGET}]+set}" ]]; then
  # Diagnostics go to stderr so they are not mistaken for regular output.
  echo "Invalid target: ${TARGET}" >&2
  print_usage >&2
  exit 1
fi
echo "Target: ${TARGET}"

# Resolve the absolute directory that contains this script, then continue from
# its parent directory: the relative docker/... dockerfile paths and the "."
# build context used by the docker commands are resolved against it.
# Everything is quoted so checkouts under paths containing spaces or glob
# characters still work.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
cd -- "${SCRIPT_DIR}/.."

# OFED version used when the target has no entry in ofedVersionTable.
DEFAULT_OFED_VERSION="5.2-2.2.3.0"
# Take the per-target override when one exists and is non-empty; otherwise use
# the default. The ":-" expansion covers both the missing and the empty case.
OFED_VERSION="${ofedVersionTable[${TARGET}]:-${DEFAULT_OFED_VERSION}}"

# Build the common base image for the target from docker/base-x.dockerfile and
# tag it as <GHCR>-common:base-<TARGET>.
# EXTRA_LD_PATH is intentionally allowed to be empty: not every target has an
# entry in extraLdPathTable.
docker build -t "${GHCR}-common:base-${TARGET}" \
  -f docker/base-x.dockerfile \
  --build-arg BASE_IMAGE="${baseImageTable[${TARGET}]}" \
  --build-arg EXTRA_LD_PATH="${extraLdPathTable[${TARGET}]:-}" \
  --build-arg TARGET="${TARGET}" \
  --build-arg OFED_VERSION="${OFED_VERSION}" .

# Produce the <GHCR>:base-<TARGET> image.
# ROCm targets get an additional layer built from docker/base-x-rocm.dockerfile
# on top of the common image, with ARCH fixed to gfx942. For every other target
# the common image is simply re-tagged.
if [[ "${TARGET}" == rocm* ]]; then
  echo "Building ROCm base image..."
  docker build -t "${GHCR}:base-${TARGET}" \
    -f docker/base-x-rocm.dockerfile \
    --build-arg BASE_IMAGE="${GHCR}-common:base-${TARGET}" \
    --build-arg EXTRA_LD_PATH="${extraLdPathTable[${TARGET}]:-}" \
    --build-arg TARGET="${TARGET}" \
    --build-arg ARCH="gfx942" .
else
  echo "Building CUDA base image..."
  docker tag "${GHCR}-common:base-${TARGET}" "${GHCR}:base-${TARGET}"
fi

# Build the development image from docker/base-dev-x.dockerfile on top of the
# <GHCR>:base-<TARGET> image and tag it as <GHCR>:base-dev-<TARGET>.
docker build -t "${GHCR}:base-dev-${TARGET}" \
  -f docker/base-dev-x.dockerfile \
  --build-arg BASE_IMAGE="${GHCR}:base-${TARGET}" \
  --build-arg TARGET="${TARGET}" .