mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 08:50:21 +00:00
Merge branch 'main' into caiorocha/support_tbg_pipeline
This commit is contained in:
@@ -16,23 +16,24 @@ pr: none
|
||||
|
||||
|
||||
parameters:
|
||||
- name: vmssName
|
||||
type: string
|
||||
default: mscclpp-h100-multinode-ci
|
||||
- name: hostEntries
|
||||
type: string
|
||||
default: |
|
||||
10.0.0.10 mscclit-000000
|
||||
10.0.0.11 mscclit-000001
|
||||
10.0.0.5 mscclpp-h100-multinode-ci000000
|
||||
10.0.0.4 mscclpp-h100-multinode-ci000001
|
||||
|
||||
jobs:
|
||||
- job: MultiNodesTest
|
||||
displayName: Multi nodes test
|
||||
strategy:
|
||||
matrix:
|
||||
cuda11:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
|
||||
cuda12:
|
||||
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
|
||||
pool:
|
||||
name: mscclpp-it
|
||||
name: mscclpp-multi-node
|
||||
container:
|
||||
image: $[ variables['containerImage'] ]
|
||||
|
||||
@@ -42,25 +43,53 @@ jobs:
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
ENTRY="${{ parameters.hostEntries }}"
|
||||
if ! grep -qxF "$ENTRY" /etc/hosts; then
|
||||
echo "Adding to /etc/hosts"
|
||||
echo "$ENTRY" | sudo tee -a /etc/hosts
|
||||
else
|
||||
echo "Entry already exists, nothing to do."
|
||||
fi
|
||||
while IFS= read -r line; do
|
||||
[ -z "$line" ] && continue
|
||||
if ! grep -qxF "$line" /etc/hosts; then
|
||||
echo "Adding to /etc/hosts: $line"
|
||||
echo "$line" | sudo tee -a /etc/hosts
|
||||
else
|
||||
echo "Entry already exists: $line"
|
||||
fi
|
||||
done <<< "${{ parameters.hostEntries }}"
|
||||
|
||||
- task: Bash@3
|
||||
displayName: Generate deploy files
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
set -e
|
||||
VMSS="${{ parameters.vmssName }}"
|
||||
DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
|
||||
NODE0="${VMSS}000000"
|
||||
NODE1="${VMSS}000001"
|
||||
|
||||
echo "Host ${NODE0}
|
||||
Port 22345
|
||||
IdentityFile /root/mscclpp/sshkey
|
||||
StrictHostKeyChecking no
|
||||
Host ${NODE1}
|
||||
Port 22345
|
||||
IdentityFile /root/mscclpp/sshkey
|
||||
StrictHostKeyChecking no" > "${DEPLOY_DIR}/config"
|
||||
|
||||
printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"
|
||||
|
||||
printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
|
||||
|
||||
- template: templates/deploy.yml
|
||||
parameters:
|
||||
subscription: msccl-it
|
||||
vmssName: mscclit-vmss
|
||||
resourceGroup: msccl-IT
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
resourceGroup: mscclpp
|
||||
gpuArch: '90'
|
||||
|
||||
- template: templates/run-remote-task.yml
|
||||
parameters:
|
||||
name: RunMscclppTest
|
||||
displayName: Run multi-nodes mscclpp-test
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
|
||||
continueOnError: true
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
|
||||
|
||||
@@ -68,7 +97,7 @@ jobs:
|
||||
parameters:
|
||||
name: RunMultiNodeUnitTest
|
||||
displayName: Run multi-nodes unit tests
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
|
||||
|
||||
@@ -76,7 +105,7 @@ jobs:
|
||||
parameters:
|
||||
name: RunMultiNodePythonTests
|
||||
displayName: Run multi-nodes python tests
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh pytests
|
||||
|
||||
@@ -84,12 +113,12 @@ jobs:
|
||||
parameters:
|
||||
name: RunMultiNodePythonBenchmark
|
||||
displayName: Run multi-nodes python benchmark
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
|
||||
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
|
||||
remoteScript: |
|
||||
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
|
||||
|
||||
- template: templates/stop.yml
|
||||
parameters:
|
||||
subscription: msccl-it
|
||||
vmssName: mscclit-vmss
|
||||
resourceGroup: msccl-IT
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
resourceGroup: mscclpp
|
||||
|
||||
@@ -44,6 +44,7 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci
|
||||
vmssName: mscclpp-ci
|
||||
gpuArch: '80'
|
||||
nvccGencode: "-gencode=arch=compute_80,code=sm_80"
|
||||
|
||||
- job: NcclTestH100
|
||||
@@ -64,4 +65,5 @@ jobs:
|
||||
parameters:
|
||||
subscription: mscclpp-ci-h100
|
||||
vmssName: mscclpp-h100-ci
|
||||
gpuArch: '90'
|
||||
nvccGencode: "-gencode=arch=compute_90,code=sm_90"
|
||||
@@ -10,6 +10,9 @@ parameters:
|
||||
type: string
|
||||
- name: vmssName
|
||||
type: string
|
||||
- name: gpuArch
|
||||
type: string
|
||||
default: '80'
|
||||
- name: nvccGencode
|
||||
type: string
|
||||
default: "-gencode=arch=compute_80,code=sm_80"
|
||||
@@ -19,6 +22,7 @@ steps:
|
||||
parameters:
|
||||
subscription: ${{ parameters.subscription }}
|
||||
vmssName: ${{ parameters.vmssName }}
|
||||
gpuArch: ${{ parameters.gpuArch }}
|
||||
deployArgs: 'nccltest-single-node'
|
||||
|
||||
- template: run-remote-task.yml
|
||||
|
||||
@@ -12,12 +12,16 @@ parameters:
|
||||
- name: workingDirectory
|
||||
type: string
|
||||
default: '$(System.DefaultWorkingDirectory)'
|
||||
- name: continueOnError
|
||||
type: boolean
|
||||
default: false
|
||||
|
||||
steps:
|
||||
- task: Bash@3
|
||||
${{ if ne(parameters.name, '') }}:
|
||||
name: ${{ parameters.name }}
|
||||
displayName: ${{ parameters.displayName }}
|
||||
continueOnError: ${{ parameters.continueOnError }}
|
||||
inputs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
|
||||
@@ -14,11 +14,6 @@ baseImageTable=(
|
||||
|
||||
declare -A extraLdPathTable
|
||||
extraLdPathTable=(
|
||||
["cuda11.8"]="/usr/local/cuda-11.8/compat"
|
||||
["cuda12.4"]="/usr/local/cuda-12.4/compat"
|
||||
["cuda12.8"]="/usr/local/cuda-12.8/compat"
|
||||
["cuda12.9"]="/usr/local/cuda-12.9/compat"
|
||||
["cuda13.0"]="/usr/local/cuda-13.0/compat"
|
||||
["rocm6.2"]="/opt/rocm/lib"
|
||||
)
|
||||
|
||||
|
||||
@@ -25,9 +25,9 @@
|
||||
```bash
|
||||
sudo apt-get install libnuma-dev
|
||||
```
|
||||
* (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.8 and Python Development Package
|
||||
* (Optional, for [building the Python module](#install-from-source-python-module)) Python >= 3.10 and Python Development Package
|
||||
```bash
|
||||
sudo apt-get satisfy "python3 (>=3.8), python3-dev (>=3.8)"
|
||||
sudo apt-get satisfy "python3 (>=3.10), python3-dev (>=3.10)"
|
||||
```
|
||||
If you don't want to build the Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)).
|
||||
* (Optional, for benchmarks) MPI
|
||||
@@ -100,13 +100,30 @@ There are a few optional CMake options you can set:
|
||||
(install-from-source-python-module)=
|
||||
## Install from Source (Python Module)
|
||||
|
||||
Python 3.8 or later is required.
|
||||
Python 3.10 or later is required.
|
||||
|
||||
```bash
|
||||
# For NVIDIA platforms
|
||||
$ python -m pip install .
|
||||
# For AMD platforms, set the C++ compiler to HIPCC
|
||||
$ CXX=/opt/rocm/bin/hipcc python -m pip install .
|
||||
# For NVIDIA platforms (specify your CUDA version)
|
||||
$ python -m pip install ".[cuda12]"
|
||||
# For AMD platforms
|
||||
$ CXX=/opt/rocm/bin/hipcc python -m pip install ".[rocm6]"
|
||||
```
|
||||
|
||||
> **Note:** A platform extra (`cuda11`, `cuda12`, `cuda13`, or `rocm6`) is required to install CuPy.
|
||||
> The CUDA extras install pre-built CuPy wheels. The `rocm6` extra installs CuPy from source,
|
||||
> which requires ROCm and may take longer. Running `pip install .` without an extra will not install CuPy.
|
||||
|
||||
Optional extras can be installed by specifying them in brackets. Available extras:
|
||||
- **`cuda11`**, **`cuda12`**, **`cuda13`**: Install a pre-built CuPy package for your CUDA version.
|
||||
- **`rocm6`**: Install CuPy from source for AMD ROCm platforms.
|
||||
- **`benchmark`**: Install benchmark dependencies (mpi4py, prettytable, netifaces, matplotlib).
|
||||
- **`test`**: Install test dependencies (pytest, mpi4py, netifaces).
|
||||
|
||||
```bash
|
||||
# Example: install with CUDA 12 and benchmark extras
|
||||
$ python -m pip install ".[cuda12,benchmark]"
|
||||
# Example: install with all extras for testing on CUDA 12
|
||||
$ python -m pip install ".[cuda12,benchmark,test]"
|
||||
```
|
||||
|
||||
(vscode-dev-container)=
|
||||
@@ -158,8 +175,9 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./bin/mp_unit_tests -ip_port 10.0
|
||||
[Install the MSCCL++ Python package](#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.
|
||||
|
||||
```bash
|
||||
# Choose `requirements_*.txt` according to your CUDA/ROCm version.
|
||||
$ python3 -m pip install -r ./python/requirements_cuda12.txt
|
||||
# Install with benchmark dependencies and the appropriate CUDA/ROCm extras.
|
||||
# Replace `cuda12` with your platform: cuda11, cuda12, cuda13, or rocm6.
|
||||
$ python3 -m pip install ".[cuda12,benchmark,test]"
|
||||
$ mpirun -tag-output -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
|
||||
```
|
||||
|
||||
|
||||
@@ -12,7 +12,30 @@ build-backend = "scikit_build_core.build"
|
||||
name = "mscclpp"
|
||||
dynamic = ["version"]
|
||||
description = "MSCCL++ Python API"
|
||||
requires-python = ">=3.8"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"numpy",
|
||||
"blake3",
|
||||
"pybind11",
|
||||
"sortedcontainers",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
cuda11 = ["cupy-cuda11x"]
|
||||
cuda12 = ["cupy-cuda12x"]
|
||||
cuda13 = ["cupy-cuda13x"]
|
||||
rocm6 = ["cupy"]
|
||||
benchmark = [
|
||||
"mpi4py",
|
||||
"prettytable",
|
||||
"netifaces",
|
||||
"matplotlib",
|
||||
]
|
||||
test = [
|
||||
"pytest",
|
||||
"mpi4py",
|
||||
"netifaces",
|
||||
]
|
||||
|
||||
[tool.setuptools_scm]
|
||||
write_to = "python/mscclpp/_version.py"
|
||||
@@ -40,5 +63,5 @@ MSCCLPP_BUILD_TESTS = "OFF"
|
||||
|
||||
[tool.black]
|
||||
line-length = 120
|
||||
target-version = ['py38']
|
||||
target-version = ['py310']
|
||||
include = '\.pyi?$'
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.9.2)
|
||||
FetchContent_MakeAvailable(nanobind)
|
||||
|
||||
@@ -192,6 +192,9 @@ class NativeCodeCompiler:
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._initialized = False
|
||||
|
||||
def _do_init(self):
|
||||
self._is_hip = cp.cuda.runtime.is_hip
|
||||
self._device_arch = get_device_arch()
|
||||
self._compiler = self._get_compiler()
|
||||
@@ -226,6 +229,7 @@ class NativeCodeCompiler:
|
||||
]
|
||||
self._cache_dir = Path(env().cache_dir) / "native"
|
||||
self._cache_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._initialized = True
|
||||
|
||||
def _get_compiler(self) -> str:
|
||||
"""Get the path to the appropriate compiler.
|
||||
@@ -246,6 +250,8 @@ class NativeCodeCompiler:
|
||||
Returns:
|
||||
str: The GPU architecture string (e.g., "sm_90" for NVIDIA or "gfx90a" for AMD).
|
||||
"""
|
||||
if not self._initialized:
|
||||
self._do_init()
|
||||
return self._device_arch
|
||||
|
||||
def __call__(self, name: str, file: str, **kwds):
|
||||
@@ -290,6 +296,8 @@ class NativeCodeCompiler:
|
||||
>>> # Use the module to create an algorithm
|
||||
>>> algo = module.create_allreduce_algorithm(comm, buffer, size)
|
||||
"""
|
||||
if not self._initialized:
|
||||
self._do_init()
|
||||
if not os.path.isfile(file):
|
||||
raise FileNotFoundError(f"The specified source file does not exist: {file}")
|
||||
|
||||
|
||||
@@ -5,6 +5,6 @@ netifaces
|
||||
pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
sortedcontainers
|
||||
blake3
|
||||
pybind11
|
||||
@@ -5,6 +5,6 @@ netifaces
|
||||
pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
sortedcontainers
|
||||
blake3
|
||||
pybind11
|
||||
@@ -5,6 +5,6 @@ netifaces
|
||||
pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
sortedcontainers
|
||||
blake3
|
||||
pybind11
|
||||
@@ -5,6 +5,6 @@ netifaces
|
||||
pytest
|
||||
numpy
|
||||
matplotlib
|
||||
sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed
|
||||
sortedcontainers
|
||||
blake3
|
||||
pybind11
|
||||
@@ -1,7 +1,7 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
find_package(Python 3.10 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(nanobind GIT_REPOSITORY https://github.com/wjakob/nanobind.git GIT_TAG v1.4.0)
|
||||
FetchContent_MakeAvailable(nanobind)
|
||||
|
||||
@@ -283,7 +283,9 @@ bool isNvlsSupported() {
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
MSCCLPP_CUTHROW(cuDeviceGet(&dev, deviceId));
|
||||
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
|
||||
return isMulticastSupported == 1;
|
||||
result = (isMulticastSupported == 1);
|
||||
isChecked = true;
|
||||
return result;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
@@ -300,9 +302,6 @@ bool isCuMemMapAllocated([[maybe_unused]] void* ptr) {
|
||||
return false;
|
||||
}
|
||||
MSCCLPP_CUTHROW(cuMemRelease(handle));
|
||||
if (!isNvlsSupported()) {
|
||||
throw Error("cuMemMap is used in env without NVLS support", ErrorCode::InvalidUsage);
|
||||
}
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -158,11 +158,25 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
|
||||
}
|
||||
}
|
||||
} else if (transports.has(Transport::CudaIpc)) {
|
||||
// When transports include both CudaIpc and IB (e.g., CudaIpc | IB0),
|
||||
// try CudaIpc first and fall back to IB on failure.
|
||||
auto entry = getTransportInfo(Transport::CudaIpc);
|
||||
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
|
||||
// Create a memory map for the remote GPU memory. The memory map will keep the GpuIpcMem instance alive.
|
||||
this->remoteMemMap = gpuIpcMem->map();
|
||||
this->data = this->remoteMemMap.get();
|
||||
bool hasIB = (transports & AllIBTransports).any();
|
||||
try {
|
||||
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
|
||||
this->remoteMemMap = gpuIpcMem->map();
|
||||
this->data = this->remoteMemMap.get();
|
||||
} catch (const BaseError& e) {
|
||||
if (!hasIB) {
|
||||
throw;
|
||||
}
|
||||
bool isSameHost = (getHostHash() == this->hostHash);
|
||||
if (isSameHost) {
|
||||
WARN(GPU, "CudaIpc import failed on same host, falling back to IB transport: ", e.what());
|
||||
} else {
|
||||
INFO(GPU, "CudaIpc import failed on remote host, falling back to IB transport: ", e.what());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (this->data != nullptr) {
|
||||
INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
Host mscclit-000000
|
||||
Port 22345
|
||||
IdentityFile /root/mscclpp/sshkey
|
||||
StrictHostKeyChecking no
|
||||
Host mscclit-000001
|
||||
Port 22345
|
||||
IdentityFile /root/mscclpp/sshkey
|
||||
StrictHostKeyChecking no
|
||||
@@ -6,10 +6,6 @@ PLATFORM="${3:-cuda}"
|
||||
|
||||
KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
|
||||
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
|
||||
if [ "${TEST_NAME}" == "nccltest-single-node" ]; then
|
||||
ROOT_DIR="${ROOT_DIR}/mscclpp"
|
||||
SYSTEM_DEFAULTWORKINGDIRECTORY="${SYSTEM_DEFAULTWORKINGDIRECTORY}/mscclpp"
|
||||
fi
|
||||
DST_DIR="/tmp/mscclpp"
|
||||
if [ "${TEST_NAME}" == "nccltest-single-node" ] || [ "${TEST_NAME}" == "single-node-test" ]; then
|
||||
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"
|
||||
@@ -33,12 +29,34 @@ done
|
||||
|
||||
set -e
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
|
||||
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
|
||||
tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} .
|
||||
parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo mkdir -p ${DST_DIR} && sudo tar xzf /tmp/mscclpp.tar.gz -C ${DST_DIR} && sudo rm -f /tmp/mscclpp.tar.gz"
|
||||
rm -f /tmp/mscclpp.tar.gz
|
||||
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
|
||||
fi
|
||||
|
||||
# Install GDRCopy kernel module on host VMs (CUDA only)
|
||||
GDRCOPY_VERSION="2.5.2"
|
||||
if [ "${PLATFORM}" == "cuda" ]; then
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"if lsmod | grep -q gdrdrv; then
|
||||
echo 'gdrdrv module already loaded'
|
||||
else
|
||||
set -e
|
||||
sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
|
||||
cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz
|
||||
tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages
|
||||
CUDA=/usr/local/cuda ./build-deb-packages.sh
|
||||
sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}*.deb
|
||||
sudo modprobe gdrdrv
|
||||
rm -rf /tmp/gdrcopy.tar.gz /tmp/gdrcopy-${GDRCOPY_VERSION}
|
||||
fi"
|
||||
fi
|
||||
|
||||
# Force a pull of the latest image
|
||||
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
|
||||
"sudo docker pull ${CONTAINERIMAGE}"
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
azureuser@mscclit-000000
|
||||
azureuser@mscclit-000001
|
||||
@@ -1,2 +0,0 @@
|
||||
mscclit-000000
|
||||
mscclit-000001
|
||||
@@ -1,3 +1,10 @@
|
||||
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.98, "busBw":6.96, "size":24576, "time":6.18, "target":"latency"}
|
||||
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":7.42, "busBw":12.99, "size":49152, "time":6.62, "target":"latency"}
|
||||
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"}
|
||||
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"}
|
||||
{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":430.62,"busBw":403.70, "size":3221225472, "time":7480.40, "target":"throughput"}
|
||||
{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.54, "busBw":1.01, "size":8192, "time":15.10, "target":"latency"}
|
||||
{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":201.46,"busBw":377.74, "size":3221225472, "time":15989.38,"target":"throughput"}
|
||||
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":118.49,"busBw":222.17, "size":25165824, "time":212.39, "target":"throughput"}
|
||||
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":138.48,"busBw":259.65, "size":50331648, "time":363.40, "target":"throughput"}
|
||||
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":166.72,"busBw":312.60, "size":3221225472, "time":19321.02,"target":"throughput"}
|
||||
{"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":96.94, "busBw":90.88, "size":1073741824, "time":11076.24,"target":"throughput"}
|
||||
@@ -1,83 +1,99 @@
|
||||
set -e
|
||||
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
|
||||
HEAD_HOST=$(head -1 ${HOSTFILE})
|
||||
# Resolve HEAD_HOST to an IP address on eth0 to ensure bootstrap uses the correct interface
|
||||
HEAD_IP=$(ssh -o StrictHostKeyChecking=no -p 22345 -i /root/mscclpp/sshkey ${HEAD_HOST} "ip -4 addr show eth0 | grep -oP 'inet \K[0-9.]+' | head -1" 2>/dev/null)
|
||||
if [ -z "${HEAD_IP}" ]; then
|
||||
HEAD_IP=${HEAD_HOST}
|
||||
fi
|
||||
MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0"
|
||||
MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH"
|
||||
|
||||
# Select perf baseline based on GPU type
|
||||
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader -i 0 2>/dev/null | head -1)
|
||||
if echo "${GPU_NAME}" | grep -qi "H100"; then
|
||||
PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv5.jsonl
|
||||
else
|
||||
PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv4.jsonl
|
||||
fi
|
||||
|
||||
function run_mscclpp_test()
|
||||
{
|
||||
echo "=================Run allgather_test_perf on 2 nodes========================="
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
|
||||
# For kernel 2, the message size must be divisible by 3
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
|
||||
|
||||
echo "==================Run allreduce_test_perf on 2 nodes========================="
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl
|
||||
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl
|
||||
|
||||
echo "==================Run alltoall_test_perf on 2 nodes========================="
|
||||
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
|
||||
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
|
||||
|
||||
echo "========================Run performance check==============================="
|
||||
python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \
|
||||
--baseline-file /root/mscclpp/test/deploy/perf_ndmv4.jsonl
|
||||
--baseline-file ${PERF_BASELINE}
|
||||
}
|
||||
|
||||
function run_mp_ut()
|
||||
{
|
||||
echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)========================="
|
||||
mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \
|
||||
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
|
||||
mpirun ${MPI_ARGS} -tag-output -np 2 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 1 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003
|
||||
|
||||
echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)========================="
|
||||
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
|
||||
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
|
||||
mpirun ${MPI_ARGS} -tag-output -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-npernode 8 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003
|
||||
}
|
||||
|
||||
function run_pytests()
|
||||
{
|
||||
echo "==================Run python tests================================"
|
||||
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
|
||||
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
mpirun ${MPI_ARGS} -tag-output -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh
|
||||
}
|
||||
|
||||
function run_py_benchmark()
|
||||
{
|
||||
echo "==================Run python benchmark================================"
|
||||
mpirun -allow-run-as-root -np 16 --bind-to numa \
|
||||
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
|
||||
-mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
|
||||
mpirun ${MPI_ARGS} -np 16 \
|
||||
${MSCCLPP_ENV} \
|
||||
-mca pml ob1 -mca btl ^openib -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
|
||||
-x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \
|
||||
-x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x NCCL_SHM_DISABLE=0 \
|
||||
-x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
|
||||
-x MSCCLPP_HOME=/root/mscclpp -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
|
||||
}
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
|
||||
@@ -5,11 +5,22 @@ PLATFORM="${1:-cuda}"
|
||||
mkdir -p /root/.ssh
|
||||
mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys
|
||||
chown root:root /root/.ssh/authorized_keys
|
||||
mv /root/mscclpp/test/deploy/config /root/.ssh/config
|
||||
chown root:root /root/.ssh/config
|
||||
chmod 400 /root/mscclpp/sshkey
|
||||
chown root:root /root/mscclpp/sshkey
|
||||
|
||||
# Generate SSH config from hostfile_mpi
|
||||
HOSTFILE_MPI=/root/mscclpp/test/deploy/hostfile_mpi
|
||||
if [ -f "${HOSTFILE_MPI}" ]; then
|
||||
> /root/.ssh/config
|
||||
while IFS= read -r host; do
|
||||
echo "Host ${host}" >> /root/.ssh/config
|
||||
echo " Port 22345" >> /root/.ssh/config
|
||||
echo " IdentityFile /root/mscclpp/sshkey" >> /root/.ssh/config
|
||||
echo " StrictHostKeyChecking no" >> /root/.ssh/config
|
||||
done < "${HOSTFILE_MPI}"
|
||||
chown root:root /root/.ssh/config
|
||||
fi
|
||||
|
||||
if [ "${PLATFORM}" == "cuda" ]; then
|
||||
nvidia-smi -pm 1
|
||||
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
|
||||
@@ -18,14 +29,26 @@ if [ "${PLATFORM}" == "cuda" ]; then
|
||||
fi
|
||||
|
||||
make -C /root/mscclpp/tools/peer-access-test
|
||||
set +e
|
||||
/root/mscclpp/tools/peer-access-test/peer_access_test
|
||||
make -C /root/mscclpp/tools/peer-access-test clean
|
||||
|
||||
if [[ "${CUDA_VERSION}" == *"11."* ]]; then
|
||||
pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
|
||||
elif [[ "${CUDA_VERSION}" == *"12."* ]]; then
|
||||
pip3 install -r /root/mscclpp/python/requirements_cuda12.txt
|
||||
PEER_ACCESS_EXIT_CODE=$?
|
||||
set -e
|
||||
if [ ${PEER_ACCESS_EXIT_CODE} -eq 2 ] && [ "${PLATFORM}" == "cuda" ]; then
|
||||
# Exit code 2 = CUDA init failure (e.g., driver/toolkit version mismatch).
|
||||
# Add CUDA compat libs for forward compatibility and retry.
|
||||
CUDA_COMPAT_PATH="/usr/local/cuda/compat"
|
||||
if [ -d "${CUDA_COMPAT_PATH}" ]; then
|
||||
echo "Adding ${CUDA_COMPAT_PATH} to LD_LIBRARY_PATH for forward compatibility"
|
||||
export LD_LIBRARY_PATH="${CUDA_COMPAT_PATH}:${LD_LIBRARY_PATH}"
|
||||
/root/mscclpp/tools/peer-access-test/peer_access_test
|
||||
else
|
||||
echo "CUDA compat libs not found at ${CUDA_COMPAT_PATH}"
|
||||
exit 1
|
||||
fi
|
||||
elif [ ${PEER_ACCESS_EXIT_CODE} -ne 0 ]; then
|
||||
exit ${PEER_ACCESS_EXIT_CODE}
|
||||
fi
|
||||
make -C /root/mscclpp/tools/peer-access-test clean
|
||||
|
||||
if [ "${PLATFORM}" == "rocm" ]; then
|
||||
export CXX=/opt/rocm/bin/hipcc
|
||||
@@ -36,7 +59,19 @@ if [ -f "${PIP_CMAKE_ARGS_FILE}" ]; then
|
||||
export CMAKE_ARGS="$(cat ${PIP_CMAKE_ARGS_FILE})"
|
||||
echo "Using CMAKE_ARGS: ${CMAKE_ARGS}"
|
||||
fi
|
||||
cd /root/mscclpp && pip3 install .
|
||||
|
||||
cd /root/mscclpp
|
||||
if [[ "${CUDA_VERSION}" == *"11."* ]]; then
|
||||
pip3 install ".[cuda11,benchmark,test]"
|
||||
elif [[ "${CUDA_VERSION}" == *"12."* ]]; then
|
||||
pip3 install ".[cuda12,benchmark,test]"
|
||||
elif [[ "${CUDA_VERSION}" == *"13."* ]]; then
|
||||
pip3 install ".[cuda13,benchmark,test]"
|
||||
elif [ "${PLATFORM}" == "rocm" ]; then
|
||||
pip3 install ".[rocm6,benchmark,test]"
|
||||
else
|
||||
pip3 install ".[benchmark,test]"
|
||||
fi
|
||||
pip3 install setuptools_scm
|
||||
python3 -m setuptools_scm --force-write-version-files
|
||||
|
||||
|
||||
@@ -13,6 +13,10 @@ constexpr auto cudaSuccess = hipSuccess;
|
||||
|
||||
#include <iostream>
|
||||
|
||||
// Exit code 2 indicates CUDA initialization failure (e.g., driver/toolkit mismatch).
|
||||
// This allows callers to distinguish it from other failures and retry with compat libs.
|
||||
constexpr int EXIT_CUDA_INIT_FAILURE = 2;
|
||||
|
||||
#define CUDACHECK(cmd) \
|
||||
do { \
|
||||
cudaError_t e = cmd; \
|
||||
@@ -25,7 +29,11 @@ constexpr auto cudaSuccess = hipSuccess;
|
||||
int main() {
|
||||
bool canAccessPeerAll = true;
|
||||
int devCount = 0;
|
||||
CUDACHECK(cudaGetDeviceCount(&devCount));
|
||||
cudaError_t err = cudaGetDeviceCount(&devCount);
|
||||
if (err != cudaSuccess) {
|
||||
std::cerr << "Failed: cudaGetDeviceCount(&devCount) returned " << err << std::endl;
|
||||
return EXIT_CUDA_INIT_FAILURE;
|
||||
}
|
||||
std::cout << "Detected " << devCount << " device(s)" << std::endl;
|
||||
if (devCount >= 2) {
|
||||
for (int i = 0; i < devCount; ++i) {
|
||||
|
||||
Reference in New Issue
Block a user