From 184dcbf9d774ff8ced74a53d24e596900333d321 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 24 Feb 2026 15:55:59 -0800 Subject: [PATCH] Add CI pipeline for no-IB environment testing (#755) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Add CI pipeline support for testing in environments without InfiniBand (IB) hardware. ## Changes ### IB stubs for no-IB builds (`src/core/ib.cc`) - Added stub implementations for `IbMr` and `IbQp` classes in the `#else // !defined(USE_IBVERBS)` block so the library links successfully when built with `-DMSCCLPP_USE_IB=OFF`. ### Environment variable to disable IB tests (`MSCCLPP_DISABLE_IB_TESTS`) - Updated `python/test/test_mscclpp.py` to read `MSCCLPP_DISABLE_IB_TESTS` directly from `os.environ` and skip IB-dependent tests (`create_group_and_connection` with IB transport, `test_h2h_semaphores`, `test_h2h_semaphores_gil_release`) when it is set to anything other than `0`. - This patch does not touch the `Env` class (`include/mscclpp/env.hpp`, `src/core/env.cpp`) or the Python bindings (`python/csrc/env_py.cpp`); no `disableIbTests` / `disable_ib_tests` field is added here. - Removed the fallback in `create_group_and_connection` that skipped the test when IB connection setup raised `ErrorCode.InvalidUsage`; unless `MSCCLPP_DISABLE_IB_TESTS` is set, such errors now propagate as test failures. ### CI pipeline (`ut-no-ib-env.yaml`, `ut.yml`) The no-IB environment pipeline runs two phases: 1. **No-IB build phase**: Build with `-DMSCCLPP_USE_IB=OFF`, deploy, run unit tests, multi-process unit tests, and the full `python/test/test_mscclpp.py` pytest suite (previously only `test_executor`) with `MSCCLPP_DISABLE_IB_TESTS=1`. 2. **IB build phase**: Stop the existing container, rebuild with IB enabled (default), redeploy, and run pytests (with `MSCCLPP_DISABLE_IB_TESTS=1`) — verifying that the full IB-enabled build works correctly in a non-IB environment when IB tests are skipped. Also increased the job timeout from 40 to 60 minutes to accommodate the two-phase pipeline. 
--- .azure-pipelines/templates/ut-no-ib-env.yaml | 106 ++++++++++++++++++- .azure-pipelines/ut.yml | 2 +- python/test/test_mscclpp.py | 13 +-- src/core/ib.cc | 28 +++++ 4 files changed, 140 insertions(+), 9 deletions(-) diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yaml index e6576f6d..0d97f9fc 100644 --- a/.azure-pipelines/templates/ut-no-ib-env.yaml +++ b/.azure-pipelines/templates/ut-no-ib-env.yaml @@ -16,7 +16,7 @@ steps: targetType: 'inline' script: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. + cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -55,6 +55,51 @@ steps: arguments: single-node-test false workingDirectory: $(System.DefaultWorkingDirectory) +- task: Bash@3 + name: UnitTests + displayName: Run mscclpp unit tests + inputs: + targetType: inline + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + : > azureuser@10.0.0.4 + tail -f azureuser@10.0.0.4 & + CHILD_PID=$! + parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ + -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ + cd /root/mscclpp; \ + export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ + ./build/bin/unit_tests"' + kill $CHILD_PID + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + : > azureuser@10.0.0.4 + tail -f azureuser@10.0.0.4 & + CHILD_PID=$! + parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ + -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ + export PATH=/usr/local/mpi/bin:\$PATH; \ + cd /root/mscclpp; \ + export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \ + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \ + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"' + kill $CHILD_PID + workingDirectory: '$(System.DefaultWorkingDirectory)' + - task: Bash@3 name: PyTests displayName: Run pytests @@ -73,7 +118,64 @@ steps: export PATH=/usr/local/mpi/bin:\$PATH \ export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py::test_executor -x"' + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' + kill $CHILD_PID + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: StopContainer + displayName: Stop existing container + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + 
SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true" + rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: BuildWithIb + displayName: Rebuild with IB + inputs: + targetType: 'inline' + script: | + rm -rf build && mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. + make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: DeployTestEnvWithIb + displayName: Deploy Test Env (with IB build) + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: single-node-test false + workingDirectory: $(System.DefaultWorkingDirectory) + +- task: Bash@3 + name: PyTestsWithIbBuildDisableIb + displayName: Run pytests (IB build, IB tests disabled) + inputs: + targetType: inline + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + : > azureuser@10.0.0.4 + tail -f azureuser@10.0.0.4 & + CHILD_PID=$! + parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . 
\ + -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ + export PATH=/usr/local/mpi/bin:\$PATH; \ + export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ + cd /root/mscclpp; \ + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' kill $CHILD_PID workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 960f3eae..4aac07e6 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -113,7 +113,7 @@ jobs: gpuArch: '90' - job: UnitTestNoIBEnv - timeoutInMinutes: 40 + timeoutInMinutes: 60 displayName: Test No IB Environment pool: name: msccl-ci-h100 diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index a6899642..6b3119cb 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -162,13 +162,10 @@ def create_connection(group: CommGroup, connection_type: str): def create_group_and_connection(mpi_group: MpiGroup, connection_type: str): if (connection_type == "NVLink" or connection_type == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink/nvls for cross node") + if connection_type == "IB" and os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) - try: - connection = create_connection(group, connection_type) - except Error as e: - if connection_type == "IB" and e.args[0] == ErrorCode.InvalidUsage: - pytest.skip("IB not supported on this node") - raise + connection = create_connection(group, connection_type) return group, connection @@ -281,6 +278,8 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, connection_type: str, @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores(mpi_group: MpiGroup): + if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + 
pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) tran = group.my_ib_device(group.my_rank % 8) endpoint = EndpointConfig(tran, Device(DeviceType.CPU)) @@ -301,6 +300,8 @@ def test_h2h_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores_gil_release(mpi_group: MpiGroup): + if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) tran = group.my_ib_device(group.my_rank % 8) endpoint = EndpointConfig(tran, Device(DeviceType.CPU)) diff --git a/src/core/ib.cc b/src/core/ib.cc index 2e7b867d..b8854a6e 100644 --- a/src/core/ib.cc +++ b/src/core/ib.cc @@ -636,6 +636,34 @@ MSCCLPP_API_CPP std::string getIBDeviceName(Transport) { return ""; } MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string&) { return Transport::Unknown; } +IbMr::~IbMr() {} /* Stub definitions for builds without USE_IBVERBS start here: every body below is empty or returns 0, nullptr, an empty string, or a default-constructed value. */ +IbMrInfo IbMr::getInfo() const { return IbMrInfo(); } /* default-constructed IbMrInfo */ +const void* IbMr::getBuff() const { return nullptr; } +uint32_t IbMr::getLkey() const { return 0; } + +IbQp::~IbQp() {} +void IbQp::rtr(const IbQpInfo& /*info*/) {} /* no-op in the stub */ +void IbQp::rts() {} +void IbQp::stageSendWrite(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint32_t /*size*/, uint64_t /*wrId*/, + uint64_t /*srcOffset*/, uint64_t /*dstOffset*/, bool /*signaled*/) {} +void IbQp::stageSendAtomicAdd(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint64_t /*wrId*/, uint64_t /*dstOffset*/, + uint64_t /*addVal*/, bool /*signaled*/) {} +void IbQp::stageSendWriteWithImm(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint32_t /*size*/, uint64_t /*wrId*/, + uint64_t /*srcOffset*/, uint64_t /*dstOffset*/, bool /*signaled*/, + unsigned int /*immData*/) {} +void IbQp::postSend() {} /* no-op: the stageSend* stubs above have empty bodies, so nothing is queued */ +void IbQp::stageRecv(uint64_t /*wrId*/) {} +void IbQp::stageRecv(const IbMr* /*mr*/, uint64_t /*wrId*/, uint32_t /*size*/, uint64_t /*offset*/) {} +void IbQp::postRecv() {} +int IbQp::pollSendCq() { 
return 0; } /* always 0; presumably callers treat this as no completions polled - TODO confirm */ +int IbQp::pollRecvCq() { return 0; } +int IbQp::getSendWcStatus(int /*idx*/) const { return 0; } /* NOTE(review): 0 presumably corresponds to a success status; confirm against the USE_IBVERBS implementation */ +std::string IbQp::getSendWcStatusString(int /*idx*/) const { return ""; } +int IbQp::getNumSendCqItems() const { return 0; } +int IbQp::getRecvWcStatus(int /*idx*/) const { return 0; } +std::string IbQp::getRecvWcStatusString(int /*idx*/) const { return ""; } +unsigned int IbQp::getRecvWcImmData(int /*idx*/) const { return 0; } + #endif // !defined(USE_IBVERBS) } // namespace mscclpp