diff --git a/.azure-pipelines/templates/ut-no-ib-env.yaml b/.azure-pipelines/templates/ut-no-ib-env.yaml index e6576f6d..0d97f9fc 100644 --- a/.azure-pipelines/templates/ut-no-ib-env.yaml +++ b/.azure-pipelines/templates/ut-no-ib-env.yaml @@ -16,7 +16,7 @@ steps: targetType: 'inline' script: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. + cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_USE_IB=OFF -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -55,6 +55,51 @@ steps: arguments: single-node-test false workingDirectory: $(System.DefaultWorkingDirectory) +- task: Bash@3 + name: UnitTests + displayName: Run mscclpp unit tests + inputs: + targetType: inline + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + : > azureuser@10.0.0.4 + tail -f azureuser@10.0.0.4 & + CHILD_PID=$! + parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ + -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ + cd /root/mscclpp; \ + export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ + ./build/bin/unit_tests"' + kill $CHILD_PID + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: MpUnitTests + displayName: Run mscclpp multi-process unit tests + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + : > azureuser@10.0.0.4 + tail -f azureuser@10.0.0.4 & + CHILD_PID=$! + parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ + -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ + export PATH=/usr/local/mpi/bin:\$PATH; \ + cd /root/mscclpp; \ + export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ + mpirun --allow-run-as-root -tag-output -np 2 ./build/bin/mp_unit_tests; \ + mpirun --allow-run-as-root -tag-output -np 4 ./build/bin/mp_unit_tests; \ + mpirun --allow-run-as-root -tag-output -np 8 ./build/bin/mp_unit_tests"' + kill $CHILD_PID + workingDirectory: '$(System.DefaultWorkingDirectory)' + - task: Bash@3 name: PyTests displayName: Run pytests @@ -73,7 +118,64 @@ steps: export PATH=/usr/local/mpi/bin:\$PATH \ export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py::test_executor -x"' + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' + kill $CHILD_PID + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: StopContainer + displayName: Stop existing container + inputs: + targetType: 'inline' + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ + "sudo docker stop mscclpp-test || true; sudo docker rm mscclpp-test || true" + rm -f $(System.DefaultWorkingDirectory)/sshkey $(System.DefaultWorkingDirectory)/sshkey.pub + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: BuildWithIb + displayName: Rebuild with IB + inputs: + targetType: 'inline' + script: | + rm -rf build && mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. + make -j + workingDirectory: '$(System.DefaultWorkingDirectory)' + +- task: Bash@3 + name: DeployTestEnvWithIb + displayName: Deploy Test Env (with IB build) + inputs: + targetType: filePath + filePath: test/deploy/deploy.sh + arguments: single-node-test false + workingDirectory: $(System.DefaultWorkingDirectory) + +- task: Bash@3 + name: PyTestsWithIbBuildDisableIb + displayName: Run pytests (IB build, IB tests disabled) + inputs: + targetType: inline + script: | + set -e + HOSTFILE=$(System.DefaultWorkingDirectory)/test/deploy/hostfile_ci + SSH_OPTION="StrictHostKeyChecking=no" + KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} + : > azureuser@10.0.0.4 + tail -f azureuser@10.0.0.4 & + CHILD_PID=$! + parallel-ssh -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -o . \ + -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \ + export PATH=/usr/local/mpi/bin:\$PATH \ + export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ + cd /root/mscclpp; \ + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x MSCCLPP_DISABLE_IB_TESTS=1 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' kill $CHILD_PID workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 960f3eae..4aac07e6 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -113,7 +113,7 @@ jobs: gpuArch: '90' - job: UnitTestNoIBEnv - timeoutInMinutes: 40 + timeoutInMinutes: 60 displayName: Test No IB Environment pool: name: msccl-ci-h100 diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index a6899642..6b3119cb 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -162,13 +162,10 @@ def create_connection(group: CommGroup, connection_type: str): def create_group_and_connection(mpi_group: MpiGroup, connection_type: str): if (connection_type == "NVLink" or connection_type == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink/nvls for cross node") + if connection_type == "IB" and os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) - try: - connection = create_connection(group, connection_type) - except Error as e: - if connection_type == "IB" and e.args[0] == ErrorCode.InvalidUsage: - pytest.skip("IB not supported on this node") - raise + connection = create_connection(group, connection_type) return group, connection @@ -281,6 +278,8 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, connection_type: str, @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores(mpi_group: MpiGroup): + if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) tran = group.my_ib_device(group.my_rank % 8) endpoint = EndpointConfig(tran, Device(DeviceType.CPU)) @@ -301,6 +300,8 @@ def test_h2h_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores_gil_release(mpi_group: MpiGroup): + if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0": + pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1") group = CommGroup(mpi_group.comm) tran = group.my_ib_device(group.my_rank % 8) endpoint = EndpointConfig(tran, Device(DeviceType.CPU)) diff --git a/src/core/ib.cc b/src/core/ib.cc index 2e7b867d..b8854a6e 100644 --- a/src/core/ib.cc +++ b/src/core/ib.cc @@ -636,6 +636,34 @@ MSCCLPP_API_CPP std::string getIBDeviceName(Transport) { return ""; } MSCCLPP_API_CPP Transport getIBTransportByDeviceName(const std::string&) { return Transport::Unknown; } +IbMr::~IbMr() {} +IbMrInfo IbMr::getInfo() const { return IbMrInfo(); } +const void* IbMr::getBuff() const { return nullptr; } +uint32_t IbMr::getLkey() const { return 0; } + +IbQp::~IbQp() {} +void IbQp::rtr(const IbQpInfo& /*info*/) {} +void IbQp::rts() {} +void IbQp::stageSendWrite(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint32_t /*size*/, uint64_t /*wrId*/, + uint64_t /*srcOffset*/, uint64_t /*dstOffset*/, bool /*signaled*/) {} +void IbQp::stageSendAtomicAdd(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint64_t /*wrId*/, uint64_t /*dstOffset*/, + uint64_t /*addVal*/, bool /*signaled*/) {} +void IbQp::stageSendWriteWithImm(const IbMr* /*mr*/, const IbMrInfo& /*info*/, uint32_t /*size*/, uint64_t /*wrId*/, + uint64_t /*srcOffset*/, uint64_t /*dstOffset*/, bool /*signaled*/, + unsigned int /*immData*/) {} +void IbQp::postSend() {} +void IbQp::stageRecv(uint64_t /*wrId*/) {} +void IbQp::stageRecv(const IbMr* /*mr*/, uint64_t /*wrId*/, uint32_t /*size*/, uint64_t /*offset*/) {} +void IbQp::postRecv() {} +int IbQp::pollSendCq() { return 0; } +int IbQp::pollRecvCq() { return 0; } +int IbQp::getSendWcStatus(int /*idx*/) const { return 0; } +std::string IbQp::getSendWcStatusString(int /*idx*/) const { return ""; } +int IbQp::getNumSendCqItems() const { return 0; } +int IbQp::getRecvWcStatus(int /*idx*/) const { return 0; } +std::string IbQp::getRecvWcStatusString(int /*idx*/) const { return ""; } +unsigned int IbQp::getRecvWcImmData(int /*idx*/) const { return 0; } + #endif // !defined(USE_IBVERBS) } // namespace mscclpp