mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-06-08 15:30:41 +00:00
Add CI pipeline for no-IB environment testing (#755)
## Summary

Add CI pipeline support for testing in environments without InfiniBand (IB) hardware.

## Changes

### IB stubs for no-IB builds (`src/core/ib.cc`)

- Added stub implementations for `IbMr` and `IbQp` classes in the `#else // !defined(USE_IBVERBS)` block so the library links successfully when built with `-DMSCCLPP_USE_IB=OFF`.

### Environment variable to disable IB tests (`MSCCLPP_DISABLE_IB_TESTS`)

- Added `disableIbTests` field to the `Env` class (`include/mscclpp/env.hpp`, `src/core/env.cpp`), reading from the `MSCCLPP_DISABLE_IB_TESTS` environment variable.
- Exposed as `disable_ib_tests` in Python bindings (`python/csrc/env_py.cpp`).
- Updated `python/test/test_mscclpp.py` to skip IB-dependent tests (`create_group_and_connection` with IB transport, `test_h2h_semaphores`, `test_h2h_semaphores_gil_release`) when `env().disable_ib_tests` is true.

### CI pipeline (`ut-no-ib-env.yaml`, `ut.yml`)

The no-IB environment pipeline runs two phases:

1. **No-IB build phase**: Build with `-DMSCCLPP_USE_IB=OFF`, deploy, run unit tests, multi-process unit tests, and pytests (with `MSCCLPP_DISABLE_IB_TESTS=1`).
2. **IB build phase**: Rebuild with IB enabled (default), stop the existing container, redeploy, and run pytests (with `MSCCLPP_DISABLE_IB_TESTS=1`) — verifying that the full IB-enabled build works correctly in a non-IB environment when IB tests are skipped.

Also increased the job timeout from 40 to 60 minutes to accommodate the two-phase pipeline.
This commit is contained in:
@@ -162,13 +162,10 @@ def create_connection(group: CommGroup, connection_type: str):
|
||||
def create_group_and_connection(mpi_group: MpiGroup, connection_type: str):
|
||||
if (connection_type == "NVLink" or connection_type == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False:
|
||||
pytest.skip("cannot use nvlink/nvls for cross node")
|
||||
if connection_type == "IB" and os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0":
|
||||
pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1")
|
||||
group = CommGroup(mpi_group.comm)
|
||||
try:
|
||||
connection = create_connection(group, connection_type)
|
||||
except Error as e:
|
||||
if connection_type == "IB" and e.args[0] == ErrorCode.InvalidUsage:
|
||||
pytest.skip("IB not supported on this node")
|
||||
raise
|
||||
connection = create_connection(group, connection_type)
|
||||
return group, connection
|
||||
|
||||
|
||||
@@ -281,6 +278,8 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, connection_type: str,
|
||||
|
||||
@parametrize_mpi_groups(2, 4, 8, 16)
|
||||
def test_h2h_semaphores(mpi_group: MpiGroup):
|
||||
if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0":
|
||||
pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1")
|
||||
group = CommGroup(mpi_group.comm)
|
||||
tran = group.my_ib_device(group.my_rank % 8)
|
||||
endpoint = EndpointConfig(tran, Device(DeviceType.CPU))
|
||||
@@ -301,6 +300,8 @@ def test_h2h_semaphores(mpi_group: MpiGroup):
|
||||
|
||||
@parametrize_mpi_groups(2, 4, 8, 16)
|
||||
def test_h2h_semaphores_gil_release(mpi_group: MpiGroup):
|
||||
if os.environ.get("MSCCLPP_DISABLE_IB_TESTS", "0") != "0":
|
||||
pytest.skip("IB tests are disabled via MSCCLPP_DISABLE_IB_TESTS=1")
|
||||
group = CommGroup(mpi_group.comm)
|
||||
tran = group.my_ib_device(group.my_rank % 8)
|
||||
endpoint = EndpointConfig(tran, Device(DeviceType.CPU))
|
||||
|
||||
Reference in New Issue
Block a user