mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
for 4 nodes
This commit is contained in:
@@ -61,6 +61,26 @@ def bench_time(n_iters: int, n_graph_iters: int, func: Union[Callable, list[Call
|
||||
return cp.cuda.get_elapsed_time(start, end) / n_iters * 1000.0 / n_graph_iters
|
||||
|
||||
|
||||
def get_prev_rank(my_rank: int, num_ranks: int, split_mask: int) -> int:
    """Return the rank preceding ``my_rank`` in the split-mask ring topology.

    Ranks are partitioned into groups of ``split_mask + 1`` consecutive ranks.
    The ring walks backwards one group at a time while keeping the position
    inside the group (the low bits selected by ``split_mask``) fixed.

    NOTE(review): ``my_rank & split_mask`` and ``my_rank // (split_mask + 1)``
    agree only when ``split_mask + 1`` is a power of two (mask of all-ones) —
    presumably guaranteed by the caller; confirm.
    """
    span = split_mask + 1
    total_groups = num_ranks // span
    offset = my_rank & split_mask
    current_group = my_rank // span
    # Add total_groups before the modulo so the step stays non-negative.
    previous_group = (current_group + total_groups - 1) % total_groups
    return previous_group * span + offset
|
||||
|
||||
|
||||
def get_next_rank(my_rank: int, num_ranks: int, split_mask: int) -> int:
    """Return the rank following ``my_rank`` in the split-mask ring topology.

    Mirror of ``get_prev_rank``: ranks form groups of ``split_mask + 1``,
    and the ring advances one whole group forward while the in-group
    position (low bits under ``split_mask``) is preserved.

    NOTE(review): correctness of mixing the bitmask with integer division
    presumably relies on ``split_mask + 1`` being a power of two; confirm
    against the caller.
    """
    span = split_mask + 1
    offset = my_rank & split_mask
    # Wrap around to the first group after the last one.
    neighbor_group = (my_rank // span + 1) % (num_ranks // span)
    return neighbor_group * span + offset
|
||||
|
||||
|
||||
def bench_correctness(
|
||||
collective: str,
|
||||
input_buf: Union[cp.ndarray, list[cp.ndarray]],
|
||||
@@ -71,6 +91,7 @@ def bench_correctness(
|
||||
num_ranks: int,
|
||||
n_iters: int,
|
||||
func: Union[Callable, list[Callable]],
|
||||
split_mask: int = 0,
|
||||
):
|
||||
"""Validate correctness. For sendrecv, buffers and func are lists of 2 for double-buffer."""
|
||||
type_size = cp.dtype(parse_dtype(dtype_str)).itemsize
|
||||
@@ -123,6 +144,9 @@ def bench_correctness(
|
||||
+ struct.pack("Q", cur_input.nbytes // type_size)
|
||||
+ pack(num_ranks, rank, i)
|
||||
)
|
||||
if "sendrecv" in collective:
|
||||
prev_rank = get_prev_rank(rank, num_ranks, split_mask)
|
||||
test_data_params += pack(prev_rank)
|
||||
test_data_kernel.launch_kernel(test_data_params, nblocks, nthreads, 0, stream)
|
||||
graph = stream.end_capture()
|
||||
graph.launch(stream)
|
||||
@@ -208,6 +232,7 @@ def main(
|
||||
packet_type: PacketType = PacketType.LL16,
|
||||
n_iters: int = 10,
|
||||
n_graph_iters: int = 10,
|
||||
split_mask: int = 0,
|
||||
):
|
||||
mscclpp_group = CommGroup(MPI.COMM_WORLD)
|
||||
cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
|
||||
@@ -270,6 +295,7 @@ def main(
|
||||
mscclpp_group.nranks,
|
||||
n_iters,
|
||||
executor_funcs if sendrecv_mode else executor_func,
|
||||
split_mask=split_mask,
|
||||
)
|
||||
|
||||
mscclpp_group.barrier()
|
||||
@@ -298,6 +324,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
|
||||
parser.add_argument("--n_iters", type=int, default=10)
|
||||
parser.add_argument("--n_graph_iters", type=int, default=10)
|
||||
parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x0, help="split mask for sendrecv (e.g. 0x3)")
|
||||
args = parser.parse_args()
|
||||
|
||||
packet_type = PacketType.LL16
|
||||
@@ -313,4 +340,5 @@ if __name__ == "__main__":
|
||||
packet_type,
|
||||
args.n_iters,
|
||||
args.n_graph_iters,
|
||||
args.split_mask,
|
||||
)
|
||||
|
||||
@@ -122,13 +122,14 @@ TEST_DATA_ALL_TO_ALL(float16, __half)
|
||||
TEST_DATA_ALL_TO_ALL(float32, float)
|
||||
TEST_DATA_ALL_TO_ALL(int32, int)
|
||||
|
||||
// Sendrecv verification: ring receive from prev rank.
|
||||
// Sendrecv verification: receive from prev rank in the ring.
|
||||
// Replays the same PRNG sequence that fill_data used on the sender (prev_rank).
|
||||
// prev_rank is passed explicitly since the ring topology depends on split_mask.
|
||||
#define TEST_DATA_SEND_RECV(FuncNameType, DataType) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_send_recv_##FuncNameType( \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
|
||||
int peer_rank = (my_rank - 1 + num_ranks) % num_ranks; \
|
||||
unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + peer_rank + seq); \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq, \
|
||||
int prev_rank) { \
|
||||
unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq); \
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
|
||||
seed = ranqd1(seed); \
|
||||
test_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x); \
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
|
||||
|
||||
MPI_ARGS=""
|
||||
MPI_ARGS+=" -x CUDA_VISIBLE_DEVICES=1 -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1 --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
|
||||
MPI_ARGS+=" -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1 --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
|
||||
MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
|
||||
MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
|
||||
MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin:$PATH "
|
||||
MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_0"
|
||||
MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2"
|
||||
MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3 /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py -path /home/azhpcuser/binyli/mscclpp/test.json"
|
||||
|
||||
|
||||
mpirun -np 4 --hostfile ./hosts --map-by ppr:1:node $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5
|
||||
mpirun -np 16 --hostfile ./hosts --map-by ppr:4:node $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5 --split_mask 0x3
|
||||
|
||||
Reference in New Issue
Block a user