From 57f7be62602c0a6a68cc6c607af6bc7ccce504d7 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 11 Apr 2026 05:28:29 +0000 Subject: [PATCH] WIP --- python/mscclpp/default_algos/send_recv.py | 2 +- run_onenode.sh | 4 +-- test.json | 42 +++++++---------------- 3 files changed, 16 insertions(+), 32 deletions(-) diff --git a/python/mscclpp/default_algos/send_recv.py b/python/mscclpp/default_algos/send_recv.py index 2127eb91..08a49ad2 100644 --- a/python/mscclpp/default_algos/send_recv.py +++ b/python/mscclpp/default_algos/send_recv.py @@ -21,7 +21,7 @@ def send_recv_test(name, nnodes, gpus_per_node, split_mask): use_double_scratch_buffer=False, min_message_size=0, max_message_size=2**64 - 1, - instances=4 + instances=1 ): # Creating separate port channels for next and prev directions. # When prev and next are the same peer (e.g., 2-node ring), both channels go to the same peer diff --git a/run_onenode.sh b/run_onenode.sh index 6e7541d1..50b49e12 100755 --- a/run_onenode.sh +++ b/run_onenode.sh @@ -5,9 +5,9 @@ MPI_ARGS="" MPI_ARGS+="-x CUDA_VISIBLE_DEVICES=0,2 --mca coll ^ucc,hcoll -mca coll_hcoll_enable 0 --mca btl tcp,vader,self --mca pml ob1 --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1 " MPI_ARGS+="-x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH" MPI_ARGS+=" -x MSCCLPP_IBV_MODE=host -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so" -MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_3 -x PATH=/home/azhpcuser/mahdieh/mscclpp/mscclpp2/bin/:$PATH " +MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_3 -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin/:$PATH " MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR -x MSCCLPP_IB_GID_INDEX=3" -MPI_ARGS+=" /home/azhpcuser/mahdieh/mscclpp/mscclpp/bin/python3 /home/azhpcuser/mahdieh/mscclpp/python/test/executor_test.py -path /home/azhpcuser/mahdieh/mscclpp/test.json" +MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3 /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py -path /home/azhpcuser/binyli/mscclpp/test.json" diff --git a/test.json b/test.json index 294c2a13..511b7907 100644 --- a/test.json +++ b/test.json @@ -1,6 +1,6 @@ { - "name": "send_recv_test", - "collective": "test", + "name": "sendrecv", + "collective": "sendrecv", "protocol": "Simple", "inplace": false, "reuse_resources": false, @@ -24,7 +24,7 @@ { "name": "wait", "channel_ids": [ - 0 + 1 ], "channel_type": "port" }, @@ -32,7 +32,7 @@ "name": "nop" }, { - "name": "put", + "name": "pws", "src_buff": [ { "type": "i", @@ -48,17 +48,7 @@ } ], "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "signal", - "channel_ids": [ - 0 + 1 ], "channel_type": "port" }, @@ -74,6 +64,7 @@ { "channel_type": "port", "channel_ids": [ + 1, 0 ] } @@ -92,6 +83,7 @@ { "channel_type": "port", "connected_to": [ + 1, 1 ] } @@ -126,7 +118,7 @@ { "name": "wait", "channel_ids": [ - 0 + 1 ], "channel_type": "port" }, @@ -134,7 +126,7 @@ "name": "nop" }, { - "name": "put", + "name": "pws", "src_buff": [ { "type": "i", @@ -150,17 +142,7 @@ } ], "channel_ids": [ - 0 - ], - "channel_type": "port" - }, - { - "name": "nop" - }, - { - "name": "signal", - "channel_ids": [ - 0 + 1 ], "channel_type": "port" }, @@ -176,7 +158,8 @@ { "channel_type": "port", "channel_ids": [ - 0 + 0, + 1 ] } ], @@ -194,6 +177,7 @@ { "channel_type": "port", "connected_to": [ + 0, 0 ] }