diff --git a/run-sendrecv2.sh b/run-sendrecv2.sh index 556cc09d..c6fd42de 100755 --- a/run-sendrecv2.sh +++ b/run-sendrecv2.sh @@ -4,9 +4,9 @@ MPI_ARGS="" MPI_ARGS+=" -x CUDA_VISIBLE_DEVICES=1 -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1 --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1" MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH" MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so" -MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/mscclpp/bin/:$PATH " +MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin:$PATH " MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_0" -MPI_ARGS+=" /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/mscclpp/bin/python3 /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/python/test/executor_test.py -path /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/sendrecv.json" +MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3 /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py -path /home/azhpcuser/binyli/mscclpp/test.json" -mpirun -np 2 --hostfile ./hosts --map-by ppr:1:node $MPI_ARGS --size 1K +mpirun -np 4 --hostfile ./hosts --map-by ppr:1:node $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5 diff --git a/test.json b/test.json index 4f412033..3b98c1a4 100644 --- a/test.json +++ b/test.json @@ -288,10 +288,10 @@ 1, 1, 1, - 1, - 1, - 1, - 1 + 3, + 3, + 3, + 3 ] } ], @@ -365,8 +365,8 @@ { "channel_type": "port", "channel_ids": [ - 0, - 4 + 4, + 0 ] } ], @@ -432,8 +432,8 @@ { "channel_type": "port", "channel_ids": [ - 1, - 5 + 5, + 1 ] } ], @@ -499,8 +499,8 @@ { "channel_type": "port", "channel_ids": [ - 2, - 6 + 6, + 2 ] } ], @@ -566,8 +566,610 @@ { "channel_type": "port", "channel_ids": [ - 3, - 7 + 7, + 3 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + } + ], + "channels": [ + { + "channel_type": "port", + "connected_to": [ + 2, + 2, + 2, + 2, + 0, + 0, + 0, + 0 + ] + } + ], + "remote_buffers": [ + { + "rank": 2, + "type": "o", + "access_channel_types": [ + "port" + ] + } + ], + "semaphores": [] + }, + { + "id": 2, + "input_chunks": 4, + "output_chunks": 4, + "scratch_chunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pws", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 4, + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pws", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 5, + 1 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 2, + "ops": [ + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pws", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 2, + "size": 1 + } + ], + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 6, + 2 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 3, + "ops": [ + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pws", + "src_buff": [ + { + "type": "i", + "index": 3, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 3, + "size": 1 + } + ], + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 7, + 3 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + } + ], + "channels": [ + { + "channel_type": "port", + "connected_to": [ + 3, + 3, + 3, + 3, + 1, + 1, + 1, + 1 + ] + } + ], + "remote_buffers": [ + { + "rank": 3, + "type": "o", + "access_channel_types": [ + "port" + ] + } + ], + "semaphores": [] + }, + { + "id": 3, + "input_chunks": 4, + "output_chunks": 4, + "scratch_chunks": 0, + "threadblocks": [ + { + "id": 0, + "ops": [ + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pws", + "src_buff": [ + { + "type": "i", + "index": 0, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 0, + "size": 1 + } + ], + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 4, + 0 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 1, + "ops": [ + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pws", + "src_buff": [ + { + "type": "i", + "index": 1, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 1, + "size": 1 + } + ], + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 5, + 1 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 2, + "ops": [ + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pws", + "src_buff": [ + { + "type": "i", + "index": 2, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 2, + "size": 1 + } + ], + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 6, + 2 + ] + } + ], + "remote_buffer_refs": [ + { + "access_channel_type": "port", + "remote_buffer_ids": [ + 0 + ] + } + ] + }, + { + "id": 3, + "ops": [ + { + "name": "signal", + "channel_ids": [ + 0 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "nop" + }, + { + "name": "pws", + "src_buff": [ + { + "type": "i", + "index": 3, + "size": 1 + } + ], + "dst_buff": [ + { + "buffer_id": 0, + "index": 3, + "size": 1 + } + ], + "channel_ids": [ + 1 + ], + "channel_type": "port" + }, + { + "name": "wait", + "channel_ids": [ + 0 + ], + "channel_type": "port" + } + ], + "channels": [ + { + "channel_type": "port", + "channel_ids": [ + 7, + 3 ] } ], @@ -589,10 +1191,10 @@ 0, 0, 0, - 0, - 0, - 0, - 0 + 2, + 2, + 2, + 2 ] } ],