From a191f16b76dbb0b27b94484fd468a741f9f73e5b Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 17 Mar 2026 20:06:15 +0000
Subject: [PATCH] add scripts

---
 generate-json.sh | 26 ++++++++++++++++++++++++++
 run.sh           | 37 +++++++++++++++++++++++++++++++++++++
 run_onenode.sh   | 39 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 102 insertions(+)
 create mode 100755 generate-json.sh
 create mode 100755 run.sh
 create mode 100755 run_onenode.sh

diff --git a/generate-json.sh b/generate-json.sh
new file mode 100755
index 00000000..25c21b14
--- /dev/null
+++ b/generate-json.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copy the local executor_test.py and mscclpp_send_recv.py to every host in
+# HOSTFILE, then run mscclpp_send_recv.py on each host and save its stdout to
+# test.json under /home/azhpcuser/mahdieh/mscclpp on that host.
+#
+# Usage: generate-json.sh HOSTFILE NNODES GPUS_PER_NODE
+set -ex
+
+# All three positional arguments are required.
+if [ "$#" -ne 3 ]; then
+  echo "Usage: $0 HOSTFILE NNODES GPUS_PER_NODE" >&2
+  exit 1
+fi
+
+HOSTFILE=$1
+NNODES=$2
+PPN=$3
+
+# Push the local copies of both Python files to the same paths on every host.
+parallel-scp -h "$HOSTFILE" -p32 -t1800 -r python/test/executor_test.py /home/azhpcuser/mahdieh/mscclpp/python/test/
+
+parallel-scp -h "$HOSTFILE" -p32 -t1800 -r python/mscclpp/default_algos/mscclpp_send_recv.py /home/azhpcuser/mahdieh/mscclpp/python/mscclpp/default_algos/
+
+# NNODES and PPN are expanded locally; the cd, the activate script and the
+# redirect into test.json are all part of the command run on each host.
+parallel-ssh -h "$HOSTFILE" -p32 -i -t1800 "cd /home/azhpcuser/mahdieh/mscclpp && source mscclpp/bin/activate && python3 python/mscclpp/default_algos/mscclpp_send_recv.py --name send_recv_test --nnodes $NNODES --gpus_per_node $PPN --split_mask 0x3 > test.json "
diff --git a/run.sh b/run.sh
new file mode 100755
index 00000000..1d603f26
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Run executor_test.py against test.json under mpirun: 16 ranks, 4 per node,
+# hosts read from ./hosts.
+
+module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
+
+# Keep the mpirun arguments in an array so every option stays one word even if
+# the inherited PATH or LD_LIBRARY_PATH contains whitespace.
+MPI_ARGS=(
+  -mca coll_hcoll_enable 0
+  --mca coll '^ucc,hcoll'
+  --mca btl tcp,vader,self
+  --mca pml ob1
+  --mca oob_tcp_if_include enP22p1s0f1
+  --mca btl_tcp_if_include enP22p1s0f1
+  -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so
+  -x UCX_NET_DEVICES=enP22p1s0f1
+  # Append the inherited value only when it is non-empty: a trailing ':' would
+  # add an empty entry, which the dynamic loader treats as the current directory.
+  -x "LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+  -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1
+  -x MSCCLPP_IBV_MODE=host-no-atomic
+  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so
+  -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1
+  -x "PATH=/home/azhpcuser/mahdieh/mscclpp/mscclpp2/bin/:$PATH"
+  -x MSCCLPP_LOG_LEVEL=ERROR
+  -x MSCCLPP_DEBUG=ERROR
+  -x MSCCLPP_IB_GID_INDEX=3
+  -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2
+  /home/azhpcuser/mahdieh/mscclpp/mscclpp/bin/python3
+  /home/azhpcuser/mahdieh/mscclpp/python/test/executor_test.py
+  -path /home/azhpcuser/mahdieh/mscclpp/test.json
+)
+
+mpirun -np 16 --hostfile ./hosts --map-by ppr:4:node "${MPI_ARGS[@]}" --size 1G --n_iters 30 #--n_graph_iters 100
+
+#mpirun -np 8 --hostfile /home/azhpcuser/binyli/hostfile --map-by ppr:4:node -mca coll_hcoll_enable 0 --mca btl tcp,vader,self --mca pml ob1 --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1 -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH -x MSCCLPP_IBV_MODE=host-no-atomic -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2 -x PATH=/home/azhpcuser/binyli/mscclpp/bin:$PATH -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=WARN -x MSCCLPP_IB_GID_INDEX=3 /home/azhpcuser/binyli/mscclpp/bin/python3 /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py -path /home/azhpcuser/binyli/mscclpp/test.json --size 1G --n_iters 30
diff --git a/run_onenode.sh b/run_onenode.sh
new file mode 100755
index 00000000..6e7541d1
--- /dev/null
+++ b/run_onenode.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Run executor_test.py against test.json under mpirun with 2 ranks and no
+# hostfile, exporting CUDA_VISIBLE_DEVICES=0,2 to the ranks.
+
+module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
+
+# Keep the mpirun arguments in an array so every option stays one word even if
+# the inherited PATH or LD_LIBRARY_PATH contains whitespace.
+MPI_ARGS=(
+  -x CUDA_VISIBLE_DEVICES=0,2
+  --mca coll '^ucc,hcoll'
+  -mca coll_hcoll_enable 0
+  --mca btl tcp,vader,self
+  --mca pml ob1
+  --mca oob_tcp_if_include enP22p1s0f1
+  --mca btl_tcp_if_include enP22p1s0f1
+  -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so
+  -x UCX_NET_DEVICES=enP22p1s0f1
+  # Append the inherited value only when it is non-empty: a trailing ':' would
+  # add an empty entry, which the dynamic loader treats as the current directory.
+  -x "LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+  # NOTE(review): MSCCLPP_IBV_MODE is exported twice with different values
+  # (here and two lines below); confirm which one is intended and drop the other.
+  -x MSCCLPP_IBV_MODE=host
+  -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1
+  -x MSCCLPP_IBV_MODE=host-no-atomic
+  -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so
+  -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1
+  -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_3
+  -x "PATH=/home/azhpcuser/mahdieh/mscclpp/mscclpp2/bin/:$PATH"
+  -x MSCCLPP_LOG_LEVEL=ERROR
+  -x MSCCLPP_DEBUG=ERROR
+  -x MSCCLPP_IB_GID_INDEX=3
+  /home/azhpcuser/mahdieh/mscclpp/mscclpp/bin/python3
+  /home/azhpcuser/mahdieh/mscclpp/python/test/executor_test.py
+  -path /home/azhpcuser/mahdieh/mscclpp/test.json
+)
+
+mpirun -np 2 "${MPI_ARGS[@]}" --size 4K --n_iters 500 --n_graph_iters 100