This commit is contained in:
Ubuntu
2026-04-11 06:40:19 +00:00
parent 456ef7e5ba
commit 36abcbedd3
2 changed files with 621 additions and 19 deletions

View File

@@ -4,9 +4,9 @@ MPI_ARGS=""
MPI_ARGS+=" -x CUDA_VISIBLE_DEVICES=1 -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1 --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/mscclpp/bin/:$PATH "
MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin:$PATH "
MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_0"
MPI_ARGS+=" /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/mscclpp/bin/python3 /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/python/test/executor_test.py -path /home/azhpcuser/mahdieh/mscclpp-unittest/mscclpp/sendrecv.json"
MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3 /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py -path /home/azhpcuser/binyli/mscclpp/test.json"
mpirun -np 2 --hostfile ./hosts --map-by ppr:1:node $MPI_ARGS --size 1K
mpirun -np 4 --hostfile ./hosts --map-by ppr:1:node $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5

634
test.json
View File

@@ -288,10 +288,10 @@
1,
1,
1,
1,
1,
1,
1
3,
3,
3,
3
]
}
],
@@ -365,8 +365,8 @@
{
"channel_type": "port",
"channel_ids": [
0,
4
4,
0
]
}
],
@@ -432,8 +432,8 @@
{
"channel_type": "port",
"channel_ids": [
1,
5
5,
1
]
}
],
@@ -499,8 +499,8 @@
{
"channel_type": "port",
"channel_ids": [
2,
6
6,
2
]
}
],
@@ -566,8 +566,610 @@
{
"channel_type": "port",
"channel_ids": [
3,
7
7,
3
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "port",
"remote_buffer_ids": [
0
]
}
]
}
],
"channels": [
{
"channel_type": "port",
"connected_to": [
2,
2,
2,
2,
0,
0,
0,
0
]
}
],
"remote_buffers": [
{
"rank": 2,
"type": "o",
"access_channel_types": [
"port"
]
}
],
"semaphores": []
},
{
"id": 2,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "nop"
},
{
"name": "pws",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "port"
}
],
"channels": [
{
"channel_type": "port",
"channel_ids": [
4,
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "port",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "nop"
},
{
"name": "pws",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "port"
}
],
"channels": [
{
"channel_type": "port",
"channel_ids": [
5,
1
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "port",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "nop"
},
{
"name": "pws",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 2,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "port"
}
],
"channels": [
{
"channel_type": "port",
"channel_ids": [
6,
2
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "port",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 3,
"ops": [
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "nop"
},
{
"name": "pws",
"src_buff": [
{
"type": "i",
"index": 3,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 3,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "port"
}
],
"channels": [
{
"channel_type": "port",
"channel_ids": [
7,
3
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "port",
"remote_buffer_ids": [
0
]
}
]
}
],
"channels": [
{
"channel_type": "port",
"connected_to": [
3,
3,
3,
3,
1,
1,
1,
1
]
}
],
"remote_buffers": [
{
"rank": 3,
"type": "o",
"access_channel_types": [
"port"
]
}
],
"semaphores": []
},
{
"id": 3,
"input_chunks": 4,
"output_chunks": 4,
"scratch_chunks": 0,
"threadblocks": [
{
"id": 0,
"ops": [
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "nop"
},
{
"name": "pws",
"src_buff": [
{
"type": "i",
"index": 0,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 0,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "port"
}
],
"channels": [
{
"channel_type": "port",
"channel_ids": [
4,
0
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "port",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 1,
"ops": [
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "nop"
},
{
"name": "pws",
"src_buff": [
{
"type": "i",
"index": 1,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 1,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "port"
}
],
"channels": [
{
"channel_type": "port",
"channel_ids": [
5,
1
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "port",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 2,
"ops": [
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "nop"
},
{
"name": "pws",
"src_buff": [
{
"type": "i",
"index": 2,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 2,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "port"
}
],
"channels": [
{
"channel_type": "port",
"channel_ids": [
6,
2
]
}
],
"remote_buffer_refs": [
{
"access_channel_type": "port",
"remote_buffer_ids": [
0
]
}
]
},
{
"id": 3,
"ops": [
{
"name": "signal",
"channel_ids": [
0
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "nop"
},
{
"name": "pws",
"src_buff": [
{
"type": "i",
"index": 3,
"size": 1
}
],
"dst_buff": [
{
"buffer_id": 0,
"index": 3,
"size": 1
}
],
"channel_ids": [
1
],
"channel_type": "port"
},
{
"name": "wait",
"channel_ids": [
0
],
"channel_type": "port"
}
],
"channels": [
{
"channel_type": "port",
"channel_ids": [
7,
3
]
}
],
@@ -589,10 +1191,10 @@
0,
0,
0,
0,
0,
0,
0
2,
2,
2,
2
]
}
],