mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
for 4 nodes
This commit is contained in:
@@ -61,6 +61,26 @@ def bench_time(n_iters: int, n_graph_iters: int, func: Union[Callable, list[Call
|
||||
return cp.cuda.get_elapsed_time(start, end) / n_iters * 1000.0 / n_graph_iters
|
||||
|
||||
|
||||
def get_prev_rank(my_rank: int, num_ranks: int, split_mask: int) -> int:
    """Return the rank preceding ``my_rank`` in the split-mask ring topology.

    Ranks are partitioned into groups of ``split_mask + 1`` consecutive ranks.
    The ring walks backwards one group at a time while keeping the position
    inside the group (the low bits selected by ``split_mask``) fixed.

    NOTE(review): ``my_rank & split_mask`` and ``my_rank // (split_mask + 1)``
    agree only when ``split_mask + 1`` is a power of two (mask of all-ones) —
    presumably guaranteed by the caller; confirm.
    """
    span = split_mask + 1
    total_groups = num_ranks // span
    offset = my_rank & split_mask
    current_group = my_rank // span
    # Add total_groups before the modulo so the step stays non-negative.
    previous_group = (current_group + total_groups - 1) % total_groups
    return previous_group * span + offset
|
||||
|
||||
|
||||
def get_next_rank(my_rank: int, num_ranks: int, split_mask: int) -> int:
    """Return the rank following ``my_rank`` in the split-mask ring topology.

    Mirror of ``get_prev_rank``: ranks form groups of ``split_mask + 1``,
    and the ring advances one whole group forward while the in-group
    position (low bits under ``split_mask``) is preserved.

    NOTE(review): correctness of mixing the bitmask with integer division
    presumably relies on ``split_mask + 1`` being a power of two; confirm
    against the caller.
    """
    span = split_mask + 1
    offset = my_rank & split_mask
    # Wrap around to the first group after the last one.
    neighbor_group = (my_rank // span + 1) % (num_ranks // span)
    return neighbor_group * span + offset
|
||||
|
||||
|
||||
def bench_correctness(
|
||||
collective: str,
|
||||
input_buf: Union[cp.ndarray, list[cp.ndarray]],
|
||||
@@ -71,6 +91,7 @@ def bench_correctness(
|
||||
num_ranks: int,
|
||||
n_iters: int,
|
||||
func: Union[Callable, list[Callable]],
|
||||
split_mask: int = 0,
|
||||
):
|
||||
"""Validate correctness. For sendrecv, buffers and func are lists of 2 for double-buffer."""
|
||||
type_size = cp.dtype(parse_dtype(dtype_str)).itemsize
|
||||
@@ -123,6 +144,9 @@ def bench_correctness(
|
||||
+ struct.pack("Q", cur_input.nbytes // type_size)
|
||||
+ pack(num_ranks, rank, i)
|
||||
)
|
||||
if "sendrecv" in collective:
|
||||
prev_rank = get_prev_rank(rank, num_ranks, split_mask)
|
||||
test_data_params += pack(prev_rank)
|
||||
test_data_kernel.launch_kernel(test_data_params, nblocks, nthreads, 0, stream)
|
||||
graph = stream.end_capture()
|
||||
graph.launch(stream)
|
||||
@@ -208,6 +232,7 @@ def main(
|
||||
packet_type: PacketType = PacketType.LL16,
|
||||
n_iters: int = 10,
|
||||
n_graph_iters: int = 10,
|
||||
split_mask: int = 0,
|
||||
):
|
||||
mscclpp_group = CommGroup(MPI.COMM_WORLD)
|
||||
cp.cuda.Device(mscclpp_group.my_rank % mscclpp_group.nranks_per_node).use()
|
||||
@@ -270,6 +295,7 @@ def main(
|
||||
mscclpp_group.nranks,
|
||||
n_iters,
|
||||
executor_funcs if sendrecv_mode else executor_func,
|
||||
split_mask=split_mask,
|
||||
)
|
||||
|
||||
mscclpp_group.barrier()
|
||||
@@ -298,6 +324,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
|
||||
parser.add_argument("--n_iters", type=int, default=10)
|
||||
parser.add_argument("--n_graph_iters", type=int, default=10)
|
||||
parser.add_argument("--split_mask", type=lambda x: int(x, 0), default=0x0, help="split mask for sendrecv (e.g. 0x3)")
|
||||
args = parser.parse_args()
|
||||
|
||||
packet_type = PacketType.LL16
|
||||
@@ -313,4 +340,5 @@ if __name__ == "__main__":
|
||||
packet_type,
|
||||
args.n_iters,
|
||||
args.n_graph_iters,
|
||||
args.split_mask,
|
||||
)
|
||||
|
||||
@@ -122,13 +122,14 @@ TEST_DATA_ALL_TO_ALL(float16, __half)
|
||||
TEST_DATA_ALL_TO_ALL(float32, float)
|
||||
TEST_DATA_ALL_TO_ALL(int32, int)
|
||||
|
||||
// Sendrecv verification: ring receive from prev rank.
|
||||
// Sendrecv verification: receive from prev rank in the ring.
|
||||
// Replays the same PRNG sequence that fill_data used on the sender (prev_rank).
|
||||
// prev_rank is passed explicitly since the ring topology depends on split_mask.
|
||||
#define TEST_DATA_SEND_RECV(FuncNameType, DataType) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_send_recv_##FuncNameType( \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
|
||||
int peer_rank = (my_rank - 1 + num_ranks) % num_ranks; \
|
||||
unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + peer_rank + seq); \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq, \
|
||||
int prev_rank) { \
|
||||
unsigned int seed = (unsigned int)(blockIdx.x * blockDim.x + threadIdx.x + prev_rank + seq); \
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
|
||||
seed = ranqd1(seed); \
|
||||
test_buf[i] = DataType(seed % blockDim.x) / DataType(blockDim.x); \
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
module load mpi/hpcx #mpi/hpcx-mrc #mpi/hpcx-mrc-2.23.1
|
||||
|
||||
MPI_ARGS=""
|
||||
MPI_ARGS+=" -x CUDA_VISIBLE_DEVICES=1 -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1 --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
|
||||
MPI_ARGS+=" -mca coll_hcoll_enable 0 --mca coll ^ucc,hcoll --mca btl tcp,vader,self --mca pml ob1 --mca oob_tcp_if_include enP22p1s0f1 --mca btl_tcp_if_include enP22p1s0f1"
|
||||
MPI_ARGS+=" -x MSCCLPP_IBV_SO=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/libibverbs.so -x UCX_NET_DEVICES=enP22p1s0f1 -x LD_LIBRARY_PATH=/opt/microsoft/mrc/Azure-Compute-AI-HPC-Perf-verbs-mrc/mrc-header-lib:$LD_LIBRARY_PATH"
|
||||
MPI_ARGS+=" -x MSCCLPP_SOCKET_IFNAME=enP22p1s0f1 -x MSCCLPP_IBV_MODE=host-no-atomic -x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
|
||||
MPI_ARGS+=" -x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 -x PATH=/home/azhpcuser/binyli/mscclpp_venv/bin:$PATH "
|
||||
MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_0"
|
||||
MPI_ARGS+=" -x MSCCLPP_LOG_LEVEL=ERROR -x MSCCLPP_DEBUG=ERROR -x MSCCLPP_IB_GID_INDEX=3 -x MSCCLPP_HCA_DEVICES=mlx5_1,mlx5_0,mlx5_3,mlx5_2"
|
||||
MPI_ARGS+=" /home/azhpcuser/binyli/mscclpp_venv/bin/python3 /home/azhpcuser/binyli/mscclpp/python/test/executor_test.py -path /home/azhpcuser/binyli/mscclpp/test.json"
|
||||
|
||||
|
||||
mpirun -np 4 --hostfile ./hosts --map-by ppr:1:node $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5
|
||||
mpirun -np 16 --hostfile ./hosts --map-by ppr:4:node $MPI_ARGS --size 1G --n_iters 20 --n_graph_iters 5 --split_mask 0x3
|
||||
|
||||
Reference in New Issue
Block a user