# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""
|
|
Reduce NVLS Test
|
|
|
|
This file tests the executor MULTI_LOAD_REDUCE_STORE operation using
|
|
NVLS SwitchChannels. Each GPU reduces its chunk via the
|
|
NVSwitch and broadcasts the result to all other GPUs.
|
|
"""
|
|
|
|
import argparse

from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *


def reduce_nvls(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
    chunksperloop = 1
    collective = AllReduce(gpu_size, chunksperloop, True)
    with CollectiveProgram(
        name,
        collective,
        gpu_size,
        instances=1,
        protocol="Simple",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=False,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    ):
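        # Chunk layout note: with chunksperloop=1, each rank owns exactly one
        # chunk of the input buffer, and rank g's chunk sits at offset g (see
        # buffer_offset=gpu and input_buffer[gpu : gpu + 1] below).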
        # Creating Channels
        nvls_chan = SwitchChannel(rank_list=[gpu for gpu in range(gpu_size)], buffer_type=BufferType.input)
        channels = {}
        for gpu in range(gpu_size):
            for peer in range(gpu_size):
                if peer != gpu:
                    channels[(peer, gpu)] = MemoryChannel(peer, gpu)
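        # The SwitchChannel above groups every rank over the NVLS multicast
        # binding of the input buffer; the pairwise MemoryChannels are used
        # only for the signal/wait barriers that bracket the switch reduction.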

        # Synchronization to Ensure all the GPUs are Ready
        for gpu in range(gpu_size):
            src_rank = gpu
            for peer in range(gpu_size):
                if peer != src_rank:
                    dst_rank = peer
                    channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
            for peer in range(gpu_size):
                if peer != src_rank:
                    dst_rank = peer
                    channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
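        # The loops above form a full barrier: every rank signals all peers,
        # then waits on all of them. relaxed=True requests relaxed signal/wait
        # with no per-call data fence; the data_sync=SyncType.after on the
        # waits (as the name suggests) re-establishes ordering so the NVLS
        # reduction below reads fully published input buffers.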

        # Reducing and Storing the data
        for gpu in range(gpu_size):
            buffer_offset = gpu
            rank = Rank(gpu)
            input_buffer = rank.get_input_buffer()
            nvls_chan.at_rank(gpu).reduce(
                buffer_offset=buffer_offset, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
            )
            nvls_chan.at_rank(gpu).broadcast(
                src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=buffer_offset, size=1, tb=0
            )
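        # The pair above is the MULTI_LOAD_REDUCE_STORE pattern: reduce() loads
        # chunk `gpu` from every rank's input buffer through the NVSwitch,
        # reduces it in-switch, and stores the result into this rank's chunk;
        # broadcast() then multicasts the reduced chunk back to the same offset
        # on all ranks.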

        # Synchronization to Ensure the GPUs finished
        for gpu in range(gpu_size):
            src_rank = gpu
            for peer in range(gpu_size):
                if peer != src_rank:
                    dst_rank = peer
                    channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
            for peer in range(gpu_size):
                if peer != src_rank:
                    dst_rank = peer
                    channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)
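        # Mirror of the opening barrier: here data_sync=SyncType.before (per
        # the naming) fences before the signal, so any peer whose wait
        # completes is guaranteed to observe this rank's reduced data.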

        print(JSON())


parser = argparse.ArgumentParser()

parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_gpus", type=int, help="number of gpus")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")

args = parser.parse_args()

reduce_nvls(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
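
# Example invocation (illustrative; the file name and GPU count are assumptions):
#   python3 reduce_nvls_test.py --name reduce_nvls --num_gpus 8 > reduce_nvls.json
# The generated execution plan is printed as JSON on stdout for the mscclpp
# executor to consume.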