# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""
Reduce NVLS Test

This file tests the executor MULTI_LOAD_REDUCE_STORE operation using
NVLS SwitchChannels. Each GPU reduces its chunk via the NVSwitch and
broadcasts the result to all other GPUs.
"""

import argparse
from mscclpp.language.channel import *
from mscclpp.language.rank import *
from mscclpp.language.general import *
from mscclpp.language.program import *
from mscclpp.language.collectives import *


def reduce_nvls(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
    """Build the NVLS reduce/broadcast test program and print ``JSON()``.

    All DSL operations are issued on thread block 0, in this order:
    a signal/wait round between every pair of ranks, a switch-channel
    ``reduce`` followed by ``broadcast`` of chunk ``r`` on every rank ``r``,
    and a second signal/wait round.

    Args:
        name: Program name, forwarded to ``CollectiveProgram``.
        gpu_size: Number of ranks; used for the collective, the switch
            channel rank list and every per-rank loop.
        num_threads_per_block: Forwarded to ``CollectiveProgram``.
        min_message_size: Forwarded to ``CollectiveProgram``.
        max_message_size: Forwarded to ``CollectiveProgram``.
    """
    chunks_per_loop = 1
    all_reduce = AllReduce(gpu_size, chunks_per_loop, True)

    with CollectiveProgram(
        name,
        all_reduce,
        gpu_size,
        instances=1,
        protocol="Simple",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=False,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    ):
        # One switch channel over the input buffers of all ranks, plus one
        # memory channel per ordered (peer, owner) pair of distinct ranks.
        switch = SwitchChannel(rank_list=list(range(gpu_size)), buffer_type=BufferType.input)
        mem_channels = {
            (peer, owner): MemoryChannel(peer, owner)
            for owner in range(gpu_size)
            for peer in range(gpu_size)
            if peer != owner
        }

        # Entry synchronization: each rank signals all of its peers first and
        # only then waits on them.
        for owner in range(gpu_size):
            peers = [candidate for candidate in range(gpu_size) if candidate != owner]
            for peer in peers:
                mem_channels[(peer, owner)].signal(tb=0, relaxed=True)
            for peer in peers:
                mem_channels[(peer, owner)].wait(tb=0, relaxed=True, data_sync=SyncType.after)

        # Rank r reduces chunk r through the switch channel into its own input
        # buffer, then broadcasts that same chunk back through the switch.
        for owner in range(gpu_size):
            offset = owner
            local_input = Rank(owner).get_input_buffer()
            switch.at_rank(owner).reduce(
                buffer_offset=offset,
                size=1,
                dst_chunk=local_input[owner : owner + 1],
                tb=0,
            )
            switch.at_rank(owner).broadcast(
                src_chunk=local_input[owner : owner + 1],
                buffer_offset=offset,
                size=1,
                tb=0,
            )

        # Exit synchronization: same signal-then-wait shape as the entry round,
        # but here the data sync is attached to the signal (before it).
        for owner in range(gpu_size):
            peers = [candidate for candidate in range(gpu_size) if candidate != owner]
            for peer in peers:
                mem_channels[(peer, owner)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
            for peer in peers:
                mem_channels[(peer, owner)].wait(tb=0, relaxed=True)

        # Printed while the program context is still open.
        print(JSON())


# Command-line driver: parse the options and emit the program.
parser = argparse.ArgumentParser()
parser.add_argument("--name", type=str, help="name of the program")
parser.add_argument("--num_gpus", type=int, help="number of gpus")
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")

args = parser.parse_args()

reduce_nvls(
    args.name,
    args.num_gpus,
    args.num_threads_per_block,
    args.min_message_size,
    args.max_message_size,
)