# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse

from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer


def allreduce_nvls(gpus, instances):
    """
    Allreduce via the NVLS channel.
    Steps:
    1. Sync all the ranks to make sure the data is ready.
    2. Call group_load_reduce to reduce the data.
    3. Call group_store to propagate the reduced data to all the ranks.
    """
    size = gpus
    chunksperloop = gpus
    collective = AllReduce(size, chunksperloop, True)
    with MSCCLPPProgram(
        "allreduce_nvls",
        collective,
        size,
        instances,
    ):
        # Each rank owns chunk `rank` of the input buffer: it reduces that
        # chunk across all ranks via NVLS, then broadcasts the result.
        for rank in range(size):
            index = rank
            c = chunk(rank, Buffer.input, index)
            reduce_chunks = []
            # Signal every peer that this rank's chunk is ready, and collect
            # the peers' chunks that will participate in the reduction.
            for nghr in range(size):
                if rank != nghr:
                    c_peer = chunk(nghr, Buffer.input, index)
                    reduce_chunks.append(c_peer)
                    c.signal(nghr, Buffer.input, index, sendtb=0)
            # Wait until every peer has signaled that its data is ready.
            for nghr in range(size):
                if rank != nghr:
                    c.wait(nghr, Buffer.input, index, recvtb=0)
            # NVLS load-reduce: reduce this chunk across all ranks.
            c = c.group_load_reduce(reduce_chunks, recvtb=0)
            # NVLS store: propagate the reduced chunk to all other ranks.
            ngbrs = [nghr for nghr in range(size) if nghr != rank]
            c.group_store(ngbrs, sendtb=0)

        Json()
        Check()


parser = argparse.ArgumentParser()
parser.add_argument("num_gpus", type=int, help="number of GPUs")
parser.add_argument("instances", type=int, help="number of instances")
args = parser.parse_args()

allreduce_nvls(args.num_gpus, args.instances)
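
# Usage sketch (an illustration, not part of the original script): assuming
# this file is saved as allreduce_nvls.py and that Json() emits the generated
# execution plan on stdout, the plan for 8 GPUs with a single instance could
# be captured with a shell redirect, e.g.:
#
#   python allreduce_nvls.py 8 1 > allreduce_nvls.json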