mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 01:36:10 +00:00
Merge mscclpp-lang to mscclpp project (#442)
First step to merge msccl-tools into mscclpp repo. In this step will move all msccl related code, pass the current tests and do some necessary refactor. Add `mscclpp.language` module Add `_InstructionOptimizer` and `DagOptimizer` class to optimize the dag Add `DagLower` to lower dag to intermediate representation Add documents for mscclpp.language Remove msccl related code
This commit is contained in:
65
python/examples/allreduce_allpairs.py
Normal file
65
python/examples/allreduce_allpairs.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.collectives import AllReduce
|
||||
from mscclpp.language.buffer import Buffer
|
||||
|
||||
|
||||
def allreduce_allpairs(gpus, instances, protocol):
|
||||
"""
|
||||
Demonstrate allreduce with all pairs algorithm using put semantics.
|
||||
Steps:
|
||||
1. Sync all ranks to ensure the data is ready.
|
||||
2. Each rank reads chunks from all peers and reduces the data.
|
||||
3. Put the reduced data to all peers.
|
||||
4. Sync all ranks to ensure the data is received.
|
||||
"""
|
||||
size = gpus
|
||||
chunksperloop = gpus * gpus
|
||||
collective = AllReduce(size, chunksperloop, True)
|
||||
with MSCCLPPProgram("allreduce_pairs", collective, size, instances, protocol=protocol):
|
||||
for rank in range(size):
|
||||
for tb in range(size):
|
||||
index = rank * size
|
||||
c = chunk(rank, Buffer.input, index + tb)
|
||||
# step1 make sure the data is ready
|
||||
for nghr in range(size):
|
||||
peer_index = nghr * size
|
||||
if rank != nghr:
|
||||
# signal peer the buffer is ready
|
||||
c_peer = chunk(rank, Buffer.input, peer_index + tb)
|
||||
c_peer.signal(nghr, Buffer.input, peer_index + tb, sendtb=tb)
|
||||
for nghr in range(size):
|
||||
if rank != nghr:
|
||||
c.wait(nghr, Buffer.input, index + tb, recvtb=tb)
|
||||
# step2 reduce the chunks and send to peers
|
||||
for nghr in range(size):
|
||||
if rank != nghr:
|
||||
c.reduce(chunk(nghr, Buffer.input, index + tb), recvtb=tb)
|
||||
for nghr in range(size):
|
||||
if rank != nghr:
|
||||
c.put(nghr, Buffer.input, index + tb, sendtb=tb)
|
||||
# step3 signal the peers buffer is ready
|
||||
for nghr in range(size):
|
||||
if rank != nghr:
|
||||
c.signal(nghr, Buffer.input, index + tb, sendtb=tb)
|
||||
for nghr in range(size):
|
||||
if rank != nghr:
|
||||
peer_index = nghr * size
|
||||
c_peer = chunk(rank, Buffer.input, peer_index + tb)
|
||||
c_peer.wait(nghr, Buffer.input, peer_index + tb, recvtb=tb)
|
||||
|
||||
Json()
|
||||
Check()
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("num_gpus", type=int, help="number of gpus")
|
||||
parser.add_argument("instances", type=int, help="number of instances")
|
||||
parser.add_argument("--protocol", type=str, default="Simple", choices=["Simple"], help="Protocol")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
allreduce_allpairs(args.num_gpus, args.instances, args.protocol)
|
||||
Reference in New Issue
Block a user