mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-06-06 07:52:00 +00:00
Merge mscclpp-lang to mscclpp project (#442)
First step to merge msccl-tools into the mscclpp repo. This step moves all msccl-related code, passes the current tests, and does some necessary refactoring. Changes: add the `mscclpp.language` module; add the `_InstructionOptimizer` and `DagOptimizer` classes to optimize the DAG; add `DagLower` to lower the DAG to an intermediate representation; add documentation for mscclpp.language; remove msccl-related code.
This commit is contained in:
55
python/examples/allgather_barrier.py
Normal file
55
python/examples/allgather_barrier.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import argparse
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.buffer import Buffer
|
||||
from mscclpp.language.collectives import AllGather
|
||||
from mscclpp.language.types import ChannelType, ReplicationPolicy
|
||||
|
||||
|
||||
def allgather_test(gpus, instances):
    """
    Build an all-pairs allgather program that shows how to use an explicit barrier in the MSCCL++ DSL.

    Steps:
    1. Each rank puts its input chunk into every peer's output buffer and copies it into its own output buffer.
    2. Each rank calls a barrier across the thread blocks used in step 1, then signals every peer.
    3. Each rank waits for the signal from every peer.

    Json() and Check() are called on the finished program.

    Args:
        gpus: number of ranks taking part in the collective.
        instances: instance count forwarded to MSCCLPPProgram.
    """
    collective = AllGather(gpus, 1, False)
    with MSCCLPPProgram(
        "allgather_with_barrier",
        collective,
        gpus,
        instances,
        protocol="Simple",
        replication_policy=ReplicationPolicy.interleaved,
    ):
        all_ranks = range(gpus)

        for src in all_ranks:
            local = chunk(src, Buffer.input, 0, 1)
            # Thread block `dst` handles the transfer towards rank `dst`.
            for dst in all_ranks:
                if dst == src:
                    local.copy(src, Buffer.output, src, sendtb=dst)
                else:
                    local.put(dst, Buffer.output, src, sendtb=dst, chan_type=ChannelType.sm)

            # Explicit barrier over every thread block used above, issued before any signal.
            rank(src).barrier(tb_list=list(all_ranks))

            for dst in all_ranks:
                if dst == src:
                    continue
                local.signal(dst, Buffer.output, src, sendtb=dst, chan_type=ChannelType.sm)

        for dst in all_ranks:
            for src in all_ranks:
                received = chunk(dst, Buffer.output, src, 1)
                if dst != src:
                    received.wait(src, Buffer.input, src, recvtb=src, chan_type=ChannelType.sm)

        Json()
        Check()
|
||||
|
||||
|
||||
# Parse the command line and generate the program only when this file is run
# as a script, so that importing the module has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("num_gpus", type=int, help="number of gpus")
    parser.add_argument("instances", type=int, help="number of instances")
    args = parser.parse_args()

    allgather_test(args.num_gpus, args.instances)
|
||||
65
python/examples/allreduce_allpairs.py
Normal file
65
python/examples/allreduce_allpairs.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.collectives import AllReduce
|
||||
from mscclpp.language.buffer import Buffer
|
||||
|
||||
|
||||
def allreduce_allpairs(gpus, instances, protocol):
    """
    Build an all-pairs allreduce program that distributes the result with put semantics.

    The collective uses gpus * gpus chunks; rank r reduces chunks r * gpus .. r * gpus + gpus - 1,
    one chunk per thread block.

    Steps:
    1. Ranks signal each other and wait, so the input data is known to be ready.
    2. Each rank reduces its own chunks with the matching chunks of all peers.
    3. Each rank puts the reduced chunks to all peers.
    4. Ranks signal each other and wait, so the reduced data is known to have arrived.

    Args:
        gpus: number of ranks taking part in the collective.
        instances: instance count forwarded to MSCCLPPProgram.
        protocol: protocol name forwarded to MSCCLPPProgram.
    """
    size = gpus
    collective = AllReduce(size, size * size, True)
    with MSCCLPPProgram("allreduce_pairs", collective, size, instances, protocol=protocol):
        for owner in range(size):
            peers = [p for p in range(size) if p != owner]
            for tb in range(size):
                own_slot = owner * size + tb
                own = chunk(owner, Buffer.input, own_slot)

                # Step 1: tell every peer that the chunk it is going to reduce is ready here ...
                for peer in peers:
                    peer_slot = peer * size + tb
                    chunk(owner, Buffer.input, peer_slot).signal(peer, Buffer.input, peer_slot, sendtb=tb)
                # ... and wait until every peer has said the same about our chunk.
                for peer in peers:
                    own.wait(peer, Buffer.input, own_slot, recvtb=tb)

                # Step 2: reduce the peers' copies into our chunk, then put the result to every peer.
                for peer in peers:
                    own.reduce(chunk(peer, Buffer.input, own_slot), recvtb=tb)
                for peer in peers:
                    own.put(peer, Buffer.input, own_slot, sendtb=tb)

                # Step 3: signal that the reduced chunk has been put, and wait for the peers' results.
                for peer in peers:
                    own.signal(peer, Buffer.input, own_slot, sendtb=tb)
                for peer in peers:
                    peer_slot = peer * size + tb
                    chunk(owner, Buffer.input, peer_slot).wait(peer, Buffer.input, peer_slot, recvtb=tb)

        Json()
        Check()
|
||||
|
||||
|
||||
# Parse the command line and generate the program only when this file is run
# as a script, so that importing the module has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("num_gpus", type=int, help="number of gpus")
    parser.add_argument("instances", type=int, help="number of instances")
    parser.add_argument("--protocol", type=str, default="Simple", choices=["Simple"], help="Protocol")

    args = parser.parse_args()

    allreduce_allpairs(args.num_gpus, args.instances, args.protocol)
|
||||
78
python/examples/allreduce_allpairs_get.py
Normal file
78
python/examples/allreduce_allpairs_get.py
Normal file
@@ -0,0 +1,78 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.collectives import AllReduce
|
||||
from mscclpp.language.buffer import Buffer
|
||||
|
||||
|
||||
def allreduce_allpairs(gpus, instances):
    """
    Build an all-pairs allreduce program that collects the result with get semantics.

    The collective uses gpus * gpus chunks; rank r reduces chunks r * gpus .. r * gpus + gpus - 1,
    one chunk per thread block.

    Steps:
    1. Ranks signal each other and wait, so the input data is known to be ready.
    2. Each rank reduces its own chunks with the matching chunks read from all peers.
    3. Each rank signals all peers that its reduced chunks are ready.
    4. Each rank waits for those signals, then gets the reduced chunks from every peer.

    Args:
        gpus: number of ranks taking part in the collective.
        instances: instance count forwarded to MSCCLPPProgram.
    """
    size = gpus
    collective = AllReduce(size, size * size, True)
    with MSCCLPPProgram("allreduce_pairs", collective, size, instances, protocol="Simple"):
        # Phase 1: every rank reduces the chunks it owns.
        for owner in range(size):
            peers = [p for p in range(size) if p != owner]
            # Peers visited starting right after `owner` and wrapping around.
            rotated_peers = [(owner + shift) % size for shift in range(1, size)]
            for tb in range(size):
                own_slot = owner * size + tb
                own = chunk(owner, Buffer.input, own_slot)

                # Make sure the data is ready on both sides before reading it.
                for peer in peers:
                    peer_slot = peer * size + tb
                    chunk(owner, Buffer.input, peer_slot).signal(peer, Buffer.input, peer_slot, sendtb=tb)
                for peer in peers:
                    own.wait(peer, Buffer.input, own_slot, recvtb=tb)

                # Reduce the peers' copies into our chunk.
                for peer in rotated_peers:
                    own.reduce(chunk(peer, Buffer.input, own_slot), recvtb=tb)

                # Announce that the reduced chunk is ready.
                for peer in peers:
                    own.signal(peer, Buffer.input, own_slot, sendtb=tb)

        # Phase 2: wait until the peers' chunks are ready, then get them.
        for owner in range(size):
            peers = [p for p in range(size) if p != owner]
            rotated_peers = [(owner + shift) % size for shift in range(1, size)]
            for tb in range(size):
                for peer in peers:
                    peer_slot = peer * size + tb
                    chunk(owner, Buffer.input, peer_slot).wait(peer, Buffer.input, peer_slot, recvtb=tb)
                for peer in rotated_peers:
                    peer_slot = peer * size + tb
                    chunk(owner, Buffer.input, peer_slot).get(peer, Buffer.input, peer_slot, recvtb=tb)

        Json()
        Check()
|
||||
|
||||
|
||||
# Parse the command line and generate the program only when this file is run
# as a script, so that importing the module has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("num_gpus", type=int, help="number of gpus")
    parser.add_argument("instances", type=int, help="number of instances")

    args = parser.parse_args()

    allreduce_allpairs(args.num_gpus, args.instances)
|
||||
69
python/examples/allreduce_allpairs_packet.py
Normal file
69
python/examples/allreduce_allpairs_packet.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.collectives import AllReduce
|
||||
from mscclpp.language.buffer import Buffer
|
||||
|
||||
|
||||
def allreduce_allpairs(gpus, instances):
    """
    Build an all-pairs allreduce program that uses the packet format ("LL" protocol, double scratch buffer).

    Steps:
    1. Each rank sends its nth group of chunks to the nth rank's scratch space.
    2. Each rank reduces its own chunks with the data the other ranks placed in its scratch space.
    3. Each rank sends the reduced chunks to the scratch space of all other ranks.
    4. Each rank copies the final reduced result out of its scratch space.

    Args:
        gpus: number of ranks taking part in the collective.
        instances: instance count forwarded to MSCCLPPProgram.
    """
    size = gpus
    collective = AllReduce(size, size * size, True)
    with MSCCLPPProgram(
        "allreduce_packets",
        collective,
        size,
        instances,
        protocol="LL",
        use_double_scratch_buffer=True,
    ):
        # Step 1: rank `src` sends the `size` chunks that rank `dst` will reduce into dst's scratch,
        # using thread block `dst`.
        for src in range(size):
            for dst in range(size):
                if dst != src:
                    outgoing = chunk(src, Buffer.input, dst * size, size)
                    outgoing.put_packet(dst, "scratch", index=src * size, sendtb=dst)

        # Steps 2 and 3: reduce each owned chunk on its own thread block (`size` thread blocks per
        # rank), then send the result into the second region of every peer's scratch.
        result_base = size * size
        for owner in range(size):
            peers = [p for p in range(size) if p != owner]
            for offset in range(size):
                own = chunk(owner, Buffer.input, owner * size + offset)
                for peer in peers:
                    own.reduce_packet(chunk(owner, "scratch", peer * size + offset), recvtb=offset)
                for peer in peers:
                    own.put_packet(peer, "scratch", result_base + owner * size + offset, sendtb=offset)

        # Step 4: copy the peers' reduced chunks from scratch back into the input buffer.
        for owner in range(size):
            for peer in range(size):
                if peer == owner:
                    continue
                incoming = chunk(owner, "scratch", result_base + peer * size, size)
                incoming.copy_packet(owner, Buffer.input, peer * size, sendtb=peer)

        Json()
        Check()
|
||||
|
||||
|
||||
# Parse the command line and generate the program only when this file is run
# as a script, so that importing the module has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("num_gpus", type=int, help="number of gpus")
    parser.add_argument("instances", type=int, help="number of instances")

    args = parser.parse_args()

    allreduce_allpairs(args.num_gpus, args.instances)
|
||||
55
python/examples/allreduce_nvls.py
Normal file
55
python/examples/allreduce_nvls.py
Normal file
@@ -0,0 +1,55 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.collectives import AllReduce
|
||||
from mscclpp.language.buffer import Buffer
|
||||
|
||||
|
||||
def allreduce_nvls(gpus, instances):
    """
    Build an allreduce program based on group_load_reduce / group_store (NVLS channel).

    Steps:
    1. Ranks signal each other and wait, so the input data is known to be ready.
    2. Each rank calls group_load_reduce on its chunk together with the matching chunks of all peers.
    3. Each rank calls group_store to propagate the reduced chunk to all peers.

    Args:
        gpus: number of ranks taking part in the collective.
        instances: instance count forwarded to MSCCLPPProgram.
    """
    size = gpus
    collective = AllReduce(size, size, True)
    with MSCCLPPProgram("allreduce_nvls", collective, size, instances):
        # Rank `owner` is responsible for chunk index `owner`; everything runs on thread block 0.
        for owner in range(size):
            peers = [p for p in range(size) if p != owner]
            own = chunk(owner, Buffer.input, owner)

            # Collect the peers' copies of this chunk and signal each peer along the way.
            peer_chunks = []
            for peer in peers:
                peer_chunks.append(chunk(peer, Buffer.input, owner))
                own.signal(peer, Buffer.input, owner, sendtb=0)
            for peer in peers:
                own.wait(peer, Buffer.input, owner, recvtb=0)

            reduced = own.group_load_reduce(peer_chunks, recvtb=0)
            reduced.group_store(peers, sendtb=0)

        Json()
        Check()
|
||||
|
||||
|
||||
# Parse the command line and generate the program only when this file is run
# as a script, so that importing the module has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("num_gpus", type=int, help="number of gpus")
    parser.add_argument("instances", type=int, help="number of instances")

    args = parser.parse_args()

    allreduce_nvls(args.num_gpus, args.instances)
|
||||
59
python/examples/allreduce_ring.py
Normal file
59
python/examples/allreduce_ring.py
Normal file
@@ -0,0 +1,59 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.collectives import AllReduce
|
||||
from mscclpp.language.buffer import Buffer
|
||||
|
||||
|
||||
def allreduce_ring(size, instances):
    """
    Build a ring-based allreduce program.

    Steps:
    1. Each rank signals the next rank and waits for the signal from the previous rank, so the
       previous rank's data is known to be ready.
    2. Each rank reduces the previous rank's chunk into its own; this repeats for size - 1 steps.
    3. Once everything is reduced, the chunks are propagated around the ring with put, signal and wait.

    Args:
        size: number of ranks in the ring (also the number of chunks).
        instances: instance count forwarded to MSCCLPPProgram.
    """
    collective = AllReduce(size, size, True)
    with MSCCLPPProgram(
        "allreduce_ring",  # plain literal: the original f-string had no placeholders
        collective,
        size,
        instances,
        protocol="Simple",
    ):
        # NOTE(review): the trailing positional 0 in signal()/wait() is presumably the thread block
        # id (the neighbouring calls pass sendtb=0 / recvtb=0) - confirm against the DSL signature.

        # Reduce ring
        for step in range(0, size - 1):
            for index in range(0, size):
                cur_rank = (index + step) % size
                next_rank = (index + step + 1) % size
                prev_rank = (index + step - 1) % size
                prev_index = (index + size - 1) % size

                c = chunk(cur_rank, Buffer.input, index)
                c.signal(next_rank, Buffer.input, index, 0)

                c = chunk(cur_rank, Buffer.input, prev_index)
                c.wait(prev_rank, Buffer.input, prev_index, 0)
                c.reduce(chunk(prev_rank, Buffer.input, prev_index), recvtb=0)

        # Propagate ring
        for step in range(-1, size - 2):
            for index in range(0, size):
                cur_rank = (index + step) % size
                next_rank = (index + step + 1) % size
                prev_rank = (index + step - 1) % size
                prev_index = (index + size - 1) % size

                c = chunk(cur_rank, Buffer.input, index)
                c.put(next_rank, Buffer.input, index, sendtb=0)
                c.signal(next_rank, Buffer.input, index, 0)

                c = chunk(cur_rank, Buffer.input, prev_index)
                c.wait(prev_rank, Buffer.input, prev_index, 0)

        Json()
        Check()
|
||||
|
||||
|
||||
# Parse the command line and generate the program only when this file is run
# as a script, so that importing the module has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("num_gpus", type=int, help="number of gpus")
    parser.add_argument("instances", type=int, help="number of instances")
    args = parser.parse_args()

    allreduce_ring(args.num_gpus, args.instances)
|
||||
57
python/examples/send_recv_packet.py
Normal file
57
python/examples/send_recv_packet.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.collectives import SendRecv
|
||||
from mscclpp.language.buffer import Buffer
|
||||
from mscclpp.language.types import ChannelType
|
||||
|
||||
|
||||
def send_recv(instances):
    """
    Build a two-rank send/receive program over proxy channels using the "LL" protocol and a double scratch buffer.

    Steps:
    1. Each rank sends its input chunk, in packet format, to the other rank's scratch buffer via a proxy channel.
    2. Each rank copies the packet data from its scratch buffer into its output buffer.

    Args:
        instances: instance count forwarded to MSCCLPPProgram.
    """
    size = 2
    collective = SendRecv(size, 1, False)
    with MSCCLPPProgram(
        "send_recv",
        collective,
        size,
        instances,
        protocol="LL",
        use_double_scratch_buffer=True,
    ):
        for sender in range(size):
            for receiver in range(size):
                if receiver != sender:
                    outgoing = chunk(sender, Buffer.input, 0)
                    outgoing.put_packet(
                        receiver,
                        "scratch",
                        1,
                        sendtb=0,
                        chan_type=ChannelType.proxy,
                        temp_buffer="scratch",
                        temp_buffer_index=0,
                    )

        for receiver in range(size):
            chunk(receiver, "scratch", 1).copy_packet(receiver, Buffer.output, 0, sendtb=0)

        Json()
        Check()
|
||||
|
||||
|
||||
# Parse the command line and generate the program only when this file is run
# as a script, so that importing the module has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("instances", type=int, help="number of instances")

    args = parser.parse_args()

    send_recv(args.instances)
|
||||
56
python/examples/send_recv_proxy.py
Normal file
56
python/examples/send_recv_proxy.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
from mscclpp.language import *
|
||||
from mscclpp.language.buffer import Buffer
|
||||
from mscclpp.language.collectives import SendRecv
|
||||
from mscclpp.language.types import ChannelType
|
||||
|
||||
|
||||
def send_recv(instances):
    """
    Build a two-rank send/receive program over proxy channels.

    Steps:
    1. Each rank puts its input chunk into the other rank's scratch buffer, then signals and flushes.
    2. Each rank waits for the other rank's data, then copies it from scratch into its output buffer.

    Args:
        instances: instance count forwarded to MSCCLPPProgram.
    """
    size = 2
    collective = SendRecv(size, 1, False)
    with MSCCLPPProgram("send_recv", collective, size, instances):
        for sender in range(size):
            for receiver in range(size):
                if receiver != sender:
                    outgoing = chunk(sender, Buffer.input, 0)
                    outgoing.put(receiver, "scratch", 1, sendtb=0, chan_type=ChannelType.proxy)
                    outgoing.signal(receiver, "scratch", 1, sendtb=0, chan_type=ChannelType.proxy)
                    outgoing.flush(receiver, "scratch", 1, sendtb=0, chan_type=ChannelType.proxy)

        for receiver in range(size):
            incoming = chunk(receiver, "scratch", 1)
            # With exactly two ranks the peer of rank r is 1 - r.
            incoming.wait(1 - receiver, Buffer.input, 0, recvtb=0, chan_type=ChannelType.proxy)
            incoming.copy(receiver, Buffer.output, 0, sendtb=0)

        Json()
        Check()
|
||||
|
||||
|
||||
# Parse the command line and generate the program only when this file is run
# as a script, so that importing the module has no side effects.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("instances", type=int, help="number of instances")

    args = parser.parse_args()

    send_recv(args.instances)
|
||||
Reference in New Issue
Block a user