Merge mscclpp-lang to mscclpp project (#442)

First step of merging msccl-tools into the mscclpp repo. This step moves
all msccl-related code over, makes the current tests pass, and does some
necessary refactoring.

Add `mscclpp.language` module
Add `_InstructionOptimizer` and `DagOptimizer` classes to optimize the DAG
Add `DagLower` to lower the DAG to an intermediate representation
Add documentation for mscclpp.language
Remove msccl related code
This commit is contained in:
Binyang Li
2025-01-22 09:47:37 -08:00
committed by GitHub
parent 4ee15b7ad0
commit af0bb86e07
28 changed files with 3417 additions and 18 deletions

View File

@@ -0,0 +1,55 @@
import argparse
from mscclpp.language import *
from mscclpp.language.buffer import Buffer
from mscclpp.language.collectives import AllGather
from mscclpp.language.types import ChannelType, ReplicationPolicy
def allgather_test(gpus, instances):
    """
    Build an allpairs allgather program that showcases the barrier of the MSCCL++ DSL.

    Steps:
    1. Every rank puts its input chunk into each peer's output buffer and copies
       it into its own output buffer.
    2. The rank runs a barrier over thread blocks 0..gpus-1 and then signals
       every peer.
    3. Every rank waits on each peer before the program is finished.

    Args:
        gpus: number of ranks taking part in the collective.
        instances: number of instances, forwarded to MSCCLPPProgram.
    """
    num_ranks = gpus
    collective = AllGather(num_ranks, 1, False)
    with MSCCLPPProgram(
        "allgather_with_barrier",
        collective,
        num_ranks,
        instances,
        protocol="Simple",
        replication_policy=ReplicationPolicy.interleaved,
    ):
        for src in range(gpus):
            local = chunk(src, Buffer.input, 0, 1)

            # Thread block `dst` handles the transfer towards rank `dst`;
            # the rank's own slot is filled with a local copy instead of a put.
            for dst in range(gpus):
                if dst == src:
                    local.copy(src, Buffer.output, src, sendtb=dst)
                else:
                    local.put(dst, Buffer.output, src, sendtb=dst, chan_type=ChannelType.sm)

            # Explicit barrier across the thread blocks used above, issued
            # before any peer is signalled.
            rank(src).barrier(tb_list=list(range(gpus)))

            for dst in range(gpus):
                if dst == src:
                    continue
                local.signal(dst, Buffer.output, src, sendtb=dst, chan_type=ChannelType.sm)

        for receiver in range(gpus):
            for peer in range(gpus):
                # The chunk reference is created for every slot (as before),
                # but only remote slots need a wait.
                received = chunk(receiver, Buffer.output, peer, 1)
                if peer != receiver:
                    received.wait(peer, Buffer.input, peer, recvtb=peer, chan_type=ChannelType.sm)

        Json()
        Check()
# Command-line entry point: two positional integer arguments.
parser = argparse.ArgumentParser()
for arg_name, arg_help in (("num_gpus", "number of gpus"), ("instances", "number of instances")):
    parser.add_argument(arg_name, type=int, help=arg_help)
args = parser.parse_args()
allgather_test(gpus=args.num_gpus, instances=args.instances)

View File

@@ -0,0 +1,65 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer
def allreduce_allpairs(gpus, instances, protocol):
    """
    Build an all-pairs allreduce program that distributes results with put semantics.

    Steps:
    1. Sync all ranks to ensure the data is ready.
    2. Each rank reduces its own chunks with the matching chunks of all peers.
    3. Put the reduced data to all peers.
    4. Sync all ranks to ensure the data is received.

    Args:
        gpus: number of ranks; gpus * gpus is passed to AllReduce as the
            chunks-per-loop value.
        instances: number of instances, forwarded to MSCCLPPProgram.
        protocol: protocol name, forwarded to MSCCLPPProgram.
    """
    size = gpus
    chunksperloop = gpus * gpus
    collective = AllReduce(size, chunksperloop, True)
    with MSCCLPPProgram("allreduce_pairs", collective, size, instances, protocol=protocol):
        # The loop variable is intentionally not named `rank`: that name is the
        # DSL's rank() helper pulled in by `from mscclpp.language import *`.
        for cur_rank in range(size):
            peers = [p for p in range(size) if p != cur_rank]
            index = cur_rank * size
            for tb in range(size):
                c = chunk(cur_rank, Buffer.input, index + tb)

                # step1 make sure the data is ready
                for nghr in peers:
                    # signal peer the buffer is ready
                    peer_index = nghr * size
                    c_peer = chunk(cur_rank, Buffer.input, peer_index + tb)
                    c_peer.signal(nghr, Buffer.input, peer_index + tb, sendtb=tb)
                for nghr in peers:
                    c.wait(nghr, Buffer.input, index + tb, recvtb=tb)

                # step2 reduce the chunks and send to peers
                for nghr in peers:
                    c.reduce(chunk(nghr, Buffer.input, index + tb), recvtb=tb)
                for nghr in peers:
                    c.put(nghr, Buffer.input, index + tb, sendtb=tb)

                # step3 signal the peers buffer is ready
                for nghr in peers:
                    c.signal(nghr, Buffer.input, index + tb, sendtb=tb)
                for nghr in peers:
                    peer_index = nghr * size
                    c_peer = chunk(cur_rank, Buffer.input, peer_index + tb)
                    c_peer.wait(nghr, Buffer.input, peer_index + tb, recvtb=tb)

        Json()
        Check()
# Command-line entry point: two positional integers plus an optional protocol.
parser = argparse.ArgumentParser()
for arg_name, arg_help in (("num_gpus", "number of gpus"), ("instances", "number of instances")):
    parser.add_argument(arg_name, type=int, help=arg_help)
parser.add_argument("--protocol", type=str, default="Simple", choices=["Simple"], help="Protocol")
args = parser.parse_args()
allreduce_allpairs(gpus=args.num_gpus, instances=args.instances, protocol=args.protocol)

View File

@@ -0,0 +1,78 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer
def allreduce_allpairs(gpus, instances):
    """
    Build an all-pairs allreduce program that distributes results with get semantics.

    Steps:
    1. Sync all ranks to ensure the data is ready.
    2. Each rank reduces its own chunks with the matching chunks of all peers.
    3. Signal all ranks to notify that the reduced data is ready.
    4. Wait for all chunks to be ready, then get the reduced chunks from all peers.

    Args:
        gpus: number of ranks; gpus * gpus is passed to AllReduce as the
            chunks-per-loop value.
        instances: number of instances, forwarded to MSCCLPPProgram.
    """
    size = gpus
    chunksperloop = gpus * gpus
    collective = AllReduce(size, chunksperloop, True)
    with MSCCLPPProgram(
        "allreduce_pairs",
        collective,
        size,
        instances,
        protocol="Simple",
    ):
        # Phase 1: every rank reduces its own block of chunks in place in the
        # input buffer. The loop variable is intentionally not named `rank`,
        # which is the DSL's rank() helper from `from mscclpp.language import *`.
        for cur_rank in range(size):
            peers = [p for p in range(size) if p != cur_rank]
            index = cur_rank * size
            for tb in range(size):
                c = chunk(cur_rank, Buffer.input, index + tb)

                # make sure the data is ready
                for nghr in peers:
                    peer_index = nghr * size
                    c_peer = chunk(cur_rank, Buffer.input, peer_index + tb)
                    c_peer.signal(nghr, Buffer.input, peer_index + tb, sendtb=tb)
                for nghr in peers:
                    c.wait(nghr, Buffer.input, index + tb, recvtb=tb)

                # reduce the chunks, visiting peers in ring order starting
                # right after cur_rank (offset 0 would be cur_rank itself)
                for offset in range(1, size):
                    nghr = (cur_rank + offset) % size
                    c.reduce(chunk(nghr, Buffer.input, index + tb), recvtb=tb)

                # tell every peer that this reduced chunk is ready
                for nghr in peers:
                    c.signal(nghr, Buffer.input, index + tb, sendtb=tb)

        # Phase 2: wait until all chunks are ready, then get them from the peers
        for cur_rank in range(size):
            peers = [p for p in range(size) if p != cur_rank]
            for tb in range(size):
                for nghr in peers:
                    index = nghr * size
                    c = chunk(cur_rank, Buffer.input, index + tb)
                    c.wait(nghr, Buffer.input, index + tb, recvtb=tb)
                for offset in range(1, size):
                    nghr = (cur_rank + offset) % size
                    index = nghr * size
                    c = chunk(cur_rank, Buffer.input, index + tb)
                    c.get(nghr, Buffer.input, index + tb, recvtb=tb)

        Json()
        Check()
# Command-line entry point: two positional integer arguments.
parser = argparse.ArgumentParser()
for arg_name, arg_help in (("num_gpus", "number of gpus"), ("instances", "number of instances")):
    parser.add_argument(arg_name, type=int, help=arg_help)
args = parser.parse_args()
allreduce_allpairs(gpus=args.num_gpus, instances=args.instances)

View File

@@ -0,0 +1,69 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer
def allreduce_allpairs(gpus, instances):
    """
    Build an all-pairs allreduce program that uses the packet format.

    Steps:
    1. Each rank sends its nth block of chunks to the nth rank's scratch space.
    2. Each rank reduces its own block with the data found in its scratch space.
    3. Each rank sends the reduced chunks to all other ranks' scratch spaces.
    4. Each rank copies the final reduced result out of its scratch space.

    Args:
        gpus: number of ranks; gpus * gpus is passed to AllReduce as the
            chunks-per-loop value.
        instances: number of instances, forwarded to MSCCLPPProgram.
    """
    size = gpus
    chunksperloop = gpus * gpus
    collective = AllReduce(size, chunksperloop, True)
    with MSCCLPPProgram(
        "allreduce_packets",
        collective,
        size,
        instances,
        protocol="LL",
        use_double_scratch_buffer=True,
    ):
        # Scatter: the block of `size` chunks owned by `receiver` goes into
        # the receiver's scratch buffer, at the slot reserved for `sender`.
        for sender in range(size):
            for receiver in range(size):
                if receiver != sender:
                    block = chunk(sender, Buffer.input, receiver * size, size)
                    block.put_packet(receiver, "scratch", index=sender * size, sendtb=receiver)

        # Local reduction: one thread block per chunk (`size` thread blocks
        # per rank), followed by a broadcast of each reduced chunk into the
        # second region of the peers' scratch buffers (offset size * size).
        for r in range(size):
            others = [p for p in range(size) if p != r]
            for offset in range(size):
                acc = chunk(r, Buffer.input, r * size + offset)
                for peer in others:
                    acc.reduce_packet(chunk(r, "scratch", peer * size + offset), recvtb=offset)
                for peer in others:
                    acc.put_packet(peer, "scratch", (size * size) + r * size + offset, sendtb=offset)

        # Gather: copy every peer's reduced block from scratch into the input buffer.
        for r in range(size):
            for peer in range(size):
                if peer == r:
                    continue
                reduced = chunk(r, "scratch", size * size + peer * size, size)
                reduced.copy_packet(r, Buffer.input, peer * size, sendtb=peer)

        Json()
        Check()
# Command-line entry point: two positional integer arguments.
parser = argparse.ArgumentParser()
for arg_name, arg_help in (("num_gpus", "number of gpus"), ("instances", "number of instances")):
    parser.add_argument(arg_name, type=int, help=arg_help)
args = parser.parse_args()
allreduce_allpairs(gpus=args.num_gpus, instances=args.instances)

View File

@@ -0,0 +1,55 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer
def allreduce_nvls(gpus, instances):
    """
    Build an allreduce program based on group load-reduce / group store (NVLS).

    Steps:
    1. Sync all the ranks to make sure the data is ready.
    2. Call group_load_reduce to reduce the data.
    3. Call group_store to propagate the data to all the ranks.

    Args:
        gpus: number of ranks; also passed to AllReduce as the chunks-per-loop value.
        instances: number of instances, forwarded to MSCCLPPProgram.
    """
    size = gpus
    chunksperloop = gpus
    collective = AllReduce(size, chunksperloop, True)
    with MSCCLPPProgram(
        "allreduce_nvls",
        collective,
        size,
        instances,
    ):
        # Rank i handles chunk i. The loop variable is intentionally not
        # named `rank`, which is the DSL's rank() helper pulled in by
        # `from mscclpp.language import *`.
        for cur_rank in range(size):
            index = cur_rank
            peers = [p for p in range(size) if p != cur_rank]
            c = chunk(cur_rank, Buffer.input, index)

            # make sure the data is ready; collect the peers' copies of the
            # chunk while signalling them
            reduce_chunks = []
            for nghr in peers:
                reduce_chunks.append(chunk(nghr, Buffer.input, index))
                c.signal(nghr, Buffer.input, index, sendtb=0)
            for nghr in peers:
                c.wait(nghr, Buffer.input, index, recvtb=0)

            # reduce across all ranks, then store the result to every peer
            c = c.group_load_reduce(reduce_chunks, recvtb=0)
            c.group_store(peers, sendtb=0)

        Json()
        Check()
# Command-line entry point: two positional integer arguments.
parser = argparse.ArgumentParser()
for arg_name, arg_help in (("num_gpus", "number of gpus"), ("instances", "number of instances")):
    parser.add_argument(arg_name, type=int, help=arg_help)
args = parser.parse_args()
allreduce_nvls(gpus=args.num_gpus, instances=args.instances)

View File

@@ -0,0 +1,59 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer
def allreduce_ring(size, instances):
    """
    Build a ring based allreduce program.

    Steps:
    1. Send a signal to the next rank and wait for the signal from the previous
       rank, to make sure the data is ready in the previous rank.
    2. Reduce the data and pass it on to the next rank.
    3. After all the data is reduced, propagate the data to all the ranks.

    Args:
        size: number of ranks; also passed to AllReduce as the chunks-per-loop value.
        instances: number of instances, forwarded to MSCCLPPProgram.
    """
    collective = AllReduce(size, size, True)
    with MSCCLPPProgram(
        "allreduce_ring",  # plain literal: the former f-string had no placeholders
        collective,
        size,
        instances,
        protocol="Simple",
    ):
        # Locals are named cur_rank rather than `rank`, which is the DSL's
        # rank() helper pulled in by `from mscclpp.language import *`.

        # Reduce ring
        for step in range(0, size - 1):
            for index in range(0, size):
                cur_rank = (index + step) % size
                next_rank = (index + step + 1) % size
                prev_rank = (index + step - 1) % size
                # chunk index that arrives from the previous rank in this step
                prev_index = (index + size - 1) % size

                c = chunk(cur_rank, Buffer.input, index)
                c.signal(next_rank, Buffer.input, index, 0)

                c = chunk(cur_rank, Buffer.input, prev_index)
                c.wait(prev_rank, Buffer.input, prev_index, 0)
                c.reduce(chunk(prev_rank, Buffer.input, prev_index), recvtb=0)

        # Propagate ring
        for step in range(-1, size - 2):
            for index in range(0, size):
                cur_rank = (index + step) % size
                next_rank = (index + step + 1) % size
                prev_rank = (index + step - 1) % size
                prev_index = (index + size - 1) % size

                c = chunk(cur_rank, Buffer.input, index)
                c.put(next_rank, Buffer.input, index, sendtb=0)
                c.signal(next_rank, Buffer.input, index, 0)

                c = chunk(cur_rank, Buffer.input, prev_index)
                c.wait(prev_rank, Buffer.input, prev_index, 0)

        Json()
        Check()
# Command-line entry point: two positional integer arguments.
parser = argparse.ArgumentParser()
for arg_name, arg_help in (("num_gpus", "number of gpus"), ("instances", "number of instances")):
    parser.add_argument(arg_name, type=int, help=arg_help)
args = parser.parse_args()
allreduce_ring(size=args.num_gpus, instances=args.instances)

View File

@@ -0,0 +1,57 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language import *
from mscclpp.language.collectives import SendRecv
from mscclpp.language.buffer import Buffer
from mscclpp.language.types import ChannelType
def send_recv(instances):
    """
    Build a two-rank send/recv program using proxy channels, the LL protocol
    and a double scratch buffer.

    Steps:
    1. Each rank sends its input chunk, in packet format, to the other rank's
       scratch buffer through a proxy channel.
    2. Each rank copies the received packet data from scratch to its output buffer.

    Args:
        instances: number of instances, forwarded to MSCCLPPProgram.
    """
    size = 2
    chunksperloop = 1
    collective = SendRecv(size, chunksperloop, False)
    with MSCCLPPProgram(
        "send_recv",
        collective,
        size,
        instances,
        protocol="LL",
        use_double_scratch_buffer=True,
    ):
        # Send phase: every ordered (sender, receiver) pair of distinct ranks.
        for sender in range(size):
            for receiver in range(size):
                if receiver != sender:
                    outgoing = chunk(sender, Buffer.input, 0)
                    outgoing.put_packet(
                        receiver,
                        "scratch",
                        1,
                        sendtb=0,
                        chan_type=ChannelType.proxy,
                        temp_buffer="scratch",
                        temp_buffer_index=0,
                    )

        # Receive phase: unpack scratch slot 1 into the output buffer.
        for r in range(size):
            incoming = chunk(r, "scratch", 1)
            incoming.copy_packet(r, Buffer.output, 0, sendtb=0)

        Json()
        Check()
# Command-line entry point: a single positional integer argument.
parser = argparse.ArgumentParser()
parser.add_argument("instances", help="number of instances", type=int)
args = parser.parse_args()
send_recv(instances=args.instances)

View File

@@ -0,0 +1,56 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language import *
from mscclpp.language.buffer import Buffer
from mscclpp.language.collectives import SendRecv
from mscclpp.language.types import ChannelType
def send_recv(instances):
    """
    Build a two-rank send/recv program using proxy channels.

    Steps:
    1. Each rank puts its input chunk into the other rank's scratch buffer,
       signals the other rank that the data has been sent, and flushes.
    2. Each rank waits for the data to arrive, then copies it to the output buffer.

    Args:
        instances: number of instances, forwarded to MSCCLPPProgram.
    """
    size = 2
    chunksperloop = 1
    collective = SendRecv(size, chunksperloop, False)
    with MSCCLPPProgram(
        "send_recv",
        collective,
        size,
        instances,
    ):
        # Send phase: every ordered (sender, receiver) pair of distinct ranks.
        for sender in range(size):
            for receiver in range(size):
                if receiver != sender:
                    outgoing = chunk(sender, Buffer.input, 0)
                    outgoing.put(
                        receiver,
                        "scratch",
                        1,
                        sendtb=0,
                        chan_type=ChannelType.proxy,
                    )
                    outgoing.signal(receiver, "scratch", 1, sendtb=0, chan_type=ChannelType.proxy)
                    outgoing.flush(receiver, "scratch", 1, sendtb=0, chan_type=ChannelType.proxy)

        # Receive phase: with two ranks, the peer of rank r is 1 - r.
        for r in range(size):
            incoming = chunk(r, "scratch", 1)
            incoming.wait(1 - r, Buffer.input, 0, recvtb=0, chan_type=ChannelType.proxy)
            incoming.copy(r, Buffer.output, 0, sendtb=0)

        Json()
        Check()
# Command-line entry point: a single positional integer argument.
parser = argparse.ArgumentParser()
parser.add_argument("instances", help="number of instances", type=int)
args = parser.parse_args()
send_recv(instances=args.instances)