Files
mscclpp/python/examples/allgather_allpairs_multinodes_packets.py
Binyang Li 7f3b088744 Add multi-nodes example & update doc (#455)
Documentation update:

*
[`docs/design/mscclpp-dsl.md`](diffhunk://#diff-02a69290fb3e02b8a069bf915fbf5266cfc2ac51c6e9ff8b5b19df51ed909b22L114-R114):
Updated the link to the examples folder to reflect the correct path.

New example script:

*
[`python/examples/allgather_allpairs_multinodes_packets.py`](diffhunk://#diff-ab42c16ecca0680d55b60b82a6913138c5fba4069b9c4493fbe8c72217fe54bcR1-R76):
Added a new example script demonstrating the allgather all-pairs
algorithm across multiple nodes using packet communication.

IR module improvements:

*
[`python/mscclpp/language/ir.py`](diffhunk://#diff-b025796b03fbbd9b2ca9aee2569547efa7a56101743bc4aa05661be0b52aeec9L470-R472):
Refined the sorting criteria for GPU instance channels and thread block
channels to include the channel type, ensuring a more accurate order.
Debugging enhancements:

*
[`src/executor/executor.cc`](diffhunk://#diff-60f7806d111e5cc12ded06358b5d5b09b8521e3858f182d8be81ac05147c535dR439-R441):
Added a debug log to indicate the start of communication collective
execution with details about the execution plan and collective.
*
[`src/include/debug.h`](diffhunk://#diff-24e5fda55e3712277be4bb99b3c348294a77ebd3046bfe716b74bdb32cd203dfR89):
Introduced a new debug log subsystem identifier `MSCCLPP_EXECUTOR` for
logging executor-related information.
2025-01-31 17:52:15 -08:00

75 lines
2.6 KiB
Python

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllGather
from mscclpp.language.buffer import Buffer
from mscclpp.language.types import ChannelType, ReplicationPolicy
def allgather_multinodes_allpair(gpus, gpus_per_node, instances):
    """
    Build a multi-node allgather program using an all-pairs algorithm with the MSCCL++ DSL.

    @param gpus: Total number of GPUs
    @param gpus_per_node: Number of GPUs per node
    @param instances: Number of instances to replicate the program with

    Steps:
    1. Each rank sends its chunk to every other rank's scratch buffer in packet format
       (intra-node via the default channel, cross-node via a port channel).
    2. Each rank copies the received packets from its scratch buffer into its output buffer.
    """
    collective = AllGather(gpus, 1, True)
    with MSCCLPPProgram(
        "allgather_multinodes_allpair",
        collective,
        gpus,
        instances,
        protocol="LL",
        replication_policy=ReplicationPolicy.interleaved,
        num_threads_per_block=1024,
    ):
        # Phase 1: every rank pushes its input chunk to all peers' scratch buffers.
        for src_rank in range(gpus):
            local_chunk = chunk(src_rank, Buffer.input, 0, 1)
            for offset in range(1, gpus):
                dst_rank = (src_rank + offset) % gpus
                # One thread block per peer; skip over our own rank index.
                block_id = dst_rank - 1 if dst_rank >= src_rank else dst_rank
                same_node = src_rank // gpus_per_node == dst_rank // gpus_per_node
                if same_node:
                    local_chunk.put_packet(dst_rank, Buffer.scratch, index=src_rank, sendtb=block_id)
                else:
                    # Cross-node put goes through a port channel and stages the
                    # packet in a temporary scratch slot on the sender side.
                    local_chunk.put_packet(
                        dst_rank,
                        Buffer.scratch,
                        index=src_rank,
                        sendtb=block_id,
                        chan_type=ChannelType.port,
                        temp_buffer=Buffer.scratch,
                        temp_buffer_index=src_rank,
                    )

        # Phase 2: each rank unpacks every received packet from its local
        # scratch buffer into its output buffer.
        for src_rank in range(gpus):
            src_offset = src_rank
            for offset in range(1, gpus):
                dst_rank = (src_rank + offset) % gpus
                block_id = src_offset - 1 if src_offset >= dst_rank else src_offset
                scratch_chunk = chunk(dst_rank, Buffer.scratch, src_offset, 1)
                scratch_chunk.copy_packet(dst_rank, Buffer.output, src_offset, sendtb=block_id + gpus - 1)

        Json()
        Check()
def _main():
    """CLI entry point: parse arguments and generate the allgather program.

    Positional arguments:
      num_gpus       -- total number of GPUs participating in the allgather
      gpus_per_node  -- number of GPUs on each node
      instances      -- number of instances to replicate the program with
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("num_gpus", type=int, help="total number of gpus")
    # Fixed copy-pasted help text: this argument is GPUs per node, not total GPUs.
    parser.add_argument("gpus_per_node", type=int, help="number of gpus per node")
    parser.add_argument("instances", type=int, help="number of instances")
    args = parser.parse_args()
    allgather_multinodes_allpair(
        args.num_gpus,
        args.gpus_per_node,
        args.instances,
    )


# Guard the entry point so importing this module does not parse sys.argv
# (the original ran argparse at import time and would exit on import).
if __name__ == "__main__":
    _main()