mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-19 22:39:11 +00:00
Documentation update: `docs/design/mscclpp-dsl.md` — updated the link to the examples folder to point to the correct path. New example script: `python/examples/allgather_allpairs_multinodes_packets.py` — added a new example demonstrating the allgather all-pairs algorithm across multiple nodes using packet communication. IR module improvements: `python/mscclpp/language/ir.py` — refined the sorting criteria for GPU instance channels and thread-block channels to include the channel type, ensuring a more accurate order. Debugging enhancements: `src/executor/executor.cc` — added a debug log indicating the start of communication collective execution, with details about the execution plan and collective. `src/include/debug.h` — introduced a new debug log subsystem identifier `MSCCLPP_EXECUTOR` for logging executor-related information.
75 lines
2.6 KiB
Python
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse

from mscclpp.language import *
from mscclpp.language.buffer import Buffer
from mscclpp.language.collectives import AllGather
from mscclpp.language.types import ChannelType, ReplicationPolicy

def allgather_multinodes_allpair(gpus, gpus_per_node, instances):
    """
    Build a multi-node allgather collective using an all-pairs algorithm with the MSCCL++ DSL.

    @param gpus: Total number of GPUs
    @param gpus_per_node: Number of GPUs per node
    @param instances: Number of program instances to replicate

    Steps:
    1. Each rank sends its chunk to every other rank's scratch buffer in packet
       format (inter-node transfers go through a port channel with a staging
       scratch slot).
    2. Each rank copies the received packets from its scratch buffer into its
       output buffer, still in packet format.
    """
    collective = AllGather(gpus, 1, True)
    with MSCCLPPProgram(
        "allgather_multinodes_allpair",
        collective,
        gpus,
        instances,
        protocol="LL",
        replication_policy=ReplicationPolicy.interleaved,
        num_threads_per_block=1024,
    ):
        # Phase 1: every rank pushes its input chunk to all peers' scratch buffers.
        for sender in range(gpus):
            src_chunk = chunk(sender, Buffer.input, 0, 1)
            for step in range(1, gpus):
                receiver = (sender + step) % gpus
                # One thread block per peer; the sender's own slot is skipped,
                # so peers above the sender shift down by one.
                block_id = receiver - 1 if receiver >= sender else receiver
                same_node = sender // gpus_per_node == receiver // gpus_per_node
                if same_node:
                    src_chunk.put_packet(receiver, Buffer.scratch, index=sender, sendtb=block_id)
                else:
                    # Cross-node transfer: use a port channel and stage the
                    # packet through the scratch buffer.
                    src_chunk.put_packet(
                        receiver,
                        Buffer.scratch,
                        index=sender,
                        sendtb=block_id,
                        chan_type=ChannelType.port,
                        temp_buffer=Buffer.scratch,
                        temp_buffer_index=sender,
                    )

        # Phase 2: copy each received packet from the local scratch buffer into
        # the local output buffer.
        for sender in range(gpus):
            slot = sender
            for step in range(1, gpus):
                receiver = (sender + step) % gpus
                block_id = slot - 1 if slot >= receiver else slot
                scratch_chunk = chunk(receiver, Buffer.scratch, slot, 1)
                # Offset the thread-block id past the (gpus - 1) blocks used in phase 1.
                scratch_chunk.copy_packet(receiver, Buffer.output, slot, sendtb=block_id + gpus - 1)

        Json()
        Check()

parser = argparse.ArgumentParser()
|
|
parser.add_argument("num_gpus", type=int, help="number of gpus")
|
|
parser.add_argument("gpus_per_node", type=int, help="number of gpus")
|
|
parser.add_argument("instances", type=int, help="number of instances")
|
|
|
|
args = parser.parse_args()
|
|
|
|
allgather_multinodes_allpair(
|
|
args.num_gpus,
|
|
args.gpus_per_node,
|
|
args.instances,
|
|
)
|