wip

2026-05-12 01:10:22 +00:00 · 2026-05-12 00:54:55 +00:00
parent 406cfcbe4e
commit 2c269be58c
1 changed files with 11 additions and 0 deletions
--- a/python/mscclpp/default_algos/allreduce_multi_nodes.py
+++ b/python/mscclpp/default_algos/allreduce_multi_nodes.py
@@ -56,6 +56,17 @@ def allreduce_multi_nodes(spec: AlgoSpec, thread_block_group_size: int) -> Colle
                )
            )

+        # Scratch buffer layout (3 contiguous regions):
+        #   Region 1 [0, total_gpus):
+        #     Intra-node reduce-scatter. Each GPU receives packets_per_gpu chunks from each of gpus_per_node
+        #     peers → gpus_per_node * packets_per_gpu slots (= total_gpus, assuming packets_per_gpu == num_nodes).
+        #   Region 2 [total_gpus, total_gpus + num_nodes * packets_per_gpu):
+        #     Inter-node exchange. Each GPU receives reduced chunks from num_nodes nodes,
+        #     packets_per_gpu each → num_nodes * packets_per_gpu slots.
+        #   Region 3 [total_gpus + num_nodes * packets_per_gpu, 2 * total_gpus + num_nodes * packets_per_gpu):
+        #     Intra-node broadcast. Each GPU receives final reduced data from gpus_per_node peers,
+        #     packets_per_gpu each → gpus_per_node * packets_per_gpu = total_gpus slots.
+        # Total = 2 * total_gpus + num_nodes * packets_per_gpu
        scratch_buffer_size = 2 * total_gpus + packets_per_gpu * num_nodes
        for node_id in range(num_nodes):
            for local_gpu_id in range(gpus_per_node):