diff --git a/python/mscclpp/ext/ep/buffer.py b/python/mscclpp/ext/ep/buffer.py index 3164048f..c747f750 100644 --- a/python/mscclpp/ext/ep/buffer.py +++ b/python/mscclpp/ext/ep/buffer.py @@ -17,8 +17,9 @@ Current status (see ``src/ext/ep/README.md``): * Internode HT (MSCCL++ PortChannel + MemoryChannel) dispatch and combine: ported and validated on 2 nodes x 8 H100 GPUs with ``test/python/ext/ep/test_internode_multirank.py``. -* Internode low-latency kernels: structural port (NVSHMEM/IBGDA -> - MSCCL++ PortChannel), **untested on multi-node H100**. +* Internode low-latency kernels (NVSHMEM/IBGDA -> MSCCL++ PortChannel): + ported and validated on 2 nodes x 8 H100 GPUs with + ``test/python/ext/ep/test_low_latency_multirank.py``. """ from __future__ import annotations diff --git a/src/ext/ep/event.hpp b/src/ext/ep/event.hpp index 1cfc828e..d5a77526 100644 --- a/src/ext/ep/event.hpp +++ b/src/ext/ep/event.hpp @@ -1,5 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +#pragma once #include #include diff --git a/src/ext/ep/kernels/internode_ll.cu b/src/ext/ep/kernels/internode_ll.cu index 47b3a279..613f9e63 100644 --- a/src/ext/ep/kernels/internode_ll.cu +++ b/src/ext/ep/kernels/internode_ll.cu @@ -22,9 +22,9 @@ // position in the connected-peer map. In the recommended 1-GPU-per-node // LL topology, `peer_idx == dst_rank`; see src/ext/ep/README.md. // -// WARNING: This port is untested on multi-node H100; performance will NOT -// match IBGDA (host-proxy adds latency). Functional correctness needs -// validation on real hardware. +// Validated on 2 nodes x 8 H100 GPUs via +// `test/python/ext/ep/test_low_latency_multirank.py`. Performance does NOT +// match IBGDA (host-proxy adds latency); see README for measurements. #include "configs.cuh" #include "exception.cuh"